Merge branch 'glsl2-head' into glsl2
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc. All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 #include "pipe/p_config.h"
30
31 #if defined(PIPE_ARCH_X86)
32
33 #include "util/u_debug.h"
34 #include "pipe/p_shader_tokens.h"
35 #include "util/u_math.h"
36 #include "util/u_memory.h"
37 #if defined(PIPE_ARCH_SSE)
38 #include "util/u_sse.h"
39 #endif
40 #include "tgsi/tgsi_info.h"
41 #include "tgsi/tgsi_parse.h"
42 #include "tgsi/tgsi_util.h"
43 #include "tgsi/tgsi_dump.h"
44 #include "tgsi/tgsi_exec.h"
45 #include "tgsi/tgsi_sse2.h"
46
47 #include "rtasm/rtasm_x86sse.h"
48
49 /* for 1/sqrt()
50 *
51 * This costs about 100fps (close to 10%) in gears:
52 */
53 #define HIGH_PRECISION 1
54
55 #define FAST_MATH 1
56
57
/** Iterate CHAN over the four vector components (x, y, z, w). */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/** Test whether dst register 0's writemask enables component CHAN. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/** Iterate CHAN over only the components enabled in dst 0's writemask. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
   IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

/* Component indices within a 4-vector. */
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Shorthands for well-known tgsi_exec temporary registers. */
#define TEMP_ONE_I   TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C   TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0   TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
83
84
85 /**
86 * X86 utility functions.
87 */
88
89 static struct x86_reg
90 make_xmm(
91 unsigned xmm )
92 {
93 return x86_make_reg(
94 file_XMM,
95 (enum x86_reg_name) xmm );
96 }
97
98 /**
99 * X86 register mapping helpers.
100 */
101
102 static struct x86_reg
103 get_const_base( void )
104 {
105 return x86_make_reg(
106 file_REG32,
107 reg_AX );
108 }
109
110 static struct x86_reg
111 get_machine_base( void )
112 {
113 return x86_make_reg(
114 file_REG32,
115 reg_CX );
116 }
117
118 static struct x86_reg
119 get_input_base( void )
120 {
121 return x86_make_disp(
122 get_machine_base(),
123 Offset(struct tgsi_exec_machine, Inputs) );
124 }
125
126 static struct x86_reg
127 get_output_base( void )
128 {
129 return x86_make_disp(
130 get_machine_base(),
131 Offset(struct tgsi_exec_machine, Outputs) );
132 }
133
134 static struct x86_reg
135 get_temp_base( void )
136 {
137 return x86_make_disp(
138 get_machine_base(),
139 Offset(struct tgsi_exec_machine, Temps) );
140 }
141
142 static struct x86_reg
143 get_coef_base( void )
144 {
145 return x86_make_reg(
146 file_REG32,
147 reg_BX );
148 }
149
150 static struct x86_reg
151 get_sampler_base( void )
152 {
153 return x86_make_reg(
154 file_REG32,
155 reg_DI );
156 }
157
158 static struct x86_reg
159 get_immediate_base( void )
160 {
161 return x86_make_reg(
162 file_REG32,
163 reg_DX );
164 }
165
166
167 /**
168 * Data access helpers.
169 */
170
171
172 static struct x86_reg
173 get_immediate(
174 unsigned vec,
175 unsigned chan )
176 {
177 return x86_make_disp(
178 get_immediate_base(),
179 (vec * 4 + chan) * 4 );
180 }
181
182 static struct x86_reg
183 get_const(
184 unsigned vec,
185 unsigned chan )
186 {
187 return x86_make_disp(
188 get_const_base(),
189 (vec * 4 + chan) * 4 );
190 }
191
192 static struct x86_reg
193 get_sampler_ptr(
194 unsigned unit )
195 {
196 return x86_make_disp(
197 get_sampler_base(),
198 unit * sizeof( struct tgsi_sampler * ) );
199 }
200
201 static struct x86_reg
202 get_input(
203 unsigned vec,
204 unsigned chan )
205 {
206 return x86_make_disp(
207 get_input_base(),
208 (vec * 4 + chan) * 16 );
209 }
210
211 static struct x86_reg
212 get_output(
213 unsigned vec,
214 unsigned chan )
215 {
216 return x86_make_disp(
217 get_output_base(),
218 (vec * 4 + chan) * 16 );
219 }
220
221 static struct x86_reg
222 get_temp(
223 unsigned vec,
224 unsigned chan )
225 {
226 return x86_make_disp(
227 get_temp_base(),
228 (vec * 4 + chan) * 16 );
229 }
230
231 static struct x86_reg
232 get_coef(
233 unsigned vec,
234 unsigned chan,
235 unsigned member )
236 {
237 return x86_make_disp(
238 get_coef_base(),
239 ((vec * 3 + member) * 4 + chan) * 4 );
240 }
241
242
/** Emit a function-return instruction. */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
249
250
251 /**
252 * Data fetch helpers.
253 */
254
/**
 * Copy a shader constant to an xmm register.
 * \param xmm            the destination xmm register
 * \param vec            the src const buffer index (or offset, if indirect)
 * \param chan           src channel to fetch (X, Y, Z or W)
 * \param indirect       non-zero for CONST[ADDR+vec] style addressing
 * \param indirectFile   file of the indirect index (must be TGSI_FILE_ADDRESS)
 * \param indirectIndex  register index of the indirect index (must be 0)
 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_immediate_base();
      struct x86_reg r1 = get_coef_base();
      uint i;

      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );
      assert( r0.mod == mod_REG );
      assert( r1.mod == mod_REG );

      /* Preserve the two GP registers we borrow as scratch. */
      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage.  It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0  (zero the index for dead channels) */
         x86_and( func, r1, r0 );
         /* r0 = address of CONST['vec'], the base offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         /* r0 = r0 + r1, then gather the scalar into the staging temp */
         x86_add( func, r0, r1 );
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* Load all four gathered values from the staging temp at once. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* Load the scalar constant and replicate it into all four lanes. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
348
349 static void
350 emit_immediate(
351 struct x86_function *func,
352 unsigned xmm,
353 unsigned vec,
354 unsigned chan )
355 {
356 sse_movss(
357 func,
358 make_xmm( xmm ),
359 get_immediate( vec, chan ) );
360 sse_shufps(
361 func,
362 make_xmm( xmm ),
363 make_xmm( xmm ),
364 SHUF( 0, 0, 0, 0 ) );
365 }
366
367
368 /**
369 * Copy a shader input to xmm register
370 * \param xmm the destination xmm register
371 * \param vec the src input attrib
372 * \param chan src channel to fetch (X, Y, Z or W)
373 */
/**
 * Copy a shader input to an xmm register.
 * \param xmm   the destination xmm register
 * \param vec   the src input attrib
 * \param chan  src channel to fetch (X, Y, Z or W)
 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* Unaligned load -- the input array is not assumed 16-byte aligned. */
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}
386
387 /**
388 * Store an xmm register to a shader output
389 * \param xmm the source xmm register
390 * \param vec the dest output attrib
391 * \param chan src dest channel to store (X, Y, Z or W)
392 */
/**
 * Store an xmm register to a shader output.
 * \param xmm   the source xmm register
 * \param vec   the dest output attrib
 * \param chan  dest channel to store (X, Y, Z or W)
 */
static void
emit_output(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* Unaligned store -- the output array is not assumed 16-byte aligned. */
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}
405
406 /**
407 * Copy a shader temporary to xmm register
408 * \param xmm the destination xmm register
409 * \param vec the src temp register
410 * \param chan src channel to fetch (X, Y, Z or W)
411 */
/**
 * Copy a shader temporary to an xmm register.
 * \param xmm   the destination xmm register
 * \param vec   the src temp register
 * \param chan  src channel to fetch (X, Y, Z or W)
 */
static void
emit_tempf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* Aligned load -- the temps live inside the machine struct. */
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
424
425 /**
426 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
427 * \param xmm the destination xmm register
428 * \param vec the src input/attribute coefficient index
429 * \param chan src channel to fetch (X, Y, Z or W)
430 * \param member 0=a0, 1=dadx, 2=dady
431 */
432 static void
433 emit_coef(
434 struct x86_function *func,
435 unsigned xmm,
436 unsigned vec,
437 unsigned chan,
438 unsigned member )
439 {
440 sse_movss(
441 func,
442 make_xmm( xmm ),
443 get_coef( vec, chan, member ) );
444 sse_shufps(
445 func,
446 make_xmm( xmm ),
447 make_xmm( xmm ),
448 SHUF( 0, 0, 0, 0 ) );
449 }
450
451 /**
452 * Data store helpers.
453 */
454
455 static void
456 emit_inputs(
457 struct x86_function *func,
458 unsigned xmm,
459 unsigned vec,
460 unsigned chan )
461 {
462 sse_movups(
463 func,
464 get_input( vec, chan ),
465 make_xmm( xmm ) );
466 }
467
468 static void
469 emit_temps(
470 struct x86_function *func,
471 unsigned xmm,
472 unsigned vec,
473 unsigned chan )
474 {
475 sse_movaps(
476 func,
477 get_temp( vec, chan ),
478 make_xmm( xmm ) );
479 }
480
481 static void
482 emit_addrs(
483 struct x86_function *func,
484 unsigned xmm,
485 unsigned vec,
486 unsigned chan )
487 {
488 assert( vec == 0 );
489
490 emit_temps(
491 func,
492 xmm,
493 vec + TGSI_EXEC_TEMP_ADDR,
494 chan );
495 }
496
497 /**
498 * Coefficent fetch helpers.
499 */
500
/** Fetch the a0 (constant term) coefficient. */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}

/** Fetch the dadx (x-derivative) coefficient. */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}

/** Fetch the dady (y-derivative) coefficient. */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
545
546 /**
547 * Function call helpers.
548 */
549
/**
 * Emit code that calls a C function, saving and restoring the GP
 * registers EAX/ECX/EDX and the xmm registers named in xmm_save_mask.
 *
 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
 * that the stack pointer is 16 byte aligned, as expected.
 *
 * \param xmm_save_mask  bitmask of xmm registers (0..7) to preserve
 * \param arg            operands whose addresses are pushed as arguments
 * \param nr_args        number of entries in \a arg
 * \param code           address of the cdecl function to call
 */
static void
emit_func_call(
   struct x86_function *func,
   unsigned xmm_save_mask,
   const struct x86_reg *arg,
   unsigned nr_args,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned i, n;

   /* Save the GP registers holding our base pointers. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX) );

   /* Store XMM regs to the stack
    */
   /* First count how many registers need saving... */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i))
         ++n;

   /* ...then reserve 16 bytes of stack for each one. */
   x86_sub_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
            make_xmm( i ) );
         ++n;
      }

   for (i = 0; i < nr_args; i++) {
      /* Load the address of the buffer we use for passing arguments and
       * receiving results:
       */
      x86_lea(
         func,
         ecx,
         arg[i] );

      /* Push actual function arguments (currently just the pointer to
       * the buffer above), and call the function:
       */
      x86_push( func, ecx );
   }

   x86_mov_reg_imm( func, ecx, (unsigned long) code );
   x86_call( func, ecx );

   /* Pop the arguments (or just add an immediate to esp)
    */
   for (i = 0; i < nr_args; i++) {
      x86_pop(func, ecx );
   }

   /* Pop the saved XMM regs:
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            make_xmm( i ),
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
         ++n;
      }

   /* Release the stack space reserved for the saved xmm registers. */
   x86_add_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Restore GP registers in a reverse order.
    */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX) );
}
648
649 static void
650 emit_func_call_dst_src1(
651 struct x86_function *func,
652 unsigned xmm_save,
653 unsigned xmm_dst,
654 unsigned xmm_src0,
655 void (PIPE_CDECL *code)() )
656 {
657 struct x86_reg store = get_temp( TEMP_R0, 0 );
658 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
659
660 /* Store our input parameters (in xmm regs) to the buffer we use
661 * for passing arguments. We will pass a pointer to this buffer as
662 * the actual function argument.
663 */
664 sse_movaps(
665 func,
666 store,
667 make_xmm( xmm_src0 ) );
668
669 emit_func_call( func,
670 xmm_mask,
671 &store,
672 1,
673 code );
674
675 sse_movaps(
676 func,
677 make_xmm( xmm_dst ),
678 store );
679 }
680
681
682 static void
683 emit_func_call_dst_src2(
684 struct x86_function *func,
685 unsigned xmm_save,
686 unsigned xmm_dst,
687 unsigned xmm_src0,
688 unsigned xmm_src1,
689 void (PIPE_CDECL *code)() )
690 {
691 struct x86_reg store = get_temp( TEMP_R0, 0 );
692 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
693
694 /* Store two inputs to parameter buffer.
695 */
696 sse_movaps(
697 func,
698 store,
699 make_xmm( xmm_src0 ) );
700
701 sse_movaps(
702 func,
703 x86_make_disp( store, 4 * sizeof(float) ),
704 make_xmm( xmm_src1 ) );
705
706
707 /* Emit the call
708 */
709 emit_func_call( func,
710 xmm_mask,
711 &store,
712 1,
713 code );
714
715 /* Retrieve the results:
716 */
717 sse_movaps(
718 func,
719 make_xmm( xmm_dst ),
720 store );
721 }
722
723
724
725
726
727 #if defined(PIPE_ARCH_SSE)
728
729 /*
730 * Fast SSE2 implementation of special math functions.
731 */
732
733 #define POLY0(x, c0) _mm_set1_ps(c0)
734 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
735 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
736 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
737 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
738 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
739
740 #define EXP_POLY_DEGREE 3
741 #define LOG_POLY_DEGREE 5
742
/**
 * Compute 2^x for four packed floats via a polynomial approximation.
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   /* Clamp the input so the exponent construction below stays valid. */
   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart) -- built by writing (ipart + 127)
    * directly into the IEEE-754 biased exponent field */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   /* 2^x = 2^ipart * 2^fpart */
   return _mm_mul_ps(expipart, expfpart);
}
779
780
/**
 * Compute log2(x) for four packed floats via a polynomial approximation.
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
log2f4(__m128 x)
{
   /* Bit masks for the IEEE-754 exponent and mantissa fields. */
   __m128i expmask = _mm_set1_epi32(0x7f800000);
   __m128i mantmask = _mm_set1_epi32(0x007fffff);
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x), normalized into [1, 2[ by OR-ing in 1.0 */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generate with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   /* log2(x) = log2(mant) + exp */
   return _mm_add_ps(logmant, exp);
}
822
823
824 static INLINE __m128
825 powf4(__m128 x, __m128 y)
826 {
827 return exp2f4(_mm_mul_ps(log2f4(x), y));
828 }
829
830 #endif /* PIPE_ARCH_SSE */
831
832
833
834 /**
835 * Low-level instruction translators.
836 */
837
838 static void
839 emit_abs(
840 struct x86_function *func,
841 unsigned xmm )
842 {
843 sse_andps(
844 func,
845 make_xmm( xmm ),
846 get_temp(
847 TGSI_EXEC_TEMP_7FFFFFFF_I,
848 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
849 }
850
/** dst = dst + src (packed float add). */
static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
862
863 static void PIPE_CDECL
864 cos4f(
865 float *store )
866 {
867 store[0] = cosf( store[0] );
868 store[1] = cosf( store[1] );
869 store[2] = cosf( store[2] );
870 store[3] = cosf( store[3] );
871 }
872
873 static void
874 emit_cos(
875 struct x86_function *func,
876 unsigned xmm_save,
877 unsigned xmm_dst )
878 {
879 emit_func_call_dst_src1(
880 func,
881 xmm_save,
882 xmm_dst,
883 xmm_dst,
884 cos4f );
885 }
886
/**
 * Run-time helper: 2^x of four packed floats, computed in place.
 * Uses the SSE polynomial approximation when compiled with SSE support,
 * otherwise a scalar fast-math fallback.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_exp2( store[0] );
   store[1] = util_fast_exp2( store[1] );
   store[2] = util_fast_exp2( store[2] );
   store[3] = util_fast_exp2( store[3] );
#endif
}
903
904 static void
905 emit_ex2(
906 struct x86_function *func,
907 unsigned xmm_save,
908 unsigned xmm_dst )
909 {
910 emit_func_call_dst_src1(
911 func,
912 xmm_save,
913 xmm_dst,
914 xmm_dst,
915 ex24f );
916 }
917
/** Convert four floats to ints with truncation, in place. */
static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) );
}
928
/** Convert four ints to floats, in place. */
static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
}
939
940 static void PIPE_CDECL
941 flr4f(
942 float *store )
943 {
944 store[0] = floorf( store[0] );
945 store[1] = floorf( store[1] );
946 store[2] = floorf( store[2] );
947 store[3] = floorf( store[3] );
948 }
949
950 static void
951 emit_flr(
952 struct x86_function *func,
953 unsigned xmm_save,
954 unsigned xmm_dst )
955 {
956 emit_func_call_dst_src1(
957 func,
958 xmm_save,
959 xmm_dst,
960 xmm_dst,
961 flr4f );
962 }
963
964 static void PIPE_CDECL
965 frc4f(
966 float *store )
967 {
968 store[0] -= floorf( store[0] );
969 store[1] -= floorf( store[1] );
970 store[2] -= floorf( store[2] );
971 store[3] -= floorf( store[3] );
972 }
973
974 static void
975 emit_frc(
976 struct x86_function *func,
977 unsigned xmm_save,
978 unsigned xmm_dst )
979 {
980 emit_func_call_dst_src1(
981 func,
982 xmm_save,
983 xmm_dst,
984 xmm_dst,
985 frc4f );
986 }
987
/**
 * Run-time helper: log2 of four packed floats, computed in place.
 * Uses the SSE polynomial approximation when compiled with SSE support,
 * otherwise a scalar fast-math fallback.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_log2( store[0] );
   store[1] = util_fast_log2( store[1] );
   store[2] = util_fast_log2( store[2] );
   store[3] = util_fast_log2( store[3] );
#endif
}
1004
1005 static void
1006 emit_lg2(
1007 struct x86_function *func,
1008 unsigned xmm_save,
1009 unsigned xmm_dst )
1010 {
1011 emit_func_call_dst_src1(
1012 func,
1013 xmm_save,
1014 xmm_dst,
1015 xmm_dst,
1016 lg24f );
1017 }
1018
/** dst = src (unaligned packed move between xmm registers). */
static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1030
/** dst = dst * src (packed float multiply). */
static void
emit_mul(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1041
1042 static void
1043 emit_neg(
1044 struct x86_function *func,
1045 unsigned xmm )
1046 {
1047 sse_xorps(
1048 func,
1049 make_xmm( xmm ),
1050 get_temp(
1051 TGSI_EXEC_TEMP_80000000_I,
1052 TGSI_EXEC_TEMP_80000000_C ) );
1053 }
1054
/**
 * Run-time helper: store[i] = store[i] ^ store[i+4] for four packed
 * floats; the result overwrites the first four elements.
 * Uses the SSE approximation when compiled with SSE support, otherwise
 * a scalar fast-math fallback.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
   store[0] = util_fast_pow( store[0], store[4] );
   store[1] = util_fast_pow( store[1], store[5] );
   store[2] = util_fast_pow( store[2], store[6] );
   store[3] = util_fast_pow( store[3], store[7] );
#endif
}
1071
1072 static void
1073 emit_pow(
1074 struct x86_function *func,
1075 unsigned xmm_save,
1076 unsigned xmm_dst,
1077 unsigned xmm_src0,
1078 unsigned xmm_src1 )
1079 {
1080 emit_func_call_dst_src2(
1081 func,
1082 xmm_save,
1083 xmm_dst,
1084 xmm_src0,
1085 xmm_src1,
1086 pow4f );
1087 }
1088
/** dst = approximate 1/src (packed float reciprocal). */
static void
emit_rcp(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.  Need to either emit a proper divide or use the
    * iterative technique described below in emit_rsqrt().
    */
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1104
1105 static void PIPE_CDECL
1106 rnd4f(
1107 float *store )
1108 {
1109 store[0] = floorf( store[0] + 0.5f );
1110 store[1] = floorf( store[1] + 0.5f );
1111 store[2] = floorf( store[2] + 0.5f );
1112 store[3] = floorf( store[3] + 0.5f );
1113 }
1114
1115 static void
1116 emit_rnd(
1117 struct x86_function *func,
1118 unsigned xmm_save,
1119 unsigned xmm_dst )
1120 {
1121 emit_func_call_dst_src1(
1122 func,
1123 xmm_save,
1124 xmm_dst,
1125 xmm_dst,
1126 rnd4f );
1127 }
1128
/**
 * dst = 1/sqrt(src), packed floats.
 * In the HIGH_PRECISION path this overwrites xmm_src and clobbers
 * xmm2 and xmm3 as scratch registers.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      /* The fixed scratch registers must not alias either operand. */
      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );     /* dst = 0.5 */
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );  /* tmp0 = 3.0 */
      sse_rsqrtps( func, tmp1, src );     /* tmp1 = rsqrtps(a), the low-precision estimate */
      sse_mulps( func, src, tmp1 );       /* src = a * tmp1 */
      sse_mulps( func, dst, tmp1 );       /* dst = 0.5 * tmp1 */
      sse_mulps( func, src, tmp1 );       /* src = a * tmp1 * tmp1 */
      sse_subps( func, tmp0, src );       /* tmp0 = 3.0 - a * tmp1^2 */
      sse_mulps( func, dst, tmp0 );       /* dst = 0.5 * tmp1 * (3.0 - a * tmp1^2) */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1174
1175 static void
1176 emit_setsign(
1177 struct x86_function *func,
1178 unsigned xmm )
1179 {
1180 sse_orps(
1181 func,
1182 make_xmm( xmm ),
1183 get_temp(
1184 TGSI_EXEC_TEMP_80000000_I,
1185 TGSI_EXEC_TEMP_80000000_C ) );
1186 }
1187
1188 static void PIPE_CDECL
1189 sgn4f(
1190 float *store )
1191 {
1192 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1193 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1194 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1195 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1196 }
1197
1198 static void
1199 emit_sgn(
1200 struct x86_function *func,
1201 unsigned xmm_save,
1202 unsigned xmm_dst )
1203 {
1204 emit_func_call_dst_src1(
1205 func,
1206 xmm_save,
1207 xmm_dst,
1208 xmm_dst,
1209 sgn4f );
1210 }
1211
1212 static void PIPE_CDECL
1213 sin4f(
1214 float *store )
1215 {
1216 store[0] = sinf( store[0] );
1217 store[1] = sinf( store[1] );
1218 store[2] = sinf( store[2] );
1219 store[3] = sinf( store[3] );
1220 }
1221
1222 static void
1223 emit_sin (struct x86_function *func,
1224 unsigned xmm_save,
1225 unsigned xmm_dst)
1226 {
1227 emit_func_call_dst_src1(
1228 func,
1229 xmm_save,
1230 xmm_dst,
1231 xmm_dst,
1232 sin4f );
1233 }
1234
/** dst = dst - src (packed float subtract). */
static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1246
/**
 * Register fetch: load one channel of a source operand into an xmm
 * register, applying the operand's swizzle and sign mode.
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_SWIZZLE_X:
   case TGSI_SWIZZLE_Y:
   case TGSI_SWIZZLE_Z:
   case TGSI_SWIZZLE_W:
      /* Dispatch on the source register file. */
      switch (reg->Register.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->Register.Index,
            swizzle,
            reg->Register.Indirect,
            reg->Indirect.File,
            reg->Indirect.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->Register.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
      case TGSI_FILE_SYSTEM_VALUE:
         emit_inputf(
            func,
            xmm,
            reg->Register.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->Register.Index,
            swizzle );
         break;

      default:
         /* Unsupported source register file. */
         assert( 0 );
      }
      break;

   default:
      assert( 0 );
   }

   /* Apply the operand's sign mode (abs / set-sign / negate) in place. */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}

/** Fetch one channel of source operand INDEX of INST into register XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).Src[INDEX], CHAN )
1330
/**
 * Register store: write an xmm register to one channel of the
 * destination operand, applying the instruction's saturate mode first.
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   /* Clamp the value according to the instruction's saturate mode. */
   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* Clamp to [0,1]: max against 0.0, then min against 1.0. */
      sse_maxps(
         func,
         make_xmm( xmm ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ) );

      sse_minps(
         func,
         make_xmm( xmm ),
         get_temp(
            TGSI_EXEC_TEMP_ONE_I,
            TGSI_EXEC_TEMP_ONE_C ) );
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      /* [-1,1] saturation is not implemented here. */
      assert( 0 );
      break;
   }


   /* Dispatch on the destination register file. */
   switch( reg->Register.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->Register.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->Register.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->Register.Index,
         chan_index );
      break;

   default:
      /* Unsupported destination register file. */
      assert( 0 );
   }
}

/** Store register XMM to one channel of destination operand INDEX of INST. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).Dst[INDEX], &(INST), CHAN )
1400
1401
/**
 * Run-time helper called from generated code to sample a texture.
 * \param sampler  pointer to the sampler pointer for the texture unit
 * \param store    16-float SOA buffer: s[4], t[4], r[4], lodbias[4] on
 *                 input; overwritten with the resulting colors
 */
static void PIPE_CDECL
fetch_texel( struct tgsi_sampler **sampler,
             float *store )
{
#if 0
   /* Debug: dump the incoming texcoords for the quad. */
   uint j;

   debug_printf("%s sampler: %p (%p) store: %p\n",
                __FUNCTION__,
                sampler, *sampler,
                store );

   for (j = 0; j < 4; j++)
      debug_printf("sample %d texcoord %f %f %f lodbias %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8 + j],
                   store[12 + j]);
#endif

   {
      float rgba[NUM_CHANNELS][QUAD_SIZE];
      (*sampler)->get_samples(*sampler,
                              &store[0],  /* s */
                              &store[4],  /* t */
                              &store[8],  /* r */
                              &store[12], /* lodbias */
                              tgsi_sampler_lod_bias,
                              rgba);      /* results */

      /* Overwrite the input buffer with the sampled colors. */
      memcpy( store, rgba, 16 * sizeof(float));
   }

#if 0
   /* Debug: dump the resulting colors. */
   for (j = 0; j < 4; j++)
      debug_printf("sample %d result %f %f %f %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8+j],
                   store[12+j]);
#endif
}
1446
/**
 * High-level instruction translators.
 */

/**
 * Emit code for a texture sampling instruction (TEX/TXB/TXP).
 *
 * The texcoords (and the lod bias) are gathered into the TEMP_R0
 * scratch area, fetch_texel() is invoked through emit_func_call(),
 * and the results are copied from TEMP_R0 to the destination.
 *
 * \param lodbias    if true, fetch src[0].w as an explicit lod bias (TXB)
 * \param projected  if true, divide the coords by src[0].w (TXP)
 *
 * Note: lodbias and projected are never both true (see the callers in
 * emit_instruction), so the double use of xmm3 below is safe.
 */
static void
emit_tex( struct x86_function *func,
          const struct tgsi_full_instruction *inst,
          boolean lodbias,
          boolean projected)
{
   const uint unit = inst->Src[1].Register.Index;
   struct x86_reg args[2];
   unsigned count;   /* number of texcoord components to fetch */
   unsigned i;

   assert(inst->Instruction.Texture);
   switch (inst->Texture.Texture) {
   case TGSI_TEXTURE_1D:
      count = 1;
      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
      count = 2;
      break;
   case TGSI_TEXTURE_SHADOW1D:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
      count = 3;
      break;
   default:
      assert(0);
      return;
   }

   if (lodbias) {
      /* xmm3 = src[0].w (the explicit bias) */
      FETCH( func, *inst, 3, 0, 3 );
   }
   else {
      /* xmm3 = 0.0 (no bias) */
      emit_tempf(
         func,
         3,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );

   }

   /* store lodbias whether enabled or not -- fetch_texel currently
    * respects it always.
    */
   sse_movaps( func,
               get_temp( TEMP_R0, 3 ),
               make_xmm( 3 ) );

   if (projected) {
      /* xmm3 = 1 / src[0].w */
      FETCH( func, *inst, 3, 0, 3 );

      emit_rcp( func, 3, 3 );
   }

   /* Fetch each texcoord component, optionally project it, and copy it
    * into the argument buffer.  xmm0..xmm2 are used here, so they must
    * not clash with xmm3 above.
    */
   for (i = 0; i < count; i++) {
      FETCH( func, *inst, i, 0, i );

      if (projected) {
         sse_mulps(
            func,
            make_xmm( i ),
            make_xmm( 3 ) );
      }

      /* Store in the argument buffer:
       */
      sse_movaps(
         func,
         get_temp( TEMP_R0, i ),
         make_xmm( i ) );
   }

   args[0] = get_temp( TEMP_R0, 0 );
   args[1] = get_sampler_ptr( unit );

   emit_func_call( func,
                   0,
                   args,
                   Elements(args),
                   fetch_texel );

   /* If all four channels are enabled, could use a pointer to
    * dst[0].x instead of TEMP_R0 for store?
    */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {

      sse_movaps(
         func,
         make_xmm( 0 ),
         get_temp( TEMP_R0, i ) );

      STORE( func, *inst, 0, 0, i );
   }
}
1547
1548
/**
 * Emit code for TGSI_OPCODE_KIL: kill the fragments (quad elements)
 * for which any tested source component is negative.
 *
 * Each distinct swizzled component is fetched into its own xmm reg,
 * compared against 0.0 with CMPLTPS, and the resulting sign masks are
 * OR'ed together in EAX (via MOVMSKPS), then OR'ed into the KILMASK
 * temporary.  EAX/EDX are saved and restored around the sequence.
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested.
    */
   uniquemask = 0;

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_swizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   /* Preserve EAX/EDX, which are used as scratch below. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      /* dataXMM = all-ones where component < 0.0, else zero */
      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      /* Accumulate the 4-bit per-quad sign mask in EAX. */
      if( i == 0 ) {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   /* Merge the accumulated mask into the machine's kill mask. */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1636
1637
/**
 * Emit code for TGSI_OPCODE_KILP (unconditional/predicated kill).
 * Not implemented: generates no code.  The KILP case in
 * emit_instruction() returns 0 afterwards, so shaders using KILP fall
 * back to the interpreter.
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1644
1645
1646 static void
1647 emit_setcc(
1648 struct x86_function *func,
1649 struct tgsi_full_instruction *inst,
1650 enum sse_cc cc )
1651 {
1652 unsigned chan_index;
1653
1654 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1655 FETCH( func, *inst, 0, 0, chan_index );
1656 FETCH( func, *inst, 1, 1, chan_index );
1657 sse_cmpps(
1658 func,
1659 make_xmm( 0 ),
1660 make_xmm( 1 ),
1661 cc );
1662 sse_andps(
1663 func,
1664 make_xmm( 0 ),
1665 get_temp(
1666 TEMP_ONE_I,
1667 TEMP_ONE_C ) );
1668 STORE( func, *inst, 0, 0, chan_index );
1669 }
1670 }
1671
1672 static void
1673 emit_cmp(
1674 struct x86_function *func,
1675 struct tgsi_full_instruction *inst )
1676 {
1677 unsigned chan_index;
1678
1679 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1680 FETCH( func, *inst, 0, 0, chan_index );
1681 FETCH( func, *inst, 1, 1, chan_index );
1682 FETCH( func, *inst, 2, 2, chan_index );
1683 sse_cmpps(
1684 func,
1685 make_xmm( 0 ),
1686 get_temp(
1687 TGSI_EXEC_TEMP_00000000_I,
1688 TGSI_EXEC_TEMP_00000000_C ),
1689 cc_LessThan );
1690 sse_andps(
1691 func,
1692 make_xmm( 1 ),
1693 make_xmm( 0 ) );
1694 sse_andnps(
1695 func,
1696 make_xmm( 0 ),
1697 make_xmm( 2 ) );
1698 sse_orps(
1699 func,
1700 make_xmm( 0 ),
1701 make_xmm( 1 ) );
1702 STORE( func, *inst, 0, 0, chan_index );
1703 }
1704 }
1705
1706
1707 /**
1708 * Check if inst src/dest regs use indirect addressing into temporary,
1709 * input or output register files.
1710 */
1711 static boolean
1712 indirect_reg_reference(const struct tgsi_full_instruction *inst)
1713 {
1714 uint i;
1715 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1716 const struct tgsi_full_src_register *reg = &inst->Src[i];
1717 if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
1718 reg->Register.File == TGSI_FILE_INPUT ||
1719 reg->Register.File == TGSI_FILE_OUTPUT) &&
1720 reg->Register.Indirect)
1721 return TRUE;
1722 }
1723 for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1724 const struct tgsi_full_dst_register *reg = &inst->Dst[i];
1725 if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
1726 reg->Register.File == TGSI_FILE_INPUT ||
1727 reg->Register.File == TGSI_FILE_OUTPUT) &&
1728 reg->Register.Indirect)
1729 return TRUE;
1730 }
1731 return FALSE;
1732 }
1733
1734
1735 static int
1736 emit_instruction(
1737 struct x86_function *func,
1738 struct tgsi_full_instruction *inst )
1739 {
1740 unsigned chan_index;
1741
1742 /* we can't handle indirect addressing into temp register file yet */
1743 if (indirect_reg_reference(inst))
1744 return FALSE;
1745
1746 switch (inst->Instruction.Opcode) {
1747 case TGSI_OPCODE_ARL:
1748 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1749 FETCH( func, *inst, 0, 0, chan_index );
1750 emit_flr(func, 0, 0);
1751 emit_f2it( func, 0 );
1752 STORE( func, *inst, 0, 0, chan_index );
1753 }
1754 break;
1755
1756 case TGSI_OPCODE_MOV:
1757 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1758 FETCH( func, *inst, 4 + chan_index, 0, chan_index );
1759 }
1760 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1761 STORE( func, *inst, 4 + chan_index, 0, chan_index );
1762 }
1763 break;
1764
1765 case TGSI_OPCODE_LIT:
1766 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1767 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1768 emit_tempf(
1769 func,
1770 0,
1771 TEMP_ONE_I,
1772 TEMP_ONE_C);
1773 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1774 STORE( func, *inst, 0, 0, CHAN_X );
1775 }
1776 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1777 STORE( func, *inst, 0, 0, CHAN_W );
1778 }
1779 }
1780 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1781 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1782 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1783 FETCH( func, *inst, 0, 0, CHAN_X );
1784 sse_maxps(
1785 func,
1786 make_xmm( 0 ),
1787 get_temp(
1788 TGSI_EXEC_TEMP_00000000_I,
1789 TGSI_EXEC_TEMP_00000000_C ) );
1790 STORE( func, *inst, 0, 0, CHAN_Y );
1791 }
1792 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1793 /* XMM[1] = SrcReg[0].yyyy */
1794 FETCH( func, *inst, 1, 0, CHAN_Y );
1795 /* XMM[1] = max(XMM[1], 0) */
1796 sse_maxps(
1797 func,
1798 make_xmm( 1 ),
1799 get_temp(
1800 TGSI_EXEC_TEMP_00000000_I,
1801 TGSI_EXEC_TEMP_00000000_C ) );
1802 /* XMM[2] = SrcReg[0].wwww */
1803 FETCH( func, *inst, 2, 0, CHAN_W );
1804 /* XMM[2] = min(XMM[2], 128.0) */
1805 sse_minps(
1806 func,
1807 make_xmm( 2 ),
1808 get_temp(
1809 TGSI_EXEC_TEMP_128_I,
1810 TGSI_EXEC_TEMP_128_C ) );
1811 /* XMM[2] = max(XMM[2], -128.0) */
1812 sse_maxps(
1813 func,
1814 make_xmm( 2 ),
1815 get_temp(
1816 TGSI_EXEC_TEMP_MINUS_128_I,
1817 TGSI_EXEC_TEMP_MINUS_128_C ) );
1818 emit_pow( func, 3, 1, 1, 2 );
1819 FETCH( func, *inst, 0, 0, CHAN_X );
1820 sse_xorps(
1821 func,
1822 make_xmm( 2 ),
1823 make_xmm( 2 ) );
1824 sse_cmpps(
1825 func,
1826 make_xmm( 2 ),
1827 make_xmm( 0 ),
1828 cc_LessThan );
1829 sse_andps(
1830 func,
1831 make_xmm( 2 ),
1832 make_xmm( 1 ) );
1833 STORE( func, *inst, 2, 0, CHAN_Z );
1834 }
1835 }
1836 break;
1837
1838 case TGSI_OPCODE_RCP:
1839 FETCH( func, *inst, 0, 0, CHAN_X );
1840 emit_rcp( func, 0, 0 );
1841 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1842 STORE( func, *inst, 0, 0, chan_index );
1843 }
1844 break;
1845
1846 case TGSI_OPCODE_RSQ:
1847 FETCH( func, *inst, 0, 0, CHAN_X );
1848 emit_abs( func, 0 );
1849 emit_rsqrt( func, 1, 0 );
1850 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1851 STORE( func, *inst, 1, 0, chan_index );
1852 }
1853 break;
1854
1855 case TGSI_OPCODE_EXP:
1856 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1857 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1858 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1859 FETCH( func, *inst, 0, 0, CHAN_X );
1860 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1861 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1862 emit_MOV( func, 1, 0 );
1863 emit_flr( func, 2, 1 );
1864 /* dst.x = ex2(floor(src.x)) */
1865 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1866 emit_MOV( func, 2, 1 );
1867 emit_ex2( func, 3, 2 );
1868 STORE( func, *inst, 2, 0, CHAN_X );
1869 }
1870 /* dst.y = src.x - floor(src.x) */
1871 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1872 emit_MOV( func, 2, 0 );
1873 emit_sub( func, 2, 1 );
1874 STORE( func, *inst, 2, 0, CHAN_Y );
1875 }
1876 }
1877 /* dst.z = ex2(src.x) */
1878 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1879 emit_ex2( func, 3, 0 );
1880 STORE( func, *inst, 0, 0, CHAN_Z );
1881 }
1882 }
1883 /* dst.w = 1.0 */
1884 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1885 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1886 STORE( func, *inst, 0, 0, CHAN_W );
1887 }
1888 break;
1889
1890 case TGSI_OPCODE_LOG:
1891 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1892 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1893 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1894 FETCH( func, *inst, 0, 0, CHAN_X );
1895 emit_abs( func, 0 );
1896 emit_MOV( func, 1, 0 );
1897 emit_lg2( func, 2, 1 );
1898 /* dst.z = lg2(abs(src.x)) */
1899 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1900 STORE( func, *inst, 1, 0, CHAN_Z );
1901 }
1902 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1903 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1904 emit_flr( func, 2, 1 );
1905 /* dst.x = floor(lg2(abs(src.x))) */
1906 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1907 STORE( func, *inst, 1, 0, CHAN_X );
1908 }
1909 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1910 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1911 emit_ex2( func, 2, 1 );
1912 emit_rcp( func, 1, 1 );
1913 emit_mul( func, 0, 1 );
1914 STORE( func, *inst, 0, 0, CHAN_Y );
1915 }
1916 }
1917 }
1918 /* dst.w = 1.0 */
1919 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1920 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1921 STORE( func, *inst, 0, 0, CHAN_W );
1922 }
1923 break;
1924
1925 case TGSI_OPCODE_MUL:
1926 /* do all fetches and adds, storing results in temp regs */
1927 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1928 int r = chan_index + 1;
1929 FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
1930 FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
1931 emit_mul( func, r, 0 ); /* xmm[r] = xmm[r] * xmm[0] */
1932 }
1933 /* do all stores of the temp regs */
1934 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1935 int r = chan_index + 1;
1936 STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
1937 }
1938 break;
1939
1940 case TGSI_OPCODE_ADD:
1941 /* do all fetches and adds, storing results in temp regs */
1942 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1943 int r = chan_index + 1;
1944 FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
1945 FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
1946 emit_add( func, r, 0 ); /* xmm[r] = xmm[r] + xmm[0] */
1947 }
1948 /* do all stores of the temp regs */
1949 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1950 int r = chan_index + 1;
1951 STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
1952 }
1953 break;
1954
1955 case TGSI_OPCODE_DP3:
1956 FETCH( func, *inst, 0, 0, CHAN_X );
1957 FETCH( func, *inst, 1, 1, CHAN_X );
1958 emit_mul( func, 0, 1 );
1959 FETCH( func, *inst, 1, 0, CHAN_Y );
1960 FETCH( func, *inst, 2, 1, CHAN_Y );
1961 emit_mul( func, 1, 2 );
1962 emit_add( func, 0, 1 );
1963 FETCH( func, *inst, 1, 0, CHAN_Z );
1964 FETCH( func, *inst, 2, 1, CHAN_Z );
1965 emit_mul( func, 1, 2 );
1966 emit_add( func, 0, 1 );
1967 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1968 STORE( func, *inst, 0, 0, chan_index );
1969 }
1970 break;
1971
1972 case TGSI_OPCODE_DP4:
1973 FETCH( func, *inst, 0, 0, CHAN_X );
1974 FETCH( func, *inst, 1, 1, CHAN_X );
1975 emit_mul( func, 0, 1 );
1976 FETCH( func, *inst, 1, 0, CHAN_Y );
1977 FETCH( func, *inst, 2, 1, CHAN_Y );
1978 emit_mul( func, 1, 2 );
1979 emit_add( func, 0, 1 );
1980 FETCH( func, *inst, 1, 0, CHAN_Z );
1981 FETCH( func, *inst, 2, 1, CHAN_Z );
1982 emit_mul(func, 1, 2 );
1983 emit_add(func, 0, 1 );
1984 FETCH( func, *inst, 1, 0, CHAN_W );
1985 FETCH( func, *inst, 2, 1, CHAN_W );
1986 emit_mul( func, 1, 2 );
1987 emit_add( func, 0, 1 );
1988 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1989 STORE( func, *inst, 0, 0, chan_index );
1990 }
1991 break;
1992
1993 case TGSI_OPCODE_DST:
1994 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1995 emit_tempf(
1996 func,
1997 0,
1998 TEMP_ONE_I,
1999 TEMP_ONE_C );
2000 STORE( func, *inst, 0, 0, CHAN_X );
2001 }
2002 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2003 FETCH( func, *inst, 0, 0, CHAN_Y );
2004 FETCH( func, *inst, 1, 1, CHAN_Y );
2005 emit_mul( func, 0, 1 );
2006 STORE( func, *inst, 0, 0, CHAN_Y );
2007 }
2008 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2009 FETCH( func, *inst, 0, 0, CHAN_Z );
2010 STORE( func, *inst, 0, 0, CHAN_Z );
2011 }
2012 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2013 FETCH( func, *inst, 0, 1, CHAN_W );
2014 STORE( func, *inst, 0, 0, CHAN_W );
2015 }
2016 break;
2017
2018 case TGSI_OPCODE_MIN:
2019 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2020 FETCH( func, *inst, 0, 0, chan_index );
2021 FETCH( func, *inst, 1, 1, chan_index );
2022 sse_minps(
2023 func,
2024 make_xmm( 0 ),
2025 make_xmm( 1 ) );
2026 STORE( func, *inst, 0, 0, chan_index );
2027 }
2028 break;
2029
2030 case TGSI_OPCODE_MAX:
2031 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2032 FETCH( func, *inst, 0, 0, chan_index );
2033 FETCH( func, *inst, 1, 1, chan_index );
2034 sse_maxps(
2035 func,
2036 make_xmm( 0 ),
2037 make_xmm( 1 ) );
2038 STORE( func, *inst, 0, 0, chan_index );
2039 }
2040 break;
2041
2042 case TGSI_OPCODE_SLT:
2043 emit_setcc( func, inst, cc_LessThan );
2044 break;
2045
2046 case TGSI_OPCODE_SGE:
2047 emit_setcc( func, inst, cc_NotLessThan );
2048 break;
2049
2050 case TGSI_OPCODE_MAD:
2051 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2052 FETCH( func, *inst, 0, 0, chan_index );
2053 FETCH( func, *inst, 1, 1, chan_index );
2054 FETCH( func, *inst, 2, 2, chan_index );
2055 emit_mul( func, 0, 1 );
2056 emit_add( func, 0, 2 );
2057 STORE( func, *inst, 0, 0, chan_index );
2058 }
2059 break;
2060
2061 case TGSI_OPCODE_SUB:
2062 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2063 FETCH( func, *inst, 0, 0, chan_index );
2064 FETCH( func, *inst, 1, 1, chan_index );
2065 emit_sub( func, 0, 1 );
2066 STORE( func, *inst, 0, 0, chan_index );
2067 }
2068 break;
2069
2070 case TGSI_OPCODE_LRP:
2071 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2072 FETCH( func, *inst, 0, 0, chan_index );
2073 FETCH( func, *inst, 1, 1, chan_index );
2074 FETCH( func, *inst, 2, 2, chan_index );
2075 emit_sub( func, 1, 2 );
2076 emit_mul( func, 0, 1 );
2077 emit_add( func, 0, 2 );
2078 STORE( func, *inst, 0, 0, chan_index );
2079 }
2080 break;
2081
2082 case TGSI_OPCODE_CND:
2083 return 0;
2084 break;
2085
2086 case TGSI_OPCODE_DP2A:
2087 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2088 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2089 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2090 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2091 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2092 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2093 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2094 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
2095 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2096 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2097 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2098 }
2099 break;
2100
2101 case TGSI_OPCODE_FRC:
2102 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2103 FETCH( func, *inst, 0, 0, chan_index );
2104 emit_frc( func, 0, 0 );
2105 STORE( func, *inst, 0, 0, chan_index );
2106 }
2107 break;
2108
2109 case TGSI_OPCODE_CLAMP:
2110 return 0;
2111 break;
2112
2113 case TGSI_OPCODE_FLR:
2114 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2115 FETCH( func, *inst, 0, 0, chan_index );
2116 emit_flr( func, 0, 0 );
2117 STORE( func, *inst, 0, 0, chan_index );
2118 }
2119 break;
2120
2121 case TGSI_OPCODE_ROUND:
2122 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2123 FETCH( func, *inst, 0, 0, chan_index );
2124 emit_rnd( func, 0, 0 );
2125 STORE( func, *inst, 0, 0, chan_index );
2126 }
2127 break;
2128
2129 case TGSI_OPCODE_EX2:
2130 FETCH( func, *inst, 0, 0, CHAN_X );
2131 emit_ex2( func, 0, 0 );
2132 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2133 STORE( func, *inst, 0, 0, chan_index );
2134 }
2135 break;
2136
2137 case TGSI_OPCODE_LG2:
2138 FETCH( func, *inst, 0, 0, CHAN_X );
2139 emit_lg2( func, 0, 0 );
2140 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2141 STORE( func, *inst, 0, 0, chan_index );
2142 }
2143 break;
2144
2145 case TGSI_OPCODE_POW:
2146 FETCH( func, *inst, 0, 0, CHAN_X );
2147 FETCH( func, *inst, 1, 1, CHAN_X );
2148 emit_pow( func, 0, 0, 0, 1 );
2149 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2150 STORE( func, *inst, 0, 0, chan_index );
2151 }
2152 break;
2153
2154 case TGSI_OPCODE_XPD:
2155 /* Note: we do all stores after all operands have been fetched
2156 * to avoid src/dst register aliasing issues for an instruction
2157 * such as: XPD TEMP[2].xyz, TEMP[0], TEMP[2];
2158 */
2159 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2160 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2161 FETCH( func, *inst, 1, 1, CHAN_Z ); /* xmm[1] = src[1].z */
2162 FETCH( func, *inst, 3, 0, CHAN_Z ); /* xmm[3] = src[0].z */
2163 }
2164 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2165 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2166 FETCH( func, *inst, 0, 0, CHAN_Y ); /* xmm[0] = src[0].y */
2167 FETCH( func, *inst, 4, 1, CHAN_Y ); /* xmm[4] = src[1].y */
2168 }
2169 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2170 emit_MOV( func, 7, 0 ); /* xmm[7] = xmm[0] */
2171 emit_mul( func, 7, 1 ); /* xmm[7] = xmm[2] * xmm[1] */
2172 emit_MOV( func, 5, 3 ); /* xmm[5] = xmm[3] */
2173 emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2174 emit_sub( func, 7, 5 ); /* xmm[7] = xmm[2] - xmm[5] */
2175 /* store xmm[7] in dst.x below */
2176 }
2177 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2178 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2179 FETCH( func, *inst, 2, 1, CHAN_X ); /* xmm[2] = src[1].x */
2180 FETCH( func, *inst, 5, 0, CHAN_X ); /* xmm[5] = src[0].x */
2181 }
2182 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2183 emit_mul( func, 3, 2 ); /* xmm[3] = xmm[3] * xmm[2] */
2184 emit_mul( func, 1, 5 ); /* xmm[1] = xmm[1] * xmm[5] */
2185 emit_sub( func, 3, 1 ); /* xmm[3] = xmm[3] - xmm[1] */
2186 /* store xmm[3] in dst.y below */
2187 }
2188 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2189 emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2190 emit_mul( func, 0, 2 ); /* xmm[0] = xmm[0] * xmm[2] */
2191 emit_sub( func, 5, 0 ); /* xmm[5] = xmm[5] - xmm[0] */
2192 STORE( func, *inst, 5, 0, CHAN_Z ); /* dst.z = xmm[5] */
2193 }
2194 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2195 STORE( func, *inst, 7, 0, CHAN_X ); /* dst.x = xmm[7] */
2196 }
2197 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2198 STORE( func, *inst, 3, 0, CHAN_Y ); /* dst.y = xmm[3] */
2199 }
2200 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2201 emit_tempf(
2202 func,
2203 0,
2204 TEMP_ONE_I,
2205 TEMP_ONE_C );
2206 STORE( func, *inst, 0, 0, CHAN_W );
2207 }
2208 break;
2209
2210 case TGSI_OPCODE_ABS:
2211 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2212 FETCH( func, *inst, 0, 0, chan_index );
2213 emit_abs( func, 0) ;
2214
2215 STORE( func, *inst, 0, 0, chan_index );
2216 }
2217 break;
2218
2219 case TGSI_OPCODE_RCC:
2220 return 0;
2221 break;
2222
2223 case TGSI_OPCODE_DPH:
2224 FETCH( func, *inst, 0, 0, CHAN_X );
2225 FETCH( func, *inst, 1, 1, CHAN_X );
2226 emit_mul( func, 0, 1 );
2227 FETCH( func, *inst, 1, 0, CHAN_Y );
2228 FETCH( func, *inst, 2, 1, CHAN_Y );
2229 emit_mul( func, 1, 2 );
2230 emit_add( func, 0, 1 );
2231 FETCH( func, *inst, 1, 0, CHAN_Z );
2232 FETCH( func, *inst, 2, 1, CHAN_Z );
2233 emit_mul( func, 1, 2 );
2234 emit_add( func, 0, 1 );
2235 FETCH( func, *inst, 1, 1, CHAN_W );
2236 emit_add( func, 0, 1 );
2237 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2238 STORE( func, *inst, 0, 0, chan_index );
2239 }
2240 break;
2241
2242 case TGSI_OPCODE_COS:
2243 FETCH( func, *inst, 0, 0, CHAN_X );
2244 emit_cos( func, 0, 0 );
2245 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2246 STORE( func, *inst, 0, 0, chan_index );
2247 }
2248 break;
2249
2250 case TGSI_OPCODE_DDX:
2251 return 0;
2252 break;
2253
2254 case TGSI_OPCODE_DDY:
2255 return 0;
2256 break;
2257
2258 case TGSI_OPCODE_KILP:
2259 /* predicated kill */
2260 emit_kilp( func );
2261 return 0; /* XXX fix me */
2262 break;
2263
2264 case TGSI_OPCODE_KIL:
2265 /* conditional kill */
2266 emit_kil( func, &inst->Src[0] );
2267 break;
2268
2269 case TGSI_OPCODE_PK2H:
2270 return 0;
2271 break;
2272
2273 case TGSI_OPCODE_PK2US:
2274 return 0;
2275 break;
2276
2277 case TGSI_OPCODE_PK4B:
2278 return 0;
2279 break;
2280
2281 case TGSI_OPCODE_PK4UB:
2282 return 0;
2283 break;
2284
2285 case TGSI_OPCODE_RFL:
2286 return 0;
2287 break;
2288
2289 case TGSI_OPCODE_SEQ:
2290 emit_setcc( func, inst, cc_Equal );
2291 break;
2292
2293 case TGSI_OPCODE_SFL:
2294 return 0;
2295 break;
2296
2297 case TGSI_OPCODE_SGT:
2298 emit_setcc( func, inst, cc_NotLessThanEqual );
2299 break;
2300
2301 case TGSI_OPCODE_SIN:
2302 FETCH( func, *inst, 0, 0, CHAN_X );
2303 emit_sin( func, 0, 0 );
2304 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2305 STORE( func, *inst, 0, 0, chan_index );
2306 }
2307 break;
2308
2309 case TGSI_OPCODE_SLE:
2310 emit_setcc( func, inst, cc_LessThanEqual );
2311 break;
2312
2313 case TGSI_OPCODE_SNE:
2314 emit_setcc( func, inst, cc_NotEqual );
2315 break;
2316
2317 case TGSI_OPCODE_STR:
2318 return 0;
2319 break;
2320
2321 case TGSI_OPCODE_TEX:
2322 emit_tex( func, inst, FALSE, FALSE );
2323 break;
2324
2325 case TGSI_OPCODE_TXD:
2326 return 0;
2327 break;
2328
2329 case TGSI_OPCODE_UP2H:
2330 return 0;
2331 break;
2332
2333 case TGSI_OPCODE_UP2US:
2334 return 0;
2335 break;
2336
2337 case TGSI_OPCODE_UP4B:
2338 return 0;
2339 break;
2340
2341 case TGSI_OPCODE_UP4UB:
2342 return 0;
2343 break;
2344
2345 case TGSI_OPCODE_X2D:
2346 return 0;
2347 break;
2348
2349 case TGSI_OPCODE_ARA:
2350 return 0;
2351 break;
2352
2353 case TGSI_OPCODE_ARR:
2354 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2355 FETCH( func, *inst, 0, 0, chan_index );
2356 emit_rnd( func, 0, 0 );
2357 emit_f2it( func, 0 );
2358 STORE( func, *inst, 0, 0, chan_index );
2359 }
2360 break;
2361
2362 case TGSI_OPCODE_BRA:
2363 return 0;
2364 break;
2365
2366 case TGSI_OPCODE_CAL:
2367 return 0;
2368 break;
2369
2370 case TGSI_OPCODE_RET:
2371 emit_ret( func );
2372 break;
2373
2374 case TGSI_OPCODE_END:
2375 break;
2376
2377 case TGSI_OPCODE_SSG:
2378 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2379 FETCH( func, *inst, 0, 0, chan_index );
2380 emit_sgn( func, 0, 0 );
2381 STORE( func, *inst, 0, 0, chan_index );
2382 }
2383 break;
2384
2385 case TGSI_OPCODE_CMP:
2386 emit_cmp (func, inst);
2387 break;
2388
2389 case TGSI_OPCODE_SCS:
2390 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2391 FETCH( func, *inst, 0, 0, CHAN_X );
2392 emit_cos( func, 0, 0 );
2393 STORE( func, *inst, 0, 0, CHAN_X );
2394 }
2395 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2396 FETCH( func, *inst, 0, 0, CHAN_X );
2397 emit_sin( func, 0, 0 );
2398 STORE( func, *inst, 0, 0, CHAN_Y );
2399 }
2400 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2401 emit_tempf(
2402 func,
2403 0,
2404 TGSI_EXEC_TEMP_00000000_I,
2405 TGSI_EXEC_TEMP_00000000_C );
2406 STORE( func, *inst, 0, 0, CHAN_Z );
2407 }
2408 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2409 emit_tempf(
2410 func,
2411 0,
2412 TEMP_ONE_I,
2413 TEMP_ONE_C );
2414 STORE( func, *inst, 0, 0, CHAN_W );
2415 }
2416 break;
2417
2418 case TGSI_OPCODE_TXB:
2419 emit_tex( func, inst, TRUE, FALSE );
2420 break;
2421
2422 case TGSI_OPCODE_NRM:
2423 /* fall-through */
2424 case TGSI_OPCODE_NRM4:
2425 /* 3 or 4-component normalization */
2426 {
2427 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2428
2429 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2430 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2431 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2432 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2433
2434 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2435
2436 /* xmm4 = src.x */
2437 /* xmm0 = src.x * src.x */
2438 FETCH(func, *inst, 0, 0, CHAN_X);
2439 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2440 emit_MOV(func, 4, 0);
2441 }
2442 emit_mul(func, 0, 0);
2443
2444 /* xmm5 = src.y */
2445 /* xmm0 = xmm0 + src.y * src.y */
2446 FETCH(func, *inst, 1, 0, CHAN_Y);
2447 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2448 emit_MOV(func, 5, 1);
2449 }
2450 emit_mul(func, 1, 1);
2451 emit_add(func, 0, 1);
2452
2453 /* xmm6 = src.z */
2454 /* xmm0 = xmm0 + src.z * src.z */
2455 FETCH(func, *inst, 1, 0, CHAN_Z);
2456 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2457 emit_MOV(func, 6, 1);
2458 }
2459 emit_mul(func, 1, 1);
2460 emit_add(func, 0, 1);
2461
2462 if (dims == 4) {
2463 /* xmm7 = src.w */
2464 /* xmm0 = xmm0 + src.w * src.w */
2465 FETCH(func, *inst, 1, 0, CHAN_W);
2466 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2467 emit_MOV(func, 7, 1);
2468 }
2469 emit_mul(func, 1, 1);
2470 emit_add(func, 0, 1);
2471 }
2472
2473 /* xmm1 = 1 / sqrt(xmm0) */
2474 emit_rsqrt(func, 1, 0);
2475
2476 /* dst.x = xmm1 * src.x */
2477 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2478 emit_mul(func, 4, 1);
2479 STORE(func, *inst, 4, 0, CHAN_X);
2480 }
2481
2482 /* dst.y = xmm1 * src.y */
2483 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2484 emit_mul(func, 5, 1);
2485 STORE(func, *inst, 5, 0, CHAN_Y);
2486 }
2487
2488 /* dst.z = xmm1 * src.z */
2489 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2490 emit_mul(func, 6, 1);
2491 STORE(func, *inst, 6, 0, CHAN_Z);
2492 }
2493
2494 /* dst.w = xmm1 * src.w */
2495 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2496 emit_mul(func, 7, 1);
2497 STORE(func, *inst, 7, 0, CHAN_W);
2498 }
2499 }
2500
2501 /* dst0.w = 1.0 */
2502 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2503 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2504 STORE(func, *inst, 0, 0, CHAN_W);
2505 }
2506 }
2507 break;
2508
2509 case TGSI_OPCODE_DIV:
2510 return 0;
2511 break;
2512
2513 case TGSI_OPCODE_DP2:
2514 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2515 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2516 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2517 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2518 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2519 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2520 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2521 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2522 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2523 }
2524 break;
2525
2526 case TGSI_OPCODE_TXL:
2527 return 0;
2528 break;
2529
2530 case TGSI_OPCODE_TXP:
2531 emit_tex( func, inst, FALSE, TRUE );
2532 break;
2533
2534 case TGSI_OPCODE_BRK:
2535 return 0;
2536 break;
2537
2538 case TGSI_OPCODE_IF:
2539 return 0;
2540 break;
2541
2542 case TGSI_OPCODE_ELSE:
2543 return 0;
2544 break;
2545
2546 case TGSI_OPCODE_ENDIF:
2547 return 0;
2548 break;
2549
2550 case TGSI_OPCODE_PUSHA:
2551 return 0;
2552 break;
2553
2554 case TGSI_OPCODE_POPA:
2555 return 0;
2556 break;
2557
2558 case TGSI_OPCODE_CEIL:
2559 return 0;
2560 break;
2561
2562 case TGSI_OPCODE_I2F:
2563 return 0;
2564 break;
2565
2566 case TGSI_OPCODE_NOT:
2567 return 0;
2568 break;
2569
2570 case TGSI_OPCODE_TRUNC:
2571 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2572 FETCH( func, *inst, 0, 0, chan_index );
2573 emit_f2it( func, 0 );
2574 emit_i2f( func, 0 );
2575 STORE( func, *inst, 0, 0, chan_index );
2576 }
2577 break;
2578
2579 case TGSI_OPCODE_SHL:
2580 return 0;
2581 break;
2582
2583 case TGSI_OPCODE_ISHR:
2584 return 0;
2585 break;
2586
2587 case TGSI_OPCODE_AND:
2588 return 0;
2589 break;
2590
2591 case TGSI_OPCODE_OR:
2592 return 0;
2593 break;
2594
2595 case TGSI_OPCODE_MOD:
2596 return 0;
2597 break;
2598
2599 case TGSI_OPCODE_XOR:
2600 return 0;
2601 break;
2602
2603 case TGSI_OPCODE_SAD:
2604 return 0;
2605 break;
2606
2607 case TGSI_OPCODE_TXF:
2608 return 0;
2609 break;
2610
2611 case TGSI_OPCODE_TXQ:
2612 return 0;
2613 break;
2614
2615 case TGSI_OPCODE_CONT:
2616 return 0;
2617 break;
2618
2619 case TGSI_OPCODE_EMIT:
2620 return 0;
2621 break;
2622
2623 case TGSI_OPCODE_ENDPRIM:
2624 return 0;
2625 break;
2626
2627 default:
2628 return 0;
2629 }
2630
2631 return 1;
2632 }
2633
2634 static void
2635 emit_declaration(
2636 struct x86_function *func,
2637 struct tgsi_full_declaration *decl )
2638 {
2639 if( decl->Declaration.File == TGSI_FILE_INPUT ||
2640 decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE ) {
2641 unsigned first, last, mask;
2642 unsigned i, j;
2643
2644 first = decl->Range.First;
2645 last = decl->Range.Last;
2646 mask = decl->Declaration.UsageMask;
2647
2648 for( i = first; i <= last; i++ ) {
2649 for( j = 0; j < NUM_CHANNELS; j++ ) {
2650 if( mask & (1 << j) ) {
2651 switch( decl->Declaration.Interpolate ) {
2652 case TGSI_INTERPOLATE_CONSTANT:
2653 emit_coef_a0( func, 0, i, j );
2654 emit_inputs( func, 0, i, j );
2655 break;
2656
2657 case TGSI_INTERPOLATE_LINEAR:
2658 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2659 emit_coef_dadx( func, 1, i, j );
2660 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2661 emit_coef_dady( func, 3, i, j );
2662 emit_mul( func, 0, 1 ); /* x * dadx */
2663 emit_coef_a0( func, 4, i, j );
2664 emit_mul( func, 2, 3 ); /* y * dady */
2665 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2666 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2667 emit_inputs( func, 0, i, j );
2668 break;
2669
2670 case TGSI_INTERPOLATE_PERSPECTIVE:
2671 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2672 emit_coef_dadx( func, 1, i, j );
2673 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2674 emit_coef_dady( func, 3, i, j );
2675 emit_mul( func, 0, 1 ); /* x * dadx */
2676 emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2677 emit_coef_a0( func, 5, i, j );
2678 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2679 emit_mul( func, 2, 3 ); /* y * dady */
2680 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2681 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2682 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2683 emit_inputs( func, 0, i, j );
2684 break;
2685
2686 default:
2687 assert( 0 );
2688 break;
2689 }
2690 }
2691 }
2692 }
2693 }
2694 }
2695
2696 static void aos_to_soa( struct x86_function *func,
2697 uint arg_aos,
2698 uint arg_machine,
2699 uint arg_num,
2700 uint arg_stride )
2701 {
2702 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2703 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2704 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2705 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2706 int inner_loop;
2707
2708
2709 /* Save EBX */
2710 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2711
2712 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2713 x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
2714 x86_lea( func, soa_input,
2715 x86_make_disp( soa_input,
2716 Offset(struct tgsi_exec_machine, Inputs) ) );
2717 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2718 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2719
2720 /* do */
2721 inner_loop = x86_get_label( func );
2722 {
2723 x86_push( func, aos_input );
2724 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2725 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2726 x86_add( func, aos_input, stride );
2727 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2728 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2729 x86_add( func, aos_input, stride );
2730 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2731 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2732 x86_add( func, aos_input, stride );
2733 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2734 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2735 x86_pop( func, aos_input );
2736
2737 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2738 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2739 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2740 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2741 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2742 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2743
2744 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2745 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2746 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2747 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2748
2749 /* Advance to next input */
2750 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2751 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2752 }
2753 /* while --num_inputs */
2754 x86_dec( func, num_inputs );
2755 x86_jcc( func, cc_NE, inner_loop );
2756
2757 /* Restore EBX */
2758 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2759 }
2760
2761 static void soa_to_aos( struct x86_function *func,
2762 uint arg_aos,
2763 uint arg_machine,
2764 uint arg_num,
2765 uint arg_stride )
2766 {
2767 struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2768 struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2769 struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2770 struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2771 int inner_loop;
2772
2773 /* Save EBX */
2774 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2775
2776 x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2777 x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2778 x86_lea( func, soa_output,
2779 x86_make_disp( soa_output,
2780 Offset(struct tgsi_exec_machine, Outputs) ) );
2781 x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2782
2783 /* do */
2784 inner_loop = x86_get_label( func );
2785 {
2786 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2787 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2788 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2789 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2790
2791 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2792 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2793 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2794 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2795 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2796 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2797
2798 x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2799 x86_push( func, aos_output );
2800 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2801 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2802 x86_add( func, aos_output, temp );
2803 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2804 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2805 x86_add( func, aos_output, temp );
2806 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2807 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2808 x86_add( func, aos_output, temp );
2809 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2810 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2811 x86_pop( func, aos_output );
2812
2813 /* Advance to next output */
2814 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2815 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2816 }
2817 /* while --num_outputs */
2818 x86_dec( func, num_outputs );
2819 x86_jcc( func, cc_NE, inner_loop );
2820
2821 /* Restore EBX */
2822 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2823 }
2824
2825
/**
 * Check if the instruction's dst register is the same as any src
 * register and warn if there's a possible SOA dependency.
 */
2830 static void
2831 check_soa_dependencies(const struct tgsi_full_instruction *inst)
2832 {
2833 switch (inst->Instruction.Opcode) {
2834 case TGSI_OPCODE_ADD:
2835 case TGSI_OPCODE_MOV:
2836 case TGSI_OPCODE_MUL:
2837 case TGSI_OPCODE_XPD:
2838 /* OK - these opcodes correctly handle SOA dependencies */
2839 break;
2840 default:
2841 if (tgsi_check_soa_dependencies(inst)) {
2842 uint opcode = inst->Instruction.Opcode;
2843
2844 /* XXX: we only handle src/dst aliasing in a few opcodes
2845 * currently. Need to use an additional temporay to hold
2846 * the result in the cases where the code is too opaque to
2847 * fix.
2848 */
2849 if (opcode != TGSI_OPCODE_MOV) {
2850 debug_printf("Warning: src/dst aliasing in instruction"
2851 " is not handled:\n");
2852 tgsi_dump_instruction(inst, 1);
2853 }
2854 }
2855 }
2856 }
2857
2858
/**
 * Translate a TGSI vertex/fragment shader to SSE2 code.
 * Slightly different things are done for vertex vs. fragment shaders.
 *
 * \param tokens  the TGSI input shader
 * \param func  the output SSE code/function
 * \param immediates  buffer to place immediates, later passed to SSE func
 * \param do_swizzles  for vertex shaders, convert AoS inputs to SoA on
 *                     entry and SoA outputs back to AoS on exit
 * \return 1 for success, 0 if translation failed
 */
2868 unsigned
2869 tgsi_emit_sse2(
2870 const struct tgsi_token *tokens,
2871 struct x86_function *func,
2872 float (*immediates)[4],
2873 boolean do_swizzles )
2874 {
2875 struct tgsi_parse_context parse;
2876 unsigned ok = 1;
2877 uint num_immediates = 0;
2878
2879 util_init_math();
2880
2881 func->csr = func->store;
2882
2883 tgsi_parse_init( &parse, tokens );
2884
2885 /* Can't just use EDI, EBX without save/restoring them:
2886 */
2887 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2888 x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2889
2890 /*
2891 * Different function args for vertex/fragment shaders:
2892 */
2893 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2894 if (do_swizzles)
2895 aos_to_soa( func,
2896 4, /* aos_input */
2897 1, /* machine */
2898 5, /* num_inputs */
2899 6 ); /* input_stride */
2900 }
2901
2902 x86_mov(
2903 func,
2904 get_machine_base(),
2905 x86_fn_arg( func, 1 ) );
2906 x86_mov(
2907 func,
2908 get_const_base(),
2909 x86_fn_arg( func, 2 ) );
2910 x86_mov(
2911 func,
2912 get_immediate_base(),
2913 x86_fn_arg( func, 3 ) );
2914
2915 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2916 x86_mov(
2917 func,
2918 get_coef_base(),
2919 x86_fn_arg( func, 4 ) );
2920 }
2921
2922 x86_mov(
2923 func,
2924 get_sampler_base(),
2925 x86_make_disp( get_machine_base(),
2926 Offset( struct tgsi_exec_machine, Samplers ) ) );
2927
2928 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2929 tgsi_parse_token( &parse );
2930
2931 switch( parse.FullToken.Token.Type ) {
2932 case TGSI_TOKEN_TYPE_DECLARATION:
2933 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2934 emit_declaration(
2935 func,
2936 &parse.FullToken.FullDeclaration );
2937 }
2938 break;
2939
2940 case TGSI_TOKEN_TYPE_INSTRUCTION:
2941 ok = emit_instruction(
2942 func,
2943 &parse.FullToken.FullInstruction );
2944
2945 if (!ok) {
2946 uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
2947 uint proc = parse.FullHeader.Processor.Processor;
2948 debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
2949 opcode,
2950 tgsi_get_opcode_name(opcode),
2951 tgsi_get_processor_name(proc));
2952 }
2953
2954 check_soa_dependencies(&parse.FullToken.FullInstruction);
2955 break;
2956
2957 case TGSI_TOKEN_TYPE_IMMEDIATE:
2958 /* simply copy the immediate values into the next immediates[] slot */
2959 {
2960 const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
2961 uint i;
2962 assert(size <= 4);
2963 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2964 for( i = 0; i < size; i++ ) {
2965 immediates[num_immediates][i] =
2966 parse.FullToken.FullImmediate.u[i].Float;
2967 }
2968 #if 0
2969 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2970 num_immediates,
2971 immediates[num_immediates][0],
2972 immediates[num_immediates][1],
2973 immediates[num_immediates][2],
2974 immediates[num_immediates][3]);
2975 #endif
2976 num_immediates++;
2977 }
2978 break;
2979 case TGSI_TOKEN_TYPE_PROPERTY:
2980 /* we just ignore them for now */
2981 break;
2982
2983 default:
2984 ok = 0;
2985 assert( 0 );
2986 }
2987 }
2988
2989 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2990 if (do_swizzles)
2991 soa_to_aos( func,
2992 7, /* aos_output */
2993 1, /* machine */
2994 8, /* num_outputs */
2995 9 ); /* output_stride */
2996 }
2997
2998 /* Can't just use EBX, EDI without save/restoring them:
2999 */
3000 x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
3001 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
3002
3003 emit_ret( func );
3004
3005 tgsi_parse_free( &parse );
3006
3007 return ok;
3008 }
3009
3010 #endif /* PIPE_ARCH_X86 */