tgsi: simplify and fix sse KIL implementation
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86)
31
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 #if defined(PIPE_ARCH_SSE)
37 #include "util/u_sse.h"
38 #endif
39 #include "tgsi/tgsi_parse.h"
40 #include "tgsi/tgsi_util.h"
41 #include "tgsi_exec.h"
42 #include "tgsi_sse2.h"
43
44 #include "rtasm/rtasm_x86sse.h"
45
46 /* for 1/sqrt()
47 *
48 * This costs about 100fps (close to 10%) in gears:
49 */
50 #define HIGH_PRECISION 1
51
52 #define FAST_MATH 1
53
54
55 #define FOR_EACH_CHANNEL( CHAN )\
56 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
57
58 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
59 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
60
61 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
62 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
63
64 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
65 FOR_EACH_CHANNEL( CHAN )\
66 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
67
68 #define CHAN_X 0
69 #define CHAN_Y 1
70 #define CHAN_Z 2
71 #define CHAN_W 3
72
73 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
74 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
75
76 #define TEMP_R0 TGSI_EXEC_TEMP_R0
77 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
78 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
79 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
80
81
82 /**
83 * X86 utility functions.
84 */
85
86 static struct x86_reg
87 make_xmm(
88 unsigned xmm )
89 {
90 return x86_make_reg(
91 file_XMM,
92 (enum x86_reg_name) xmm );
93 }
94
95 /**
96 * X86 register mapping helpers.
97 */
98
99 static struct x86_reg
100 get_const_base( void )
101 {
102 return x86_make_reg(
103 file_REG32,
104 reg_AX );
105 }
106
107 static struct x86_reg
108 get_machine_base( void )
109 {
110 return x86_make_reg(
111 file_REG32,
112 reg_CX );
113 }
114
115 static struct x86_reg
116 get_input_base( void )
117 {
118 return x86_make_disp(
119 get_machine_base(),
120 Offset(struct tgsi_exec_machine, Inputs) );
121 }
122
123 static struct x86_reg
124 get_output_base( void )
125 {
126 return x86_make_disp(
127 get_machine_base(),
128 Offset(struct tgsi_exec_machine, Outputs) );
129 }
130
131 static struct x86_reg
132 get_temp_base( void )
133 {
134 return x86_make_disp(
135 get_machine_base(),
136 Offset(struct tgsi_exec_machine, Temps) );
137 }
138
139 static struct x86_reg
140 get_coef_base( void )
141 {
142 return x86_make_reg(
143 file_REG32,
144 reg_BX );
145 }
146
147 static struct x86_reg
148 get_sampler_base( void )
149 {
150 return x86_make_reg(
151 file_REG32,
152 reg_DI );
153 }
154
155 static struct x86_reg
156 get_immediate_base( void )
157 {
158 return x86_make_reg(
159 file_REG32,
160 reg_DX );
161 }
162
163
164 /**
165 * Data access helpers.
166 */
167
168
169 static struct x86_reg
170 get_immediate(
171 unsigned vec,
172 unsigned chan )
173 {
174 return x86_make_disp(
175 get_immediate_base(),
176 (vec * 4 + chan) * 4 );
177 }
178
179 static struct x86_reg
180 get_const(
181 unsigned vec,
182 unsigned chan )
183 {
184 return x86_make_disp(
185 get_const_base(),
186 (vec * 4 + chan) * 4 );
187 }
188
189 static struct x86_reg
190 get_sampler_ptr(
191 unsigned unit )
192 {
193 return x86_make_disp(
194 get_sampler_base(),
195 unit * sizeof( struct tgsi_sampler * ) );
196 }
197
198 static struct x86_reg
199 get_input(
200 unsigned vec,
201 unsigned chan )
202 {
203 return x86_make_disp(
204 get_input_base(),
205 (vec * 4 + chan) * 16 );
206 }
207
208 static struct x86_reg
209 get_output(
210 unsigned vec,
211 unsigned chan )
212 {
213 return x86_make_disp(
214 get_output_base(),
215 (vec * 4 + chan) * 16 );
216 }
217
218 static struct x86_reg
219 get_temp(
220 unsigned vec,
221 unsigned chan )
222 {
223 return x86_make_disp(
224 get_temp_base(),
225 (vec * 4 + chan) * 16 );
226 }
227
228 static struct x86_reg
229 get_coef(
230 unsigned vec,
231 unsigned chan,
232 unsigned member )
233 {
234 return x86_make_disp(
235 get_coef_base(),
236 ((vec * 3 + member) * 4 + chan) * 4 );
237 }
238
239
/** Emit a function return instruction. */
static void
emit_ret(
   struct x86_function *func )
{
   x86_ret( func );
}
246
247
248 /**
249 * Data fetch helpers.
250 */
251
/**
 * Copy a shader constant to an xmm register.
 * \param xmm the destination xmm register
 * \param vec the src const buffer index
 * \param chan src channel to fetch (X, Y, Z or W)
 * \param indirect non-zero when addressing CONST[ADDR+vec]
 * \param indirectFile register file of the indirect index (must be ADDRESS)
 * \param indirectIndex register index of the indirect index (must be 0)
 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_input_base();
      struct x86_reg r1 = get_output_base();
      uint i;

      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );

      /* r0/r1 (eax-relative operands) are repurposed as scratch GP regs
       * here, so preserve their original contents around the loop.
       */
      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage.  It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
         /* load the constant and stash it in TEMP_R0[i] */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* finally, load the four gathered values into the xmm register */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* scalar load, then broadcast to all four lanes */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
343
344 static void
345 emit_immediate(
346 struct x86_function *func,
347 unsigned xmm,
348 unsigned vec,
349 unsigned chan )
350 {
351 sse_movss(
352 func,
353 make_xmm( xmm ),
354 get_immediate( vec, chan ) );
355 sse_shufps(
356 func,
357 make_xmm( xmm ),
358 make_xmm( xmm ),
359 SHUF( 0, 0, 0, 0 ) );
360 }
361
362
/**
 * Copy a shader input to an xmm register.
 * \param xmm the destination xmm register
 * \param vec the src input attrib
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}
381
/**
 * Store an xmm register to a shader output.
 * \param xmm the source xmm register
 * \param vec the dest output attrib
 * \param chan dest channel to store (X, Y, Z or W)
 */
static void
emit_output(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}
400
/**
 * Copy a shader temporary to an xmm register.
 * \param xmm the destination xmm register
 * \param vec the src temp register
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_tempf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
419
420 /**
421 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
422 * \param xmm the destination xmm register
423 * \param vec the src input/attribute coefficient index
424 * \param chan src channel to fetch (X, Y, Z or W)
425 * \param member 0=a0, 1=dadx, 2=dady
426 */
427 static void
428 emit_coef(
429 struct x86_function *func,
430 unsigned xmm,
431 unsigned vec,
432 unsigned chan,
433 unsigned member )
434 {
435 sse_movss(
436 func,
437 make_xmm( xmm ),
438 get_coef( vec, chan, member ) );
439 sse_shufps(
440 func,
441 make_xmm( xmm ),
442 make_xmm( xmm ),
443 SHUF( 0, 0, 0, 0 ) );
444 }
445
446 /**
447 * Data store helpers.
448 */
449
/** Store an xmm register back to a shader input slot (unaligned). */
static void
emit_inputs(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, get_input( vec, chan ), make_xmm( xmm ) );
}
462
/** Store an xmm register to a shader temporary (aligned). */
static void
emit_temps(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps( func, get_temp( vec, chan ), make_xmm( xmm ) );
}
475
476 static void
477 emit_addrs(
478 struct x86_function *func,
479 unsigned xmm,
480 unsigned vec,
481 unsigned chan )
482 {
483 assert( vec == 0 );
484
485 emit_temps(
486 func,
487 xmm,
488 vec + TGSI_EXEC_TEMP_ADDR,
489 chan );
490 }
491
492 /**
493 * Coefficent fetch helpers.
494 */
495
/** Fetch the a0 (constant term) coefficient for an input attribute. */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}
510
/** Fetch the dadx (x-derivative) coefficient for an input attribute. */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}
525
/** Fetch the dady (y-derivative) coefficient for an input attribute. */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
540
541 /**
542 * Function call helpers.
543 */
544
/**
 * Emit a call to a cdecl C helper function, preserving machine state.
 *
 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
 * that the stack pointer is 16 byte aligned, as expected.
 *
 * \param xmm_save_mask bitmask of xmm registers to save/restore around the call
 * \param arg array of memory operands whose addresses are pushed as arguments
 * \param nr_args number of entries in \p arg
 * \param code address of the function to call
 */
static void
emit_func_call(
   struct x86_function *func,
   unsigned xmm_save_mask,
   const struct x86_reg *arg,
   unsigned nr_args,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned i, n;

   /* Save the caller-saved GP registers (eax/ecx/edx) that the callee
    * or this stub may clobber.
    */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX) );

   /* Store XMM regs to the stack
    */
   /* first pass: count how many regs are selected by the mask */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i))
         ++n;

   x86_sub_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* second pass: spill each selected reg to its stack slot */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
            make_xmm( i ) );
         ++n;
      }

   for (i = 0; i < nr_args; i++) {
      /* Load the address of the buffer we use for passing arguments and
       * receiving results:
       */
      x86_lea(
         func,
         ecx,
         arg[i] );

      /* Push actual function arguments (currently just the pointer to
       * the buffer above), and call the function:
       */
      x86_push( func, ecx );
   }

   x86_mov_reg_imm( func, ecx, (unsigned long) code );
   x86_call( func, ecx );

   /* Pop the arguments (or just add an immediate to esp)
    */
   for (i = 0; i < nr_args; i++) {
      x86_pop(func, ecx );
   }

   /* Pop the saved XMM regs:
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            make_xmm( i ),
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
         ++n;
      }

   /* n again equals the count of saved xmm regs, so this exactly undoes
    * the x86_sub_imm() above.
    */
   x86_add_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Restore GP registers in a reverse order.
    */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX) );
}
643
644 static void
645 emit_func_call_dst_src1(
646 struct x86_function *func,
647 unsigned xmm_save,
648 unsigned xmm_dst,
649 unsigned xmm_src0,
650 void (PIPE_CDECL *code)() )
651 {
652 struct x86_reg store = get_temp( TEMP_R0, 0 );
653 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
654
655 /* Store our input parameters (in xmm regs) to the buffer we use
656 * for passing arguments. We will pass a pointer to this buffer as
657 * the actual function argument.
658 */
659 sse_movaps(
660 func,
661 store,
662 make_xmm( xmm_src0 ) );
663
664 emit_func_call( func,
665 xmm_mask,
666 &store,
667 1,
668 code );
669
670 sse_movaps(
671 func,
672 make_xmm( xmm_dst ),
673 store );
674 }
675
676
677 static void
678 emit_func_call_dst_src2(
679 struct x86_function *func,
680 unsigned xmm_save,
681 unsigned xmm_dst,
682 unsigned xmm_src0,
683 unsigned xmm_src1,
684 void (PIPE_CDECL *code)() )
685 {
686 struct x86_reg store = get_temp( TEMP_R0, 0 );
687 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
688
689 /* Store two inputs to parameter buffer.
690 */
691 sse_movaps(
692 func,
693 store,
694 make_xmm( xmm_src0 ) );
695
696 sse_movaps(
697 func,
698 x86_make_disp( store, 4 * sizeof(float) ),
699 make_xmm( xmm_src1 ) );
700
701
702 /* Emit the call
703 */
704 emit_func_call( func,
705 xmm_mask,
706 &store,
707 1,
708 code );
709
710 /* Retrieve the results:
711 */
712 sse_movaps(
713 func,
714 make_xmm( xmm_dst ),
715 store );
716 }
717
718
719
720
721
722 #if defined(PIPE_ARCH_SSE)
723
724 /*
725 * Fast SSE2 implementation of special math functions.
726 */
727
728 #define POLY0(x, c0) _mm_set1_ps(c0)
729 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
730 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
731 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
732 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
733 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
734
735 #define EXP_POLY_DEGREE 3
736 #define LOG_POLY_DEGREE 5
737
/**
 * Fast vectorized approximation of 2^x for four floats.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   /* clamp x so the biased-exponent construction below stays in range */
   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart)
    * -- built by placing (ipart + 127) directly into the float exponent field
    */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   /* 2^x = 2^ipart * 2^fpart */
   return _mm_mul_ps(expipart, expfpart);
}
774
775
/**
 * Fast vectorized approximation of log2(x) for four floats.
 *
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
log2f4(__m128 x)
{
   __m128i expmask = _mm_set1_epi32(0x7f800000);   /* IEEE-754 exponent bits */
   __m128i mantmask = _mm_set1_epi32(0x007fffff);  /* IEEE-754 mantissa bits */
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x), normalized into [1, 2[ by OR-ing in 1.0 */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generate with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   /* log2(x) = exponent + log2(mantissa) */
   return _mm_add_ps(logmant, exp);
}
817
818
819 static INLINE __m128
820 powf4(__m128 x, __m128 y)
821 {
822 return exp2f4(_mm_mul_ps(log2f4(x), y));
823 }
824
825 #endif /* PIPE_ARCH_SSE */
826
827
828
829 /**
830 * Low-level instruction translators.
831 */
832
833 static void
834 emit_abs(
835 struct x86_function *func,
836 unsigned xmm )
837 {
838 sse_andps(
839 func,
840 make_xmm( xmm ),
841 get_temp(
842 TGSI_EXEC_TEMP_7FFFFFFF_I,
843 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
844 }
845
/** xmm_dst += xmm_src (packed float add). */
static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
857
858 static void PIPE_CDECL
859 cos4f(
860 float *store )
861 {
862 store[0] = cosf( store[0] );
863 store[1] = cosf( store[1] );
864 store[2] = cosf( store[2] );
865 store[3] = cosf( store[3] );
866 }
867
868 static void
869 emit_cos(
870 struct x86_function *func,
871 unsigned xmm_save,
872 unsigned xmm_dst )
873 {
874 emit_func_call_dst_src1(
875 func,
876 xmm_save,
877 xmm_dst,
878 xmm_dst,
879 cos4f );
880 }
881
/**
 * Helper called from generated code: store[i] = 2^store[i] for i = 0..3.
 * Uses the vectorized exp2f4() when SSE is available, otherwise a scalar
 * approximation per element.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_exp2( store[0] );
   store[1] = util_fast_exp2( store[1] );
   store[2] = util_fast_exp2( store[2] );
   store[3] = util_fast_exp2( store[3] );
#endif
}
898
899 static void
900 emit_ex2(
901 struct x86_function *func,
902 unsigned xmm_save,
903 unsigned xmm_dst )
904 {
905 emit_func_call_dst_src1(
906 func,
907 xmm_save,
908 xmm_dst,
909 xmm_dst,
910 ex24f );
911 }
912
/** Convert 4 floats to 4 ints with truncation (CVTTPS2DQ), in place. */
static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) );
}
923
/** Convert 4 ints to 4 floats (CVTDQ2PS), in place. */
static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
}
934
935 static void PIPE_CDECL
936 flr4f(
937 float *store )
938 {
939 store[0] = floorf( store[0] );
940 store[1] = floorf( store[1] );
941 store[2] = floorf( store[2] );
942 store[3] = floorf( store[3] );
943 }
944
945 static void
946 emit_flr(
947 struct x86_function *func,
948 unsigned xmm_save,
949 unsigned xmm_dst )
950 {
951 emit_func_call_dst_src1(
952 func,
953 xmm_save,
954 xmm_dst,
955 xmm_dst,
956 flr4f );
957 }
958
959 static void PIPE_CDECL
960 frc4f(
961 float *store )
962 {
963 store[0] -= floorf( store[0] );
964 store[1] -= floorf( store[1] );
965 store[2] -= floorf( store[2] );
966 store[3] -= floorf( store[3] );
967 }
968
969 static void
970 emit_frc(
971 struct x86_function *func,
972 unsigned xmm_save,
973 unsigned xmm_dst )
974 {
975 emit_func_call_dst_src1(
976 func,
977 xmm_save,
978 xmm_dst,
979 xmm_dst,
980 frc4f );
981 }
982
/**
 * Helper called from generated code: store[i] = log2(store[i]) for i = 0..3.
 * Uses the vectorized log2f4() when SSE is available, otherwise a scalar
 * approximation per element.
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_log2( store[0] );
   store[1] = util_fast_log2( store[1] );
   store[2] = util_fast_log2( store[2] );
   store[3] = util_fast_log2( store[3] );
#endif
}
999
1000 static void
1001 emit_lg2(
1002 struct x86_function *func,
1003 unsigned xmm_save,
1004 unsigned xmm_dst )
1005 {
1006 emit_func_call_dst_src1(
1007 func,
1008 xmm_save,
1009 xmm_dst,
1010 xmm_dst,
1011 lg24f );
1012 }
1013
/** Copy one xmm register to another (unaligned packed move). */
static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1025
/** xmm_dst *= xmm_src (packed float multiply). */
static void
emit_mul (struct x86_function *func,
          unsigned xmm_dst,
          unsigned xmm_src)
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1036
1037 static void
1038 emit_neg(
1039 struct x86_function *func,
1040 unsigned xmm )
1041 {
1042 sse_xorps(
1043 func,
1044 make_xmm( xmm ),
1045 get_temp(
1046 TGSI_EXEC_TEMP_80000000_I,
1047 TGSI_EXEC_TEMP_80000000_C ) );
1048 }
1049
/**
 * Helper called from generated code: store[i] = store[i] ^ store[i+4]
 * for i = 0..3 (bases in the first vector, exponents in the second,
 * results written over the bases).
 */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
   store[0] = util_fast_pow( store[0], store[4] );
   store[1] = util_fast_pow( store[1], store[5] );
   store[2] = util_fast_pow( store[2], store[6] );
   store[3] = util_fast_pow( store[3], store[7] );
#endif
}
1066
1067 static void
1068 emit_pow(
1069 struct x86_function *func,
1070 unsigned xmm_save,
1071 unsigned xmm_dst,
1072 unsigned xmm_src0,
1073 unsigned xmm_src1 )
1074 {
1075 emit_func_call_dst_src2(
1076 func,
1077 xmm_save,
1078 xmm_dst,
1079 xmm_src0,
1080 xmm_src1,
1081 pow4f );
1082 }
1083
/**
 * xmm_dst = approximate reciprocal of xmm_src (RCPPS).
 *
 * On Intel CPUs at least, this is only accurate to 12 bits -- not
 * good enough.  Need to either emit a proper divide or use the
 * iterative technique described below in emit_rsqrt().
 */
static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1099
1100 static void PIPE_CDECL
1101 rnd4f(
1102 float *store )
1103 {
1104 store[0] = floorf( store[0] + 0.5f );
1105 store[1] = floorf( store[1] + 0.5f );
1106 store[2] = floorf( store[2] + 0.5f );
1107 store[3] = floorf( store[3] + 0.5f );
1108 }
1109
1110 static void
1111 emit_rnd(
1112 struct x86_function *func,
1113 unsigned xmm_save,
1114 unsigned xmm_dst )
1115 {
1116 emit_func_call_dst_src1(
1117 func,
1118 xmm_save,
1119 xmm_dst,
1120 xmm_dst,
1121 rnd4f );
1122 }
1123
/**
 * xmm_dst = 1/sqrt(xmm_src).
 *
 * NOTE(review): the HIGH_PRECISION path clobbers xmm_src and uses xmm2/xmm3
 * as scratch -- see the asserts below; callers must respect that.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );   /* dst = 0.5 */
      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); /* tmp0 = 3.0 */
      sse_rsqrtps( func, tmp1, src  );      /* tmp1 = rsqrtps(a) */
      sse_mulps(   func, src,  tmp1 );      /* src = a * r */
      sse_mulps(   func, dst,  tmp1 );      /* dst = 0.5 * r */
      sse_mulps(   func, src,  tmp1 );      /* src = a * r * r */
      sse_subps(   func, tmp0, src  );      /* tmp0 = 3.0 - a*r*r */
      sse_mulps(   func, dst,  tmp0 );      /* dst = 0.5 * r * (3.0 - a*r*r) */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1169
1170 static void
1171 emit_setsign(
1172 struct x86_function *func,
1173 unsigned xmm )
1174 {
1175 sse_orps(
1176 func,
1177 make_xmm( xmm ),
1178 get_temp(
1179 TGSI_EXEC_TEMP_80000000_I,
1180 TGSI_EXEC_TEMP_80000000_C ) );
1181 }
1182
1183 static void PIPE_CDECL
1184 sgn4f(
1185 float *store )
1186 {
1187 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1188 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1189 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1190 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1191 }
1192
1193 static void
1194 emit_sgn(
1195 struct x86_function *func,
1196 unsigned xmm_save,
1197 unsigned xmm_dst )
1198 {
1199 emit_func_call_dst_src1(
1200 func,
1201 xmm_save,
1202 xmm_dst,
1203 xmm_dst,
1204 sgn4f );
1205 }
1206
1207 static void PIPE_CDECL
1208 sin4f(
1209 float *store )
1210 {
1211 store[0] = sinf( store[0] );
1212 store[1] = sinf( store[1] );
1213 store[2] = sinf( store[2] );
1214 store[3] = sinf( store[3] );
1215 }
1216
1217 static void
1218 emit_sin (struct x86_function *func,
1219 unsigned xmm_save,
1220 unsigned xmm_dst)
1221 {
1222 emit_func_call_dst_src1(
1223 func,
1224 xmm_save,
1225 xmm_dst,
1226 xmm_dst,
1227 sin4f );
1228 }
1229
/** xmm_dst -= xmm_src (packed float subtract). */
static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1241
1242
1243
1244
1245
1246
1247
/**
 * Register fetch: load one channel of a TGSI source operand into an xmm
 * register, honouring the operand's extended swizzle and sign mode.
 *
 * \param xmm the destination xmm register
 * \param reg the full TGSI source register descriptor
 * \param chan_index the destination channel being generated
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      /* plain component select: dispatch on the source register file */
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      /* constant 0.0 comes from a pre-initialized machine temp */
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      /* constant 1.0 likewise */
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   /* apply the operand's sign mode to the fetched value */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}
1344
1345 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1346 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1347
/**
 * Register store: write an xmm register to one channel of a TGSI
 * destination operand, dispatching on the destination register file.
 *
 * NOTE: saturation is not implemented -- TGSI_SAT_ZERO_ONE is silently
 * ignored (commented-out assert below) and TGSI_SAT_MINUS_PLUS_ONE asserts.
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }

   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* assert( 0 ); */
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }
}
1402
1403 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1404 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1405
1406
/**
 * Called via emit_func_call() from generated code to sample a texture.
 *
 * On entry store[0..3], store[4..7], store[8..11] hold the s, t, p
 * texcoords for the four pixels of the quad (SOA layout) and store[12]
 * holds the lodbias; on exit the sampled RGBA results overwrite
 * store[0..15].
 *
 * NOTE: lodbias is currently forced to 0.0 -- see the get_samples() call.
 */
static void PIPE_CDECL
fetch_texel( struct tgsi_sampler **sampler,
             float *store )
{
#if 0
   uint j;

   debug_printf("%s sampler: %p (%p) store: %p\n",
                __FUNCTION__,
                sampler, *sampler,
                store );

   debug_printf("lodbias %f\n", store[12]);

   for (j = 0; j < 4; j++)
      debug_printf("sample %d texcoord %f %f\n",
                   j,
                   store[0+j],
                   store[4+j]);
#endif

   {
      float rgba[NUM_CHANNELS][QUAD_SIZE];
      (*sampler)->get_samples(*sampler,
                              &store[0],
                              &store[4],
                              &store[8],
                              0.0f, /*store[12],  lodbias */
                              rgba);

      /* write the SOA results back over the argument buffer */
      memcpy( store, rgba, 16 * sizeof(float));
   }

#if 0
   for (j = 0; j < 4; j++)
      debug_printf("sample %d result %f %f %f %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8+j],
                   store[12+j]);
#endif
}
1450
1451 /**
1452 * High-level instruction translators.
1453 */
1454
/**
 * Emit code for a texture sampling instruction (TEX/TXB/TXL/TXP).
 *
 * Texcoords (and lodbias) are marshalled through the TEMP_R0 scratch
 * area, then fetch_texel() is called out-of-line to do the actual
 * sampling; results are copied back to the destination channels.
 *
 * \param lodbias    fetch src0.w as a lod bias (TXB/TXL path)
 * \param projected  divide texcoords by src0.w (TXP path)
 */
static void
emit_tex( struct x86_function *func,
          const struct tgsi_full_instruction *inst,
          boolean lodbias,
          boolean projected)
{
   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
   struct x86_reg args[2];
   unsigned count;
   unsigned i;

   /* Number of texcoord components needed, by texture target. */
   switch (inst->InstructionExtTexture.Texture) {
   case TGSI_TEXTURE_1D:
   case TGSI_TEXTURE_SHADOW1D:
      count = 1;
      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
      count = 2;
      break;
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
      count = 3;
      break;
   default:
      assert(0);
      return;
   }

   /* xmm3 = lodbias (src0.w) if requested, else 0.0 */
   if (lodbias) {
      FETCH( func, *inst, 3, 0, 3 );
   }
   else {
      emit_tempf(
         func,
         3,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );

   }

   /* store lodbias whether enabled or not -- fetch_texel currently
    * respects it always.
    * NOTE(review): fetch_texel() above actually passes 0.0 to
    * get_samples(), so the stored value appears unused -- confirm.
    */
   sse_movaps( func,
               get_temp( TEMP_R0, 3 ),
               make_xmm( 3 ) );


   if (projected) {
      /* xmm3 = 1 / src0.w, for perspective division of the coords */
      FETCH( func, *inst, 3, 0, 3 );

      emit_rcp( func, 3, 3 );
   }

   /* Fetch each coord (scaled by 1/w if projecting) into the arg buffer. */
   for (i = 0; i < count; i++) {
      FETCH( func, *inst, i, 0, i );

      if (projected) {
         sse_mulps(
            func,
            make_xmm( i ),
            make_xmm( 3 ) );
      }

      /* Store in the argument buffer:
       */
      sse_movaps(
         func,
         get_temp( TEMP_R0, i ),
         make_xmm( i ) );
   }

   /* Call fetch_texel(sampler_ptr, TEMP_R0) out-of-line. */
   args[0] = get_temp( TEMP_R0, 0 );
   args[1] = get_sampler_ptr( unit );


   emit_func_call( func,
                   0,
                   args,
                   Elements(args),
                   fetch_texel );

   /* If all four channels are enabled, could use a pointer to
    * dst[0].x instead of TEMP_R0 for store?
    */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {

      /* Copy sampled channel i back from the scratch area. */
      sse_movaps(
         func,
         make_xmm( 0 ),
         get_temp( TEMP_R0, i ) );

      STORE( func, *inst, 0, 0, i );
   }
}
1553
1554
/**
 * Emit code for TGSI_OPCODE_KIL: kill fragments whose selected source
 * components are negative.
 *
 * Each *distinct* swizzled component is fetched once (into xmm0..xmmN,
 * in order), compared against zero, and the per-pixel masks are OR'ed
 * together in EAX before being OR'ed into the machine's kill mask.
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested. */
   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_extswizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   /* EAX/EDX are used as scratch below; preserve their values. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      /* xmm[i] = all-ones lanes where the component is < 0 */
      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      /* Accumulate the 4-bit per-pixel sign mask into EAX. */
      if( i == 0 ) {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   /* Merge the accumulated mask into the machine's kill mask. */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1641
1642
/**
 * Emit code for TGSI_OPCODE_KILP (unconditional/predicated kill).
 * Not implemented: emits nothing; the caller returns 0 so the shader
 * falls back to the interpreter.
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1649
1650
1651 static void
1652 emit_setcc(
1653 struct x86_function *func,
1654 struct tgsi_full_instruction *inst,
1655 enum sse_cc cc )
1656 {
1657 unsigned chan_index;
1658
1659 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1660 FETCH( func, *inst, 0, 0, chan_index );
1661 FETCH( func, *inst, 1, 1, chan_index );
1662 sse_cmpps(
1663 func,
1664 make_xmm( 0 ),
1665 make_xmm( 1 ),
1666 cc );
1667 sse_andps(
1668 func,
1669 make_xmm( 0 ),
1670 get_temp(
1671 TEMP_ONE_I,
1672 TEMP_ONE_C ) );
1673 STORE( func, *inst, 0, 0, chan_index );
1674 }
1675 }
1676
1677 static void
1678 emit_cmp(
1679 struct x86_function *func,
1680 struct tgsi_full_instruction *inst )
1681 {
1682 unsigned chan_index;
1683
1684 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1685 FETCH( func, *inst, 0, 0, chan_index );
1686 FETCH( func, *inst, 1, 1, chan_index );
1687 FETCH( func, *inst, 2, 2, chan_index );
1688 sse_cmpps(
1689 func,
1690 make_xmm( 0 ),
1691 get_temp(
1692 TGSI_EXEC_TEMP_00000000_I,
1693 TGSI_EXEC_TEMP_00000000_C ),
1694 cc_LessThan );
1695 sse_andps(
1696 func,
1697 make_xmm( 1 ),
1698 make_xmm( 0 ) );
1699 sse_andnps(
1700 func,
1701 make_xmm( 0 ),
1702 make_xmm( 2 ) );
1703 sse_orps(
1704 func,
1705 make_xmm( 0 ),
1706 make_xmm( 1 ) );
1707 STORE( func, *inst, 0, 0, chan_index );
1708 }
1709 }
1710
1711
1712 /**
1713 * Check if inst src/dest regs use indirect addressing into temporary
1714 * register file.
1715 */
1716 static boolean
1717 indirect_temp_reference(const struct tgsi_full_instruction *inst)
1718 {
1719 uint i;
1720 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1721 const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
1722 if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
1723 reg->SrcRegister.Indirect)
1724 return TRUE;
1725 }
1726 for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1727 const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
1728 if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
1729 reg->DstRegister.Indirect)
1730 return TRUE;
1731 }
1732 return FALSE;
1733 }
1734
1735
1736 static int
1737 emit_instruction(
1738 struct x86_function *func,
1739 struct tgsi_full_instruction *inst )
1740 {
1741 unsigned chan_index;
1742
1743 /* we can't handle indirect addressing into temp register file yet */
1744 if (indirect_temp_reference(inst))
1745 return FALSE;
1746
1747 switch (inst->Instruction.Opcode) {
1748 case TGSI_OPCODE_ARL:
1749 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1750 FETCH( func, *inst, 0, 0, chan_index );
1751 emit_flr(func, 0, 0);
1752 emit_f2it( func, 0 );
1753 STORE( func, *inst, 0, 0, chan_index );
1754 }
1755 break;
1756
1757 case TGSI_OPCODE_MOV:
1758 case TGSI_OPCODE_SWZ:
1759 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1760 FETCH( func, *inst, 0, 0, chan_index );
1761 STORE( func, *inst, 0, 0, chan_index );
1762 }
1763 break;
1764
1765 case TGSI_OPCODE_LIT:
1766 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1767 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1768 emit_tempf(
1769 func,
1770 0,
1771 TEMP_ONE_I,
1772 TEMP_ONE_C);
1773 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1774 STORE( func, *inst, 0, 0, CHAN_X );
1775 }
1776 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1777 STORE( func, *inst, 0, 0, CHAN_W );
1778 }
1779 }
1780 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1781 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1782 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1783 FETCH( func, *inst, 0, 0, CHAN_X );
1784 sse_maxps(
1785 func,
1786 make_xmm( 0 ),
1787 get_temp(
1788 TGSI_EXEC_TEMP_00000000_I,
1789 TGSI_EXEC_TEMP_00000000_C ) );
1790 STORE( func, *inst, 0, 0, CHAN_Y );
1791 }
1792 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1793 /* XMM[1] = SrcReg[0].yyyy */
1794 FETCH( func, *inst, 1, 0, CHAN_Y );
1795 /* XMM[1] = max(XMM[1], 0) */
1796 sse_maxps(
1797 func,
1798 make_xmm( 1 ),
1799 get_temp(
1800 TGSI_EXEC_TEMP_00000000_I,
1801 TGSI_EXEC_TEMP_00000000_C ) );
1802 /* XMM[2] = SrcReg[0].wwww */
1803 FETCH( func, *inst, 2, 0, CHAN_W );
1804 /* XMM[2] = min(XMM[2], 128.0) */
1805 sse_minps(
1806 func,
1807 make_xmm( 2 ),
1808 get_temp(
1809 TGSI_EXEC_TEMP_128_I,
1810 TGSI_EXEC_TEMP_128_C ) );
1811 /* XMM[2] = max(XMM[2], -128.0) */
1812 sse_maxps(
1813 func,
1814 make_xmm( 2 ),
1815 get_temp(
1816 TGSI_EXEC_TEMP_MINUS_128_I,
1817 TGSI_EXEC_TEMP_MINUS_128_C ) );
1818 emit_pow( func, 3, 1, 1, 2 );
1819 FETCH( func, *inst, 0, 0, CHAN_X );
1820 sse_xorps(
1821 func,
1822 make_xmm( 2 ),
1823 make_xmm( 2 ) );
1824 sse_cmpps(
1825 func,
1826 make_xmm( 2 ),
1827 make_xmm( 0 ),
1828 cc_LessThan );
1829 sse_andps(
1830 func,
1831 make_xmm( 2 ),
1832 make_xmm( 1 ) );
1833 STORE( func, *inst, 2, 0, CHAN_Z );
1834 }
1835 }
1836 break;
1837
1838 case TGSI_OPCODE_RCP:
1839 /* TGSI_OPCODE_RECIP */
1840 FETCH( func, *inst, 0, 0, CHAN_X );
1841 emit_rcp( func, 0, 0 );
1842 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1843 STORE( func, *inst, 0, 0, chan_index );
1844 }
1845 break;
1846
1847 case TGSI_OPCODE_RSQ:
1848 /* TGSI_OPCODE_RECIPSQRT */
1849 FETCH( func, *inst, 0, 0, CHAN_X );
1850 emit_abs( func, 0 );
1851 emit_rsqrt( func, 1, 0 );
1852 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1853 STORE( func, *inst, 1, 0, chan_index );
1854 }
1855 break;
1856
1857 case TGSI_OPCODE_EXP:
1858 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1859 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1860 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1861 FETCH( func, *inst, 0, 0, CHAN_X );
1862 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1863 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1864 emit_MOV( func, 1, 0 );
1865 emit_flr( func, 2, 1 );
1866 /* dst.x = ex2(floor(src.x)) */
1867 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1868 emit_MOV( func, 2, 1 );
1869 emit_ex2( func, 3, 2 );
1870 STORE( func, *inst, 2, 0, CHAN_X );
1871 }
1872 /* dst.y = src.x - floor(src.x) */
1873 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1874 emit_MOV( func, 2, 0 );
1875 emit_sub( func, 2, 1 );
1876 STORE( func, *inst, 2, 0, CHAN_Y );
1877 }
1878 }
1879 /* dst.z = ex2(src.x) */
1880 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1881 emit_ex2( func, 3, 0 );
1882 STORE( func, *inst, 0, 0, CHAN_Z );
1883 }
1884 }
1885 /* dst.w = 1.0 */
1886 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1887 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1888 STORE( func, *inst, 0, 0, CHAN_W );
1889 }
1890 break;
1891
1892 case TGSI_OPCODE_LOG:
1893 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1894 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1895 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1896 FETCH( func, *inst, 0, 0, CHAN_X );
1897 emit_abs( func, 0 );
1898 emit_MOV( func, 1, 0 );
1899 emit_lg2( func, 2, 1 );
1900 /* dst.z = lg2(abs(src.x)) */
1901 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1902 STORE( func, *inst, 1, 0, CHAN_Z );
1903 }
1904 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1905 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1906 emit_flr( func, 2, 1 );
1907 /* dst.x = floor(lg2(abs(src.x))) */
1908 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1909 STORE( func, *inst, 1, 0, CHAN_X );
1910 }
1911 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1912 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1913 emit_ex2( func, 2, 1 );
1914 emit_rcp( func, 1, 1 );
1915 emit_mul( func, 0, 1 );
1916 STORE( func, *inst, 0, 0, CHAN_Y );
1917 }
1918 }
1919 }
1920 /* dst.w = 1.0 */
1921 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1922 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1923 STORE( func, *inst, 0, 0, CHAN_W );
1924 }
1925 break;
1926
1927 case TGSI_OPCODE_MUL:
1928 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1929 FETCH( func, *inst, 0, 0, chan_index );
1930 FETCH( func, *inst, 1, 1, chan_index );
1931 emit_mul( func, 0, 1 );
1932 STORE( func, *inst, 0, 0, chan_index );
1933 }
1934 break;
1935
1936 case TGSI_OPCODE_ADD:
1937 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1938 FETCH( func, *inst, 0, 0, chan_index );
1939 FETCH( func, *inst, 1, 1, chan_index );
1940 emit_add( func, 0, 1 );
1941 STORE( func, *inst, 0, 0, chan_index );
1942 }
1943 break;
1944
1945 case TGSI_OPCODE_DP3:
1946 /* TGSI_OPCODE_DOT3 */
1947 FETCH( func, *inst, 0, 0, CHAN_X );
1948 FETCH( func, *inst, 1, 1, CHAN_X );
1949 emit_mul( func, 0, 1 );
1950 FETCH( func, *inst, 1, 0, CHAN_Y );
1951 FETCH( func, *inst, 2, 1, CHAN_Y );
1952 emit_mul( func, 1, 2 );
1953 emit_add( func, 0, 1 );
1954 FETCH( func, *inst, 1, 0, CHAN_Z );
1955 FETCH( func, *inst, 2, 1, CHAN_Z );
1956 emit_mul( func, 1, 2 );
1957 emit_add( func, 0, 1 );
1958 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1959 STORE( func, *inst, 0, 0, chan_index );
1960 }
1961 break;
1962
1963 case TGSI_OPCODE_DP4:
1964 /* TGSI_OPCODE_DOT4 */
1965 FETCH( func, *inst, 0, 0, CHAN_X );
1966 FETCH( func, *inst, 1, 1, CHAN_X );
1967 emit_mul( func, 0, 1 );
1968 FETCH( func, *inst, 1, 0, CHAN_Y );
1969 FETCH( func, *inst, 2, 1, CHAN_Y );
1970 emit_mul( func, 1, 2 );
1971 emit_add( func, 0, 1 );
1972 FETCH( func, *inst, 1, 0, CHAN_Z );
1973 FETCH( func, *inst, 2, 1, CHAN_Z );
1974 emit_mul(func, 1, 2 );
1975 emit_add(func, 0, 1 );
1976 FETCH( func, *inst, 1, 0, CHAN_W );
1977 FETCH( func, *inst, 2, 1, CHAN_W );
1978 emit_mul( func, 1, 2 );
1979 emit_add( func, 0, 1 );
1980 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1981 STORE( func, *inst, 0, 0, chan_index );
1982 }
1983 break;
1984
1985 case TGSI_OPCODE_DST:
1986 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1987 emit_tempf(
1988 func,
1989 0,
1990 TEMP_ONE_I,
1991 TEMP_ONE_C );
1992 STORE( func, *inst, 0, 0, CHAN_X );
1993 }
1994 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1995 FETCH( func, *inst, 0, 0, CHAN_Y );
1996 FETCH( func, *inst, 1, 1, CHAN_Y );
1997 emit_mul( func, 0, 1 );
1998 STORE( func, *inst, 0, 0, CHAN_Y );
1999 }
2000 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2001 FETCH( func, *inst, 0, 0, CHAN_Z );
2002 STORE( func, *inst, 0, 0, CHAN_Z );
2003 }
2004 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2005 FETCH( func, *inst, 0, 1, CHAN_W );
2006 STORE( func, *inst, 0, 0, CHAN_W );
2007 }
2008 break;
2009
2010 case TGSI_OPCODE_MIN:
2011 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2012 FETCH( func, *inst, 0, 0, chan_index );
2013 FETCH( func, *inst, 1, 1, chan_index );
2014 sse_minps(
2015 func,
2016 make_xmm( 0 ),
2017 make_xmm( 1 ) );
2018 STORE( func, *inst, 0, 0, chan_index );
2019 }
2020 break;
2021
2022 case TGSI_OPCODE_MAX:
2023 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2024 FETCH( func, *inst, 0, 0, chan_index );
2025 FETCH( func, *inst, 1, 1, chan_index );
2026 sse_maxps(
2027 func,
2028 make_xmm( 0 ),
2029 make_xmm( 1 ) );
2030 STORE( func, *inst, 0, 0, chan_index );
2031 }
2032 break;
2033
2034 case TGSI_OPCODE_SLT:
2035 /* TGSI_OPCODE_SETLT */
2036 emit_setcc( func, inst, cc_LessThan );
2037 break;
2038
2039 case TGSI_OPCODE_SGE:
2040 /* TGSI_OPCODE_SETGE */
2041 emit_setcc( func, inst, cc_NotLessThan );
2042 break;
2043
2044 case TGSI_OPCODE_MAD:
2045 /* TGSI_OPCODE_MADD */
2046 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2047 FETCH( func, *inst, 0, 0, chan_index );
2048 FETCH( func, *inst, 1, 1, chan_index );
2049 FETCH( func, *inst, 2, 2, chan_index );
2050 emit_mul( func, 0, 1 );
2051 emit_add( func, 0, 2 );
2052 STORE( func, *inst, 0, 0, chan_index );
2053 }
2054 break;
2055
2056 case TGSI_OPCODE_SUB:
2057 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2058 FETCH( func, *inst, 0, 0, chan_index );
2059 FETCH( func, *inst, 1, 1, chan_index );
2060 emit_sub( func, 0, 1 );
2061 STORE( func, *inst, 0, 0, chan_index );
2062 }
2063 break;
2064
2065 case TGSI_OPCODE_LERP:
2066 /* TGSI_OPCODE_LRP */
2067 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2068 FETCH( func, *inst, 0, 0, chan_index );
2069 FETCH( func, *inst, 1, 1, chan_index );
2070 FETCH( func, *inst, 2, 2, chan_index );
2071 emit_sub( func, 1, 2 );
2072 emit_mul( func, 0, 1 );
2073 emit_add( func, 0, 2 );
2074 STORE( func, *inst, 0, 0, chan_index );
2075 }
2076 break;
2077
2078 case TGSI_OPCODE_CND:
2079 return 0;
2080 break;
2081
2082 case TGSI_OPCODE_CND0:
2083 return 0;
2084 break;
2085
2086 case TGSI_OPCODE_DOT2ADD:
2087 /* TGSI_OPCODE_DP2A */
2088 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2089 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2090 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2091 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2092 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2093 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2094 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2095 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
2096 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2097 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2098 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2099 }
2100 break;
2101
2102 case TGSI_OPCODE_INDEX:
2103 return 0;
2104 break;
2105
2106 case TGSI_OPCODE_NEGATE:
2107 return 0;
2108 break;
2109
2110 case TGSI_OPCODE_FRAC:
2111 /* TGSI_OPCODE_FRC */
2112 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2113 FETCH( func, *inst, 0, 0, chan_index );
2114 emit_frc( func, 0, 0 );
2115 STORE( func, *inst, 0, 0, chan_index );
2116 }
2117 break;
2118
2119 case TGSI_OPCODE_CLAMP:
2120 return 0;
2121 break;
2122
2123 case TGSI_OPCODE_FLOOR:
2124 /* TGSI_OPCODE_FLR */
2125 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2126 FETCH( func, *inst, 0, 0, chan_index );
2127 emit_flr( func, 0, 0 );
2128 STORE( func, *inst, 0, 0, chan_index );
2129 }
2130 break;
2131
2132 case TGSI_OPCODE_ROUND:
2133 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2134 FETCH( func, *inst, 0, 0, chan_index );
2135 emit_rnd( func, 0, 0 );
2136 STORE( func, *inst, 0, 0, chan_index );
2137 }
2138 break;
2139
2140 case TGSI_OPCODE_EXPBASE2:
2141 /* TGSI_OPCODE_EX2 */
2142 FETCH( func, *inst, 0, 0, CHAN_X );
2143 emit_ex2( func, 0, 0 );
2144 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2145 STORE( func, *inst, 0, 0, chan_index );
2146 }
2147 break;
2148
2149 case TGSI_OPCODE_LOGBASE2:
2150 /* TGSI_OPCODE_LG2 */
2151 FETCH( func, *inst, 0, 0, CHAN_X );
2152 emit_lg2( func, 0, 0 );
2153 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2154 STORE( func, *inst, 0, 0, chan_index );
2155 }
2156 break;
2157
2158 case TGSI_OPCODE_POWER:
2159 /* TGSI_OPCODE_POW */
2160 FETCH( func, *inst, 0, 0, CHAN_X );
2161 FETCH( func, *inst, 1, 1, CHAN_X );
2162 emit_pow( func, 0, 0, 0, 1 );
2163 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2164 STORE( func, *inst, 0, 0, chan_index );
2165 }
2166 break;
2167
2168 case TGSI_OPCODE_CROSSPRODUCT:
2169 /* TGSI_OPCODE_XPD */
2170 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2171 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2172 FETCH( func, *inst, 1, 1, CHAN_Z );
2173 FETCH( func, *inst, 3, 0, CHAN_Z );
2174 }
2175 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2176 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2177 FETCH( func, *inst, 0, 0, CHAN_Y );
2178 FETCH( func, *inst, 4, 1, CHAN_Y );
2179 }
2180 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2181 emit_MOV( func, 2, 0 );
2182 emit_mul( func, 2, 1 );
2183 emit_MOV( func, 5, 3 );
2184 emit_mul( func, 5, 4 );
2185 emit_sub( func, 2, 5 );
2186 STORE( func, *inst, 2, 0, CHAN_X );
2187 }
2188 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2189 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2190 FETCH( func, *inst, 2, 1, CHAN_X );
2191 FETCH( func, *inst, 5, 0, CHAN_X );
2192 }
2193 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2194 emit_mul( func, 3, 2 );
2195 emit_mul( func, 1, 5 );
2196 emit_sub( func, 3, 1 );
2197 STORE( func, *inst, 3, 0, CHAN_Y );
2198 }
2199 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2200 emit_mul( func, 5, 4 );
2201 emit_mul( func, 0, 2 );
2202 emit_sub( func, 5, 0 );
2203 STORE( func, *inst, 5, 0, CHAN_Z );
2204 }
2205 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2206 emit_tempf(
2207 func,
2208 0,
2209 TEMP_ONE_I,
2210 TEMP_ONE_C );
2211 STORE( func, *inst, 0, 0, CHAN_W );
2212 }
2213 break;
2214
2215 case TGSI_OPCODE_MULTIPLYMATRIX:
2216 return 0;
2217 break;
2218
2219 case TGSI_OPCODE_ABS:
2220 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2221 FETCH( func, *inst, 0, 0, chan_index );
2222 emit_abs( func, 0) ;
2223
2224 STORE( func, *inst, 0, 0, chan_index );
2225 }
2226 break;
2227
2228 case TGSI_OPCODE_RCC:
2229 return 0;
2230 break;
2231
2232 case TGSI_OPCODE_DPH:
2233 FETCH( func, *inst, 0, 0, CHAN_X );
2234 FETCH( func, *inst, 1, 1, CHAN_X );
2235 emit_mul( func, 0, 1 );
2236 FETCH( func, *inst, 1, 0, CHAN_Y );
2237 FETCH( func, *inst, 2, 1, CHAN_Y );
2238 emit_mul( func, 1, 2 );
2239 emit_add( func, 0, 1 );
2240 FETCH( func, *inst, 1, 0, CHAN_Z );
2241 FETCH( func, *inst, 2, 1, CHAN_Z );
2242 emit_mul( func, 1, 2 );
2243 emit_add( func, 0, 1 );
2244 FETCH( func, *inst, 1, 1, CHAN_W );
2245 emit_add( func, 0, 1 );
2246 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2247 STORE( func, *inst, 0, 0, chan_index );
2248 }
2249 break;
2250
2251 case TGSI_OPCODE_COS:
2252 FETCH( func, *inst, 0, 0, CHAN_X );
2253 emit_cos( func, 0, 0 );
2254 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2255 STORE( func, *inst, 0, 0, chan_index );
2256 }
2257 break;
2258
2259 case TGSI_OPCODE_DDX:
2260 return 0;
2261 break;
2262
2263 case TGSI_OPCODE_DDY:
2264 return 0;
2265 break;
2266
2267 case TGSI_OPCODE_KILP:
2268 /* predicated kill */
2269 emit_kilp( func );
2270 return 0; /* XXX fix me */
2271 break;
2272
2273 case TGSI_OPCODE_KIL:
2274 /* conditional kill */
2275 emit_kil( func, &inst->FullSrcRegisters[0] );
2276 break;
2277
2278 case TGSI_OPCODE_PK2H:
2279 return 0;
2280 break;
2281
2282 case TGSI_OPCODE_PK2US:
2283 return 0;
2284 break;
2285
2286 case TGSI_OPCODE_PK4B:
2287 return 0;
2288 break;
2289
2290 case TGSI_OPCODE_PK4UB:
2291 return 0;
2292 break;
2293
2294 case TGSI_OPCODE_RFL:
2295 return 0;
2296 break;
2297
2298 case TGSI_OPCODE_SEQ:
2299 return 0;
2300 break;
2301
2302 case TGSI_OPCODE_SFL:
2303 return 0;
2304 break;
2305
2306 case TGSI_OPCODE_SGT:
2307 return 0;
2308 break;
2309
2310 case TGSI_OPCODE_SIN:
2311 FETCH( func, *inst, 0, 0, CHAN_X );
2312 emit_sin( func, 0, 0 );
2313 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2314 STORE( func, *inst, 0, 0, chan_index );
2315 }
2316 break;
2317
2318 case TGSI_OPCODE_SLE:
2319 return 0;
2320 break;
2321
2322 case TGSI_OPCODE_SNE:
2323 return 0;
2324 break;
2325
2326 case TGSI_OPCODE_STR:
2327 return 0;
2328 break;
2329
2330 case TGSI_OPCODE_TEX:
2331 emit_tex( func, inst, FALSE, FALSE );
2332 break;
2333
2334 case TGSI_OPCODE_TXD:
2335 return 0;
2336 break;
2337
2338 case TGSI_OPCODE_UP2H:
2339 return 0;
2340 break;
2341
2342 case TGSI_OPCODE_UP2US:
2343 return 0;
2344 break;
2345
2346 case TGSI_OPCODE_UP4B:
2347 return 0;
2348 break;
2349
2350 case TGSI_OPCODE_UP4UB:
2351 return 0;
2352 break;
2353
2354 case TGSI_OPCODE_X2D:
2355 return 0;
2356 break;
2357
2358 case TGSI_OPCODE_ARA:
2359 return 0;
2360 break;
2361
2362 case TGSI_OPCODE_ARR:
2363 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2364 FETCH( func, *inst, 0, 0, chan_index );
2365 emit_rnd( func, 0, 0 );
2366 emit_f2it( func, 0 );
2367 STORE( func, *inst, 0, 0, chan_index );
2368 }
2369 break;
2370
2371 case TGSI_OPCODE_BRA:
2372 return 0;
2373 break;
2374
2375 case TGSI_OPCODE_CAL:
2376 return 0;
2377 break;
2378
2379 case TGSI_OPCODE_RET:
2380 emit_ret( func );
2381 break;
2382
2383 case TGSI_OPCODE_END:
2384 break;
2385
2386 case TGSI_OPCODE_SSG:
2387 /* TGSI_OPCODE_SGN */
2388 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2389 FETCH( func, *inst, 0, 0, chan_index );
2390 emit_sgn( func, 0, 0 );
2391 STORE( func, *inst, 0, 0, chan_index );
2392 }
2393 break;
2394
2395 case TGSI_OPCODE_CMP:
2396 emit_cmp (func, inst);
2397 break;
2398
2399 case TGSI_OPCODE_SCS:
2400 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2401 FETCH( func, *inst, 0, 0, CHAN_X );
2402 emit_cos( func, 0, 0 );
2403 STORE( func, *inst, 0, 0, CHAN_X );
2404 }
2405 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2406 FETCH( func, *inst, 0, 0, CHAN_X );
2407 emit_sin( func, 0, 0 );
2408 STORE( func, *inst, 0, 0, CHAN_Y );
2409 }
2410 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2411 emit_tempf(
2412 func,
2413 0,
2414 TGSI_EXEC_TEMP_00000000_I,
2415 TGSI_EXEC_TEMP_00000000_C );
2416 STORE( func, *inst, 0, 0, CHAN_Z );
2417 }
2418 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2419 emit_tempf(
2420 func,
2421 0,
2422 TEMP_ONE_I,
2423 TEMP_ONE_C );
2424 STORE( func, *inst, 0, 0, CHAN_W );
2425 }
2426 break;
2427
2428 case TGSI_OPCODE_TXB:
2429 emit_tex( func, inst, TRUE, FALSE );
2430 break;
2431
2432 case TGSI_OPCODE_NRM:
2433 /* fall-through */
2434 case TGSI_OPCODE_NRM4:
2435 /* 3 or 4-component normalization */
2436 {
2437 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2438
2439 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2440 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2441 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2442 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2443
2444 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2445
2446 /* xmm4 = src.x */
2447 /* xmm0 = src.x * src.x */
2448 FETCH(func, *inst, 0, 0, CHAN_X);
2449 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2450 emit_MOV(func, 4, 0);
2451 }
2452 emit_mul(func, 0, 0);
2453
2454 /* xmm5 = src.y */
2455 /* xmm0 = xmm0 + src.y * src.y */
2456 FETCH(func, *inst, 1, 0, CHAN_Y);
2457 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2458 emit_MOV(func, 5, 1);
2459 }
2460 emit_mul(func, 1, 1);
2461 emit_add(func, 0, 1);
2462
2463 /* xmm6 = src.z */
2464 /* xmm0 = xmm0 + src.z * src.z */
2465 FETCH(func, *inst, 1, 0, CHAN_Z);
2466 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2467 emit_MOV(func, 6, 1);
2468 }
2469 emit_mul(func, 1, 1);
2470 emit_add(func, 0, 1);
2471
2472 if (dims == 4) {
2473 /* xmm7 = src.w */
2474 /* xmm0 = xmm0 + src.w * src.w */
2475 FETCH(func, *inst, 1, 0, CHAN_W);
2476 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2477 emit_MOV(func, 7, 1);
2478 }
2479 emit_mul(func, 1, 1);
2480 emit_add(func, 0, 1);
2481 }
2482
2483 /* xmm1 = 1 / sqrt(xmm0) */
2484 emit_rsqrt(func, 1, 0);
2485
2486 /* dst.x = xmm1 * src.x */
2487 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2488 emit_mul(func, 4, 1);
2489 STORE(func, *inst, 4, 0, CHAN_X);
2490 }
2491
2492 /* dst.y = xmm1 * src.y */
2493 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2494 emit_mul(func, 5, 1);
2495 STORE(func, *inst, 5, 0, CHAN_Y);
2496 }
2497
2498 /* dst.z = xmm1 * src.z */
2499 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2500 emit_mul(func, 6, 1);
2501 STORE(func, *inst, 6, 0, CHAN_Z);
2502 }
2503
2504 /* dst.w = xmm1 * src.w */
2505 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2506 emit_mul(func, 7, 1);
2507 STORE(func, *inst, 7, 0, CHAN_W);
2508 }
2509 }
2510
2511 /* dst0.w = 1.0 */
2512 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2513 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2514 STORE(func, *inst, 0, 0, CHAN_W);
2515 }
2516 }
2517 break;
2518
2519 case TGSI_OPCODE_DIV:
2520 return 0;
2521 break;
2522
2523 case TGSI_OPCODE_DP2:
2524 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2525 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2526 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2527 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2528 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2529 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2530 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2531 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2532 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2533 }
2534 break;
2535
2536 case TGSI_OPCODE_TXL:
2537 emit_tex( func, inst, TRUE, FALSE );
2538 break;
2539
2540 case TGSI_OPCODE_TXP:
2541 emit_tex( func, inst, FALSE, TRUE );
2542 break;
2543
2544 case TGSI_OPCODE_BRK:
2545 return 0;
2546 break;
2547
2548 case TGSI_OPCODE_IF:
2549 return 0;
2550 break;
2551
2552 case TGSI_OPCODE_LOOP:
2553 return 0;
2554 break;
2555
2556 case TGSI_OPCODE_REP:
2557 return 0;
2558 break;
2559
2560 case TGSI_OPCODE_ELSE:
2561 return 0;
2562 break;
2563
2564 case TGSI_OPCODE_ENDIF:
2565 return 0;
2566 break;
2567
2568 case TGSI_OPCODE_ENDLOOP:
2569 return 0;
2570 break;
2571
2572 case TGSI_OPCODE_ENDREP:
2573 return 0;
2574 break;
2575
2576 case TGSI_OPCODE_PUSHA:
2577 return 0;
2578 break;
2579
2580 case TGSI_OPCODE_POPA:
2581 return 0;
2582 break;
2583
2584 case TGSI_OPCODE_CEIL:
2585 return 0;
2586 break;
2587
2588 case TGSI_OPCODE_I2F:
2589 return 0;
2590 break;
2591
2592 case TGSI_OPCODE_NOT:
2593 return 0;
2594 break;
2595
2596 case TGSI_OPCODE_TRUNC:
2597 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2598 FETCH( func, *inst, 0, 0, chan_index );
2599 emit_f2it( func, 0 );
2600 emit_i2f( func, 0 );
2601 STORE( func, *inst, 0, 0, chan_index );
2602 }
2603 break;
2604
2605 case TGSI_OPCODE_SHL:
2606 return 0;
2607 break;
2608
2609 case TGSI_OPCODE_SHR:
2610 return 0;
2611 break;
2612
2613 case TGSI_OPCODE_AND:
2614 return 0;
2615 break;
2616
2617 case TGSI_OPCODE_OR:
2618 return 0;
2619 break;
2620
2621 case TGSI_OPCODE_MOD:
2622 return 0;
2623 break;
2624
2625 case TGSI_OPCODE_XOR:
2626 return 0;
2627 break;
2628
2629 case TGSI_OPCODE_SAD:
2630 return 0;
2631 break;
2632
2633 case TGSI_OPCODE_TXF:
2634 return 0;
2635 break;
2636
2637 case TGSI_OPCODE_TXQ:
2638 return 0;
2639 break;
2640
2641 case TGSI_OPCODE_CONT:
2642 return 0;
2643 break;
2644
2645 case TGSI_OPCODE_EMIT:
2646 return 0;
2647 break;
2648
2649 case TGSI_OPCODE_ENDPRIM:
2650 return 0;
2651 break;
2652
2653 default:
2654 return 0;
2655 }
2656
2657 return 1;
2658 }
2659
/**
 * Emit code for a TGSI declaration.
 *
 * Only TGSI_FILE_INPUT declarations generate code here: for each
 * declared input channel, interpolation code (constant, linear, or
 * perspective) is emitted that evaluates the plane equation
 * a0 + x*dadx + y*dady (divided by w for perspective) and writes the
 * result into the machine's input array.
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  /* input = a0 */
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  /* input = a0 + x*dadx + y*dady
                   * NOTE(review): machine temp 0 appears to hold the
                   * fragment position (x in .x, y in .y, w in .w) --
                   * confirm against the machine setup code. */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  /* input = (a0 + x*dadx + y*dady) / w */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
2720
2721 static void aos_to_soa( struct x86_function *func,
2722 uint arg_aos,
2723 uint arg_machine,
2724 uint arg_num,
2725 uint arg_stride )
2726 {
2727 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2728 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2729 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2730 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2731 int inner_loop;
2732
2733
2734 /* Save EBX */
2735 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2736
2737 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2738 x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
2739 x86_lea( func, soa_input,
2740 x86_make_disp( soa_input,
2741 Offset(struct tgsi_exec_machine, Inputs) ) );
2742 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2743 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2744
2745 /* do */
2746 inner_loop = x86_get_label( func );
2747 {
2748 x86_push( func, aos_input );
2749 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2750 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2751 x86_add( func, aos_input, stride );
2752 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2753 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2754 x86_add( func, aos_input, stride );
2755 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2756 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2757 x86_add( func, aos_input, stride );
2758 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2759 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2760 x86_pop( func, aos_input );
2761
2762 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2763 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2764 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2765 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2766 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2767 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2768
2769 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2770 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2771 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2772 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2773
2774 /* Advance to next input */
2775 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2776 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2777 }
2778 /* while --num_inputs */
2779 x86_dec( func, num_inputs );
2780 x86_jcc( func, cc_NE, inner_loop );
2781
2782 /* Restore EBX */
2783 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2784 }
2785
2786 static void soa_to_aos( struct x86_function *func,
2787 uint arg_aos,
2788 uint arg_machine,
2789 uint arg_num,
2790 uint arg_stride )
2791 {
2792 struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2793 struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2794 struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2795 struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2796 int inner_loop;
2797
2798 /* Save EBX */
2799 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2800
2801 x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2802 x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2803 x86_lea( func, soa_output,
2804 x86_make_disp( soa_output,
2805 Offset(struct tgsi_exec_machine, Outputs) ) );
2806 x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2807
2808 /* do */
2809 inner_loop = x86_get_label( func );
2810 {
2811 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2812 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2813 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2814 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2815
2816 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2817 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2818 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2819 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2820 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2821 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2822
2823 x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2824 x86_push( func, aos_output );
2825 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2826 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2827 x86_add( func, aos_output, temp );
2828 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2829 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2830 x86_add( func, aos_output, temp );
2831 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2832 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2833 x86_add( func, aos_output, temp );
2834 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2835 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2836 x86_pop( func, aos_output );
2837
2838 /* Advance to next output */
2839 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2840 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2841 }
2842 /* while --num_outputs */
2843 x86_dec( func, num_outputs );
2844 x86_jcc( func, cc_NE, inner_loop );
2845
2846 /* Restore EBX */
2847 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2848 }
2849
2850 /**
2851 * Translate a TGSI vertex/fragment shader to SSE2 code.
2852 * Slightly different things are done for vertex vs. fragment shaders.
2853 *
2854 * \param tokens the TGSI input shader
2855 * \param func the output SSE code/function
2856 * \param immediates buffer to place immediates, later passed to SSE func
 * \return 1 for success, 0 if translation failed
2858 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   unsigned ok = 1;          /* cleared when an opcode fails to translate */
   uint num_immediates = 0;  /* next free slot in the immediates[] buffer */

   util_init_math();

   /* Rewind the code pointer: emit from the start of func's buffer. */
   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
   x86_push( func, x86_make_reg( file_REG32, reg_DI ) );

   /*
    * Different function args for vertex/fragment shaders:
    * - both: arg1 = machine, arg2 = constants, arg3 = immediates
    * - fragment: arg4 = interpolation coefficients
    * - vertex: args 4-6 = input AOS buffer/count/stride,
    *           args 7-9 = output AOS buffer/count/stride
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      /* Transpose the caller's AOS vertex data into machine->Inputs
       * before the shader body runs (skipped when the caller already
       * provides SOA data).
       */
      if (do_swizzles)
         aos_to_soa( func,
                     4,         /* aos_input */
                     1,         /* machine */
                     5,         /* num_inputs */
                     6 );       /* input_stride */
   }

   /* Cache the base pointers used by the emitted code in registers. */
   x86_mov(
      func,
      get_machine_base(),
      x86_fn_arg( func, 1 ) );
   x86_mov(
      func,
      get_const_base(),
      x86_fn_arg( func, 2 ) );
   x86_mov(
      func,
      get_immediate_base(),
      x86_fn_arg( func, 3 ) );

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      x86_mov(
         func,
         get_coef_base(),
         x86_fn_arg( func, 4 ) );

      /* Samplers live inside the machine struct; derive their base
       * from the machine pointer rather than a separate argument.
       */
      x86_mov(
         func,
         get_sampler_base(),
         x86_make_disp( get_machine_base(),
                        Offset( struct tgsi_exec_machine, Samplers ) ) );
   }


   /* Translate token by token; stop early on the first failure. */
   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         /* Only fragment-shader input declarations generate code
          * (the interpolation setup); vertex declarations are no-ops.
          */
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
                         parse.FullToken.FullInstruction.Instruction.Opcode,
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot
          * (the emitted code reaches them through the arg-3 pointer)
          */
         {
            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                         num_immediates,
                         immediates[num_immediates][0],
                         immediates[num_immediates][1],
                         immediates[num_immediates][2],
                         immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      default:
         /* Unknown token type: fail the translation. */
         ok = 0;
         assert( 0 );
      }
   }

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      /* Transpose machine->Outputs back into the caller's AOS buffer. */
      if (do_swizzles)
         soa_to_aos( func,
                     7,         /* aos_output */
                     1,         /* machine */
                     8,         /* num_outputs */
                     9 );       /* output_stride */
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}
2994
2995 #endif /* PIPE_ARCH_X86 */
2996