4c3343d26c31ab92fe67f8eb9d8f124caac08f97
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_config.h"
29
30 #if defined(PIPE_ARCH_X86)
31
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 #if defined(PIPE_ARCH_SSE)
37 #include "util/u_sse.h"
38 #endif
39 #include "tgsi/tgsi_parse.h"
40 #include "tgsi/tgsi_util.h"
41 #include "tgsi_exec.h"
42 #include "tgsi_sse2.h"
43
44 #include "rtasm/rtasm_x86sse.h"
45
46 /* for 1/sqrt()
47 *
48 * This costs about 100fps (close to 10%) in gears:
49 */
/* Use a Newton-Raphson refinement step for RSQ (see emit_rsqrt()). */
#define HIGH_PRECISION 1

#define FAST_MATH 1


/* Iterate CHAN over the four vector channels (x, y, z, w). */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Test whether dst register 0's writemask enables channel CHAN. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Iterate CHAN over only the channels enabled in dst register 0's writemask. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Short aliases for reserved machine temporaries (declared in tgsi_exec.h). */
#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
80
81
82 /**
83 * X86 utility functions.
84 */
85
86 static struct x86_reg
87 make_xmm(
88 unsigned xmm )
89 {
90 return x86_make_reg(
91 file_XMM,
92 (enum x86_reg_name) xmm );
93 }
94
95 /**
96 * X86 register mapping helpers.
97 */
98
99 static struct x86_reg
100 get_const_base( void )
101 {
102 return x86_make_reg(
103 file_REG32,
104 reg_AX );
105 }
106
107 static struct x86_reg
108 get_machine_base( void )
109 {
110 return x86_make_reg(
111 file_REG32,
112 reg_CX );
113 }
114
115 static struct x86_reg
116 get_input_base( void )
117 {
118 return x86_make_disp(
119 get_machine_base(),
120 Offset(struct tgsi_exec_machine, Inputs) );
121 }
122
123 static struct x86_reg
124 get_output_base( void )
125 {
126 return x86_make_disp(
127 get_machine_base(),
128 Offset(struct tgsi_exec_machine, Outputs) );
129 }
130
131 static struct x86_reg
132 get_temp_base( void )
133 {
134 return x86_make_disp(
135 get_machine_base(),
136 Offset(struct tgsi_exec_machine, Temps) );
137 }
138
139 static struct x86_reg
140 get_coef_base( void )
141 {
142 return x86_make_reg(
143 file_REG32,
144 reg_BX );
145 }
146
147 static struct x86_reg
148 get_sampler_base( void )
149 {
150 return x86_make_reg(
151 file_REG32,
152 reg_DI );
153 }
154
155 static struct x86_reg
156 get_immediate_base( void )
157 {
158 return x86_make_reg(
159 file_REG32,
160 reg_DX );
161 }
162
163
164 /**
165 * Data access helpers.
166 */
167
168
169 static struct x86_reg
170 get_immediate(
171 unsigned vec,
172 unsigned chan )
173 {
174 return x86_make_disp(
175 get_immediate_base(),
176 (vec * 4 + chan) * 4 );
177 }
178
179 static struct x86_reg
180 get_const(
181 unsigned vec,
182 unsigned chan )
183 {
184 return x86_make_disp(
185 get_const_base(),
186 (vec * 4 + chan) * 4 );
187 }
188
189 static struct x86_reg
190 get_sampler_ptr(
191 unsigned unit )
192 {
193 return x86_make_disp(
194 get_sampler_base(),
195 unit * sizeof( struct tgsi_sampler * ) );
196 }
197
198 static struct x86_reg
199 get_input(
200 unsigned vec,
201 unsigned chan )
202 {
203 return x86_make_disp(
204 get_input_base(),
205 (vec * 4 + chan) * 16 );
206 }
207
208 static struct x86_reg
209 get_output(
210 unsigned vec,
211 unsigned chan )
212 {
213 return x86_make_disp(
214 get_output_base(),
215 (vec * 4 + chan) * 16 );
216 }
217
218 static struct x86_reg
219 get_temp(
220 unsigned vec,
221 unsigned chan )
222 {
223 return x86_make_disp(
224 get_temp_base(),
225 (vec * 4 + chan) * 16 );
226 }
227
228 static struct x86_reg
229 get_coef(
230 unsigned vec,
231 unsigned chan,
232 unsigned member )
233 {
234 return x86_make_disp(
235 get_coef_base(),
236 ((vec * 3 + member) * 4 + chan) * 4 );
237 }
238
239
/* Emit a return from the generated function. */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
246
247
248 /**
249 * Data fetch helpers.
250 */
251
252 /**
253 * Copy a shader constant to xmm register
254 * \param xmm the destination xmm register
255 * \param vec the src const buffer index
256 * \param chan src channel to fetch (X, Y, Z or W)
257 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,       /* non-zero: address via the ADDR register */
   uint indirectFile,   /* must be TGSI_FILE_ADDRESS */
   int indirectIndex )  /* must be 0 */
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_immediate_base();
      struct x86_reg r1 = get_coef_base();
      uint i;

      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );
      assert( r0.mod == mod_REG );
      assert( r1.mod == mod_REG );

      /* Save the two base registers we are about to clobber as scratch. */
      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage. It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
         /* Gather the addressed float into the i'th slot of scratch TEMP_R0. */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* Load the four gathered values as one quad. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* Load the scalar constant and splat it across all four lanes. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
345
346 static void
347 emit_immediate(
348 struct x86_function *func,
349 unsigned xmm,
350 unsigned vec,
351 unsigned chan )
352 {
353 sse_movss(
354 func,
355 make_xmm( xmm ),
356 get_immediate( vec, chan ) );
357 sse_shufps(
358 func,
359 make_xmm( xmm ),
360 make_xmm( xmm ),
361 SHUF( 0, 0, 0, 0 ) );
362 }
363
364
365 /**
366 * Copy a shader input to xmm register
367 * \param xmm the destination xmm register
368 * \param vec the src input attrib
369 * \param chan src channel to fetch (X, Y, Z or W)
370 */
/**
 * Load the SOA quad INPUT[vec].chan into an xmm register.
 */
static void
emit_inputf( struct x86_function *func,
             unsigned xmm,
             unsigned vec,
             unsigned chan )
{
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}
383
384 /**
385 * Store an xmm register to a shader output
386 * \param xmm the source xmm register
387 * \param vec the dest output attrib
388 * \param chan src dest channel to store (X, Y, Z or W)
389 */
/**
 * Store an xmm register to the SOA quad OUTPUT[vec].chan.
 */
static void
emit_output( struct x86_function *func,
             unsigned xmm,
             unsigned vec,
             unsigned chan )
{
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}
402
403 /**
404 * Copy a shader temporary to xmm register
405 * \param xmm the destination xmm register
406 * \param vec the src temp register
407 * \param chan src channel to fetch (X, Y, Z or W)
408 */
/**
 * Load the SOA quad TEMP[vec].chan into an xmm register (aligned load).
 */
static void
emit_tempf( struct x86_function *func,
            unsigned xmm,
            unsigned vec,
            unsigned chan )
{
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
421
422 /**
423 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
424 * \param xmm the destination xmm register
425 * \param vec the src input/attribute coefficient index
426 * \param chan src channel to fetch (X, Y, Z or W)
427 * \param member 0=a0, 1=dadx, 2=dady
428 */
429 static void
430 emit_coef(
431 struct x86_function *func,
432 unsigned xmm,
433 unsigned vec,
434 unsigned chan,
435 unsigned member )
436 {
437 sse_movss(
438 func,
439 make_xmm( xmm ),
440 get_coef( vec, chan, member ) );
441 sse_shufps(
442 func,
443 make_xmm( xmm ),
444 make_xmm( xmm ),
445 SHUF( 0, 0, 0, 0 ) );
446 }
447
448 /**
449 * Data store helpers.
450 */
451
452 static void
453 emit_inputs(
454 struct x86_function *func,
455 unsigned xmm,
456 unsigned vec,
457 unsigned chan )
458 {
459 sse_movups(
460 func,
461 get_input( vec, chan ),
462 make_xmm( xmm ) );
463 }
464
465 static void
466 emit_temps(
467 struct x86_function *func,
468 unsigned xmm,
469 unsigned vec,
470 unsigned chan )
471 {
472 sse_movaps(
473 func,
474 get_temp( vec, chan ),
475 make_xmm( xmm ) );
476 }
477
478 static void
479 emit_addrs(
480 struct x86_function *func,
481 unsigned xmm,
482 unsigned vec,
483 unsigned chan )
484 {
485 assert( vec == 0 );
486
487 emit_temps(
488 func,
489 xmm,
490 vec + TGSI_EXEC_TEMP_ADDR,
491 chan );
492 }
493
494 /**
495 * Coefficent fetch helpers.
496 */
497
/* Fetch the a0 (constant term) coefficient for an attrib channel. */
static void
emit_coef_a0( struct x86_function *func,
              unsigned xmm,
              unsigned vec,
              unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}

/* Fetch the dadx (x derivative) coefficient for an attrib channel. */
static void
emit_coef_dadx( struct x86_function *func,
                unsigned xmm,
                unsigned vec,
                unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}

/* Fetch the dady (y derivative) coefficient for an attrib channel. */
static void
emit_coef_dady( struct x86_function *func,
                unsigned xmm,
                unsigned vec,
                unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
542
543 /**
544 * Function call helpers.
545 */
546
547 /**
548 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
549 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
550 * that the stack pointer is 16 byte aligned, as expected.
551 */
/**
 * Emit a call from generated code to the C function 'code' (cdecl).
 * Caller-saved GP registers (eax, ecx, edx) and the xmm registers selected
 * by 'xmm_save_mask' are preserved across the call.  The address of each
 * of the 'nr_args' argument buffers is pushed on the stack.
 */
static void
emit_func_call(
   struct x86_function *func,
   unsigned xmm_save_mask,
   const struct x86_reg *arg,
   unsigned nr_args,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned i, n;

   /* Save caller-saved GP registers. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX) );

   /* Store XMM regs to the stack
    */
   /* First count how many xmm registers need saving... */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i))
         ++n;

   /* ...then reserve 16 bytes of stack per saved register. */
   x86_sub_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
            make_xmm( i ) );
         ++n;
      }

   for (i = 0; i < nr_args; i++) {
      /* Load the address of the buffer we use for passing arguments and
       * receiving results:
       */
      x86_lea(
         func,
         ecx,
         arg[i] );

      /* Push actual function arguments (currently just the pointer to
       * the buffer above), and call the function:
       */
      x86_push( func, ecx );
   }

   x86_mov_reg_imm( func, ecx, (unsigned long) code );
   x86_call( func, ecx );

   /* Pop the arguments (or just add an immediate to esp)
    */
   for (i = 0; i < nr_args; i++) {
      x86_pop(func, ecx );
   }

   /* Pop the saved XMM regs:
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            make_xmm( i ),
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
         ++n;
      }

   /* Release the stack space reserved above (n*16 bytes). */
   x86_add_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Restore GP registers in a reverse order.
    */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX) );
}
645
646 static void
647 emit_func_call_dst_src1(
648 struct x86_function *func,
649 unsigned xmm_save,
650 unsigned xmm_dst,
651 unsigned xmm_src0,
652 void (PIPE_CDECL *code)() )
653 {
654 struct x86_reg store = get_temp( TEMP_R0, 0 );
655 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
656
657 /* Store our input parameters (in xmm regs) to the buffer we use
658 * for passing arguments. We will pass a pointer to this buffer as
659 * the actual function argument.
660 */
661 sse_movaps(
662 func,
663 store,
664 make_xmm( xmm_src0 ) );
665
666 emit_func_call( func,
667 xmm_mask,
668 &store,
669 1,
670 code );
671
672 sse_movaps(
673 func,
674 make_xmm( xmm_dst ),
675 store );
676 }
677
678
679 static void
680 emit_func_call_dst_src2(
681 struct x86_function *func,
682 unsigned xmm_save,
683 unsigned xmm_dst,
684 unsigned xmm_src0,
685 unsigned xmm_src1,
686 void (PIPE_CDECL *code)() )
687 {
688 struct x86_reg store = get_temp( TEMP_R0, 0 );
689 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
690
691 /* Store two inputs to parameter buffer.
692 */
693 sse_movaps(
694 func,
695 store,
696 make_xmm( xmm_src0 ) );
697
698 sse_movaps(
699 func,
700 x86_make_disp( store, 4 * sizeof(float) ),
701 make_xmm( xmm_src1 ) );
702
703
704 /* Emit the call
705 */
706 emit_func_call( func,
707 xmm_mask,
708 &store,
709 1,
710 code );
711
712 /* Retrieve the results:
713 */
714 sse_movaps(
715 func,
716 make_xmm( xmm_dst ),
717 store );
718 }
719
720
721
722
723
724 #if defined(PIPE_ARCH_SSE)
725
726 /*
727 * Fast SSE2 implementation of special math functions.
728 */
729
730 #define POLY0(x, c0) _mm_set1_ps(c0)
731 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
732 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
733 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
734 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
735 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
736
737 #define EXP_POLY_DEGREE 3
738 #define LOG_POLY_DEGREE 5
739
740 /**
741 * See http://www.devmaster.net/forums/showthread.php?p=43580
742 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   /* Clamp the input so the constructed exponent below stays in range. */
   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart) */
   /* (built by placing ipart+127 directly into the IEEE exponent field) */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   /* 2^x = 2^ipart * 2^fpart */
   return _mm_mul_ps(expipart, expfpart);
}
776
777
778 /**
779 * See http://www.devmaster.net/forums/showthread.php?p=43580
780 */
static INLINE __m128
log2f4(__m128 x)
{
   /* IEEE-754 single precision field masks. */
   __m128i expmask = _mm_set1_epi32(0x7f800000);
   __m128i mantmask = _mm_set1_epi32(0x007fffff);
   __m128 one = _mm_set1_ps(1.0f);

   /* Reinterpret the float bits as integers for field extraction. */
   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x) */
   /* (mantissa bits OR'd with 1.0f yield a float in [1, 2[) */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generate with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   /* log2(x) = exponent + log2(mantissa) */
   return _mm_add_ps(logmant, exp);
}
819
820
821 static INLINE __m128
822 powf4(__m128 x, __m128 y)
823 {
824 return exp2f4(_mm_mul_ps(log2f4(x), y));
825 }
826
827 #endif /* PIPE_ARCH_SSE */
828
829
830
831 /**
832 * Low-level instruction translators.
833 */
834
835 static void
836 emit_abs(
837 struct x86_function *func,
838 unsigned xmm )
839 {
840 sse_andps(
841 func,
842 make_xmm( xmm ),
843 get_temp(
844 TGSI_EXEC_TEMP_7FFFFFFF_I,
845 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
846 }
847
848 static void
849 emit_add(
850 struct x86_function *func,
851 unsigned xmm_dst,
852 unsigned xmm_src )
853 {
854 sse_addps(
855 func,
856 make_xmm( xmm_dst ),
857 make_xmm( xmm_src ) );
858 }
859
860 static void PIPE_CDECL
861 cos4f(
862 float *store )
863 {
864 store[0] = cosf( store[0] );
865 store[1] = cosf( store[1] );
866 store[2] = cosf( store[2] );
867 store[3] = cosf( store[3] );
868 }
869
870 static void
871 emit_cos(
872 struct x86_function *func,
873 unsigned xmm_save,
874 unsigned xmm_dst )
875 {
876 emit_func_call_dst_src1(
877 func,
878 xmm_save,
879 xmm_dst,
880 xmm_dst,
881 cos4f );
882 }
883
/* C helper called from generated code: 2^x of 4 floats, in place.
 * Uses the SSE polynomial approximation when available, otherwise a
 * scalar fast-math fallback. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_exp2( store[0] );
   store[1] = util_fast_exp2( store[1] );
   store[2] = util_fast_exp2( store[2] );
   store[3] = util_fast_exp2( store[3] );
#endif
}
900
901 static void
902 emit_ex2(
903 struct x86_function *func,
904 unsigned xmm_save,
905 unsigned xmm_dst )
906 {
907 emit_func_call_dst_src1(
908 func,
909 xmm_save,
910 xmm_dst,
911 xmm_dst,
912 ex24f );
913 }
914
/* Convert 4 packed floats to ints in place (truncation). */
static void
emit_f2it( struct x86_function *func, unsigned xmm )
{
   sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) );
}

/* Convert 4 packed ints to floats in place. */
static void
emit_i2f( struct x86_function *func, unsigned xmm )
{
   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
}
936
937 static void PIPE_CDECL
938 flr4f(
939 float *store )
940 {
941 store[0] = floorf( store[0] );
942 store[1] = floorf( store[1] );
943 store[2] = floorf( store[2] );
944 store[3] = floorf( store[3] );
945 }
946
947 static void
948 emit_flr(
949 struct x86_function *func,
950 unsigned xmm_save,
951 unsigned xmm_dst )
952 {
953 emit_func_call_dst_src1(
954 func,
955 xmm_save,
956 xmm_dst,
957 xmm_dst,
958 flr4f );
959 }
960
961 static void PIPE_CDECL
962 frc4f(
963 float *store )
964 {
965 store[0] -= floorf( store[0] );
966 store[1] -= floorf( store[1] );
967 store[2] -= floorf( store[2] );
968 store[3] -= floorf( store[3] );
969 }
970
971 static void
972 emit_frc(
973 struct x86_function *func,
974 unsigned xmm_save,
975 unsigned xmm_dst )
976 {
977 emit_func_call_dst_src1(
978 func,
979 xmm_save,
980 xmm_dst,
981 xmm_dst,
982 frc4f );
983 }
984
/* C helper called from generated code: log2 of 4 floats, in place.
 * Uses the SSE polynomial approximation when available, otherwise a
 * scalar fast-math fallback. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_log2( store[0] );
   store[1] = util_fast_log2( store[1] );
   store[2] = util_fast_log2( store[2] );
   store[3] = util_fast_log2( store[3] );
#endif
}
1001
1002 static void
1003 emit_lg2(
1004 struct x86_function *func,
1005 unsigned xmm_save,
1006 unsigned xmm_dst )
1007 {
1008 emit_func_call_dst_src1(
1009 func,
1010 xmm_save,
1011 xmm_dst,
1012 xmm_dst,
1013 lg24f );
1014 }
1015
/* xmm_dst := xmm_src (full 128-bit register copy). */
static void
emit_MOV( struct x86_function *func, unsigned xmm_dst, unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}

/* xmm_dst := xmm_dst * xmm_src, four floats at once. */
static void
emit_mul( struct x86_function *func, unsigned xmm_dst, unsigned xmm_src )
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1038
1039 static void
1040 emit_neg(
1041 struct x86_function *func,
1042 unsigned xmm )
1043 {
1044 sse_xorps(
1045 func,
1046 make_xmm( xmm ),
1047 get_temp(
1048 TGSI_EXEC_TEMP_80000000_I,
1049 TGSI_EXEC_TEMP_80000000_C ) );
1050 }
1051
/* C helper called from generated code: store[0..3] = store[0..3] ^ store[4..7].
 * Uses the SSE polynomial approximation when available, otherwise a
 * scalar fast-math fallback. */
static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
   store[0] = util_fast_pow( store[0], store[4] );
   store[1] = util_fast_pow( store[1], store[5] );
   store[2] = util_fast_pow( store[2], store[6] );
   store[3] = util_fast_pow( store[3], store[7] );
#endif
}
1068
1069 static void
1070 emit_pow(
1071 struct x86_function *func,
1072 unsigned xmm_save,
1073 unsigned xmm_dst,
1074 unsigned xmm_src0,
1075 unsigned xmm_src1 )
1076 {
1077 emit_func_call_dst_src2(
1078 func,
1079 xmm_save,
1080 xmm_dst,
1081 xmm_src0,
1082 xmm_src1,
1083 pow4f );
1084 }
1085
/* xmm_dst := approximate 1/xmm_src. */
static void
emit_rcp( struct x86_function *func, unsigned xmm_dst, unsigned xmm_src )
{
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.  Need to either emit a proper divide or use the
    * iterative technique described below in emit_rsqrt().
    */
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1101
1102 static void PIPE_CDECL
1103 rnd4f(
1104 float *store )
1105 {
1106 store[0] = floorf( store[0] + 0.5f );
1107 store[1] = floorf( store[1] + 0.5f );
1108 store[2] = floorf( store[2] + 0.5f );
1109 store[3] = floorf( store[3] + 0.5f );
1110 }
1111
1112 static void
1113 emit_rnd(
1114 struct x86_function *func,
1115 unsigned xmm_save,
1116 unsigned xmm_dst )
1117 {
1118 emit_func_call_dst_src1(
1119 func,
1120 xmm_save,
1121 xmm_dst,
1122 xmm_dst,
1123 rnd4f );
1124 }
1125
/* xmm_dst := 1/sqrt(xmm_src).  Clobbers xmm2 and xmm3 in the
 * HIGH_PRECISION path. */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      /* The fixed scratch registers must not alias the operands. */
      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );   /* dst = 0.5 */
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); /* tmp0 = 3.0 */
      sse_rsqrtps( func, tmp1, src );        /* tmp1 = rsqrtps(a) */
      sse_mulps( func, src, tmp1 );          /* src = a * rsqrtps(a) */
      sse_mulps( func, dst, tmp1 );          /* dst = 0.5 * rsqrtps(a) */
      sse_mulps( func, src, tmp1 );          /* src = a * rsqrtps(a)^2 */
      sse_subps( func, tmp0, src );          /* tmp0 = 3.0 - a * rsqrtps(a)^2 */
      sse_mulps( func, dst, tmp0 );          /* dst = refined estimate */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1171
1172 static void
1173 emit_setsign(
1174 struct x86_function *func,
1175 unsigned xmm )
1176 {
1177 sse_orps(
1178 func,
1179 make_xmm( xmm ),
1180 get_temp(
1181 TGSI_EXEC_TEMP_80000000_I,
1182 TGSI_EXEC_TEMP_80000000_C ) );
1183 }
1184
1185 static void PIPE_CDECL
1186 sgn4f(
1187 float *store )
1188 {
1189 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1190 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1191 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1192 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1193 }
1194
1195 static void
1196 emit_sgn(
1197 struct x86_function *func,
1198 unsigned xmm_save,
1199 unsigned xmm_dst )
1200 {
1201 emit_func_call_dst_src1(
1202 func,
1203 xmm_save,
1204 xmm_dst,
1205 xmm_dst,
1206 sgn4f );
1207 }
1208
1209 static void PIPE_CDECL
1210 sin4f(
1211 float *store )
1212 {
1213 store[0] = sinf( store[0] );
1214 store[1] = sinf( store[1] );
1215 store[2] = sinf( store[2] );
1216 store[3] = sinf( store[3] );
1217 }
1218
1219 static void
1220 emit_sin (struct x86_function *func,
1221 unsigned xmm_save,
1222 unsigned xmm_dst)
1223 {
1224 emit_func_call_dst_src1(
1225 func,
1226 xmm_save,
1227 xmm_dst,
1228 xmm_dst,
1229 sin4f );
1230 }
1231
/* xmm_dst := xmm_dst - xmm_src, four floats at once. */
static void
emit_sub( struct x86_function *func, unsigned xmm_dst, unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
1243
1244
1245
1246
1247
1248
1249
1250 /**
1251 * Register fetch.
1252 */
1253
/**
 * Load one channel of a source operand into an xmm register, honoring
 * the register file, extended swizzle (including ZERO/ONE) and the
 * source sign mode.
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      /* Ordinary component select: dispatch on the register file. */
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      /* Constant 0.0 lives in a reserved machine temp. */
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      /* Constant 1.0 lives in a reserved machine temp. */
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   /* Apply the source modifier, if any. */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}
1346
/* Fetch source operand INDEX, channel CHAN, of instruction INST into XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1349
1350 /**
1351 * Register store.
1352 */
1353
/**
 * Store an xmm register to one channel of the destination operand.
 * Saturation is not implemented: ZERO_ONE is silently ignored (note the
 * commented-out assert) and MINUS_PLUS_ONE asserts.
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }

   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* assert( 0 ); */
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }
}
1404
/* Store XMM to dest operand INDEX, channel CHAN, of instruction INST. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1407
1408
/* C helper called from generated code to sample a texture.
 * 'store' holds the packed texcoords on entry (s in [0..3], t in [4..7],
 * p in [8..11], lodbias at [12] -- currently forced to 0.0) and receives
 * the 4x4 RGBA result on exit. */
static void PIPE_CDECL
fetch_texel( struct tgsi_sampler **sampler,
             float *store )
{
#if 0
   uint j;

   debug_printf("%s sampler: %p (%p) store: %p\n",
                __FUNCTION__,
                sampler, *sampler,
                store );

   debug_printf("lodbias %f\n", store[12]);

   for (j = 0; j < 4; j++)
      debug_printf("sample %d texcoord %f %f\n",
                   j,
                   store[0+j],
                   store[4+j]);
#endif

   {
      float rgba[NUM_CHANNELS][QUAD_SIZE];
      (*sampler)->get_samples(*sampler,
                              &store[0],
                              &store[4],
                              &store[8],
                              0.0f, /*store[12],  lodbias */
                              rgba);

      /* Copy the sampled colors back over the texcoords. */
      memcpy( store, rgba, 16 * sizeof(float));
   }

#if 0
   for (j = 0; j < 4; j++)
      debug_printf("sample %d result %f %f %f %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8+j],
                   store[12+j]);
#endif
}
1452
1453 /**
1454 * High-level instruction translators.
1455 */
1456
/**
 * Emit code for a TEX/TXB/TXP-style instruction: gather the texture
 * coordinates (and lod bias) into the TEMP_R0 scratch area, call
 * fetch_texel() through emit_func_call(), then copy the results to the
 * destination channels.
 *
 * \param lodbias    fetch src[0].w as a lod bias value
 * \param projected  divide the coordinates by src[0].w (TXP)
 *
 * Note: callers never pass both lodbias and projected as TRUE, which
 * matters because both paths use xmm3 for src[0].w.
 */
static void
emit_tex( struct x86_function *func,
          const struct tgsi_full_instruction *inst,
          boolean lodbias,
          boolean projected)
{
   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
   struct x86_reg args[2];
   unsigned count;
   unsigned i;

   /* Number of coordinate components to fetch for this target. */
   switch (inst->InstructionExtTexture.Texture) {
   case TGSI_TEXTURE_1D:
   case TGSI_TEXTURE_SHADOW1D:
      count = 1;
      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
      count = 2;
      break;
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
      count = 3;
      break;
   default:
      assert(0);
      return;
   }

   /* xmm3 = lod bias (src[0].w) or 0.0 */
   if (lodbias) {
      FETCH( func, *inst, 3, 0, 3 );
   }
   else {
      emit_tempf(
         func,
         3,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );

   }

   /* Store the lodbias value to TEMP_R0[3] whether enabled or not.
    * NOTE(review): fetch_texel() currently passes a constant 0.0 to
    * get_samples(), so this stored bias is ignored at sample time.
    */
   sse_movaps( func,
               get_temp( TEMP_R0, 3 ),
               make_xmm( 3 ) );


   /* For TXP, xmm3 = 1 / src[0].w, used to project the coords below. */
   if (projected) {
      FETCH( func, *inst, 3, 0, 3 );

      emit_rcp( func, 3, 3 );
   }

   /* Fetch each coordinate component, project if needed, and stage it
    * in the TEMP_R0 argument buffer.
    */
   for (i = 0; i < count; i++) {
      FETCH( func, *inst, i, 0, i );

      if (projected) {
         sse_mulps(
            func,
            make_xmm( i ),
            make_xmm( 3 ) );
      }

      /* Store in the argument buffer:
       */
      sse_movaps(
         func,
         get_temp( TEMP_R0, i ),
         make_xmm( i ) );
   }

   /* Call fetch_texel( &samplers[unit], TEMP_R0 ). */
   args[0] = get_temp( TEMP_R0, 0 );
   args[1] = get_sampler_ptr( unit );


   emit_func_call( func,
                   0,
                   args,
                   Elements(args),
                   fetch_texel );

   /* If all four channels are enabled, could use a pointer to
    * dst[0].x instead of TEMP_R0 for store?
    */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {

      sse_movaps(
         func,
         make_xmm( 0 ),
         get_temp( TEMP_R0, i ) );

      STORE( func, *inst, 0, 0, i );
   }
}
1555
1556
/**
 * Emit code for the KIL (conditional kill) instruction: OR into the
 * exec machine's kill mask a bit for every pixel whose referenced
 * source component is negative.
 *
 * Each distinct swizzled component is fetched once into its own xmm
 * register, compared against zero, and the per-pixel compare masks are
 * accumulated into EAX via movmskps before being OR'd into the kill
 * mask temp.  EAX/EDX are saved and restored around the sequence.
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested. */
   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_extswizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register: each unique component gets the next xmm */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   /* Save scratch registers used for mask accumulation. */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      /* xmm = per-pixel all-ones mask where value < 0 */
      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      if( i == 0 ) {
         /* first component: EAX = movmsk bits */
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         /* subsequent components: EAX |= movmsk bits (via EDX) */
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   /* Accumulate the per-pixel kill bits into the machine's kill mask. */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1643
1644
/**
 * Emit code for KILP (predicated kill).  Not implemented: no code is
 * generated, and the caller returns failure so the shader falls back
 * to the interpreter (see the TGSI_OPCODE_KILP case).
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1651
1652
1653 static void
1654 emit_setcc(
1655 struct x86_function *func,
1656 struct tgsi_full_instruction *inst,
1657 enum sse_cc cc )
1658 {
1659 unsigned chan_index;
1660
1661 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1662 FETCH( func, *inst, 0, 0, chan_index );
1663 FETCH( func, *inst, 1, 1, chan_index );
1664 sse_cmpps(
1665 func,
1666 make_xmm( 0 ),
1667 make_xmm( 1 ),
1668 cc );
1669 sse_andps(
1670 func,
1671 make_xmm( 0 ),
1672 get_temp(
1673 TEMP_ONE_I,
1674 TEMP_ONE_C ) );
1675 STORE( func, *inst, 0, 0, chan_index );
1676 }
1677 }
1678
1679 static void
1680 emit_cmp(
1681 struct x86_function *func,
1682 struct tgsi_full_instruction *inst )
1683 {
1684 unsigned chan_index;
1685
1686 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1687 FETCH( func, *inst, 0, 0, chan_index );
1688 FETCH( func, *inst, 1, 1, chan_index );
1689 FETCH( func, *inst, 2, 2, chan_index );
1690 sse_cmpps(
1691 func,
1692 make_xmm( 0 ),
1693 get_temp(
1694 TGSI_EXEC_TEMP_00000000_I,
1695 TGSI_EXEC_TEMP_00000000_C ),
1696 cc_LessThan );
1697 sse_andps(
1698 func,
1699 make_xmm( 1 ),
1700 make_xmm( 0 ) );
1701 sse_andnps(
1702 func,
1703 make_xmm( 0 ),
1704 make_xmm( 2 ) );
1705 sse_orps(
1706 func,
1707 make_xmm( 0 ),
1708 make_xmm( 1 ) );
1709 STORE( func, *inst, 0, 0, chan_index );
1710 }
1711 }
1712
1713
1714 /**
1715 * Check if inst src/dest regs use indirect addressing into temporary
1716 * register file.
1717 */
1718 static boolean
1719 indirect_temp_reference(const struct tgsi_full_instruction *inst)
1720 {
1721 uint i;
1722 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1723 const struct tgsi_full_src_register *reg = &inst->FullSrcRegisters[i];
1724 if (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
1725 reg->SrcRegister.Indirect)
1726 return TRUE;
1727 }
1728 for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1729 const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[i];
1730 if (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
1731 reg->DstRegister.Indirect)
1732 return TRUE;
1733 }
1734 return FALSE;
1735 }
1736
1737
1738 static int
1739 emit_instruction(
1740 struct x86_function *func,
1741 struct tgsi_full_instruction *inst )
1742 {
1743 unsigned chan_index;
1744
1745 /* we can't handle indirect addressing into temp register file yet */
1746 if (indirect_temp_reference(inst))
1747 return FALSE;
1748
1749 switch (inst->Instruction.Opcode) {
1750 case TGSI_OPCODE_ARL:
1751 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1752 FETCH( func, *inst, 0, 0, chan_index );
1753 emit_flr(func, 0, 0);
1754 emit_f2it( func, 0 );
1755 STORE( func, *inst, 0, 0, chan_index );
1756 }
1757 break;
1758
1759 case TGSI_OPCODE_MOV:
1760 case TGSI_OPCODE_SWZ:
1761 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1762 FETCH( func, *inst, 0, 0, chan_index );
1763 STORE( func, *inst, 0, 0, chan_index );
1764 }
1765 break;
1766
1767 case TGSI_OPCODE_LIT:
1768 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1769 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1770 emit_tempf(
1771 func,
1772 0,
1773 TEMP_ONE_I,
1774 TEMP_ONE_C);
1775 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1776 STORE( func, *inst, 0, 0, CHAN_X );
1777 }
1778 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1779 STORE( func, *inst, 0, 0, CHAN_W );
1780 }
1781 }
1782 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1783 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1784 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1785 FETCH( func, *inst, 0, 0, CHAN_X );
1786 sse_maxps(
1787 func,
1788 make_xmm( 0 ),
1789 get_temp(
1790 TGSI_EXEC_TEMP_00000000_I,
1791 TGSI_EXEC_TEMP_00000000_C ) );
1792 STORE( func, *inst, 0, 0, CHAN_Y );
1793 }
1794 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1795 /* XMM[1] = SrcReg[0].yyyy */
1796 FETCH( func, *inst, 1, 0, CHAN_Y );
1797 /* XMM[1] = max(XMM[1], 0) */
1798 sse_maxps(
1799 func,
1800 make_xmm( 1 ),
1801 get_temp(
1802 TGSI_EXEC_TEMP_00000000_I,
1803 TGSI_EXEC_TEMP_00000000_C ) );
1804 /* XMM[2] = SrcReg[0].wwww */
1805 FETCH( func, *inst, 2, 0, CHAN_W );
1806 /* XMM[2] = min(XMM[2], 128.0) */
1807 sse_minps(
1808 func,
1809 make_xmm( 2 ),
1810 get_temp(
1811 TGSI_EXEC_TEMP_128_I,
1812 TGSI_EXEC_TEMP_128_C ) );
1813 /* XMM[2] = max(XMM[2], -128.0) */
1814 sse_maxps(
1815 func,
1816 make_xmm( 2 ),
1817 get_temp(
1818 TGSI_EXEC_TEMP_MINUS_128_I,
1819 TGSI_EXEC_TEMP_MINUS_128_C ) );
1820 emit_pow( func, 3, 1, 1, 2 );
1821 FETCH( func, *inst, 0, 0, CHAN_X );
1822 sse_xorps(
1823 func,
1824 make_xmm( 2 ),
1825 make_xmm( 2 ) );
1826 sse_cmpps(
1827 func,
1828 make_xmm( 2 ),
1829 make_xmm( 0 ),
1830 cc_LessThan );
1831 sse_andps(
1832 func,
1833 make_xmm( 2 ),
1834 make_xmm( 1 ) );
1835 STORE( func, *inst, 2, 0, CHAN_Z );
1836 }
1837 }
1838 break;
1839
1840 case TGSI_OPCODE_RCP:
1841 /* TGSI_OPCODE_RECIP */
1842 FETCH( func, *inst, 0, 0, CHAN_X );
1843 emit_rcp( func, 0, 0 );
1844 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1845 STORE( func, *inst, 0, 0, chan_index );
1846 }
1847 break;
1848
1849 case TGSI_OPCODE_RSQ:
1850 /* TGSI_OPCODE_RECIPSQRT */
1851 FETCH( func, *inst, 0, 0, CHAN_X );
1852 emit_abs( func, 0 );
1853 emit_rsqrt( func, 1, 0 );
1854 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1855 STORE( func, *inst, 1, 0, chan_index );
1856 }
1857 break;
1858
1859 case TGSI_OPCODE_EXP:
1860 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1861 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1862 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1863 FETCH( func, *inst, 0, 0, CHAN_X );
1864 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1865 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1866 emit_MOV( func, 1, 0 );
1867 emit_flr( func, 2, 1 );
1868 /* dst.x = ex2(floor(src.x)) */
1869 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1870 emit_MOV( func, 2, 1 );
1871 emit_ex2( func, 3, 2 );
1872 STORE( func, *inst, 2, 0, CHAN_X );
1873 }
1874 /* dst.y = src.x - floor(src.x) */
1875 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1876 emit_MOV( func, 2, 0 );
1877 emit_sub( func, 2, 1 );
1878 STORE( func, *inst, 2, 0, CHAN_Y );
1879 }
1880 }
1881 /* dst.z = ex2(src.x) */
1882 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1883 emit_ex2( func, 3, 0 );
1884 STORE( func, *inst, 0, 0, CHAN_Z );
1885 }
1886 }
1887 /* dst.w = 1.0 */
1888 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1889 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1890 STORE( func, *inst, 0, 0, CHAN_W );
1891 }
1892 break;
1893
1894 case TGSI_OPCODE_LOG:
1895 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1896 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1897 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1898 FETCH( func, *inst, 0, 0, CHAN_X );
1899 emit_abs( func, 0 );
1900 emit_MOV( func, 1, 0 );
1901 emit_lg2( func, 2, 1 );
1902 /* dst.z = lg2(abs(src.x)) */
1903 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1904 STORE( func, *inst, 1, 0, CHAN_Z );
1905 }
1906 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1907 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1908 emit_flr( func, 2, 1 );
1909 /* dst.x = floor(lg2(abs(src.x))) */
1910 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1911 STORE( func, *inst, 1, 0, CHAN_X );
1912 }
1913 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1914 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1915 emit_ex2( func, 2, 1 );
1916 emit_rcp( func, 1, 1 );
1917 emit_mul( func, 0, 1 );
1918 STORE( func, *inst, 0, 0, CHAN_Y );
1919 }
1920 }
1921 }
1922 /* dst.w = 1.0 */
1923 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1924 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1925 STORE( func, *inst, 0, 0, CHAN_W );
1926 }
1927 break;
1928
1929 case TGSI_OPCODE_MUL:
1930 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1931 FETCH( func, *inst, 0, 0, chan_index );
1932 FETCH( func, *inst, 1, 1, chan_index );
1933 emit_mul( func, 0, 1 );
1934 STORE( func, *inst, 0, 0, chan_index );
1935 }
1936 break;
1937
1938 case TGSI_OPCODE_ADD:
1939 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1940 FETCH( func, *inst, 0, 0, chan_index );
1941 FETCH( func, *inst, 1, 1, chan_index );
1942 emit_add( func, 0, 1 );
1943 STORE( func, *inst, 0, 0, chan_index );
1944 }
1945 break;
1946
1947 case TGSI_OPCODE_DP3:
1948 /* TGSI_OPCODE_DOT3 */
1949 FETCH( func, *inst, 0, 0, CHAN_X );
1950 FETCH( func, *inst, 1, 1, CHAN_X );
1951 emit_mul( func, 0, 1 );
1952 FETCH( func, *inst, 1, 0, CHAN_Y );
1953 FETCH( func, *inst, 2, 1, CHAN_Y );
1954 emit_mul( func, 1, 2 );
1955 emit_add( func, 0, 1 );
1956 FETCH( func, *inst, 1, 0, CHAN_Z );
1957 FETCH( func, *inst, 2, 1, CHAN_Z );
1958 emit_mul( func, 1, 2 );
1959 emit_add( func, 0, 1 );
1960 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1961 STORE( func, *inst, 0, 0, chan_index );
1962 }
1963 break;
1964
1965 case TGSI_OPCODE_DP4:
1966 /* TGSI_OPCODE_DOT4 */
1967 FETCH( func, *inst, 0, 0, CHAN_X );
1968 FETCH( func, *inst, 1, 1, CHAN_X );
1969 emit_mul( func, 0, 1 );
1970 FETCH( func, *inst, 1, 0, CHAN_Y );
1971 FETCH( func, *inst, 2, 1, CHAN_Y );
1972 emit_mul( func, 1, 2 );
1973 emit_add( func, 0, 1 );
1974 FETCH( func, *inst, 1, 0, CHAN_Z );
1975 FETCH( func, *inst, 2, 1, CHAN_Z );
1976 emit_mul(func, 1, 2 );
1977 emit_add(func, 0, 1 );
1978 FETCH( func, *inst, 1, 0, CHAN_W );
1979 FETCH( func, *inst, 2, 1, CHAN_W );
1980 emit_mul( func, 1, 2 );
1981 emit_add( func, 0, 1 );
1982 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1983 STORE( func, *inst, 0, 0, chan_index );
1984 }
1985 break;
1986
1987 case TGSI_OPCODE_DST:
1988 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1989 emit_tempf(
1990 func,
1991 0,
1992 TEMP_ONE_I,
1993 TEMP_ONE_C );
1994 STORE( func, *inst, 0, 0, CHAN_X );
1995 }
1996 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1997 FETCH( func, *inst, 0, 0, CHAN_Y );
1998 FETCH( func, *inst, 1, 1, CHAN_Y );
1999 emit_mul( func, 0, 1 );
2000 STORE( func, *inst, 0, 0, CHAN_Y );
2001 }
2002 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2003 FETCH( func, *inst, 0, 0, CHAN_Z );
2004 STORE( func, *inst, 0, 0, CHAN_Z );
2005 }
2006 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2007 FETCH( func, *inst, 0, 1, CHAN_W );
2008 STORE( func, *inst, 0, 0, CHAN_W );
2009 }
2010 break;
2011
2012 case TGSI_OPCODE_MIN:
2013 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2014 FETCH( func, *inst, 0, 0, chan_index );
2015 FETCH( func, *inst, 1, 1, chan_index );
2016 sse_minps(
2017 func,
2018 make_xmm( 0 ),
2019 make_xmm( 1 ) );
2020 STORE( func, *inst, 0, 0, chan_index );
2021 }
2022 break;
2023
2024 case TGSI_OPCODE_MAX:
2025 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2026 FETCH( func, *inst, 0, 0, chan_index );
2027 FETCH( func, *inst, 1, 1, chan_index );
2028 sse_maxps(
2029 func,
2030 make_xmm( 0 ),
2031 make_xmm( 1 ) );
2032 STORE( func, *inst, 0, 0, chan_index );
2033 }
2034 break;
2035
2036 case TGSI_OPCODE_SLT:
2037 /* TGSI_OPCODE_SETLT */
2038 emit_setcc( func, inst, cc_LessThan );
2039 break;
2040
2041 case TGSI_OPCODE_SGE:
2042 /* TGSI_OPCODE_SETGE */
2043 emit_setcc( func, inst, cc_NotLessThan );
2044 break;
2045
2046 case TGSI_OPCODE_MAD:
2047 /* TGSI_OPCODE_MADD */
2048 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2049 FETCH( func, *inst, 0, 0, chan_index );
2050 FETCH( func, *inst, 1, 1, chan_index );
2051 FETCH( func, *inst, 2, 2, chan_index );
2052 emit_mul( func, 0, 1 );
2053 emit_add( func, 0, 2 );
2054 STORE( func, *inst, 0, 0, chan_index );
2055 }
2056 break;
2057
2058 case TGSI_OPCODE_SUB:
2059 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2060 FETCH( func, *inst, 0, 0, chan_index );
2061 FETCH( func, *inst, 1, 1, chan_index );
2062 emit_sub( func, 0, 1 );
2063 STORE( func, *inst, 0, 0, chan_index );
2064 }
2065 break;
2066
2067 case TGSI_OPCODE_LERP:
2068 /* TGSI_OPCODE_LRP */
2069 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2070 FETCH( func, *inst, 0, 0, chan_index );
2071 FETCH( func, *inst, 1, 1, chan_index );
2072 FETCH( func, *inst, 2, 2, chan_index );
2073 emit_sub( func, 1, 2 );
2074 emit_mul( func, 0, 1 );
2075 emit_add( func, 0, 2 );
2076 STORE( func, *inst, 0, 0, chan_index );
2077 }
2078 break;
2079
2080 case TGSI_OPCODE_CND:
2081 return 0;
2082 break;
2083
2084 case TGSI_OPCODE_CND0:
2085 return 0;
2086 break;
2087
2088 case TGSI_OPCODE_DOT2ADD:
2089 /* TGSI_OPCODE_DP2A */
2090 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2091 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2092 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2093 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2094 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2095 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2096 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2097 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
2098 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2099 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2100 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2101 }
2102 break;
2103
2104 case TGSI_OPCODE_INDEX:
2105 return 0;
2106 break;
2107
2108 case TGSI_OPCODE_NEGATE:
2109 return 0;
2110 break;
2111
2112 case TGSI_OPCODE_FRAC:
2113 /* TGSI_OPCODE_FRC */
2114 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2115 FETCH( func, *inst, 0, 0, chan_index );
2116 emit_frc( func, 0, 0 );
2117 STORE( func, *inst, 0, 0, chan_index );
2118 }
2119 break;
2120
2121 case TGSI_OPCODE_CLAMP:
2122 return 0;
2123 break;
2124
2125 case TGSI_OPCODE_FLOOR:
2126 /* TGSI_OPCODE_FLR */
2127 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2128 FETCH( func, *inst, 0, 0, chan_index );
2129 emit_flr( func, 0, 0 );
2130 STORE( func, *inst, 0, 0, chan_index );
2131 }
2132 break;
2133
2134 case TGSI_OPCODE_ROUND:
2135 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2136 FETCH( func, *inst, 0, 0, chan_index );
2137 emit_rnd( func, 0, 0 );
2138 STORE( func, *inst, 0, 0, chan_index );
2139 }
2140 break;
2141
2142 case TGSI_OPCODE_EXPBASE2:
2143 /* TGSI_OPCODE_EX2 */
2144 FETCH( func, *inst, 0, 0, CHAN_X );
2145 emit_ex2( func, 0, 0 );
2146 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2147 STORE( func, *inst, 0, 0, chan_index );
2148 }
2149 break;
2150
2151 case TGSI_OPCODE_LOGBASE2:
2152 /* TGSI_OPCODE_LG2 */
2153 FETCH( func, *inst, 0, 0, CHAN_X );
2154 emit_lg2( func, 0, 0 );
2155 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2156 STORE( func, *inst, 0, 0, chan_index );
2157 }
2158 break;
2159
2160 case TGSI_OPCODE_POWER:
2161 /* TGSI_OPCODE_POW */
2162 FETCH( func, *inst, 0, 0, CHAN_X );
2163 FETCH( func, *inst, 1, 1, CHAN_X );
2164 emit_pow( func, 0, 0, 0, 1 );
2165 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2166 STORE( func, *inst, 0, 0, chan_index );
2167 }
2168 break;
2169
2170 case TGSI_OPCODE_CROSSPRODUCT:
2171 /* TGSI_OPCODE_XPD */
2172 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2173 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2174 FETCH( func, *inst, 1, 1, CHAN_Z );
2175 FETCH( func, *inst, 3, 0, CHAN_Z );
2176 }
2177 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2178 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2179 FETCH( func, *inst, 0, 0, CHAN_Y );
2180 FETCH( func, *inst, 4, 1, CHAN_Y );
2181 }
2182 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2183 emit_MOV( func, 2, 0 );
2184 emit_mul( func, 2, 1 );
2185 emit_MOV( func, 5, 3 );
2186 emit_mul( func, 5, 4 );
2187 emit_sub( func, 2, 5 );
2188 STORE( func, *inst, 2, 0, CHAN_X );
2189 }
2190 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2191 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2192 FETCH( func, *inst, 2, 1, CHAN_X );
2193 FETCH( func, *inst, 5, 0, CHAN_X );
2194 }
2195 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2196 emit_mul( func, 3, 2 );
2197 emit_mul( func, 1, 5 );
2198 emit_sub( func, 3, 1 );
2199 STORE( func, *inst, 3, 0, CHAN_Y );
2200 }
2201 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2202 emit_mul( func, 5, 4 );
2203 emit_mul( func, 0, 2 );
2204 emit_sub( func, 5, 0 );
2205 STORE( func, *inst, 5, 0, CHAN_Z );
2206 }
2207 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2208 emit_tempf(
2209 func,
2210 0,
2211 TEMP_ONE_I,
2212 TEMP_ONE_C );
2213 STORE( func, *inst, 0, 0, CHAN_W );
2214 }
2215 break;
2216
2217 case TGSI_OPCODE_MULTIPLYMATRIX:
2218 return 0;
2219 break;
2220
2221 case TGSI_OPCODE_ABS:
2222 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2223 FETCH( func, *inst, 0, 0, chan_index );
2224 emit_abs( func, 0) ;
2225
2226 STORE( func, *inst, 0, 0, chan_index );
2227 }
2228 break;
2229
2230 case TGSI_OPCODE_RCC:
2231 return 0;
2232 break;
2233
2234 case TGSI_OPCODE_DPH:
2235 FETCH( func, *inst, 0, 0, CHAN_X );
2236 FETCH( func, *inst, 1, 1, CHAN_X );
2237 emit_mul( func, 0, 1 );
2238 FETCH( func, *inst, 1, 0, CHAN_Y );
2239 FETCH( func, *inst, 2, 1, CHAN_Y );
2240 emit_mul( func, 1, 2 );
2241 emit_add( func, 0, 1 );
2242 FETCH( func, *inst, 1, 0, CHAN_Z );
2243 FETCH( func, *inst, 2, 1, CHAN_Z );
2244 emit_mul( func, 1, 2 );
2245 emit_add( func, 0, 1 );
2246 FETCH( func, *inst, 1, 1, CHAN_W );
2247 emit_add( func, 0, 1 );
2248 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2249 STORE( func, *inst, 0, 0, chan_index );
2250 }
2251 break;
2252
2253 case TGSI_OPCODE_COS:
2254 FETCH( func, *inst, 0, 0, CHAN_X );
2255 emit_cos( func, 0, 0 );
2256 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2257 STORE( func, *inst, 0, 0, chan_index );
2258 }
2259 break;
2260
2261 case TGSI_OPCODE_DDX:
2262 return 0;
2263 break;
2264
2265 case TGSI_OPCODE_DDY:
2266 return 0;
2267 break;
2268
2269 case TGSI_OPCODE_KILP:
2270 /* predicated kill */
2271 emit_kilp( func );
2272 return 0; /* XXX fix me */
2273 break;
2274
2275 case TGSI_OPCODE_KIL:
2276 /* conditional kill */
2277 emit_kil( func, &inst->FullSrcRegisters[0] );
2278 break;
2279
2280 case TGSI_OPCODE_PK2H:
2281 return 0;
2282 break;
2283
2284 case TGSI_OPCODE_PK2US:
2285 return 0;
2286 break;
2287
2288 case TGSI_OPCODE_PK4B:
2289 return 0;
2290 break;
2291
2292 case TGSI_OPCODE_PK4UB:
2293 return 0;
2294 break;
2295
2296 case TGSI_OPCODE_RFL:
2297 return 0;
2298 break;
2299
2300 case TGSI_OPCODE_SEQ:
2301 return 0;
2302 break;
2303
2304 case TGSI_OPCODE_SFL:
2305 return 0;
2306 break;
2307
2308 case TGSI_OPCODE_SGT:
2309 return 0;
2310 break;
2311
2312 case TGSI_OPCODE_SIN:
2313 FETCH( func, *inst, 0, 0, CHAN_X );
2314 emit_sin( func, 0, 0 );
2315 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2316 STORE( func, *inst, 0, 0, chan_index );
2317 }
2318 break;
2319
2320 case TGSI_OPCODE_SLE:
2321 return 0;
2322 break;
2323
2324 case TGSI_OPCODE_SNE:
2325 return 0;
2326 break;
2327
2328 case TGSI_OPCODE_STR:
2329 return 0;
2330 break;
2331
2332 case TGSI_OPCODE_TEX:
2333 emit_tex( func, inst, FALSE, FALSE );
2334 break;
2335
2336 case TGSI_OPCODE_TXD:
2337 return 0;
2338 break;
2339
2340 case TGSI_OPCODE_UP2H:
2341 return 0;
2342 break;
2343
2344 case TGSI_OPCODE_UP2US:
2345 return 0;
2346 break;
2347
2348 case TGSI_OPCODE_UP4B:
2349 return 0;
2350 break;
2351
2352 case TGSI_OPCODE_UP4UB:
2353 return 0;
2354 break;
2355
2356 case TGSI_OPCODE_X2D:
2357 return 0;
2358 break;
2359
2360 case TGSI_OPCODE_ARA:
2361 return 0;
2362 break;
2363
2364 case TGSI_OPCODE_ARR:
2365 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2366 FETCH( func, *inst, 0, 0, chan_index );
2367 emit_rnd( func, 0, 0 );
2368 emit_f2it( func, 0 );
2369 STORE( func, *inst, 0, 0, chan_index );
2370 }
2371 break;
2372
2373 case TGSI_OPCODE_BRA:
2374 return 0;
2375 break;
2376
2377 case TGSI_OPCODE_CAL:
2378 return 0;
2379 break;
2380
2381 case TGSI_OPCODE_RET:
2382 emit_ret( func );
2383 break;
2384
2385 case TGSI_OPCODE_END:
2386 break;
2387
2388 case TGSI_OPCODE_SSG:
2389 /* TGSI_OPCODE_SGN */
2390 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2391 FETCH( func, *inst, 0, 0, chan_index );
2392 emit_sgn( func, 0, 0 );
2393 STORE( func, *inst, 0, 0, chan_index );
2394 }
2395 break;
2396
2397 case TGSI_OPCODE_CMP:
2398 emit_cmp (func, inst);
2399 break;
2400
2401 case TGSI_OPCODE_SCS:
2402 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2403 FETCH( func, *inst, 0, 0, CHAN_X );
2404 emit_cos( func, 0, 0 );
2405 STORE( func, *inst, 0, 0, CHAN_X );
2406 }
2407 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2408 FETCH( func, *inst, 0, 0, CHAN_X );
2409 emit_sin( func, 0, 0 );
2410 STORE( func, *inst, 0, 0, CHAN_Y );
2411 }
2412 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2413 emit_tempf(
2414 func,
2415 0,
2416 TGSI_EXEC_TEMP_00000000_I,
2417 TGSI_EXEC_TEMP_00000000_C );
2418 STORE( func, *inst, 0, 0, CHAN_Z );
2419 }
2420 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2421 emit_tempf(
2422 func,
2423 0,
2424 TEMP_ONE_I,
2425 TEMP_ONE_C );
2426 STORE( func, *inst, 0, 0, CHAN_W );
2427 }
2428 break;
2429
2430 case TGSI_OPCODE_TXB:
2431 emit_tex( func, inst, TRUE, FALSE );
2432 break;
2433
2434 case TGSI_OPCODE_NRM:
2435 /* fall-through */
2436 case TGSI_OPCODE_NRM4:
2437 /* 3 or 4-component normalization */
2438 {
2439 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2440
2441 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2442 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2443 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2444 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2445
2446 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2447
2448 /* xmm4 = src.x */
2449 /* xmm0 = src.x * src.x */
2450 FETCH(func, *inst, 0, 0, CHAN_X);
2451 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2452 emit_MOV(func, 4, 0);
2453 }
2454 emit_mul(func, 0, 0);
2455
2456 /* xmm5 = src.y */
2457 /* xmm0 = xmm0 + src.y * src.y */
2458 FETCH(func, *inst, 1, 0, CHAN_Y);
2459 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2460 emit_MOV(func, 5, 1);
2461 }
2462 emit_mul(func, 1, 1);
2463 emit_add(func, 0, 1);
2464
2465 /* xmm6 = src.z */
2466 /* xmm0 = xmm0 + src.z * src.z */
2467 FETCH(func, *inst, 1, 0, CHAN_Z);
2468 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2469 emit_MOV(func, 6, 1);
2470 }
2471 emit_mul(func, 1, 1);
2472 emit_add(func, 0, 1);
2473
2474 if (dims == 4) {
2475 /* xmm7 = src.w */
2476 /* xmm0 = xmm0 + src.w * src.w */
2477 FETCH(func, *inst, 1, 0, CHAN_W);
2478 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2479 emit_MOV(func, 7, 1);
2480 }
2481 emit_mul(func, 1, 1);
2482 emit_add(func, 0, 1);
2483 }
2484
2485 /* xmm1 = 1 / sqrt(xmm0) */
2486 emit_rsqrt(func, 1, 0);
2487
2488 /* dst.x = xmm1 * src.x */
2489 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2490 emit_mul(func, 4, 1);
2491 STORE(func, *inst, 4, 0, CHAN_X);
2492 }
2493
2494 /* dst.y = xmm1 * src.y */
2495 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2496 emit_mul(func, 5, 1);
2497 STORE(func, *inst, 5, 0, CHAN_Y);
2498 }
2499
2500 /* dst.z = xmm1 * src.z */
2501 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2502 emit_mul(func, 6, 1);
2503 STORE(func, *inst, 6, 0, CHAN_Z);
2504 }
2505
2506 /* dst.w = xmm1 * src.w */
2507 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2508 emit_mul(func, 7, 1);
2509 STORE(func, *inst, 7, 0, CHAN_W);
2510 }
2511 }
2512
2513 /* dst0.w = 1.0 */
2514 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2515 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2516 STORE(func, *inst, 0, 0, CHAN_W);
2517 }
2518 }
2519 break;
2520
2521 case TGSI_OPCODE_DIV:
2522 return 0;
2523 break;
2524
2525 case TGSI_OPCODE_DP2:
2526 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2527 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2528 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2529 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2530 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2531 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2532 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2533 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2534 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2535 }
2536 break;
2537
2538 case TGSI_OPCODE_TXL:
2539 emit_tex( func, inst, TRUE, FALSE );
2540 break;
2541
2542 case TGSI_OPCODE_TXP:
2543 emit_tex( func, inst, FALSE, TRUE );
2544 break;
2545
2546 case TGSI_OPCODE_BRK:
2547 return 0;
2548 break;
2549
2550 case TGSI_OPCODE_IF:
2551 return 0;
2552 break;
2553
2554 case TGSI_OPCODE_LOOP:
2555 return 0;
2556 break;
2557
2558 case TGSI_OPCODE_REP:
2559 return 0;
2560 break;
2561
2562 case TGSI_OPCODE_ELSE:
2563 return 0;
2564 break;
2565
2566 case TGSI_OPCODE_ENDIF:
2567 return 0;
2568 break;
2569
2570 case TGSI_OPCODE_ENDLOOP:
2571 return 0;
2572 break;
2573
2574 case TGSI_OPCODE_ENDREP:
2575 return 0;
2576 break;
2577
2578 case TGSI_OPCODE_PUSHA:
2579 return 0;
2580 break;
2581
2582 case TGSI_OPCODE_POPA:
2583 return 0;
2584 break;
2585
2586 case TGSI_OPCODE_CEIL:
2587 return 0;
2588 break;
2589
2590 case TGSI_OPCODE_I2F:
2591 return 0;
2592 break;
2593
2594 case TGSI_OPCODE_NOT:
2595 return 0;
2596 break;
2597
2598 case TGSI_OPCODE_TRUNC:
2599 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2600 FETCH( func, *inst, 0, 0, chan_index );
2601 emit_f2it( func, 0 );
2602 emit_i2f( func, 0 );
2603 STORE( func, *inst, 0, 0, chan_index );
2604 }
2605 break;
2606
2607 case TGSI_OPCODE_SHL:
2608 return 0;
2609 break;
2610
2611 case TGSI_OPCODE_SHR:
2612 return 0;
2613 break;
2614
2615 case TGSI_OPCODE_AND:
2616 return 0;
2617 break;
2618
2619 case TGSI_OPCODE_OR:
2620 return 0;
2621 break;
2622
2623 case TGSI_OPCODE_MOD:
2624 return 0;
2625 break;
2626
2627 case TGSI_OPCODE_XOR:
2628 return 0;
2629 break;
2630
2631 case TGSI_OPCODE_SAD:
2632 return 0;
2633 break;
2634
2635 case TGSI_OPCODE_TXF:
2636 return 0;
2637 break;
2638
2639 case TGSI_OPCODE_TXQ:
2640 return 0;
2641 break;
2642
2643 case TGSI_OPCODE_CONT:
2644 return 0;
2645 break;
2646
2647 case TGSI_OPCODE_EMIT:
2648 return 0;
2649 break;
2650
2651 case TGSI_OPCODE_ENDPRIM:
2652 return 0;
2653 break;
2654
2655 default:
2656 return 0;
2657 }
2658
2659 return 1;
2660 }
2661
/**
 * Emit code for a TGSI declaration.
 *
 * Only TGSI_FILE_INPUT declarations generate code here: for every
 * declared register and every channel in the usage mask, the fragment
 * input value is computed from the plane coefficients (a0, dadx, dady)
 * according to the declared interpolation mode and written to the
 * machine's input array.
 *
 * NOTE(review): the emit_tempf(func, n, 0, TGSI_SWIZZLE_*) calls below
 * read temp register 0, which presumably holds the quad's position
 * (x, y, _, w) -- confirm against the machine setup code.
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  /* input = a0 */
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  /* input = x * dadx + y * dady + a0 */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  /* input = (x * dadx + y * dady + a0) / w */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
2722
/**
 * Emit x86/SSE code that converts vertex data from AoS layout
 * (x y z w per vertex, vertices separated by \a arg_stride bytes) into
 * the SoA layout of machine->Inputs (xxxx yyyy zzzz wwww), processing
 * four vertices per attribute.
 *
 * Per iteration: movlps/movhps gather the x,y pairs of the four vertices
 * into xmm0/xmm1 and the z,w pairs into xmm3/xmm4; shufps with 0x88/0xdd
 * then separates even/odd components, completing a 4x4 transpose.
 * Stores use movups since the destination is only known to be a struct
 * member, not guaranteed 16-byte aligned.
 *
 * \param func         output buffer for the generated code
 * \param arg_aos      index of the generated function's AoS-pointer argument
 * \param arg_machine  index of the struct tgsi_exec_machine * argument
 * \param arg_num      index of the input-count argument; must be >= 1
 *                     since the generated loop is do/while style
 * \param arg_stride   index of the per-vertex byte-stride argument
 */
static void aos_to_soa( struct x86_function *func,
                        uint arg_aos,
                        uint arg_machine,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;


   /* Save EBX: callee-saved in the x86 ABI but used as scratch below. */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
   /* Point soa_input at machine->Inputs. */
   x86_lea( func, soa_input,
            x86_make_disp( soa_input,
                           Offset(struct tgsi_exec_machine, Inputs) ) );
   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
   x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Save the AoS base pointer; it is advanced by stride while
       * walking the four vertices and restored by the pop below.
       */
      x86_push( func, aos_input );
      /* Vertex 0: x,y -> low xmm0; z,w -> low xmm3. */
      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      /* Vertex 1: x,y -> high xmm0; z,w -> high xmm3. */
      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      /* Vertices 2 and 3 likewise, into xmm1/xmm4. */
      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_pop( func, aos_input );

      /* Finish the transpose:
       * 0x88 picks even elements (x or z), 0xdd picks odd (y or w), so
       * xmm0 = xxxx, xmm2 = yyyy, xmm3 = zzzz, xmm5 = wwww.
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );

      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

      /* Advance to next input: 16 bytes (one vec4) within the vertex on
       * the AoS side, 64 bytes (4 channels x 4 verts) on the SoA side.
       */
      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
   }
   /* while --num_inputs */
   x86_dec( func, num_inputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
2787
/**
 * Emit x86/SSE code performing the inverse of aos_to_soa(): convert the
 * SoA data in machine->Outputs (xxxx yyyy zzzz wwww) back into AoS
 * vertex layout, four vertices per output attribute.
 *
 * Per iteration: unpcklps/unpckhps interleave the x/y and z/w planes so
 * xmm0,xmm2 hold (x y x y) pairs and xmm3,xmm5 hold (z w z w) pairs,
 * then movlps/movhps scatter one vertex's x,y and z,w at a time, with
 * the destination pointer advanced by the vertex stride in between.
 *
 * \param func         output buffer for the generated code
 * \param arg_aos      index of the generated function's AoS-pointer argument
 * \param arg_machine  index of the struct tgsi_exec_machine * argument
 * \param arg_num      index of the output-count argument; must be >= 1
 *                     since the generated loop is do/while style
 * \param arg_stride   index of the per-vertex byte-stride argument
 */
static void soa_to_aos( struct x86_function *func,
                        uint arg_aos,
                        uint arg_machine,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;

   /* Save EBX: callee-saved in the x86 ABI but used as scratch below. */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
   /* Point soa_output at machine->Outputs. */
   x86_lea( func, soa_output,
            x86_make_disp( soa_output,
                           Offset(struct tgsi_exec_machine, Outputs) ) );
   x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Load the four component planes of this attribute. */
      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );

      /* Interleave: xmm0 = x0 y0 x1 y1, xmm2 = x2 y2 x3 y3,
       *             xmm3 = z0 w0 z1 w1, xmm5 = z2 w2 z3 w3.
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );

      /* DX holds the vertex stride; reload it each iteration since it is
       * also clobbered as scratch elsewhere.
       */
      x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
      /* Save the AoS base; it is advanced per vertex and restored by
       * the pop below.
       */
      x86_push( func, aos_output );
      /* Vertex 0: x,y then z,w. */
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      /* Vertex 1. */
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      /* Vertex 2. */
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_add( func, aos_output, temp );
      /* Vertex 3. */
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_pop( func, aos_output );

      /* Advance to next output: 16 bytes (one vec4) within the vertex on
       * the AoS side, 64 bytes (4 channels x 4 verts) on the SoA side.
       */
      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
   }
   /* while --num_outputs */
   x86_dec( func, num_outputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
2851
2852 /**
2853 * Translate a TGSI vertex/fragment shader to SSE2 code.
2854 * Slightly different things are done for vertex vs. fragment shaders.
2855 *
2856 * \param tokens the TGSI input shader
2857 * \param func the output SSE code/function
2858 * \param immediates buffer to place immediates, later passed to SSE func
2859 * \param return 1 for success, 0 if translation failed
2860 */
2861 unsigned
2862 tgsi_emit_sse2(
2863 const struct tgsi_token *tokens,
2864 struct x86_function *func,
2865 float (*immediates)[4],
2866 boolean do_swizzles )
2867 {
2868 struct tgsi_parse_context parse;
2869 unsigned ok = 1;
2870 uint num_immediates = 0;
2871
2872 util_init_math();
2873
2874 func->csr = func->store;
2875
2876 tgsi_parse_init( &parse, tokens );
2877
2878 /* Can't just use EDI, EBX without save/restoring them:
2879 */
2880 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2881 x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2882
2883 /*
2884 * Different function args for vertex/fragment shaders:
2885 */
2886 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2887 if (do_swizzles)
2888 aos_to_soa( func,
2889 4, /* aos_input */
2890 1, /* machine */
2891 5, /* num_inputs */
2892 6 ); /* input_stride */
2893 }
2894
2895 x86_mov(
2896 func,
2897 get_machine_base(),
2898 x86_fn_arg( func, 1 ) );
2899 x86_mov(
2900 func,
2901 get_const_base(),
2902 x86_fn_arg( func, 2 ) );
2903 x86_mov(
2904 func,
2905 get_immediate_base(),
2906 x86_fn_arg( func, 3 ) );
2907
2908 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2909 x86_mov(
2910 func,
2911 get_coef_base(),
2912 x86_fn_arg( func, 4 ) );
2913 }
2914
2915 x86_mov(
2916 func,
2917 get_sampler_base(),
2918 x86_make_disp( get_machine_base(),
2919 Offset( struct tgsi_exec_machine, Samplers ) ) );
2920
2921
2922 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2923 tgsi_parse_token( &parse );
2924
2925 switch( parse.FullToken.Token.Type ) {
2926 case TGSI_TOKEN_TYPE_DECLARATION:
2927 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2928 emit_declaration(
2929 func,
2930 &parse.FullToken.FullDeclaration );
2931 }
2932 break;
2933
2934 case TGSI_TOKEN_TYPE_INSTRUCTION:
2935 ok = emit_instruction(
2936 func,
2937 &parse.FullToken.FullInstruction );
2938
2939 if (!ok) {
2940 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2941 parse.FullToken.FullInstruction.Instruction.Opcode,
2942 parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2943 "vertex shader" : "fragment shader");
2944 }
2945 break;
2946
2947 case TGSI_TOKEN_TYPE_IMMEDIATE:
2948 /* simply copy the immediate values into the next immediates[] slot */
2949 {
2950 const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
2951 uint i;
2952 assert(size <= 4);
2953 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2954 for( i = 0; i < size; i++ ) {
2955 immediates[num_immediates][i] =
2956 parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
2957 }
2958 #if 0
2959 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2960 num_immediates,
2961 immediates[num_immediates][0],
2962 immediates[num_immediates][1],
2963 immediates[num_immediates][2],
2964 immediates[num_immediates][3]);
2965 #endif
2966 num_immediates++;
2967 }
2968 break;
2969
2970 default:
2971 ok = 0;
2972 assert( 0 );
2973 }
2974 }
2975
2976 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2977 if (do_swizzles)
2978 soa_to_aos( func,
2979 7, /* aos_output */
2980 1, /* machine */
2981 8, /* num_outputs */
2982 9 ); /* output_stride */
2983 }
2984
2985 /* Can't just use EBX, EDI without save/restoring them:
2986 */
2987 x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
2988 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2989
2990 emit_ret( func );
2991
2992 tgsi_parse_free( &parse );
2993
2994 return ok;
2995 }
2996
2997 #endif /* PIPE_ARCH_X86 */
2998