src/gallium/auxiliary/tgsi/tgsi_sse2.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 #include "pipe/p_config.h"
  29
  30 #if defined(PIPE_ARCH_X86)
  31
  32 #include "util/u_debug.h"
  33 #include "pipe/p_shader_tokens.h"
  34 #include "util/u_math.h"
  35 #include "util/u_memory.h"
  36 #if defined(PIPE_ARCH_SSE)
  37 #include "util/u_sse.h"
  38 #endif
  39 #include "tgsi/tgsi_info.h"
  40 #include "tgsi/tgsi_parse.h"
  41 #include "tgsi/tgsi_util.h"
  42 #include "tgsi/tgsi_dump.h"
  43 #include "tgsi/tgsi_exec.h"
  44 #include "tgsi/tgsi_sse2.h"
  45
  46 #include "rtasm/rtasm_x86sse.h"
  47
/* for 1/sqrt()
 *
 * This costs about 100fps (close to 10%) in gears:
 *
 * When non-zero, emit_rsqrt() refines the rsqrtps estimate with one
 * Newton-Raphson step instead of using the raw hardware approximation.
 */
#define HIGH_PRECISION 1

/* NOTE(review): no use of FAST_MATH is visible in this part of the file --
 * confirm whether it is still referenced further down.
 */
#define FAST_MATH 1


/* Loop header: iterates CHAN from 0 up to (but excluding) NUM_CHANNELS. */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Non-zero when bit CHAN is set in the write mask of INST's first
 * destination register.
 */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

/* 'if' header that guards the following statement with the test above. */
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Loop header that runs the following statement only for the channels
 * enabled in the dst0 write mask.  It expands to an unbraced for + if, so
 * an 'else' written directly after the body would bind to the hidden 'if'.
 */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

/* Channel indices. */
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Short aliases for the TGSI_EXEC_* temporary-slot constants. */
#define TEMP_ONE_I   TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C   TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0   TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
  82
  83
  84 /**
  85  * X86 utility functions.
  86  */
  87
  88 static struct x86_reg
  89 make_xmm(
  90    unsigned xmm )
  91 {
  92    return x86_make_reg(
  93       file_XMM,
  94       (enum x86_reg_name) xmm );
  95 }
  96
  97 /**
  98  * X86 register mapping helpers.
  99  */
 100
 101 static struct x86_reg
 102 get_const_base( void )
 103 {
 104    return x86_make_reg(
 105       file_REG32,
 106       reg_AX );
 107 }
 108
 109 static struct x86_reg
 110 get_machine_base( void )
 111 {
 112    return x86_make_reg(
 113       file_REG32,
 114       reg_CX );
 115 }
 116
 117 static struct x86_reg
 118 get_input_base( void )
 119 {
 120    return x86_make_disp(
 121       get_machine_base(),
 122       Offset(struct tgsi_exec_machine, Inputs) );
 123 }
 124
 125 static struct x86_reg
 126 get_output_base( void )
 127 {
 128    return x86_make_disp(
 129       get_machine_base(),
 130       Offset(struct tgsi_exec_machine, Outputs) );
 131 }
 132
 133 static struct x86_reg
 134 get_temp_base( void )
 135 {
 136    return x86_make_disp(
 137       get_machine_base(),
 138       Offset(struct tgsi_exec_machine, Temps) );
 139 }
 140
 141 static struct x86_reg
 142 get_coef_base( void )
 143 {
 144    return x86_make_reg(
 145       file_REG32,
 146       reg_BX );
 147 }
 148
 149 static struct x86_reg
 150 get_sampler_base( void )
 151 {
 152    return x86_make_reg(
 153       file_REG32,
 154       reg_DI );
 155 }
 156
 157 static struct x86_reg
 158 get_immediate_base( void )
 159 {
 160    return x86_make_reg(
 161       file_REG32,
 162       reg_DX );
 163 }
 164
 165
 166 /**
 167  * Data access helpers.
 168  */
 169
 170
 171 static struct x86_reg
 172 get_immediate(
 173    unsigned vec,
 174    unsigned chan )
 175 {
 176    return x86_make_disp(
 177       get_immediate_base(),
 178       (vec * 4 + chan) * 4 );
 179 }
 180
 181 static struct x86_reg
 182 get_const(
 183    unsigned vec,
 184    unsigned chan )
 185 {
 186    return x86_make_disp(
 187       get_const_base(),
 188       (vec * 4 + chan) * 4 );
 189 }
 190
 191 static struct x86_reg
 192 get_sampler_ptr(
 193    unsigned unit )
 194 {
 195    return x86_make_disp(
 196       get_sampler_base(),
 197       unit * sizeof( struct tgsi_sampler * ) );
 198 }
 199
 200 static struct x86_reg
 201 get_input(
 202    unsigned vec,
 203    unsigned chan )
 204 {
 205    return x86_make_disp(
 206       get_input_base(),
 207       (vec * 4 + chan) * 16 );
 208 }
 209
 210 static struct x86_reg
 211 get_output(
 212    unsigned vec,
 213    unsigned chan )
 214 {
 215    return x86_make_disp(
 216       get_output_base(),
 217       (vec * 4 + chan) * 16 );
 218 }
 219
 220 static struct x86_reg
 221 get_temp(
 222    unsigned vec,
 223    unsigned chan )
 224 {
 225    return x86_make_disp(
 226       get_temp_base(),
 227       (vec * 4 + chan) * 16 );
 228 }
 229
 230 static struct x86_reg
 231 get_coef(
 232    unsigned vec,
 233    unsigned chan,
 234    unsigned member )
 235 {
 236    return x86_make_disp(
 237       get_coef_base(),
 238       ((vec * 3 + member) * 4 + chan) * 4 );
 239 }
 240
 241
 242 static void
 243 emit_ret(
 244    struct x86_function  *func )
 245 {
 246    x86_ret( func );
 247 }
 248
 249
 250 /**
 251  * Data fetch helpers.
 252  */
 253
/**
 * Copy a shader constant to xmm register
 *
 * With direct addressing the single constant value is loaded and replicated
 * into the xmm register.  With indirect addressing one value is gathered per
 * quad element through the TEMP_R0 scratch slot, which is then loaded into
 * the xmm register.
 *
 * \param func  the function being assembled
 * \param xmm  the destination xmm register
 * \param vec  the src const buffer index (an offset added to the address
 *             register's value when 'indirect' is set)
 * \param chan  src channel to fetch (X, Y, Z or W)
 * \param indirect  non-zero to index the constant buffer through the
 *                  address register
 * \param indirectFile  register file of the index; must be TGSI_FILE_ADDRESS
 * \param indirectIndex  index of the address register; must be 0
 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      /* The immediate and coef base registers are borrowed as scratch; they
       * are pushed here and popped again after the loop.
       */
      struct x86_reg r0 = get_immediate_base();
      struct x86_reg r1 = get_coef_base();
      uint i;

      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );
      assert( r0.mod == mod_REG );
      assert( r1.mod == mod_REG );

      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage.  It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = address of CONST[vec].chan, the un-offset source */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          * (16 bytes is the size of one four-channel constant vector.)
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
         /* Fetch the selected constant and store it as element i of the
          * TEMP_R0 scratch vector.
          */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* Load the four gathered values into the destination register. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the constant buffer, i.e. CONST[vec] */
      assert( vec >= 0 );

      /* Load the scalar, then shuffle it into every element. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
 347
 348 static void
 349 emit_immediate(
 350    struct x86_function *func,
 351    unsigned xmm,
 352    unsigned vec,
 353    unsigned chan )
 354 {
 355    sse_movss(
 356       func,
 357       make_xmm( xmm ),
 358       get_immediate( vec, chan ) );
 359    sse_shufps(
 360       func,
 361       make_xmm( xmm ),
 362       make_xmm( xmm ),
 363       SHUF( 0, 0, 0, 0 ) );
 364 }
 365
 366
 367 /**
 368  * Copy a shader input to xmm register
 369  * \param xmm  the destination xmm register
 370  * \param vec  the src input attrib
 371  * \param chan  src channel to fetch (X, Y, Z or W)
 372  */
 373 static void
 374 emit_inputf(
 375    struct x86_function *func,
 376    unsigned xmm,
 377    unsigned vec,
 378    unsigned chan )
 379 {
 380    sse_movups(
 381       func,
 382       make_xmm( xmm ),
 383       get_input( vec, chan ) );
 384 }
 385
 386 /**
 387  * Store an xmm register to a shader output
 388  * \param xmm  the source xmm register
 389  * \param vec  the dest output attrib
 390  * \param chan  src dest channel to store (X, Y, Z or W)
 391  */
 392 static void
 393 emit_output(
 394    struct x86_function *func,
 395    unsigned xmm,
 396    unsigned vec,
 397    unsigned chan )
 398 {
 399    sse_movups(
 400       func,
 401       get_output( vec, chan ),
 402       make_xmm( xmm ) );
 403 }
 404
 405 /**
 406  * Copy a shader temporary to xmm register
 407  * \param xmm  the destination xmm register
 408  * \param vec  the src temp register
 409  * \param chan  src channel to fetch (X, Y, Z or W)
 410  */
 411 static void
 412 emit_tempf(
 413    struct x86_function *func,
 414    unsigned xmm,
 415    unsigned vec,
 416    unsigned chan )
 417 {
 418    sse_movaps(
 419       func,
 420       make_xmm( xmm ),
 421       get_temp( vec, chan ) );
 422 }
 423
 424 /**
 425  * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
 426  * \param xmm  the destination xmm register
 427  * \param vec  the src input/attribute coefficient index
 428  * \param chan  src channel to fetch (X, Y, Z or W)
 429  * \param member  0=a0, 1=dadx, 2=dady
 430  */
 431 static void
 432 emit_coef(
 433    struct x86_function *func,
 434    unsigned xmm,
 435    unsigned vec,
 436    unsigned chan,
 437    unsigned member )
 438 {
 439    sse_movss(
 440       func,
 441       make_xmm( xmm ),
 442       get_coef( vec, chan, member ) );
 443    sse_shufps(
 444       func,
 445       make_xmm( xmm ),
 446       make_xmm( xmm ),
 447       SHUF( 0, 0, 0, 0 ) );
 448 }
 449
 450 /**
 451  * Data store helpers.
 452  */
 453
 454 static void
 455 emit_inputs(
 456    struct x86_function *func,
 457    unsigned xmm,
 458    unsigned vec,
 459    unsigned chan )
 460 {
 461    sse_movups(
 462       func,
 463       get_input( vec, chan ),
 464       make_xmm( xmm ) );
 465 }
 466
 467 static void
 468 emit_temps(
 469    struct x86_function *func,
 470    unsigned xmm,
 471    unsigned vec,
 472    unsigned chan )
 473 {
 474    sse_movaps(
 475       func,
 476       get_temp( vec, chan ),
 477       make_xmm( xmm ) );
 478 }
 479
 480 static void
 481 emit_addrs(
 482    struct x86_function *func,
 483    unsigned xmm,
 484    unsigned vec,
 485    unsigned chan )
 486 {
 487    assert( vec == 0 );
 488
 489    emit_temps(
 490       func,
 491       xmm,
 492       vec + TGSI_EXEC_TEMP_ADDR,
 493       chan );
 494 }
 495
 496 /**
 497  * Coefficent fetch helpers.
 498  */
 499
 500 static void
 501 emit_coef_a0(
 502    struct x86_function *func,
 503    unsigned xmm,
 504    unsigned vec,
 505    unsigned chan )
 506 {
 507    emit_coef(
 508       func,
 509       xmm,
 510       vec,
 511       chan,
 512       0 );
 513 }
 514
 515 static void
 516 emit_coef_dadx(
 517    struct x86_function *func,
 518    unsigned xmm,
 519    unsigned vec,
 520    unsigned chan )
 521 {
 522    emit_coef(
 523       func,
 524       xmm,
 525       vec,
 526       chan,
 527       1 );
 528 }
 529
 530 static void
 531 emit_coef_dady(
 532    struct x86_function *func,
 533    unsigned xmm,
 534    unsigned vec,
 535    unsigned chan )
 536 {
 537    emit_coef(
 538       func,
 539       xmm,
 540       vec,
 541       chan,
 542       2 );
 543 }
 544
 545 /**
 546  * Function call helpers.
 547  */
 548
 549 /**
 550  * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
 551  * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
 552  * that the stack pointer is 16 byte aligned, as expected.
 553  */
 554 static void
 555 emit_func_call(
 556    struct x86_function *func,
 557    unsigned xmm_save_mask,
 558    const struct x86_reg *arg,
 559    unsigned nr_args,
 560    void (PIPE_CDECL *code)() )
 561 {
 562    struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
 563    unsigned i, n;
 564
 565    x86_push(
 566       func,
 567       x86_make_reg( file_REG32, reg_AX) );
 568    x86_push(
 569       func,
 570       x86_make_reg( file_REG32, reg_CX) );
 571    x86_push(
 572       func,
 573       x86_make_reg( file_REG32, reg_DX) );
 574
 575    /* Store XMM regs to the stack
 576     */
 577    for(i = 0, n = 0; i < 8; ++i)
 578       if(xmm_save_mask & (1 << i))
 579          ++n;
 580
 581    x86_sub_imm(
 582       func,
 583       x86_make_reg( file_REG32, reg_SP ),
 584       n*16);
 585
 586    for(i = 0, n = 0; i < 8; ++i)
 587       if(xmm_save_mask & (1 << i)) {
 588          sse_movups(
 589             func,
 590             x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
 591             make_xmm( i ) );
 592          ++n;
 593       }
 594
 595    for (i = 0; i < nr_args; i++) {
 596       /* Load the address of the buffer we use for passing arguments and
 597        * receiving results:
 598        */
 599       x86_lea(
 600          func,
 601          ecx,
 602          arg[i] );
 603
 604       /* Push actual function arguments (currently just the pointer to
 605        * the buffer above), and call the function:
 606        */
 607       x86_push( func, ecx );
 608    }
 609
 610    x86_mov_reg_imm( func, ecx, (unsigned long) code );
 611    x86_call( func, ecx );
 612
 613    /* Pop the arguments (or just add an immediate to esp)
 614     */
 615    for (i = 0; i < nr_args; i++) {
 616       x86_pop(func, ecx );
 617    }
 618
 619    /* Pop the saved XMM regs:
 620     */
 621    for(i = 0, n = 0; i < 8; ++i)
 622       if(xmm_save_mask & (1 << i)) {
 623          sse_movups(
 624             func,
 625             make_xmm( i ),
 626             x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
 627          ++n;
 628       }
 629
 630    x86_add_imm(
 631       func,
 632       x86_make_reg( file_REG32, reg_SP ),
 633       n*16);
 634
 635    /* Restore GP registers in a reverse order.
 636     */
 637    x86_pop(
 638       func,
 639       x86_make_reg( file_REG32, reg_DX) );
 640    x86_pop(
 641       func,
 642       x86_make_reg( file_REG32, reg_CX) );
 643    x86_pop(
 644       func,
 645       x86_make_reg( file_REG32, reg_AX) );
 646 }
 647
 648 static void
 649 emit_func_call_dst_src1(
 650    struct x86_function *func,
 651    unsigned xmm_save,
 652    unsigned xmm_dst,
 653    unsigned xmm_src0,
 654    void (PIPE_CDECL *code)() )
 655 {
 656    struct x86_reg store = get_temp( TEMP_R0, 0 );
 657    unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
 658
 659    /* Store our input parameters (in xmm regs) to the buffer we use
 660     * for passing arguments.  We will pass a pointer to this buffer as
 661     * the actual function argument.
 662     */
 663    sse_movaps(
 664       func,
 665       store,
 666       make_xmm( xmm_src0 ) );
 667
 668    emit_func_call( func,
 669                    xmm_mask,
 670                    &store,
 671                    1,
 672                    code );
 673
 674    sse_movaps(
 675       func,
 676       make_xmm( xmm_dst ),
 677       store );
 678 }
 679
 680
 681 static void
 682 emit_func_call_dst_src2(
 683    struct x86_function *func,
 684    unsigned xmm_save,
 685    unsigned xmm_dst,
 686    unsigned xmm_src0,
 687    unsigned xmm_src1,
 688    void (PIPE_CDECL *code)() )
 689 {
 690    struct x86_reg store = get_temp( TEMP_R0, 0 );
 691    unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
 692
 693    /* Store two inputs to parameter buffer.
 694     */
 695    sse_movaps(
 696       func,
 697       store,
 698       make_xmm( xmm_src0 ) );
 699
 700    sse_movaps(
 701       func,
 702       x86_make_disp( store, 4 * sizeof(float) ),
 703       make_xmm( xmm_src1 ) );
 704
 705
 706    /* Emit the call
 707     */
 708    emit_func_call( func,
 709                    xmm_mask,
 710                    &store,
 711                    1,
 712                    code );
 713
 714    /* Retrieve the results:
 715     */
 716    sse_movaps(
 717       func,
 718       make_xmm( xmm_dst ),
 719       store );
 720 }
 721
 722
 723
 724
 725
 726 #if defined(PIPE_ARCH_SSE)
 727
 728 /*
 729  * Fast SSE2 implementation of special math functions.
 730  */
 731
/* Packed-float polynomial evaluation in nested (Horner) form:
 *
 *    POLYn(x, c0, ..., cn) = c0 + x * (c1 + x * (... + x * cn))
 *
 * Each coefficient is broadcast with _mm_set1_ps.  POLY0 ignores x.  The
 * argument x is expanded once per level, so pass a plain variable rather
 * than an expression with side effects.
 */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/* Degree of the polynomial used by exp2f4(); 2, 3, 4 and 5 are supported. */
#define EXP_POLY_DEGREE 3
/* Degree selector for log2f4(); 3, 4, 5 and 6 are supported. */
#define LOG_POLY_DEGREE 5
 741
 742 /**
 743  * See http://www.devmaster.net/forums/showthread.php?p=43580
 744  */
 745 static INLINE __m128
 746 exp2f4(__m128 x)
 747 {
 748    __m128i ipart;
 749    __m128 fpart, expipart, expfpart;
 750
 751    x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
 752    x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
 753
 754    /* ipart = int(x - 0.5) */
 755    ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
 756
 757    /* fpart = x - ipart */
 758    fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
 759
 760    /* expipart = (float) (1 << ipart) */
 761    expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
 762
 763    /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
 764 #if EXP_POLY_DEGREE == 5
 765    expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
 766 #elif EXP_POLY_DEGREE == 4
 767    expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
 768 #elif EXP_POLY_DEGREE == 3
 769    expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
 770 #elif EXP_POLY_DEGREE == 2
 771    expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
 772 #else
 773 #error
 774 #endif
 775
 776    return _mm_mul_ps(expipart, expfpart);
 777 }
 778
 779
 780 /**
 781  * See http://www.devmaster.net/forums/showthread.php?p=43580
 782  */
 783 static INLINE __m128
 784 log2f4(__m128 x)
 785 {
 786    __m128i expmask = _mm_set1_epi32(0x7f800000);
 787    __m128i mantmask = _mm_set1_epi32(0x007fffff);
 788    __m128 one = _mm_set1_ps(1.0f);
 789
 790    __m128i i = _mm_castps_si128(x);
 791
 792    /* exp = (float) exponent(x) */
 793    __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
 794
 795    /* mant = (float) mantissa(x) */
 796    __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
 797
 798    __m128 logmant;
 799
 800    /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
 801     * These coefficients can be generate with
 802     * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 803     */
 804 #if LOG_POLY_DEGREE == 6
 805    logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
 806 #elif LOG_POLY_DEGREE == 5
 807    logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
 808 #elif LOG_POLY_DEGREE == 4
 809    logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
 810 #elif LOG_POLY_DEGREE == 3
 811    logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
 812 #else
 813 #error
 814 #endif
 815
 816    /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
 817    logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
 818
 819    return _mm_add_ps(logmant, exp);
 820 }
 821
 822
 823 static INLINE __m128
 824 powf4(__m128 x, __m128 y)
 825 {
 826    return exp2f4(_mm_mul_ps(log2f4(x), y));
 827 }
 828
 829 #endif /* PIPE_ARCH_SSE */
 830
 831
 832
 833 /**
 834  * Low-level instruction translators.
 835  */
 836
 837 static void
 838 emit_abs(
 839    struct x86_function *func,
 840    unsigned xmm )
 841 {
 842    sse_andps(
 843       func,
 844       make_xmm( xmm ),
 845       get_temp(
 846          TGSI_EXEC_TEMP_7FFFFFFF_I,
 847          TGSI_EXEC_TEMP_7FFFFFFF_C ) );
 848 }
 849
 850 static void
 851 emit_add(
 852    struct x86_function *func,
 853    unsigned xmm_dst,
 854    unsigned xmm_src )
 855 {
 856    sse_addps(
 857       func,
 858       make_xmm( xmm_dst ),
 859       make_xmm( xmm_src ) );
 860 }
 861
 862 static void PIPE_CDECL
 863 cos4f(
 864    float *store )
 865 {
 866    store[0] = cosf( store[0] );
 867    store[1] = cosf( store[1] );
 868    store[2] = cosf( store[2] );
 869    store[3] = cosf( store[3] );
 870 }
 871
 872 static void
 873 emit_cos(
 874    struct x86_function *func,
 875    unsigned xmm_save,
 876    unsigned xmm_dst )
 877 {
 878    emit_func_call_dst_src1(
 879       func,
 880       xmm_save,
 881       xmm_dst,
 882       xmm_dst,
 883       cos4f );
 884 }
 885
 886 static void PIPE_CDECL
 887 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
 888 __attribute__((force_align_arg_pointer))
 889 #endif
 890 ex24f(
 891    float *store )
 892 {
 893 #if defined(PIPE_ARCH_SSE)
 894    _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
 895 #else
 896    store[0] = util_fast_exp2( store[0] );
 897    store[1] = util_fast_exp2( store[1] );
 898    store[2] = util_fast_exp2( store[2] );
 899    store[3] = util_fast_exp2( store[3] );
 900 #endif
 901 }
 902
 903 static void
 904 emit_ex2(
 905    struct x86_function *func,
 906    unsigned xmm_save,
 907    unsigned xmm_dst )
 908 {
 909    emit_func_call_dst_src1(
 910       func,
 911       xmm_save,
 912       xmm_dst,
 913       xmm_dst,
 914       ex24f );
 915 }
 916
 917 static void
 918 emit_f2it(
 919    struct x86_function *func,
 920    unsigned xmm )
 921 {
 922    sse2_cvttps2dq(
 923       func,
 924       make_xmm( xmm ),
 925       make_xmm( xmm ) );
 926 }
 927
 928 static void
 929 emit_i2f(
 930    struct x86_function *func,
 931    unsigned xmm )
 932 {
 933    sse2_cvtdq2ps(
 934       func,
 935       make_xmm( xmm ),
 936       make_xmm( xmm ) );
 937 }
 938
 939 static void PIPE_CDECL
 940 flr4f(
 941    float *store )
 942 {
 943    store[0] = floorf( store[0] );
 944    store[1] = floorf( store[1] );
 945    store[2] = floorf( store[2] );
 946    store[3] = floorf( store[3] );
 947 }
 948
 949 static void
 950 emit_flr(
 951    struct x86_function *func,
 952    unsigned xmm_save,
 953    unsigned xmm_dst )
 954 {
 955    emit_func_call_dst_src1(
 956       func,
 957       xmm_save,
 958       xmm_dst,
 959       xmm_dst,
 960       flr4f );
 961 }
 962
 963 static void PIPE_CDECL
 964 frc4f(
 965    float *store )
 966 {
 967    store[0] -= floorf( store[0] );
 968    store[1] -= floorf( store[1] );
 969    store[2] -= floorf( store[2] );
 970    store[3] -= floorf( store[3] );
 971 }
 972
 973 static void
 974 emit_frc(
 975    struct x86_function *func,
 976    unsigned xmm_save,
 977    unsigned xmm_dst )
 978 {
 979    emit_func_call_dst_src1(
 980       func,
 981       xmm_save,
 982       xmm_dst,
 983       xmm_dst,
 984       frc4f );
 985 }
 986
 987 static void PIPE_CDECL
 988 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
 989 __attribute__((force_align_arg_pointer))
 990 #endif
 991 lg24f(
 992    float *store )
 993 {
 994 #if defined(PIPE_ARCH_SSE)
 995    _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
 996 #else
 997    store[0] = util_fast_log2( store[0] );
 998    store[1] = util_fast_log2( store[1] );
 999    store[2] = util_fast_log2( store[2] );
1000    store[3] = util_fast_log2( store[3] );
1001 #endif
1002 }
1003
1004 static void
1005 emit_lg2(
1006    struct x86_function *func,
1007    unsigned xmm_save,
1008    unsigned xmm_dst )
1009 {
1010    emit_func_call_dst_src1(
1011       func,
1012       xmm_save,
1013       xmm_dst,
1014       xmm_dst,
1015       lg24f );
1016 }
1017
1018 static void
1019 emit_MOV(
1020    struct x86_function *func,
1021    unsigned xmm_dst,
1022    unsigned xmm_src )
1023 {
1024    sse_movups(
1025       func,
1026       make_xmm( xmm_dst ),
1027       make_xmm( xmm_src ) );
1028 }
1029
1030 static void
1031 emit_mul (struct x86_function *func,
1032           unsigned xmm_dst,
1033           unsigned xmm_src)
1034 {
1035    sse_mulps(
1036       func,
1037       make_xmm( xmm_dst ),
1038       make_xmm( xmm_src ) );
1039 }
1040
1041 static void
1042 emit_neg(
1043    struct x86_function *func,
1044    unsigned xmm )
1045 {
1046    sse_xorps(
1047       func,
1048       make_xmm( xmm ),
1049       get_temp(
1050          TGSI_EXEC_TEMP_80000000_I,
1051          TGSI_EXEC_TEMP_80000000_C ) );
1052 }
1053
1054 static void PIPE_CDECL
1055 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1056 __attribute__((force_align_arg_pointer))
1057 #endif
1058 pow4f(
1059    float *store )
1060 {
1061 #if defined(PIPE_ARCH_SSE)
1062    _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
1063 #else
1064    store[0] = util_fast_pow( store[0], store[4] );
1065    store[1] = util_fast_pow( store[1], store[5] );
1066    store[2] = util_fast_pow( store[2], store[6] );
1067    store[3] = util_fast_pow( store[3], store[7] );
1068 #endif
1069 }
1070
1071 static void
1072 emit_pow(
1073    struct x86_function *func,
1074    unsigned xmm_save,
1075    unsigned xmm_dst,
1076    unsigned xmm_src0,
1077    unsigned xmm_src1 )
1078 {
1079    emit_func_call_dst_src2(
1080       func,
1081       xmm_save,
1082       xmm_dst,
1083       xmm_src0,
1084       xmm_src1,
1085       pow4f );
1086 }
1087
1088 static void
1089 emit_rcp (
1090    struct x86_function *func,
1091    unsigned xmm_dst,
1092    unsigned xmm_src )
1093 {
1094    /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1095     * good enough.  Need to either emit a proper divide or use the
1096     * iterative technique described below in emit_rsqrt().
1097     */
1098    sse2_rcpps(
1099       func,
1100       make_xmm( xmm_dst ),
1101       make_xmm( xmm_src ) );
1102 }
1103
1104 static void PIPE_CDECL
1105 rnd4f(
1106    float *store )
1107 {
1108    store[0] = floorf( store[0] + 0.5f );
1109    store[1] = floorf( store[1] + 0.5f );
1110    store[2] = floorf( store[2] + 0.5f );
1111    store[3] = floorf( store[3] + 0.5f );
1112 }
1113
1114 static void
1115 emit_rnd(
1116    struct x86_function *func,
1117    unsigned xmm_save,
1118    unsigned xmm_dst )
1119 {
1120    emit_func_call_dst_src1(
1121       func,
1122       xmm_save,
1123       xmm_dst,
1124       xmm_dst,
1125       rnd4f );
1126 }
1127
/**
 * Emit a packed reciprocal square root: xmm_dst = 1 / sqrt(xmm_src).
 *
 * With HIGH_PRECISION the emitted sequence overwrites xmm_src and uses
 * xmm2 and xmm3 as scratch registers, so xmm_dst and xmm_src must be
 * distinct and neither may be 2 or 3 (asserted below).
 *
 * \param func  the function being assembled
 * \param xmm_dst  destination xmm register
 * \param xmm_src  source xmm register (clobbered when HIGH_PRECISION is set)
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      /* With a = src and r = rsqrtps(a); the HALF and THREE temp slots are
       * presumably preloaded with 0.5 and 3.0 -- confirm in the machine
       * setup code.
       */
      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );     /* dst  = 0.5 */
      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );   /* tmp0 = 3.0 */
      sse_rsqrtps( func, tmp1, src  );   /* tmp1 = r */
      sse_mulps(   func, src,  tmp1 );   /* src  = a * r */
      sse_mulps(   func, dst,  tmp1 );   /* dst  = 0.5 * r */
      sse_mulps(   func, src,  tmp1 );   /* src  = a * r * r */
      sse_subps(   func, tmp0, src  );   /* tmp0 = 3.0 - a * r * r */
      sse_mulps(   func, dst,  tmp0 );   /* dst  = 0.5 * r * (3.0 - a * r * r) */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
1173
1174 static void
1175 emit_setsign(
1176    struct x86_function *func,
1177    unsigned xmm )
1178 {
1179    sse_orps(
1180       func,
1181       make_xmm( xmm ),
1182       get_temp(
1183          TGSI_EXEC_TEMP_80000000_I,
1184          TGSI_EXEC_TEMP_80000000_C ) );
1185 }
1186
/**
 * Replace each of four floats with its sign: -1.0 for negative values,
 * 1.0 for positive values and 0.0 otherwise.
 * \param store  buffer of four floats, updated in place
 */
static void PIPE_CDECL
sgn4f( float *store )
{
   unsigned i;

   for (i = 0; i < 4; i++) {
      const float value = store[i];
      float sign;

      if (value < 0.0f) {
         sign = -1.0f;
      }
      else if (value > 0.0f) {
         sign = 1.0f;
      }
      else {
         sign = 0.0f;
      }

      store[i] = sign;
   }
}
1196
1197 static void
1198 emit_sgn(
1199    struct x86_function *func,
1200    unsigned xmm_save,
1201    unsigned xmm_dst )
1202 {
1203    emit_func_call_dst_src1(
1204       func,
1205       xmm_save,
1206       xmm_dst,
1207       xmm_dst,
1208       sgn4f );
1209 }
1210
1211 static void PIPE_CDECL
1212 sin4f(
1213    float *store )
1214 {
1215    store[0] = sinf( store[0] );
1216    store[1] = sinf( store[1] );
1217    store[2] = sinf( store[2] );
1218    store[3] = sinf( store[3] );
1219 }
1220
1221 static void
1222 emit_sin (struct x86_function *func,
1223           unsigned xmm_save,
1224           unsigned xmm_dst)
1225 {
1226    emit_func_call_dst_src1(
1227       func,
1228       xmm_save,
1229       xmm_dst,
1230       xmm_dst,
1231       sin4f );
1232 }
1233
1234 static void
1235 emit_sub(
1236    struct x86_function *func,
1237    unsigned xmm_dst,
1238    unsigned xmm_src )
1239 {
1240    sse_subps(
1241       func,
1242       make_xmm( xmm_dst ),
1243       make_xmm( xmm_src ) );
1244 }
1245
1246
1247
1248
1249
1250
1251
1252 /**
1253  * Register fetch.
1254  */
1255
1256 static void
1257 emit_fetch(
1258    struct x86_function *func,
1259    unsigned xmm,
1260    const struct tgsi_full_src_register *reg,
1261    const unsigned chan_index )
1262 {
1263    unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1264
1265    switch (swizzle) {
1266    case TGSI_SWIZZLE_X:
1267    case TGSI_SWIZZLE_Y:
1268    case TGSI_SWIZZLE_Z:
1269    case TGSI_SWIZZLE_W:
1270       switch (reg->Register.File) {
1271       case TGSI_FILE_CONSTANT:
1272          emit_const(
1273             func,
1274             xmm,
1275             reg->Register.Index,
1276             swizzle,
1277             reg->Register.Indirect,
1278             reg->Indirect.File,
1279             reg->Indirect.Index );
1280          break;
1281
1282       case TGSI_FILE_IMMEDIATE:
1283          emit_immediate(
1284             func,
1285             xmm,
1286             reg->Register.Index,
1287             swizzle );
1288          break;
1289
1290       case TGSI_FILE_INPUT:
1291       case TGSI_FILE_SYSTEM_VALUE:
1292          emit_inputf(
1293             func,
1294             xmm,
1295             reg->Register.Index,
1296             swizzle );
1297          break;
1298
1299       case TGSI_FILE_TEMPORARY:
1300          emit_tempf(
1301             func,
1302             xmm,
1303             reg->Register.Index,
1304             swizzle );
1305          break;
1306
1307       default:
1308          assert( 0 );
1309       }
1310       break;
1311
1312    default:
1313       assert( 0 );
1314    }
1315
1316    switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1317    case TGSI_UTIL_SIGN_CLEAR:
1318       emit_abs( func, xmm );
1319       break;
1320
1321    case TGSI_UTIL_SIGN_SET:
1322       emit_setsign( func, xmm );
1323       break;
1324
1325    case TGSI_UTIL_SIGN_TOGGLE:
1326       emit_neg( func, xmm );
1327       break;
1328
1329    case TGSI_UTIL_SIGN_KEEP:
1330       break;
1331    }
1332 }
1333
/* Fetch channel CHAN of source operand INDEX of instruction INST into XMM
 * register number XMM; shorthand for emit_fetch().
 */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN ) \
   emit_fetch( (FUNC), (XMM), &(INST).Src[(INDEX)], (CHAN) )
1336
1337 /**
1338  * Register store.
1339  */
1340
1341 static void
1342 emit_store(
1343    struct x86_function *func,
1344    unsigned xmm,
1345    const struct tgsi_full_dst_register *reg,
1346    const struct tgsi_full_instruction *inst,
1347    unsigned chan_index )
1348 {
1349    switch( inst->Instruction.Saturate ) {
1350    case TGSI_SAT_NONE:
1351       break;
1352
1353    case TGSI_SAT_ZERO_ONE:
1354       sse_maxps(
1355          func,
1356          make_xmm( xmm ),
1357          get_temp(
1358             TGSI_EXEC_TEMP_00000000_I,
1359             TGSI_EXEC_TEMP_00000000_C ) );
1360
1361       sse_minps(
1362          func,
1363          make_xmm( xmm ),
1364          get_temp(
1365             TGSI_EXEC_TEMP_ONE_I,
1366             TGSI_EXEC_TEMP_ONE_C ) );
1367       break;
1368
1369    case TGSI_SAT_MINUS_PLUS_ONE:
1370       assert( 0 );
1371       break;
1372    }
1373
1374
1375    switch( reg->Register.File ) {
1376    case TGSI_FILE_OUTPUT:
1377       emit_output(
1378          func,
1379          xmm,
1380          reg->Register.Index,
1381          chan_index );
1382       break;
1383
1384    case TGSI_FILE_TEMPORARY:
1385       emit_temps(
1386          func,
1387          xmm,
1388          reg->Register.Index,
1389          chan_index );
1390       break;
1391
1392    case TGSI_FILE_ADDRESS:
1393       emit_addrs(
1394          func,
1395          xmm,
1396          reg->Register.Index,
1397          chan_index );
1398       break;
1399
1400    default:
1401       assert( 0 );
1402    }
1403 }
1404
/* Store XMM register number XMM to channel CHAN of destination operand
 * INDEX of instruction INST; shorthand for emit_store().
 */
#define STORE( FUNC, INST, XMM, INDEX, CHAN ) \
   emit_store( (FUNC), (XMM), &(INST).Dst[(INDEX)], &(INST), (CHAN) )
1407
1408
1409 static void PIPE_CDECL
1410 fetch_texel( struct tgsi_sampler **sampler,
1411              float *store )
1412 {
1413 #if 0
1414    uint j;
1415
1416    debug_printf("%s sampler: %p (%p) store: %p\n",
1417                 __FUNCTION__,
1418                 sampler, *sampler,
1419                 store );
1420
1421    debug_printf("lodbias %f\n", store[12]);
1422
1423    for (j = 0; j < 4; j++)
1424       debug_printf("sample %d texcoord %f %f\n",
1425                    j,
1426                    store[0+j],
1427                    store[4+j]);
1428 #endif
1429
1430    {
1431       float rgba[NUM_CHANNELS][QUAD_SIZE];
1432       (*sampler)->get_samples(*sampler,
1433                               &store[0],  /* s */
1434                               &store[4],  /* t */
1435                               &store[8],  /* r */
1436                               store[12],  /* lodbias */
1437                               rgba);      /* results */
1438
1439       memcpy( store, rgba, 16 * sizeof(float));
1440    }
1441
1442 #if 0
1443    for (j = 0; j < 4; j++)
1444       debug_printf("sample %d result %f %f %f %f\n",
1445                    j,
1446                    store[0+j],
1447                    store[4+j],
1448                    store[8+j],
1449                    store[12+j]);
1450 #endif
1451 }
1452
1453 /**
1454  * High-level instruction translators.
1455  */
1456
1457 static void
1458 emit_tex( struct x86_function *func,
1459           const struct tgsi_full_instruction *inst,
1460           boolean lodbias,
1461           boolean projected)
1462 {
1463    const uint unit = inst->Src[1].Register.Index;
1464    struct x86_reg args[2];
1465    unsigned count;
1466    unsigned i;
1467
1468    assert(inst->Instruction.Texture);
1469    switch (inst->Texture.Texture) {
1470    case TGSI_TEXTURE_1D:
1471       count = 1;
1472       break;
1473    case TGSI_TEXTURE_2D:
1474    case TGSI_TEXTURE_RECT:
1475       count = 2;
1476       break;
1477    case TGSI_TEXTURE_SHADOW1D:
1478    case TGSI_TEXTURE_SHADOW2D:
1479    case TGSI_TEXTURE_SHADOWRECT:
1480    case TGSI_TEXTURE_3D:
1481    case TGSI_TEXTURE_CUBE:
1482       count = 3;
1483       break;
1484    default:
1485       assert(0);
1486       return;
1487    }
1488
1489    if (lodbias) {
1490       FETCH( func, *inst, 3, 0, 3 );
1491    }
1492    else {
1493       emit_tempf(
1494          func,
1495          3,
1496          TGSI_EXEC_TEMP_00000000_I,
1497          TGSI_EXEC_TEMP_00000000_C );
1498
1499    }
1500
1501    /* store lodbias whether enabled or not -- fetch_texel currently
1502     * respects it always.
1503     */
1504    sse_movaps( func,
1505                get_temp( TEMP_R0, 3 ),
1506                make_xmm( 3 ) );
1507
1508
1509    if (projected) {
1510       FETCH( func, *inst, 3, 0, 3 );
1511
1512       emit_rcp( func, 3, 3 );
1513    }
1514
1515    for (i = 0; i < count; i++) {
1516       FETCH( func, *inst, i, 0, i );
1517
1518       if (projected) {
1519          sse_mulps(
1520             func,
1521             make_xmm( i ),
1522             make_xmm( 3 ) );
1523       }
1524
1525       /* Store in the argument buffer:
1526        */
1527       sse_movaps(
1528          func,
1529          get_temp( TEMP_R0, i ),
1530          make_xmm( i ) );
1531    }
1532
1533    args[0] = get_temp( TEMP_R0, 0 );
1534    args[1] = get_sampler_ptr( unit );
1535
1536
1537    emit_func_call( func,
1538                    0,
1539                    args,
1540                    Elements(args),
1541                    fetch_texel );
1542
1543    /* If all four channels are enabled, could use a pointer to
1544     * dst[0].x instead of TEMP_R0 for store?
1545     */
1546    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {
1547
1548       sse_movaps(
1549          func,
1550          make_xmm( 0 ),
1551          get_temp( TEMP_R0, i ) );
1552
1553       STORE( func, *inst, 0, 0, i );
1554    }
1555 }
1556
1557
1558 static void
1559 emit_kil(
1560    struct x86_function *func,
1561    const struct tgsi_full_src_register *reg )
1562 {
1563    unsigned uniquemask;
1564    unsigned unique_count = 0;
1565    unsigned chan_index;
1566    unsigned i;
1567
1568    /* This mask stores component bits that were already tested. Note that
1569     * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1570     * tested. */
1571    uniquemask = 0;
1572
1573    FOR_EACH_CHANNEL( chan_index ) {
1574       unsigned swizzle;
1575
1576       /* unswizzle channel */
1577       swizzle = tgsi_util_get_full_src_register_swizzle(
1578          reg,
1579          chan_index );
1580
1581       /* check if the component has not been already tested */
1582       if( !(uniquemask & (1 << swizzle)) ) {
1583          uniquemask |= 1 << swizzle;
1584
1585          /* allocate register */
1586          emit_fetch(
1587             func,
1588             unique_count++,
1589             reg,
1590             chan_index );
1591       }
1592    }
1593
1594    x86_push(
1595       func,
1596       x86_make_reg( file_REG32, reg_AX ) );
1597    x86_push(
1598       func,
1599       x86_make_reg( file_REG32, reg_DX ) );
1600
1601    for (i = 0 ; i < unique_count; i++ ) {
1602       struct x86_reg dataXMM = make_xmm(i);
1603
1604       sse_cmpps(
1605          func,
1606          dataXMM,
1607          get_temp(
1608             TGSI_EXEC_TEMP_00000000_I,
1609             TGSI_EXEC_TEMP_00000000_C ),
1610          cc_LessThan );
1611
1612       if( i == 0 ) {
1613          sse_movmskps(
1614             func,
1615             x86_make_reg( file_REG32, reg_AX ),
1616             dataXMM );
1617       }
1618       else {
1619          sse_movmskps(
1620             func,
1621             x86_make_reg( file_REG32, reg_DX ),
1622             dataXMM );
1623          x86_or(
1624             func,
1625             x86_make_reg( file_REG32, reg_AX ),
1626             x86_make_reg( file_REG32, reg_DX ) );
1627       }
1628    }
1629
1630    x86_or(
1631       func,
1632       get_temp(
1633          TGSI_EXEC_TEMP_KILMASK_I,
1634          TGSI_EXEC_TEMP_KILMASK_C ),
1635       x86_make_reg( file_REG32, reg_AX ) );
1636
1637    x86_pop(
1638       func,
1639       x86_make_reg( file_REG32, reg_DX ) );
1640    x86_pop(
1641       func,
1642       x86_make_reg( file_REG32, reg_AX ) );
1643 }
1644
1645
/**
 * Emit code for the predicated kill (KILP) instruction.
 *
 * Not implemented: the body is empty and nothing is emitted.  The
 * TGSI_OPCODE_KILP case in emit_instruction() calls this and then
 * returns 0.
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1652
1653
1654 static void
1655 emit_setcc(
1656    struct x86_function *func,
1657    struct tgsi_full_instruction *inst,
1658    enum sse_cc cc )
1659 {
1660    unsigned chan_index;
1661
1662    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1663       FETCH( func, *inst, 0, 0, chan_index );
1664       FETCH( func, *inst, 1, 1, chan_index );
1665       sse_cmpps(
1666          func,
1667          make_xmm( 0 ),
1668          make_xmm( 1 ),
1669          cc );
1670       sse_andps(
1671          func,
1672          make_xmm( 0 ),
1673          get_temp(
1674             TEMP_ONE_I,
1675             TEMP_ONE_C ) );
1676       STORE( func, *inst, 0, 0, chan_index );
1677    }
1678 }
1679
1680 static void
1681 emit_cmp(
1682    struct x86_function *func,
1683    struct tgsi_full_instruction *inst )
1684 {
1685    unsigned chan_index;
1686
1687    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1688       FETCH( func, *inst, 0, 0, chan_index );
1689       FETCH( func, *inst, 1, 1, chan_index );
1690       FETCH( func, *inst, 2, 2, chan_index );
1691       sse_cmpps(
1692          func,
1693          make_xmm( 0 ),
1694          get_temp(
1695             TGSI_EXEC_TEMP_00000000_I,
1696             TGSI_EXEC_TEMP_00000000_C ),
1697          cc_LessThan );
1698       sse_andps(
1699          func,
1700          make_xmm( 1 ),
1701          make_xmm( 0 ) );
1702       sse_andnps(
1703          func,
1704          make_xmm( 0 ),
1705          make_xmm( 2 ) );
1706       sse_orps(
1707          func,
1708          make_xmm( 0 ),
1709          make_xmm( 1 ) );
1710       STORE( func, *inst, 0, 0, chan_index );
1711    }
1712 }
1713
1714
1715 /**
1716  * Check if inst src/dest regs use indirect addressing into temporary
1717  * register file.
1718  */
1719 static boolean
1720 indirect_temp_reference(const struct tgsi_full_instruction *inst)
1721 {
1722    uint i;
1723    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1724       const struct tgsi_full_src_register *reg = &inst->Src[i];
1725       if (reg->Register.File == TGSI_FILE_TEMPORARY &&
1726           reg->Register.Indirect)
1727          return TRUE;
1728    }
1729    for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1730       const struct tgsi_full_dst_register *reg = &inst->Dst[i];
1731       if (reg->Register.File == TGSI_FILE_TEMPORARY &&
1732           reg->Register.Indirect)
1733          return TRUE;
1734    }
1735    return FALSE;
1736 }
1737
1738
1739 static int
1740 emit_instruction(
1741    struct x86_function *func,
1742    struct tgsi_full_instruction *inst )
1743 {
1744    unsigned chan_index;
1745
1746    /* we can't handle indirect addressing into temp register file yet */
1747    if (indirect_temp_reference(inst))
1748       return FALSE;
1749
1750    switch (inst->Instruction.Opcode) {
1751    case TGSI_OPCODE_ARL:
1752       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1753          FETCH( func, *inst, 0, 0, chan_index );
1754          emit_flr(func, 0, 0);
1755          emit_f2it( func, 0 );
1756          STORE( func, *inst, 0, 0, chan_index );
1757       }
1758       break;
1759
1760    case TGSI_OPCODE_MOV:
1761       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1762          FETCH( func, *inst, 4 + chan_index, 0, chan_index );
1763       }
1764       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1765          STORE( func, *inst, 4 + chan_index, 0, chan_index );
1766       }
1767       break;
1768
1769    case TGSI_OPCODE_LIT:
1770       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1771           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1772          emit_tempf(
1773             func,
1774             0,
1775             TEMP_ONE_I,
1776             TEMP_ONE_C);
1777          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1778             STORE( func, *inst, 0, 0, CHAN_X );
1779          }
1780          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1781             STORE( func, *inst, 0, 0, CHAN_W );
1782          }
1783       }
1784       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1785           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1786          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1787             FETCH( func, *inst, 0, 0, CHAN_X );
1788             sse_maxps(
1789                func,
1790                make_xmm( 0 ),
1791                get_temp(
1792                   TGSI_EXEC_TEMP_00000000_I,
1793                   TGSI_EXEC_TEMP_00000000_C ) );
1794             STORE( func, *inst, 0, 0, CHAN_Y );
1795          }
1796          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1797             /* XMM[1] = SrcReg[0].yyyy */
1798             FETCH( func, *inst, 1, 0, CHAN_Y );
1799             /* XMM[1] = max(XMM[1], 0) */
1800             sse_maxps(
1801                func,
1802                make_xmm( 1 ),
1803                get_temp(
1804                   TGSI_EXEC_TEMP_00000000_I,
1805                   TGSI_EXEC_TEMP_00000000_C ) );
1806             /* XMM[2] = SrcReg[0].wwww */
1807             FETCH( func, *inst, 2, 0, CHAN_W );
1808             /* XMM[2] = min(XMM[2], 128.0) */
1809             sse_minps(
1810                func,
1811                make_xmm( 2 ),
1812                get_temp(
1813                   TGSI_EXEC_TEMP_128_I,
1814                   TGSI_EXEC_TEMP_128_C ) );
1815             /* XMM[2] = max(XMM[2], -128.0) */
1816             sse_maxps(
1817                func,
1818                make_xmm( 2 ),
1819                get_temp(
1820                   TGSI_EXEC_TEMP_MINUS_128_I,
1821                   TGSI_EXEC_TEMP_MINUS_128_C ) );
1822             emit_pow( func, 3, 1, 1, 2 );
1823             FETCH( func, *inst, 0, 0, CHAN_X );
1824             sse_xorps(
1825                func,
1826                make_xmm( 2 ),
1827                make_xmm( 2 ) );
1828             sse_cmpps(
1829                func,
1830                make_xmm( 2 ),
1831                make_xmm( 0 ),
1832                cc_LessThan );
1833             sse_andps(
1834                func,
1835                make_xmm( 2 ),
1836                make_xmm( 1 ) );
1837             STORE( func, *inst, 2, 0, CHAN_Z );
1838          }
1839       }
1840       break;
1841
1842    case TGSI_OPCODE_RCP:
1843       FETCH( func, *inst, 0, 0, CHAN_X );
1844       emit_rcp( func, 0, 0 );
1845       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1846          STORE( func, *inst, 0, 0, chan_index );
1847       }
1848       break;
1849
1850    case TGSI_OPCODE_RSQ:
1851       FETCH( func, *inst, 0, 0, CHAN_X );
1852       emit_abs( func, 0 );
1853       emit_rsqrt( func, 1, 0 );
1854       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1855          STORE( func, *inst, 1, 0, chan_index );
1856       }
1857       break;
1858
1859    case TGSI_OPCODE_EXP:
1860       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1861           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1862           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1863          FETCH( func, *inst, 0, 0, CHAN_X );
1864          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1865              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1866             emit_MOV( func, 1, 0 );
1867             emit_flr( func, 2, 1 );
1868             /* dst.x = ex2(floor(src.x)) */
1869             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1870                emit_MOV( func, 2, 1 );
1871                emit_ex2( func, 3, 2 );
1872                STORE( func, *inst, 2, 0, CHAN_X );
1873             }
1874             /* dst.y = src.x - floor(src.x) */
1875             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1876                emit_MOV( func, 2, 0 );
1877                emit_sub( func, 2, 1 );
1878                STORE( func, *inst, 2, 0, CHAN_Y );
1879             }
1880          }
1881          /* dst.z = ex2(src.x) */
1882          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1883             emit_ex2( func, 3, 0 );
1884             STORE( func, *inst, 0, 0, CHAN_Z );
1885          }
1886       }
1887       /* dst.w = 1.0 */
1888       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1889          emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1890          STORE( func, *inst, 0, 0, CHAN_W );
1891       }
1892       break;
1893
1894    case TGSI_OPCODE_LOG:
1895       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1896           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1897           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1898          FETCH( func, *inst, 0, 0, CHAN_X );
1899          emit_abs( func, 0 );
1900          emit_MOV( func, 1, 0 );
1901          emit_lg2( func, 2, 1 );
1902          /* dst.z = lg2(abs(src.x)) */
1903          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1904             STORE( func, *inst, 1, 0, CHAN_Z );
1905          }
1906          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1907              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1908             emit_flr( func, 2, 1 );
1909             /* dst.x = floor(lg2(abs(src.x))) */
1910             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1911                STORE( func, *inst, 1, 0, CHAN_X );
1912             }
            /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
1914             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1915                emit_ex2( func, 2, 1 );
1916                emit_rcp( func, 1, 1 );
1917                emit_mul( func, 0, 1 );
1918                STORE( func, *inst, 0, 0, CHAN_Y );
1919             }
1920          }
1921       }
1922       /* dst.w = 1.0 */
1923       if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1924          emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1925          STORE( func, *inst, 0, 0, CHAN_W );
1926       }
1927       break;
1928
1929    case TGSI_OPCODE_MUL:
1930       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1931          FETCH( func, *inst, 0, 0, chan_index );
1932          FETCH( func, *inst, 1, 1, chan_index );
1933          emit_mul( func, 0, 1 );
1934          STORE( func, *inst, 0, 0, chan_index );
1935       }
1936       break;
1937
1938    case TGSI_OPCODE_ADD:
1939       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1940          FETCH( func, *inst, 0, 0, chan_index );
1941          FETCH( func, *inst, 1, 1, chan_index );
1942          emit_add( func, 0, 1 );
1943          STORE( func, *inst, 0, 0, chan_index );
1944       }
1945       break;
1946
1947    case TGSI_OPCODE_DP3:
1948       FETCH( func, *inst, 0, 0, CHAN_X );
1949       FETCH( func, *inst, 1, 1, CHAN_X );
1950       emit_mul( func, 0, 1 );
1951       FETCH( func, *inst, 1, 0, CHAN_Y );
1952       FETCH( func, *inst, 2, 1, CHAN_Y );
1953       emit_mul( func, 1, 2 );
1954       emit_add( func, 0, 1 );
1955       FETCH( func, *inst, 1, 0, CHAN_Z );
1956       FETCH( func, *inst, 2, 1, CHAN_Z );
1957       emit_mul( func, 1, 2 );
1958       emit_add( func, 0, 1 );
1959       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1960          STORE( func, *inst, 0, 0, chan_index );
1961       }
1962       break;
1963
1964    case TGSI_OPCODE_DP4:
1965       FETCH( func, *inst, 0, 0, CHAN_X );
1966       FETCH( func, *inst, 1, 1, CHAN_X );
1967       emit_mul( func, 0, 1 );
1968       FETCH( func, *inst, 1, 0, CHAN_Y );
1969       FETCH( func, *inst, 2, 1, CHAN_Y );
1970       emit_mul( func, 1, 2 );
1971       emit_add( func, 0, 1 );
1972       FETCH( func, *inst, 1, 0, CHAN_Z );
1973       FETCH( func, *inst, 2, 1, CHAN_Z );
1974       emit_mul(func, 1, 2 );
1975       emit_add(func, 0, 1 );
1976       FETCH( func, *inst, 1, 0, CHAN_W );
1977       FETCH( func, *inst, 2, 1, CHAN_W );
1978       emit_mul( func, 1, 2 );
1979       emit_add( func, 0, 1 );
1980       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1981          STORE( func, *inst, 0, 0, chan_index );
1982       }
1983       break;
1984
1985    case TGSI_OPCODE_DST:
1986       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1987          emit_tempf(
1988             func,
1989             0,
1990             TEMP_ONE_I,
1991             TEMP_ONE_C );
1992          STORE( func, *inst, 0, 0, CHAN_X );
1993       }
1994       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1995          FETCH( func, *inst, 0, 0, CHAN_Y );
1996          FETCH( func, *inst, 1, 1, CHAN_Y );
1997          emit_mul( func, 0, 1 );
1998          STORE( func, *inst, 0, 0, CHAN_Y );
1999       }
2000       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2001          FETCH( func, *inst, 0, 0, CHAN_Z );
2002          STORE( func, *inst, 0, 0, CHAN_Z );
2003       }
2004       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2005          FETCH( func, *inst, 0, 1, CHAN_W );
2006          STORE( func, *inst, 0, 0, CHAN_W );
2007       }
2008       break;
2009
2010    case TGSI_OPCODE_MIN:
2011       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2012          FETCH( func, *inst, 0, 0, chan_index );
2013          FETCH( func, *inst, 1, 1, chan_index );
2014          sse_minps(
2015             func,
2016             make_xmm( 0 ),
2017             make_xmm( 1 ) );
2018          STORE( func, *inst, 0, 0, chan_index );
2019       }
2020       break;
2021
2022    case TGSI_OPCODE_MAX:
2023       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2024          FETCH( func, *inst, 0, 0, chan_index );
2025          FETCH( func, *inst, 1, 1, chan_index );
2026          sse_maxps(
2027             func,
2028             make_xmm( 0 ),
2029             make_xmm( 1 ) );
2030          STORE( func, *inst, 0, 0, chan_index );
2031       }
2032       break;
2033
2034    case TGSI_OPCODE_SLT:
2035       emit_setcc( func, inst, cc_LessThan );
2036       break;
2037
2038    case TGSI_OPCODE_SGE:
2039       emit_setcc( func, inst, cc_NotLessThan );
2040       break;
2041
2042    case TGSI_OPCODE_MAD:
2043       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2044          FETCH( func, *inst, 0, 0, chan_index );
2045          FETCH( func, *inst, 1, 1, chan_index );
2046          FETCH( func, *inst, 2, 2, chan_index );
2047          emit_mul( func, 0, 1 );
2048          emit_add( func, 0, 2 );
2049          STORE( func, *inst, 0, 0, chan_index );
2050       }
2051       break;
2052
2053    case TGSI_OPCODE_SUB:
2054       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2055          FETCH( func, *inst, 0, 0, chan_index );
2056          FETCH( func, *inst, 1, 1, chan_index );
2057          emit_sub( func, 0, 1 );
2058          STORE( func, *inst, 0, 0, chan_index );
2059       }
2060       break;
2061
2062    case TGSI_OPCODE_LRP:
2063       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2064          FETCH( func, *inst, 0, 0, chan_index );
2065          FETCH( func, *inst, 1, 1, chan_index );
2066          FETCH( func, *inst, 2, 2, chan_index );
2067          emit_sub( func, 1, 2 );
2068          emit_mul( func, 0, 1 );
2069          emit_add( func, 0, 2 );
2070          STORE( func, *inst, 0, 0, chan_index );
2071       }
2072       break;
2073
2074    case TGSI_OPCODE_CND:
2075       return 0;
2076       break;
2077
2078    case TGSI_OPCODE_DP2A:
2079       FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
2080       FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
2081       emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
2082       FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
2083       FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
2084       emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
2085       emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
2086       FETCH( func, *inst, 1, 2, CHAN_X );  /* xmm1 = src[2].x */
2087       emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
2088       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2089          STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
2090       }
2091       break;
2092
2093    case TGSI_OPCODE_FRC:
2094       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2095          FETCH( func, *inst, 0, 0, chan_index );
2096          emit_frc( func, 0, 0 );
2097          STORE( func, *inst, 0, 0, chan_index );
2098       }
2099       break;
2100
2101    case TGSI_OPCODE_CLAMP:
2102       return 0;
2103       break;
2104
2105    case TGSI_OPCODE_FLR:
2106       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2107          FETCH( func, *inst, 0, 0, chan_index );
2108          emit_flr( func, 0, 0 );
2109          STORE( func, *inst, 0, 0, chan_index );
2110       }
2111       break;
2112
2113    case TGSI_OPCODE_ROUND:
2114       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2115          FETCH( func, *inst, 0, 0, chan_index );
2116          emit_rnd( func, 0, 0 );
2117          STORE( func, *inst, 0, 0, chan_index );
2118       }
2119       break;
2120
2121    case TGSI_OPCODE_EX2:
2122       FETCH( func, *inst, 0, 0, CHAN_X );
2123       emit_ex2( func, 0, 0 );
2124       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2125          STORE( func, *inst, 0, 0, chan_index );
2126       }
2127       break;
2128
2129    case TGSI_OPCODE_LG2:
2130       FETCH( func, *inst, 0, 0, CHAN_X );
2131       emit_lg2( func, 0, 0 );
2132       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2133          STORE( func, *inst, 0, 0, chan_index );
2134       }
2135       break;
2136
2137    case TGSI_OPCODE_POW:
2138       FETCH( func, *inst, 0, 0, CHAN_X );
2139       FETCH( func, *inst, 1, 1, CHAN_X );
2140       emit_pow( func, 0, 0, 0, 1 );
2141       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2142          STORE( func, *inst, 0, 0, chan_index );
2143       }
2144       break;
2145
2146    case TGSI_OPCODE_XPD:
2147       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2148           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2149          FETCH( func, *inst, 1, 1, CHAN_Z );
2150          FETCH( func, *inst, 3, 0, CHAN_Z );
2151       }
2152       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2153           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2154          FETCH( func, *inst, 0, 0, CHAN_Y );
2155          FETCH( func, *inst, 4, 1, CHAN_Y );
2156       }
2157       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2158          emit_MOV( func, 2, 0 );
2159          emit_mul( func, 2, 1 );
2160          emit_MOV( func, 5, 3 );
2161          emit_mul( func, 5, 4 );
2162          emit_sub( func, 2, 5 );
2163          STORE( func, *inst, 2, 0, CHAN_X );
2164       }
2165       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2166           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2167          FETCH( func, *inst, 2, 1, CHAN_X );
2168          FETCH( func, *inst, 5, 0, CHAN_X );
2169       }
2170       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2171          emit_mul( func, 3, 2 );
2172          emit_mul( func, 1, 5 );
2173          emit_sub( func, 3, 1 );
2174          STORE( func, *inst, 3, 0, CHAN_Y );
2175       }
2176       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2177          emit_mul( func, 5, 4 );
2178          emit_mul( func, 0, 2 );
2179          emit_sub( func, 5, 0 );
2180          STORE( func, *inst, 5, 0, CHAN_Z );
2181       }
2182       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2183          emit_tempf(
2184             func,
2185             0,
2186             TEMP_ONE_I,
2187             TEMP_ONE_C );
2188          STORE( func, *inst, 0, 0, CHAN_W );
2189       }
2190       break;
2191
2192    case TGSI_OPCODE_ABS:
2193       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2194          FETCH( func, *inst, 0, 0, chan_index );
2195          emit_abs( func, 0) ;
2196
2197          STORE( func, *inst, 0, 0, chan_index );
2198       }
2199       break;
2200
2201    case TGSI_OPCODE_RCC:
2202       return 0;
2203       break;
2204
2205    case TGSI_OPCODE_DPH:
2206       FETCH( func, *inst, 0, 0, CHAN_X );
2207       FETCH( func, *inst, 1, 1, CHAN_X );
2208       emit_mul( func, 0, 1 );
2209       FETCH( func, *inst, 1, 0, CHAN_Y );
2210       FETCH( func, *inst, 2, 1, CHAN_Y );
2211       emit_mul( func, 1, 2 );
2212       emit_add( func, 0, 1 );
2213       FETCH( func, *inst, 1, 0, CHAN_Z );
2214       FETCH( func, *inst, 2, 1, CHAN_Z );
2215       emit_mul( func, 1, 2 );
2216       emit_add( func, 0, 1 );
2217       FETCH( func, *inst, 1, 1, CHAN_W );
2218       emit_add( func, 0, 1 );
2219       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2220          STORE( func, *inst, 0, 0, chan_index );
2221       }
2222       break;
2223
2224    case TGSI_OPCODE_COS:
2225       FETCH( func, *inst, 0, 0, CHAN_X );
2226       emit_cos( func, 0, 0 );
2227       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2228          STORE( func, *inst, 0, 0, chan_index );
2229       }
2230       break;
2231
2232    case TGSI_OPCODE_DDX:
2233       return 0;
2234       break;
2235
2236    case TGSI_OPCODE_DDY:
2237       return 0;
2238       break;
2239
2240    case TGSI_OPCODE_KILP:
2241       /* predicated kill */
2242       emit_kilp( func );
2243       return 0; /* XXX fix me */
2244       break;
2245
2246    case TGSI_OPCODE_KIL:
2247       /* conditional kill */
2248       emit_kil( func, &inst->Src[0] );
2249       break;
2250
2251    case TGSI_OPCODE_PK2H:
2252       return 0;
2253       break;
2254
2255    case TGSI_OPCODE_PK2US:
2256       return 0;
2257       break;
2258
2259    case TGSI_OPCODE_PK4B:
2260       return 0;
2261       break;
2262
2263    case TGSI_OPCODE_PK4UB:
2264       return 0;
2265       break;
2266
2267    case TGSI_OPCODE_RFL:
2268       return 0;
2269       break;
2270
2271    case TGSI_OPCODE_SEQ:
2272       emit_setcc( func, inst, cc_Equal );
2273       break;
2274
2275    case TGSI_OPCODE_SFL:
2276       return 0;
2277       break;
2278
2279    case TGSI_OPCODE_SGT:
2280       emit_setcc( func, inst, cc_NotLessThanEqual );
2281       break;
2282
2283    case TGSI_OPCODE_SIN:
2284       FETCH( func, *inst, 0, 0, CHAN_X );
2285       emit_sin( func, 0, 0 );
2286       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2287          STORE( func, *inst, 0, 0, chan_index );
2288       }
2289       break;
2290
2291    case TGSI_OPCODE_SLE:
2292       emit_setcc( func, inst, cc_LessThanEqual );
2293       break;
2294
2295    case TGSI_OPCODE_SNE:
2296       emit_setcc( func, inst, cc_NotEqual );
2297       break;
2298
2299    case TGSI_OPCODE_STR:
2300       return 0;
2301       break;
2302
2303    case TGSI_OPCODE_TEX:
2304       emit_tex( func, inst, FALSE, FALSE );
2305       break;
2306
2307    case TGSI_OPCODE_TXD:
2308       return 0;
2309       break;
2310
2311    case TGSI_OPCODE_UP2H:
2312       return 0;
2313       break;
2314
2315    case TGSI_OPCODE_UP2US:
2316       return 0;
2317       break;
2318
2319    case TGSI_OPCODE_UP4B:
2320       return 0;
2321       break;
2322
2323    case TGSI_OPCODE_UP4UB:
2324       return 0;
2325       break;
2326
2327    case TGSI_OPCODE_X2D:
2328       return 0;
2329       break;
2330
2331    case TGSI_OPCODE_ARA:
2332       return 0;
2333       break;
2334
2335    case TGSI_OPCODE_ARR:
2336       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2337          FETCH( func, *inst, 0, 0, chan_index );
2338          emit_rnd( func, 0, 0 );
2339          emit_f2it( func, 0 );
2340          STORE( func, *inst, 0, 0, chan_index );
2341       }
2342       break;
2343
2344    case TGSI_OPCODE_BRA:
2345       return 0;
2346       break;
2347
2348    case TGSI_OPCODE_CAL:
2349       return 0;
2350       break;
2351
2352    case TGSI_OPCODE_RET:
2353       emit_ret( func );
2354       break;
2355
2356    case TGSI_OPCODE_END:
2357       break;
2358
2359    case TGSI_OPCODE_SSG:
2360       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2361          FETCH( func, *inst, 0, 0, chan_index );
2362          emit_sgn( func, 0, 0 );
2363          STORE( func, *inst, 0, 0, chan_index );
2364       }
2365       break;
2366
2367    case TGSI_OPCODE_CMP:
2368       emit_cmp (func, inst);
2369       break;
2370
2371    case TGSI_OPCODE_SCS:
2372       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2373          FETCH( func, *inst, 0, 0, CHAN_X );
2374          emit_cos( func, 0, 0 );
2375          STORE( func, *inst, 0, 0, CHAN_X );
2376       }
2377       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2378          FETCH( func, *inst, 0, 0, CHAN_X );
2379          emit_sin( func, 0, 0 );
2380          STORE( func, *inst, 0, 0, CHAN_Y );
2381       }
2382       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2383          emit_tempf(
2384             func,
2385             0,
2386             TGSI_EXEC_TEMP_00000000_I,
2387             TGSI_EXEC_TEMP_00000000_C );
2388          STORE( func, *inst, 0, 0, CHAN_Z );
2389       }
2390       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2391          emit_tempf(
2392             func,
2393             0,
2394             TEMP_ONE_I,
2395             TEMP_ONE_C );
2396          STORE( func, *inst, 0, 0, CHAN_W );
2397       }
2398       break;
2399
2400    case TGSI_OPCODE_TXB:
2401       emit_tex( func, inst, TRUE, FALSE );
2402       break;
2403
2404    case TGSI_OPCODE_NRM:
2405       /* fall-through */
2406    case TGSI_OPCODE_NRM4:
2407       /* 3 or 4-component normalization */
2408       {
2409          uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2410
2411          if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2412              IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2413              IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2414              (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2415
2416             /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2417
2418             /* xmm4 = src.x */
2419             /* xmm0 = src.x * src.x */
2420             FETCH(func, *inst, 0, 0, CHAN_X);
2421             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2422                emit_MOV(func, 4, 0);
2423             }
2424             emit_mul(func, 0, 0);
2425
2426             /* xmm5 = src.y */
2427             /* xmm0 = xmm0 + src.y * src.y */
2428             FETCH(func, *inst, 1, 0, CHAN_Y);
2429             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2430                emit_MOV(func, 5, 1);
2431             }
2432             emit_mul(func, 1, 1);
2433             emit_add(func, 0, 1);
2434
2435             /* xmm6 = src.z */
2436             /* xmm0 = xmm0 + src.z * src.z */
2437             FETCH(func, *inst, 1, 0, CHAN_Z);
2438             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2439                emit_MOV(func, 6, 1);
2440             }
2441             emit_mul(func, 1, 1);
2442             emit_add(func, 0, 1);
2443
2444             if (dims == 4) {
2445                /* xmm7 = src.w */
2446                /* xmm0 = xmm0 + src.w * src.w */
2447                FETCH(func, *inst, 1, 0, CHAN_W);
2448                if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2449                   emit_MOV(func, 7, 1);
2450                }
2451                emit_mul(func, 1, 1);
2452                emit_add(func, 0, 1);
2453             }
2454
2455             /* xmm1 = 1 / sqrt(xmm0) */
2456             emit_rsqrt(func, 1, 0);
2457
2458             /* dst.x = xmm1 * src.x */
2459             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2460                emit_mul(func, 4, 1);
2461                STORE(func, *inst, 4, 0, CHAN_X);
2462             }
2463
2464             /* dst.y = xmm1 * src.y */
2465             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2466                emit_mul(func, 5, 1);
2467                STORE(func, *inst, 5, 0, CHAN_Y);
2468             }
2469
2470             /* dst.z = xmm1 * src.z */
2471             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2472                emit_mul(func, 6, 1);
2473                STORE(func, *inst, 6, 0, CHAN_Z);
2474             }
2475
2476             /* dst.w = xmm1 * src.w */
2477             if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2478                emit_mul(func, 7, 1);
2479                STORE(func, *inst, 7, 0, CHAN_W);
2480             }
2481          }
2482
2483          /* dst0.w = 1.0 */
2484          if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2485             emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2486             STORE(func, *inst, 0, 0, CHAN_W);
2487          }
2488       }
2489       break;
2490
2491    case TGSI_OPCODE_DIV:
2492       return 0;
2493       break;
2494
2495    case TGSI_OPCODE_DP2:
2496       FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
2497       FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
2498       emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
2499       FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
2500       FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
2501       emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
2502       emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
2503       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2504          STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
2505       }
2506       break;
2507
2508    case TGSI_OPCODE_TXL:
2509       emit_tex( func, inst, TRUE, FALSE );
2510       break;
2511
2512    case TGSI_OPCODE_TXP:
2513       emit_tex( func, inst, FALSE, TRUE );
2514       break;
2515
2516    case TGSI_OPCODE_BRK:
2517       return 0;
2518       break;
2519
2520    case TGSI_OPCODE_IF:
2521       return 0;
2522       break;
2523
2524    case TGSI_OPCODE_BGNFOR:
2525       return 0;
2526       break;
2527
2528    case TGSI_OPCODE_REP:
2529       return 0;
2530       break;
2531
2532    case TGSI_OPCODE_ELSE:
2533       return 0;
2534       break;
2535
2536    case TGSI_OPCODE_ENDIF:
2537       return 0;
2538       break;
2539
2540    case TGSI_OPCODE_ENDFOR:
2541       return 0;
2542       break;
2543
2544    case TGSI_OPCODE_ENDREP:
2545       return 0;
2546       break;
2547
2548    case TGSI_OPCODE_PUSHA:
2549       return 0;
2550       break;
2551
2552    case TGSI_OPCODE_POPA:
2553       return 0;
2554       break;
2555
2556    case TGSI_OPCODE_CEIL:
2557       return 0;
2558       break;
2559
2560    case TGSI_OPCODE_I2F:
2561       return 0;
2562       break;
2563
2564    case TGSI_OPCODE_NOT:
2565       return 0;
2566       break;
2567
2568    case TGSI_OPCODE_TRUNC:
2569       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2570          FETCH( func, *inst, 0, 0, chan_index );
2571          emit_f2it( func, 0 );
2572          emit_i2f( func, 0 );
2573          STORE( func, *inst, 0, 0, chan_index );
2574       }
2575       break;
2576
2577    case TGSI_OPCODE_SHL:
2578       return 0;
2579       break;
2580
2581    case TGSI_OPCODE_SHR:
2582       return 0;
2583       break;
2584
2585    case TGSI_OPCODE_AND:
2586       return 0;
2587       break;
2588
2589    case TGSI_OPCODE_OR:
2590       return 0;
2591       break;
2592
2593    case TGSI_OPCODE_MOD:
2594       return 0;
2595       break;
2596
2597    case TGSI_OPCODE_XOR:
2598       return 0;
2599       break;
2600
2601    case TGSI_OPCODE_SAD:
2602       return 0;
2603       break;
2604
2605    case TGSI_OPCODE_TXF:
2606       return 0;
2607       break;
2608
2609    case TGSI_OPCODE_TXQ:
2610       return 0;
2611       break;
2612
2613    case TGSI_OPCODE_CONT:
2614       return 0;
2615       break;
2616
2617    case TGSI_OPCODE_EMIT:
2618       return 0;
2619       break;
2620
2621    case TGSI_OPCODE_ENDPRIM:
2622       return 0;
2623       break;
2624
2625    default:
2626       return 0;
2627    }
2628
2629    return 1;
2630 }
2631
2632 static void
2633 emit_declaration(
2634    struct x86_function *func,
2635    struct tgsi_full_declaration *decl )
2636 {
2637    if( decl->Declaration.File == TGSI_FILE_INPUT ||
2638        decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE ) {
2639       unsigned first, last, mask;
2640       unsigned i, j;
2641
2642       first = decl->Range.First;
2643       last = decl->Range.Last;
2644       mask = decl->Declaration.UsageMask;
2645
2646       for( i = first; i <= last; i++ ) {
2647          for( j = 0; j < NUM_CHANNELS; j++ ) {
2648             if( mask & (1 << j) ) {
2649                switch( decl->Declaration.Interpolate ) {
2650                case TGSI_INTERPOLATE_CONSTANT:
2651                   emit_coef_a0( func, 0, i, j );
2652                   emit_inputs( func, 0, i, j );
2653                   break;
2654
2655                case TGSI_INTERPOLATE_LINEAR:
2656                   emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2657                   emit_coef_dadx( func, 1, i, j );
2658                   emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2659                   emit_coef_dady( func, 3, i, j );
2660                   emit_mul( func, 0, 1 );    /* x * dadx */
2661                   emit_coef_a0( func, 4, i, j );
2662                   emit_mul( func, 2, 3 );    /* y * dady */
2663                   emit_add( func, 0, 4 );    /* x * dadx + a0 */
2664                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
2665                   emit_inputs( func, 0, i, j );
2666                   break;
2667
2668                case TGSI_INTERPOLATE_PERSPECTIVE:
2669                   emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2670                   emit_coef_dadx( func, 1, i, j );
2671                   emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2672                   emit_coef_dady( func, 3, i, j );
2673                   emit_mul( func, 0, 1 );    /* x * dadx */
2674                   emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2675                   emit_coef_a0( func, 5, i, j );
2676                   emit_rcp( func, 4, 4 );    /* 1.0 / w */
2677                   emit_mul( func, 2, 3 );    /* y * dady */
2678                   emit_add( func, 0, 5 );    /* x * dadx + a0 */
2679                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
2680                   emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
2681                   emit_inputs( func, 0, i, j );
2682                   break;
2683
2684                default:
2685                   assert( 0 );
2686                   break;
2687                }
2688             }
2689          }
2690       }
2691    }
2692 }
2693
2694 static void aos_to_soa( struct x86_function *func,
2695                         uint arg_aos,
2696                         uint arg_machine,
2697                         uint arg_num,
2698                         uint arg_stride )
2699 {
2700    struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2701    struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2702    struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2703    struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2704    int inner_loop;
2705
2706
2707    /* Save EBX */
2708    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2709
2710    x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
2711    x86_mov( func, soa_input,  x86_fn_arg( func, arg_machine ) );
2712    x86_lea( func, soa_input,
2713             x86_make_disp( soa_input,
2714                            Offset(struct tgsi_exec_machine, Inputs) ) );
2715    x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2716    x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
2717
2718    /* do */
2719    inner_loop = x86_get_label( func );
2720    {
2721       x86_push( func, aos_input );
2722       sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2723       sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2724       x86_add( func, aos_input, stride );
2725       sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2726       sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2727       x86_add( func, aos_input, stride );
2728       sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2729       sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2730       x86_add( func, aos_input, stride );
2731       sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2732       sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2733       x86_pop( func, aos_input );
2734
2735       sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2736       sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2737       sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2738       sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2739       sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2740       sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2741
2742       sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2743       sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2744       sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2745       sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2746
2747       /* Advance to next input */
2748       x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2749       x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2750    }
2751    /* while --num_inputs */
2752    x86_dec( func, num_inputs );
2753    x86_jcc( func, cc_NE, inner_loop );
2754
2755    /* Restore EBX */
2756    x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2757 }
2758
2759 static void soa_to_aos( struct x86_function *func,
2760                         uint arg_aos,
2761                         uint arg_machine,
2762                         uint arg_num,
2763                         uint arg_stride )
2764 {
2765    struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2766    struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2767    struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2768    struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2769    int inner_loop;
2770
2771    /* Save EBX */
2772    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2773
2774    x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2775    x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2776    x86_lea( func, soa_output,
2777             x86_make_disp( soa_output,
2778                            Offset(struct tgsi_exec_machine, Outputs) ) );
2779    x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2780
2781    /* do */
2782    inner_loop = x86_get_label( func );
2783    {
2784       sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2785       sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2786       sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2787       sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2788
2789       sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2790       sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2791       sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2792       sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2793       sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2794       sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2795
2796       x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2797       x86_push( func, aos_output );
2798       sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2799       sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2800       x86_add( func, aos_output, temp );
2801       sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2802       sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2803       x86_add( func, aos_output, temp );
2804       sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2805       sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2806       x86_add( func, aos_output, temp );
2807       sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2808       sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2809       x86_pop( func, aos_output );
2810
2811       /* Advance to next output */
2812       x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2813       x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2814    }
2815    /* while --num_outputs */
2816    x86_dec( func, num_outputs );
2817    x86_jcc( func, cc_NE, inner_loop );
2818
2819    /* Restore EBX */
2820    x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2821 }
2822
2823 /**
2824  * Translate a TGSI vertex/fragment shader to SSE2 code.
2825  * Slightly different things are done for vertex vs. fragment shaders.
2826  *
2827  * \param tokens  the TGSI input shader
2828  * \param func  the output SSE code/function
2829  * \param immediates  buffer to place immediates, later passed to SSE func
2830  * \param return  1 for success, 0 if translation failed
2831  */
2832 unsigned
2833 tgsi_emit_sse2(
2834    const struct tgsi_token *tokens,
2835    struct x86_function *func,
2836    float (*immediates)[4],
2837    boolean do_swizzles )
2838 {
2839    struct tgsi_parse_context parse;
2840    unsigned ok = 1;
2841    uint num_immediates = 0;
2842
2843    util_init_math();
2844
2845    func->csr = func->store;
2846
2847    tgsi_parse_init( &parse, tokens );
2848
2849    /* Can't just use EDI, EBX without save/restoring them:
2850     */
2851    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2852    x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2853
2854    /*
2855     * Different function args for vertex/fragment shaders:
2856     */
2857    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2858       if (do_swizzles)
2859          aos_to_soa( func,
2860                      4,         /* aos_input */
2861                      1,         /* machine */
2862                      5,         /* num_inputs */
2863                      6 );       /* input_stride */
2864    }
2865
2866    x86_mov(
2867       func,
2868       get_machine_base(),
2869       x86_fn_arg( func, 1 ) );
2870    x86_mov(
2871       func,
2872       get_const_base(),
2873       x86_fn_arg( func, 2 ) );
2874    x86_mov(
2875       func,
2876       get_immediate_base(),
2877       x86_fn_arg( func, 3 ) );
2878
2879    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2880       x86_mov(
2881          func,
2882          get_coef_base(),
2883          x86_fn_arg( func, 4 ) );
2884    }
2885
2886    x86_mov(
2887       func,
2888       get_sampler_base(),
2889       x86_make_disp( get_machine_base(),
2890                      Offset( struct tgsi_exec_machine, Samplers ) ) );
2891
2892
2893    while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2894       tgsi_parse_token( &parse );
2895
2896       switch( parse.FullToken.Token.Type ) {
2897       case TGSI_TOKEN_TYPE_DECLARATION:
2898          if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2899             emit_declaration(
2900                func,
2901                &parse.FullToken.FullDeclaration );
2902          }
2903          break;
2904
2905       case TGSI_TOKEN_TYPE_INSTRUCTION:
2906          ok = emit_instruction(
2907             func,
2908             &parse.FullToken.FullInstruction );
2909
2910          if (!ok) {
2911             uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
2912             debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
2913                          opcode,
2914                          tgsi_get_opcode_name(opcode),
2915                          parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2916                          "vertex shader" : "fragment shader");
2917          }
2918
2919          if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) {
2920             uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
2921
2922             /* XXX: we only handle src/dst aliasing in a few opcodes
2923              * currently.  Need to use an additional temporay to hold
2924              * the result in the cases where the code is too opaque to
2925              * fix.
2926              */
2927             if (opcode != TGSI_OPCODE_MOV) {
2928                debug_printf("Warning: src/dst aliasing in instruction"
2929                             " is not handled:\n");
2930                tgsi_dump_instruction(&parse.FullToken.FullInstruction, 1);
2931             }
2932          }
2933          break;
2934
2935       case TGSI_TOKEN_TYPE_IMMEDIATE:
2936          /* simply copy the immediate values into the next immediates[] slot */
2937          {
2938             const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
2939             uint i;
2940             assert(size <= 4);
2941             assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2942             for( i = 0; i < size; i++ ) {
2943                immediates[num_immediates][i] =
2944                   parse.FullToken.FullImmediate.u[i].Float;
2945             }
2946 #if 0
2947             debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2948                    num_immediates,
2949                    immediates[num_immediates][0],
2950                    immediates[num_immediates][1],
2951                    immediates[num_immediates][2],
2952                    immediates[num_immediates][3]);
2953 #endif
2954             num_immediates++;
2955          }
2956          break;
2957       case TGSI_TOKEN_TYPE_PROPERTY:
2958          /* we just ignore them for now */
2959          break;
2960
2961       default:
2962          ok = 0;
2963          assert( 0 );
2964       }
2965    }
2966
2967    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2968       if (do_swizzles)
2969          soa_to_aos( func,
2970                      7,         /* aos_output */
2971                      1,         /* machine */
2972                      8,         /* num_outputs */
2973                      9 );       /* output_stride */
2974    }
2975
2976    /* Can't just use EBX, EDI without save/restoring them:
2977     */
2978    x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
2979    x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2980
2981    emit_ret( func );
2982
2983    tgsi_parse_free( &parse );
2984
2985    return ok;
2986 }
2987
2988 #endif /* PIPE_ARCH_X86 */
2989