[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
/**************************************************************************
 *
 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
 * All Rights Reserved.
 * Copyright 2009-2010 VMware, Inc. All rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "pipe/p_config.h"

#if defined(PIPE_ARCH_X86)

#include "util/u_debug.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#if defined(PIPE_ARCH_SSE)
#include "util/u_sse.h"
#endif
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_exec.h"
#include "tgsi/tgsi_sse2.h"

#include "rtasm/rtasm_x86sse.h"

/* HIGH_PRECISION: use a more accurate 1/sqrt().
 *
 * Enabling this costs about 100 fps (close to 10%) in gears:
 */
#define HIGH_PRECISION 1

#define FAST_MATH 1


#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C


/**
 * X86 utility functions.
 */

static struct x86_reg
make_xmm(
   unsigned xmm )
{
   return x86_make_reg(
      file_XMM,
      (enum x86_reg_name) xmm );
}

/**
 * X86 register mapping helpers.
 */

static struct x86_reg
get_const_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_AX );
}

static struct x86_reg
get_machine_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_CX );
}

static struct x86_reg
get_input_base( void )
{
   return x86_make_disp(
      get_machine_base(),
      Offset(struct tgsi_exec_machine, Inputs) );
}

static struct x86_reg
get_output_base( void )
{
   return x86_make_disp(
      get_machine_base(),
      Offset(struct tgsi_exec_machine, Outputs) );
}

static struct x86_reg
get_temp_base( void )
{
   return x86_make_disp(
      get_machine_base(),
      Offset(struct tgsi_exec_machine, Temps) );
}

static struct x86_reg
get_coef_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_BX );
}

static struct x86_reg
get_sampler_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_DI );
}

static struct x86_reg
get_immediate_base( void )
{
   return x86_make_reg(
      file_REG32,
      reg_DX );
}


/**
 * Data access helpers.
 */


static struct x86_reg
get_immediate(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_immediate_base(),
      (vec * 4 + chan) * 4 );
}

static struct x86_reg
get_const(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_const_base(),
      (vec * 4 + chan) * 4 );
}

static struct x86_reg
get_sampler_ptr(
   unsigned unit )
{
   return x86_make_disp(
      get_sampler_base(),
      unit * sizeof( struct tgsi_sampler * ) );
}

static struct x86_reg
get_input(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_input_base(),
      (vec * 4 + chan) * 16 );
}

static struct x86_reg
get_output(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_output_base(),
      (vec * 4 + chan) * 16 );
}

static struct x86_reg
get_temp(
   unsigned vec,
   unsigned chan )
{
   return x86_make_disp(
      get_temp_base(),
      (vec * 4 + chan) * 16 );
}

static struct x86_reg
get_coef(
   unsigned vec,
   unsigned chan,
   unsigned member )
{
   return x86_make_disp(
      get_coef_base(),
      ((vec * 3 + member) * 4 + chan) * 4 );
}
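
/* Note: the displacement above assumes the tgsi_interp_coef layout --
 * for each input, three float[4] members in the order a0, dadx, dady,
 * hence ((vec * 3 + member) * 4 + chan) floats from the coef base.
 */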


static void
emit_ret(
   struct x86_function *func )
{
   x86_ret( func );
}


/**
 * Data fetch helpers.
 */

/**
 * Copy a shader constant to xmm register
 * \param xmm the destination xmm register
 * \param vec the src const buffer index
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_immediate_base();
      struct x86_reg r1 = get_coef_base();
      uint i;

      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );
      assert( r0.mod == mod_REG );
      assert( r1.mod == mod_REG );

      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage. It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      x86_pop( func, r1 );
      x86_pop( func, r0 );

      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the constant buffer, i.e. CONST[vec] */
      assert( vec >= 0 );

      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
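
/* For reference, a scalar sketch of what the indirect path above computes
 * for each quad element. Illustrative only: this helper is not part of
 * the JIT, and the field accesses assume the tgsi_exec.h layout.
 */
#if 0
static void
indirect_const_fetch_example( const struct tgsi_exec_machine *mach,
                              const float *consts,
                              int vec,
                              uint chan )
{
   uint i;
   for (i = 0; i < QUAD_SIZE; i++) {
      int addr = mach->Temps[TEMP_ADDR].xyzw[CHAN_X].i[i];
      int mask = mach->Temps[TEMP_EXEC_MASK_I].xyzw[TEMP_EXEC_MASK_C].i[i];
      /* ANDing with the exec mask keeps dead channels from producing
       * garbage indexes:
       */
      mach->Temps[TEMP_R0].xyzw[CHAN_X].f[i] =
         consts[((addr & mask) + vec) * 4 + chan];
   }
}
#endif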

static void
emit_immediate(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movss(
      func,
      make_xmm( xmm ),
      get_immediate( vec, chan ) );
   sse_shufps(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ),
      SHUF( 0, 0, 0, 0 ) );
}


/**
 * Copy a shader input to xmm register
 * \param xmm the destination xmm register
 * \param vec the src input attrib
 * \param chan src channel to fetch (X, Y, Z or W)
 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups(
      func,
      make_xmm( xmm ),
      get_input( vec, chan ) );
}

/**
 * Store an xmm register to a shader output
 * \param xmm the source xmm register
 * \param vec the dest output attrib
 * \param chan the dest channel to store (X, Y, Z or W)
392 */
393 static void
394 emit_output(
395 struct x86_function *func,
396 unsigned xmm,
397 unsigned vec,
398 unsigned chan )
399 {
400 sse_movups(
401 func,
402 get_output( vec, chan ),
403 make_xmm( xmm ) );
404 }
405
406 /**
407 * Copy a shader temporary to xmm register
408 * \param xmm the destination xmm register
409 * \param vec the src temp register
410 * \param chan src channel to fetch (X, Y, Z or W)
411 */
412 static void
413 emit_tempf(
414 struct x86_function *func,
415 unsigned xmm,
416 unsigned vec,
417 unsigned chan )
418 {
419 sse_movaps(
420 func,
421 make_xmm( xmm ),
422 get_temp( vec, chan ) );
423 }
424
425 /**
426 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
427 * \param xmm the destination xmm register
428 * \param vec the src input/attribute coefficient index
429 * \param chan src channel to fetch (X, Y, Z or W)
430 * \param member 0=a0, 1=dadx, 2=dady
431 */
432 static void
433 emit_coef(
434 struct x86_function *func,
435 unsigned xmm,
436 unsigned vec,
437 unsigned chan,
438 unsigned member )
439 {
440 sse_movss(
441 func,
442 make_xmm( xmm ),
443 get_coef( vec, chan, member ) );
444 sse_shufps(
445 func,
446 make_xmm( xmm ),
447 make_xmm( xmm ),
448 SHUF( 0, 0, 0, 0 ) );
449 }
450
451 /**
452 * Data store helpers.
453 */
454
455 static void
456 emit_inputs(
457 struct x86_function *func,
458 unsigned xmm,
459 unsigned vec,
460 unsigned chan )
461 {
462 sse_movups(
463 func,
464 get_input( vec, chan ),
465 make_xmm( xmm ) );
466 }
467
468 static void
469 emit_temps(
470 struct x86_function *func,
471 unsigned xmm,
472 unsigned vec,
473 unsigned chan )
474 {
475 sse_movaps(
476 func,
477 get_temp( vec, chan ),
478 make_xmm( xmm ) );
479 }
480
481 static void
482 emit_addrs(
483 struct x86_function *func,
484 unsigned xmm,
485 unsigned vec,
486 unsigned chan )
487 {
488 assert( vec == 0 );
489
490 emit_temps(
491 func,
492 xmm,
493 vec + TGSI_EXEC_TEMP_ADDR,
494 chan );
495 }
496
497 /**
 * Coefficient fetch helpers.
 */

static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef(
      func,
      xmm,
      vec,
      chan,
      0 );
}

static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef(
      func,
      xmm,
      vec,
      chan,
      1 );
}

static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef(
      func,
      xmm,
      vec,
      chan,
      2 );
}

/**
 * Function call helpers.
 */

/**
 * NOTE: In gcc, if the called function uses SSE intrinsics, then it must be
 * defined with __attribute__((force_align_arg_pointer)), because we do not
 * guarantee that the stack pointer is 16-byte aligned on entry, as would
 * normally be expected.
 */
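/* For example, ex24f() below is declared following this rule:
 *
 *    static void PIPE_CDECL
 *    __attribute__((force_align_arg_pointer))   (gcc + SSE builds only)
 *    ex24f( float *store );
 */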
static void
emit_func_call(
   struct x86_function *func,
   unsigned xmm_save_mask,
   const struct x86_reg *arg,
   unsigned nr_args,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
   unsigned i, n;

   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX) );

   /* Store XMM regs to the stack
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i))
         ++n;

   x86_sub_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
            make_xmm( i ) );
         ++n;
      }

   for (i = 0; i < nr_args; i++) {
      /* Load the address of the buffer we use for passing arguments and
       * receiving results:
       */
      x86_lea(
         func,
         ecx,
         arg[i] );

      /* Push actual function arguments (currently just the pointer to
       * the buffer above), and call the function:
       */
      x86_push( func, ecx );
   }

   x86_mov_reg_imm( func, ecx, (unsigned long) code );
   x86_call( func, ecx );

   /* Pop the arguments (or just add an immediate to esp)
    */
   for (i = 0; i < nr_args; i++) {
      x86_pop(func, ecx );
   }

   /* Pop the saved XMM regs:
    */
   for(i = 0, n = 0; i < 8; ++i)
      if(xmm_save_mask & (1 << i)) {
         sse_movups(
            func,
            make_xmm( i ),
            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
         ++n;
      }

   x86_add_imm(
      func,
      x86_make_reg( file_REG32, reg_SP ),
      n*16);

   /* Restore the GP registers in reverse order.
    */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_CX) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX) );
}
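
/* The code generated above is roughly:
 *
 *    push eax; push ecx; push edx
 *    sub  esp, n*16              ; make room for the saved xmm regs
 *    movups [esp + k*16], xmm_i  ; for each bit set in xmm_save_mask
 *    lea  ecx, [arg_i]           ; for each argument
 *    push ecx
 *    mov  ecx, code
 *    call ecx
 *    pop  ecx                    ; cdecl: the caller pops the arguments
 *    movups xmm_i, [esp + k*16]
 *    add  esp, n*16
 *    pop  edx; pop ecx; pop eax
 */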

static void
emit_func_call_dst_src1(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst,
   unsigned xmm_src0,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg store = get_temp( TEMP_R0, 0 );
   unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);

   /* Store our input parameters (in xmm regs) to the buffer we use
    * for passing arguments. We will pass a pointer to this buffer as
    * the actual function argument.
    */
   sse_movaps(
      func,
      store,
      make_xmm( xmm_src0 ) );

   emit_func_call( func,
                   xmm_mask,
                   &store,
                   1,
                   code );

   sse_movaps(
      func,
      make_xmm( xmm_dst ),
      store );
}


static void
emit_func_call_dst_src2(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst,
   unsigned xmm_src0,
   unsigned xmm_src1,
   void (PIPE_CDECL *code)() )
{
   struct x86_reg store = get_temp( TEMP_R0, 0 );
   unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);

   /* Store two inputs to parameter buffer.
    */
   sse_movaps(
      func,
      store,
      make_xmm( xmm_src0 ) );

   sse_movaps(
      func,
      x86_make_disp( store, 4 * sizeof(float) ),
      make_xmm( xmm_src1 ) );


   /* Emit the call
    */
   emit_func_call( func,
                   xmm_mask,
                   &store,
                   1,
                   code );

   /* Retrieve the results:
    */
   sse_movaps(
      func,
      make_xmm( xmm_dst ),
      store );
}





#if defined(PIPE_ARCH_SSE)

/*
 * Fast SSE2 implementation of special math functions.
 */

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
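
/* Each POLYn macro expands to a Horner-scheme evaluation of a degree-n
 * polynomial, e.g. POLY2(x, c0, c1, c2) == c0 + x*(c1 + x*c2).
 */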

#define EXP_POLY_DEGREE 3
#define LOG_POLY_DEGREE 5

/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
exp2f4(__m128 x)
{
   __m128i ipart;
   __m128 fpart, expipart, expfpart;

   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));

   /* ipart = int(x - 0.5) */
   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));

   /* fpart = x - ipart */
   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

   /* expipart = (float) (1 << ipart) */
   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));

   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
#elif EXP_POLY_DEGREE == 4
   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
#elif EXP_POLY_DEGREE == 3
   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
#elif EXP_POLY_DEGREE == 2
   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
#else
#error
#endif

   return _mm_mul_ps(expipart, expfpart);
}


/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 */
static INLINE __m128
log2f4(__m128 x)
{
   __m128i expmask = _mm_set1_epi32(0x7f800000);
   __m128i mantmask = _mm_set1_epi32(0x007fffff);
   __m128 one = _mm_set1_ps(1.0f);

   __m128i i = _mm_castps_si128(x);

   /* exp = (float) exponent(x) */
   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));

   /* mant = (float) mantissa(x) */
   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);

   __m128 logmant;

   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
    * These coefficients can be generated with
    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
    */
#if LOG_POLY_DEGREE == 6
   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
#elif LOG_POLY_DEGREE == 5
   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
#elif LOG_POLY_DEGREE == 4
   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
#elif LOG_POLY_DEGREE == 3
   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
#else
#error
#endif

   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));

   return _mm_add_ps(logmant, exp);
}


static INLINE __m128
powf4(__m128 x, __m128 y)
{
   return exp2f4(_mm_mul_ps(log2f4(x), y));
}
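
/* powf4() relies on the identity x**y == 2**(y * log2(x)), which is only
 * meaningful for x > 0 -- the domain POW is normally specified for.
 */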

#endif /* PIPE_ARCH_SSE */



/**
 * Low-level instruction translators.
 */

static void
emit_abs(
   struct x86_function *func,
   unsigned xmm )
{
   sse_andps(
      func,
      make_xmm( xmm ),
      get_temp(
         TGSI_EXEC_TEMP_7FFFFFFF_I,
         TGSI_EXEC_TEMP_7FFFFFFF_C ) );
}

static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

static void PIPE_CDECL
cos4f(
   float *store )
{
   store[0] = cosf( store[0] );
   store[1] = cosf( store[1] );
   store[2] = cosf( store[2] );
   store[3] = cosf( store[3] );
}

static void
emit_cos(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      cos4f );
}

static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
ex24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_exp2( store[0] );
   store[1] = util_fast_exp2( store[1] );
   store[2] = util_fast_exp2( store[2] );
   store[3] = util_fast_exp2( store[3] );
#endif
}
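
/* Note the SSE path above uses aligned loads/stores: 'store' points at
 * TEMP_R0 in the machine's temp file, which is expected to be 16-byte
 * aligned.
 */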

static void
emit_ex2(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      ex24f );
}

static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ) );
}

static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps(
      func,
      make_xmm( xmm ),
      make_xmm( xmm ) );
}

static void PIPE_CDECL
flr4f(
   float *store )
{
   store[0] = floorf( store[0] );
   store[1] = floorf( store[1] );
   store[2] = floorf( store[2] );
   store[3] = floorf( store[3] );
}

static void
emit_flr(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      flr4f );
}

static void PIPE_CDECL
frc4f(
   float *store )
{
   store[0] -= floorf( store[0] );
   store[1] -= floorf( store[1] );
   store[2] -= floorf( store[2] );
   store[3] -= floorf( store[3] );
}

static void
emit_frc(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      frc4f );
}

static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
lg24f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
#else
   store[0] = util_fast_log2( store[0] );
   store[1] = util_fast_log2( store[1] );
   store[2] = util_fast_log2( store[2] );
   store[3] = util_fast_log2( store[3] );
#endif
}

static void
emit_lg2(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      lg24f );
}

static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

static void
emit_mul (struct x86_function *func,
          unsigned xmm_dst,
          unsigned xmm_src)
{
   sse_mulps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

static void
emit_neg(
   struct x86_function *func,
   unsigned xmm )
{
   sse_xorps(
      func,
      make_xmm( xmm ),
      get_temp(
         TGSI_EXEC_TEMP_80000000_I,
         TGSI_EXEC_TEMP_80000000_C ) );
}

static void PIPE_CDECL
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
__attribute__((force_align_arg_pointer))
#endif
pow4f(
   float *store )
{
#if defined(PIPE_ARCH_SSE)
   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
#else
   store[0] = util_fast_pow( store[0], store[4] );
   store[1] = util_fast_pow( store[1], store[5] );
   store[2] = util_fast_pow( store[2], store[6] );
   store[3] = util_fast_pow( store[3], store[7] );
#endif
}

static void
emit_pow(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst,
   unsigned xmm_src0,
   unsigned xmm_src1 )
{
   emit_func_call_dst_src2(
      func,
      xmm_save,
      xmm_dst,
      xmm_src0,
      xmm_src1,
      pow4f );
}

static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough. Need to either emit a proper divide or use the
    * iterative technique described below in emit_rsqrt().
    */
   sse2_rcpps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}

static void PIPE_CDECL
rnd4f(
   float *store )
{
   store[0] = floorf( store[0] + 0.5f );
   store[1] = floorf( store[1] + 0.5f );
   store[2] = floorf( store[2] + 0.5f );
   store[3] = floorf( store[3] + 0.5f );
}

static void
emit_rnd(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      rnd4f );
}

static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve their precision at
    * fairly low cost, using a Newton-Raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a)) * rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
      sse_rsqrtps( func, tmp1, src );
      sse_mulps( func, src, tmp1 );
      sse_mulps( func, dst, tmp1 );
      sse_mulps( func, src, tmp1 );
      sse_subps( func, tmp0, src );
      sse_mulps( func, dst, tmp0 );
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
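
/* Scalar equivalent of the high-precision path above, for reference:
 *
 *    float r = rsqrt_approx(a);
 *    return 0.5f * r * (3.0f - a * r * r);
 */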

static void
emit_setsign(
   struct x86_function *func,
   unsigned xmm )
{
   sse_orps(
      func,
      make_xmm( xmm ),
      get_temp(
         TGSI_EXEC_TEMP_80000000_I,
         TGSI_EXEC_TEMP_80000000_C ) );
}

static void PIPE_CDECL
sgn4f(
   float *store )
{
   store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
   store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
   store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
   store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
}

static void
emit_sgn(
   struct x86_function *func,
   unsigned xmm_save,
   unsigned xmm_dst )
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      sgn4f );
}

static void PIPE_CDECL
sin4f(
   float *store )
{
   store[0] = sinf( store[0] );
   store[1] = sinf( store[1] );
   store[2] = sinf( store[2] );
   store[3] = sinf( store[3] );
}

static void
emit_sin (struct x86_function *func,
          unsigned xmm_save,
          unsigned xmm_dst)
{
   emit_func_call_dst_src1(
      func,
      xmm_save,
      xmm_dst,
      xmm_dst,
      sin4f );
}

static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
}







/**
 * Register fetch.
 */

static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_SWIZZLE_X:
   case TGSI_SWIZZLE_Y:
   case TGSI_SWIZZLE_Z:
   case TGSI_SWIZZLE_W:
      switch (reg->Register.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->Register.Index,
            swizzle,
            reg->Register.Indirect,
            reg->Indirect.File,
            reg->Indirect.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->Register.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
      case TGSI_FILE_SYSTEM_VALUE:
         emit_inputf(
            func,
            xmm,
            reg->Register.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->Register.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   default:
      assert( 0 );
   }

   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}

#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).Src[INDEX], CHAN )
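
/* E.g. FETCH( func, *inst, 0, 0, CHAN_X ) loads the X channel of src
 * register 0 (after swizzling and any abs/negate modifier) into xmm0.
 */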

/**
 * Register store.
 */

static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      sse_maxps(
         func,
         make_xmm( xmm ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ) );

      sse_minps(
         func,
         make_xmm( xmm ),
         get_temp(
            TGSI_EXEC_TEMP_ONE_I,
            TGSI_EXEC_TEMP_ONE_C ) );
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }


   switch( reg->Register.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->Register.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->Register.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->Register.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }
}

#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).Dst[INDEX], &(INST), CHAN )


static void PIPE_CDECL
fetch_texel( struct tgsi_sampler **sampler,
             float *store )
{
#if 0
   uint j;

   debug_printf("%s sampler: %p (%p) store: %p\n",
                __FUNCTION__,
                sampler, *sampler,
                store );

   for (j = 0; j < 4; j++)
      debug_printf("sample %d texcoord %f %f %f lodbias %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8 + j],
                   store[12 + j]);
#endif

   {
      float rgba[NUM_CHANNELS][QUAD_SIZE];
      (*sampler)->get_samples(*sampler,
                              &store[0],  /* s */
                              &store[4],  /* t */
                              &store[8],  /* r */
                              &store[12], /* lodbias */
                              tgsi_sampler_lod_bias,
                              rgba);      /* results */

      memcpy( store, rgba, 16 * sizeof(float));
   }

#if 0
   for (j = 0; j < 4; j++)
      debug_printf("sample %d result %f %f %f %f\n",
                   j,
                   store[0+j],
                   store[4+j],
                   store[8+j],
                   store[12+j]);
#endif
}
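
/* ABI of the call above: on entry store[0..3] = s, store[4..7] = t,
 * store[8..11] = r and store[12..15] = lodbias, one value per quad
 * element; on exit the same 16 floats hold the RGBA results, SOA order.
 */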

/**
 * High-level instruction translators.
 */

static void
emit_tex( struct x86_function *func,
          const struct tgsi_full_instruction *inst,
          boolean lodbias,
          boolean projected)
{
   const uint unit = inst->Src[1].Register.Index;
   struct x86_reg args[2];
   unsigned count;
   unsigned i;

   assert(inst->Instruction.Texture);
   switch (inst->Texture.Texture) {
   case TGSI_TEXTURE_1D:
      count = 1;
      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
      count = 2;
      break;
   case TGSI_TEXTURE_SHADOW1D:
   case TGSI_TEXTURE_SHADOW2D:
   case TGSI_TEXTURE_SHADOWRECT:
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
      count = 3;
      break;
   default:
      assert(0);
      return;
   }

   if (lodbias) {
      FETCH( func, *inst, 3, 0, 3 );
   }
   else {
      emit_tempf(
         func,
         3,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );

   }

   /* Store the lodbias whether enabled or not -- fetch_texel currently
    * always respects it.
    */
   sse_movaps( func,
               get_temp( TEMP_R0, 3 ),
               make_xmm( 3 ) );


   if (projected) {
      FETCH( func, *inst, 3, 0, 3 );

      emit_rcp( func, 3, 3 );
   }

   for (i = 0; i < count; i++) {
      FETCH( func, *inst, i, 0, i );

      if (projected) {
         sse_mulps(
            func,
            make_xmm( i ),
            make_xmm( 3 ) );
      }

      /* Store in the argument buffer:
       */
      sse_movaps(
         func,
         get_temp( TEMP_R0, i ),
         make_xmm( i ) );
   }

   args[0] = get_temp( TEMP_R0, 0 );
   args[1] = get_sampler_ptr( unit );


   emit_func_call( func,
                   0,
                   args,
                   Elements(args),
                   fetch_texel );

   /* If all four channels are enabled, could use a pointer to
    * dst[0].x instead of TEMP_R0 for store?
    */
   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {

      sse_movaps(
         func,
         make_xmm( 0 ),
         get_temp( TEMP_R0, i ) );

      STORE( func, *inst, 0, 0, i );
   }
}


static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not be
    * tested. */
   uniquemask = 0;

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_swizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      if( i == 0 ) {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
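
/* E.g. for KIL with a .xxyy swizzle only the X and Y components are
 * fetched and tested, each exactly once, thanks to uniquemask above.
 */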


static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}


static void
emit_setcc(
   struct x86_function *func,
   struct tgsi_full_instruction *inst,
   enum sse_cc cc )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      sse_cmpps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ),
         cc );
      sse_andps(
         func,
         make_xmm( 0 ),
         get_temp(
            TEMP_ONE_I,
            TEMP_ONE_C ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}

static void
emit_cmp(
   struct x86_function *func,
   struct tgsi_full_instruction *inst )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      FETCH( func, *inst, 2, 2, chan_index );
      sse_cmpps(
         func,
         make_xmm( 0 ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );
      sse_andps(
         func,
         make_xmm( 1 ),
         make_xmm( 0 ) );
      sse_andnps(
         func,
         make_xmm( 0 ),
         make_xmm( 2 ) );
      sse_orps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}


/**
 * Check if inst src/dest regs use indirect addressing into temporary
 * register file.
 */
static boolean
indirect_temp_reference(const struct tgsi_full_instruction *inst)
{
   uint i;
   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
      const struct tgsi_full_src_register *reg = &inst->Src[i];
      if (reg->Register.File == TGSI_FILE_TEMPORARY &&
          reg->Register.Indirect)
         return TRUE;
   }
   for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
      const struct tgsi_full_dst_register *reg = &inst->Dst[i];
      if (reg->Register.File == TGSI_FILE_TEMPORARY &&
          reg->Register.Indirect)
         return TRUE;
   }
   return FALSE;
}


static int
emit_instruction(
   struct x86_function *func,
   struct tgsi_full_instruction *inst )
{
   unsigned chan_index;

   /* we can't handle indirect addressing into temp register file yet */
   if (indirect_temp_reference(inst))
      return FALSE;

   switch (inst->Instruction.Opcode) {
   case TGSI_OPCODE_ARL:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_flr(func, 0, 0);
         emit_f2it( func, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_MOV:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 4 + chan_index, 0, chan_index );
      }
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 4 + chan_index, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_LIT:
      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
         emit_tempf(
            func,
            0,
            TEMP_ONE_I,
            TEMP_ONE_C);
         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
            STORE( func, *inst, 0, 0, CHAN_X );
         }
         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
            STORE( func, *inst, 0, 0, CHAN_W );
         }
      }
      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
            FETCH( func, *inst, 0, 0, CHAN_X );
            sse_maxps(
               func,
               make_xmm( 0 ),
               get_temp(
                  TGSI_EXEC_TEMP_00000000_I,
                  TGSI_EXEC_TEMP_00000000_C ) );
            STORE( func, *inst, 0, 0, CHAN_Y );
         }
         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
            /* XMM[1] = SrcReg[0].yyyy */
            FETCH( func, *inst, 1, 0, CHAN_Y );
            /* XMM[1] = max(XMM[1], 0) */
            sse_maxps(
               func,
               make_xmm( 1 ),
               get_temp(
                  TGSI_EXEC_TEMP_00000000_I,
                  TGSI_EXEC_TEMP_00000000_C ) );
            /* XMM[2] = SrcReg[0].wwww */
            FETCH( func, *inst, 2, 0, CHAN_W );
            /* XMM[2] = min(XMM[2], 128.0) */
            sse_minps(
               func,
               make_xmm( 2 ),
               get_temp(
                  TGSI_EXEC_TEMP_128_I,
                  TGSI_EXEC_TEMP_128_C ) );
            /* XMM[2] = max(XMM[2], -128.0) */
            sse_maxps(
               func,
               make_xmm( 2 ),
               get_temp(
                  TGSI_EXEC_TEMP_MINUS_128_I,
                  TGSI_EXEC_TEMP_MINUS_128_C ) );
            emit_pow( func, 3, 1, 1, 2 );
            FETCH( func, *inst, 0, 0, CHAN_X );
            sse_xorps(
               func,
               make_xmm( 2 ),
               make_xmm( 2 ) );
            sse_cmpps(
               func,
               make_xmm( 2 ),
               make_xmm( 0 ),
               cc_LessThan );
            sse_andps(
               func,
               make_xmm( 2 ),
               make_xmm( 1 ) );
            STORE( func, *inst, 2, 0, CHAN_Z );
         }
      }
      break;

   case TGSI_OPCODE_RCP:
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_rcp( func, 0, 0 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_RSQ:
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_abs( func, 0 );
      emit_rsqrt( func, 1, 0 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 1, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_EXP:
      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
         FETCH( func, *inst, 0, 0, CHAN_X );
         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
            emit_MOV( func, 1, 0 );
            emit_flr( func, 2, 1 );
            /* dst.x = ex2(floor(src.x)) */
            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
               emit_MOV( func, 2, 1 );
               emit_ex2( func, 3, 2 );
               STORE( func, *inst, 2, 0, CHAN_X );
            }
            /* dst.y = src.x - floor(src.x) */
            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
               emit_MOV( func, 2, 0 );
               emit_sub( func, 2, 1 );
               STORE( func, *inst, 2, 0, CHAN_Y );
            }
         }
         /* dst.z = ex2(src.x) */
         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
            emit_ex2( func, 3, 0 );
            STORE( func, *inst, 0, 0, CHAN_Z );
         }
      }
      /* dst.w = 1.0 */
      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
         STORE( func, *inst, 0, 0, CHAN_W );
      }
      break;

   case TGSI_OPCODE_LOG:
      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
         FETCH( func, *inst, 0, 0, CHAN_X );
         emit_abs( func, 0 );
         emit_MOV( func, 1, 0 );
         emit_lg2( func, 2, 1 );
         /* dst.z = lg2(abs(src.x)) */
         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
            STORE( func, *inst, 1, 0, CHAN_Z );
         }
         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
            emit_flr( func, 2, 1 );
            /* dst.x = floor(lg2(abs(src.x))) */
            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
               STORE( func, *inst, 1, 0, CHAN_X );
            }
            /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
               emit_ex2( func, 2, 1 );
               emit_rcp( func, 1, 1 );
               emit_mul( func, 0, 1 );
               STORE( func, *inst, 0, 0, CHAN_Y );
            }
         }
      }
      /* dst.w = 1.0 */
      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
         STORE( func, *inst, 0, 0, CHAN_W );
      }
      break;

   case TGSI_OPCODE_MUL:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         emit_mul( func, 0, 1 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_ADD:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         emit_add( func, 0, 1 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_DP3:
      FETCH( func, *inst, 0, 0, CHAN_X );
      FETCH( func, *inst, 1, 1, CHAN_X );
      emit_mul( func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_Y );
      FETCH( func, *inst, 2, 1, CHAN_Y );
      emit_mul( func, 1, 2 );
      emit_add( func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_Z );
      FETCH( func, *inst, 2, 1, CHAN_Z );
      emit_mul( func, 1, 2 );
      emit_add( func, 0, 1 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_DP4:
      FETCH( func, *inst, 0, 0, CHAN_X );
      FETCH( func, *inst, 1, 1, CHAN_X );
      emit_mul( func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_Y );
      FETCH( func, *inst, 2, 1, CHAN_Y );
      emit_mul( func, 1, 2 );
      emit_add( func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_Z );
      FETCH( func, *inst, 2, 1, CHAN_Z );
      emit_mul(func, 1, 2 );
      emit_add(func, 0, 1 );
      FETCH( func, *inst, 1, 0, CHAN_W );
      FETCH( func, *inst, 2, 1, CHAN_W );
      emit_mul( func, 1, 2 );
      emit_add( func, 0, 1 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_DST:
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
         emit_tempf(
            func,
            0,
            TEMP_ONE_I,
            TEMP_ONE_C );
         STORE( func, *inst, 0, 0, CHAN_X );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
         FETCH( func, *inst, 0, 0, CHAN_Y );
         FETCH( func, *inst, 1, 1, CHAN_Y );
         emit_mul( func, 0, 1 );
         STORE( func, *inst, 0, 0, CHAN_Y );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
         FETCH( func, *inst, 0, 0, CHAN_Z );
         STORE( func, *inst, 0, 0, CHAN_Z );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
         FETCH( func, *inst, 0, 1, CHAN_W );
         STORE( func, *inst, 0, 0, CHAN_W );
      }
      break;

   case TGSI_OPCODE_MIN:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         sse_minps(
            func,
            make_xmm( 0 ),
            make_xmm( 1 ) );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_MAX:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         sse_maxps(
            func,
            make_xmm( 0 ),
            make_xmm( 1 ) );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_SLT:
      emit_setcc( func, inst, cc_LessThan );
      break;

   case TGSI_OPCODE_SGE:
      emit_setcc( func, inst, cc_NotLessThan );
      break;

   case TGSI_OPCODE_MAD:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         FETCH( func, *inst, 2, 2, chan_index );
         emit_mul( func, 0, 1 );
         emit_add( func, 0, 2 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_SUB:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         emit_sub( func, 0, 1 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_LRP:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         FETCH( func, *inst, 1, 1, chan_index );
         FETCH( func, *inst, 2, 2, chan_index );
         emit_sub( func, 1, 2 );
         emit_mul( func, 0, 1 );
         emit_add( func, 0, 2 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_CND:
      return 0;
      break;

   case TGSI_OPCODE_DP2A:
      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
      FETCH( func, *inst, 1, 2, CHAN_X );  /* xmm1 = src[2].x */
      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
      }
      break;

   case TGSI_OPCODE_FRC:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_frc( func, 0, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_CLAMP:
      return 0;
      break;

   case TGSI_OPCODE_FLR:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_flr( func, 0, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_ROUND:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_rnd( func, 0, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_EX2:
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_ex2( func, 0, 0 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_LG2:
      FETCH( func, *inst, 0, 0, CHAN_X );
      emit_lg2( func, 0, 0 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_POW:
      FETCH( func, *inst, 0, 0, CHAN_X );
      FETCH( func, *inst, 1, 1, CHAN_X );
      emit_pow( func, 0, 0, 0, 1 );
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_XPD:
      /* Note: we do all stores after all operands have been fetched
       * to avoid src/dst register aliasing issues for an instruction
       * such as:  XPD TEMP[2].xyz, TEMP[0], TEMP[2];
       */
      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
         FETCH( func, *inst, 1, 1, CHAN_Z );  /* xmm[1] = src[1].z */
         FETCH( func, *inst, 3, 0, CHAN_Z );  /* xmm[3] = src[0].z */
      }
      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
         FETCH( func, *inst, 0, 0, CHAN_Y );  /* xmm[0] = src[0].y */
         FETCH( func, *inst, 4, 1, CHAN_Y );  /* xmm[4] = src[1].y */
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
         emit_MOV( func, 7, 0 );  /* xmm[7] = xmm[0] */
         emit_mul( func, 7, 1 );  /* xmm[7] = xmm[7] * xmm[1] */
         emit_MOV( func, 5, 3 );  /* xmm[5] = xmm[3] */
         emit_mul( func, 5, 4 );  /* xmm[5] = xmm[5] * xmm[4] */
         emit_sub( func, 7, 5 );  /* xmm[7] = xmm[7] - xmm[5] */
2169 /* store xmm[7] in dst.x below */
2170 }
2171 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2172 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2173 FETCH( func, *inst, 2, 1, CHAN_X ); /* xmm[2] = src[1].x */
2174 FETCH( func, *inst, 5, 0, CHAN_X ); /* xmm[5] = src[0].x */
2175 }
2176 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2177 emit_mul( func, 3, 2 ); /* xmm[3] = xmm[3] * xmm[2] */
2178 emit_mul( func, 1, 5 ); /* xmm[1] = xmm[1] * xmm[5] */
2179 emit_sub( func, 3, 1 ); /* xmm[3] = xmm[3] - xmm[1] */
2180 /* store xmm[3] in dst.y below */
2181 }
2182 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2183 emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2184 emit_mul( func, 0, 2 ); /* xmm[0] = xmm[0] * xmm[2] */
2185 emit_sub( func, 5, 0 ); /* xmm[5] = xmm[5] - xmm[0] */
2186 STORE( func, *inst, 5, 0, CHAN_Z ); /* dst.z = xmm[5] */
2187 }
2188 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2189 STORE( func, *inst, 7, 0, CHAN_X ); /* dst.x = xmm[7] */
2190 }
2191 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2192 STORE( func, *inst, 3, 0, CHAN_Y ); /* dst.y = xmm[3] */
2193 }
2194 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2195 emit_tempf(
2196 func,
2197 0,
2198 TEMP_ONE_I,
2199 TEMP_ONE_C );
2200 STORE( func, *inst, 0, 0, CHAN_W );
2201 }
2202 break;
2203
2204 case TGSI_OPCODE_ABS:
2205 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2206 FETCH( func, *inst, 0, 0, chan_index );
2207 emit_abs( func, 0) ;
2208
2209 STORE( func, *inst, 0, 0, chan_index );
2210 }
2211 break;
2212
2213 case TGSI_OPCODE_RCC:
2214 return 0;
2215 break;
2216
2217 case TGSI_OPCODE_DPH:
2218 FETCH( func, *inst, 0, 0, CHAN_X );
2219 FETCH( func, *inst, 1, 1, CHAN_X );
2220 emit_mul( func, 0, 1 );
2221 FETCH( func, *inst, 1, 0, CHAN_Y );
2222 FETCH( func, *inst, 2, 1, CHAN_Y );
2223 emit_mul( func, 1, 2 );
2224 emit_add( func, 0, 1 );
2225 FETCH( func, *inst, 1, 0, CHAN_Z );
2226 FETCH( func, *inst, 2, 1, CHAN_Z );
2227 emit_mul( func, 1, 2 );
2228 emit_add( func, 0, 1 );
2229 FETCH( func, *inst, 1, 1, CHAN_W );
2230 emit_add( func, 0, 1 );
2231 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2232 STORE( func, *inst, 0, 0, chan_index );
2233 }
2234 break;
2235
2236 case TGSI_OPCODE_COS:
2237 FETCH( func, *inst, 0, 0, CHAN_X );
2238 emit_cos( func, 0, 0 );
2239 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2240 STORE( func, *inst, 0, 0, chan_index );
2241 }
2242 break;
2243
2244 case TGSI_OPCODE_DDX:
2245 return 0;
2246 break;
2247
2248 case TGSI_OPCODE_DDY:
2249 return 0;
2250 break;
2251
2252 case TGSI_OPCODE_KILP:
2253 /* predicated kill */
2254 emit_kilp( func );
2255 return 0; /* XXX fix me */
2256 break;
2257
2258 case TGSI_OPCODE_KIL:
2259 /* conditional kill */
2260 emit_kil( func, &inst->Src[0] );
2261 break;
2262
2263 case TGSI_OPCODE_PK2H:
2264 return 0;
2265 break;
2266
2267 case TGSI_OPCODE_PK2US:
2268 return 0;
2269 break;
2270
2271 case TGSI_OPCODE_PK4B:
2272 return 0;
2273 break;
2274
2275 case TGSI_OPCODE_PK4UB:
2276 return 0;
2277 break;
2278
2279 case TGSI_OPCODE_RFL:
2280 return 0;
2281 break;
2282
2283 case TGSI_OPCODE_SEQ:
2284 emit_setcc( func, inst, cc_Equal );
2285 break;
2286
2287 case TGSI_OPCODE_SFL:
2288 return 0;
2289 break;
2290
2291 case TGSI_OPCODE_SGT:
2292 emit_setcc( func, inst, cc_NotLessThanEqual );
2293 break;
2294
2295 case TGSI_OPCODE_SIN:
2296 FETCH( func, *inst, 0, 0, CHAN_X );
2297 emit_sin( func, 0, 0 );
2298 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2299 STORE( func, *inst, 0, 0, chan_index );
2300 }
2301 break;
2302
2303 case TGSI_OPCODE_SLE:
2304 emit_setcc( func, inst, cc_LessThanEqual );
2305 break;
2306
2307 case TGSI_OPCODE_SNE:
2308 emit_setcc( func, inst, cc_NotEqual );
2309 break;
2310
2311 case TGSI_OPCODE_STR:
2312 return 0;
2313 break;
2314
2315 case TGSI_OPCODE_TEX:
2316 emit_tex( func, inst, FALSE, FALSE );
2317 break;
2318
2319 case TGSI_OPCODE_TXD:
2320 return 0;
2321 break;
2322
2323 case TGSI_OPCODE_UP2H:
2324 return 0;
2325 break;
2326
2327 case TGSI_OPCODE_UP2US:
2328 return 0;
2329 break;
2330
2331 case TGSI_OPCODE_UP4B:
2332 return 0;
2333 break;
2334
2335 case TGSI_OPCODE_UP4UB:
2336 return 0;
2337 break;
2338
2339 case TGSI_OPCODE_X2D:
2340 return 0;
2341 break;
2342
2343 case TGSI_OPCODE_ARA:
2344 return 0;
2345 break;
2346
2347 case TGSI_OPCODE_ARR:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_rnd( func, 0, 0 );
         emit_f2it( func, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_BRA:
      return 0;
      break;

   case TGSI_OPCODE_CAL:
      return 0;
      break;

   case TGSI_OPCODE_RET:
      emit_ret( func );
      break;

   case TGSI_OPCODE_END:
      break;

   case TGSI_OPCODE_SSG:
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_sgn( func, 0, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_CMP:
      emit_cmp( func, inst );
      break;

   case TGSI_OPCODE_SCS:
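      /* dst = (cos(src.x), sin(src.x), 0.0, 1.0) */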
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
         FETCH( func, *inst, 0, 0, CHAN_X );
         emit_cos( func, 0, 0 );
         STORE( func, *inst, 0, 0, CHAN_X );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
         FETCH( func, *inst, 0, 0, CHAN_X );
         emit_sin( func, 0, 0 );
         STORE( func, *inst, 0, 0, CHAN_Y );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
         emit_tempf(
            func,
            0,
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C );
         STORE( func, *inst, 0, 0, CHAN_Z );
      }
      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
         emit_tempf(
            func,
            0,
            TEMP_ONE_I,
            TEMP_ONE_C );
         STORE( func, *inst, 0, 0, CHAN_W );
      }
      break;

   case TGSI_OPCODE_TXB:
      emit_tex( func, inst, TRUE, FALSE );
      break;

   case TGSI_OPCODE_NRM:
      /* fall-through */
   case TGSI_OPCODE_NRM4:
      /* 3 or 4-component normalization */
      {
         uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;

         if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
             IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
             IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
             (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {

            /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */

            /* xmm4 = src.x */
            /* xmm0 = src.x * src.x */
            FETCH(func, *inst, 0, 0, CHAN_X);
            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
               emit_MOV(func, 4, 0);
            }
            emit_mul(func, 0, 0);

            /* xmm5 = src.y */
            /* xmm0 = xmm0 + src.y * src.y */
            FETCH(func, *inst, 1, 0, CHAN_Y);
            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
               emit_MOV(func, 5, 1);
            }
            emit_mul(func, 1, 1);
            emit_add(func, 0, 1);

            /* xmm6 = src.z */
            /* xmm0 = xmm0 + src.z * src.z */
            FETCH(func, *inst, 1, 0, CHAN_Z);
            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
               emit_MOV(func, 6, 1);
            }
            emit_mul(func, 1, 1);
            emit_add(func, 0, 1);

            if (dims == 4) {
               /* xmm7 = src.w */
               /* xmm0 = xmm0 + src.w * src.w */
               FETCH(func, *inst, 1, 0, CHAN_W);
               if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
                  emit_MOV(func, 7, 1);
               }
               emit_mul(func, 1, 1);
               emit_add(func, 0, 1);
            }

            /* xmm1 = 1 / sqrt(xmm0) */
            emit_rsqrt(func, 1, 0);

            /* dst.x = xmm1 * src.x */
            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
               emit_mul(func, 4, 1);
               STORE(func, *inst, 4, 0, CHAN_X);
            }

            /* dst.y = xmm1 * src.y */
            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
               emit_mul(func, 5, 1);
               STORE(func, *inst, 5, 0, CHAN_Y);
            }

            /* dst.z = xmm1 * src.z */
            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
               emit_mul(func, 6, 1);
               STORE(func, *inst, 6, 0, CHAN_Z);
            }

            /* dst.w = xmm1 * src.w */
            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4) {
               emit_mul(func, 7, 1);
               STORE(func, *inst, 7, 0, CHAN_W);
            }
         }

         /* dst0.w = 1.0 */
         if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
            emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
            STORE(func, *inst, 0, 0, CHAN_W);
         }
      }
      break;

   case TGSI_OPCODE_DIV:
      return 0;
      break;

   case TGSI_OPCODE_DP2:
      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
      }
      break;

   case TGSI_OPCODE_TXL:
      return 0;
      break;

   case TGSI_OPCODE_TXP:
      emit_tex( func, inst, FALSE, TRUE );
      break;

   case TGSI_OPCODE_BRK:
      return 0;
      break;

   case TGSI_OPCODE_IF:
      return 0;
      break;

   case TGSI_OPCODE_BGNFOR:
      return 0;
      break;

   case TGSI_OPCODE_REP:
      return 0;
      break;

   case TGSI_OPCODE_ELSE:
      return 0;
      break;

   case TGSI_OPCODE_ENDIF:
      return 0;
      break;

   case TGSI_OPCODE_ENDFOR:
      return 0;
      break;

   case TGSI_OPCODE_ENDREP:
      return 0;
      break;

   case TGSI_OPCODE_PUSHA:
      return 0;
      break;

   case TGSI_OPCODE_POPA:
      return 0;
      break;

   case TGSI_OPCODE_CEIL:
      return 0;
      break;

   case TGSI_OPCODE_I2F:
      return 0;
      break;

   case TGSI_OPCODE_NOT:
      return 0;
      break;

   case TGSI_OPCODE_TRUNC:
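      /* round toward zero via a truncating float->int->float round trip */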
      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
         FETCH( func, *inst, 0, 0, chan_index );
         emit_f2it( func, 0 );
         emit_i2f( func, 0 );
         STORE( func, *inst, 0, 0, chan_index );
      }
      break;

   case TGSI_OPCODE_SHL:
      return 0;
      break;

   case TGSI_OPCODE_ISHR:
      return 0;
      break;

   case TGSI_OPCODE_AND:
      return 0;
      break;

   case TGSI_OPCODE_OR:
      return 0;
      break;

   case TGSI_OPCODE_MOD:
      return 0;
      break;

   case TGSI_OPCODE_XOR:
      return 0;
      break;

   case TGSI_OPCODE_SAD:
      return 0;
      break;

   case TGSI_OPCODE_TXF:
      return 0;
      break;

   case TGSI_OPCODE_TXQ:
      return 0;
      break;

   case TGSI_OPCODE_CONT:
      return 0;
      break;

   case TGSI_OPCODE_EMIT:
      return 0;
      break;

   case TGSI_OPCODE_ENDPRIM:
      return 0;
      break;

   default:
      return 0;
   }

   return 1;
}

static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ||
       decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->Range.First;
      last = decl->Range.Last;
      mask = decl->Declaration.UsageMask;

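      /* Interpolate each enabled channel of each input register:
       * constant inputs just copy a0, linear inputs evaluate
       * a0 + x*dadx + y*dady, and perspective inputs divide that
       * result by w.
       */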
      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}

static void aos_to_soa( struct x86_function *func,
                        uint arg_aos,
                        uint arg_machine,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;

   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
   x86_lea( func, soa_input,
            x86_make_disp( soa_input,
                           Offset(struct tgsi_exec_machine, Inputs) ) );
   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
   x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
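      /* Transpose four consecutive XYZW vertices into X, Y, Z and W
       * vectors: movlps/movhps gather two vertices per xmm register,
       * then shufps 0x88 keeps the even lanes (x or z) and 0xdd the
       * odd lanes (y or w).
       */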
      x86_push( func, aos_input );
      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_pop( func, aos_input );

      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );

      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

      /* Advance to next input */
      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
   }
   /* while --num_inputs */
   x86_dec( func, num_inputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}

static void soa_to_aos( struct x86_function *func,
                        uint arg_aos,
                        uint arg_machine,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;

   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
   x86_lea( func, soa_output,
            x86_make_disp( soa_output,
                           Offset(struct tgsi_exec_machine, Outputs) ) );
   x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
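      /* Transpose back: unpcklps/unpckhps re-interleave the X/Y and
       * Z/W vectors, then movlps/movhps scatter one reassembled XYZW
       * vertex per stride step.
       */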
      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );

      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );

      x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
      x86_push( func, aos_output );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_pop( func, aos_output );

      /* Advance to next output */
      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
   }
   /* while --num_outputs */
   x86_dec( func, num_outputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}

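/* A minimal calling sketch (illustrative only: x86_init_func(),
 * x86_get_func() and x86_release_func() come from rtasm_x86sse.h, but
 * codegen_function is a hypothetical typedef for the generated code's
 * signature -- machine, constants and immediates, plus interpolation
 * coefficients for fragment shaders):
 *
 *    struct x86_function func;
 *    float imms[TGSI_EXEC_NUM_IMMEDIATES][4];
 *
 *    x86_init_func( &func );
 *    if (tgsi_emit_sse2( tokens, &func, imms, TRUE )) {
 *       codegen_function run = (codegen_function) x86_get_func( &func );
 *       run( machine, constants, imms, coefs );
 *    }
 *    x86_release_func( &func );
 */
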
/**
 * Translate a TGSI vertex/fragment shader to SSE2 code.
 * Slightly different things are done for vertex vs. fragment shaders.
 *
 * \param tokens  the TGSI input shader
 * \param func  the output SSE code/function
 * \param immediates  buffer to place immediates, later passed to SSE func
 * \param do_swizzles  for vertex shaders, convert inputs/outputs between
 *                     AoS and SoA layouts around the shader body
 * \return 1 for success, 0 if translation failed
 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   unsigned ok = 1;
   uint num_immediates = 0;

   util_init_math();

   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
   x86_push( func, x86_make_reg( file_REG32, reg_DI ) );

   /*
    * Different function args for vertex/fragment shaders:
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      if (do_swizzles)
         aos_to_soa( func,
                     4,   /* aos_input */
                     1,   /* machine */
                     5,   /* num_inputs */
                     6 ); /* input_stride */
   }

   x86_mov(
      func,
      get_machine_base(),
      x86_fn_arg( func, 1 ) );
   x86_mov(
      func,
      get_const_base(),
      x86_fn_arg( func, 2 ) );
   x86_mov(
      func,
      get_immediate_base(),
      x86_fn_arg( func, 3 ) );

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      x86_mov(
         func,
         get_coef_base(),
         x86_fn_arg( func, 4 ) );
   }

   x86_mov(
      func,
      get_sampler_base(),
      x86_make_disp( get_machine_base(),
                     Offset( struct tgsi_exec_machine, Samplers ) ) );

   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
            debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
                         opcode,
                         tgsi_get_opcode_name(opcode),
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }

         if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) {
            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;

            /* XXX: we only handle src/dst aliasing in a few opcodes
             * currently.  Need to use an additional temporary to hold
             * the result in the cases where the code is too opaque to
             * fix.
             */
            if (opcode != TGSI_OPCODE_MOV) {
               debug_printf("Warning: src/dst aliasing in instruction"
                            " is not handled:\n");
               tgsi_dump_instruction(&parse.FullToken.FullInstruction, 1);
            }
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
                  parse.FullToken.FullImmediate.u[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                         num_immediates,
                         immediates[num_immediates][0],
                         immediates[num_immediates][1],
                         immediates[num_immediates][2],
                         immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      case TGSI_TOKEN_TYPE_PROPERTY:
         /* we just ignore them for now */
         break;

      default:
         ok = 0;
         assert( 0 );
      }
   }

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      if (do_swizzles)
         soa_to_aos( func,
                     7,   /* aos_output */
                     1,   /* machine */
                     8,   /* num_outputs */
                     9 ); /* output_stride */
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}

#endif /* PIPE_ARCH_X86 */