1 /**************************************************************************
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
28 #include "pipe/p_config.h"
30 #if defined(PIPE_ARCH_X86)
32 #include "util/u_debug.h"
33 #include "pipe/p_shader_tokens.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 #if defined(PIPE_ARCH_SSE)
37 #include "util/u_sse.h"
39 #include "tgsi/tgsi_parse.h"
40 #include "tgsi/tgsi_util.h"
41 #include "tgsi_exec.h"
42 #include "tgsi_sse2.h"
44 #include "rtasm/rtasm_x86sse.h"
48 * This costs about 100fps (close to 10%) in gears:
50 #define HIGH_PRECISION 1
55 #define FOR_EACH_CHANNEL( CHAN )\
56 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
58 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
59 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
61 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
62 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
64 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
65 FOR_EACH_CHANNEL( CHAN )\
66 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
73 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
74 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
76 #define TEMP_R0 TGSI_EXEC_TEMP_R0
77 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
78 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
79 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
83 * X86 utility functions.
92 (enum x86_reg_name
) xmm
);
96 * X86 register mapping helpers.
100 get_const_base( void )
107 static struct x86_reg
108 get_machine_base( void )
115 static struct x86_reg
116 get_input_base( void )
118 return x86_make_disp(
120 Offset(struct tgsi_exec_machine
, Inputs
) );
123 static struct x86_reg
124 get_output_base( void )
126 return x86_make_disp(
128 Offset(struct tgsi_exec_machine
, Outputs
) );
131 static struct x86_reg
132 get_temp_base( void )
134 return x86_make_disp(
136 Offset(struct tgsi_exec_machine
, Temps
) );
139 static struct x86_reg
140 get_coef_base( void )
147 static struct x86_reg
148 get_immediate_base( void )
157 * Data access helpers.
161 static struct x86_reg
166 return x86_make_disp(
167 get_immediate_base(),
168 (vec
* 4 + chan
) * 4 );
171 static struct x86_reg
176 return x86_make_disp(
178 (vec
* 4 + chan
) * 4 );
181 static struct x86_reg
186 return x86_make_disp(
188 (vec
* 4 + chan
) * 16 );
191 static struct x86_reg
196 return x86_make_disp(
198 (vec
* 4 + chan
) * 16 );
201 static struct x86_reg
206 return x86_make_disp(
208 (vec
* 4 + chan
) * 16 );
211 static struct x86_reg
217 return x86_make_disp(
219 ((vec
* 3 + member
) * 4 + chan
) * 4 );
225 struct x86_function
*func
)
232 * Data fetch helpers.
236 * Copy a shader constant to xmm register
237 * \param xmm the destination xmm register
238 * \param vec the src const buffer index
239 * \param chan src channel to fetch (X, Y, Z or W)
243 struct x86_function
*func
,
252 /* 'vec' is the offset from the address register's value.
253 * We're loading CONST[ADDR+vec] into an xmm register.
255 struct x86_reg r0
= get_input_base();
256 struct x86_reg r1
= get_output_base();
259 assert( indirectFile
== TGSI_FILE_ADDRESS
);
260 assert( indirectIndex
== 0 );
262 x86_push( func
, r0
);
263 x86_push( func
, r1
);
266 * Loop over the four pixels or vertices in the quad.
267 * Get the value of the address (offset) register for pixel/vertex[i],
268 * add it to the src offset and index into the constant buffer.
269 * Note that we're working on SOA data.
270 * If any of the pixel/vertex execution channels are unused their
271 * values will be garbage. It's very important that we don't use
272 * those garbage values as indexes into the constant buffer since
273 * that'll cause segfaults.
274 * The solution is to bitwise-AND the offset with the execution mask
275 * register whose values are either 0 or ~0.
276 * The caller must setup the execution mask register to indicate
277 * which channels are valid/alive before running the shader.
278 * The execution mask will also figure into loops and conditionals
281 for (i
= 0; i
< QUAD_SIZE
; i
++) {
282 /* r1 = address register[i] */
283 x86_mov( func
, r1
, x86_make_disp( get_temp( TEMP_ADDR
, CHAN_X
), i
* 4 ) );
284 /* r0 = execution mask[i] */
285 x86_mov( func
, r0
, x86_make_disp( get_temp( TEMP_EXEC_MASK_I
, TEMP_EXEC_MASK_C
), i
* 4 ) );
287 x86_and( func
, r1
, r0
);
288 /* r0 = 'vec', the offset */
289 x86_lea( func
, r0
, get_const( vec
, chan
) );
291 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
293 x86_add( func
, r1
, r1
);
294 x86_add( func
, r1
, r1
);
295 x86_add( func
, r1
, r1
);
296 x86_add( func
, r1
, r1
);
298 x86_add( func
, r0
, r1
); /* r0 = r0 + r1 */
299 x86_mov( func
, r1
, x86_deref( r0
) );
300 x86_mov( func
, x86_make_disp( get_temp( TEMP_R0
, CHAN_X
), i
* 4 ), r1
);
309 get_temp( TEMP_R0
, CHAN_X
) );
312 /* 'vec' is the index into the src register file, such as TEMP[vec] */
318 get_const( vec
, chan
) );
323 SHUF( 0, 0, 0, 0 ) );
329 struct x86_function
*func
,
337 get_immediate( vec
, chan
) );
342 SHUF( 0, 0, 0, 0 ) );
347 * Copy a shader input to xmm register
348 * \param xmm the destination xmm register
349 * \param vec the src input attrib
350 * \param chan src channel to fetch (X, Y, Z or W)
354 struct x86_function
*func
,
362 get_input( vec
, chan
) );
366 * Store an xmm register to a shader output
367 * \param xmm the source xmm register
368 * \param vec the dest output attrib
369 * \param chan src dest channel to store (X, Y, Z or W)
373 struct x86_function
*func
,
380 get_output( vec
, chan
),
385 * Copy a shader temporary to xmm register
386 * \param xmm the destination xmm register
387 * \param vec the src temp register
388 * \param chan src channel to fetch (X, Y, Z or W)
392 struct x86_function
*func
,
400 get_temp( vec
, chan
) );
404 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
405 * \param xmm the destination xmm register
406 * \param vec the src input/attribute coefficient index
407 * \param chan src channel to fetch (X, Y, Z or W)
408 * \param member 0=a0, 1=dadx, 2=dady
412 struct x86_function
*func
,
421 get_coef( vec
, chan
, member
) );
426 SHUF( 0, 0, 0, 0 ) );
430 * Data store helpers.
435 struct x86_function
*func
,
442 get_input( vec
, chan
),
448 struct x86_function
*func
,
455 get_temp( vec
, chan
),
461 struct x86_function
*func
,
471 vec
+ TGSI_EXEC_TEMP_ADDR
,
476 * Coefficent fetch helpers.
481 struct x86_function
*func
,
496 struct x86_function
*func
,
511 struct x86_function
*func
,
525 * Function call helpers.
529 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
530 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
531 * that the stack pointer is 16 byte aligned, as expected.
535 struct x86_function
*func
,
538 void (PIPE_CDECL
*code
)() )
540 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
544 /* Bitmask of the xmm registers to save */
545 xmm_mask
= (1 << xmm_save
) - 1;
546 xmm_mask
&= ~(1 << xmm_dst
);
550 x86_make_reg( file_REG32
, reg_AX
) );
553 x86_make_reg( file_REG32
, reg_CX
) );
556 x86_make_reg( file_REG32
, reg_DX
) );
558 /* Store XMM regs to the stack
560 for(i
= 0, n
= 0; i
< 8; ++i
)
561 if(xmm_mask
& (1 << i
))
566 x86_make_reg( file_REG32
, reg_SP
),
569 for(i
= 0, n
= 0; i
< 8; ++i
)
570 if(xmm_mask
& (1 << i
)) {
573 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ),
578 /* Load the address of the buffer we use for passing arguments and
584 get_temp( TEMP_R0
, 0 ) );
586 /* Push actual function arguments (currently just the pointer to
587 * the buffer above), and call the function:
589 x86_push( func
, ecx
);
590 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
591 x86_call( func
, ecx
);
595 /* Pop the saved XMM regs:
597 for(i
= 0, n
= 0; i
< 8; ++i
)
598 if(xmm_mask
& (1 << i
)) {
602 x86_make_disp( x86_make_reg( file_REG32
, reg_SP
), n
*16 ) );
608 x86_make_reg( file_REG32
, reg_SP
),
611 /* Restore GP registers in a reverse order.
615 x86_make_reg( file_REG32
, reg_DX
) );
618 x86_make_reg( file_REG32
, reg_CX
) );
621 x86_make_reg( file_REG32
, reg_AX
) );
626 emit_func_call_dst_src1(
627 struct x86_function
*func
,
631 void (PIPE_CDECL
*code
)() )
633 /* Store our input parameters (in xmm regs) to the buffer we use
634 * for passing arguments. We will pass a pointer to this buffer as
635 * the actual function argument.
639 get_temp( TEMP_R0
, 0 ),
640 make_xmm( xmm_src0
) );
651 get_temp( TEMP_R0
, 0 ) );
656 emit_func_call_dst_src2(
657 struct x86_function
*func
,
662 void (PIPE_CDECL
*code
)() )
664 /* Store two inputs to parameter buffer.
668 get_temp( TEMP_R0
, 0 ),
669 make_xmm( xmm_src0
) );
673 get_temp( TEMP_R0
, 1 ),
674 make_xmm( xmm_src1
) );
685 /* Retrieve the results:
690 get_temp( TEMP_R0
, 0 ) );
697 #if defined(PIPE_ARCH_SSE)
700 * Fast SSE2 implementation of special math functions.
703 #define POLY0(x, c0) _mm_set1_ps(c0)
704 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
705 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
706 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
707 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
708 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
710 #define EXP_POLY_DEGREE 3
711 #define LOG_POLY_DEGREE 5
714 * See http://www.devmaster.net/forums/showthread.php?p=43580
720 __m128 fpart
, expipart
, expfpart
;
722 x
= _mm_min_ps(x
, _mm_set1_ps( 129.00000f
));
723 x
= _mm_max_ps(x
, _mm_set1_ps(-126.99999f
));
725 /* ipart = int(x - 0.5) */
726 ipart
= _mm_cvtps_epi32(_mm_sub_ps(x
, _mm_set1_ps(0.5f
)));
728 /* fpart = x - ipart */
729 fpart
= _mm_sub_ps(x
, _mm_cvtepi32_ps(ipart
));
731 /* expipart = (float) (1 << ipart) */
732 expipart
= _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart
, _mm_set1_epi32(127)), 23));
734 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
735 #if EXP_POLY_DEGREE == 5
736 expfpart
= POLY5(fpart
, 9.9999994e-1f
, 6.9315308e-1f
, 2.4015361e-1f
, 5.5826318e-2f
, 8.9893397e-3f
, 1.8775767e-3f
);
737 #elif EXP_POLY_DEGREE == 4
738 expfpart
= POLY4(fpart
, 1.0000026f
, 6.9300383e-1f
, 2.4144275e-1f
, 5.2011464e-2f
, 1.3534167e-2f
);
739 #elif EXP_POLY_DEGREE == 3
740 expfpart
= POLY3(fpart
, 9.9992520e-1f
, 6.9583356e-1f
, 2.2606716e-1f
, 7.8024521e-2f
);
741 #elif EXP_POLY_DEGREE == 2
742 expfpart
= POLY2(fpart
, 1.0017247f
, 6.5763628e-1f
, 3.3718944e-1f
);
747 return _mm_mul_ps(expipart
, expfpart
);
752 * See http://www.devmaster.net/forums/showthread.php?p=43580
757 __m128i expmask
= _mm_set1_epi32(0x7f800000);
758 __m128i mantmask
= _mm_set1_epi32(0x007fffff);
759 __m128 one
= _mm_set1_ps(1.0f
);
761 __m128i i
= _mm_castps_si128(x
);
763 /* exp = (float) exponent(x) */
764 __m128 exp
= _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i
, expmask
), 23), _mm_set1_epi32(127)));
766 /* mant = (float) mantissa(x) */
767 __m128 mant
= _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i
, mantmask
)), one
);
771 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
772 * These coefficients can be generate with
773 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
775 #if LOG_POLY_DEGREE == 6
776 logmant
= POLY5(mant
, 3.11578814719469302614f
, -3.32419399085241980044f
, 2.59883907202499966007f
, -1.23152682416275988241f
, 0.318212422185251071475f
, -0.0344359067839062357313f
);
777 #elif LOG_POLY_DEGREE == 5
778 logmant
= POLY4(mant
, 2.8882704548164776201f
, -2.52074962577807006663f
, 1.48116647521213171641f
, -0.465725644288844778798f
, 0.0596515482674574969533f
);
779 #elif LOG_POLY_DEGREE == 4
780 logmant
= POLY3(mant
, 2.61761038894603480148f
, -1.75647175389045657003f
, 0.688243882994381274313f
, -0.107254423828329604454f
);
781 #elif LOG_POLY_DEGREE == 3
782 logmant
= POLY2(mant
, 2.28330284476918490682f
, -1.04913055217340124191f
, 0.204446009836232697516f
);
787 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
788 logmant
= _mm_mul_ps(logmant
, _mm_sub_ps(mant
, one
));
790 return _mm_add_ps(logmant
, exp
);
795 powf4(__m128 x
, __m128 y
)
797 return exp2f4(_mm_mul_ps(log2f4(x
), y
));
800 #endif /* PIPE_ARCH_SSE */
805 * Low-level instruction translators.
810 struct x86_function
*func
,
817 TGSI_EXEC_TEMP_7FFFFFFF_I
,
818 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
823 struct x86_function
*func
,
830 make_xmm( xmm_src
) );
833 static void PIPE_CDECL
837 store
[0] = cosf( store
[0] );
838 store
[1] = cosf( store
[1] );
839 store
[2] = cosf( store
[2] );
840 store
[3] = cosf( store
[3] );
845 struct x86_function
*func
,
849 emit_func_call_dst_src1(
857 static void PIPE_CDECL
858 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
859 __attribute__((force_align_arg_pointer
))
864 #if defined(PIPE_ARCH_SSE)
865 _mm_store_ps(&store
[0], exp2f4( _mm_load_ps(&store
[0]) ));
867 store
[0] = util_fast_exp2( store
[0] );
868 store
[1] = util_fast_exp2( store
[1] );
869 store
[2] = util_fast_exp2( store
[2] );
870 store
[3] = util_fast_exp2( store
[3] );
876 struct x86_function
*func
,
880 emit_func_call_dst_src1(
890 struct x86_function
*func
,
901 struct x86_function
*func
,
910 static void PIPE_CDECL
914 store
[0] = floorf( store
[0] );
915 store
[1] = floorf( store
[1] );
916 store
[2] = floorf( store
[2] );
917 store
[3] = floorf( store
[3] );
922 struct x86_function
*func
,
926 emit_func_call_dst_src1(
934 static void PIPE_CDECL
938 store
[0] -= floorf( store
[0] );
939 store
[1] -= floorf( store
[1] );
940 store
[2] -= floorf( store
[2] );
941 store
[3] -= floorf( store
[3] );
946 struct x86_function
*func
,
950 emit_func_call_dst_src1(
958 static void PIPE_CDECL
959 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
960 __attribute__((force_align_arg_pointer
))
965 #if defined(PIPE_ARCH_SSE)
966 _mm_store_ps(&store
[0], log2f4( _mm_load_ps(&store
[0]) ));
968 store
[0] = util_fast_log2( store
[0] );
969 store
[1] = util_fast_log2( store
[1] );
970 store
[2] = util_fast_log2( store
[2] );
971 store
[3] = util_fast_log2( store
[3] );
977 struct x86_function
*func
,
981 emit_func_call_dst_src1(
991 struct x86_function
*func
,
998 make_xmm( xmm_src
) );
1002 emit_mul (struct x86_function
*func
,
1008 make_xmm( xmm_dst
),
1009 make_xmm( xmm_src
) );
1014 struct x86_function
*func
,
1021 TGSI_EXEC_TEMP_80000000_I
,
1022 TGSI_EXEC_TEMP_80000000_C
) );
1025 static void PIPE_CDECL
1026 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1027 __attribute__((force_align_arg_pointer
))
1032 #if defined(PIPE_ARCH_SSE)
1033 _mm_store_ps(&store
[0], powf4( _mm_load_ps(&store
[0]), _mm_load_ps(&store
[4]) ));
1035 store
[0] = util_fast_pow( store
[0], store
[4] );
1036 store
[1] = util_fast_pow( store
[1], store
[5] );
1037 store
[2] = util_fast_pow( store
[2], store
[6] );
1038 store
[3] = util_fast_pow( store
[3], store
[7] );
1044 struct x86_function
*func
,
1050 emit_func_call_dst_src2(
1061 struct x86_function
*func
,
1065 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1066 * good enough. Need to either emit a proper divide or use the
1067 * iterative technique described below in emit_rsqrt().
1071 make_xmm( xmm_dst
),
1072 make_xmm( xmm_src
) );
1075 static void PIPE_CDECL
1079 store
[0] = floorf( store
[0] + 0.5f
);
1080 store
[1] = floorf( store
[1] + 0.5f
);
1081 store
[2] = floorf( store
[2] + 0.5f
);
1082 store
[3] = floorf( store
[3] + 0.5f
);
1087 struct x86_function
*func
,
1091 emit_func_call_dst_src1(
1101 struct x86_function
*func
,
1106 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
1107 * implementations, it is possible to improve its precision at
1108 * fairly low cost, using a newton/raphson step, as below:
1110 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1111 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1113 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1116 struct x86_reg dst
= make_xmm( xmm_dst
);
1117 struct x86_reg src
= make_xmm( xmm_src
);
1118 struct x86_reg tmp0
= make_xmm( 2 );
1119 struct x86_reg tmp1
= make_xmm( 3 );
1121 assert( xmm_dst
!= xmm_src
);
1122 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
1123 assert( xmm_src
!= 2 && xmm_src
!= 3 );
1125 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
1126 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
1127 sse_rsqrtps( func
, tmp1
, src
);
1128 sse_mulps( func
, src
, tmp1
);
1129 sse_mulps( func
, dst
, tmp1
);
1130 sse_mulps( func
, src
, tmp1
);
1131 sse_subps( func
, tmp0
, src
);
1132 sse_mulps( func
, dst
, tmp0
);
1135 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1140 make_xmm( xmm_dst
),
1141 make_xmm( xmm_src
) );
1147 struct x86_function
*func
,
1154 TGSI_EXEC_TEMP_80000000_I
,
1155 TGSI_EXEC_TEMP_80000000_C
) );
1158 static void PIPE_CDECL
1162 store
[0] = store
[0] < 0.0f
? -1.0f
: store
[0] > 0.0f
? 1.0f
: 0.0f
;
1163 store
[1] = store
[1] < 0.0f
? -1.0f
: store
[1] > 0.0f
? 1.0f
: 0.0f
;
1164 store
[2] = store
[2] < 0.0f
? -1.0f
: store
[2] > 0.0f
? 1.0f
: 0.0f
;
1165 store
[3] = store
[3] < 0.0f
? -1.0f
: store
[3] > 0.0f
? 1.0f
: 0.0f
;
1170 struct x86_function
*func
,
1174 emit_func_call_dst_src1(
1182 static void PIPE_CDECL
1186 store
[0] = sinf( store
[0] );
1187 store
[1] = sinf( store
[1] );
1188 store
[2] = sinf( store
[2] );
1189 store
[3] = sinf( store
[3] );
1193 emit_sin (struct x86_function
*func
,
1197 emit_func_call_dst_src1(
1207 struct x86_function
*func
,
1213 make_xmm( xmm_dst
),
1214 make_xmm( xmm_src
) );
1223 struct x86_function
*func
,
1225 const struct tgsi_full_src_register
*reg
,
1226 const unsigned chan_index
)
1228 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
1231 case TGSI_EXTSWIZZLE_X
:
1232 case TGSI_EXTSWIZZLE_Y
:
1233 case TGSI_EXTSWIZZLE_Z
:
1234 case TGSI_EXTSWIZZLE_W
:
1235 switch (reg
->SrcRegister
.File
) {
1236 case TGSI_FILE_CONSTANT
:
1240 reg
->SrcRegister
.Index
,
1242 reg
->SrcRegister
.Indirect
,
1243 reg
->SrcRegisterInd
.File
,
1244 reg
->SrcRegisterInd
.Index
);
1247 case TGSI_FILE_IMMEDIATE
:
1251 reg
->SrcRegister
.Index
,
1255 case TGSI_FILE_INPUT
:
1259 reg
->SrcRegister
.Index
,
1263 case TGSI_FILE_TEMPORARY
:
1267 reg
->SrcRegister
.Index
,
1276 case TGSI_EXTSWIZZLE_ZERO
:
1280 TGSI_EXEC_TEMP_00000000_I
,
1281 TGSI_EXEC_TEMP_00000000_C
);
1284 case TGSI_EXTSWIZZLE_ONE
:
1296 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
1297 case TGSI_UTIL_SIGN_CLEAR
:
1298 emit_abs( func
, xmm
);
1301 case TGSI_UTIL_SIGN_SET
:
1302 emit_setsign( func
, xmm
);
1305 case TGSI_UTIL_SIGN_TOGGLE
:
1306 emit_neg( func
, xmm
);
1309 case TGSI_UTIL_SIGN_KEEP
:
1314 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1315 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1323 struct x86_function
*func
,
1325 const struct tgsi_full_dst_register
*reg
,
1326 const struct tgsi_full_instruction
*inst
,
1327 unsigned chan_index
)
1329 switch( reg
->DstRegister
.File
) {
1330 case TGSI_FILE_OUTPUT
:
1334 reg
->DstRegister
.Index
,
1338 case TGSI_FILE_TEMPORARY
:
1342 reg
->DstRegister
.Index
,
1346 case TGSI_FILE_ADDRESS
:
1350 reg
->DstRegister
.Index
,
1358 switch( inst
->Instruction
.Saturate
) {
1362 case TGSI_SAT_ZERO_ONE
:
1366 case TGSI_SAT_MINUS_PLUS_ONE
:
1372 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1373 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1376 * High-level instruction translators.
1381 struct x86_function
*func
,
1382 const struct tgsi_full_src_register
*reg
)
1384 unsigned uniquemask
;
1385 unsigned registers
[4];
1386 unsigned nextregister
= 0;
1387 unsigned firstchan
= ~0;
1388 unsigned chan_index
;
1390 /* This mask stores component bits that were already tested. Note that
1391 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1393 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
1395 FOR_EACH_CHANNEL( chan_index
) {
1398 /* unswizzle channel */
1399 swizzle
= tgsi_util_get_full_src_register_extswizzle(
1403 /* check if the component has not been already tested */
1404 if( !(uniquemask
& (1 << swizzle
)) ) {
1405 uniquemask
|= 1 << swizzle
;
1407 /* allocate register */
1408 registers
[chan_index
] = nextregister
;
1416 /* mark the first channel used */
1417 if( firstchan
== ~0 ) {
1418 firstchan
= chan_index
;
1425 x86_make_reg( file_REG32
, reg_AX
) );
1428 x86_make_reg( file_REG32
, reg_DX
) );
1430 FOR_EACH_CHANNEL( chan_index
) {
1431 if( uniquemask
& (1 << chan_index
) ) {
1434 make_xmm( registers
[chan_index
] ),
1436 TGSI_EXEC_TEMP_00000000_I
,
1437 TGSI_EXEC_TEMP_00000000_C
),
1440 if( chan_index
== firstchan
) {
1443 x86_make_reg( file_REG32
, reg_AX
),
1444 make_xmm( registers
[chan_index
] ) );
1449 x86_make_reg( file_REG32
, reg_DX
),
1450 make_xmm( registers
[chan_index
] ) );
1453 x86_make_reg( file_REG32
, reg_AX
),
1454 x86_make_reg( file_REG32
, reg_DX
) );
1462 TGSI_EXEC_TEMP_KILMASK_I
,
1463 TGSI_EXEC_TEMP_KILMASK_C
),
1464 x86_make_reg( file_REG32
, reg_AX
) );
1468 x86_make_reg( file_REG32
, reg_DX
) );
1471 x86_make_reg( file_REG32
, reg_AX
) );
1477 struct x86_function
*func
)
1479 /* XXX todo / fix me */
1485 struct x86_function
*func
,
1486 struct tgsi_full_instruction
*inst
,
1489 unsigned chan_index
;
1491 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1492 FETCH( func
, *inst
, 0, 0, chan_index
);
1493 FETCH( func
, *inst
, 1, 1, chan_index
);
1505 STORE( func
, *inst
, 0, 0, chan_index
);
1511 struct x86_function
*func
,
1512 struct tgsi_full_instruction
*inst
)
1514 unsigned chan_index
;
1516 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1517 FETCH( func
, *inst
, 0, 0, chan_index
);
1518 FETCH( func
, *inst
, 1, 1, chan_index
);
1519 FETCH( func
, *inst
, 2, 2, chan_index
);
1524 TGSI_EXEC_TEMP_00000000_I
,
1525 TGSI_EXEC_TEMP_00000000_C
),
1539 STORE( func
, *inst
, 0, 0, chan_index
);
1545 * Check if inst src/dest regs use indirect addressing into temporary
1549 indirect_temp_reference(const struct tgsi_full_instruction
*inst
)
1552 for (i
= 0; i
< inst
->Instruction
.NumSrcRegs
; i
++) {
1553 const struct tgsi_full_src_register
*reg
= &inst
->FullSrcRegisters
[i
];
1554 if (reg
->SrcRegister
.File
== TGSI_FILE_TEMPORARY
&&
1555 reg
->SrcRegister
.Indirect
)
1558 for (i
= 0; i
< inst
->Instruction
.NumDstRegs
; i
++) {
1559 const struct tgsi_full_dst_register
*reg
= &inst
->FullDstRegisters
[i
];
1560 if (reg
->DstRegister
.File
== TGSI_FILE_TEMPORARY
&&
1561 reg
->DstRegister
.Indirect
)
1570 struct x86_function
*func
,
1571 struct tgsi_full_instruction
*inst
)
1573 unsigned chan_index
;
1575 /* we can't handle indirect addressing into temp register file yet */
1576 if (indirect_temp_reference(inst
))
1579 switch (inst
->Instruction
.Opcode
) {
1580 case TGSI_OPCODE_ARL
:
1581 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1582 FETCH( func
, *inst
, 0, 0, chan_index
);
1583 emit_flr(func
, 0, 0);
1584 emit_f2it( func
, 0 );
1585 STORE( func
, *inst
, 0, 0, chan_index
);
1589 case TGSI_OPCODE_MOV
:
1590 case TGSI_OPCODE_SWZ
:
1591 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1592 FETCH( func
, *inst
, 0, 0, chan_index
);
1593 STORE( func
, *inst
, 0, 0, chan_index
);
1597 case TGSI_OPCODE_LIT
:
1598 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1599 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1605 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1606 STORE( func
, *inst
, 0, 0, CHAN_X
);
1608 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1609 STORE( func
, *inst
, 0, 0, CHAN_W
);
1612 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1613 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1614 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1615 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1620 TGSI_EXEC_TEMP_00000000_I
,
1621 TGSI_EXEC_TEMP_00000000_C
) );
1622 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1624 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1625 /* XMM[1] = SrcReg[0].yyyy */
1626 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1627 /* XMM[1] = max(XMM[1], 0) */
1632 TGSI_EXEC_TEMP_00000000_I
,
1633 TGSI_EXEC_TEMP_00000000_C
) );
1634 /* XMM[2] = SrcReg[0].wwww */
1635 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1636 /* XMM[2] = min(XMM[2], 128.0) */
1641 TGSI_EXEC_TEMP_128_I
,
1642 TGSI_EXEC_TEMP_128_C
) );
1643 /* XMM[2] = max(XMM[2], -128.0) */
1648 TGSI_EXEC_TEMP_MINUS_128_I
,
1649 TGSI_EXEC_TEMP_MINUS_128_C
) );
1650 emit_pow( func
, 3, 1, 1, 2 );
1651 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1665 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1670 case TGSI_OPCODE_RCP
:
1671 /* TGSI_OPCODE_RECIP */
1672 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1673 emit_rcp( func
, 0, 0 );
1674 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1675 STORE( func
, *inst
, 0, 0, chan_index
);
1679 case TGSI_OPCODE_RSQ
:
1680 /* TGSI_OPCODE_RECIPSQRT */
1681 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1682 emit_abs( func
, 0 );
1683 emit_rsqrt( func
, 1, 0 );
1684 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1685 STORE( func
, *inst
, 1, 0, chan_index
);
1689 case TGSI_OPCODE_EXP
:
1690 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1691 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1692 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1693 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1694 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1695 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1696 emit_MOV( func
, 1, 0 );
1697 emit_flr( func
, 2, 1 );
1698 /* dst.x = ex2(floor(src.x)) */
1699 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1700 emit_MOV( func
, 2, 1 );
1701 emit_ex2( func
, 3, 2 );
1702 STORE( func
, *inst
, 2, 0, CHAN_X
);
1704 /* dst.y = src.x - floor(src.x) */
1705 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1706 emit_MOV( func
, 2, 0 );
1707 emit_sub( func
, 2, 1 );
1708 STORE( func
, *inst
, 2, 0, CHAN_Y
);
1711 /* dst.z = ex2(src.x) */
1712 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1713 emit_ex2( func
, 3, 0 );
1714 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1718 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1719 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1720 STORE( func
, *inst
, 0, 0, CHAN_W
);
1724 case TGSI_OPCODE_LOG
:
1725 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1726 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1727 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1728 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1729 emit_abs( func
, 0 );
1730 emit_MOV( func
, 1, 0 );
1731 emit_lg2( func
, 2, 1 );
1732 /* dst.z = lg2(abs(src.x)) */
1733 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
)) {
1734 STORE( func
, *inst
, 1, 0, CHAN_Z
);
1736 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1737 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1738 emit_flr( func
, 2, 1 );
1739 /* dst.x = floor(lg2(abs(src.x))) */
1740 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
)) {
1741 STORE( func
, *inst
, 1, 0, CHAN_X
);
1743 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1744 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
)) {
1745 emit_ex2( func
, 2, 1 );
1746 emit_rcp( func
, 1, 1 );
1747 emit_mul( func
, 0, 1 );
1748 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1753 if (IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
)) {
1754 emit_tempf( func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
1755 STORE( func
, *inst
, 0, 0, CHAN_W
);
1759 case TGSI_OPCODE_MUL
:
1760 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1761 FETCH( func
, *inst
, 0, 0, chan_index
);
1762 FETCH( func
, *inst
, 1, 1, chan_index
);
1763 emit_mul( func
, 0, 1 );
1764 STORE( func
, *inst
, 0, 0, chan_index
);
1768 case TGSI_OPCODE_ADD
:
1769 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1770 FETCH( func
, *inst
, 0, 0, chan_index
);
1771 FETCH( func
, *inst
, 1, 1, chan_index
);
1772 emit_add( func
, 0, 1 );
1773 STORE( func
, *inst
, 0, 0, chan_index
);
1777 case TGSI_OPCODE_DP3
:
1778 /* TGSI_OPCODE_DOT3 */
1779 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1780 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1781 emit_mul( func
, 0, 1 );
1782 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1783 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1784 emit_mul( func
, 1, 2 );
1785 emit_add( func
, 0, 1 );
1786 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1787 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1788 emit_mul( func
, 1, 2 );
1789 emit_add( func
, 0, 1 );
1790 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1791 STORE( func
, *inst
, 0, 0, chan_index
);
1795 case TGSI_OPCODE_DP4
:
1796 /* TGSI_OPCODE_DOT4 */
1797 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1798 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1799 emit_mul( func
, 0, 1 );
1800 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1801 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1802 emit_mul( func
, 1, 2 );
1803 emit_add( func
, 0, 1 );
1804 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1805 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1806 emit_mul(func
, 1, 2 );
1807 emit_add(func
, 0, 1 );
1808 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1809 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1810 emit_mul( func
, 1, 2 );
1811 emit_add( func
, 0, 1 );
1812 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1813 STORE( func
, *inst
, 0, 0, chan_index
);
1817 case TGSI_OPCODE_DST
:
1818 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1824 STORE( func
, *inst
, 0, 0, CHAN_X
);
1826 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1827 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1828 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
1829 emit_mul( func
, 0, 1 );
1830 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1832 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1833 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
1834 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1836 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1837 FETCH( func
, *inst
, 0, 1, CHAN_W
);
1838 STORE( func
, *inst
, 0, 0, CHAN_W
);
1842 case TGSI_OPCODE_MIN
:
1843 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1844 FETCH( func
, *inst
, 0, 0, chan_index
);
1845 FETCH( func
, *inst
, 1, 1, chan_index
);
1850 STORE( func
, *inst
, 0, 0, chan_index
);
1854 case TGSI_OPCODE_MAX
:
1855 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1856 FETCH( func
, *inst
, 0, 0, chan_index
);
1857 FETCH( func
, *inst
, 1, 1, chan_index
);
1862 STORE( func
, *inst
, 0, 0, chan_index
);
1866 case TGSI_OPCODE_SLT
:
1867 /* TGSI_OPCODE_SETLT */
1868 emit_setcc( func
, inst
, cc_LessThan
);
1871 case TGSI_OPCODE_SGE
:
1872 /* TGSI_OPCODE_SETGE */
1873 emit_setcc( func
, inst
, cc_NotLessThan
);
1876 case TGSI_OPCODE_MAD
:
1877 /* TGSI_OPCODE_MADD */
1878 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1879 FETCH( func
, *inst
, 0, 0, chan_index
);
1880 FETCH( func
, *inst
, 1, 1, chan_index
);
1881 FETCH( func
, *inst
, 2, 2, chan_index
);
1882 emit_mul( func
, 0, 1 );
1883 emit_add( func
, 0, 2 );
1884 STORE( func
, *inst
, 0, 0, chan_index
);
1888 case TGSI_OPCODE_SUB
:
1889 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1890 FETCH( func
, *inst
, 0, 0, chan_index
);
1891 FETCH( func
, *inst
, 1, 1, chan_index
);
1892 emit_sub( func
, 0, 1 );
1893 STORE( func
, *inst
, 0, 0, chan_index
);
1897 case TGSI_OPCODE_LERP
:
1898 /* TGSI_OPCODE_LRP */
1899 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1900 FETCH( func
, *inst
, 0, 0, chan_index
);
1901 FETCH( func
, *inst
, 1, 1, chan_index
);
1902 FETCH( func
, *inst
, 2, 2, chan_index
);
1903 emit_sub( func
, 1, 2 );
1904 emit_mul( func
, 0, 1 );
1905 emit_add( func
, 0, 2 );
1906 STORE( func
, *inst
, 0, 0, chan_index
);
1910 case TGSI_OPCODE_CND
:
1914 case TGSI_OPCODE_CND0
:
1918 case TGSI_OPCODE_DOT2ADD
:
1919 /* TGSI_OPCODE_DP2A */
1920 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
1921 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
1922 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
1923 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
1924 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
1925 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
1926 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1927 FETCH( func
, *inst
, 1, 2, CHAN_X
); /* xmm1 = src[2].x */
1928 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
1929 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1930 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
1934 case TGSI_OPCODE_INDEX
:
1938 case TGSI_OPCODE_NEGATE
:
1942 case TGSI_OPCODE_FRAC
:
1943 /* TGSI_OPCODE_FRC */
1944 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1945 FETCH( func
, *inst
, 0, 0, chan_index
);
1946 emit_frc( func
, 0, 0 );
1947 STORE( func
, *inst
, 0, 0, chan_index
);
1951 case TGSI_OPCODE_CLAMP
:
1955 case TGSI_OPCODE_FLOOR
:
1956 /* TGSI_OPCODE_FLR */
1957 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1958 FETCH( func
, *inst
, 0, 0, chan_index
);
1959 emit_flr( func
, 0, 0 );
1960 STORE( func
, *inst
, 0, 0, chan_index
);
1964 case TGSI_OPCODE_ROUND
:
1965 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1966 FETCH( func
, *inst
, 0, 0, chan_index
);
1967 emit_rnd( func
, 0, 0 );
1968 STORE( func
, *inst
, 0, 0, chan_index
);
1972 case TGSI_OPCODE_EXPBASE2
:
1973 /* TGSI_OPCODE_EX2 */
1974 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1975 emit_ex2( func
, 0, 0 );
1976 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1977 STORE( func
, *inst
, 0, 0, chan_index
);
1981 case TGSI_OPCODE_LOGBASE2
:
1982 /* TGSI_OPCODE_LG2 */
1983 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1984 emit_lg2( func
, 0, 0 );
1985 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1986 STORE( func
, *inst
, 0, 0, chan_index
);
1990 case TGSI_OPCODE_POWER
:
1991 /* TGSI_OPCODE_POW */
1992 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1993 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1994 emit_pow( func
, 0, 0, 0, 1 );
1995 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1996 STORE( func
, *inst
, 0, 0, chan_index
);
2000 case TGSI_OPCODE_CROSSPRODUCT
:
2001 /* TGSI_OPCODE_XPD */
2002 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
2003 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
2004 FETCH( func
, *inst
, 1, 1, CHAN_Z
);
2005 FETCH( func
, *inst
, 3, 0, CHAN_Z
);
2007 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
2008 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
2009 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
2010 FETCH( func
, *inst
, 4, 1, CHAN_Y
);
2012 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2013 emit_MOV( func
, 2, 0 );
2014 emit_mul( func
, 2, 1 );
2015 emit_MOV( func
, 5, 3 );
2016 emit_mul( func
, 5, 4 );
2017 emit_sub( func
, 2, 5 );
2018 STORE( func
, *inst
, 2, 0, CHAN_X
);
2020 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
2021 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
2022 FETCH( func
, *inst
, 2, 1, CHAN_X
);
2023 FETCH( func
, *inst
, 5, 0, CHAN_X
);
2025 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2026 emit_mul( func
, 3, 2 );
2027 emit_mul( func
, 1, 5 );
2028 emit_sub( func
, 3, 1 );
2029 STORE( func
, *inst
, 3, 0, CHAN_Y
);
2031 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2032 emit_mul( func
, 5, 4 );
2033 emit_mul( func
, 0, 2 );
2034 emit_sub( func
, 5, 0 );
2035 STORE( func
, *inst
, 5, 0, CHAN_Z
);
2037 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2043 STORE( func
, *inst
, 0, 0, CHAN_W
);
2047 case TGSI_OPCODE_MULTIPLYMATRIX
:
2051 case TGSI_OPCODE_ABS
:
2052 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2053 FETCH( func
, *inst
, 0, 0, chan_index
);
2054 emit_abs( func
, 0) ;
2056 STORE( func
, *inst
, 0, 0, chan_index
);
2060 case TGSI_OPCODE_RCC
:
2064 case TGSI_OPCODE_DPH
:
2065 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2066 FETCH( func
, *inst
, 1, 1, CHAN_X
);
2067 emit_mul( func
, 0, 1 );
2068 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
2069 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
2070 emit_mul( func
, 1, 2 );
2071 emit_add( func
, 0, 1 );
2072 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
2073 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
2074 emit_mul( func
, 1, 2 );
2075 emit_add( func
, 0, 1 );
2076 FETCH( func
, *inst
, 1, 1, CHAN_W
);
2077 emit_add( func
, 0, 1 );
2078 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2079 STORE( func
, *inst
, 0, 0, chan_index
);
2083 case TGSI_OPCODE_COS
:
2084 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2085 emit_cos( func
, 0, 0 );
2086 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2087 STORE( func
, *inst
, 0, 0, chan_index
);
2091 case TGSI_OPCODE_DDX
:
2095 case TGSI_OPCODE_DDY
:
2099 case TGSI_OPCODE_KILP
:
2100 /* predicated kill */
2102 return 0; /* XXX fix me */
2105 case TGSI_OPCODE_KIL
:
2106 /* conditional kill */
2107 emit_kil( func
, &inst
->FullSrcRegisters
[0] );
2110 case TGSI_OPCODE_PK2H
:
2114 case TGSI_OPCODE_PK2US
:
2118 case TGSI_OPCODE_PK4B
:
2122 case TGSI_OPCODE_PK4UB
:
2126 case TGSI_OPCODE_RFL
:
2130 case TGSI_OPCODE_SEQ
:
2134 case TGSI_OPCODE_SFL
:
2138 case TGSI_OPCODE_SGT
:
2142 case TGSI_OPCODE_SIN
:
2143 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2144 emit_sin( func
, 0, 0 );
2145 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2146 STORE( func
, *inst
, 0, 0, chan_index
);
2150 case TGSI_OPCODE_SLE
:
2154 case TGSI_OPCODE_SNE
:
2158 case TGSI_OPCODE_STR
:
2162 case TGSI_OPCODE_TEX
:
2164 /* Disable dummy texture code:
2171 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2172 STORE( func
, *inst
, 0, 0, chan_index
);
2180 case TGSI_OPCODE_TXD
:
2184 case TGSI_OPCODE_UP2H
:
2188 case TGSI_OPCODE_UP2US
:
2192 case TGSI_OPCODE_UP4B
:
2196 case TGSI_OPCODE_UP4UB
:
2200 case TGSI_OPCODE_X2D
:
2204 case TGSI_OPCODE_ARA
:
2208 case TGSI_OPCODE_ARR
:
2209 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2210 FETCH( func
, *inst
, 0, 0, chan_index
);
2211 emit_rnd( func
, 0, 0 );
2212 emit_f2it( func
, 0 );
2213 STORE( func
, *inst
, 0, 0, chan_index
);
2217 case TGSI_OPCODE_BRA
:
2221 case TGSI_OPCODE_CAL
:
2225 case TGSI_OPCODE_RET
:
2229 case TGSI_OPCODE_END
:
2232 case TGSI_OPCODE_SSG
:
2233 /* TGSI_OPCODE_SGN */
2234 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2235 FETCH( func
, *inst
, 0, 0, chan_index
);
2236 emit_sgn( func
, 0, 0 );
2237 STORE( func
, *inst
, 0, 0, chan_index
);
2241 case TGSI_OPCODE_CMP
:
2242 emit_cmp (func
, inst
);
2245 case TGSI_OPCODE_SCS
:
2246 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
2247 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2248 emit_cos( func
, 0, 0 );
2249 STORE( func
, *inst
, 0, 0, CHAN_X
);
2251 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
2252 FETCH( func
, *inst
, 0, 0, CHAN_X
);
2253 emit_sin( func
, 0, 0 );
2254 STORE( func
, *inst
, 0, 0, CHAN_Y
);
2256 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
2260 TGSI_EXEC_TEMP_00000000_I
,
2261 TGSI_EXEC_TEMP_00000000_C
);
2262 STORE( func
, *inst
, 0, 0, CHAN_Z
);
2264 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
2270 STORE( func
, *inst
, 0, 0, CHAN_W
);
2274 case TGSI_OPCODE_TXB
:
2278 case TGSI_OPCODE_NRM
:
2280 case TGSI_OPCODE_NRM4
:
2281 /* 3 or 4-component normalization */
2283 uint dims
= (inst
->Instruction
.Opcode
== TGSI_OPCODE_NRM
) ? 3 : 4;
2285 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
) ||
2286 IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
) ||
2287 IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
) ||
2288 (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
) && dims
== 4)) {
2290 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2293 /* xmm0 = src.x * src.x */
2294 FETCH(func
, *inst
, 0, 0, CHAN_X
);
2295 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
)) {
2296 emit_MOV(func
, 4, 0);
2298 emit_mul(func
, 0, 0);
2301 /* xmm0 = xmm0 + src.y * src.y */
2302 FETCH(func
, *inst
, 1, 0, CHAN_Y
);
2303 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
)) {
2304 emit_MOV(func
, 5, 1);
2306 emit_mul(func
, 1, 1);
2307 emit_add(func
, 0, 1);
2310 /* xmm0 = xmm0 + src.z * src.z */
2311 FETCH(func
, *inst
, 1, 0, CHAN_Z
);
2312 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
)) {
2313 emit_MOV(func
, 6, 1);
2315 emit_mul(func
, 1, 1);
2316 emit_add(func
, 0, 1);
2320 /* xmm0 = xmm0 + src.w * src.w */
2321 FETCH(func
, *inst
, 1, 0, CHAN_W
);
2322 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
)) {
2323 emit_MOV(func
, 7, 1);
2325 emit_mul(func
, 1, 1);
2326 emit_add(func
, 0, 1);
2329 /* xmm1 = 1 / sqrt(xmm0) */
2330 emit_rsqrt(func
, 1, 0);
2332 /* dst.x = xmm1 * src.x */
2333 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
)) {
2334 emit_mul(func
, 4, 1);
2335 STORE(func
, *inst
, 4, 0, CHAN_X
);
2338 /* dst.y = xmm1 * src.y */
2339 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Y
)) {
2340 emit_mul(func
, 5, 1);
2341 STORE(func
, *inst
, 5, 0, CHAN_Y
);
2344 /* dst.z = xmm1 * src.z */
2345 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_Z
)) {
2346 emit_mul(func
, 6, 1);
2347 STORE(func
, *inst
, 6, 0, CHAN_Z
);
2350 /* dst.w = xmm1 * src.w */
2351 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_X
) && dims
== 4) {
2352 emit_mul(func
, 7, 1);
2353 STORE(func
, *inst
, 7, 0, CHAN_W
);
2358 if (IS_DST0_CHANNEL_ENABLED(*inst
, CHAN_W
) && dims
== 3) {
2359 emit_tempf(func
, 0, TEMP_ONE_I
, TEMP_ONE_C
);
2360 STORE(func
, *inst
, 0, 0, CHAN_W
);
2365 case TGSI_OPCODE_DIV
:
2369 case TGSI_OPCODE_DP2
:
2370 FETCH( func
, *inst
, 0, 0, CHAN_X
); /* xmm0 = src[0].x */
2371 FETCH( func
, *inst
, 1, 1, CHAN_X
); /* xmm1 = src[1].x */
2372 emit_mul( func
, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2373 FETCH( func
, *inst
, 1, 0, CHAN_Y
); /* xmm1 = src[0].y */
2374 FETCH( func
, *inst
, 2, 1, CHAN_Y
); /* xmm2 = src[1].y */
2375 emit_mul( func
, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2376 emit_add( func
, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2377 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2378 STORE( func
, *inst
, 0, 0, chan_index
); /* dest[ch] = xmm0 */
2382 case TGSI_OPCODE_TXL
:
2386 case TGSI_OPCODE_BRK
:
2390 case TGSI_OPCODE_IF
:
2394 case TGSI_OPCODE_LOOP
:
2398 case TGSI_OPCODE_REP
:
2402 case TGSI_OPCODE_ELSE
:
2406 case TGSI_OPCODE_ENDIF
:
2410 case TGSI_OPCODE_ENDLOOP
:
2414 case TGSI_OPCODE_ENDREP
:
2418 case TGSI_OPCODE_PUSHA
:
2422 case TGSI_OPCODE_POPA
:
2426 case TGSI_OPCODE_CEIL
:
2430 case TGSI_OPCODE_I2F
:
2434 case TGSI_OPCODE_NOT
:
2438 case TGSI_OPCODE_TRUNC
:
2439 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
2440 FETCH( func
, *inst
, 0, 0, chan_index
);
2441 emit_f2it( func
, 0 );
2442 emit_i2f( func
, 0 );
2443 STORE( func
, *inst
, 0, 0, chan_index
);
2447 case TGSI_OPCODE_SHL
:
2451 case TGSI_OPCODE_SHR
:
2455 case TGSI_OPCODE_AND
:
2459 case TGSI_OPCODE_OR
:
2463 case TGSI_OPCODE_MOD
:
2467 case TGSI_OPCODE_XOR
:
2471 case TGSI_OPCODE_SAD
:
2475 case TGSI_OPCODE_TXF
:
2479 case TGSI_OPCODE_TXQ
:
2483 case TGSI_OPCODE_CONT
:
2487 case TGSI_OPCODE_EMIT
:
2491 case TGSI_OPCODE_ENDPRIM
:
2504 struct x86_function
*func
,
2505 struct tgsi_full_declaration
*decl
)
2507 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
2508 unsigned first
, last
, mask
;
2511 first
= decl
->DeclarationRange
.First
;
2512 last
= decl
->DeclarationRange
.Last
;
2513 mask
= decl
->Declaration
.UsageMask
;
2515 for( i
= first
; i
<= last
; i
++ ) {
2516 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
2517 if( mask
& (1 << j
) ) {
2518 switch( decl
->Declaration
.Interpolate
) {
2519 case TGSI_INTERPOLATE_CONSTANT
:
2520 emit_coef_a0( func
, 0, i
, j
);
2521 emit_inputs( func
, 0, i
, j
);
2524 case TGSI_INTERPOLATE_LINEAR
:
2525 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2526 emit_coef_dadx( func
, 1, i
, j
);
2527 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2528 emit_coef_dady( func
, 3, i
, j
);
2529 emit_mul( func
, 0, 1 ); /* x * dadx */
2530 emit_coef_a0( func
, 4, i
, j
);
2531 emit_mul( func
, 2, 3 ); /* y * dady */
2532 emit_add( func
, 0, 4 ); /* x * dadx + a0 */
2533 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2534 emit_inputs( func
, 0, i
, j
);
2537 case TGSI_INTERPOLATE_PERSPECTIVE
:
2538 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
2539 emit_coef_dadx( func
, 1, i
, j
);
2540 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
2541 emit_coef_dady( func
, 3, i
, j
);
2542 emit_mul( func
, 0, 1 ); /* x * dadx */
2543 emit_tempf( func
, 4, 0, TGSI_SWIZZLE_W
);
2544 emit_coef_a0( func
, 5, i
, j
);
2545 emit_rcp( func
, 4, 4 ); /* 1.0 / w */
2546 emit_mul( func
, 2, 3 ); /* y * dady */
2547 emit_add( func
, 0, 5 ); /* x * dadx + a0 */
2548 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
2549 emit_mul( func
, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2550 emit_inputs( func
, 0, i
, j
);
2563 static void aos_to_soa( struct x86_function
*func
,
2569 struct x86_reg soa_input
= x86_make_reg( file_REG32
, reg_AX
);
2570 struct x86_reg aos_input
= x86_make_reg( file_REG32
, reg_BX
);
2571 struct x86_reg num_inputs
= x86_make_reg( file_REG32
, reg_CX
);
2572 struct x86_reg stride
= x86_make_reg( file_REG32
, reg_DX
);
2577 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2579 x86_mov( func
, aos_input
, x86_fn_arg( func
, arg_aos
) );
2580 x86_mov( func
, soa_input
, x86_fn_arg( func
, arg_machine
) );
2581 x86_lea( func
, soa_input
,
2582 x86_make_disp( soa_input
,
2583 Offset(struct tgsi_exec_machine
, Inputs
) ) );
2584 x86_mov( func
, num_inputs
, x86_fn_arg( func
, arg_num
) );
2585 x86_mov( func
, stride
, x86_fn_arg( func
, arg_stride
) );
2588 inner_loop
= x86_get_label( func
);
2590 x86_push( func
, aos_input
);
2591 sse_movlps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2592 sse_movlps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2593 x86_add( func
, aos_input
, stride
);
2594 sse_movhps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2595 sse_movhps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2596 x86_add( func
, aos_input
, stride
);
2597 sse_movlps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2598 sse_movlps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2599 x86_add( func
, aos_input
, stride
);
2600 sse_movhps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2601 sse_movhps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2602 x86_pop( func
, aos_input
);
2604 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2605 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2606 sse_shufps( func
, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2607 sse_shufps( func
, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2608 sse_shufps( func
, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2609 sse_shufps( func
, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2611 sse_movups( func
, x86_make_disp( soa_input
, 0 ), make_xmm( 0 ) );
2612 sse_movups( func
, x86_make_disp( soa_input
, 16 ), make_xmm( 2 ) );
2613 sse_movups( func
, x86_make_disp( soa_input
, 32 ), make_xmm( 3 ) );
2614 sse_movups( func
, x86_make_disp( soa_input
, 48 ), make_xmm( 5 ) );
2616 /* Advance to next input */
2617 x86_lea( func
, aos_input
, x86_make_disp(aos_input
, 16) );
2618 x86_lea( func
, soa_input
, x86_make_disp(soa_input
, 64) );
2620 /* while --num_inputs */
2621 x86_dec( func
, num_inputs
);
2622 x86_jcc( func
, cc_NE
, inner_loop
);
2625 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2628 static void soa_to_aos( struct x86_function
*func
,
2634 struct x86_reg soa_output
= x86_make_reg( file_REG32
, reg_AX
);
2635 struct x86_reg aos_output
= x86_make_reg( file_REG32
, reg_BX
);
2636 struct x86_reg num_outputs
= x86_make_reg( file_REG32
, reg_CX
);
2637 struct x86_reg temp
= x86_make_reg( file_REG32
, reg_DX
);
2641 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2643 x86_mov( func
, aos_output
, x86_fn_arg( func
, arg_aos
) );
2644 x86_mov( func
, soa_output
, x86_fn_arg( func
, arg_machine
) );
2645 x86_lea( func
, soa_output
,
2646 x86_make_disp( soa_output
,
2647 Offset(struct tgsi_exec_machine
, Outputs
) ) );
2648 x86_mov( func
, num_outputs
, x86_fn_arg( func
, arg_num
) );
2651 inner_loop
= x86_get_label( func
);
2653 sse_movups( func
, make_xmm( 0 ), x86_make_disp( soa_output
, 0 ) );
2654 sse_movups( func
, make_xmm( 1 ), x86_make_disp( soa_output
, 16 ) );
2655 sse_movups( func
, make_xmm( 3 ), x86_make_disp( soa_output
, 32 ) );
2656 sse_movups( func
, make_xmm( 4 ), x86_make_disp( soa_output
, 48 ) );
2658 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2659 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2660 sse_unpcklps( func
, make_xmm( 0 ), make_xmm( 1 ) );
2661 sse_unpckhps( func
, make_xmm( 2 ), make_xmm( 1 ) );
2662 sse_unpcklps( func
, make_xmm( 3 ), make_xmm( 4 ) );
2663 sse_unpckhps( func
, make_xmm( 5 ), make_xmm( 4 ) );
2665 x86_mov( func
, temp
, x86_fn_arg( func
, arg_stride
) );
2666 x86_push( func
, aos_output
);
2667 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2668 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2669 x86_add( func
, aos_output
, temp
);
2670 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2671 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2672 x86_add( func
, aos_output
, temp
);
2673 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2674 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2675 x86_add( func
, aos_output
, temp
);
2676 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2677 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2678 x86_pop( func
, aos_output
);
2680 /* Advance to next output */
2681 x86_lea( func
, aos_output
, x86_make_disp(aos_output
, 16) );
2682 x86_lea( func
, soa_output
, x86_make_disp(soa_output
, 64) );
2684 /* while --num_outputs */
2685 x86_dec( func
, num_outputs
);
2686 x86_jcc( func
, cc_NE
, inner_loop
);
2689 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2693 * Translate a TGSI vertex/fragment shader to SSE2 code.
2694 * Slightly different things are done for vertex vs. fragment shaders.
2696 * \param tokens the TGSI input shader
2697 * \param func the output SSE code/function
2698 * \param immediates buffer to place immediates, later passed to SSE func
2699 * \return 1 for success, 0 if translation failed
2703 const struct tgsi_token
*tokens
,
2704 struct x86_function
*func
,
2705 float (*immediates
)[4],
2706 boolean do_swizzles
)
2708 struct tgsi_parse_context parse
;
2710 uint num_immediates
= 0;
2714 func
->csr
= func
->store
;
2716 tgsi_parse_init( &parse
, tokens
);
2718 /* Can't just use EDI, EBX without save/restoring them:
2720 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
2721 x86_push( func
, x86_make_reg( file_REG32
, reg_DI
) );
2724 * Different function args for vertex/fragment shaders:
2726 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2732 6 ); /* input_stride */
2738 x86_fn_arg( func
, 1 ) );
2742 x86_fn_arg( func
, 2 ) );
2745 get_immediate_base(),
2746 x86_fn_arg( func
, 3 ) );
2748 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2752 x86_fn_arg( func
, 4 ) );
2756 while( !tgsi_parse_end_of_tokens( &parse
) && ok
) {
2757 tgsi_parse_token( &parse
);
2759 switch( parse
.FullToken
.Token
.Type
) {
2760 case TGSI_TOKEN_TYPE_DECLARATION
:
2761 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2764 &parse
.FullToken
.FullDeclaration
);
2768 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2769 ok
= emit_instruction(
2771 &parse
.FullToken
.FullInstruction
);
2774 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2775 parse
.FullToken
.FullInstruction
.Instruction
.Opcode
,
2776 parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
?
2777 "vertex shader" : "fragment shader");
2781 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2782 /* simply copy the immediate values into the next immediates[] slot */
2784 const uint size
= parse
.FullToken
.FullImmediate
.Immediate
.NrTokens
- 1;
2787 assert(num_immediates
< TGSI_EXEC_NUM_IMMEDIATES
);
2788 for( i
= 0; i
< size
; i
++ ) {
2789 immediates
[num_immediates
][i
] =
2790 parse
.FullToken
.FullImmediate
.u
.ImmediateFloat32
[i
].Float
;
2793 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2795 immediates
[num_immediates
][0],
2796 immediates
[num_immediates
][1],
2797 immediates
[num_immediates
][2],
2798 immediates
[num_immediates
][3]);
2810 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2815 8, /* num_outputs */
2816 9 ); /* output_stride */
2819 /* Can't just use EBX, EDI without save/restoring them:
2821 x86_pop( func
, x86_make_reg( file_REG32
, reg_DI
) );
2822 x86_pop( func
, x86_make_reg( file_REG32
, reg_BX
) );
2826 tgsi_parse_free( &parse
);
2831 #endif /* PIPE_ARCH_X86 */