tgsi: Implement OPCODE_ARR.
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_debug.h"
29 #include "pipe/p_shader_tokens.h"
30 #include "util/u_math.h"
31 #include "tgsi/tgsi_parse.h"
32 #include "tgsi/tgsi_util.h"
33 #include "tgsi_exec.h"
34 #include "tgsi_sse2.h"
35
36 #include "rtasm/rtasm_x86sse.h"
37
38 #ifdef PIPE_ARCH_X86
39
40 /* for 1/sqrt()
41 *
42 * This costs about 100fps (close to 10%) in gears:
43 */
44 #define HIGH_PRECISION 1
45
46 #define FAST_MATH 1
47
48
/* Iterate CHAN over all four vector components (X, Y, Z, W). */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Test whether dst register 0's write mask enables component CHAN. */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Loop over only those components that dst register 0 actually writes. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Short aliases for well-known tgsi_exec temporary-register slots. */
#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C

#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
74
75
76 /**
77 * X86 utility functions.
78 */
79
80 static struct x86_reg
81 make_xmm(
82 unsigned xmm )
83 {
84 return x86_make_reg(
85 file_XMM,
86 (enum x86_reg_name) xmm );
87 }
88
89 /**
90 * X86 register mapping helpers.
91 */
92
93 static struct x86_reg
94 get_const_base( void )
95 {
96 return x86_make_reg(
97 file_REG32,
98 reg_CX );
99 }
100
101 static struct x86_reg
102 get_input_base( void )
103 {
104 return x86_make_reg(
105 file_REG32,
106 reg_AX );
107 }
108
109 static struct x86_reg
110 get_output_base( void )
111 {
112 return x86_make_reg(
113 file_REG32,
114 reg_DX );
115 }
116
117 static struct x86_reg
118 get_temp_base( void )
119 {
120 return x86_make_reg(
121 file_REG32,
122 reg_BX );
123 }
124
125 static struct x86_reg
126 get_coef_base( void )
127 {
128 return get_output_base();
129 }
130
131 static struct x86_reg
132 get_immediate_base( void )
133 {
134 return x86_make_reg(
135 file_REG32,
136 reg_DI );
137 }
138
139
140 /**
141 * Data access helpers.
142 */
143
144
145 static struct x86_reg
146 get_immediate(
147 unsigned vec,
148 unsigned chan )
149 {
150 return x86_make_disp(
151 get_immediate_base(),
152 (vec * 4 + chan) * 4 );
153 }
154
155 static struct x86_reg
156 get_const(
157 unsigned vec,
158 unsigned chan )
159 {
160 return x86_make_disp(
161 get_const_base(),
162 (vec * 4 + chan) * 4 );
163 }
164
165 static struct x86_reg
166 get_input(
167 unsigned vec,
168 unsigned chan )
169 {
170 return x86_make_disp(
171 get_input_base(),
172 (vec * 4 + chan) * 16 );
173 }
174
175 static struct x86_reg
176 get_output(
177 unsigned vec,
178 unsigned chan )
179 {
180 return x86_make_disp(
181 get_output_base(),
182 (vec * 4 + chan) * 16 );
183 }
184
185 static struct x86_reg
186 get_temp(
187 unsigned vec,
188 unsigned chan )
189 {
190 return x86_make_disp(
191 get_temp_base(),
192 (vec * 4 + chan) * 16 );
193 }
194
195 static struct x86_reg
196 get_coef(
197 unsigned vec,
198 unsigned chan,
199 unsigned member )
200 {
201 return x86_make_disp(
202 get_coef_base(),
203 ((vec * 3 + member) * 4 + chan) * 4 );
204 }
205
206
/** Emit a function-return instruction. */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
213
214
215 /**
216 * Data fetch helpers.
217 */
218
219 /**
220 * Copy a shader constant to xmm register
221 * \param xmm the destination xmm register
222 * \param vec the src const buffer index
223 * \param chan src channel to fetch (X, Y, Z or W)
224 */
225 static void
226 emit_const(
227 struct x86_function *func,
228 uint xmm,
229 int vec,
230 uint chan,
231 uint indirect,
232 uint indirectFile,
233 int indirectIndex )
234 {
235 if (indirect) {
236 /* 'vec' is the offset from the address register's value.
237 * We're loading CONST[ADDR+vec] into an xmm register.
238 */
239 struct x86_reg r0 = get_input_base();
240 struct x86_reg r1 = get_output_base();
241 uint i;
242
243 assert( indirectFile == TGSI_FILE_ADDRESS );
244 assert( indirectIndex == 0 );
245
246 x86_push( func, r0 );
247 x86_push( func, r1 );
248
249 /*
250 * Loop over the four pixels or vertices in the quad.
251 * Get the value of the address (offset) register for pixel/vertex[i],
252 * add it to the src offset and index into the constant buffer.
253 * Note that we're working on SOA data.
254 * If any of the pixel/vertex execution channels are unused their
255 * values will be garbage. It's very important that we don't use
256 * those garbage values as indexes into the constant buffer since
257 * that'll cause segfaults.
258 * The solution is to bitwise-AND the offset with the execution mask
259 * register whose values are either 0 or ~0.
260 * The caller must setup the execution mask register to indicate
261 * which channels are valid/alive before running the shader.
262 * The execution mask will also figure into loops and conditionals
263 * someday.
264 */
265 for (i = 0; i < QUAD_SIZE; i++) {
266 /* r1 = address register[i] */
267 x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
268 /* r0 = execution mask[i] */
269 x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
270 /* r1 = r1 & r0 */
271 x86_and( func, r1, r0 );
272 /* r0 = 'vec', the offset */
273 x86_lea( func, r0, get_const( vec, chan ) );
274
275 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
276 */
277 x86_add( func, r1, r1 );
278 x86_add( func, r1, r1 );
279 x86_add( func, r1, r1 );
280 x86_add( func, r1, r1 );
281
282 x86_add( func, r0, r1 ); /* r0 = r0 + r1 */
283 x86_mov( func, r1, x86_deref( r0 ) );
284 x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
285 }
286
287 x86_pop( func, r1 );
288 x86_pop( func, r0 );
289
290 sse_movaps(
291 func,
292 make_xmm( xmm ),
293 get_temp( TEMP_R0, CHAN_X ) );
294 }
295 else {
296 /* 'vec' is the index into the src register file, such as TEMP[vec] */
297 assert( vec >= 0 );
298
299 sse_movss(
300 func,
301 make_xmm( xmm ),
302 get_const( vec, chan ) );
303 sse_shufps(
304 func,
305 make_xmm( xmm ),
306 make_xmm( xmm ),
307 SHUF( 0, 0, 0, 0 ) );
308 }
309 }
310
311 static void
312 emit_immediate(
313 struct x86_function *func,
314 unsigned xmm,
315 unsigned vec,
316 unsigned chan )
317 {
318 sse_movss(
319 func,
320 make_xmm( xmm ),
321 get_immediate( vec, chan ) );
322 sse_shufps(
323 func,
324 make_xmm( xmm ),
325 make_xmm( xmm ),
326 SHUF( 0, 0, 0, 0 ) );
327 }
328
329
330 /**
331 * Copy a shader input to xmm register
332 * \param xmm the destination xmm register
333 * \param vec the src input attrib
334 * \param chan src channel to fetch (X, Y, Z or W)
335 */
336 static void
337 emit_inputf(
338 struct x86_function *func,
339 unsigned xmm,
340 unsigned vec,
341 unsigned chan )
342 {
343 sse_movups(
344 func,
345 make_xmm( xmm ),
346 get_input( vec, chan ) );
347 }
348
349 /**
350 * Store an xmm register to a shader output
351 * \param xmm the source xmm register
352 * \param vec the dest output attrib
353 * \param chan src dest channel to store (X, Y, Z or W)
354 */
355 static void
356 emit_output(
357 struct x86_function *func,
358 unsigned xmm,
359 unsigned vec,
360 unsigned chan )
361 {
362 sse_movups(
363 func,
364 get_output( vec, chan ),
365 make_xmm( xmm ) );
366 }
367
368 /**
369 * Copy a shader temporary to xmm register
370 * \param xmm the destination xmm register
371 * \param vec the src temp register
372 * \param chan src channel to fetch (X, Y, Z or W)
373 */
374 static void
375 emit_tempf(
376 struct x86_function *func,
377 unsigned xmm,
378 unsigned vec,
379 unsigned chan )
380 {
381 sse_movaps(
382 func,
383 make_xmm( xmm ),
384 get_temp( vec, chan ) );
385 }
386
387 /**
388 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
389 * \param xmm the destination xmm register
390 * \param vec the src input/attribute coefficient index
391 * \param chan src channel to fetch (X, Y, Z or W)
392 * \param member 0=a0, 1=dadx, 2=dady
393 */
394 static void
395 emit_coef(
396 struct x86_function *func,
397 unsigned xmm,
398 unsigned vec,
399 unsigned chan,
400 unsigned member )
401 {
402 sse_movss(
403 func,
404 make_xmm( xmm ),
405 get_coef( vec, chan, member ) );
406 sse_shufps(
407 func,
408 make_xmm( xmm ),
409 make_xmm( xmm ),
410 SHUF( 0, 0, 0, 0 ) );
411 }
412
413 /**
414 * Data store helpers.
415 */
416
/** Store an xmm register back to a shader input slot (unaligned move). */
static void
emit_inputs(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, get_input( vec, chan ), make_xmm( xmm ) );
}
429
/** Store an xmm register to a shader temporary (aligned move). */
static void
emit_temps(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps( func, get_temp( vec, chan ), make_xmm( xmm ) );
}
442
443 static void
444 emit_addrs(
445 struct x86_function *func,
446 unsigned xmm,
447 unsigned vec,
448 unsigned chan )
449 {
450 assert( vec == 0 );
451
452 emit_temps(
453 func,
454 xmm,
455 vec + TGSI_EXEC_TEMP_ADDR,
456 chan );
457 }
458
459 /**
460 * Coefficent fetch helpers.
461 */
462
/** Fetch the a0 (constant) interpolation coefficient. */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}
477
/** Fetch the dadx (x-derivative) interpolation coefficient. */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}
492
/** Fetch the dady (y-derivative) interpolation coefficient. */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
507
508 /**
509 * Function call helpers.
510 */
511
512 static void
513 emit_push_gp(
514 struct x86_function *func )
515 {
516 x86_push(
517 func,
518 x86_make_reg( file_REG32, reg_AX) );
519 x86_push(
520 func,
521 x86_make_reg( file_REG32, reg_CX) );
522 x86_push(
523 func,
524 x86_make_reg( file_REG32, reg_DX) );
525 }
526
527 static void
528 x86_pop_gp(
529 struct x86_function *func )
530 {
531 /* Restore GP registers in a reverse order.
532 */
533 x86_pop(
534 func,
535 x86_make_reg( file_REG32, reg_DX) );
536 x86_pop(
537 func,
538 x86_make_reg( file_REG32, reg_CX) );
539 x86_pop(
540 func,
541 x86_make_reg( file_REG32, reg_AX) );
542 }
543
/**
 * Emit a call to a C helper that modifies 4 packed floats in place.
 * The xmm_dst register is spilled to the TEMP_R0 scratch slot, a pointer
 * to that slot is passed as the single cdecl stack argument, and the
 * (possibly modified) slot is reloaded into xmm_dst afterwards.
 * eax/ecx/edx are preserved across the call.
 */
static void
emit_func_call_dst(
   struct x86_function *func,
   unsigned xmm_dst,
   void (PIPE_CDECL *code)() )
{
   /* Spill the operand to memory where the helper can reach it. */
   sse_movaps(
      func,
      get_temp( TEMP_R0, 0 ),
      make_xmm( xmm_dst ) );

   /* eax/ecx/edx are caller-saved under cdecl -- preserve them. */
   emit_push_gp(
      func );

   {
      struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );

      /* Push &TEMP_R0 as the helper's only argument. */
      x86_lea(
         func,
         ecx,
         get_temp( TEMP_R0, 0 ) );

      x86_push( func, ecx );
      x86_mov_reg_imm( func, ecx, (unsigned long) code );
      x86_call( func, ecx );
      /* cdecl: caller removes the argument from the stack. */
      x86_pop(func, ecx );
   }


   x86_pop_gp(
      func );

   /* Reload the in-place-modified result. */
   sse_movaps(
      func,
      make_xmm( xmm_dst ),
      get_temp( TEMP_R0, 0 ) );
}
581
582 static void
583 emit_func_call_dst_src(
584 struct x86_function *func,
585 unsigned xmm_dst,
586 unsigned xmm_src,
587 void (PIPE_CDECL *code)() )
588 {
589 sse_movaps(
590 func,
591 get_temp( TEMP_R0, 1 ),
592 make_xmm( xmm_src ) );
593
594 emit_func_call_dst(
595 func,
596 xmm_dst,
597 code );
598 }
599
600 /**
601 * Low-level instruction translators.
602 */
603
604 static void
605 emit_abs(
606 struct x86_function *func,
607 unsigned xmm )
608 {
609 sse_andps(
610 func,
611 make_xmm( xmm ),
612 get_temp(
613 TGSI_EXEC_TEMP_7FFFFFFF_I,
614 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
615 }
616
/** xmm_dst += xmm_src (4 floats, addps). */
static void
emit_add(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_addps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
628
629 static void PIPE_CDECL
630 cos4f(
631 float *store )
632 {
633 store[0] = cosf( store[0] );
634 store[1] = cosf( store[1] );
635 store[2] = cosf( store[2] );
636 store[3] = cosf( store[3] );
637 }
638
639 static void
640 emit_cos(
641 struct x86_function *func,
642 unsigned xmm_dst )
643 {
644 emit_func_call_dst(
645 func,
646 xmm_dst,
647 cos4f );
648 }
649
650 static void PIPE_CDECL
651 ex24f(
652 float *store )
653 {
654 #if FAST_MATH
655 store[0] = util_fast_exp2( store[0] );
656 store[1] = util_fast_exp2( store[1] );
657 store[2] = util_fast_exp2( store[2] );
658 store[3] = util_fast_exp2( store[3] );
659 #else
660 store[0] = powf( 2.0f, store[0] );
661 store[1] = powf( 2.0f, store[1] );
662 store[2] = powf( 2.0f, store[2] );
663 store[3] = powf( 2.0f, store[3] );
664 #endif
665 }
666
667 static void
668 emit_ex2(
669 struct x86_function *func,
670 unsigned xmm_dst )
671 {
672 emit_func_call_dst(
673 func,
674 xmm_dst,
675 ex24f );
676 }
677
/** Convert 4 floats to 4 ints in place (cvttps2dq, truncating). */
static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) );
}
688
/** Convert 4 ints to 4 floats in place (cvtdq2ps). */
static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
}
699
700 static void PIPE_CDECL
701 flr4f(
702 float *store )
703 {
704 store[0] = floorf( store[0] );
705 store[1] = floorf( store[1] );
706 store[2] = floorf( store[2] );
707 store[3] = floorf( store[3] );
708 }
709
710 static void
711 emit_flr(
712 struct x86_function *func,
713 unsigned xmm_dst )
714 {
715 emit_func_call_dst(
716 func,
717 xmm_dst,
718 flr4f );
719 }
720
721 static void PIPE_CDECL
722 frc4f(
723 float *store )
724 {
725 store[0] -= floorf( store[0] );
726 store[1] -= floorf( store[1] );
727 store[2] -= floorf( store[2] );
728 store[3] -= floorf( store[3] );
729 }
730
731 static void
732 emit_frc(
733 struct x86_function *func,
734 unsigned xmm_dst )
735 {
736 emit_func_call_dst(
737 func,
738 xmm_dst,
739 frc4f );
740 }
741
742 static void PIPE_CDECL
743 lg24f(
744 float *store )
745 {
746 store[0] = util_fast_log2( store[0] );
747 store[1] = util_fast_log2( store[1] );
748 store[2] = util_fast_log2( store[2] );
749 store[3] = util_fast_log2( store[3] );
750 }
751
752 static void
753 emit_lg2(
754 struct x86_function *func,
755 unsigned xmm_dst )
756 {
757 emit_func_call_dst(
758 func,
759 xmm_dst,
760 lg24f );
761 }
762
/** xmm_dst = xmm_src (register-to-register movups). */
static void
emit_MOV(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
774
/** xmm_dst *= xmm_src (4 floats, mulps). */
static void
emit_mul (struct x86_function *func,
          unsigned xmm_dst,
          unsigned xmm_src)
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
785
786 static void
787 emit_neg(
788 struct x86_function *func,
789 unsigned xmm )
790 {
791 sse_xorps(
792 func,
793 make_xmm( xmm ),
794 get_temp(
795 TGSI_EXEC_TEMP_80000000_I,
796 TGSI_EXEC_TEMP_80000000_C ) );
797 }
798
799 static void PIPE_CDECL
800 pow4f(
801 float *store )
802 {
803 #if FAST_MATH
804 store[0] = util_fast_pow( store[0], store[4] );
805 store[1] = util_fast_pow( store[1], store[5] );
806 store[2] = util_fast_pow( store[2], store[6] );
807 store[3] = util_fast_pow( store[3], store[7] );
808 #else
809 store[0] = powf( store[0], store[4] );
810 store[1] = powf( store[1], store[5] );
811 store[2] = powf( store[2], store[6] );
812 store[3] = powf( store[3], store[7] );
813 #endif
814 }
815
816 static void
817 emit_pow(
818 struct x86_function *func,
819 unsigned xmm_dst,
820 unsigned xmm_src )
821 {
822 emit_func_call_dst_src(
823 func,
824 xmm_dst,
825 xmm_src,
826 pow4f );
827 }
828
/** xmm_dst = approx 1/xmm_src.
 *
 * On Intel CPUs at least, rcpps is only accurate to 12 bits -- not
 * good enough.  Need to either emit a proper divide or use the
 * iterative technique described below in emit_rsqrt().
 */
static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
844
845 static void PIPE_CDECL
846 rnd4f(
847 float *store )
848 {
849 store[0] = floorf( store[0] + 0.5f );
850 store[1] = floorf( store[1] + 0.5f );
851 store[2] = floorf( store[2] + 0.5f );
852 store[3] = floorf( store[3] + 0.5f );
853 }
854
855 static void
856 emit_rnd(
857 struct x86_function *func,
858 unsigned xmm_save,
859 unsigned xmm_dst )
860 {
861 emit_func_call_dst(
862 func,
863 xmm_save,
864 xmm_dst,
865 rnd4f );
866 }
867
/**
 * Emit RSQ: xmm_dst = 1/sqrt(xmm_src).
 * With HIGH_PRECISION the raw rsqrtps estimate is refined by one
 * Newton-Raphson step.  Clobbers xmm2, xmm3 and xmm_src (asserted below),
 * so the caller must keep live values out of those registers.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );   /* dst = 0.5 */
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); /* tmp0 = 3.0 */
      sse_rsqrtps( func, tmp1, src );       /* tmp1 = rsqrtps(a), the estimate */
      sse_mulps( func, src, tmp1 );         /* src = a * est */
      sse_mulps( func, dst, tmp1 );         /* dst = 0.5 * est */
      sse_mulps( func, src, tmp1 );         /* src = a * est * est */
      sse_subps( func, tmp0, src );         /* tmp0 = 3.0 - a*est*est */
      sse_mulps( func, dst, tmp0 );         /* dst = 0.5*est * (3.0 - a*est*est) */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
913
914 static void
915 emit_setsign(
916 struct x86_function *func,
917 unsigned xmm )
918 {
919 sse_orps(
920 func,
921 make_xmm( xmm ),
922 get_temp(
923 TGSI_EXEC_TEMP_80000000_I,
924 TGSI_EXEC_TEMP_80000000_C ) );
925 }
926
927 static void PIPE_CDECL
928 sin4f(
929 float *store )
930 {
931 store[0] = sinf( store[0] );
932 store[1] = sinf( store[1] );
933 store[2] = sinf( store[2] );
934 store[3] = sinf( store[3] );
935 }
936
937 static void
938 emit_sin (struct x86_function *func,
939 unsigned xmm_dst)
940 {
941 emit_func_call_dst(
942 func,
943 xmm_dst,
944 sin4f );
945 }
946
/** xmm_dst -= xmm_src (4 floats, subps). */
static void
emit_sub(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
958
959 /**
960 * Register fetch.
961 */
962
/**
 * Load one channel of a TGSI source operand into an xmm register,
 * honoring the extended swizzle (X/Y/Z/W/ZERO/ONE) and the operand's
 * sign mode (abs / set-sign / negate / keep).
 * \param xmm        destination xmm register
 * \param reg        full src register description
 * \param chan_index channel of the instruction being generated
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      /* Real component: dispatch on the source register file. */
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      /* Synthetic 0.0 component, read from the pre-loaded temp slot. */
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      /* Synthetic 1.0 component. */
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   /* Apply the operand's sign mode to the fetched value. */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}
1055
/** Fetch channel CHAN of src operand INDEX into register XMM (see emit_fetch()). */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1058
1059 /**
1060 * Register store.
1061 */
1062
/**
 * Store an xmm register to one channel of a TGSI destination operand.
 * \param xmm        source xmm register
 * \param reg        full dst register description
 * \param inst       instruction (consulted for the saturate mode)
 * \param chan_index channel being written
 *
 * NOTE: saturation is not implemented -- TGSI_SAT_ZERO_ONE is silently
 * ignored (the assert is commented out) and TGSI_SAT_MINUS_PLUS_ONE
 * asserts.
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      assert( 0 );
   }

   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* assert( 0 ); */
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }
}
1113
/** Store register XMM to channel CHAN of dst operand INDEX (see emit_store()). */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1116
1117 /**
1118 * High-level instruction translators.
1119 */
1120
/**
 * Emit TGSI KIL: for each distinct source component, compare against zero
 * and OR the resulting per-pixel mask bits into the kill-mask temp.
 * Components swizzled to ZERO or ONE are never < 0 and are skipped.
 * Uses eax/edx as scratch (saved/restored around the comparison loop).
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned registers[4];
   unsigned nextregister = 0;
   unsigned firstchan = ~0;
   unsigned chan_index;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested. */
   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_extswizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         registers[chan_index] = nextregister;
         emit_fetch(
            func,
            nextregister,
            reg,
            chan_index );
         nextregister++;

         /* mark the first channel used */
         if( firstchan == ~0 ) {
            firstchan = chan_index;
         }
      }
   }

   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   /* NOTE(review): this loop masks with chan_index while the bits above
    * were set by swizzle value; the two coincide only when the source is
    * unswizzled -- verify behavior for swizzled KIL operands.
    */
   FOR_EACH_CHANNEL( chan_index ) {
      if( uniquemask & (1 << chan_index) ) {
         /* mask = (component < 0) ? ~0 : 0, per pixel */
         sse_cmpps(
            func,
            make_xmm( registers[chan_index] ),
            get_temp(
               TGSI_EXEC_TEMP_00000000_I,
               TGSI_EXEC_TEMP_00000000_C ),
            cc_LessThan );

         if( chan_index == firstchan ) {
            /* First component: initialize the accumulated mask in eax. */
            sse_pmovmskb(
               func,
               x86_make_reg( file_REG32, reg_AX ),
               make_xmm( registers[chan_index] ) );
         }
         else {
            /* Subsequent components: OR their mask bits into eax via edx. */
            sse_pmovmskb(
               func,
               x86_make_reg( file_REG32, reg_DX ),
               make_xmm( registers[chan_index] ) );
            x86_or(
               func,
               x86_make_reg( file_REG32, reg_AX ),
               x86_make_reg( file_REG32, reg_DX ) );
         }
      }
   }

   /* Merge the new kill bits into the persistent kill-mask temp. */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1215
1216
/** Emit TGSI KILP (predicated kill) -- not implemented yet; emits nothing. */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1223
1224
/**
 * Emit a SET* comparison: dst = (src0 <cc> src1) ? 1.0 : 0.0 per enabled
 * channel.  cmpps produces an all-ones/all-zeros mask; ANDing with 1.0
 * converts that mask into a float boolean.
 */
static void
emit_setcc(
   struct x86_function *func,
   struct tgsi_full_instruction *inst,
   enum sse_cc cc )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      sse_cmpps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ),
         cc );
      /* mask & 1.0f -> 1.0f where true, 0.0f where false */
      sse_andps(
         func,
         make_xmm( 0 ),
         get_temp(
            TEMP_ONE_I,
            TEMP_ONE_C ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}
1250
/**
 * Emit TGSI CMP: dst = (src0 < 0.0) ? src1 : src2, per enabled channel.
 * Builds a less-than-zero mask in xmm0, then selects between src1 (AND)
 * and src2 (ANDN) and merges with OR.
 */
static void
emit_cmp(
   struct x86_function *func,
   struct tgsi_full_instruction *inst )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      FETCH( func, *inst, 2, 2, chan_index );
      /* xmm0 = (src0 < 0) ? ~0 : 0 */
      sse_cmpps(
         func,
         make_xmm( 0 ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );
      /* xmm1 = src1 & mask */
      sse_andps(
         func,
         make_xmm( 1 ),
         make_xmm( 0 ) );
      /* xmm0 = ~mask & src2 */
      sse_andnps(
         func,
         make_xmm( 0 ),
         make_xmm( 2 ) );
      /* xmm0 = selected result */
      sse_orps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}
1284
1285 static int
1286 emit_instruction(
1287 struct x86_function *func,
1288 struct tgsi_full_instruction *inst )
1289 {
1290 unsigned chan_index;
1291
1292 switch (inst->Instruction.Opcode) {
1293 case TGSI_OPCODE_ARL:
1294 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1295 FETCH( func, *inst, 0, 0, chan_index );
1296 emit_f2it( func, 0 );
1297 STORE( func, *inst, 0, 0, chan_index );
1298 }
1299 break;
1300
1301 case TGSI_OPCODE_MOV:
1302 case TGSI_OPCODE_SWZ:
1303 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1304 FETCH( func, *inst, 0, 0, chan_index );
1305 STORE( func, *inst, 0, 0, chan_index );
1306 }
1307 break;
1308
1309 case TGSI_OPCODE_LIT:
1310 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1311 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1312 emit_tempf(
1313 func,
1314 0,
1315 TEMP_ONE_I,
1316 TEMP_ONE_C);
1317 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1318 STORE( func, *inst, 0, 0, CHAN_X );
1319 }
1320 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1321 STORE( func, *inst, 0, 0, CHAN_W );
1322 }
1323 }
1324 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1325 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1326 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1327 FETCH( func, *inst, 0, 0, CHAN_X );
1328 sse_maxps(
1329 func,
1330 make_xmm( 0 ),
1331 get_temp(
1332 TGSI_EXEC_TEMP_00000000_I,
1333 TGSI_EXEC_TEMP_00000000_C ) );
1334 STORE( func, *inst, 0, 0, CHAN_Y );
1335 }
1336 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1337 /* XMM[1] = SrcReg[0].yyyy */
1338 FETCH( func, *inst, 1, 0, CHAN_Y );
1339 /* XMM[1] = max(XMM[1], 0) */
1340 sse_maxps(
1341 func,
1342 make_xmm( 1 ),
1343 get_temp(
1344 TGSI_EXEC_TEMP_00000000_I,
1345 TGSI_EXEC_TEMP_00000000_C ) );
1346 /* XMM[2] = SrcReg[0].wwww */
1347 FETCH( func, *inst, 2, 0, CHAN_W );
1348 /* XMM[2] = min(XMM[2], 128.0) */
1349 sse_minps(
1350 func,
1351 make_xmm( 2 ),
1352 get_temp(
1353 TGSI_EXEC_TEMP_128_I,
1354 TGSI_EXEC_TEMP_128_C ) );
1355 /* XMM[2] = max(XMM[2], -128.0) */
1356 sse_maxps(
1357 func,
1358 make_xmm( 2 ),
1359 get_temp(
1360 TGSI_EXEC_TEMP_MINUS_128_I,
1361 TGSI_EXEC_TEMP_MINUS_128_C ) );
1362 emit_pow( func, 1, 2 );
1363 FETCH( func, *inst, 0, 0, CHAN_X );
1364 sse_xorps(
1365 func,
1366 make_xmm( 2 ),
1367 make_xmm( 2 ) );
1368 sse_cmpps(
1369 func,
1370 make_xmm( 2 ),
1371 make_xmm( 0 ),
1372 cc_LessThanEqual );
1373 sse_andps(
1374 func,
1375 make_xmm( 2 ),
1376 make_xmm( 1 ) );
1377 STORE( func, *inst, 2, 0, CHAN_Z );
1378 }
1379 }
1380 break;
1381
1382 case TGSI_OPCODE_RCP:
1383 /* TGSI_OPCODE_RECIP */
1384 FETCH( func, *inst, 0, 0, CHAN_X );
1385 emit_rcp( func, 0, 0 );
1386 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1387 STORE( func, *inst, 0, 0, chan_index );
1388 }
1389 break;
1390
1391 case TGSI_OPCODE_RSQ:
1392 /* TGSI_OPCODE_RECIPSQRT */
1393 FETCH( func, *inst, 0, 0, CHAN_X );
1394 emit_rsqrt( func, 1, 0 );
1395 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1396 STORE( func, *inst, 1, 0, chan_index );
1397 }
1398 break;
1399
1400 case TGSI_OPCODE_EXP:
1401 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1402 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1403 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1404 FETCH( func, *inst, 0, 0, CHAN_X );
1405 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1406 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1407 emit_MOV( func, 1, 0 );
1408 emit_flr( func, 1 );
1409 /* dst.x = ex2(floor(src.x)) */
1410 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1411 emit_MOV( func, 2, 1 );
1412 emit_ex2( func, 2 );
1413 STORE( func, *inst, 2, 0, CHAN_X );
1414 }
1415 /* dst.y = src.x - floor(src.x) */
1416 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1417 emit_MOV( func, 2, 0 );
1418 emit_sub( func, 2, 1 );
1419 STORE( func, *inst, 2, 0, CHAN_Y );
1420 }
1421 }
1422 /* dst.z = ex2(src.x) */
1423 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1424 emit_ex2( func, 0 );
1425 STORE( func, *inst, 0, 0, CHAN_Z );
1426 }
1427 }
1428 /* dst.w = 1.0 */
1429 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1430 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1431 STORE( func, *inst, 0, 0, CHAN_W );
1432 }
1433 break;
1434
1435 case TGSI_OPCODE_LOG:
1436 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1437 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1438 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1439 FETCH( func, *inst, 0, 0, CHAN_X );
1440 emit_abs( func, 0 );
1441 emit_MOV( func, 1, 0 );
1442 emit_lg2( func, 1 );
1443 /* dst.z = lg2(abs(src.x)) */
1444 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1445 STORE( func, *inst, 1, 0, CHAN_Z );
1446 }
1447 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1448 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1449 emit_flr( func, 1 );
1450 /* dst.x = floor(lg2(abs(src.x))) */
1451 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1452 STORE( func, *inst, 1, 0, CHAN_X );
1453 }
1454 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1455 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1456 emit_ex2( func, 1 );
1457 emit_rcp( func, 1, 1 );
1458 emit_mul( func, 0, 1 );
1459 STORE( func, *inst, 0, 0, CHAN_Y );
1460 }
1461 }
1462 }
1463 /* dst.w = 1.0 */
1464 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1465 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1466 STORE( func, *inst, 0, 0, CHAN_W );
1467 }
1468 break;
1469
1470 case TGSI_OPCODE_MUL:
1471 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1472 FETCH( func, *inst, 0, 0, chan_index );
1473 FETCH( func, *inst, 1, 1, chan_index );
1474 emit_mul( func, 0, 1 );
1475 STORE( func, *inst, 0, 0, chan_index );
1476 }
1477 break;
1478
1479 case TGSI_OPCODE_ADD:
1480 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1481 FETCH( func, *inst, 0, 0, chan_index );
1482 FETCH( func, *inst, 1, 1, chan_index );
1483 emit_add( func, 0, 1 );
1484 STORE( func, *inst, 0, 0, chan_index );
1485 }
1486 break;
1487
1488 case TGSI_OPCODE_DP3:
1489 /* TGSI_OPCODE_DOT3 */
1490 FETCH( func, *inst, 0, 0, CHAN_X );
1491 FETCH( func, *inst, 1, 1, CHAN_X );
1492 emit_mul( func, 0, 1 );
1493 FETCH( func, *inst, 1, 0, CHAN_Y );
1494 FETCH( func, *inst, 2, 1, CHAN_Y );
1495 emit_mul( func, 1, 2 );
1496 emit_add( func, 0, 1 );
1497 FETCH( func, *inst, 1, 0, CHAN_Z );
1498 FETCH( func, *inst, 2, 1, CHAN_Z );
1499 emit_mul( func, 1, 2 );
1500 emit_add( func, 0, 1 );
1501 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1502 STORE( func, *inst, 0, 0, chan_index );
1503 }
1504 break;
1505
1506 case TGSI_OPCODE_DP4:
1507 /* TGSI_OPCODE_DOT4 */
1508 FETCH( func, *inst, 0, 0, CHAN_X );
1509 FETCH( func, *inst, 1, 1, CHAN_X );
1510 emit_mul( func, 0, 1 );
1511 FETCH( func, *inst, 1, 0, CHAN_Y );
1512 FETCH( func, *inst, 2, 1, CHAN_Y );
1513 emit_mul( func, 1, 2 );
1514 emit_add( func, 0, 1 );
1515 FETCH( func, *inst, 1, 0, CHAN_Z );
1516 FETCH( func, *inst, 2, 1, CHAN_Z );
1517 emit_mul(func, 1, 2 );
1518 emit_add(func, 0, 1 );
1519 FETCH( func, *inst, 1, 0, CHAN_W );
1520 FETCH( func, *inst, 2, 1, CHAN_W );
1521 emit_mul( func, 1, 2 );
1522 emit_add( func, 0, 1 );
1523 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1524 STORE( func, *inst, 0, 0, chan_index );
1525 }
1526 break;
1527
1528 case TGSI_OPCODE_DST:
1529 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1530 emit_tempf(
1531 func,
1532 0,
1533 TEMP_ONE_I,
1534 TEMP_ONE_C );
1535 STORE( func, *inst, 0, 0, CHAN_X );
1536 }
1537 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1538 FETCH( func, *inst, 0, 0, CHAN_Y );
1539 FETCH( func, *inst, 1, 1, CHAN_Y );
1540 emit_mul( func, 0, 1 );
1541 STORE( func, *inst, 0, 0, CHAN_Y );
1542 }
1543 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1544 FETCH( func, *inst, 0, 0, CHAN_Z );
1545 STORE( func, *inst, 0, 0, CHAN_Z );
1546 }
1547 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1548 FETCH( func, *inst, 0, 1, CHAN_W );
1549 STORE( func, *inst, 0, 0, CHAN_W );
1550 }
1551 break;
1552
1553 case TGSI_OPCODE_MIN:
1554 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1555 FETCH( func, *inst, 0, 0, chan_index );
1556 FETCH( func, *inst, 1, 1, chan_index );
1557 sse_minps(
1558 func,
1559 make_xmm( 0 ),
1560 make_xmm( 1 ) );
1561 STORE( func, *inst, 0, 0, chan_index );
1562 }
1563 break;
1564
1565 case TGSI_OPCODE_MAX:
1566 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1567 FETCH( func, *inst, 0, 0, chan_index );
1568 FETCH( func, *inst, 1, 1, chan_index );
1569 sse_maxps(
1570 func,
1571 make_xmm( 0 ),
1572 make_xmm( 1 ) );
1573 STORE( func, *inst, 0, 0, chan_index );
1574 }
1575 break;
1576
1577 case TGSI_OPCODE_SLT:
1578 /* TGSI_OPCODE_SETLT */
1579 emit_setcc( func, inst, cc_LessThan );
1580 break;
1581
1582 case TGSI_OPCODE_SGE:
1583 /* TGSI_OPCODE_SETGE */
1584 emit_setcc( func, inst, cc_NotLessThan );
1585 break;
1586
1587 case TGSI_OPCODE_MAD:
1588 /* TGSI_OPCODE_MADD */
1589 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1590 FETCH( func, *inst, 0, 0, chan_index );
1591 FETCH( func, *inst, 1, 1, chan_index );
1592 FETCH( func, *inst, 2, 2, chan_index );
1593 emit_mul( func, 0, 1 );
1594 emit_add( func, 0, 2 );
1595 STORE( func, *inst, 0, 0, chan_index );
1596 }
1597 break;
1598
1599 case TGSI_OPCODE_SUB:
1600 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1601 FETCH( func, *inst, 0, 0, chan_index );
1602 FETCH( func, *inst, 1, 1, chan_index );
1603 emit_sub( func, 0, 1 );
1604 STORE( func, *inst, 0, 0, chan_index );
1605 }
1606 break;
1607
1608 case TGSI_OPCODE_LERP:
1609 /* TGSI_OPCODE_LRP */
1610 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1611 FETCH( func, *inst, 0, 0, chan_index );
1612 FETCH( func, *inst, 1, 1, chan_index );
1613 FETCH( func, *inst, 2, 2, chan_index );
1614 emit_sub( func, 1, 2 );
1615 emit_mul( func, 0, 1 );
1616 emit_add( func, 0, 2 );
1617 STORE( func, *inst, 0, 0, chan_index );
1618 }
1619 break;
1620
1621 case TGSI_OPCODE_CND:
1622 return 0;
1623 break;
1624
1625 case TGSI_OPCODE_CND0:
1626 return 0;
1627 break;
1628
1629 case TGSI_OPCODE_DOT2ADD:
1630 /* TGSI_OPCODE_DP2A */
1631 return 0;
1632 break;
1633
1634 case TGSI_OPCODE_INDEX:
1635 return 0;
1636 break;
1637
1638 case TGSI_OPCODE_NEGATE:
1639 return 0;
1640 break;
1641
1642 case TGSI_OPCODE_FRAC:
1643 /* TGSI_OPCODE_FRC */
1644 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1645 FETCH( func, *inst, 0, 0, chan_index );
1646 emit_frc( func, 0 );
1647 STORE( func, *inst, 0, 0, chan_index );
1648 }
1649 break;
1650
1651 case TGSI_OPCODE_CLAMP:
1652 return 0;
1653 break;
1654
1655 case TGSI_OPCODE_FLOOR:
1656 /* TGSI_OPCODE_FLR */
1657 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1658 FETCH( func, *inst, 0, 0, chan_index );
1659 emit_flr( func, 0 );
1660 STORE( func, *inst, 0, 0, chan_index );
1661 }
1662 break;
1663
1664 case TGSI_OPCODE_ROUND:
1665 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1666 FETCH( func, *inst, 0, 0, chan_index );
1667 emit_rnd( func, 0, 0 );
1668 STORE( func, *inst, 0, 0, chan_index );
1669 }
1670 break;
1671
1672 case TGSI_OPCODE_EXPBASE2:
1673 /* TGSI_OPCODE_EX2 */
1674 FETCH( func, *inst, 0, 0, CHAN_X );
1675 emit_ex2( func, 0 );
1676 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1677 STORE( func, *inst, 0, 0, chan_index );
1678 }
1679 break;
1680
1681 case TGSI_OPCODE_LOGBASE2:
1682 /* TGSI_OPCODE_LG2 */
1683 FETCH( func, *inst, 0, 0, CHAN_X );
1684 emit_lg2( func, 0 );
1685 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1686 STORE( func, *inst, 0, 0, chan_index );
1687 }
1688 break;
1689
1690 case TGSI_OPCODE_POWER:
1691 /* TGSI_OPCODE_POW */
1692 FETCH( func, *inst, 0, 0, CHAN_X );
1693 FETCH( func, *inst, 1, 1, CHAN_X );
1694 emit_pow( func, 0, 1 );
1695 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1696 STORE( func, *inst, 0, 0, chan_index );
1697 }
1698 break;
1699
1700 case TGSI_OPCODE_CROSSPRODUCT:
1701 /* TGSI_OPCODE_XPD */
1702 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1703 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1704 FETCH( func, *inst, 1, 1, CHAN_Z );
1705 FETCH( func, *inst, 3, 0, CHAN_Z );
1706 }
1707 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1708 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1709 FETCH( func, *inst, 0, 0, CHAN_Y );
1710 FETCH( func, *inst, 4, 1, CHAN_Y );
1711 }
1712 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1713 emit_MOV( func, 2, 0 );
1714 emit_mul( func, 2, 1 );
1715 emit_MOV( func, 5, 3 );
1716 emit_mul( func, 5, 4 );
1717 emit_sub( func, 2, 5 );
1718 STORE( func, *inst, 2, 0, CHAN_X );
1719 }
1720 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1721 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1722 FETCH( func, *inst, 2, 1, CHAN_X );
1723 FETCH( func, *inst, 5, 0, CHAN_X );
1724 }
1725 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1726 emit_mul( func, 3, 2 );
1727 emit_mul( func, 1, 5 );
1728 emit_sub( func, 3, 1 );
1729 STORE( func, *inst, 3, 0, CHAN_Y );
1730 }
1731 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1732 emit_mul( func, 5, 4 );
1733 emit_mul( func, 0, 2 );
1734 emit_sub( func, 5, 0 );
1735 STORE( func, *inst, 5, 0, CHAN_Z );
1736 }
1737 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1738 emit_tempf(
1739 func,
1740 0,
1741 TEMP_ONE_I,
1742 TEMP_ONE_C );
1743 STORE( func, *inst, 0, 0, CHAN_W );
1744 }
1745 break;
1746
1747 case TGSI_OPCODE_MULTIPLYMATRIX:
1748 return 0;
1749 break;
1750
1751 case TGSI_OPCODE_ABS:
1752 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1753 FETCH( func, *inst, 0, 0, chan_index );
1754 emit_abs( func, 0) ;
1755
1756 STORE( func, *inst, 0, 0, chan_index );
1757 }
1758 break;
1759
1760 case TGSI_OPCODE_RCC:
1761 return 0;
1762 break;
1763
1764 case TGSI_OPCODE_DPH:
1765 FETCH( func, *inst, 0, 0, CHAN_X );
1766 FETCH( func, *inst, 1, 1, CHAN_X );
1767 emit_mul( func, 0, 1 );
1768 FETCH( func, *inst, 1, 0, CHAN_Y );
1769 FETCH( func, *inst, 2, 1, CHAN_Y );
1770 emit_mul( func, 1, 2 );
1771 emit_add( func, 0, 1 );
1772 FETCH( func, *inst, 1, 0, CHAN_Z );
1773 FETCH( func, *inst, 2, 1, CHAN_Z );
1774 emit_mul( func, 1, 2 );
1775 emit_add( func, 0, 1 );
1776 FETCH( func, *inst, 1, 1, CHAN_W );
1777 emit_add( func, 0, 1 );
1778 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1779 STORE( func, *inst, 0, 0, chan_index );
1780 }
1781 break;
1782
1783 case TGSI_OPCODE_COS:
1784 FETCH( func, *inst, 0, 0, CHAN_X );
1785 emit_cos( func, 0 );
1786 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1787 STORE( func, *inst, 0, 0, chan_index );
1788 }
1789 break;
1790
1791 case TGSI_OPCODE_DDX:
1792 return 0;
1793 break;
1794
1795 case TGSI_OPCODE_DDY:
1796 return 0;
1797 break;
1798
1799 case TGSI_OPCODE_KILP:
1800 /* predicated kill */
1801 emit_kilp( func );
1802 return 0; /* XXX fix me */
1803 break;
1804
1805 case TGSI_OPCODE_KIL:
1806 /* conditional kill */
1807 emit_kil( func, &inst->FullSrcRegisters[0] );
1808 break;
1809
1810 case TGSI_OPCODE_PK2H:
1811 return 0;
1812 break;
1813
1814 case TGSI_OPCODE_PK2US:
1815 return 0;
1816 break;
1817
1818 case TGSI_OPCODE_PK4B:
1819 return 0;
1820 break;
1821
1822 case TGSI_OPCODE_PK4UB:
1823 return 0;
1824 break;
1825
1826 case TGSI_OPCODE_RFL:
1827 return 0;
1828 break;
1829
1830 case TGSI_OPCODE_SEQ:
1831 return 0;
1832 break;
1833
1834 case TGSI_OPCODE_SFL:
1835 return 0;
1836 break;
1837
1838 case TGSI_OPCODE_SGT:
1839 return 0;
1840 break;
1841
1842 case TGSI_OPCODE_SIN:
1843 FETCH( func, *inst, 0, 0, CHAN_X );
1844 emit_sin( func, 0 );
1845 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1846 STORE( func, *inst, 0, 0, chan_index );
1847 }
1848 break;
1849
1850 case TGSI_OPCODE_SLE:
1851 return 0;
1852 break;
1853
1854 case TGSI_OPCODE_SNE:
1855 return 0;
1856 break;
1857
1858 case TGSI_OPCODE_STR:
1859 return 0;
1860 break;
1861
1862 case TGSI_OPCODE_TEX:
1863 if (0) {
1864 /* Disable dummy texture code:
1865 */
1866 emit_tempf(
1867 func,
1868 0,
1869 TEMP_ONE_I,
1870 TEMP_ONE_C );
1871 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1872 STORE( func, *inst, 0, 0, chan_index );
1873 }
1874 }
1875 else {
1876 return 0;
1877 }
1878 break;
1879
1880 case TGSI_OPCODE_TXD:
1881 return 0;
1882 break;
1883
1884 case TGSI_OPCODE_UP2H:
1885 return 0;
1886 break;
1887
1888 case TGSI_OPCODE_UP2US:
1889 return 0;
1890 break;
1891
1892 case TGSI_OPCODE_UP4B:
1893 return 0;
1894 break;
1895
1896 case TGSI_OPCODE_UP4UB:
1897 return 0;
1898 break;
1899
1900 case TGSI_OPCODE_X2D:
1901 return 0;
1902 break;
1903
1904 case TGSI_OPCODE_ARA:
1905 return 0;
1906 break;
1907
1908 case TGSI_OPCODE_ARR:
1909 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1910 FETCH( func, *inst, 0, 0, chan_index );
1911 emit_rnd( func, 0, 0 );
1912 emit_f2it( func, 0 );
1913 STORE( func, *inst, 0, 0, chan_index );
1914 }
1915 break;
1916
1917 case TGSI_OPCODE_BRA:
1918 return 0;
1919 break;
1920
1921 case TGSI_OPCODE_CAL:
1922 return 0;
1923 break;
1924
1925 case TGSI_OPCODE_RET:
1926 emit_ret( func );
1927 break;
1928
1929 case TGSI_OPCODE_END:
1930 break;
1931
1932 case TGSI_OPCODE_SSG:
1933 return 0;
1934 break;
1935
1936 case TGSI_OPCODE_CMP:
1937 emit_cmp (func, inst);
1938 break;
1939
1940 case TGSI_OPCODE_SCS:
1941 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1942 FETCH( func, *inst, 0, 0, CHAN_X );
1943 emit_cos( func, 0 );
1944 STORE( func, *inst, 0, 0, CHAN_X );
1945 }
1946 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1947 FETCH( func, *inst, 0, 0, CHAN_X );
1948 emit_sin( func, 0 );
1949 STORE( func, *inst, 0, 0, CHAN_Y );
1950 }
1951 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1952 emit_tempf(
1953 func,
1954 0,
1955 TGSI_EXEC_TEMP_00000000_I,
1956 TGSI_EXEC_TEMP_00000000_C );
1957 STORE( func, *inst, 0, 0, CHAN_Z );
1958 }
1959 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1960 emit_tempf(
1961 func,
1962 0,
1963 TEMP_ONE_I,
1964 TEMP_ONE_C );
1965 STORE( func, *inst, 0, 0, CHAN_W );
1966 }
1967 break;
1968
1969 case TGSI_OPCODE_TXB:
1970 return 0;
1971 break;
1972
1973 case TGSI_OPCODE_NRM:
1974 return 0;
1975 break;
1976
1977 case TGSI_OPCODE_DIV:
1978 return 0;
1979 break;
1980
1981 case TGSI_OPCODE_DP2:
1982 return 0;
1983 break;
1984
1985 case TGSI_OPCODE_TXL:
1986 return 0;
1987 break;
1988
1989 case TGSI_OPCODE_BRK:
1990 return 0;
1991 break;
1992
1993 case TGSI_OPCODE_IF:
1994 return 0;
1995 break;
1996
1997 case TGSI_OPCODE_LOOP:
1998 return 0;
1999 break;
2000
2001 case TGSI_OPCODE_REP:
2002 return 0;
2003 break;
2004
2005 case TGSI_OPCODE_ELSE:
2006 return 0;
2007 break;
2008
2009 case TGSI_OPCODE_ENDIF:
2010 return 0;
2011 break;
2012
2013 case TGSI_OPCODE_ENDLOOP:
2014 return 0;
2015 break;
2016
2017 case TGSI_OPCODE_ENDREP:
2018 return 0;
2019 break;
2020
2021 case TGSI_OPCODE_PUSHA:
2022 return 0;
2023 break;
2024
2025 case TGSI_OPCODE_POPA:
2026 return 0;
2027 break;
2028
2029 case TGSI_OPCODE_CEIL:
2030 return 0;
2031 break;
2032
2033 case TGSI_OPCODE_I2F:
2034 return 0;
2035 break;
2036
2037 case TGSI_OPCODE_NOT:
2038 return 0;
2039 break;
2040
2041 case TGSI_OPCODE_TRUNC:
2042 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2043 FETCH( func, *inst, 0, 0, chan_index );
2044 emit_f2it( func, 0 );
2045 emit_i2f( func, 0 );
2046 STORE( func, *inst, 0, 0, chan_index );
2047 }
2048 break;
2049
2050 case TGSI_OPCODE_SHL:
2051 return 0;
2052 break;
2053
2054 case TGSI_OPCODE_SHR:
2055 return 0;
2056 break;
2057
2058 case TGSI_OPCODE_AND:
2059 return 0;
2060 break;
2061
2062 case TGSI_OPCODE_OR:
2063 return 0;
2064 break;
2065
2066 case TGSI_OPCODE_MOD:
2067 return 0;
2068 break;
2069
2070 case TGSI_OPCODE_XOR:
2071 return 0;
2072 break;
2073
2074 case TGSI_OPCODE_SAD:
2075 return 0;
2076 break;
2077
2078 case TGSI_OPCODE_TXF:
2079 return 0;
2080 break;
2081
2082 case TGSI_OPCODE_TXQ:
2083 return 0;
2084 break;
2085
2086 case TGSI_OPCODE_CONT:
2087 return 0;
2088 break;
2089
2090 case TGSI_OPCODE_EMIT:
2091 return 0;
2092 break;
2093
2094 case TGSI_OPCODE_ENDPRIM:
2095 return 0;
2096 break;
2097
2098 default:
2099 return 0;
2100 }
2101
2102 return 1;
2103 }
2104
/**
 * Emit per-fragment interpolation code for one input declaration.
 *
 * Only TGSI_FILE_INPUT declarations generate code; everything else is
 * ignored here.  For each attribute in the declared range and each
 * channel enabled in the usage mask, code is emitted to evaluate the
 * plane equation a0 + x*dadx + y*dady (optionally divided by w for
 * perspective correction) and store the result into the input file.
 *
 * NOTE(review): xmm registers 0-5 are used as scratch in a fixed
 * pattern per interpolation mode; the emit ordering below is
 * significant and must not be reordered.
 *
 * \param func  output x86 code buffer being appended to
 * \param decl  parsed TGSI declaration token
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      /* One pass per declared attribute, one pass per enabled channel. */
      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  /* Flat shading: attribute = a0, no gradient terms. */
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  /* attr = a0 + x*dadx + y*dady
                   * (x and y are fetched from temp 0, presumably the
                   * machine's position temp -- see emit_tempf usage.)
                   */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  /* attr = (a0 + x*dadx + y*dady) / w */
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
2165
2166 static void aos_to_soa( struct x86_function *func,
2167 uint arg_aos,
2168 uint arg_soa,
2169 uint arg_num,
2170 uint arg_stride )
2171 {
2172 struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2173 struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2174 struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2175 struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2176 int inner_loop;
2177
2178
2179 /* Save EBX */
2180 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2181
2182 x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
2183 x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
2184 x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2185 x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );
2186
2187 /* do */
2188 inner_loop = x86_get_label( func );
2189 {
2190 x86_push( func, aos_input );
2191 sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2192 sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2193 x86_add( func, aos_input, stride );
2194 sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2195 sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2196 x86_add( func, aos_input, stride );
2197 sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2198 sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2199 x86_add( func, aos_input, stride );
2200 sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2201 sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2202 x86_pop( func, aos_input );
2203
2204 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2205 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2206 sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2207 sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2208 sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2209 sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2210
2211 sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2212 sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2213 sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2214 sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2215
2216 /* Advance to next input */
2217 x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2218 x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2219 }
2220 /* while --num_inputs */
2221 x86_dec( func, num_inputs );
2222 x86_jcc( func, cc_NE, inner_loop );
2223
2224 /* Restore EBX */
2225 x86_pop( func, aos_input );
2226 }
2227
2228 static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
2229 {
2230 struct x86_reg soa_output;
2231 struct x86_reg aos_output;
2232 struct x86_reg num_outputs;
2233 struct x86_reg temp;
2234 int inner_loop;
2235
2236 soa_output = x86_make_reg( file_REG32, reg_AX );
2237 aos_output = x86_make_reg( file_REG32, reg_BX );
2238 num_outputs = x86_make_reg( file_REG32, reg_CX );
2239 temp = x86_make_reg( file_REG32, reg_DX );
2240
2241 /* Save EBX */
2242 x86_push( func, aos_output );
2243
2244 x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
2245 x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
2246 x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
2247
2248 /* do */
2249 inner_loop = x86_get_label( func );
2250 {
2251 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2252 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2253 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2254 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2255
2256 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2257 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2258 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2259 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2260 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2261 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2262
2263 x86_mov( func, temp, x86_fn_arg( func, stride ) );
2264 x86_push( func, aos_output );
2265 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2266 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2267 x86_add( func, aos_output, temp );
2268 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2269 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2270 x86_add( func, aos_output, temp );
2271 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2272 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2273 x86_add( func, aos_output, temp );
2274 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2275 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2276 x86_pop( func, aos_output );
2277
2278 /* Advance to next output */
2279 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2280 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2281 }
2282 /* while --num_outputs */
2283 x86_dec( func, num_outputs );
2284 x86_jcc( func, cc_NE, inner_loop );
2285
2286 /* Restore EBX */
2287 x86_pop( func, aos_output );
2288 }
2289
2290 /**
2291 * Translate a TGSI vertex/fragment shader to SSE2 code.
2292 * Slightly different things are done for vertex vs. fragment shaders.
2293 *
2294 * Note that fragment shaders are responsible for interpolating shader
2295 * inputs. Because on x86 we have only 4 GP registers, and here we
2296 * have 5 shader arguments (input, output, const, temp and coef), the
2297 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
2298 * GP register holding the output argument is aliased with the coeff
2299 * argument, as outputs are not needed in the DECLARATION phase.
2300 *
2301 * \param tokens the TGSI input shader
2302 * \param func the output SSE code/function
2303 * \param immediates buffer to place immediates, later passed to SSE func
 * \return 1 for success, 0 if translation failed
2305 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   boolean instruction_phase = FALSE;  /* FS only: set once coef base is
                                        * overwritten with the output base */
   unsigned ok = 1;
   uint num_immediates = 0;

   util_init_math();

   /* Start emitting at the beginning of the code store. */
   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push(
      func,
      get_immediate_base() );

   x86_push(
      func,
      get_temp_base() );


   /*
    * Different function args for vertex/fragment shaders:
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      /* DECLARATION phase, do not load output argument.
       * FS args: 1=inputs, 2=outputs, 3=consts, 4=temps, 5=coefs,
       * 6=immediates.  Output (arg 2) shares a GP register with the
       * coef base and is loaded later, in the INSTRUCTION phase.
       */
      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      /* skipping outputs argument here */
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_coef_base(),
         x86_fn_arg( func, 5 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 6 ) );
   }
   else {
      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);

      /* VS args: 1=inputs, 2=outputs, 3=consts, 4=temps,
       * 5=immediates, 6=aos_input, 7=num_inputs, 8=input_stride
       * (and 9-11 for the outgoing soa_to_aos below).
       */
      if (do_swizzles)
         aos_to_soa( func,
                     6,   /* aos_input */
                     1,   /* machine->input */
                     7,   /* num_inputs */
                     8 ); /* input_stride */

      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      x86_mov(
         func,
         get_output_base(),
         x86_fn_arg( func, 2 ) );
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 5 ) );
   }

   /* Walk the token stream; stop early if an opcode can't be translated. */
   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         /* Only FS declarations emit code (input interpolation);
          * VS inputs need no setup here.
          */
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            if( !instruction_phase ) {
               /* INSTRUCTION phase, overwrite coeff with output.
                * Coefs are only read during declarations, so their
                * register can now hold the output pointer (arg 2).
                */
               instruction_phase = TRUE;
               x86_mov(
                  func,
                  get_output_base(),
                  x86_fn_arg( func, 2 ) );
            }
         }

         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
                         parse.FullToken.FullInstruction.Instruction.Opcode,
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            /* NOTE(review): Immediate.Size appears to include the
             * immediate header token, hence the -1 -- confirm against
             * tgsi token docs.
             */
            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
                  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                         num_immediates,
                         immediates[num_immediates][0],
                         immediates[num_immediates][1],
                         immediates[num_immediates][2],
                         immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      default:
         ok = 0;
         assert( 0 );
      }
   }

   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      /* Transpose VS results back to AoS; args 9=aos_output,
       * 2=machine->output, 10=num_outputs, 11=output_stride.
       */
      if (do_swizzles)
         soa_to_aos( func, 9, 2, 10, 11 );
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop(
      func,
      get_temp_base() );

   x86_pop(
      func,
      get_immediate_base() );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}
2480
2481 #endif /* PIPE_ARCH_X86 */
2482