/**************************************************************************
 *
 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
28 #include "pipe/p_util.h"
29 #include "pipe/p_shader_tokens.h"
30 #include "tgsi/tgsi_parse.h"
31 #include "tgsi/tgsi_util.h"
32 #include "tgsi_exec.h"
33 #include "tgsi_sse2.h"
35 #include "rtasm/rtasm_x86sse.h"
41 * This costs about 100fps (close to 10%) in gears:
43 #define HIGH_PRECISION 1
/* Iterate CHAN over the four vector components (X, Y, Z, W). */
#define FOR_EACH_CHANNEL( CHAN )\
   for( CHAN = 0; CHAN < 4; CHAN++ )

/* Non-zero when CHAN is enabled in the write mask of INST's first
 * destination register.
 */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/* Execute the following statement only when dst0 writes CHAN. */
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if( IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Loop over exactly those channels enabled by dst0's write mask. */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
   IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
/* Scratch temporary used for exchanging values between the generated
 * SSE code and C helper callbacks (see the emit_func_call helpers).
 */
#define TEMP_R0   TGSI_EXEC_TEMP_R0
67 * X86 utility functions.
76 (enum x86_reg_name
) xmm
);
80 * X86 register mapping helpers.
84 get_const_base( void )
92 get_input_base( void )
100 get_output_base( void )
107 static struct x86_reg
108 get_temp_base( void )
115 static struct x86_reg
116 get_coef_base( void )
118 return get_output_base();
121 static struct x86_reg
122 get_immediate_base( void )
131 * Data access helpers.
135 static struct x86_reg
140 return x86_make_disp(
141 get_immediate_base(),
142 (vec
* 4 + chan
) * 4 );
145 static struct x86_reg
150 return x86_make_disp(
152 (vec
* 4 + chan
) * 4 );
155 static struct x86_reg
160 return x86_make_disp(
162 (vec
* 4 + chan
) * 16 );
165 static struct x86_reg
170 return x86_make_disp(
172 (vec
* 4 + chan
) * 16 );
175 static struct x86_reg
180 return x86_make_disp(
182 (vec
* 4 + chan
) * 16 );
185 static struct x86_reg
191 return x86_make_disp(
193 ((vec
* 3 + member
) * 4 + chan
) * 4 );
199 struct x86_function
*func
)
206 * Data fetch helpers.
210 * Copy a shader constant to xmm register
211 * \param xmm the destination xmm register
212 * \param vec the src const buffer index
213 * \param chan src channel to fetch (X, Y, Z or W)
217 struct x86_function
*func
,
225 get_const( vec
, chan
) );
230 SHUF( 0, 0, 0, 0 ) );
235 struct x86_function
*func
,
243 get_immediate( vec
, chan
) );
248 SHUF( 0, 0, 0, 0 ) );
253 * Copy a shader input to xmm register
254 * \param xmm the destination xmm register
255 * \param vec the src input attrib
256 * \param chan src channel to fetch (X, Y, Z or W)
260 struct x86_function
*func
,
268 get_input( vec
, chan
) );
272 * Store an xmm register to a shader output
273 * \param xmm the source xmm register
274 * \param vec the dest output attrib
275 * \param chan src dest channel to store (X, Y, Z or W)
279 struct x86_function
*func
,
286 get_output( vec
, chan
),
291 * Copy a shader temporary to xmm register
292 * \param xmm the destination xmm register
293 * \param vec the src temp register
294 * \param chan src channel to fetch (X, Y, Z or W)
298 struct x86_function
*func
,
306 get_temp( vec
, chan
) );
310 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
311 * \param xmm the destination xmm register
312 * \param vec the src input/attribute coefficient index
313 * \param chan src channel to fetch (X, Y, Z or W)
314 * \param member 0=a0, 1=dadx, 2=dady
318 struct x86_function
*func
,
327 get_coef( vec
, chan
, member
) );
332 SHUF( 0, 0, 0, 0 ) );
336 * Data store helpers.
341 struct x86_function
*func
,
348 get_input( vec
, chan
),
354 struct x86_function
*func
,
361 get_temp( vec
, chan
),
367 struct x86_function
*func
,
375 vec
+ TGSI_EXEC_NUM_TEMPS
,
380 * Coefficent fetch helpers.
385 struct x86_function
*func
,
400 struct x86_function
*func
,
415 struct x86_function
*func
,
429 * Function call helpers.
434 struct x86_function
*func
)
438 x86_make_reg( file_REG32
, reg_AX
) );
441 x86_make_reg( file_REG32
, reg_CX
) );
444 x86_make_reg( file_REG32
, reg_DX
) );
449 struct x86_function
*func
)
451 /* Restore GP registers in a reverse order.
455 x86_make_reg( file_REG32
, reg_DX
) );
458 x86_make_reg( file_REG32
, reg_CX
) );
461 x86_make_reg( file_REG32
, reg_AX
) );
466 struct x86_function
*func
,
468 void (PIPE_CDECL
*code
)() )
472 get_temp( TEMP_R0
, 0 ),
473 make_xmm( xmm_dst
) );
479 struct x86_reg ecx
= x86_make_reg( file_REG32
, reg_CX
);
484 get_temp( TEMP_R0
, 0 ) );
486 x86_push( func
, ecx
);
487 x86_mov_reg_imm( func
, ecx
, (unsigned long) code
);
488 x86_call( func
, ecx
);
499 get_temp( TEMP_R0
, 0 ) );
503 emit_func_call_dst_src(
504 struct x86_function
*func
,
507 void (PIPE_CDECL
*code
)() )
511 get_temp( TEMP_R0
, 1 ),
512 make_xmm( xmm_src
) );
521 * Low-level instruction translators.
526 struct x86_function
*func
,
533 TGSI_EXEC_TEMP_7FFFFFFF_I
,
534 TGSI_EXEC_TEMP_7FFFFFFF_C
) );
539 struct x86_function
*func
,
546 make_xmm( xmm_src
) );
549 static void PIPE_CDECL
553 const unsigned X
= 0;
555 store
[X
+ 0] = cosf( store
[X
+ 0] );
556 store
[X
+ 1] = cosf( store
[X
+ 1] );
557 store
[X
+ 2] = cosf( store
[X
+ 2] );
558 store
[X
+ 3] = cosf( store
[X
+ 3] );
563 struct x86_function
*func
,
572 static void PIPE_CDECL
576 const unsigned X
= 0;
578 store
[X
+ 0] = powf( 2.0f
, store
[X
+ 0] );
579 store
[X
+ 1] = powf( 2.0f
, store
[X
+ 1] );
580 store
[X
+ 2] = powf( 2.0f
, store
[X
+ 2] );
581 store
[X
+ 3] = powf( 2.0f
, store
[X
+ 3] );
586 struct x86_function
*func
,
597 struct x86_function
*func
,
606 static void PIPE_CDECL
610 const unsigned X
= 0;
612 store
[X
+ 0] = floorf( store
[X
+ 0] );
613 store
[X
+ 1] = floorf( store
[X
+ 1] );
614 store
[X
+ 2] = floorf( store
[X
+ 2] );
615 store
[X
+ 3] = floorf( store
[X
+ 3] );
620 struct x86_function
*func
,
629 static void PIPE_CDECL
633 const unsigned X
= 0;
635 store
[X
+ 0] -= floorf( store
[X
+ 0] );
636 store
[X
+ 1] -= floorf( store
[X
+ 1] );
637 store
[X
+ 2] -= floorf( store
[X
+ 2] );
638 store
[X
+ 3] -= floorf( store
[X
+ 3] );
643 struct x86_function
*func
,
652 static void PIPE_CDECL
656 const unsigned X
= 0;
658 store
[X
+ 0] = LOG2( store
[X
+ 0] );
659 store
[X
+ 1] = LOG2( store
[X
+ 1] );
660 store
[X
+ 2] = LOG2( store
[X
+ 2] );
661 store
[X
+ 3] = LOG2( store
[X
+ 3] );
666 struct x86_function
*func
,
677 struct x86_function
*func
,
684 make_xmm( xmm_src
) );
688 emit_mul (struct x86_function
*func
,
695 make_xmm( xmm_src
) );
700 struct x86_function
*func
,
707 TGSI_EXEC_TEMP_80000000_I
,
708 TGSI_EXEC_TEMP_80000000_C
) );
711 static void PIPE_CDECL
715 const unsigned X
= 0;
717 store
[X
+ 0] = powf( store
[X
+ 0], store
[X
+ 4] );
718 store
[X
+ 1] = powf( store
[X
+ 1], store
[X
+ 5] );
719 store
[X
+ 2] = powf( store
[X
+ 2], store
[X
+ 6] );
720 store
[X
+ 3] = powf( store
[X
+ 3], store
[X
+ 7] );
725 struct x86_function
*func
,
729 emit_func_call_dst_src(
738 struct x86_function
*func
,
742 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
743 * good enough. Need to either emit a proper divide or use the
744 * iterative technique described below in emit_rsqrt().
749 make_xmm( xmm_src
) );
754 struct x86_function
*func
,
759 /* Although rsqrtps() and rcpps() are low precision on some/all SSE
760 * implementations, it is possible to improve its precision at
761 * fairly low cost, using a newton/raphson step, as below:
763 * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
764 * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
766 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
769 struct x86_reg dst
= make_xmm( xmm_dst
);
770 struct x86_reg src
= make_xmm( xmm_src
);
771 struct x86_reg tmp0
= make_xmm( 2 );
772 struct x86_reg tmp1
= make_xmm( 3 );
774 assert( xmm_dst
!= xmm_src
);
775 assert( xmm_dst
!= 2 && xmm_dst
!= 3 );
776 assert( xmm_src
!= 2 && xmm_src
!= 3 );
778 sse_movaps( func
, dst
, get_temp( TGSI_EXEC_TEMP_HALF_I
, TGSI_EXEC_TEMP_HALF_C
) );
779 sse_movaps( func
, tmp0
, get_temp( TGSI_EXEC_TEMP_THREE_I
, TGSI_EXEC_TEMP_THREE_C
) );
780 sse_rsqrtps( func
, tmp1
, src
);
781 sse_mulps( func
, src
, tmp1
);
782 sse_mulps( func
, dst
, tmp1
);
783 sse_mulps( func
, src
, tmp1
);
784 sse_subps( func
, tmp0
, src
);
785 sse_mulps( func
, dst
, tmp0
);
788 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
794 make_xmm( xmm_src
) );
800 struct x86_function
*func
,
807 TGSI_EXEC_TEMP_80000000_I
,
808 TGSI_EXEC_TEMP_80000000_C
) );
811 static void PIPE_CDECL
815 const unsigned X
= 0;
817 store
[X
+ 0] = sinf( store
[X
+ 0] );
818 store
[X
+ 1] = sinf( store
[X
+ 1] );
819 store
[X
+ 2] = sinf( store
[X
+ 2] );
820 store
[X
+ 3] = sinf( store
[X
+ 3] );
824 emit_sin (struct x86_function
*func
,
835 struct x86_function
*func
,
842 make_xmm( xmm_src
) );
851 struct x86_function
*func
,
853 const struct tgsi_full_src_register
*reg
,
854 const unsigned chan_index
)
856 unsigned swizzle
= tgsi_util_get_full_src_register_extswizzle( reg
, chan_index
);
859 case TGSI_EXTSWIZZLE_X
:
860 case TGSI_EXTSWIZZLE_Y
:
861 case TGSI_EXTSWIZZLE_Z
:
862 case TGSI_EXTSWIZZLE_W
:
863 switch( reg
->SrcRegister
.File
) {
864 case TGSI_FILE_CONSTANT
:
868 reg
->SrcRegister
.Index
,
872 case TGSI_FILE_IMMEDIATE
:
876 reg
->SrcRegister
.Index
,
880 case TGSI_FILE_INPUT
:
884 reg
->SrcRegister
.Index
,
888 case TGSI_FILE_TEMPORARY
:
892 reg
->SrcRegister
.Index
,
901 case TGSI_EXTSWIZZLE_ZERO
:
905 TGSI_EXEC_TEMP_00000000_I
,
906 TGSI_EXEC_TEMP_00000000_C
);
909 case TGSI_EXTSWIZZLE_ONE
:
913 TGSI_EXEC_TEMP_ONE_I
,
914 TGSI_EXEC_TEMP_ONE_C
);
921 switch( tgsi_util_get_full_src_register_sign_mode( reg
, chan_index
) ) {
922 case TGSI_UTIL_SIGN_CLEAR
:
923 emit_abs( func
, xmm
);
926 case TGSI_UTIL_SIGN_SET
:
927 emit_setsign( func
, xmm
);
930 case TGSI_UTIL_SIGN_TOGGLE
:
931 emit_neg( func
, xmm
);
934 case TGSI_UTIL_SIGN_KEEP
:
/* Fetch channel CHAN of source register INDEX of INST into xmm XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
948 struct x86_function
*func
,
950 const struct tgsi_full_dst_register
*reg
,
951 const struct tgsi_full_instruction
*inst
,
952 unsigned chan_index
)
954 switch( reg
->DstRegister
.File
) {
955 case TGSI_FILE_OUTPUT
:
959 reg
->DstRegister
.Index
,
963 case TGSI_FILE_TEMPORARY
:
967 reg
->DstRegister
.Index
,
971 case TGSI_FILE_ADDRESS
:
975 reg
->DstRegister
.Index
,
983 switch( inst
->Instruction
.Saturate
) {
987 case TGSI_SAT_ZERO_ONE
:
991 case TGSI_SAT_MINUS_PLUS_ONE
:
/* Store xmm XMM into channel CHAN of destination register INDEX of INST
 * (INST is also passed so saturation modes can be applied).
 */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1001 * High-level instruction translators.
1006 struct x86_function
*func
,
1007 const struct tgsi_full_src_register
*reg
)
1009 unsigned uniquemask
;
1010 unsigned registers
[4];
1011 unsigned nextregister
= 0;
1012 unsigned firstchan
= ~0;
1013 unsigned chan_index
;
1015 /* This mask stores component bits that were already tested. Note that
1016 * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1018 uniquemask
= (1 << TGSI_EXTSWIZZLE_ZERO
) | (1 << TGSI_EXTSWIZZLE_ONE
);
1020 FOR_EACH_CHANNEL( chan_index
) {
1023 /* unswizzle channel */
1024 swizzle
= tgsi_util_get_full_src_register_extswizzle(
1028 /* check if the component has not been already tested */
1029 if( !(uniquemask
& (1 << swizzle
)) ) {
1030 uniquemask
|= 1 << swizzle
;
1032 /* allocate register */
1033 registers
[chan_index
] = nextregister
;
1041 /* mark the first channel used */
1042 if( firstchan
== ~0 ) {
1043 firstchan
= chan_index
;
1050 x86_make_reg( file_REG32
, reg_AX
) );
1053 x86_make_reg( file_REG32
, reg_DX
) );
1055 FOR_EACH_CHANNEL( chan_index
) {
1056 if( uniquemask
& (1 << chan_index
) ) {
1059 make_xmm( registers
[chan_index
] ),
1061 TGSI_EXEC_TEMP_00000000_I
,
1062 TGSI_EXEC_TEMP_00000000_C
),
1065 if( chan_index
== firstchan
) {
1068 x86_make_reg( file_REG32
, reg_AX
),
1069 make_xmm( registers
[chan_index
] ) );
1074 x86_make_reg( file_REG32
, reg_DX
),
1075 make_xmm( registers
[chan_index
] ) );
1078 x86_make_reg( file_REG32
, reg_AX
),
1079 x86_make_reg( file_REG32
, reg_DX
) );
1087 TGSI_EXEC_TEMP_KILMASK_I
,
1088 TGSI_EXEC_TEMP_KILMASK_C
),
1089 x86_make_reg( file_REG32
, reg_AX
) );
1093 x86_make_reg( file_REG32
, reg_DX
) );
1096 x86_make_reg( file_REG32
, reg_AX
) );
1102 struct x86_function
*func
)
1104 /* XXX todo / fix me */
1110 struct x86_function
*func
,
1111 struct tgsi_full_instruction
*inst
,
1114 unsigned chan_index
;
1116 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1117 FETCH( func
, *inst
, 0, 0, chan_index
);
1118 FETCH( func
, *inst
, 1, 1, chan_index
);
1128 TGSI_EXEC_TEMP_ONE_I
,
1129 TGSI_EXEC_TEMP_ONE_C
) );
1130 STORE( func
, *inst
, 0, 0, chan_index
);
1136 struct x86_function
*func
,
1137 struct tgsi_full_instruction
*inst
)
1139 unsigned chan_index
;
1141 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1142 FETCH( func
, *inst
, 0, 0, chan_index
);
1143 FETCH( func
, *inst
, 1, 1, chan_index
);
1144 FETCH( func
, *inst
, 2, 2, chan_index
);
1149 TGSI_EXEC_TEMP_00000000_I
,
1150 TGSI_EXEC_TEMP_00000000_C
),
1164 STORE( func
, *inst
, 0, 0, chan_index
);
1170 struct x86_function
*func
,
1171 struct tgsi_full_instruction
*inst
)
1173 unsigned chan_index
;
1175 switch( inst
->Instruction
.Opcode
) {
1176 case TGSI_OPCODE_ARL
:
1178 /* XXX this isn't working properly (see glean vertProg1 test) */
1179 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1180 FETCH( func
, *inst
, 0, 0, chan_index
);
1181 emit_f2it( func
, 0 );
1182 STORE( func
, *inst
, 0, 0, chan_index
);
1189 case TGSI_OPCODE_MOV
:
1190 case TGSI_OPCODE_SWZ
:
1191 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1192 FETCH( func
, *inst
, 0, 0, chan_index
);
1193 STORE( func
, *inst
, 0, 0, chan_index
);
1197 case TGSI_OPCODE_LIT
:
1198 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1199 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1203 TGSI_EXEC_TEMP_ONE_I
,
1204 TGSI_EXEC_TEMP_ONE_C
);
1205 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ) {
1206 STORE( func
, *inst
, 0, 0, CHAN_X
);
1208 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) ) {
1209 STORE( func
, *inst
, 0, 0, CHAN_W
);
1212 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1213 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1214 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1215 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1220 TGSI_EXEC_TEMP_00000000_I
,
1221 TGSI_EXEC_TEMP_00000000_C
) );
1222 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1224 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1225 /* XMM[1] = SrcReg[0].yyyy */
1226 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1227 /* XMM[1] = max(XMM[1], 0) */
1232 TGSI_EXEC_TEMP_00000000_I
,
1233 TGSI_EXEC_TEMP_00000000_C
) );
1234 /* XMM[2] = SrcReg[0].wwww */
1235 FETCH( func
, *inst
, 2, 0, CHAN_W
);
1236 /* XMM[2] = min(XMM[2], 128.0) */
1241 TGSI_EXEC_TEMP_128_I
,
1242 TGSI_EXEC_TEMP_128_C
) );
1243 /* XMM[2] = max(XMM[2], -128.0) */
1248 TGSI_EXEC_TEMP_MINUS_128_I
,
1249 TGSI_EXEC_TEMP_MINUS_128_C
) );
1250 emit_pow( func
, 1, 2 );
1251 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1265 STORE( func
, *inst
, 2, 0, CHAN_Z
);
1270 case TGSI_OPCODE_RCP
:
1271 /* TGSI_OPCODE_RECIP */
1272 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1273 emit_rcp( func
, 0, 0 );
1274 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1275 STORE( func
, *inst
, 0, 0, chan_index
);
1279 case TGSI_OPCODE_RSQ
:
1280 /* TGSI_OPCODE_RECIPSQRT */
1281 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1282 emit_rsqrt( func
, 1, 0 );
1283 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1284 STORE( func
, *inst
, 1, 0, chan_index
);
1288 case TGSI_OPCODE_EXP
:
1292 case TGSI_OPCODE_LOG
:
1296 case TGSI_OPCODE_MUL
:
1297 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1298 FETCH( func
, *inst
, 0, 0, chan_index
);
1299 FETCH( func
, *inst
, 1, 1, chan_index
);
1300 emit_mul( func
, 0, 1 );
1301 STORE( func
, *inst
, 0, 0, chan_index
);
1305 case TGSI_OPCODE_ADD
:
1306 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1307 FETCH( func
, *inst
, 0, 0, chan_index
);
1308 FETCH( func
, *inst
, 1, 1, chan_index
);
1309 emit_add( func
, 0, 1 );
1310 STORE( func
, *inst
, 0, 0, chan_index
);
1314 case TGSI_OPCODE_DP3
:
1315 /* TGSI_OPCODE_DOT3 */
1316 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1317 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1318 emit_mul( func
, 0, 1 );
1319 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1320 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1321 emit_mul( func
, 1, 2 );
1322 emit_add( func
, 0, 1 );
1323 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1324 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1325 emit_mul( func
, 1, 2 );
1326 emit_add( func
, 0, 1 );
1327 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1328 STORE( func
, *inst
, 0, 0, chan_index
);
1332 case TGSI_OPCODE_DP4
:
1333 /* TGSI_OPCODE_DOT4 */
1334 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1335 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1336 emit_mul( func
, 0, 1 );
1337 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1338 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1339 emit_mul( func
, 1, 2 );
1340 emit_add( func
, 0, 1 );
1341 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1342 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1343 emit_mul(func
, 1, 2 );
1344 emit_add(func
, 0, 1 );
1345 FETCH( func
, *inst
, 1, 0, CHAN_W
);
1346 FETCH( func
, *inst
, 2, 1, CHAN_W
);
1347 emit_mul( func
, 1, 2 );
1348 emit_add( func
, 0, 1 );
1349 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1350 STORE( func
, *inst
, 0, 0, chan_index
);
1354 case TGSI_OPCODE_DST
:
1355 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1359 TGSI_EXEC_TEMP_ONE_I
,
1360 TGSI_EXEC_TEMP_ONE_C
);
1361 STORE( func
, *inst
, 0, 0, CHAN_X
);
1363 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1364 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1365 FETCH( func
, *inst
, 1, 1, CHAN_Y
);
1366 emit_mul( func
, 0, 1 );
1367 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1369 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1370 FETCH( func
, *inst
, 0, 0, CHAN_Z
);
1371 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1373 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1374 FETCH( func
, *inst
, 0, 1, CHAN_W
);
1375 STORE( func
, *inst
, 0, 0, CHAN_W
);
1379 case TGSI_OPCODE_MIN
:
1380 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1381 FETCH( func
, *inst
, 0, 0, chan_index
);
1382 FETCH( func
, *inst
, 1, 1, chan_index
);
1387 STORE( func
, *inst
, 0, 0, chan_index
);
1391 case TGSI_OPCODE_MAX
:
1392 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1393 FETCH( func
, *inst
, 0, 0, chan_index
);
1394 FETCH( func
, *inst
, 1, 1, chan_index
);
1399 STORE( func
, *inst
, 0, 0, chan_index
);
1403 case TGSI_OPCODE_SLT
:
1404 /* TGSI_OPCODE_SETLT */
1405 emit_setcc( func
, inst
, cc_LessThan
);
1408 case TGSI_OPCODE_SGE
:
1409 /* TGSI_OPCODE_SETGE */
1410 emit_setcc( func
, inst
, cc_NotLessThan
);
1413 case TGSI_OPCODE_MAD
:
1414 /* TGSI_OPCODE_MADD */
1415 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1416 FETCH( func
, *inst
, 0, 0, chan_index
);
1417 FETCH( func
, *inst
, 1, 1, chan_index
);
1418 FETCH( func
, *inst
, 2, 2, chan_index
);
1419 emit_mul( func
, 0, 1 );
1420 emit_add( func
, 0, 2 );
1421 STORE( func
, *inst
, 0, 0, chan_index
);
1425 case TGSI_OPCODE_SUB
:
1426 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1427 FETCH( func
, *inst
, 0, 0, chan_index
);
1428 FETCH( func
, *inst
, 1, 1, chan_index
);
1429 emit_sub( func
, 0, 1 );
1430 STORE( func
, *inst
, 0, 0, chan_index
);
1434 case TGSI_OPCODE_LERP
:
1435 /* TGSI_OPCODE_LRP */
1436 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1437 FETCH( func
, *inst
, 0, 0, chan_index
);
1438 FETCH( func
, *inst
, 1, 1, chan_index
);
1439 FETCH( func
, *inst
, 2, 2, chan_index
);
1440 emit_sub( func
, 1, 2 );
1441 emit_mul( func
, 0, 1 );
1442 emit_add( func
, 0, 2 );
1443 STORE( func
, *inst
, 0, 0, chan_index
);
1447 case TGSI_OPCODE_CND
:
1451 case TGSI_OPCODE_CND0
:
1455 case TGSI_OPCODE_DOT2ADD
:
1456 /* TGSI_OPCODE_DP2A */
1460 case TGSI_OPCODE_INDEX
:
1464 case TGSI_OPCODE_NEGATE
:
1468 case TGSI_OPCODE_FRAC
:
1469 /* TGSI_OPCODE_FRC */
1470 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1471 FETCH( func
, *inst
, 0, 0, chan_index
);
1472 emit_frc( func
, 0 );
1473 STORE( func
, *inst
, 0, 0, chan_index
);
1477 case TGSI_OPCODE_CLAMP
:
1481 case TGSI_OPCODE_FLOOR
:
1482 /* TGSI_OPCODE_FLR */
1483 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1484 FETCH( func
, *inst
, 0, 0, chan_index
);
1485 emit_flr( func
, 0 );
1486 STORE( func
, *inst
, 0, 0, chan_index
);
1490 case TGSI_OPCODE_ROUND
:
1494 case TGSI_OPCODE_EXPBASE2
:
1495 /* TGSI_OPCODE_EX2 */
1496 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1497 emit_ex2( func
, 0 );
1498 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1499 STORE( func
, *inst
, 0, 0, chan_index
);
1503 case TGSI_OPCODE_LOGBASE2
:
1504 /* TGSI_OPCODE_LG2 */
1505 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1506 emit_lg2( func
, 0 );
1507 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1508 STORE( func
, *inst
, 0, 0, chan_index
);
1512 case TGSI_OPCODE_POWER
:
1513 /* TGSI_OPCODE_POW */
1514 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1515 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1516 emit_pow( func
, 0, 1 );
1517 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1518 STORE( func
, *inst
, 0, 0, chan_index
);
1522 case TGSI_OPCODE_CROSSPRODUCT
:
1523 /* TGSI_OPCODE_XPD */
1524 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1525 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ) {
1526 FETCH( func
, *inst
, 1, 1, CHAN_Z
);
1527 FETCH( func
, *inst
, 3, 0, CHAN_Z
);
1529 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) ||
1530 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1531 FETCH( func
, *inst
, 0, 0, CHAN_Y
);
1532 FETCH( func
, *inst
, 4, 1, CHAN_Y
);
1534 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1535 emit_MOV( func
, 2, 0 );
1536 emit_mul( func
, 2, 1 );
1537 emit_MOV( func
, 5, 3 );
1538 emit_mul( func
, 5, 4 );
1539 emit_sub( func
, 2, 5 );
1540 STORE( func
, *inst
, 2, 0, CHAN_X
);
1542 if( IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) ||
1543 IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) ) {
1544 FETCH( func
, *inst
, 2, 1, CHAN_X
);
1545 FETCH( func
, *inst
, 5, 0, CHAN_X
);
1547 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1548 emit_mul( func
, 3, 2 );
1549 emit_mul( func
, 1, 5 );
1550 emit_sub( func
, 3, 1 );
1551 STORE( func
, *inst
, 3, 0, CHAN_Y
);
1553 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1554 emit_mul( func
, 5, 4 );
1555 emit_mul( func
, 0, 2 );
1556 emit_sub( func
, 5, 0 );
1557 STORE( func
, *inst
, 5, 0, CHAN_Z
);
1559 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1563 TGSI_EXEC_TEMP_ONE_I
,
1564 TGSI_EXEC_TEMP_ONE_C
);
1565 STORE( func
, *inst
, 0, 0, CHAN_W
);
1569 case TGSI_OPCODE_MULTIPLYMATRIX
:
1573 case TGSI_OPCODE_ABS
:
1574 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1575 FETCH( func
, *inst
, 0, 0, chan_index
);
1576 emit_abs( func
, 0) ;
1578 STORE( func
, *inst
, 0, 0, chan_index
);
1582 case TGSI_OPCODE_RCC
:
1586 case TGSI_OPCODE_DPH
:
1587 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1588 FETCH( func
, *inst
, 1, 1, CHAN_X
);
1589 emit_mul( func
, 0, 1 );
1590 FETCH( func
, *inst
, 1, 0, CHAN_Y
);
1591 FETCH( func
, *inst
, 2, 1, CHAN_Y
);
1592 emit_mul( func
, 1, 2 );
1593 emit_add( func
, 0, 1 );
1594 FETCH( func
, *inst
, 1, 0, CHAN_Z
);
1595 FETCH( func
, *inst
, 2, 1, CHAN_Z
);
1596 emit_mul( func
, 1, 2 );
1597 emit_add( func
, 0, 1 );
1598 FETCH( func
, *inst
, 1, 1, CHAN_W
);
1599 emit_add( func
, 0, 1 );
1600 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1601 STORE( func
, *inst
, 0, 0, chan_index
);
1605 case TGSI_OPCODE_COS
:
1606 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1607 emit_cos( func
, 0 );
1608 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1609 STORE( func
, *inst
, 0, 0, chan_index
);
1613 case TGSI_OPCODE_DDX
:
1617 case TGSI_OPCODE_DDY
:
1621 case TGSI_OPCODE_KILP
:
1622 /* predicated kill */
1624 return 0; /* XXX fix me */
1627 case TGSI_OPCODE_KIL
:
1628 /* conditional kill */
1629 emit_kil( func
, &inst
->FullSrcRegisters
[0] );
1632 case TGSI_OPCODE_PK2H
:
1636 case TGSI_OPCODE_PK2US
:
1640 case TGSI_OPCODE_PK4B
:
1644 case TGSI_OPCODE_PK4UB
:
1648 case TGSI_OPCODE_RFL
:
1652 case TGSI_OPCODE_SEQ
:
1656 case TGSI_OPCODE_SFL
:
1660 case TGSI_OPCODE_SGT
:
1664 case TGSI_OPCODE_SIN
:
1665 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1666 emit_sin( func
, 0 );
1667 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1668 STORE( func
, *inst
, 0, 0, chan_index
);
1672 case TGSI_OPCODE_SLE
:
1676 case TGSI_OPCODE_SNE
:
1680 case TGSI_OPCODE_STR
:
1684 case TGSI_OPCODE_TEX
:
1686 /* Disable dummy texture code:
1691 TGSI_EXEC_TEMP_ONE_I
,
1692 TGSI_EXEC_TEMP_ONE_C
);
1693 FOR_EACH_DST0_ENABLED_CHANNEL( *inst
, chan_index
) {
1694 STORE( func
, *inst
, 0, 0, chan_index
);
1702 case TGSI_OPCODE_TXD
:
1706 case TGSI_OPCODE_UP2H
:
1710 case TGSI_OPCODE_UP2US
:
1714 case TGSI_OPCODE_UP4B
:
1718 case TGSI_OPCODE_UP4UB
:
1722 case TGSI_OPCODE_X2D
:
1726 case TGSI_OPCODE_ARA
:
1730 case TGSI_OPCODE_ARR
:
1734 case TGSI_OPCODE_BRA
:
1738 case TGSI_OPCODE_CAL
:
1742 case TGSI_OPCODE_RET
:
1746 case TGSI_OPCODE_END
:
1749 case TGSI_OPCODE_SSG
:
1753 case TGSI_OPCODE_CMP
:
1754 emit_cmp (func
, inst
);
1757 case TGSI_OPCODE_SCS
:
1758 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_X
) {
1759 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1760 emit_cos( func
, 0 );
1761 STORE( func
, *inst
, 0, 0, CHAN_X
);
1763 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Y
) {
1764 FETCH( func
, *inst
, 0, 0, CHAN_X
);
1765 emit_sin( func
, 0 );
1766 STORE( func
, *inst
, 0, 0, CHAN_Y
);
1768 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_Z
) {
1772 TGSI_EXEC_TEMP_00000000_I
,
1773 TGSI_EXEC_TEMP_00000000_C
);
1774 STORE( func
, *inst
, 0, 0, CHAN_Z
);
1776 IF_IS_DST0_CHANNEL_ENABLED( *inst
, CHAN_W
) {
1780 TGSI_EXEC_TEMP_ONE_I
,
1781 TGSI_EXEC_TEMP_ONE_C
);
1782 STORE( func
, *inst
, 0, 0, CHAN_W
);
1786 case TGSI_OPCODE_TXB
:
1790 case TGSI_OPCODE_NRM
:
1794 case TGSI_OPCODE_DIV
:
1798 case TGSI_OPCODE_DP2
:
1802 case TGSI_OPCODE_TXL
:
1806 case TGSI_OPCODE_BRK
:
1810 case TGSI_OPCODE_IF
:
1814 case TGSI_OPCODE_LOOP
:
1818 case TGSI_OPCODE_REP
:
1822 case TGSI_OPCODE_ELSE
:
1826 case TGSI_OPCODE_ENDIF
:
1830 case TGSI_OPCODE_ENDLOOP
:
1834 case TGSI_OPCODE_ENDREP
:
1838 case TGSI_OPCODE_PUSHA
:
1842 case TGSI_OPCODE_POPA
:
1846 case TGSI_OPCODE_CEIL
:
1850 case TGSI_OPCODE_I2F
:
1854 case TGSI_OPCODE_NOT
:
1858 case TGSI_OPCODE_TRUNC
:
1862 case TGSI_OPCODE_SHL
:
1866 case TGSI_OPCODE_SHR
:
1870 case TGSI_OPCODE_AND
:
1874 case TGSI_OPCODE_OR
:
1878 case TGSI_OPCODE_MOD
:
1882 case TGSI_OPCODE_XOR
:
1886 case TGSI_OPCODE_SAD
:
1890 case TGSI_OPCODE_TXF
:
1894 case TGSI_OPCODE_TXQ
:
1898 case TGSI_OPCODE_CONT
:
1902 case TGSI_OPCODE_EMIT
:
1906 case TGSI_OPCODE_ENDPRIM
:
1919 struct x86_function
*func
,
1920 struct tgsi_full_declaration
*decl
)
1922 if( decl
->Declaration
.File
== TGSI_FILE_INPUT
) {
1923 unsigned first
, last
, mask
;
1926 first
= decl
->DeclarationRange
.First
;
1927 last
= decl
->DeclarationRange
.Last
;
1928 mask
= decl
->Declaration
.UsageMask
;
1930 for( i
= first
; i
<= last
; i
++ ) {
1931 for( j
= 0; j
< NUM_CHANNELS
; j
++ ) {
1932 if( mask
& (1 << j
) ) {
1933 switch( decl
->Declaration
.Interpolate
) {
1934 case TGSI_INTERPOLATE_CONSTANT
:
1935 emit_coef_a0( func
, 0, i
, j
);
1936 emit_inputs( func
, 0, i
, j
);
1939 case TGSI_INTERPOLATE_LINEAR
:
1940 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
1941 emit_coef_dadx( func
, 1, i
, j
);
1942 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
1943 emit_coef_dady( func
, 3, i
, j
);
1944 emit_mul( func
, 0, 1 ); /* x * dadx */
1945 emit_coef_a0( func
, 4, i
, j
);
1946 emit_mul( func
, 2, 3 ); /* y * dady */
1947 emit_add( func
, 0, 4 ); /* x * dadx + a0 */
1948 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
1949 emit_inputs( func
, 0, i
, j
);
1952 case TGSI_INTERPOLATE_PERSPECTIVE
:
1953 emit_tempf( func
, 0, 0, TGSI_SWIZZLE_X
);
1954 emit_coef_dadx( func
, 1, i
, j
);
1955 emit_tempf( func
, 2, 0, TGSI_SWIZZLE_Y
);
1956 emit_coef_dady( func
, 3, i
, j
);
1957 emit_mul( func
, 0, 1 ); /* x * dadx */
1958 emit_tempf( func
, 4, 0, TGSI_SWIZZLE_W
);
1959 emit_coef_a0( func
, 5, i
, j
);
1960 emit_rcp( func
, 4, 4 ); /* 1.0 / w */
1961 emit_mul( func
, 2, 3 ); /* y * dady */
1962 emit_add( func
, 0, 5 ); /* x * dadx + a0 */
1963 emit_add( func
, 0, 2 ); /* x * dadx + y * dady + a0 */
1964 emit_mul( func
, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
1965 emit_inputs( func
, 0, i
, j
);
1978 static void aos_to_soa( struct x86_function
*func
,
1984 struct x86_reg soa_input
= x86_make_reg( file_REG32
, reg_AX
);
1985 struct x86_reg aos_input
= x86_make_reg( file_REG32
, reg_BX
);
1986 struct x86_reg num_inputs
= x86_make_reg( file_REG32
, reg_CX
);
1987 struct x86_reg stride
= x86_make_reg( file_REG32
, reg_DX
);
1992 x86_push( func
, x86_make_reg( file_REG32
, reg_BX
) );
1994 x86_mov( func
, aos_input
, x86_fn_arg( func
, arg_aos
) );
1995 x86_mov( func
, soa_input
, x86_fn_arg( func
, arg_soa
) );
1996 x86_mov( func
, num_inputs
, x86_fn_arg( func
, arg_num
) );
1997 x86_mov( func
, stride
, x86_fn_arg( func
, arg_stride
) );
2000 inner_loop
= x86_get_label( func
);
2002 x86_push( func
, aos_input
);
2003 sse_movlps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2004 sse_movlps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2005 x86_add( func
, aos_input
, stride
);
2006 sse_movhps( func
, make_xmm( 0 ), x86_make_disp( aos_input
, 0 ) );
2007 sse_movhps( func
, make_xmm( 3 ), x86_make_disp( aos_input
, 8 ) );
2008 x86_add( func
, aos_input
, stride
);
2009 sse_movlps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2010 sse_movlps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2011 x86_add( func
, aos_input
, stride
);
2012 sse_movhps( func
, make_xmm( 1 ), x86_make_disp( aos_input
, 0 ) );
2013 sse_movhps( func
, make_xmm( 4 ), x86_make_disp( aos_input
, 8 ) );
2014 x86_pop( func
, aos_input
);
2016 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2017 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2018 sse_shufps( func
, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2019 sse_shufps( func
, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2020 sse_shufps( func
, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2021 sse_shufps( func
, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2023 sse_movups( func
, x86_make_disp( soa_input
, 0 ), make_xmm( 0 ) );
2024 sse_movups( func
, x86_make_disp( soa_input
, 16 ), make_xmm( 2 ) );
2025 sse_movups( func
, x86_make_disp( soa_input
, 32 ), make_xmm( 3 ) );
2026 sse_movups( func
, x86_make_disp( soa_input
, 48 ), make_xmm( 5 ) );
2028 /* Advance to next input */
2029 x86_lea( func
, aos_input
, x86_make_disp(aos_input
, 16) );
2030 x86_lea( func
, soa_input
, x86_make_disp(soa_input
, 64) );
2032 /* while --num_inputs */
2033 x86_dec( func
, num_inputs
);
2034 x86_jcc( func
, cc_NE
, inner_loop
);
2037 x86_pop( func
, aos_input
);
2040 static void soa_to_aos( struct x86_function
*func
, uint aos
, uint soa
, uint num
, uint stride
)
2042 struct x86_reg soa_output
;
2043 struct x86_reg aos_output
;
2044 struct x86_reg num_outputs
;
2045 struct x86_reg temp
;
2048 soa_output
= x86_make_reg( file_REG32
, reg_AX
);
2049 aos_output
= x86_make_reg( file_REG32
, reg_BX
);
2050 num_outputs
= x86_make_reg( file_REG32
, reg_CX
);
2051 temp
= x86_make_reg( file_REG32
, reg_DX
);
2054 x86_push( func
, aos_output
);
2056 x86_mov( func
, soa_output
, x86_fn_arg( func
, soa
) );
2057 x86_mov( func
, aos_output
, x86_fn_arg( func
, aos
) );
2058 x86_mov( func
, num_outputs
, x86_fn_arg( func
, num
) );
2061 inner_loop
= x86_get_label( func
);
2063 sse_movups( func
, make_xmm( 0 ), x86_make_disp( soa_output
, 0 ) );
2064 sse_movups( func
, make_xmm( 1 ), x86_make_disp( soa_output
, 16 ) );
2065 sse_movups( func
, make_xmm( 3 ), x86_make_disp( soa_output
, 32 ) );
2066 sse_movups( func
, make_xmm( 4 ), x86_make_disp( soa_output
, 48 ) );
2068 sse_movaps( func
, make_xmm( 2 ), make_xmm( 0 ) );
2069 sse_movaps( func
, make_xmm( 5 ), make_xmm( 3 ) );
2070 sse_unpcklps( func
, make_xmm( 0 ), make_xmm( 1 ) );
2071 sse_unpckhps( func
, make_xmm( 2 ), make_xmm( 1 ) );
2072 sse_unpcklps( func
, make_xmm( 3 ), make_xmm( 4 ) );
2073 sse_unpckhps( func
, make_xmm( 5 ), make_xmm( 4 ) );
2075 x86_mov( func
, temp
, x86_fn_arg( func
, stride
) );
2076 x86_push( func
, aos_output
);
2077 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2078 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2079 x86_add( func
, aos_output
, temp
);
2080 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 0 ) );
2081 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 3 ) );
2082 x86_add( func
, aos_output
, temp
);
2083 sse_movlps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2084 sse_movlps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2085 x86_add( func
, aos_output
, temp
);
2086 sse_movhps( func
, x86_make_disp( aos_output
, 0 ), make_xmm( 2 ) );
2087 sse_movhps( func
, x86_make_disp( aos_output
, 8 ), make_xmm( 5 ) );
2088 x86_pop( func
, aos_output
);
2090 /* Advance to next output */
2091 x86_lea( func
, aos_output
, x86_make_disp(aos_output
, 16) );
2092 x86_lea( func
, soa_output
, x86_make_disp(soa_output
, 64) );
2094 /* while --num_outputs */
2095 x86_dec( func
, num_outputs
);
2096 x86_jcc( func
, cc_NE
, inner_loop
);
2099 x86_pop( func
, aos_output
);
/**
 * Translate a TGSI vertex/fragment shader to SSE2 code.
 * Slightly different things are done for vertex vs. fragment shaders.
 *
 * Note that fragment shaders are responsible for interpolating shader
 * inputs. Because on x86 we have only 4 GP registers, and here we
 * have 5 shader arguments (input, output, const, temp and coef), the
 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
 * The GP register holding the output argument is aliased with the coeff
 * argument, as outputs are not needed in the DECLARATION phase.
 *
 * \param tokens  the TGSI input shader
 * \param func  the output SSE code/function
 * \param immediates  buffer to place immediates, later passed to SSE func
 * \return  1 for success, 0 if translation failed
 */
/* NOTE(review): this region was garbled in extraction -- statement
 * fragments carry fused original line numbers, and several lines are
 * missing entirely (including the function-name line, the x86_mov
 * destination operands, and the register save/restore pushes).  The
 * comments below annotate the visible structure; verify every detail
 * against the upstream file before relying on it.
 */
/* Parameter list of the translator entry point (presumably
 * tgsi_emit_sse2 -- the name line was dropped by extraction; TODO
 * confirm).
 */
2120 const struct tgsi_token
*tokens
,
2121 struct x86_function
*func
,
2122 float (*immediates
)[4],
2123 boolean do_swizzles
)
/* Parser state plus DECLARATION/INSTRUCTION phase bookkeeping. */
2125 struct tgsi_parse_context parse
;
2126 boolean instruction_phase
= FALSE
;
2128 uint num_immediates
= 0;
/* Reset the code-store cursor so emission starts at the beginning. */
2130 func
->csr
= func
->store
;
2132 tgsi_parse_init( &parse
, tokens
);
2134 /* Can't just use EDI, EBX without save/restoring them:
2138 get_immediate_base() );
2146 * Different function args for vertex/fragment shaders:
/* Fragment path: load input/const/temp/coef/immediate base registers
 * from fn args 1,3,4,5,6; the output arg (2) is deliberately skipped
 * until the INSTRUCTION phase (register aliased with coeff).
 * NOTE(review): the x86_mov destination operands were dropped by
 * extraction here -- only the x86_fn_arg sources remain visible.
 */
2148 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2149 /* DECLARATION phase, do not load output argument. */
2153 x86_fn_arg( func
, 1 ) );
2154 /* skipping outputs argument here */
2158 x86_fn_arg( func
, 3 ) );
2162 x86_fn_arg( func
, 4 ) );
2166 x86_fn_arg( func
, 5 ) );
2169 get_immediate_base(),
2170 x86_fn_arg( func
, 6 ) );
/* Vertex path: optionally swizzle AoS inputs into SoA form first,
 * then load all five base registers from fn args 1..5.
 */
2173 assert(parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
);
2178 1, /* machine->input */
2180 8 ); /* input_stride */
2185 x86_fn_arg( func
, 1 ) );
2189 x86_fn_arg( func
, 2 ) );
2193 x86_fn_arg( func
, 3 ) );
2197 x86_fn_arg( func
, 4 ) );
2200 get_immediate_base(),
2201 x86_fn_arg( func
, 5 ) );
/* Main token loop: translate each declaration / instruction /
 * immediate until the token stream ends or a translation fails.
 */
2204 while( !tgsi_parse_end_of_tokens( &parse
) && ok
) {
2205 tgsi_parse_token( &parse
);
2207 switch( parse
.FullToken
.Token
.Type
) {
/* Declarations only generate code for fragment shaders (input
 * interpolation setup).
 */
2208 case TGSI_TOKEN_TYPE_DECLARATION
:
2209 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2212 &parse
.FullToken
.FullDeclaration
);
/* First instruction of a fragment shader flips into the
 * INSTRUCTION phase: the coeff register is overwritten with the
 * output pointer (fn arg 2), since coefficients are no longer needed.
 */
2216 case TGSI_TOKEN_TYPE_INSTRUCTION
:
2217 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_FRAGMENT
) {
2218 if( !instruction_phase
) {
2219 /* INSTRUCTION phase, overwrite coeff with output. */
2220 instruction_phase
= TRUE
;
2224 x86_fn_arg( func
, 2 ) );
/* emit_instruction returns false on unsupported opcodes; the
 * failure is logged with the opcode and shader kind.
 */
2228 ok
= emit_instruction(
2230 &parse
.FullToken
.FullInstruction
);
2233 debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2234 parse
.FullToken
.FullInstruction
.Instruction
.Opcode
,
2235 parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
?
2236 "vertex shader" : "fragment shader");
/* Immediates are copied verbatim into the caller-supplied
 * immediates[] buffer; Size includes the header token, hence the -1.
 */
2240 case TGSI_TOKEN_TYPE_IMMEDIATE
:
2241 /* simply copy the immediate values into the next immediates[] slot */
2243 const uint size
= parse
.FullToken
.FullImmediate
.Immediate
.Size
- 1;
2246 assert(num_immediates
< TGSI_EXEC_NUM_IMMEDIATES
);
2247 for( i
= 0; i
< size
; i
++ ) {
2248 immediates
[num_immediates
][i
] =
2249 parse
.FullToken
.FullImmediate
.u
.ImmediateFloat32
[i
].Float
;
/* Debug dump of the immediate just stored (presumably inside a
 * disabled/debug-only section -- TODO confirm).
 */
2252 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2254 immediates
[num_immediates
][0],
2255 immediates
[num_immediates
][1],
2256 immediates
[num_immediates
][2],
2257 immediates
[num_immediates
][3]);
/* Vertex shaders: convert the SoA results back to AoS before
 * returning (arg indices 9,2,10,11 -- offsets reflect pushed regs;
 * verify against caller).
 */
2269 if (parse
.FullHeader
.Processor
.Processor
== TGSI_PROCESSOR_VERTEX
) {
2271 soa_to_aos( func
, 9, 2, 10, 11 );
/* Epilogue: restore saved registers and release the parser. */
2274 /* Can't just use EBX, EDI without save/restoring them:
2282 get_immediate_base() );
2286 tgsi_parse_free( &parse
);
2291 #endif /* PIPE_ARCH_X86 */