src/mesa/pipe/tgsi/exec/tgsi_sse2.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 #include "pipe/p_util.h"
  29 #include "pipe/p_shader_tokens.h"
  30 #include "pipe/tgsi/util/tgsi_parse.h"
  31 #include "pipe/tgsi/util/tgsi_util.h"
  32 #include "tgsi_exec.h"
  33 #include "tgsi_sse2.h"
  34
  35 #include "x86/rtasm/x86sse.h"
  36
  37 #if defined(__i386__) || defined(__386__)
  38
  39 #define DUMP_SSE  0
  40
  41 #if DUMP_SSE
  42
  43 static void
  44 _print_reg(
  45    struct x86_reg reg )
  46 {
  47    if (reg.mod != mod_REG)
  48       debug_printf( "[" );
  49
  50    switch( reg.file ) {
  51    case file_REG32:
  52       switch( reg.idx ) {
  53       case reg_AX:
  54          debug_printf( "EAX" );
  55          break;
  56       case reg_CX:
  57          debug_printf( "ECX" );
  58          break;
  59       case reg_DX:
  60          debug_printf( "EDX" );
  61          break;
  62       case reg_BX:
  63          debug_printf( "EBX" );
  64          break;
  65       case reg_SP:
  66          debug_printf( "ESP" );
  67          break;
  68       case reg_BP:
  69          debug_printf( "EBP" );
  70          break;
  71       case reg_SI:
  72          debug_printf( "ESI" );
  73          break;
  74       case reg_DI:
  75          debug_printf( "EDI" );
  76          break;
  77       }
  78       break;
  79    case file_MMX:
  80       assert( 0 );
  81       break;
  82    case file_XMM:
  83       debug_printf( "XMM%u", reg.idx );
  84       break;
  85    case file_x87:
  86       assert( 0 );
  87       break;
  88    }
  89
  90    if (reg.mod == mod_DISP8 ||
  91        reg.mod == mod_DISP32)
  92       debug_printf("+%d", reg.disp);
  93
  94    if (reg.mod != mod_REG)
  95       debug_printf( "]" );
  96 }
  97
  98 static void
  99 _fill(
 100    const char  *op )
 101 {
 102    unsigned count = 10 - strlen( op );
 103
 104    while( count-- ) {
 105       debug_printf( " " );
 106    }
 107 }
 108
 109 #define DUMP_START() debug_printf( "\nsse-dump start ----------------" )
 110 #define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" )
 111 #define DUMP( OP ) debug_printf( "\n%s", OP )
 112 #define DUMP_I( OP, I ) do {\
 113    debug_printf( "\n%s", OP );\
 114    _fill( OP );\
 115    debug_printf( "%u", I ); } while( 0 )
 116 #define DUMP_R( OP, R0 ) do {\
 117    debug_printf( "\n%s", OP );\
 118    _fill( OP );\
 119    _print_reg( R0 ); } while( 0 )
 120 #define DUMP_RR( OP, R0, R1 ) do {\
 121    debug_printf( "\n%s", OP );\
 122    _fill( OP );\
 123    _print_reg( R0 );\
 124    debug_printf( ", " );\
 125    _print_reg( R1 ); } while( 0 )
 126 #define DUMP_RRI( OP, R0, R1, I ) do {\
 127    debug_printf( "\n%s", OP );\
 128    _fill( OP );\
 129    _print_reg( R0 );\
 130    debug_printf( ", " );\
 131    _print_reg( R1 );\
 132    debug_printf( ", " );\
 133    debug_printf( "%u", I ); } while( 0 )
 134
 135 #else
 136
 137 #define DUMP_START()
 138 #define DUMP_END()
 139 #define DUMP( OP )
 140 #define DUMP_I( OP, I )
 141 #define DUMP_R( OP, R0 )
 142 #define DUMP_RR( OP, R0, R1 )
 143 #define DUMP_RRI( OP, R0, R1, I )
 144
 145 #endif
 146
 147 #define FOR_EACH_CHANNEL( CHAN )\
 148    for( CHAN = 0; CHAN < 4; CHAN++ )
 149
 150 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
 151    ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
 152
 153 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
 154    if( IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
 155
 156 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
 157    FOR_EACH_CHANNEL( CHAN )\
 158       IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
 159
 160 #define CHAN_X 0
 161 #define CHAN_Y 1
 162 #define CHAN_Z 2
 163 #define CHAN_W 3
 164
 165 #define TEMP_R0   TGSI_EXEC_TEMP_R0
 166
 167 /**
 168  * X86 utility functions.
 169  */
 170
 171 static struct x86_reg
 172 make_xmm(
 173    unsigned xmm )
 174 {
 175    return x86_make_reg(
 176       file_XMM,
 177       (enum x86_reg_name) xmm );
 178 }
 179
 180 /**
 181  * X86 register mapping helpers.
 182  */
 183
 184 static struct x86_reg
 185 get_const_base( void )
 186 {
 187    return x86_make_reg(
 188       file_REG32,
 189       reg_CX );
 190 }
 191
 192 static struct x86_reg
 193 get_input_base( void )
 194 {
 195    return x86_make_reg(
 196       file_REG32,
 197       reg_AX );
 198 }
 199
 200 static struct x86_reg
 201 get_output_base( void )
 202 {
 203    return x86_make_reg(
 204       file_REG32,
 205       reg_DX );
 206 }
 207
 208 static struct x86_reg
 209 get_temp_base( void )
 210 {
 211 #ifdef WIN32
 212    return x86_make_reg(
 213       file_REG32,
 214       reg_BX );
 215 #else
 216    return x86_make_reg(
 217       file_REG32,
 218       reg_SI );
 219 #endif
 220 }
 221
 222 static struct x86_reg
 223 get_coef_base( void )
 224 {
 225    return get_output_base();
 226 }
 227
 228 /**
 229  * Data access helpers.
 230  */
 231
 232 static struct x86_reg
 233 get_argument(
 234    unsigned index )
 235 {
 236    return x86_make_disp(
 237       x86_make_reg( file_REG32, reg_SP ),
 238       (index + 1) * 4 );
 239 }
 240
 241 static struct x86_reg
 242 get_const(
 243    unsigned vec,
 244    unsigned chan )
 245 {
 246    return x86_make_disp(
 247       get_const_base(),
 248       (vec * 4 + chan) * 4 );
 249 }
 250
 251 static struct x86_reg
 252 get_input(
 253    unsigned vec,
 254    unsigned chan )
 255 {
 256    return x86_make_disp(
 257       get_input_base(),
 258       (vec * 4 + chan) * 16 );
 259 }
 260
 261 static struct x86_reg
 262 get_output(
 263    unsigned vec,
 264    unsigned chan )
 265 {
 266    return x86_make_disp(
 267       get_output_base(),
 268       (vec * 4 + chan) * 16 );
 269 }
 270
 271 static struct x86_reg
 272 get_temp(
 273    unsigned vec,
 274    unsigned chan )
 275 {
 276    return x86_make_disp(
 277       get_temp_base(),
 278       (vec * 4 + chan) * 16 );
 279 }
 280
 281 static struct x86_reg
 282 get_coef(
 283    unsigned vec,
 284    unsigned chan,
 285    unsigned member )
 286 {
 287    return x86_make_disp(
 288       get_coef_base(),
 289       ((vec * 3 + member) * 4 + chan) * 4 );
 290 }
 291
 292 /**
 293  * X86 rtasm wrappers.
 294  */
 295
 296 static void
 297 emit_addps(
 298    struct x86_function  *func,
 299    struct x86_reg       dst,
 300    struct x86_reg       src )
 301 {
 302    DUMP_RR( "ADDPS", dst, src );
 303    sse_addps( func, dst, src );
 304 }
 305
 306 static void
 307 emit_andnps(
 308    struct x86_function  *func,
 309    struct x86_reg       dst,
 310    struct x86_reg       src )
 311 {
 312    DUMP_RR( "ANDNPS", dst, src );
 313    sse_andnps( func, dst, src );
 314 }
 315
 316 static void
 317 emit_andps(
 318    struct x86_function  *func,
 319    struct x86_reg       dst,
 320    struct x86_reg       src )
 321 {
 322    DUMP_RR( "ANDPS", dst, src );
 323    sse_andps( func, dst, src );
 324 }
 325
 326 static void
 327 emit_call(
 328    struct x86_function  *func,
 329    void                 (* addr)() )
 330 {
 331    DUMP_I( "CALL", addr );
 332    x86_call( func, addr );
 333 }
 334
 335 static void
 336 emit_cmpps(
 337    struct x86_function  *func,
 338    struct x86_reg       dst,
 339    struct x86_reg       src,
 340    enum sse_cc          cc )
 341 {
 342    DUMP_RRI( "CMPPS", dst, src, cc );
 343    sse_cmpps( func, dst, src, cc );
 344 }
 345
 346 static void
 347 emit_cvttps2dq(
 348    struct x86_function  *func,
 349    struct x86_reg       dst,
 350    struct x86_reg       src )
 351 {
 352    DUMP_RR( "CVTTPS2DQ", dst, src );
 353    sse2_cvttps2dq( func, dst, src );
 354 }
 355
 356 static void
 357 emit_maxps(
 358    struct x86_function  *func,
 359    struct x86_reg       dst,
 360    struct x86_reg       src )
 361 {
 362    DUMP_RR( "MAXPS", dst, src );
 363    sse_maxps( func, dst, src );
 364 }
 365
 366 static void
 367 emit_minps(
 368    struct x86_function  *func,
 369    struct x86_reg       dst,
 370    struct x86_reg       src )
 371 {
 372    DUMP_RR( "MINPS", dst, src );
 373    sse_minps( func, dst, src );
 374 }
 375
 376 static void
 377 emit_mov(
 378    struct x86_function  *func,
 379    struct x86_reg       dst,
 380    struct x86_reg       src )
 381 {
 382    DUMP_RR( "MOV", dst, src );
 383    x86_mov( func, dst, src );
 384 }
 385
 386 static void
 387 emit_movaps(
 388    struct x86_function  *func,
 389    struct x86_reg       dst,
 390    struct x86_reg       src )
 391 {
 392    DUMP_RR( "MOVAPS", dst, src );
 393    sse_movaps( func, dst, src );
 394 }
 395
 396 static void
 397 emit_movss(
 398    struct x86_function  *func,
 399    struct x86_reg       dst,
 400    struct x86_reg       src )
 401 {
 402    DUMP_RR( "MOVSS", dst, src );
 403    sse_movss( func, dst, src );
 404 }
 405
 406 static void
 407 emit_movups(
 408    struct x86_function  *func,
 409    struct x86_reg       dst,
 410    struct x86_reg       src )
 411 {
 412    DUMP_RR( "MOVUPS", dst, src );
 413    sse_movups( func, dst, src );
 414 }
 415
 416 static void
 417 emit_mulps(
 418    struct x86_function  *func,
 419    struct x86_reg       dst,
 420    struct x86_reg       src )
 421 {
 422    DUMP_RR( "MULPS", dst, src );
 423    sse_mulps( func, dst, src );
 424 }
 425
 426 static void
 427 emit_or(
 428    struct x86_function  *func,
 429    struct x86_reg       dst,
 430    struct x86_reg       src )
 431 {
 432    DUMP_RR( "OR", dst, src );
 433    x86_or( func, dst, src );
 434 }
 435
 436 static void
 437 emit_orps(
 438    struct x86_function  *func,
 439    struct x86_reg       dst,
 440    struct x86_reg       src )
 441 {
 442    DUMP_RR( "ORPS", dst, src );
 443    sse_orps( func, dst, src );
 444 }
 445
 446 static void
 447 emit_pmovmskb(
 448    struct x86_function  *func,
 449    struct x86_reg       dst,
 450    struct x86_reg       src )
 451 {
 452    DUMP_RR( "PMOVMSKB", dst, src );
 453    sse_pmovmskb( func, dst, src );
 454 }
 455
 456 static void
 457 emit_pop(
 458    struct x86_function  *func,
 459    struct x86_reg       dst )
 460 {
 461    DUMP_R( "POP", dst );
 462    x86_pop( func, dst );
 463 }
 464
 465 static void
 466 emit_push(
 467    struct x86_function  *func,
 468    struct x86_reg       dst )
 469 {
 470    DUMP_R( "PUSH", dst );
 471    x86_push( func, dst );
 472 }
 473
 474 static void
 475 emit_rcpps(
 476    struct x86_function  *func,
 477    struct x86_reg       dst,
 478    struct x86_reg       src )
 479 {
 480    DUMP_RR( "RCPPS", dst, src );
 481    sse2_rcpps( func, dst, src );
 482 }
 483
 484 #ifdef WIN32
 485 static void
 486 emit_retw(
 487    struct x86_function  *func,
 488    unsigned             size )
 489 {
 490    DUMP_I( "RET", size );
 491    x86_retw( func, size );
 492 }
 493 #else
 494 static void
 495 emit_ret(
 496    struct x86_function  *func )
 497 {
 498    DUMP( "RET" );
 499    x86_ret( func );
 500 }
 501 #endif
 502
 503 static void
 504 emit_rsqrtps(
 505    struct x86_function  *func,
 506    struct x86_reg       dst,
 507    struct x86_reg       src )
 508 {
 509    DUMP_RR( "RSQRTPS", dst, src );
 510    sse_rsqrtps( func, dst, src );
 511 }
 512
 513 static void
 514 emit_shufps(
 515    struct x86_function  *func,
 516    struct x86_reg       dst,
 517    struct x86_reg       src,
 518    unsigned char        shuf )
 519 {
 520    DUMP_RRI( "SHUFPS", dst, src, shuf );
 521    sse_shufps( func, dst, src, shuf );
 522 }
 523
 524 static void
 525 emit_subps(
 526    struct x86_function  *func,
 527    struct x86_reg       dst,
 528    struct x86_reg       src )
 529 {
 530    DUMP_RR( "SUBPS", dst, src );
 531    sse_subps( func, dst, src );
 532 }
 533
 534 static void
 535 emit_xorps(
 536    struct x86_function  *func,
 537    struct x86_reg       dst,
 538    struct x86_reg       src )
 539 {
 540    DUMP_RR( "XORPS", dst, src );
 541    sse_xorps( func, dst, src );
 542 }
 543
 544 /**
 545  * Data fetch helpers.
 546  */
 547
 548 static void
 549 emit_const(
 550    struct x86_function *func,
 551    unsigned xmm,
 552    unsigned vec,
 553    unsigned chan )
 554 {
 555    emit_movss(
 556       func,
 557       make_xmm( xmm ),
 558       get_const( vec, chan ) );
 559    emit_shufps(
 560       func,
 561       make_xmm( xmm ),
 562       make_xmm( xmm ),
 563       SHUF( 0, 0, 0, 0 ) );
 564 }
 565
 566 static void
 567 emit_inputf(
 568    struct x86_function *func,
 569    unsigned xmm,
 570    unsigned vec,
 571    unsigned chan )
 572 {
 573    emit_movups(
 574       func,
 575       make_xmm( xmm ),
 576       get_input( vec, chan ) );
 577 }
 578
 579 static void
 580 emit_output(
 581    struct x86_function *func,
 582    unsigned xmm,
 583    unsigned vec,
 584    unsigned chan )
 585 {
 586    emit_movups(
 587       func,
 588       get_output( vec, chan ),
 589       make_xmm( xmm ) );
 590 }
 591
 592 static void
 593 emit_tempf(
 594    struct x86_function *func,
 595    unsigned xmm,
 596    unsigned vec,
 597    unsigned chan )
 598 {
 599    emit_movaps(
 600       func,
 601       make_xmm( xmm ),
 602       get_temp( vec, chan ) );
 603 }
 604
 605 static void
 606 emit_coef(
 607    struct x86_function *func,
 608    unsigned xmm,
 609    unsigned vec,
 610    unsigned chan,
 611    unsigned member )
 612 {
 613    emit_movss(
 614       func,
 615       make_xmm( xmm ),
 616       get_coef( vec, chan, member ) );
 617    emit_shufps(
 618       func,
 619       make_xmm( xmm ),
 620       make_xmm( xmm ),
 621       SHUF( 0, 0, 0, 0 ) );
 622 }
 623
 624 /**
 625  * Data store helpers.
 626  */
 627
 628 static void
 629 emit_inputs(
 630    struct x86_function *func,
 631    unsigned xmm,
 632    unsigned vec,
 633    unsigned chan )
 634 {
 635    emit_movups(
 636       func,
 637       get_input( vec, chan ),
 638       make_xmm( xmm ) );
 639 }
 640
 641 static void
 642 emit_temps(
 643    struct x86_function *func,
 644    unsigned xmm,
 645    unsigned vec,
 646    unsigned chan )
 647 {
 648    emit_movaps(
 649       func,
 650       get_temp( vec, chan ),
 651       make_xmm( xmm ) );
 652 }
 653
 654 static void
 655 emit_addrs(
 656    struct x86_function *func,
 657    unsigned xmm,
 658    unsigned vec,
 659    unsigned chan )
 660 {
 661    emit_temps(
 662       func,
 663       xmm,
 664       vec + TGSI_EXEC_NUM_TEMPS,
 665       chan );
 666 }
 667
/**
 * Coefficient fetch helpers.
 */
 671
 672 static void
 673 emit_coef_a0(
 674    struct x86_function *func,
 675    unsigned xmm,
 676    unsigned vec,
 677    unsigned chan )
 678 {
 679    emit_coef(
 680       func,
 681       xmm,
 682       vec,
 683       chan,
 684       0 );
 685 }
 686
 687 static void
 688 emit_coef_dadx(
 689    struct x86_function *func,
 690    unsigned xmm,
 691    unsigned vec,
 692    unsigned chan )
 693 {
 694    emit_coef(
 695       func,
 696       xmm,
 697       vec,
 698       chan,
 699       1 );
 700 }
 701
 702 static void
 703 emit_coef_dady(
 704    struct x86_function *func,
 705    unsigned xmm,
 706    unsigned vec,
 707    unsigned chan )
 708 {
 709    emit_coef(
 710       func,
 711       xmm,
 712       vec,
 713       chan,
 714       2 );
 715 }
 716
 717 /**
 718  * Function call helpers.
 719  */
 720
 721 static void
 722 emit_push_gp(
 723    struct x86_function *func )
 724 {
 725    emit_push(
 726       func,
 727       get_const_base() );
 728    emit_push(
 729       func,
 730       get_input_base() );
 731    emit_push(
 732       func,
 733       get_output_base() );
 734
 735    /* It is important on non-win32 platforms that temp base is pushed last.
 736     */
 737    emit_push(
 738       func,
 739       get_temp_base() );
 740 }
 741
 742 static void
 743 emit_pop_gp(
 744    struct x86_function *func )
 745 {
 746    /* Restore GP registers in a reverse order.
 747     */
 748    emit_pop(
 749       func,
 750       get_temp_base() );
 751    emit_pop(
 752       func,
 753       get_output_base() );
 754    emit_pop(
 755       func,
 756       get_input_base() );
 757    emit_pop(
 758       func,
 759       get_const_base() );
 760 }
 761
 762 static void
 763 emit_func_call_dst(
 764    struct x86_function *func,
 765    unsigned xmm_dst,
 766    void (*code)() )
 767 {
 768    emit_movaps(
 769       func,
 770       get_temp( TEMP_R0, 0 ),
 771       make_xmm( xmm_dst ) );
 772
 773    emit_push_gp(
 774       func );
 775
 776 #ifdef WIN32
 777    emit_push(
 778       func,
 779       get_temp( TEMP_R0, 0 ) );
 780 #endif
 781
 782    emit_call(
 783       func,
 784       code );
 785
 786    emit_pop_gp(
 787       func );
 788
 789    emit_movaps(
 790       func,
 791       make_xmm( xmm_dst ),
 792       get_temp( TEMP_R0, 0 ) );
 793 }
 794
 795 static void
 796 emit_func_call_dst_src(
 797    struct x86_function *func,
 798    unsigned xmm_dst,
 799    unsigned xmm_src,
 800    void (*code)() )
 801 {
 802    emit_movaps(
 803       func,
 804       get_temp( TEMP_R0, 1 ),
 805       make_xmm( xmm_src ) );
 806
 807    emit_func_call_dst(
 808       func,
 809       xmm_dst,
 810       code );
 811 }
 812
 813 /**
 814  * Low-level instruction translators.
 815  */
 816
 817 static void
 818 emit_abs(
 819    struct x86_function *func,
 820    unsigned xmm )
 821 {
 822    emit_andps(
 823       func,
 824       make_xmm( xmm ),
 825       get_temp(
 826          TGSI_EXEC_TEMP_7FFFFFFF_I,
 827          TGSI_EXEC_TEMP_7FFFFFFF_C ) );
 828 }
 829
 830 static void
 831 emit_add(
 832    struct x86_function *func,
 833    unsigned xmm_dst,
 834    unsigned xmm_src )
 835 {
 836    emit_addps(
 837       func,
 838       make_xmm( xmm_dst ),
 839       make_xmm( xmm_src ) );
 840 }
 841
 842 static void XSTDCALL
 843 cos4f(
 844    float *store )
 845 {
 846 #ifdef WIN32
 847    store[0] = (float) cos( (double) store[0] );
 848    store[1] = (float) cos( (double) store[1] );
 849    store[2] = (float) cos( (double) store[2] );
 850    store[3] = (float) cos( (double) store[3] );
 851 #else
 852    const unsigned X = TEMP_R0 * 16;
 853    store[X + 0] = cosf( store[X + 0] );
 854    store[X + 1] = cosf( store[X + 1] );
 855    store[X + 2] = cosf( store[X + 2] );
 856    store[X + 3] = cosf( store[X + 3] );
 857 #endif
 858 }
 859
 860 static void
 861 emit_cos(
 862    struct x86_function *func,
 863    unsigned xmm_dst )
 864 {
 865    emit_func_call_dst(
 866       func,
 867       xmm_dst,
 868       cos4f );
 869 }
 870
 871 static void XSTDCALL
 872 ex24f(
 873    float *store )
 874 {
 875 #ifdef WIN32
 876    store[0] = (float) pow( 2.0, (double) store[0] );
 877    store[1] = (float) pow( 2.0, (double) store[1] );
 878    store[2] = (float) pow( 2.0, (double) store[2] );
 879    store[3] = (float) pow( 2.0, (double) store[3] );
 880 #else
 881    const unsigned X = TEMP_R0 * 16;
 882    store[X + 0] = powf( 2.0f, store[X + 0] );
 883    store[X + 1] = powf( 2.0f, store[X + 1] );
 884    store[X + 2] = powf( 2.0f, store[X + 2] );
 885    store[X + 3] = powf( 2.0f, store[X + 3] );
 886 #endif
 887 }
 888
 889 static void
 890 emit_ex2(
 891    struct x86_function *func,
 892    unsigned xmm_dst )
 893 {
 894    emit_func_call_dst(
 895       func,
 896       xmm_dst,
 897       ex24f );
 898 }
 899
 900 static void
 901 emit_f2it(
 902    struct x86_function *func,
 903    unsigned xmm )
 904 {
 905    emit_cvttps2dq(
 906       func,
 907       make_xmm( xmm ),
 908       make_xmm( xmm ) );
 909 }
 910
 911 static void XSTDCALL
 912 flr4f(
 913    float *store )
 914 {
 915 #ifdef WIN32
 916    const unsigned X = 0;
 917 #else
 918    const unsigned X = TEMP_R0 * 16;
 919 #endif
 920    store[X + 0] = (float) floor( (double) store[X + 0] );
 921    store[X + 1] = (float) floor( (double) store[X + 1] );
 922    store[X + 2] = (float) floor( (double) store[X + 2] );
 923    store[X + 3] = (float) floor( (double) store[X + 3] );
 924 }
 925
 926 static void
 927 emit_flr(
 928    struct x86_function *func,
 929    unsigned xmm_dst )
 930 {
 931    emit_func_call_dst(
 932       func,
 933       xmm_dst,
 934       flr4f );
 935 }
 936
 937 static void XSTDCALL
 938 frc4f(
 939    float *store )
 940 {
 941 #ifdef WIN32
 942    const unsigned X = 0;
 943 #else
 944    const unsigned X = TEMP_R0 * 16;
 945 #endif
 946    store[X + 0] -= (float) floor( (double) store[X + 0] );
 947    store[X + 1] -= (float) floor( (double) store[X + 1] );
 948    store[X + 2] -= (float) floor( (double) store[X + 2] );
 949    store[X + 3] -= (float) floor( (double) store[X + 3] );
 950 }
 951
 952 static void
 953 emit_frc(
 954    struct x86_function *func,
 955    unsigned xmm_dst )
 956 {
 957    emit_func_call_dst(
 958       func,
 959       xmm_dst,
 960       frc4f );
 961 }
 962
 963 static void XSTDCALL
 964 lg24f(
 965    float *store )
 966 {
 967 #ifdef WIN32
 968    const unsigned X = 0;
 969 #else
 970    const unsigned X = TEMP_R0 * 16;
 971 #endif
 972    store[X + 0] = LOG2( store[X + 0] );
 973    store[X + 1] = LOG2( store[X + 1] );
 974    store[X + 2] = LOG2( store[X + 2] );
 975    store[X + 3] = LOG2( store[X + 3] );
 976 }
 977
 978 static void
 979 emit_lg2(
 980    struct x86_function *func,
 981    unsigned xmm_dst )
 982 {
 983    emit_func_call_dst(
 984       func,
 985       xmm_dst,
 986       lg24f );
 987 }
 988
 989 static void
 990 emit_MOV(
 991    struct x86_function *func,
 992    unsigned xmm_dst,
 993    unsigned xmm_src )
 994 {
 995    emit_movups(
 996       func,
 997       make_xmm( xmm_dst ),
 998       make_xmm( xmm_src ) );
 999 }
1000
1001 static void
1002 emit_mul (struct x86_function *func,
1003           unsigned xmm_dst,
1004           unsigned xmm_src)
1005 {
1006    emit_mulps(
1007       func,
1008       make_xmm( xmm_dst ),
1009       make_xmm( xmm_src ) );
1010 }
1011
1012 static void
1013 emit_neg(
1014    struct x86_function *func,
1015    unsigned xmm )
1016 {
1017    emit_xorps(
1018       func,
1019       make_xmm( xmm ),
1020       get_temp(
1021          TGSI_EXEC_TEMP_80000000_I,
1022          TGSI_EXEC_TEMP_80000000_C ) );
1023 }
1024
1025 static void XSTDCALL
1026 pow4f(
1027    float *store )
1028 {
1029 #ifdef WIN32
1030    store[0] = (float) pow( (double) store[0], (double) store[4] );
1031    store[1] = (float) pow( (double) store[1], (double) store[5] );
1032    store[2] = (float) pow( (double) store[2], (double) store[6] );
1033    store[3] = (float) pow( (double) store[3], (double) store[7] );
1034 #else
1035    const unsigned X = TEMP_R0 * 16;
1036    store[X + 0] = powf( store[X + 0], store[X + 4] );
1037    store[X + 1] = powf( store[X + 1], store[X + 5] );
1038    store[X + 2] = powf( store[X + 2], store[X + 6] );
1039    store[X + 3] = powf( store[X + 3], store[X + 7] );
1040 #endif
1041 }
1042
1043 static void
1044 emit_pow(
1045    struct x86_function *func,
1046    unsigned xmm_dst,
1047    unsigned xmm_src )
1048 {
1049    emit_func_call_dst_src(
1050       func,
1051       xmm_dst,
1052       xmm_src,
1053       pow4f );
1054 }
1055
1056 static void
1057 emit_rcp (
1058    struct x86_function *func,
1059    unsigned xmm_dst,
1060    unsigned xmm_src )
1061 {
1062    emit_rcpps(
1063       func,
1064       make_xmm( xmm_dst ),
1065       make_xmm( xmm_src ) );
1066 }
1067
1068 static void
1069 emit_rsqrt(
1070    struct x86_function *func,
1071    unsigned xmm_dst,
1072    unsigned xmm_src )
1073 {
1074    emit_rsqrtps(
1075       func,
1076       make_xmm( xmm_dst ),
1077       make_xmm( xmm_src ) );
1078 }
1079
1080 static void
1081 emit_setsign(
1082    struct x86_function *func,
1083    unsigned xmm )
1084 {
1085    emit_orps(
1086       func,
1087       make_xmm( xmm ),
1088       get_temp(
1089          TGSI_EXEC_TEMP_80000000_I,
1090          TGSI_EXEC_TEMP_80000000_C ) );
1091 }
1092
1093 static void XSTDCALL
1094 sin4f(
1095    float *store )
1096 {
1097 #ifdef WIN32
1098    store[0] = (float) sin( (double) store[0] );
1099    store[1] = (float) sin( (double) store[1] );
1100    store[2] = (float) sin( (double) store[2] );
1101    store[3] = (float) sin( (double) store[3] );
1102 #else
1103    const unsigned X = TEMP_R0 * 16;
1104    store[X + 0] = sinf( store[X + 0] );
1105    store[X + 1] = sinf( store[X + 1] );
1106    store[X + 2] = sinf( store[X + 2] );
1107    store[X + 3] = sinf( store[X + 3] );
1108 #endif
1109 }
1110
1111 static void
1112 emit_sin (struct x86_function *func,
1113           unsigned xmm_dst)
1114 {
1115    emit_func_call_dst(
1116       func,
1117       xmm_dst,
1118       sin4f );
1119 }
1120
1121 static void
1122 emit_sub(
1123    struct x86_function *func,
1124    unsigned xmm_dst,
1125    unsigned xmm_src )
1126 {
1127    emit_subps(
1128       func,
1129       make_xmm( xmm_dst ),
1130       make_xmm( xmm_src ) );
1131 }
1132
1133 /**
1134  * Register fetch.
1135  */
1136
1137 static void
1138 emit_fetch(
1139    struct x86_function *func,
1140    unsigned xmm,
1141    const struct tgsi_full_src_register *reg,
1142    const unsigned chan_index )
1143 {
1144    unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
1145
1146    switch( swizzle ) {
1147    case TGSI_EXTSWIZZLE_X:
1148    case TGSI_EXTSWIZZLE_Y:
1149    case TGSI_EXTSWIZZLE_Z:
1150    case TGSI_EXTSWIZZLE_W:
1151       switch( reg->SrcRegister.File ) {
1152       case TGSI_FILE_CONSTANT:
1153          emit_const(
1154             func,
1155             xmm,
1156             reg->SrcRegister.Index,
1157             swizzle );
1158          break;
1159
1160       case TGSI_FILE_INPUT:
1161          emit_inputf(
1162             func,
1163             xmm,
1164             reg->SrcRegister.Index,
1165             swizzle );
1166          break;
1167
1168       case TGSI_FILE_TEMPORARY:
1169          emit_tempf(
1170             func,
1171             xmm,
1172             reg->SrcRegister.Index,
1173             swizzle );
1174          break;
1175
1176       default:
1177          assert( 0 );
1178       }
1179       break;
1180
1181    case TGSI_EXTSWIZZLE_ZERO:
1182       emit_tempf(
1183          func,
1184          xmm,
1185          TGSI_EXEC_TEMP_00000000_I,
1186          TGSI_EXEC_TEMP_00000000_C );
1187       break;
1188
1189    case TGSI_EXTSWIZZLE_ONE:
1190       emit_tempf(
1191          func,
1192          xmm,
1193          TGSI_EXEC_TEMP_ONE_I,
1194          TGSI_EXEC_TEMP_ONE_C );
1195       break;
1196
1197    default:
1198       assert( 0 );
1199    }
1200
1201    switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1202    case TGSI_UTIL_SIGN_CLEAR:
1203       emit_abs( func, xmm );
1204       break;
1205
1206    case TGSI_UTIL_SIGN_SET:
1207       emit_setsign( func, xmm );
1208       break;
1209
1210    case TGSI_UTIL_SIGN_TOGGLE:
1211       emit_neg( func, xmm );
1212       break;
1213
1214    case TGSI_UTIL_SIGN_KEEP:
1215       break;
1216    }
1217 }
1218
/* Fetch channel CHAN of source operand number INDEX of INST into
 * register XMM. */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( (FUNC), (XMM), &(INST).FullSrcRegisters[(INDEX)], (CHAN) )
1221
1222 /**
1223  * Register store.
1224  */
1225
1226 static void
1227 emit_store(
1228    struct x86_function *func,
1229    unsigned xmm,
1230    const struct tgsi_full_dst_register *reg,
1231    const struct tgsi_full_instruction *inst,
1232    unsigned chan_index )
1233 {
1234    switch( reg->DstRegister.File ) {
1235    case TGSI_FILE_OUTPUT:
1236       emit_output(
1237          func,
1238          xmm,
1239          reg->DstRegister.Index,
1240          chan_index );
1241       break;
1242
1243    case TGSI_FILE_TEMPORARY:
1244       emit_temps(
1245          func,
1246          xmm,
1247          reg->DstRegister.Index,
1248          chan_index );
1249       break;
1250
1251    case TGSI_FILE_ADDRESS:
1252       emit_addrs(
1253          func,
1254          xmm,
1255          reg->DstRegister.Index,
1256          chan_index );
1257       break;
1258
1259    default:
1260       assert( 0 );
1261    }
1262
1263    switch( inst->Instruction.Saturate ) {
1264    case TGSI_SAT_NONE:
1265       break;
1266
1267    case TGSI_SAT_ZERO_ONE:
1268 //      assert( 0 );
1269       break;
1270
1271    case TGSI_SAT_MINUS_PLUS_ONE:
1272       assert( 0 );
1273       break;
1274    }
1275 }
1276
/* Store register XMM to channel CHAN of destination operand number
 * INDEX of INST. */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( (FUNC), (XMM), &(INST).FullDstRegisters[(INDEX)], &(INST), (CHAN) )
1279
1280 /**
1281  * High-level instruction translators.
1282  */
1283
1284 static void
1285 emit_kil(
1286    struct x86_function *func,
1287    const struct tgsi_full_src_register *reg )
1288 {
1289    unsigned uniquemask;
1290    unsigned registers[4];
1291    unsigned nextregister = 0;
1292    unsigned firstchan = ~0;
1293    unsigned chan_index;
1294
1295    /* This mask stores component bits that were already tested. Note that
1296     * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1297     * tested. */
1298    uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1299
1300    FOR_EACH_CHANNEL( chan_index ) {
1301       unsigned swizzle;
1302
1303       /* unswizzle channel */
1304       swizzle = tgsi_util_get_full_src_register_extswizzle(
1305          reg,
1306          chan_index );
1307
1308       /* check if the component has not been already tested */
1309       if( !(uniquemask & (1 << swizzle)) ) {
1310          uniquemask |= 1 << swizzle;
1311
1312          /* allocate register */
1313          registers[chan_index] = nextregister;
1314          emit_fetch(
1315             func,
1316             nextregister,
1317             reg,
1318             chan_index );
1319          nextregister++;
1320
1321          /* mark the first channel used */
1322          if( firstchan == ~0 ) {
1323             firstchan = chan_index;
1324          }
1325       }
1326    }
1327
1328    emit_push(
1329       func,
1330       x86_make_reg( file_REG32, reg_AX ) );
1331    emit_push(
1332       func,
1333       x86_make_reg( file_REG32, reg_DX ) );
1334
1335    FOR_EACH_CHANNEL( chan_index ) {
1336       if( uniquemask & (1 << chan_index) ) {
1337          emit_cmpps(
1338             func,
1339             make_xmm( registers[chan_index] ),
1340             get_temp(
1341                TGSI_EXEC_TEMP_00000000_I,
1342                TGSI_EXEC_TEMP_00000000_C ),
1343             cc_LessThan );
1344
1345          if( chan_index == firstchan ) {
1346             emit_pmovmskb(
1347                func,
1348                x86_make_reg( file_REG32, reg_AX ),
1349                make_xmm( registers[chan_index] ) );
1350          }
1351          else {
1352             emit_pmovmskb(
1353                func,
1354                x86_make_reg( file_REG32, reg_DX ),
1355                make_xmm( registers[chan_index] ) );
1356             emit_or(
1357                func,
1358                x86_make_reg( file_REG32, reg_AX ),
1359                x86_make_reg( file_REG32, reg_DX ) );
1360          }
1361       }
1362    }
1363
1364    emit_or(
1365       func,
1366       get_temp(
1367          TGSI_EXEC_TEMP_KILMASK_I,
1368          TGSI_EXEC_TEMP_KILMASK_C ),
1369       x86_make_reg( file_REG32, reg_AX ) );
1370
1371    emit_pop(
1372       func,
1373       x86_make_reg( file_REG32, reg_DX ) );
1374    emit_pop(
1375       func,
1376       x86_make_reg( file_REG32, reg_AX ) );
1377 }
1378
1379 static void
1380 emit_setcc(
1381    struct x86_function *func,
1382    struct tgsi_full_instruction *inst,
1383    enum sse_cc cc )
1384 {
1385    unsigned chan_index;
1386
1387    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1388       FETCH( func, *inst, 0, 0, chan_index );
1389       FETCH( func, *inst, 1, 1, chan_index );
1390       emit_cmpps(
1391          func,
1392          make_xmm( 0 ),
1393          make_xmm( 1 ),
1394          cc );
1395       emit_andps(
1396          func,
1397          make_xmm( 0 ),
1398          get_temp(
1399             TGSI_EXEC_TEMP_ONE_I,
1400             TGSI_EXEC_TEMP_ONE_C ) );
1401       STORE( func, *inst, 0, 0, chan_index );
1402    }
1403 }
1404
1405 static void
1406 emit_cmp(
1407    struct x86_function *func,
1408    struct tgsi_full_instruction *inst )
1409 {
1410    unsigned chan_index;
1411
1412    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1413       FETCH( func, *inst, 0, 0, chan_index );
1414       FETCH( func, *inst, 1, 1, chan_index );
1415       FETCH( func, *inst, 2, 2, chan_index );
1416       emit_cmpps(
1417          func,
1418          make_xmm( 0 ),
1419          get_temp(
1420             TGSI_EXEC_TEMP_00000000_I,
1421             TGSI_EXEC_TEMP_00000000_C ),
1422          cc_LessThan );
1423       emit_andps(
1424          func,
1425          make_xmm( 1 ),
1426          make_xmm( 0 ) );
1427       emit_andnps(
1428          func,
1429          make_xmm( 0 ),
1430          make_xmm( 2 ) );
1431       emit_orps(
1432          func,
1433          make_xmm( 0 ),
1434          make_xmm( 1 ) );
1435       STORE( func, *inst, 0, 0, chan_index );
1436    }
1437 }
1438
1439 static void
1440 emit_instruction(
1441    struct x86_function *func,
1442    struct tgsi_full_instruction *inst )
1443 {
1444    unsigned chan_index;
1445
1446    switch( inst->Instruction.Opcode ) {
1447    case TGSI_OPCODE_ARL:
1448       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1449          FETCH( func, *inst, 0, 0, chan_index );
1450          emit_f2it( func, 0 );
1451          STORE( func, *inst, 0, 0, chan_index );
1452       }
1453       break;
1454
1455    case TGSI_OPCODE_MOV:
1456    /* TGSI_OPCODE_SWZ */
1457       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1458          FETCH( func, *inst, 0, 0, chan_index );
1459          STORE( func, *inst, 0, 0, chan_index );
1460       }
1461       break;
1462
1463    case TGSI_OPCODE_LIT:
1464       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1465           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1466          emit_tempf(
1467             func,
1468             0,
1469             TGSI_EXEC_TEMP_ONE_I,
1470             TGSI_EXEC_TEMP_ONE_C);
1471          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1472             STORE( func, *inst, 0, 0, CHAN_X );
1473          }
1474          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1475             STORE( func, *inst, 0, 0, CHAN_W );
1476          }
1477       }
1478       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1479           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1480          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1481             FETCH( func, *inst, 0, 0, CHAN_X );
1482             emit_maxps(
1483                func,
1484                make_xmm( 0 ),
1485                get_temp(
1486                   TGSI_EXEC_TEMP_00000000_I,
1487                   TGSI_EXEC_TEMP_00000000_C ) );
1488             STORE( func, *inst, 0, 0, CHAN_Y );
1489          }
1490          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1491             FETCH( func, *inst, 1, 0, CHAN_Y );
1492             emit_maxps(
1493                func,
1494                make_xmm( 1 ),
1495                get_temp(
1496                   TGSI_EXEC_TEMP_00000000_I,
1497                   TGSI_EXEC_TEMP_00000000_C ) );
1498             FETCH( func, *inst, 2, 0, CHAN_W );
1499             emit_minps(
1500                func,
1501                make_xmm( 2 ),
1502                get_temp(
1503                   TGSI_EXEC_TEMP_128_I,
1504                   TGSI_EXEC_TEMP_128_C ) );
1505             emit_maxps(
1506                func,
1507                make_xmm( 2 ),
1508                get_temp(
1509                   TGSI_EXEC_TEMP_MINUS_128_I,
1510                   TGSI_EXEC_TEMP_MINUS_128_C ) );
1511             emit_pow( func, 1, 2 );
1512             FETCH( func, *inst, 0, 0, CHAN_X );
1513             emit_xorps(
1514                func,
1515                make_xmm( 2 ),
1516                make_xmm( 2 ) );
1517             emit_cmpps(
1518                func,
1519                make_xmm( 2 ),
1520                make_xmm( 0 ),
1521                cc_LessThanEqual );
1522             emit_andps(
1523                func,
1524                make_xmm( 2 ),
1525                make_xmm( 1 ) );
1526             STORE( func, *inst, 2, 0, CHAN_Z );
1527          }
1528       }
1529       break;
1530
1531    case TGSI_OPCODE_RCP:
1532    /* TGSI_OPCODE_RECIP */
1533       FETCH( func, *inst, 0, 0, CHAN_X );
1534       emit_rcp( func, 0, 0 );
1535       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1536          STORE( func, *inst, 0, 0, chan_index );
1537       }
1538       break;
1539
1540    case TGSI_OPCODE_RSQ:
1541    /* TGSI_OPCODE_RECIPSQRT */
1542       FETCH( func, *inst, 0, 0, CHAN_X );
1543       emit_rsqrt( func, 0, 0 );
1544       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1545          STORE( func, *inst, 0, 0, chan_index );
1546       }
1547       break;
1548
1549    case TGSI_OPCODE_EXP:
1550       assert( 0 );
1551       break;
1552
1553    case TGSI_OPCODE_LOG:
1554       assert( 0 );
1555       break;
1556
1557    case TGSI_OPCODE_MUL:
1558       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1559          FETCH( func, *inst, 0, 0, chan_index );
1560          FETCH( func, *inst, 1, 1, chan_index );
1561          emit_mul( func, 0, 1 );
1562          STORE( func, *inst, 0, 0, chan_index );
1563       }
1564       break;
1565
1566    case TGSI_OPCODE_ADD:
1567       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1568          FETCH( func, *inst, 0, 0, chan_index );
1569          FETCH( func, *inst, 1, 1, chan_index );
1570          emit_add( func, 0, 1 );
1571          STORE( func, *inst, 0, 0, chan_index );
1572       }
1573       break;
1574
1575    case TGSI_OPCODE_DP3:
1576    /* TGSI_OPCODE_DOT3 */
1577       FETCH( func, *inst, 0, 0, CHAN_X );
1578       FETCH( func, *inst, 1, 1, CHAN_X );
1579       emit_mul( func, 0, 1 );
1580       FETCH( func, *inst, 1, 0, CHAN_Y );
1581       FETCH( func, *inst, 2, 1, CHAN_Y );
1582       emit_mul( func, 1, 2 );
1583       emit_add( func, 0, 1 );
1584       FETCH( func, *inst, 1, 0, CHAN_Z );
1585       FETCH( func, *inst, 2, 1, CHAN_Z );
1586       emit_mul( func, 1, 2 );
1587       emit_add( func, 0, 1 );
1588       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1589          STORE( func, *inst, 0, 0, chan_index );
1590       }
1591       break;
1592
1593    case TGSI_OPCODE_DP4:
1594    /* TGSI_OPCODE_DOT4 */
1595       FETCH( func, *inst, 0, 0, CHAN_X );
1596       FETCH( func, *inst, 1, 1, CHAN_X );
1597       emit_mul( func, 0, 1 );
1598       FETCH( func, *inst, 1, 0, CHAN_Y );
1599       FETCH( func, *inst, 2, 1, CHAN_Y );
1600       emit_mul( func, 1, 2 );
1601       emit_add( func, 0, 1 );
1602       FETCH( func, *inst, 1, 0, CHAN_Z );
1603       FETCH( func, *inst, 2, 1, CHAN_Z );
1604       emit_mul(func, 1, 2 );
1605       emit_add(func, 0, 1 );
1606       FETCH( func, *inst, 1, 0, CHAN_W );
1607       FETCH( func, *inst, 2, 1, CHAN_W );
1608       emit_mul( func, 1, 2 );
1609       emit_add( func, 0, 1 );
1610       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1611          STORE( func, *inst, 0, 0, chan_index );
1612       }
1613       break;
1614
1615    case TGSI_OPCODE_DST:
1616       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1617          emit_tempf(
1618             func,
1619             0,
1620             TGSI_EXEC_TEMP_ONE_I,
1621             TGSI_EXEC_TEMP_ONE_C );
1622          STORE( func, *inst, 0, 0, CHAN_X );
1623       }
1624       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1625          FETCH( func, *inst, 0, 0, CHAN_Y );
1626          FETCH( func, *inst, 1, 1, CHAN_Y );
1627          emit_mul( func, 0, 1 );
1628          STORE( func, *inst, 0, 0, CHAN_Y );
1629       }
1630       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1631          FETCH( func, *inst, 0, 0, CHAN_Z );
1632          STORE( func, *inst, 0, 0, CHAN_Z );
1633       }
1634       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1635          FETCH( func, *inst, 0, 1, CHAN_W );
1636          STORE( func, *inst, 0, 0, CHAN_W );
1637       }
1638       break;
1639
1640    case TGSI_OPCODE_MIN:
1641       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1642          FETCH( func, *inst, 0, 0, chan_index );
1643          FETCH( func, *inst, 1, 1, chan_index );
1644          emit_minps(
1645             func,
1646             make_xmm( 0 ),
1647             make_xmm( 1 ) );
1648          STORE( func, *inst, 0, 0, chan_index );
1649       }
1650       break;
1651
1652    case TGSI_OPCODE_MAX:
1653       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1654          FETCH( func, *inst, 0, 0, chan_index );
1655          FETCH( func, *inst, 1, 1, chan_index );
1656          emit_maxps(
1657             func,
1658             make_xmm( 0 ),
1659             make_xmm( 1 ) );
1660          STORE( func, *inst, 0, 0, chan_index );
1661       }
1662       break;
1663
1664    case TGSI_OPCODE_SLT:
1665    /* TGSI_OPCODE_SETLT */
1666       emit_setcc( func, inst, cc_LessThan );
1667       break;
1668
1669    case TGSI_OPCODE_SGE:
1670    /* TGSI_OPCODE_SETGE */
1671       emit_setcc( func, inst, cc_NotLessThan );
1672       break;
1673
1674    case TGSI_OPCODE_MAD:
1675    /* TGSI_OPCODE_MADD */
1676       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1677          FETCH( func, *inst, 0, 0, chan_index );
1678          FETCH( func, *inst, 1, 1, chan_index );
1679          FETCH( func, *inst, 2, 2, chan_index );
1680          emit_mul( func, 0, 1 );
1681          emit_add( func, 0, 2 );
1682          STORE( func, *inst, 0, 0, chan_index );
1683       }
1684       break;
1685
1686    case TGSI_OPCODE_SUB:
1687       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1688          FETCH( func, *inst, 0, 0, chan_index );
1689          FETCH( func, *inst, 1, 1, chan_index );
1690          emit_sub( func, 0, 1 );
1691          STORE( func, *inst, 0, 0, chan_index );
1692       }
1693       break;
1694
1695    case TGSI_OPCODE_LERP:
1696    /* TGSI_OPCODE_LRP */
1697       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1698          FETCH( func, *inst, 0, 0, chan_index );
1699          FETCH( func, *inst, 1, 1, chan_index );
1700          FETCH( func, *inst, 2, 2, chan_index );
1701          emit_sub( func, 1, 2 );
1702          emit_mul( func, 0, 1 );
1703          emit_add( func, 0, 2 );
1704          STORE( func, *inst, 0, 0, chan_index );
1705       }
1706       break;
1707
1708    case TGSI_OPCODE_CND:
1709       assert( 0 );
1710       break;
1711
1712    case TGSI_OPCODE_CND0:
1713       assert( 0 );
1714       break;
1715
1716    case TGSI_OPCODE_DOT2ADD:
1717    /* TGSI_OPCODE_DP2A */
1718       assert( 0 );
1719       break;
1720
1721    case TGSI_OPCODE_INDEX:
1722       assert( 0 );
1723       break;
1724
1725    case TGSI_OPCODE_NEGATE:
1726       assert( 0 );
1727       break;
1728
1729    case TGSI_OPCODE_FRAC:
1730    /* TGSI_OPCODE_FRC */
1731       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1732          FETCH( func, *inst, 0, 0, chan_index );
1733          emit_frc( func, 0 );
1734          STORE( func, *inst, 0, 0, chan_index );
1735       }
1736       break;
1737
1738    case TGSI_OPCODE_CLAMP:
1739       assert( 0 );
1740       break;
1741
1742    case TGSI_OPCODE_FLOOR:
1743    /* TGSI_OPCODE_FLR */
1744       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1745          FETCH( func, *inst, 0, 0, chan_index );
1746          emit_flr( func, 0 );
1747          STORE( func, *inst, 0, 0, chan_index );
1748       }
1749       break;
1750
1751    case TGSI_OPCODE_ROUND:
1752       assert( 0 );
1753       break;
1754
1755    case TGSI_OPCODE_EXPBASE2:
1756    /* TGSI_OPCODE_EX2 */
1757       FETCH( func, *inst, 0, 0, CHAN_X );
1758       emit_ex2( func, 0 );
1759       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1760          STORE( func, *inst, 0, 0, chan_index );
1761       }
1762       break;
1763
1764    case TGSI_OPCODE_LOGBASE2:
1765    /* TGSI_OPCODE_LG2 */
1766       FETCH( func, *inst, 0, 0, CHAN_X );
1767       emit_lg2( func, 0 );
1768       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1769          STORE( func, *inst, 0, 0, chan_index );
1770       }
1771       break;
1772
1773    case TGSI_OPCODE_POWER:
1774    /* TGSI_OPCODE_POW */
1775       FETCH( func, *inst, 0, 0, CHAN_X );
1776       FETCH( func, *inst, 1, 1, CHAN_X );
1777       emit_pow( func, 0, 1 );
1778       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1779          STORE( func, *inst, 0, 0, chan_index );
1780       }
1781       break;
1782
1783    case TGSI_OPCODE_CROSSPRODUCT:
1784    /* TGSI_OPCODE_XPD */
1785       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1786           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1787          FETCH( func, *inst, 1, 1, CHAN_Z );
1788          FETCH( func, *inst, 3, 0, CHAN_Z );
1789       }
1790       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1791           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1792          FETCH( func, *inst, 0, 0, CHAN_Y );
1793          FETCH( func, *inst, 4, 1, CHAN_Y );
1794       }
1795       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1796          emit_MOV( func, 2, 0 );
1797          emit_mul( func, 2, 1 );
1798          emit_MOV( func, 5, 3 );
1799          emit_mul( func, 5, 4 );
1800          emit_sub( func, 2, 5 );
1801          STORE( func, *inst, 2, 0, CHAN_X );
1802       }
1803       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1804           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1805          FETCH( func, *inst, 2, 1, CHAN_X );
1806          FETCH( func, *inst, 5, 0, CHAN_X );
1807       }
1808       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1809          emit_mul( func, 3, 2 );
1810          emit_mul( func, 1, 5 );
1811          emit_sub( func, 3, 1 );
1812          STORE( func, *inst, 3, 0, CHAN_Y );
1813       }
1814       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1815          emit_mul( func, 5, 4 );
1816          emit_mul( func, 0, 2 );
1817          emit_sub( func, 5, 0 );
1818          STORE( func, *inst, 5, 0, CHAN_Z );
1819       }
1820       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1821          FETCH( func, *inst, 0, TGSI_EXEC_TEMP_ONE_I, TGSI_EXEC_TEMP_ONE_C );
1822          STORE( func, *inst, 0, 0, CHAN_W );
1823       }
1824       break;
1825
1826    case TGSI_OPCODE_MULTIPLYMATRIX:
1827       assert( 0 );
1828       break;
1829
1830    case TGSI_OPCODE_ABS:
1831       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1832          FETCH( func, *inst, 0, 0, chan_index );
1833          emit_abs( func, 0) ;
1834
1835          STORE( func, *inst, 0, 0, chan_index );
1836       }
1837       break;
1838
1839    case TGSI_OPCODE_RCC:
1840       assert( 0 );
1841       break;
1842
1843    case TGSI_OPCODE_DPH:
1844       FETCH( func, *inst, 0, 0, CHAN_X );
1845       FETCH( func, *inst, 1, 1, CHAN_X );
1846       emit_mul( func, 0, 1 );
1847       FETCH( func, *inst, 1, 0, CHAN_Y );
1848       FETCH( func, *inst, 2, 1, CHAN_Y );
1849       emit_mul( func, 1, 2 );
1850       emit_add( func, 0, 1 );
1851       FETCH( func, *inst, 1, 0, CHAN_Z );
1852       FETCH( func, *inst, 2, 1, CHAN_Z );
1853       emit_mul( func, 1, 2 );
1854       emit_add( func, 0, 1 );
1855       FETCH( func, *inst, 1, 1, CHAN_W );
1856       emit_add( func, 0, 1 );
1857       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1858          STORE( func, *inst, 0, 0, chan_index );
1859       }
1860       break;
1861
1862    case TGSI_OPCODE_COS:
1863       FETCH( func, *inst, 0, 0, CHAN_X );
1864       emit_cos( func, 0 );
1865       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1866          STORE( func, *inst, 0, 0, chan_index );
1867       }
1868       break;
1869
1870    case TGSI_OPCODE_DDX:
1871       assert( 0 );
1872       break;
1873
1874    case TGSI_OPCODE_DDY:
1875       assert( 0 );
1876       break;
1877
1878    case TGSI_OPCODE_KIL:
1879       emit_kil( func, &inst->FullSrcRegisters[0] );
1880       break;
1881
1882    case TGSI_OPCODE_PK2H:
1883       assert( 0 );
1884       break;
1885
1886    case TGSI_OPCODE_PK2US:
1887       assert( 0 );
1888       break;
1889
1890    case TGSI_OPCODE_PK4B:
1891       assert( 0 );
1892       break;
1893
1894    case TGSI_OPCODE_PK4UB:
1895       assert( 0 );
1896       break;
1897
1898    case TGSI_OPCODE_RFL:
1899       assert( 0 );
1900       break;
1901
1902    case TGSI_OPCODE_SEQ:
1903       assert( 0 );
1904       break;
1905
1906    case TGSI_OPCODE_SFL:
1907       assert( 0 );
1908       break;
1909
1910    case TGSI_OPCODE_SGT:
1911       assert( 0 );
1912       break;
1913
1914    case TGSI_OPCODE_SIN:
1915       FETCH( func, *inst, 0, 0, CHAN_X );
1916       emit_sin( func, 0 );
1917       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1918          STORE( func, *inst, 0, 0, chan_index );
1919       }
1920       break;
1921
1922    case TGSI_OPCODE_SLE:
1923       assert( 0 );
1924       break;
1925
1926    case TGSI_OPCODE_SNE:
1927       assert( 0 );
1928       break;
1929
1930    case TGSI_OPCODE_STR:
1931       assert( 0 );
1932       break;
1933
1934    case TGSI_OPCODE_TEX:
1935       emit_tempf(
1936          func,
1937          0,
1938          TGSI_EXEC_TEMP_ONE_I,
1939          TGSI_EXEC_TEMP_ONE_C );
1940       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1941          STORE( func, *inst, 0, 0, chan_index );
1942       }
1943       break;
1944
1945    case TGSI_OPCODE_TXD:
1946       assert( 0 );
1947       break;
1948
1949    case TGSI_OPCODE_UP2H:
1950       assert( 0 );
1951       break;
1952
1953    case TGSI_OPCODE_UP2US:
1954       assert( 0 );
1955       break;
1956
1957    case TGSI_OPCODE_UP4B:
1958       assert( 0 );
1959       break;
1960
1961    case TGSI_OPCODE_UP4UB:
1962       assert( 0 );
1963       break;
1964
1965    case TGSI_OPCODE_X2D:
1966       assert( 0 );
1967       break;
1968
1969    case TGSI_OPCODE_ARA:
1970       assert( 0 );
1971       break;
1972
1973    case TGSI_OPCODE_ARR:
1974       assert( 0 );
1975       break;
1976
1977    case TGSI_OPCODE_BRA:
1978       assert( 0 );
1979       break;
1980
1981    case TGSI_OPCODE_CAL:
1982       assert( 0 );
1983       break;
1984
1985    case TGSI_OPCODE_RET:
1986    case TGSI_OPCODE_END:
1987 #ifdef WIN32
1988       emit_retw( func, 16 );
1989 #else
1990       emit_ret( func );
1991 #endif
1992       break;
1993
1994    case TGSI_OPCODE_SSG:
1995       assert( 0 );
1996       break;
1997
1998    case TGSI_OPCODE_CMP:
1999       emit_cmp (func, inst);
2000       break;
2001
2002    case TGSI_OPCODE_SCS:
2003       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2004          FETCH( func, *inst, 0, 0, CHAN_X );
2005          emit_cos( func, 0 );
2006          STORE( func, *inst, 0, 0, CHAN_X );
2007       }
2008       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2009          FETCH( func, *inst, 0, 0, CHAN_Y );
2010          emit_sin( func, 0 );
2011          STORE( func, *inst, 0, 0, CHAN_Y );
2012       }
2013       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2014          FETCH( func, *inst, 0, TGSI_EXEC_TEMP_00000000_I, TGSI_EXEC_TEMP_00000000_C );
2015          STORE( func, *inst, 0, 0, CHAN_Z );
2016       }
2017       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2018          FETCH( func, *inst, 0, TGSI_EXEC_TEMP_ONE_I, TGSI_EXEC_TEMP_ONE_C );
2019          STORE( func, *inst, 0, 0, CHAN_W );
2020       }
2021       break;
2022
2023    case TGSI_OPCODE_TXB:
2024       assert( 0 );
2025       break;
2026
2027    case TGSI_OPCODE_NRM:
2028       assert( 0 );
2029       break;
2030
2031    case TGSI_OPCODE_DIV:
2032       assert( 0 );
2033       break;
2034
2035    case TGSI_OPCODE_DP2:
2036       assert( 0 );
2037       break;
2038
2039    case TGSI_OPCODE_TXL:
2040       assert( 0 );
2041       break;
2042
2043    case TGSI_OPCODE_BRK:
2044       assert( 0 );
2045       break;
2046
2047    case TGSI_OPCODE_IF:
2048       assert( 0 );
2049       break;
2050
2051    case TGSI_OPCODE_LOOP:
2052       assert( 0 );
2053       break;
2054
2055    case TGSI_OPCODE_REP:
2056       assert( 0 );
2057       break;
2058
2059    case TGSI_OPCODE_ELSE:
2060       assert( 0 );
2061       break;
2062
2063    case TGSI_OPCODE_ENDIF:
2064       assert( 0 );
2065       break;
2066
2067    case TGSI_OPCODE_ENDLOOP:
2068       assert( 0 );
2069       break;
2070
2071    case TGSI_OPCODE_ENDREP:
2072       assert( 0 );
2073       break;
2074
2075    case TGSI_OPCODE_PUSHA:
2076       assert( 0 );
2077       break;
2078
2079    case TGSI_OPCODE_POPA:
2080       assert( 0 );
2081       break;
2082
2083    case TGSI_OPCODE_CEIL:
2084       assert( 0 );
2085       break;
2086
2087    case TGSI_OPCODE_I2F:
2088       assert( 0 );
2089       break;
2090
2091    case TGSI_OPCODE_NOT:
2092       assert( 0 );
2093       break;
2094
2095    case TGSI_OPCODE_TRUNC:
2096       assert( 0 );
2097       break;
2098
2099    case TGSI_OPCODE_SHL:
2100       assert( 0 );
2101       break;
2102
2103    case TGSI_OPCODE_SHR:
2104       assert( 0 );
2105       break;
2106
2107    case TGSI_OPCODE_AND:
2108       assert( 0 );
2109       break;
2110
2111    case TGSI_OPCODE_OR:
2112       assert( 0 );
2113       break;
2114
2115    case TGSI_OPCODE_MOD:
2116       assert( 0 );
2117       break;
2118
2119    case TGSI_OPCODE_XOR:
2120       assert( 0 );
2121       break;
2122
2123    case TGSI_OPCODE_SAD:
2124       assert( 0 );
2125       break;
2126
2127    case TGSI_OPCODE_TXF:
2128       assert( 0 );
2129       break;
2130
2131    case TGSI_OPCODE_TXQ:
2132       assert( 0 );
2133       break;
2134
2135    case TGSI_OPCODE_CONT:
2136       assert( 0 );
2137       break;
2138
2139    case TGSI_OPCODE_EMIT:
2140       assert( 0 );
2141       break;
2142
2143    case TGSI_OPCODE_ENDPRIM:
2144       assert( 0 );
2145       break;
2146
2147    default:
2148       assert( 0 );
2149    }
2150 }
2151
2152 static void
2153 emit_declaration(
2154    struct x86_function *func,
2155    struct tgsi_full_declaration *decl )
2156 {
2157    if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2158       unsigned first, last, mask;
2159       unsigned i, j;
2160
2161       assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
2162
2163       first = decl->u.DeclarationRange.First;
2164       last = decl->u.DeclarationRange.Last;
2165       mask = decl->Declaration.UsageMask;
2166
2167       /* Do not touch WPOS.xy */
2168       if( first == 0 ) {
2169          mask &= ~TGSI_WRITEMASK_XY;
2170          if( mask == TGSI_WRITEMASK_NONE ) {
2171             first++;
2172          }
2173       }
2174
2175       for( i = first; i <= last; i++ ) {
2176          for( j = 0; j < NUM_CHANNELS; j++ ) {
2177             if( mask & (1 << j) ) {
2178                switch( decl->Interpolation.Interpolate ) {
2179                case TGSI_INTERPOLATE_CONSTANT:
2180                   emit_coef_a0( func, 0, i, j );
2181                   emit_inputs( func, 0, i, j );
2182                   break;
2183
2184                case TGSI_INTERPOLATE_LINEAR:
2185                   emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
2186                   emit_coef_dadx( func, 1, i, j );
2187                   emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
2188                   emit_coef_dady( func, 3, i, j );
2189                   emit_mul( func, 0, 1 );    /* x * dadx */
2190                   emit_coef_a0( func, 4, i, j );
2191                   emit_mul( func, 2, 3 );    /* y * dady */
2192                   emit_add( func, 0, 4 );    /* x * dadx + a0 */
2193                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
2194                   emit_inputs( func, 0, i, j );
2195                   break;
2196
2197                case TGSI_INTERPOLATE_PERSPECTIVE:
2198                   emit_inputf( func, 0, 0, TGSI_SWIZZLE_X );
2199                   emit_coef_dadx( func, 1, i, j );
2200                   emit_inputf( func, 2, 0, TGSI_SWIZZLE_Y );
2201                   emit_coef_dady( func, 3, i, j );
2202                   emit_mul( func, 0, 1 );    /* x * dadx */
2203                   emit_inputf( func, 4, 0, TGSI_SWIZZLE_W );
2204                   emit_coef_a0( func, 5, i, j );
2205                   emit_rcp( func, 4, 4 );    /* 1.0 / w */
2206                   emit_mul( func, 2, 3 );    /* y * dady */
2207                   emit_add( func, 0, 5 );    /* x * dadx + a0 */
2208                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
2209                   emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
2210                   emit_inputs( func, 0, i, j );
2211                   break;
2212
2213                default:
2214                   assert( 0 );
2215                }
2216             }
2217          }
2218       }
2219    }
2220 }
2221
2222 unsigned
2223 tgsi_emit_sse2(
2224    struct tgsi_token *tokens,
2225    struct x86_function *func )
2226 {
2227    struct tgsi_parse_context parse;
2228
2229    DUMP_START();
2230
2231    func->csr = func->store;
2232
2233    emit_mov(
2234       func,
2235       get_input_base(),
2236       get_argument( 0 ) );
2237    emit_mov(
2238       func,
2239       get_output_base(),
2240       get_argument( 1 ) );
2241    emit_mov(
2242       func,
2243       get_const_base(),
2244       get_argument( 2 ) );
2245    emit_mov(
2246       func,
2247       get_temp_base(),
2248       get_argument( 3 ) );
2249
2250    tgsi_parse_init( &parse, tokens );
2251
2252    while( !tgsi_parse_end_of_tokens( &parse ) ) {
2253       tgsi_parse_token( &parse );
2254
2255       switch( parse.FullToken.Token.Type ) {
2256       case TGSI_TOKEN_TYPE_DECLARATION:
2257          break;
2258
2259       case TGSI_TOKEN_TYPE_INSTRUCTION:
2260          emit_instruction(
2261             func,
2262             &parse.FullToken.FullInstruction );
2263          break;
2264
2265       case TGSI_TOKEN_TYPE_IMMEDIATE:
2266          /* XXX implement this */
2267          return 0;
2268
2269       default:
2270          assert( 0 );
2271       }
2272    }
2273
2274    tgsi_parse_free( &parse );
2275
2276    DUMP_END();
2277
2278    return 1;
2279 }
2280
2281 /**
2282  * Fragment shaders are responsible for interpolating shader inputs. Because on
2283  * x86 we have only 4 GP registers, and here we have 5 shader arguments (input,
2284  * output, const, temp and coef), the code is split into two phases --
2285  * DECLARATION and INSTRUCTION phase.
2286  * GP register holding the output argument is aliased with the coeff argument,
2287  * as outputs are not needed in the DECLARATION phase.
2288  */
2289 unsigned
2290 tgsi_emit_sse2_fs(
2291    struct tgsi_token *tokens,
2292    struct x86_function *func )
2293 {
2294    struct tgsi_parse_context parse;
2295    boolean instruction_phase = FALSE;
2296
2297    DUMP_START();
2298
2299    func->csr = func->store;
2300
2301    /* DECLARATION phase, do not load output argument. */
2302    emit_mov(
2303       func,
2304       get_input_base(),
2305       get_argument( 0 ) );
2306    emit_mov(
2307       func,
2308       get_const_base(),
2309       get_argument( 2 ) );
2310    emit_mov(
2311       func,
2312       get_temp_base(),
2313       get_argument( 3 ) );
2314    emit_mov(
2315       func,
2316       get_coef_base(),
2317       get_argument( 4 ) );
2318
2319    tgsi_parse_init( &parse, tokens );
2320
2321    while( !tgsi_parse_end_of_tokens( &parse ) ) {
2322       tgsi_parse_token( &parse );
2323
2324       switch( parse.FullToken.Token.Type ) {
2325       case TGSI_TOKEN_TYPE_DECLARATION:
2326          emit_declaration(
2327             func,
2328             &parse.FullToken.FullDeclaration );
2329          break;
2330
2331       case TGSI_TOKEN_TYPE_INSTRUCTION:
2332          if( !instruction_phase ) {
2333             /* INSTRUCTION phase, overwrite coeff with output. */
2334             instruction_phase = TRUE;
2335             emit_mov(
2336                func,
2337                get_output_base(),
2338                get_argument( 1 ) );
2339          }
2340          emit_instruction(
2341             func,
2342             &parse.FullToken.FullInstruction );
2343          break;
2344
2345       case TGSI_TOKEN_TYPE_IMMEDIATE:
2346          /* XXX implement this */
2347          assert(0);
2348          break;
2349
2350       default:
2351          assert( 0 );
2352       }
2353    }
2354
2355    tgsi_parse_free( &parse );
2356
2357    DUMP_END();
2358
2359    return 1;
2360 }
2361
2362 #endif /* i386 */