src/gallium/auxiliary/tgsi/tgsi_sse2.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28 #include "pipe/p_util.h"
  29 #include "pipe/p_shader_tokens.h"
  30 #include "tgsi/tgsi_parse.h"
  31 #include "tgsi/tgsi_util.h"
  32 #include "tgsi_exec.h"
  33 #include "tgsi_sse2.h"
  34
  35 #include "rtasm/rtasm_x86sse.h"
  36
  37 #ifdef PIPE_ARCH_X86
  38
/* Selects the code emitted by emit_rsqrt() for 1/sqrt():
 * non-zero refines the rsqrtps estimate with one Newton-Raphson step,
 * zero emits the raw rsqrtps estimate only.
 *
 * The refined path costs about 100fps (close to 10%) in gears.
 */
#define HIGH_PRECISION 1
  44
  45
/* Loop header iterating CHAN over every channel index [0, NUM_CHANNELS).
 * CHAN must be an assignable integer variable declared by the caller.
 */
#define FOR_EACH_CHANNEL( CHAN )\
   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

/* Non-zero when bit CHAN is set in the write mask of INST's first
 * destination register.
 */
#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/* Bare `if` header (no braces, no else) guarding the statement that follows. */
#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

/* Loop header that runs the following statement once per channel enabled in
 * the write mask of INST's first destination register.  Expands to a `for`
 * followed by a bare `if`, so the body should be a single braced block.
 */
#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
   FOR_EACH_CHANNEL( CHAN )\
      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

/* Channel indices, usable as write-mask bit positions and swizzle selectors. */
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3

/* Short aliases for the scratch and address slots of the temporary file. */
#define TEMP_R0   TGSI_EXEC_TEMP_R0
#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
  66
  67 /**
  68  * X86 utility functions.
  69  */
  70
  71 static struct x86_reg
  72 make_xmm(
  73    unsigned xmm )
  74 {
  75    return x86_make_reg(
  76       file_XMM,
  77       (enum x86_reg_name) xmm );
  78 }
  79
  80 /**
  81  * X86 register mapping helpers.
  82  */
  83
  84 static struct x86_reg
  85 get_const_base( void )
  86 {
  87    return x86_make_reg(
  88       file_REG32,
  89       reg_CX );
  90 }
  91
  92 static struct x86_reg
  93 get_input_base( void )
  94 {
  95    return x86_make_reg(
  96       file_REG32,
  97       reg_AX );
  98 }
  99
 100 static struct x86_reg
 101 get_output_base( void )
 102 {
 103    return x86_make_reg(
 104       file_REG32,
 105       reg_DX );
 106 }
 107
 108 static struct x86_reg
 109 get_temp_base( void )
 110 {
 111    return x86_make_reg(
 112       file_REG32,
 113       reg_BX );
 114 }
 115
 116 static struct x86_reg
 117 get_coef_base( void )
 118 {
 119    return get_output_base();
 120 }
 121
 122 static struct x86_reg
 123 get_immediate_base( void )
 124 {
 125    return x86_make_reg(
 126       file_REG32,
 127       reg_DI );
 128 }
 129
 130
 131 /**
 132  * Data access helpers.
 133  */
 134
 135
 136 static struct x86_reg
 137 get_immediate(
 138    unsigned vec,
 139    unsigned chan )
 140 {
 141    return x86_make_disp(
 142       get_immediate_base(),
 143       (vec * 4 + chan) * 4 );
 144 }
 145
 146 static struct x86_reg
 147 get_const(
 148    unsigned vec,
 149    unsigned chan )
 150 {
 151    return x86_make_disp(
 152       get_const_base(),
 153       (vec * 4 + chan) * 4 );
 154 }
 155
 156 static struct x86_reg
 157 get_input(
 158    unsigned vec,
 159    unsigned chan )
 160 {
 161    return x86_make_disp(
 162       get_input_base(),
 163       (vec * 4 + chan) * 16 );
 164 }
 165
 166 static struct x86_reg
 167 get_output(
 168    unsigned vec,
 169    unsigned chan )
 170 {
 171    return x86_make_disp(
 172       get_output_base(),
 173       (vec * 4 + chan) * 16 );
 174 }
 175
 176 static struct x86_reg
 177 get_temp(
 178    unsigned vec,
 179    unsigned chan )
 180 {
 181    return x86_make_disp(
 182       get_temp_base(),
 183       (vec * 4 + chan) * 16 );
 184 }
 185
 186 static struct x86_reg
 187 get_coef(
 188    unsigned vec,
 189    unsigned chan,
 190    unsigned member )
 191 {
 192    return x86_make_disp(
 193       get_coef_base(),
 194       ((vec * 3 + member) * 4 + chan) * 4 );
 195 }
 196
 197
 198 static void
 199 emit_ret(
 200    struct x86_function  *func )
 201 {
 202    x86_ret( func );
 203 }
 204
 205
 206 /**
 207  * Data fetch helpers.
 208  */
 209
 210 /**
 211  * Copy a shader constant to xmm register
 212  * \param xmm  the destination xmm register
 213  * \param vec  the src const buffer index
 214  * \param chan  src channel to fetch (X, Y, Z or W)
 215  */
 216 static void
 217 emit_const(
 218    struct x86_function *func,
 219    uint xmm,
 220    int vec,
 221    uint chan,
 222    uint indirect,
 223    uint indirectFile,
 224    int indirectIndex )
 225 {
 226    if (indirect) {
 227       struct x86_reg r0 = get_input_base();
 228       struct x86_reg r1 = get_output_base();
 229       uint i;
 230
 231       assert( indirectFile == TGSI_FILE_ADDRESS );
 232       assert( indirectIndex == 0 );
 233
 234       x86_push( func, r0 );
 235       x86_push( func, r1 );
 236
 237       for (i = 0; i < QUAD_SIZE; i++) {
 238          x86_lea( func, r0, get_const( vec, chan ) );
 239          x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
 240
 241          /* Quick hack to multiply by 16 -- need to add SHL to rtasm.
 242           */
 243          x86_add( func, r1, r1 );
 244          x86_add( func, r1, r1 );
 245          x86_add( func, r1, r1 );
 246          x86_add( func, r1, r1 );
 247
 248          x86_add( func, r0, r1 );
 249          x86_mov( func, r1, x86_deref( r0 ) );
 250          x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
 251       }
 252
 253       x86_pop( func, r1 );
 254       x86_pop( func, r0 );
 255
 256       sse_movaps(
 257          func,
 258          make_xmm( xmm ),
 259          get_temp( TEMP_R0, CHAN_X ) );
 260    }
 261    else {
 262       assert( vec >= 0 );
 263
 264       sse_movss(
 265          func,
 266          make_xmm( xmm ),
 267          get_const( vec, chan ) );
 268       sse_shufps(
 269          func,
 270          make_xmm( xmm ),
 271          make_xmm( xmm ),
 272          SHUF( 0, 0, 0, 0 ) );
 273    }
 274 }
 275
 276 static void
 277 emit_immediate(
 278    struct x86_function *func,
 279    unsigned xmm,
 280    unsigned vec,
 281    unsigned chan )
 282 {
 283    sse_movss(
 284       func,
 285       make_xmm( xmm ),
 286       get_immediate( vec, chan ) );
 287    sse_shufps(
 288       func,
 289       make_xmm( xmm ),
 290       make_xmm( xmm ),
 291       SHUF( 0, 0, 0, 0 ) );
 292 }
 293
 294
 295 /**
 296  * Copy a shader input to xmm register
 297  * \param xmm  the destination xmm register
 298  * \param vec  the src input attrib
 299  * \param chan  src channel to fetch (X, Y, Z or W)
 300  */
 301 static void
 302 emit_inputf(
 303    struct x86_function *func,
 304    unsigned xmm,
 305    unsigned vec,
 306    unsigned chan )
 307 {
 308    sse_movups(
 309       func,
 310       make_xmm( xmm ),
 311       get_input( vec, chan ) );
 312 }
 313
 314 /**
 315  * Store an xmm register to a shader output
 316  * \param xmm  the source xmm register
 317  * \param vec  the dest output attrib
 318  * \param chan  src dest channel to store (X, Y, Z or W)
 319  */
 320 static void
 321 emit_output(
 322    struct x86_function *func,
 323    unsigned xmm,
 324    unsigned vec,
 325    unsigned chan )
 326 {
 327    sse_movups(
 328       func,
 329       get_output( vec, chan ),
 330       make_xmm( xmm ) );
 331 }
 332
 333 /**
 334  * Copy a shader temporary to xmm register
 335  * \param xmm  the destination xmm register
 336  * \param vec  the src temp register
 337  * \param chan  src channel to fetch (X, Y, Z or W)
 338  */
 339 static void
 340 emit_tempf(
 341    struct x86_function *func,
 342    unsigned xmm,
 343    unsigned vec,
 344    unsigned chan )
 345 {
 346    sse_movaps(
 347       func,
 348       make_xmm( xmm ),
 349       get_temp( vec, chan ) );
 350 }
 351
 352 /**
 353  * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
 354  * \param xmm  the destination xmm register
 355  * \param vec  the src input/attribute coefficient index
 356  * \param chan  src channel to fetch (X, Y, Z or W)
 357  * \param member  0=a0, 1=dadx, 2=dady
 358  */
 359 static void
 360 emit_coef(
 361    struct x86_function *func,
 362    unsigned xmm,
 363    unsigned vec,
 364    unsigned chan,
 365    unsigned member )
 366 {
 367    sse_movss(
 368       func,
 369       make_xmm( xmm ),
 370       get_coef( vec, chan, member ) );
 371    sse_shufps(
 372       func,
 373       make_xmm( xmm ),
 374       make_xmm( xmm ),
 375       SHUF( 0, 0, 0, 0 ) );
 376 }
 377
 378 /**
 379  * Data store helpers.
 380  */
 381
 382 static void
 383 emit_inputs(
 384    struct x86_function *func,
 385    unsigned xmm,
 386    unsigned vec,
 387    unsigned chan )
 388 {
 389    sse_movups(
 390       func,
 391       get_input( vec, chan ),
 392       make_xmm( xmm ) );
 393 }
 394
 395 static void
 396 emit_temps(
 397    struct x86_function *func,
 398    unsigned xmm,
 399    unsigned vec,
 400    unsigned chan )
 401 {
 402    sse_movaps(
 403       func,
 404       get_temp( vec, chan ),
 405       make_xmm( xmm ) );
 406 }
 407
 408 static void
 409 emit_addrs(
 410    struct x86_function *func,
 411    unsigned xmm,
 412    unsigned vec,
 413    unsigned chan )
 414 {
 415    assert( vec == 0 );
 416
 417    emit_temps(
 418       func,
 419       xmm,
 420       vec + TGSI_EXEC_TEMP_ADDR,
 421       chan );
 422 }
 423
 424 /**
 * Coefficient fetch helpers.
 426  */
 427
 428 static void
 429 emit_coef_a0(
 430    struct x86_function *func,
 431    unsigned xmm,
 432    unsigned vec,
 433    unsigned chan )
 434 {
 435    emit_coef(
 436       func,
 437       xmm,
 438       vec,
 439       chan,
 440       0 );
 441 }
 442
 443 static void
 444 emit_coef_dadx(
 445    struct x86_function *func,
 446    unsigned xmm,
 447    unsigned vec,
 448    unsigned chan )
 449 {
 450    emit_coef(
 451       func,
 452       xmm,
 453       vec,
 454       chan,
 455       1 );
 456 }
 457
 458 static void
 459 emit_coef_dady(
 460    struct x86_function *func,
 461    unsigned xmm,
 462    unsigned vec,
 463    unsigned chan )
 464 {
 465    emit_coef(
 466       func,
 467       xmm,
 468       vec,
 469       chan,
 470       2 );
 471 }
 472
 473 /**
 474  * Function call helpers.
 475  */
 476
 477 static void
 478 emit_push_gp(
 479    struct x86_function *func )
 480 {
 481    x86_push(
 482       func,
 483       x86_make_reg( file_REG32, reg_AX) );
 484    x86_push(
 485       func,
 486       x86_make_reg( file_REG32, reg_CX) );
 487    x86_push(
 488       func,
 489       x86_make_reg( file_REG32, reg_DX) );
 490 }
 491
 492 static void
 493 x86_pop_gp(
 494    struct x86_function *func )
 495 {
 496    /* Restore GP registers in a reverse order.
 497     */
 498    x86_pop(
 499       func,
 500       x86_make_reg( file_REG32, reg_DX) );
 501    x86_pop(
 502       func,
 503       x86_make_reg( file_REG32, reg_CX) );
 504    x86_pop(
 505       func,
 506       x86_make_reg( file_REG32, reg_AX) );
 507 }
 508
 509 static void
 510 emit_func_call_dst(
 511    struct x86_function *func,
 512    unsigned xmm_dst,
 513    void (PIPE_CDECL *code)() )
 514 {
 515    sse_movaps(
 516       func,
 517       get_temp( TEMP_R0, 0 ),
 518       make_xmm( xmm_dst ) );
 519
 520    emit_push_gp(
 521       func );
 522
 523    {
 524       struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
 525
 526       x86_lea(
 527          func,
 528          ecx,
 529          get_temp( TEMP_R0, 0 ) );
 530
 531       x86_push( func, ecx );
 532       x86_mov_reg_imm( func, ecx, (unsigned long) code );
 533       x86_call( func, ecx );
 534       x86_pop(func, ecx );
 535    }
 536
 537
 538    x86_pop_gp(
 539       func );
 540
 541    sse_movaps(
 542       func,
 543       make_xmm( xmm_dst ),
 544       get_temp( TEMP_R0, 0 ) );
 545 }
 546
 547 static void
 548 emit_func_call_dst_src(
 549    struct x86_function *func,
 550    unsigned xmm_dst,
 551    unsigned xmm_src,
 552    void (PIPE_CDECL *code)() )
 553 {
 554    sse_movaps(
 555       func,
 556       get_temp( TEMP_R0, 1 ),
 557       make_xmm( xmm_src ) );
 558
 559    emit_func_call_dst(
 560       func,
 561       xmm_dst,
 562       code );
 563 }
 564
 565 /**
 566  * Low-level instruction translators.
 567  */
 568
 569 static void
 570 emit_abs(
 571    struct x86_function *func,
 572    unsigned xmm )
 573 {
 574    sse_andps(
 575       func,
 576       make_xmm( xmm ),
 577       get_temp(
 578          TGSI_EXEC_TEMP_7FFFFFFF_I,
 579          TGSI_EXEC_TEMP_7FFFFFFF_C ) );
 580 }
 581
 582 static void
 583 emit_add(
 584    struct x86_function *func,
 585    unsigned xmm_dst,
 586    unsigned xmm_src )
 587 {
 588    sse_addps(
 589       func,
 590       make_xmm( xmm_dst ),
 591       make_xmm( xmm_src ) );
 592 }
 593
 594 static void PIPE_CDECL
 595 cos4f(
 596    float *store )
 597 {
 598    const unsigned X = 0;
 599
 600    store[X + 0] = cosf( store[X + 0] );
 601    store[X + 1] = cosf( store[X + 1] );
 602    store[X + 2] = cosf( store[X + 2] );
 603    store[X + 3] = cosf( store[X + 3] );
 604 }
 605
 606 static void
 607 emit_cos(
 608    struct x86_function *func,
 609    unsigned xmm_dst )
 610 {
 611    emit_func_call_dst(
 612       func,
 613       xmm_dst,
 614       cos4f );
 615 }
 616
 617 static void PIPE_CDECL
 618 ex24f(
 619    float *store )
 620 {
 621    const unsigned X = 0;
 622
 623    store[X + 0] = powf( 2.0f, store[X + 0] );
 624    store[X + 1] = powf( 2.0f, store[X + 1] );
 625    store[X + 2] = powf( 2.0f, store[X + 2] );
 626    store[X + 3] = powf( 2.0f, store[X + 3] );
 627 }
 628
 629 static void
 630 emit_ex2(
 631    struct x86_function *func,
 632    unsigned xmm_dst )
 633 {
 634    emit_func_call_dst(
 635       func,
 636       xmm_dst,
 637       ex24f );
 638 }
 639
 640 static void
 641 emit_f2it(
 642    struct x86_function *func,
 643    unsigned xmm )
 644 {
 645    sse2_cvttps2dq(
 646       func,
 647       make_xmm( xmm ),
 648       make_xmm( xmm ) );
 649 }
 650
 651 static void PIPE_CDECL
 652 flr4f(
 653    float *store )
 654 {
 655    const unsigned X = 0;
 656
 657    store[X + 0] = floorf( store[X + 0] );
 658    store[X + 1] = floorf( store[X + 1] );
 659    store[X + 2] = floorf( store[X + 2] );
 660    store[X + 3] = floorf( store[X + 3] );
 661 }
 662
 663 static void
 664 emit_flr(
 665    struct x86_function *func,
 666    unsigned xmm_dst )
 667 {
 668    emit_func_call_dst(
 669       func,
 670       xmm_dst,
 671       flr4f );
 672 }
 673
 674 static void PIPE_CDECL
 675 frc4f(
 676    float *store )
 677 {
 678    const unsigned X = 0;
 679
 680    store[X + 0] -= floorf( store[X + 0] );
 681    store[X + 1] -= floorf( store[X + 1] );
 682    store[X + 2] -= floorf( store[X + 2] );
 683    store[X + 3] -= floorf( store[X + 3] );
 684 }
 685
 686 static void
 687 emit_frc(
 688    struct x86_function *func,
 689    unsigned xmm_dst )
 690 {
 691    emit_func_call_dst(
 692       func,
 693       xmm_dst,
 694       frc4f );
 695 }
 696
 697 static void PIPE_CDECL
 698 lg24f(
 699    float *store )
 700 {
 701    const unsigned X = 0;
 702
 703    store[X + 0] = LOG2( store[X + 0] );
 704    store[X + 1] = LOG2( store[X + 1] );
 705    store[X + 2] = LOG2( store[X + 2] );
 706    store[X + 3] = LOG2( store[X + 3] );
 707 }
 708
 709 static void
 710 emit_lg2(
 711    struct x86_function *func,
 712    unsigned xmm_dst )
 713 {
 714    emit_func_call_dst(
 715       func,
 716       xmm_dst,
 717       lg24f );
 718 }
 719
 720 static void
 721 emit_MOV(
 722    struct x86_function *func,
 723    unsigned xmm_dst,
 724    unsigned xmm_src )
 725 {
 726    sse_movups(
 727       func,
 728       make_xmm( xmm_dst ),
 729       make_xmm( xmm_src ) );
 730 }
 731
 732 static void
 733 emit_mul (struct x86_function *func,
 734           unsigned xmm_dst,
 735           unsigned xmm_src)
 736 {
 737    sse_mulps(
 738       func,
 739       make_xmm( xmm_dst ),
 740       make_xmm( xmm_src ) );
 741 }
 742
 743 static void
 744 emit_neg(
 745    struct x86_function *func,
 746    unsigned xmm )
 747 {
 748    sse_xorps(
 749       func,
 750       make_xmm( xmm ),
 751       get_temp(
 752          TGSI_EXEC_TEMP_80000000_I,
 753          TGSI_EXEC_TEMP_80000000_C ) );
 754 }
 755
 756 static void PIPE_CDECL
 757 pow4f(
 758    float *store )
 759 {
 760    const unsigned X = 0;
 761
 762    store[X + 0] = powf( store[X + 0], store[X + 4] );
 763    store[X + 1] = powf( store[X + 1], store[X + 5] );
 764    store[X + 2] = powf( store[X + 2], store[X + 6] );
 765    store[X + 3] = powf( store[X + 3], store[X + 7] );
 766 }
 767
 768 static void
 769 emit_pow(
 770    struct x86_function *func,
 771    unsigned xmm_dst,
 772    unsigned xmm_src )
 773 {
 774    emit_func_call_dst_src(
 775       func,
 776       xmm_dst,
 777       xmm_src,
 778       pow4f );
 779 }
 780
 781 static void
 782 emit_rcp (
 783    struct x86_function *func,
 784    unsigned xmm_dst,
 785    unsigned xmm_src )
 786 {
 787    /* On Intel CPUs at least, this is only accurate to 12 bits -- not
 788     * good enough.  Need to either emit a proper divide or use the
 789     * iterative technique described below in emit_rsqrt().
 790     */
 791    sse2_rcpps(
 792       func,
 793       make_xmm( xmm_dst ),
 794       make_xmm( xmm_src ) );
 795 }
 796
 797 static void
 798 emit_rsqrt(
 799    struct x86_function *func,
 800    unsigned xmm_dst,
 801    unsigned xmm_src )
 802 {
 803 #if HIGH_PRECISION
 804    /* Although rsqrtps() and rcpps() are low precision on some/all SSE
 805     * implementations, it is possible to improve its precision at
 806     * fairly low cost, using a newton/raphson step, as below:
 807     *
 808     * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
 809     * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
 810     *
 811     * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
 812     */
 813    {
 814       struct x86_reg dst = make_xmm( xmm_dst );
 815       struct x86_reg src = make_xmm( xmm_src );
 816       struct x86_reg tmp0 = make_xmm( 2 );
 817       struct x86_reg tmp1 = make_xmm( 3 );
 818
 819       assert( xmm_dst != xmm_src );
 820       assert( xmm_dst != 2 && xmm_dst != 3 );
 821       assert( xmm_src != 2 && xmm_src != 3 );
 822
 823       sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
 824       sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
 825       sse_rsqrtps( func, tmp1, src  );
 826       sse_mulps(   func, src,  tmp1 );
 827       sse_mulps(   func, dst,  tmp1 );
 828       sse_mulps(   func, src,  tmp1 );
 829       sse_subps(   func, tmp0, src  );
 830       sse_mulps(   func, dst,  tmp0 );
 831    }
 832 #else
 833    /* On Intel CPUs at least, this is only accurate to 12 bits -- not
 834     * good enough.
 835     */
 836    sse_rsqrtps(
 837       func,
 838       make_xmm( xmm_dst ),
 839       make_xmm( xmm_src ) );
 840 #endif
 841 }
 842
 843 static void
 844 emit_setsign(
 845    struct x86_function *func,
 846    unsigned xmm )
 847 {
 848    sse_orps(
 849       func,
 850       make_xmm( xmm ),
 851       get_temp(
 852          TGSI_EXEC_TEMP_80000000_I,
 853          TGSI_EXEC_TEMP_80000000_C ) );
 854 }
 855
 856 static void PIPE_CDECL
 857 sin4f(
 858    float *store )
 859 {
 860    const unsigned X = 0;
 861
 862    store[X + 0] = sinf( store[X + 0] );
 863    store[X + 1] = sinf( store[X + 1] );
 864    store[X + 2] = sinf( store[X + 2] );
 865    store[X + 3] = sinf( store[X + 3] );
 866 }
 867
 868 static void
 869 emit_sin (struct x86_function *func,
 870           unsigned xmm_dst)
 871 {
 872    emit_func_call_dst(
 873       func,
 874       xmm_dst,
 875       sin4f );
 876 }
 877
 878 static void
 879 emit_sub(
 880    struct x86_function *func,
 881    unsigned xmm_dst,
 882    unsigned xmm_src )
 883 {
 884    sse_subps(
 885       func,
 886       make_xmm( xmm_dst ),
 887       make_xmm( xmm_src ) );
 888 }
 889
 890 /**
 891  * Register fetch.
 892  */
 893
 894 static void
 895 emit_fetch(
 896    struct x86_function *func,
 897    unsigned xmm,
 898    const struct tgsi_full_src_register *reg,
 899    const unsigned chan_index )
 900 {
 901    unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
 902
 903    switch (swizzle) {
 904    case TGSI_EXTSWIZZLE_X:
 905    case TGSI_EXTSWIZZLE_Y:
 906    case TGSI_EXTSWIZZLE_Z:
 907    case TGSI_EXTSWIZZLE_W:
 908       switch (reg->SrcRegister.File) {
 909       case TGSI_FILE_CONSTANT:
 910          emit_const(
 911             func,
 912             xmm,
 913             reg->SrcRegister.Index,
 914             swizzle,
 915             reg->SrcRegister.Indirect,
 916             reg->SrcRegisterInd.File,
 917             reg->SrcRegisterInd.Index );
 918          break;
 919
 920       case TGSI_FILE_IMMEDIATE:
 921          emit_immediate(
 922             func,
 923             xmm,
 924             reg->SrcRegister.Index,
 925             swizzle );
 926          break;
 927
 928       case TGSI_FILE_INPUT:
 929          emit_inputf(
 930             func,
 931             xmm,
 932             reg->SrcRegister.Index,
 933             swizzle );
 934          break;
 935
 936       case TGSI_FILE_TEMPORARY:
 937          emit_tempf(
 938             func,
 939             xmm,
 940             reg->SrcRegister.Index,
 941             swizzle );
 942          break;
 943
 944       default:
 945          assert( 0 );
 946       }
 947       break;
 948
 949    case TGSI_EXTSWIZZLE_ZERO:
 950       emit_tempf(
 951          func,
 952          xmm,
 953          TGSI_EXEC_TEMP_00000000_I,
 954          TGSI_EXEC_TEMP_00000000_C );
 955       break;
 956
 957    case TGSI_EXTSWIZZLE_ONE:
 958       emit_tempf(
 959          func,
 960          xmm,
 961          TGSI_EXEC_TEMP_ONE_I,
 962          TGSI_EXEC_TEMP_ONE_C );
 963       break;
 964
 965    default:
 966       assert( 0 );
 967    }
 968
 969    switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
 970    case TGSI_UTIL_SIGN_CLEAR:
 971       emit_abs( func, xmm );
 972       break;
 973
 974    case TGSI_UTIL_SIGN_SET:
 975       emit_setsign( func, xmm );
 976       break;
 977
 978    case TGSI_UTIL_SIGN_TOGGLE:
 979       emit_neg( func, xmm );
 980       break;
 981
 982    case TGSI_UTIL_SIGN_KEEP:
 983       break;
 984    }
 985 }
 986
/* Convenience wrapper around emit_fetch(): loads channel CHAN of source
 * operand number INDEX of instruction INST into xmm register XMM.
 */
#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
 989
 990 /**
 991  * Register store.
 992  */
 993
 994 static void
 995 emit_store(
 996    struct x86_function *func,
 997    unsigned xmm,
 998    const struct tgsi_full_dst_register *reg,
 999    const struct tgsi_full_instruction *inst,
1000    unsigned chan_index )
1001 {
1002    switch( reg->DstRegister.File ) {
1003    case TGSI_FILE_OUTPUT:
1004       emit_output(
1005          func,
1006          xmm,
1007          reg->DstRegister.Index,
1008          chan_index );
1009       break;
1010
1011    case TGSI_FILE_TEMPORARY:
1012       emit_temps(
1013          func,
1014          xmm,
1015          reg->DstRegister.Index,
1016          chan_index );
1017       break;
1018
1019    case TGSI_FILE_ADDRESS:
1020       emit_addrs(
1021          func,
1022          xmm,
1023          reg->DstRegister.Index,
1024          chan_index );
1025       break;
1026
1027    default:
1028       assert( 0 );
1029    }
1030
1031    switch( inst->Instruction.Saturate ) {
1032    case TGSI_SAT_NONE:
1033       break;
1034
1035    case TGSI_SAT_ZERO_ONE:
1036       /* assert( 0 ); */
1037       break;
1038
1039    case TGSI_SAT_MINUS_PLUS_ONE:
1040       assert( 0 );
1041       break;
1042    }
1043 }
1044
/* Convenience wrapper around emit_store(): writes xmm register XMM to channel
 * CHAN of destination operand number INDEX of instruction INST.
 */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1047
1048 /**
1049  * High-level instruction translators.
1050  */
1051
1052 static void
1053 emit_kil(
1054    struct x86_function *func,
1055    const struct tgsi_full_src_register *reg )
1056 {
1057    unsigned uniquemask;
1058    unsigned registers[4];
1059    unsigned nextregister = 0;
1060    unsigned firstchan = ~0;
1061    unsigned chan_index;
1062
1063    /* This mask stores component bits that were already tested. Note that
1064     * we test if the value is less than zero, so 1.0 and 0.0 need not to be
1065     * tested. */
1066    uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
1067
1068    FOR_EACH_CHANNEL( chan_index ) {
1069       unsigned swizzle;
1070
1071       /* unswizzle channel */
1072       swizzle = tgsi_util_get_full_src_register_extswizzle(
1073          reg,
1074          chan_index );
1075
1076       /* check if the component has not been already tested */
1077       if( !(uniquemask & (1 << swizzle)) ) {
1078          uniquemask |= 1 << swizzle;
1079
1080          /* allocate register */
1081          registers[chan_index] = nextregister;
1082          emit_fetch(
1083             func,
1084             nextregister,
1085             reg,
1086             chan_index );
1087          nextregister++;
1088
1089          /* mark the first channel used */
1090          if( firstchan == ~0 ) {
1091             firstchan = chan_index;
1092          }
1093       }
1094    }
1095
1096    x86_push(
1097       func,
1098       x86_make_reg( file_REG32, reg_AX ) );
1099    x86_push(
1100       func,
1101       x86_make_reg( file_REG32, reg_DX ) );
1102
1103    FOR_EACH_CHANNEL( chan_index ) {
1104       if( uniquemask & (1 << chan_index) ) {
1105          sse_cmpps(
1106             func,
1107             make_xmm( registers[chan_index] ),
1108             get_temp(
1109                TGSI_EXEC_TEMP_00000000_I,
1110                TGSI_EXEC_TEMP_00000000_C ),
1111             cc_LessThan );
1112
1113          if( chan_index == firstchan ) {
1114             sse_pmovmskb(
1115                func,
1116                x86_make_reg( file_REG32, reg_AX ),
1117                make_xmm( registers[chan_index] ) );
1118          }
1119          else {
1120             sse_pmovmskb(
1121                func,
1122                x86_make_reg( file_REG32, reg_DX ),
1123                make_xmm( registers[chan_index] ) );
1124             x86_or(
1125                func,
1126                x86_make_reg( file_REG32, reg_AX ),
1127                x86_make_reg( file_REG32, reg_DX ) );
1128          }
1129       }
1130    }
1131
1132    x86_or(
1133       func,
1134       get_temp(
1135          TGSI_EXEC_TEMP_KILMASK_I,
1136          TGSI_EXEC_TEMP_KILMASK_C ),
1137       x86_make_reg( file_REG32, reg_AX ) );
1138
1139    x86_pop(
1140       func,
1141       x86_make_reg( file_REG32, reg_DX ) );
1142    x86_pop(
1143       func,
1144       x86_make_reg( file_REG32, reg_AX ) );
1145 }
1146
1147
/**
 * Predicated kill.  Not implemented: no code is emitted.
 */
static void
emit_kilp( struct x86_function *func )
{
   /* XXX todo / fix me */
   (void) func;
}
1154
1155
1156 static void
1157 emit_setcc(
1158    struct x86_function *func,
1159    struct tgsi_full_instruction *inst,
1160    enum sse_cc cc )
1161 {
1162    unsigned chan_index;
1163
1164    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1165       FETCH( func, *inst, 0, 0, chan_index );
1166       FETCH( func, *inst, 1, 1, chan_index );
1167       sse_cmpps(
1168          func,
1169          make_xmm( 0 ),
1170          make_xmm( 1 ),
1171          cc );
1172       sse_andps(
1173          func,
1174          make_xmm( 0 ),
1175          get_temp(
1176             TGSI_EXEC_TEMP_ONE_I,
1177             TGSI_EXEC_TEMP_ONE_C ) );
1178       STORE( func, *inst, 0, 0, chan_index );
1179    }
1180 }
1181
1182 static void
1183 emit_cmp(
1184    struct x86_function *func,
1185    struct tgsi_full_instruction *inst )
1186 {
1187    unsigned chan_index;
1188
1189    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1190       FETCH( func, *inst, 0, 0, chan_index );
1191       FETCH( func, *inst, 1, 1, chan_index );
1192       FETCH( func, *inst, 2, 2, chan_index );
1193       sse_cmpps(
1194          func,
1195          make_xmm( 0 ),
1196          get_temp(
1197             TGSI_EXEC_TEMP_00000000_I,
1198             TGSI_EXEC_TEMP_00000000_C ),
1199          cc_LessThan );
1200       sse_andps(
1201          func,
1202          make_xmm( 1 ),
1203          make_xmm( 0 ) );
1204       sse_andnps(
1205          func,
1206          make_xmm( 0 ),
1207          make_xmm( 2 ) );
1208       sse_orps(
1209          func,
1210          make_xmm( 0 ),
1211          make_xmm( 1 ) );
1212       STORE( func, *inst, 0, 0, chan_index );
1213    }
1214 }
1215
1216 static int
1217 emit_instruction(
1218    struct x86_function *func,
1219    struct tgsi_full_instruction *inst )
1220 {
1221    unsigned chan_index;
1222
1223    switch (inst->Instruction.Opcode) {
1224    case TGSI_OPCODE_ARL:
1225       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1226          FETCH( func, *inst, 0, 0, chan_index );
1227          emit_f2it( func, 0 );
1228          STORE( func, *inst, 0, 0, chan_index );
1229       }
1230       break;
1231
1232    case TGSI_OPCODE_MOV:
1233    case TGSI_OPCODE_SWZ:
1234       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1235          FETCH( func, *inst, 0, 0, chan_index );
1236          STORE( func, *inst, 0, 0, chan_index );
1237       }
1238       break;
1239
1240    case TGSI_OPCODE_LIT:
1241       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1242           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1243          emit_tempf(
1244             func,
1245             0,
1246             TGSI_EXEC_TEMP_ONE_I,
1247             TGSI_EXEC_TEMP_ONE_C);
1248          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1249             STORE( func, *inst, 0, 0, CHAN_X );
1250          }
1251          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1252             STORE( func, *inst, 0, 0, CHAN_W );
1253          }
1254       }
1255       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1256           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1257          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1258             FETCH( func, *inst, 0, 0, CHAN_X );
1259             sse_maxps(
1260                func,
1261                make_xmm( 0 ),
1262                get_temp(
1263                   TGSI_EXEC_TEMP_00000000_I,
1264                   TGSI_EXEC_TEMP_00000000_C ) );
1265             STORE( func, *inst, 0, 0, CHAN_Y );
1266          }
1267          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1268             /* XMM[1] = SrcReg[0].yyyy */
1269             FETCH( func, *inst, 1, 0, CHAN_Y );
1270             /* XMM[1] = max(XMM[1], 0) */
1271             sse_maxps(
1272                func,
1273                make_xmm( 1 ),
1274                get_temp(
1275                   TGSI_EXEC_TEMP_00000000_I,
1276                   TGSI_EXEC_TEMP_00000000_C ) );
1277             /* XMM[2] = SrcReg[0].wwww */
1278             FETCH( func, *inst, 2, 0, CHAN_W );
1279             /* XMM[2] = min(XMM[2], 128.0) */
1280             sse_minps(
1281                func,
1282                make_xmm( 2 ),
1283                get_temp(
1284                   TGSI_EXEC_TEMP_128_I,
1285                   TGSI_EXEC_TEMP_128_C ) );
1286             /* XMM[2] = max(XMM[2], -128.0) */
1287             sse_maxps(
1288                func,
1289                make_xmm( 2 ),
1290                get_temp(
1291                   TGSI_EXEC_TEMP_MINUS_128_I,
1292                   TGSI_EXEC_TEMP_MINUS_128_C ) );
1293             emit_pow( func, 1, 2 );
1294             FETCH( func, *inst, 0, 0, CHAN_X );
1295             sse_xorps(
1296                func,
1297                make_xmm( 2 ),
1298                make_xmm( 2 ) );
1299             sse_cmpps(
1300                func,
1301                make_xmm( 2 ),
1302                make_xmm( 0 ),
1303                cc_LessThanEqual );
1304             sse_andps(
1305                func,
1306                make_xmm( 2 ),
1307                make_xmm( 1 ) );
1308             STORE( func, *inst, 2, 0, CHAN_Z );
1309          }
1310       }
1311       break;
1312
1313    case TGSI_OPCODE_RCP:
1314    /* TGSI_OPCODE_RECIP */
1315       FETCH( func, *inst, 0, 0, CHAN_X );
1316       emit_rcp( func, 0, 0 );
1317       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1318          STORE( func, *inst, 0, 0, chan_index );
1319       }
1320       break;
1321
1322    case TGSI_OPCODE_RSQ:
1323    /* TGSI_OPCODE_RECIPSQRT */
1324       FETCH( func, *inst, 0, 0, CHAN_X );
1325       emit_rsqrt( func, 1, 0 );
1326       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1327          STORE( func, *inst, 1, 0, chan_index );
1328       }
1329       break;
1330
1331    case TGSI_OPCODE_EXP:
1332       return 0;
1333       break;
1334
1335    case TGSI_OPCODE_LOG:
1336       return 0;
1337       break;
1338
1339    case TGSI_OPCODE_MUL:
1340       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1341          FETCH( func, *inst, 0, 0, chan_index );
1342          FETCH( func, *inst, 1, 1, chan_index );
1343          emit_mul( func, 0, 1 );
1344          STORE( func, *inst, 0, 0, chan_index );
1345       }
1346       break;
1347
1348    case TGSI_OPCODE_ADD:
1349       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1350          FETCH( func, *inst, 0, 0, chan_index );
1351          FETCH( func, *inst, 1, 1, chan_index );
1352          emit_add( func, 0, 1 );
1353          STORE( func, *inst, 0, 0, chan_index );
1354       }
1355       break;
1356
1357    case TGSI_OPCODE_DP3:
1358    /* TGSI_OPCODE_DOT3 */
1359       FETCH( func, *inst, 0, 0, CHAN_X );
1360       FETCH( func, *inst, 1, 1, CHAN_X );
1361       emit_mul( func, 0, 1 );
1362       FETCH( func, *inst, 1, 0, CHAN_Y );
1363       FETCH( func, *inst, 2, 1, CHAN_Y );
1364       emit_mul( func, 1, 2 );
1365       emit_add( func, 0, 1 );
1366       FETCH( func, *inst, 1, 0, CHAN_Z );
1367       FETCH( func, *inst, 2, 1, CHAN_Z );
1368       emit_mul( func, 1, 2 );
1369       emit_add( func, 0, 1 );
1370       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1371          STORE( func, *inst, 0, 0, chan_index );
1372       }
1373       break;
1374
1375    case TGSI_OPCODE_DP4:
1376    /* TGSI_OPCODE_DOT4 */
1377       FETCH( func, *inst, 0, 0, CHAN_X );
1378       FETCH( func, *inst, 1, 1, CHAN_X );
1379       emit_mul( func, 0, 1 );
1380       FETCH( func, *inst, 1, 0, CHAN_Y );
1381       FETCH( func, *inst, 2, 1, CHAN_Y );
1382       emit_mul( func, 1, 2 );
1383       emit_add( func, 0, 1 );
1384       FETCH( func, *inst, 1, 0, CHAN_Z );
1385       FETCH( func, *inst, 2, 1, CHAN_Z );
1386       emit_mul(func, 1, 2 );
1387       emit_add(func, 0, 1 );
1388       FETCH( func, *inst, 1, 0, CHAN_W );
1389       FETCH( func, *inst, 2, 1, CHAN_W );
1390       emit_mul( func, 1, 2 );
1391       emit_add( func, 0, 1 );
1392       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1393          STORE( func, *inst, 0, 0, chan_index );
1394       }
1395       break;
1396
1397    case TGSI_OPCODE_DST:
1398       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1399          emit_tempf(
1400             func,
1401             0,
1402             TGSI_EXEC_TEMP_ONE_I,
1403             TGSI_EXEC_TEMP_ONE_C );
1404          STORE( func, *inst, 0, 0, CHAN_X );
1405       }
1406       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1407          FETCH( func, *inst, 0, 0, CHAN_Y );
1408          FETCH( func, *inst, 1, 1, CHAN_Y );
1409          emit_mul( func, 0, 1 );
1410          STORE( func, *inst, 0, 0, CHAN_Y );
1411       }
1412       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1413          FETCH( func, *inst, 0, 0, CHAN_Z );
1414          STORE( func, *inst, 0, 0, CHAN_Z );
1415       }
1416       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1417          FETCH( func, *inst, 0, 1, CHAN_W );
1418          STORE( func, *inst, 0, 0, CHAN_W );
1419       }
1420       break;
1421
1422    case TGSI_OPCODE_MIN:
1423       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1424          FETCH( func, *inst, 0, 0, chan_index );
1425          FETCH( func, *inst, 1, 1, chan_index );
1426          sse_minps(
1427             func,
1428             make_xmm( 0 ),
1429             make_xmm( 1 ) );
1430          STORE( func, *inst, 0, 0, chan_index );
1431       }
1432       break;
1433
1434    case TGSI_OPCODE_MAX:
1435       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1436          FETCH( func, *inst, 0, 0, chan_index );
1437          FETCH( func, *inst, 1, 1, chan_index );
1438          sse_maxps(
1439             func,
1440             make_xmm( 0 ),
1441             make_xmm( 1 ) );
1442          STORE( func, *inst, 0, 0, chan_index );
1443       }
1444       break;
1445
1446    case TGSI_OPCODE_SLT:
1447    /* TGSI_OPCODE_SETLT */
1448       emit_setcc( func, inst, cc_LessThan );
1449       break;
1450
1451    case TGSI_OPCODE_SGE:
1452    /* TGSI_OPCODE_SETGE */
1453       emit_setcc( func, inst, cc_NotLessThan );
1454       break;
1455
1456    case TGSI_OPCODE_MAD:
1457    /* TGSI_OPCODE_MADD */
1458       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1459          FETCH( func, *inst, 0, 0, chan_index );
1460          FETCH( func, *inst, 1, 1, chan_index );
1461          FETCH( func, *inst, 2, 2, chan_index );
1462          emit_mul( func, 0, 1 );
1463          emit_add( func, 0, 2 );
1464          STORE( func, *inst, 0, 0, chan_index );
1465       }
1466       break;
1467
1468    case TGSI_OPCODE_SUB:
1469       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1470          FETCH( func, *inst, 0, 0, chan_index );
1471          FETCH( func, *inst, 1, 1, chan_index );
1472          emit_sub( func, 0, 1 );
1473          STORE( func, *inst, 0, 0, chan_index );
1474       }
1475       break;
1476
1477    case TGSI_OPCODE_LERP:
1478    /* TGSI_OPCODE_LRP */
1479       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1480          FETCH( func, *inst, 0, 0, chan_index );
1481          FETCH( func, *inst, 1, 1, chan_index );
1482          FETCH( func, *inst, 2, 2, chan_index );
1483          emit_sub( func, 1, 2 );
1484          emit_mul( func, 0, 1 );
1485          emit_add( func, 0, 2 );
1486          STORE( func, *inst, 0, 0, chan_index );
1487       }
1488       break;
1489
1490    case TGSI_OPCODE_CND:
1491       return 0;
1492       break;
1493
1494    case TGSI_OPCODE_CND0:
1495       return 0;
1496       break;
1497
1498    case TGSI_OPCODE_DOT2ADD:
1499    /* TGSI_OPCODE_DP2A */
1500       return 0;
1501       break;
1502
1503    case TGSI_OPCODE_INDEX:
1504       return 0;
1505       break;
1506
1507    case TGSI_OPCODE_NEGATE:
1508       return 0;
1509       break;
1510
1511    case TGSI_OPCODE_FRAC:
1512    /* TGSI_OPCODE_FRC */
1513       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1514          FETCH( func, *inst, 0, 0, chan_index );
1515          emit_frc( func, 0 );
1516          STORE( func, *inst, 0, 0, chan_index );
1517       }
1518       break;
1519
1520    case TGSI_OPCODE_CLAMP:
1521       return 0;
1522       break;
1523
1524    case TGSI_OPCODE_FLOOR:
1525    /* TGSI_OPCODE_FLR */
1526       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1527          FETCH( func, *inst, 0, 0, chan_index );
1528          emit_flr( func, 0 );
1529          STORE( func, *inst, 0, 0, chan_index );
1530       }
1531       break;
1532
1533    case TGSI_OPCODE_ROUND:
1534       return 0;
1535       break;
1536
1537    case TGSI_OPCODE_EXPBASE2:
1538    /* TGSI_OPCODE_EX2 */
1539       FETCH( func, *inst, 0, 0, CHAN_X );
1540       emit_ex2( func, 0 );
1541       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1542          STORE( func, *inst, 0, 0, chan_index );
1543       }
1544       break;
1545
1546    case TGSI_OPCODE_LOGBASE2:
1547    /* TGSI_OPCODE_LG2 */
1548       FETCH( func, *inst, 0, 0, CHAN_X );
1549       emit_lg2( func, 0 );
1550       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1551          STORE( func, *inst, 0, 0, chan_index );
1552       }
1553       break;
1554
1555    case TGSI_OPCODE_POWER:
1556    /* TGSI_OPCODE_POW */
1557       FETCH( func, *inst, 0, 0, CHAN_X );
1558       FETCH( func, *inst, 1, 1, CHAN_X );
1559       emit_pow( func, 0, 1 );
1560       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1561          STORE( func, *inst, 0, 0, chan_index );
1562       }
1563       break;
1564
1565    case TGSI_OPCODE_CROSSPRODUCT:
1566    /* TGSI_OPCODE_XPD */
1567       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1568           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1569          FETCH( func, *inst, 1, 1, CHAN_Z );
1570          FETCH( func, *inst, 3, 0, CHAN_Z );
1571       }
1572       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1573           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1574          FETCH( func, *inst, 0, 0, CHAN_Y );
1575          FETCH( func, *inst, 4, 1, CHAN_Y );
1576       }
1577       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1578          emit_MOV( func, 2, 0 );
1579          emit_mul( func, 2, 1 );
1580          emit_MOV( func, 5, 3 );
1581          emit_mul( func, 5, 4 );
1582          emit_sub( func, 2, 5 );
1583          STORE( func, *inst, 2, 0, CHAN_X );
1584       }
1585       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1586           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1587          FETCH( func, *inst, 2, 1, CHAN_X );
1588          FETCH( func, *inst, 5, 0, CHAN_X );
1589       }
1590       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1591          emit_mul( func, 3, 2 );
1592          emit_mul( func, 1, 5 );
1593          emit_sub( func, 3, 1 );
1594          STORE( func, *inst, 3, 0, CHAN_Y );
1595       }
1596       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1597          emit_mul( func, 5, 4 );
1598          emit_mul( func, 0, 2 );
1599          emit_sub( func, 5, 0 );
1600          STORE( func, *inst, 5, 0, CHAN_Z );
1601       }
1602       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1603          emit_tempf(
1604             func,
1605             0,
1606             TGSI_EXEC_TEMP_ONE_I,
1607             TGSI_EXEC_TEMP_ONE_C );
1608          STORE( func, *inst, 0, 0, CHAN_W );
1609       }
1610       break;
1611
1612    case TGSI_OPCODE_MULTIPLYMATRIX:
1613       return 0;
1614       break;
1615
1616    case TGSI_OPCODE_ABS:
1617       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1618          FETCH( func, *inst, 0, 0, chan_index );
1619          emit_abs( func, 0) ;
1620
1621          STORE( func, *inst, 0, 0, chan_index );
1622       }
1623       break;
1624
1625    case TGSI_OPCODE_RCC:
1626       return 0;
1627       break;
1628
1629    case TGSI_OPCODE_DPH:
1630       FETCH( func, *inst, 0, 0, CHAN_X );
1631       FETCH( func, *inst, 1, 1, CHAN_X );
1632       emit_mul( func, 0, 1 );
1633       FETCH( func, *inst, 1, 0, CHAN_Y );
1634       FETCH( func, *inst, 2, 1, CHAN_Y );
1635       emit_mul( func, 1, 2 );
1636       emit_add( func, 0, 1 );
1637       FETCH( func, *inst, 1, 0, CHAN_Z );
1638       FETCH( func, *inst, 2, 1, CHAN_Z );
1639       emit_mul( func, 1, 2 );
1640       emit_add( func, 0, 1 );
1641       FETCH( func, *inst, 1, 1, CHAN_W );
1642       emit_add( func, 0, 1 );
1643       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1644          STORE( func, *inst, 0, 0, chan_index );
1645       }
1646       break;
1647
1648    case TGSI_OPCODE_COS:
1649       FETCH( func, *inst, 0, 0, CHAN_X );
1650       emit_cos( func, 0 );
1651       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1652          STORE( func, *inst, 0, 0, chan_index );
1653       }
1654       break;
1655
1656    case TGSI_OPCODE_DDX:
1657       return 0;
1658       break;
1659
1660    case TGSI_OPCODE_DDY:
1661       return 0;
1662       break;
1663
1664    case TGSI_OPCODE_KILP:
1665       /* predicated kill */
1666       emit_kilp( func );
1667       return 0; /* XXX fix me */
1668       break;
1669
1670    case TGSI_OPCODE_KIL:
1671       /* conditional kill */
1672       emit_kil( func, &inst->FullSrcRegisters[0] );
1673       break;
1674
1675    case TGSI_OPCODE_PK2H:
1676       return 0;
1677       break;
1678
1679    case TGSI_OPCODE_PK2US:
1680       return 0;
1681       break;
1682
1683    case TGSI_OPCODE_PK4B:
1684       return 0;
1685       break;
1686
1687    case TGSI_OPCODE_PK4UB:
1688       return 0;
1689       break;
1690
1691    case TGSI_OPCODE_RFL:
1692       return 0;
1693       break;
1694
1695    case TGSI_OPCODE_SEQ:
1696       return 0;
1697       break;
1698
1699    case TGSI_OPCODE_SFL:
1700       return 0;
1701       break;
1702
1703    case TGSI_OPCODE_SGT:
1704       return 0;
1705       break;
1706
1707    case TGSI_OPCODE_SIN:
1708       FETCH( func, *inst, 0, 0, CHAN_X );
1709       emit_sin( func, 0 );
1710       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1711          STORE( func, *inst, 0, 0, chan_index );
1712       }
1713       break;
1714
1715    case TGSI_OPCODE_SLE:
1716       return 0;
1717       break;
1718
1719    case TGSI_OPCODE_SNE:
1720       return 0;
1721       break;
1722
1723    case TGSI_OPCODE_STR:
1724       return 0;
1725       break;
1726
1727    case TGSI_OPCODE_TEX:
1728       if (0) {
1729          /* Disable dummy texture code:
1730           */
1731          emit_tempf(
1732             func,
1733             0,
1734             TGSI_EXEC_TEMP_ONE_I,
1735             TGSI_EXEC_TEMP_ONE_C );
1736          FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1737             STORE( func, *inst, 0, 0, chan_index );
1738          }
1739       }
1740       else {
1741          return 0;
1742       }
1743       break;
1744
1745    case TGSI_OPCODE_TXD:
1746       return 0;
1747       break;
1748
1749    case TGSI_OPCODE_UP2H:
1750       return 0;
1751       break;
1752
1753    case TGSI_OPCODE_UP2US:
1754       return 0;
1755       break;
1756
1757    case TGSI_OPCODE_UP4B:
1758       return 0;
1759       break;
1760
1761    case TGSI_OPCODE_UP4UB:
1762       return 0;
1763       break;
1764
1765    case TGSI_OPCODE_X2D:
1766       return 0;
1767       break;
1768
1769    case TGSI_OPCODE_ARA:
1770       return 0;
1771       break;
1772
1773    case TGSI_OPCODE_ARR:
1774       return 0;
1775       break;
1776
1777    case TGSI_OPCODE_BRA:
1778       return 0;
1779       break;
1780
1781    case TGSI_OPCODE_CAL:
1782       return 0;
1783       break;
1784
1785    case TGSI_OPCODE_RET:
1786       emit_ret( func );
1787       break;
1788
1789    case TGSI_OPCODE_END:
1790       break;
1791
1792    case TGSI_OPCODE_SSG:
1793       return 0;
1794       break;
1795
1796    case TGSI_OPCODE_CMP:
1797       emit_cmp (func, inst);
1798       break;
1799
1800    case TGSI_OPCODE_SCS:
1801       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1802          FETCH( func, *inst, 0, 0, CHAN_X );
1803          emit_cos( func, 0 );
1804          STORE( func, *inst, 0, 0, CHAN_X );
1805       }
1806       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1807          FETCH( func, *inst, 0, 0, CHAN_X );
1808          emit_sin( func, 0 );
1809          STORE( func, *inst, 0, 0, CHAN_Y );
1810       }
1811       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1812          emit_tempf(
1813             func,
1814             0,
1815             TGSI_EXEC_TEMP_00000000_I,
1816             TGSI_EXEC_TEMP_00000000_C );
1817          STORE( func, *inst, 0, 0, CHAN_Z );
1818       }
1819       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1820          emit_tempf(
1821             func,
1822             0,
1823             TGSI_EXEC_TEMP_ONE_I,
1824             TGSI_EXEC_TEMP_ONE_C );
1825          STORE( func, *inst, 0, 0, CHAN_W );
1826       }
1827       break;
1828
1829    case TGSI_OPCODE_TXB:
1830       return 0;
1831       break;
1832
1833    case TGSI_OPCODE_NRM:
1834       return 0;
1835       break;
1836
1837    case TGSI_OPCODE_DIV:
1838       return 0;
1839       break;
1840
1841    case TGSI_OPCODE_DP2:
1842       return 0;
1843       break;
1844
1845    case TGSI_OPCODE_TXL:
1846       return 0;
1847       break;
1848
1849    case TGSI_OPCODE_BRK:
1850       return 0;
1851       break;
1852
1853    case TGSI_OPCODE_IF:
1854       return 0;
1855       break;
1856
1857    case TGSI_OPCODE_LOOP:
1858       return 0;
1859       break;
1860
1861    case TGSI_OPCODE_REP:
1862       return 0;
1863       break;
1864
1865    case TGSI_OPCODE_ELSE:
1866       return 0;
1867       break;
1868
1869    case TGSI_OPCODE_ENDIF:
1870       return 0;
1871       break;
1872
1873    case TGSI_OPCODE_ENDLOOP:
1874       return 0;
1875       break;
1876
1877    case TGSI_OPCODE_ENDREP:
1878       return 0;
1879       break;
1880
1881    case TGSI_OPCODE_PUSHA:
1882       return 0;
1883       break;
1884
1885    case TGSI_OPCODE_POPA:
1886       return 0;
1887       break;
1888
1889    case TGSI_OPCODE_CEIL:
1890       return 0;
1891       break;
1892
1893    case TGSI_OPCODE_I2F:
1894       return 0;
1895       break;
1896
1897    case TGSI_OPCODE_NOT:
1898       return 0;
1899       break;
1900
1901    case TGSI_OPCODE_TRUNC:
1902       return 0;
1903       break;
1904
1905    case TGSI_OPCODE_SHL:
1906       return 0;
1907       break;
1908
1909    case TGSI_OPCODE_SHR:
1910       return 0;
1911       break;
1912
1913    case TGSI_OPCODE_AND:
1914       return 0;
1915       break;
1916
1917    case TGSI_OPCODE_OR:
1918       return 0;
1919       break;
1920
1921    case TGSI_OPCODE_MOD:
1922       return 0;
1923       break;
1924
1925    case TGSI_OPCODE_XOR:
1926       return 0;
1927       break;
1928
1929    case TGSI_OPCODE_SAD:
1930       return 0;
1931       break;
1932
1933    case TGSI_OPCODE_TXF:
1934       return 0;
1935       break;
1936
1937    case TGSI_OPCODE_TXQ:
1938       return 0;
1939       break;
1940
1941    case TGSI_OPCODE_CONT:
1942       return 0;
1943       break;
1944
1945    case TGSI_OPCODE_EMIT:
1946       return 0;
1947       break;
1948
1949    case TGSI_OPCODE_ENDPRIM:
1950       return 0;
1951       break;
1952
1953    default:
1954       return 0;
1955    }
1956
1957    return 1;
1958 }
1959
1960 static void
1961 emit_declaration(
1962    struct x86_function *func,
1963    struct tgsi_full_declaration *decl )
1964 {
1965    if( decl->Declaration.File == TGSI_FILE_INPUT ) {
1966       unsigned first, last, mask;
1967       unsigned i, j;
1968
1969       first = decl->DeclarationRange.First;
1970       last = decl->DeclarationRange.Last;
1971       mask = decl->Declaration.UsageMask;
1972
1973       for( i = first; i <= last; i++ ) {
1974          for( j = 0; j < NUM_CHANNELS; j++ ) {
1975             if( mask & (1 << j) ) {
1976                switch( decl->Declaration.Interpolate ) {
1977                case TGSI_INTERPOLATE_CONSTANT:
1978                   emit_coef_a0( func, 0, i, j );
1979                   emit_inputs( func, 0, i, j );
1980                   break;
1981
1982                case TGSI_INTERPOLATE_LINEAR:
1983                   emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
1984                   emit_coef_dadx( func, 1, i, j );
1985                   emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
1986                   emit_coef_dady( func, 3, i, j );
1987                   emit_mul( func, 0, 1 );    /* x * dadx */
1988                   emit_coef_a0( func, 4, i, j );
1989                   emit_mul( func, 2, 3 );    /* y * dady */
1990                   emit_add( func, 0, 4 );    /* x * dadx + a0 */
1991                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
1992                   emit_inputs( func, 0, i, j );
1993                   break;
1994
1995                case TGSI_INTERPOLATE_PERSPECTIVE:
1996                   emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
1997                   emit_coef_dadx( func, 1, i, j );
1998                   emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
1999                   emit_coef_dady( func, 3, i, j );
2000                   emit_mul( func, 0, 1 );    /* x * dadx */
2001                   emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2002                   emit_coef_a0( func, 5, i, j );
2003                   emit_rcp( func, 4, 4 );    /* 1.0 / w */
2004                   emit_mul( func, 2, 3 );    /* y * dady */
2005                   emit_add( func, 0, 5 );    /* x * dadx + a0 */
2006                   emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
2007                   emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
2008                   emit_inputs( func, 0, i, j );
2009                   break;
2010
2011                default:
2012                   assert( 0 );
2013                   break;
2014                }
2015             }
2016          }
2017       }
2018    }
2019 }
2020
2021 static void aos_to_soa( struct x86_function *func,
2022                         uint arg_aos,
2023                         uint arg_soa,
2024                         uint arg_num,
2025                         uint arg_stride )
2026 {
2027    struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
2028    struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
2029    struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
2030    struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
2031    int inner_loop;
2032
2033
2034    /* Save EBX */
2035    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2036
2037    x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
2038    x86_mov( func, soa_input,  x86_fn_arg( func, arg_soa ) );
2039    x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
2040    x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
2041
2042    /* do */
2043    inner_loop = x86_get_label( func );
2044    {
2045       x86_push( func, aos_input );
2046       sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2047       sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2048       x86_add( func, aos_input, stride );
2049       sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
2050       sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
2051       x86_add( func, aos_input, stride );
2052       sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2053       sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2054       x86_add( func, aos_input, stride );
2055       sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
2056       sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
2057       x86_pop( func, aos_input );
2058
2059       sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2060       sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2061       sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
2062       sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
2063       sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
2064       sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
2065
2066       sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
2067       sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
2068       sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
2069       sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
2070
2071       /* Advance to next input */
2072       x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
2073       x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
2074    }
2075    /* while --num_inputs */
2076    x86_dec( func, num_inputs );
2077    x86_jcc( func, cc_NE, inner_loop );
2078
2079    /* Restore EBX */
2080    x86_pop( func, aos_input );
2081 }
2082
2083 static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
2084 {
2085    struct x86_reg soa_output;
2086    struct x86_reg aos_output;
2087    struct x86_reg num_outputs;
2088    struct x86_reg temp;
2089    int inner_loop;
2090
2091    soa_output = x86_make_reg( file_REG32, reg_AX );
2092    aos_output = x86_make_reg( file_REG32, reg_BX );
2093    num_outputs = x86_make_reg( file_REG32, reg_CX );
2094    temp = x86_make_reg( file_REG32, reg_DX );
2095
2096    /* Save EBX */
2097    x86_push( func, aos_output );
2098
2099    x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
2100    x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
2101    x86_mov( func, num_outputs, x86_fn_arg( func, num ) );
2102
2103    /* do */
2104    inner_loop = x86_get_label( func );
2105    {
2106       sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2107       sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2108       sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2109       sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2110
2111       sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2112       sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2113       sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2114       sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2115       sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2116       sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2117
2118       x86_mov( func, temp, x86_fn_arg( func, stride ) );
2119       x86_push( func, aos_output );
2120       sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2121       sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2122       x86_add( func, aos_output, temp );
2123       sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2124       sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2125       x86_add( func, aos_output, temp );
2126       sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2127       sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2128       x86_add( func, aos_output, temp );
2129       sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2130       sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2131       x86_pop( func, aos_output );
2132
2133       /* Advance to next output */
2134       x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2135       x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2136    }
2137    /* while --num_outputs */
2138    x86_dec( func, num_outputs );
2139    x86_jcc( func, cc_NE, inner_loop );
2140
2141    /* Restore EBX */
2142    x86_pop( func, aos_output );
2143 }
2144
2145 /**
2146  * Translate a TGSI vertex/fragment shader to SSE2 code.
2147  * Slightly different things are done for vertex vs. fragment shaders.
2148  *
2149  * Note that fragment shaders are responsible for interpolating shader
2150  * inputs. Because on x86 we have only 4 GP registers, and here we
2151  * have 5 shader arguments (input, output, const, temp and coef), the
2152  * code is split into two phases -- DECLARATION and INSTRUCTION phase.
2153  * GP register holding the output argument is aliased with the coeff
2154  * argument, as outputs are not needed in the DECLARATION phase.
2155  *
2156  * \param tokens  the TGSI input shader
2157  * \param func  the output SSE code/function
2158  * \param immediates  buffer to place immediates, later passed to SSE func
2159  * \return  1 for success, 0 if translation failed
2160  */
2161 unsigned
2162 tgsi_emit_sse2(
2163    const struct tgsi_token *tokens,
2164    struct x86_function *func,
2165    float (*immediates)[4],
2166    boolean do_swizzles )
2167 {
2168    struct tgsi_parse_context parse;
2169    boolean instruction_phase = FALSE;
2170    unsigned ok = 1;
2171    uint num_immediates = 0;
2172
2173    func->csr = func->store;
2174
2175    tgsi_parse_init( &parse, tokens );
2176
2177    /* Can't just use EDI, EBX without save/restoring them:
2178     */
2179    x86_push(
2180       func,
2181       get_immediate_base() );
2182
2183    x86_push(
2184       func,
2185       get_temp_base() );
2186
2187
2188    /*
2189     * Different function args for vertex/fragment shaders:
2190     */
2191    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2192       /* DECLARATION phase, do not load output argument. */
2193       x86_mov(
2194          func,
2195          get_input_base(),
2196          x86_fn_arg( func, 1 ) );
2197       /* skipping outputs argument here */
2198       x86_mov(
2199          func,
2200          get_const_base(),
2201          x86_fn_arg( func, 3 ) );
2202       x86_mov(
2203          func,
2204          get_temp_base(),
2205          x86_fn_arg( func, 4 ) );
2206       x86_mov(
2207          func,
2208          get_coef_base(),
2209          x86_fn_arg( func, 5 ) );
2210       x86_mov(
2211          func,
2212          get_immediate_base(),
2213          x86_fn_arg( func, 6 ) );
2214    }
2215    else {
2216       assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
2217
2218       if (do_swizzles)
2219          aos_to_soa( func,
2220                      6,         /* aos_input */
2221                      1,         /* machine->input */
2222                      7,         /* num_inputs */
2223                      8 );       /* input_stride */
2224
2225       x86_mov(
2226          func,
2227          get_input_base(),
2228          x86_fn_arg( func, 1 ) );
2229       x86_mov(
2230          func,
2231          get_output_base(),
2232          x86_fn_arg( func, 2 ) );
2233       x86_mov(
2234          func,
2235          get_const_base(),
2236          x86_fn_arg( func, 3 ) );
2237       x86_mov(
2238          func,
2239          get_temp_base(),
2240          x86_fn_arg( func, 4 ) );
2241       x86_mov(
2242          func,
2243          get_immediate_base(),
2244          x86_fn_arg( func, 5 ) );
2245    }
2246
2247    while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
2248       tgsi_parse_token( &parse );
2249
2250       switch( parse.FullToken.Token.Type ) {
2251       case TGSI_TOKEN_TYPE_DECLARATION:
2252          if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2253             emit_declaration(
2254                func,
2255                &parse.FullToken.FullDeclaration );
2256          }
2257          break;
2258
2259       case TGSI_TOKEN_TYPE_INSTRUCTION:
2260          if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2261             if( !instruction_phase ) {
2262                /* INSTRUCTION phase, overwrite coeff with output. */
2263                instruction_phase = TRUE;
2264                x86_mov(
2265                   func,
2266                   get_output_base(),
2267                   x86_fn_arg( func, 2 ) );
2268             }
2269          }
2270
2271          ok = emit_instruction(
2272             func,
2273             &parse.FullToken.FullInstruction );
2274
2275          if (!ok) {
2276             debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
2277                          parse.FullToken.FullInstruction.Instruction.Opcode,
2278                          parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
2279                          "vertex shader" : "fragment shader");
2280          }
2281          break;
2282
2283       case TGSI_TOKEN_TYPE_IMMEDIATE:
2284          /* simply copy the immediate values into the next immediates[] slot */
2285          {
2286             const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
2287             uint i;
2288             assert(size <= 4);
2289             assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
2290             for( i = 0; i < size; i++ ) {
2291                immediates[num_immediates][i] =
2292                   parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
2293             }
2294 #if 0
2295             debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
2296                    num_immediates,
2297                    immediates[num_immediates][0],
2298                    immediates[num_immediates][1],
2299                    immediates[num_immediates][2],
2300                    immediates[num_immediates][3]);
2301 #endif
2302             num_immediates++;
2303          }
2304          break;
2305
2306       default:
2307          ok = 0;
2308          assert( 0 );
2309       }
2310    }
2311
2312    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2313       if (do_swizzles)
2314          soa_to_aos( func, 9, 2, 10, 11 );
2315    }
2316
2317    /* Can't just use EBX, EDI without save/restoring them:
2318     */
2319    x86_pop(
2320       func,
2321       get_temp_base() );
2322
2323    x86_pop(
2324       func,
2325       get_immediate_base() );
2326
2327    emit_ret( func );
2328
2329    tgsi_parse_free( &parse );
2330
2331    return ok;
2332 }
2333
2334 #endif /* PIPE_ARCH_X86 */