tgsi: Implement OPCODE_ROUND for SSE2 backend.
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 #include "pipe/p_debug.h"
29 #include "pipe/p_shader_tokens.h"
30 #include "util/u_math.h"
31 #include "tgsi/tgsi_parse.h"
32 #include "tgsi/tgsi_util.h"
33 #include "tgsi_exec.h"
34 #include "tgsi_sse2.h"
35
36 #include "rtasm/rtasm_x86sse.h"
37
38 #ifdef PIPE_ARCH_X86
39
40 /* for 1/sqrt()
41 *
42 * This costs about 100fps (close to 10%) in gears:
43 */
44 #define HIGH_PRECISION 1
45
46 #define FAST_MATH 1
47
48
49 #define FOR_EACH_CHANNEL( CHAN )\
50 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
51
52 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
53 ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
54
55 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
56 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
57
58 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
59 FOR_EACH_CHANNEL( CHAN )\
60 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
61
62 #define CHAN_X 0
63 #define CHAN_Y 1
64 #define CHAN_Z 2
65 #define CHAN_W 3
66
67 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
68 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
69
70 #define TEMP_R0 TGSI_EXEC_TEMP_R0
71 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
72 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
73 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
74
75
76 /**
77 * X86 utility functions.
78 */
79
80 static struct x86_reg
81 make_xmm(
82 unsigned xmm )
83 {
84 return x86_make_reg(
85 file_XMM,
86 (enum x86_reg_name) xmm );
87 }
88
89 /**
90 * X86 register mapping helpers.
91 */
92
93 static struct x86_reg
94 get_const_base( void )
95 {
96 return x86_make_reg(
97 file_REG32,
98 reg_CX );
99 }
100
101 static struct x86_reg
102 get_input_base( void )
103 {
104 return x86_make_reg(
105 file_REG32,
106 reg_AX );
107 }
108
109 static struct x86_reg
110 get_output_base( void )
111 {
112 return x86_make_reg(
113 file_REG32,
114 reg_DX );
115 }
116
117 static struct x86_reg
118 get_temp_base( void )
119 {
120 return x86_make_reg(
121 file_REG32,
122 reg_BX );
123 }
124
125 static struct x86_reg
126 get_coef_base( void )
127 {
128 return get_output_base();
129 }
130
131 static struct x86_reg
132 get_immediate_base( void )
133 {
134 return x86_make_reg(
135 file_REG32,
136 reg_DI );
137 }
138
139
140 /**
141 * Data access helpers.
142 */
143
144
145 static struct x86_reg
146 get_immediate(
147 unsigned vec,
148 unsigned chan )
149 {
150 return x86_make_disp(
151 get_immediate_base(),
152 (vec * 4 + chan) * 4 );
153 }
154
155 static struct x86_reg
156 get_const(
157 unsigned vec,
158 unsigned chan )
159 {
160 return x86_make_disp(
161 get_const_base(),
162 (vec * 4 + chan) * 4 );
163 }
164
165 static struct x86_reg
166 get_input(
167 unsigned vec,
168 unsigned chan )
169 {
170 return x86_make_disp(
171 get_input_base(),
172 (vec * 4 + chan) * 16 );
173 }
174
175 static struct x86_reg
176 get_output(
177 unsigned vec,
178 unsigned chan )
179 {
180 return x86_make_disp(
181 get_output_base(),
182 (vec * 4 + chan) * 16 );
183 }
184
185 static struct x86_reg
186 get_temp(
187 unsigned vec,
188 unsigned chan )
189 {
190 return x86_make_disp(
191 get_temp_base(),
192 (vec * 4 + chan) * 16 );
193 }
194
195 static struct x86_reg
196 get_coef(
197 unsigned vec,
198 unsigned chan,
199 unsigned member )
200 {
201 return x86_make_disp(
202 get_coef_base(),
203 ((vec * 3 + member) * 4 + chan) * 4 );
204 }
205
206
/* Emit a function return instruction. */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
213
214
215 /**
216 * Data fetch helpers.
217 */
218
219 /**
220 * Copy a shader constant to xmm register
221 * \param xmm the destination xmm register
222 * \param vec the src const buffer index
223 * \param chan src channel to fetch (X, Y, Z or W)
224 */
static void
emit_const(
   struct x86_function *func,
   uint xmm,
   int vec,
   uint chan,
   uint indirect,
   uint indirectFile,
   int indirectIndex )
{
   if (indirect) {
      /* 'vec' is the offset from the address register's value.
       * We're loading CONST[ADDR+vec] into an xmm register.
       */
      struct x86_reg r0 = get_input_base();
      struct x86_reg r1 = get_output_base();
      uint i;

      /* Only ADDR[0] relative addressing is supported here. */
      assert( indirectFile == TGSI_FILE_ADDRESS );
      assert( indirectIndex == 0 );

      /* r0/r1 normally hold the input/output base pointers; save them
       * so they can be used as scratch registers for the gather below.
       */
      x86_push( func, r0 );
      x86_push( func, r1 );

      /*
       * Loop over the four pixels or vertices in the quad.
       * Get the value of the address (offset) register for pixel/vertex[i],
       * add it to the src offset and index into the constant buffer.
       * Note that we're working on SOA data.
       * If any of the pixel/vertex execution channels are unused their
       * values will be garbage. It's very important that we don't use
       * those garbage values as indexes into the constant buffer since
       * that'll cause segfaults.
       * The solution is to bitwise-AND the offset with the execution mask
       * register whose values are either 0 or ~0.
       * The caller must setup the execution mask register to indicate
       * which channels are valid/alive before running the shader.
       * The execution mask will also figure into loops and conditionals
       * someday.
       */
      for (i = 0; i < QUAD_SIZE; i++) {
         /* r1 = address register[i] */
         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
         /* r0 = execution mask[i] */
         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
         /* r1 = r1 & r0 */
         x86_and( func, r1, r0 );
         /* r0 = 'vec', the offset */
         x86_lea( func, r0, get_const( vec, chan ) );

         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
          */
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );
         x86_add( func, r1, r1 );

         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
         /* Load the fetched constant and stage it in TEMP_R0[i]. */
         x86_mov( func, r1, x86_deref( r0 ) );
         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
      }

      /* Restore the clobbered base pointers (reverse order). */
      x86_pop( func, r1 );
      x86_pop( func, r0 );

      /* Move the four gathered constants from TEMP_R0 into the xmm reg. */
      sse_movaps(
         func,
         make_xmm( xmm ),
         get_temp( TEMP_R0, CHAN_X ) );
   }
   else {
      /* 'vec' is the index into the src register file, such as TEMP[vec] */
      assert( vec >= 0 );

      /* Load the scalar constant and broadcast it to all four lanes. */
      sse_movss(
         func,
         make_xmm( xmm ),
         get_const( vec, chan ) );
      sse_shufps(
         func,
         make_xmm( xmm ),
         make_xmm( xmm ),
         SHUF( 0, 0, 0, 0 ) );
   }
}
310
/* Load IMM[vec].chan into an xmm register, broadcast to all four lanes. */
static void
emit_immediate( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   sse_movss( func, make_xmm( xmm ), get_immediate( vec, chan ) );
   sse_shufps( func, make_xmm( xmm ), make_xmm( xmm ), SHUF( 0, 0, 0, 0 ) );
}
328
329
330 /**
331 * Copy a shader input to xmm register
332 * \param xmm the destination xmm register
333 * \param vec the src input attrib
334 * \param chan src channel to fetch (X, Y, Z or W)
335 */
static void
emit_inputf( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   /* movups: unaligned load of the four SoA input values */
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}
348
349 /**
350 * Store an xmm register to a shader output
351 * \param xmm the source xmm register
352 * \param vec the dest output attrib
353 * \param chan src dest channel to store (X, Y, Z or W)
354 */
static void
emit_output( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   /* movups: unaligned store of the four SoA output values */
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}
367
368 /**
369 * Copy a shader temporary to xmm register
370 * \param xmm the destination xmm register
371 * \param vec the src temp register
372 * \param chan src channel to fetch (X, Y, Z or W)
373 */
static void
emit_tempf( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   /* movaps: temporaries are 16-byte aligned */
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
386
387 /**
388 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
389 * \param xmm the destination xmm register
390 * \param vec the src input/attribute coefficient index
391 * \param chan src channel to fetch (X, Y, Z or W)
392 * \param member 0=a0, 1=dadx, 2=dady
393 */
static void
emit_coef( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan, unsigned member )
{
   /* Load the scalar coefficient, then broadcast it to all four lanes. */
   sse_movss( func, make_xmm( xmm ), get_coef( vec, chan, member ) );
   sse_shufps( func, make_xmm( xmm ), make_xmm( xmm ), SHUF( 0, 0, 0, 0 ) );
}
412
413 /**
414 * Data store helpers.
415 */
416
/* Store an xmm register back to shader input IN[vec].chan (unaligned). */
static void
emit_inputs( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   sse_movups( func, get_input( vec, chan ), make_xmm( xmm ) );
}
429
/* Store an xmm register to shader temporary TEMP[vec].chan (aligned). */
static void
emit_temps( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   sse_movaps( func, get_temp( vec, chan ), make_xmm( xmm ) );
}
442
443 static void
444 emit_addrs(
445 struct x86_function *func,
446 unsigned xmm,
447 unsigned vec,
448 unsigned chan )
449 {
450 assert( vec == 0 );
451
452 emit_temps(
453 func,
454 xmm,
455 vec + TGSI_EXEC_TEMP_ADDR,
456 chan );
457 }
458
459 /**
460 * Coefficent fetch helpers.
461 */
462
/* Load the a0 (constant term) coefficient for attrib `vec`, channel `chan`. */
static void
emit_coef_a0( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}
477
/* Load the dadx (x-derivative) coefficient for attrib `vec`, channel `chan`. */
static void
emit_coef_dadx( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}
492
/* Load the dady (y-derivative) coefficient for attrib `vec`, channel `chan`. */
static void
emit_coef_dady( struct x86_function *func, unsigned xmm, unsigned vec, unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
507
508 /**
509 * Function call helpers.
510 */
511
512 static void
513 emit_push_gp(
514 struct x86_function *func )
515 {
516 x86_push(
517 func,
518 x86_make_reg( file_REG32, reg_AX) );
519 x86_push(
520 func,
521 x86_make_reg( file_REG32, reg_CX) );
522 x86_push(
523 func,
524 x86_make_reg( file_REG32, reg_DX) );
525 }
526
527 static void
528 x86_pop_gp(
529 struct x86_function *func )
530 {
531 /* Restore GP registers in a reverse order.
532 */
533 x86_pop(
534 func,
535 x86_make_reg( file_REG32, reg_DX) );
536 x86_pop(
537 func,
538 x86_make_reg( file_REG32, reg_CX) );
539 x86_pop(
540 func,
541 x86_make_reg( file_REG32, reg_AX) );
542 }
543
/**
 * Emit a call to a C helper that operates in place on a 4-float SoA
 * vector: xmm_dst is spilled to TEMP_R0, `code` is called (cdecl) with
 * a pointer to that storage, and the result is loaded back into xmm_dst.
 * Note: the helper may also read extra operands staged in TEMP_R0 by
 * the caller (see emit_func_call_dst_src()).
 */
static void
emit_func_call_dst(
   struct x86_function *func,
   unsigned xmm_dst,
   void (PIPE_CDECL *code)() )
{
   /* Spill the destination register so the C function can access it. */
   sse_movaps(
      func,
      get_temp( TEMP_R0, 0 ),
      make_xmm( xmm_dst ) );

   /* eax/ecx/edx hold live base pointers -- preserve across the call. */
   emit_push_gp(
      func );

   {
      struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );

      /* ecx = &TEMP_R0, pushed as the single cdecl argument. */
      x86_lea(
         func,
         ecx,
         get_temp( TEMP_R0, 0 ) );

      x86_push( func, ecx );
      /* Call through a register: the helper's absolute address is baked
       * into the generated code as an immediate.
       */
      x86_mov_reg_imm( func, ecx, (unsigned long) code );
      x86_call( func, ecx );
      /* Pop the argument (cdecl: caller cleans up the stack). */
      x86_pop(func, ecx );
   }


   x86_pop_gp(
      func );

   /* Reload the (possibly modified) result into the destination reg. */
   sse_movaps(
      func,
      make_xmm( xmm_dst ),
      get_temp( TEMP_R0, 0 ) );
}
581
582 static void
583 emit_func_call_dst_src(
584 struct x86_function *func,
585 unsigned xmm_dst,
586 unsigned xmm_src,
587 void (PIPE_CDECL *code)() )
588 {
589 sse_movaps(
590 func,
591 get_temp( TEMP_R0, 1 ),
592 make_xmm( xmm_src ) );
593
594 emit_func_call_dst(
595 func,
596 xmm_dst,
597 code );
598 }
599
600 /**
601 * Low-level instruction translators.
602 */
603
604 static void
605 emit_abs(
606 struct x86_function *func,
607 unsigned xmm )
608 {
609 sse_andps(
610 func,
611 make_xmm( xmm ),
612 get_temp(
613 TGSI_EXEC_TEMP_7FFFFFFF_I,
614 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
615 }
616
/* xmm_dst += xmm_src, componentwise. */
static void
emit_add( struct x86_function *func, unsigned xmm_dst, unsigned xmm_src )
{
   sse_addps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
628
629 static void PIPE_CDECL
630 cos4f(
631 float *store )
632 {
633 store[0] = cosf( store[0] );
634 store[1] = cosf( store[1] );
635 store[2] = cosf( store[2] );
636 store[3] = cosf( store[3] );
637 }
638
639 static void
640 emit_cos(
641 struct x86_function *func,
642 unsigned xmm_dst )
643 {
644 emit_func_call_dst(
645 func,
646 xmm_dst,
647 cos4f );
648 }
649
/* Compute 2^x for each of the four floats at store[0..3].
 * With FAST_MATH, uses the approximate util_fast_exp2() from u_math;
 * otherwise falls back to the precise powf().
 */
static void PIPE_CDECL
ex24f(
   float *store )
{
#if FAST_MATH
   store[0] = util_fast_exp2( store[0] );
   store[1] = util_fast_exp2( store[1] );
   store[2] = util_fast_exp2( store[2] );
   store[3] = util_fast_exp2( store[3] );
#else
   store[0] = powf( 2.0f, store[0] );
   store[1] = powf( 2.0f, store[1] );
   store[2] = powf( 2.0f, store[2] );
   store[3] = powf( 2.0f, store[3] );
#endif
}
666
667 static void
668 emit_ex2(
669 struct x86_function *func,
670 unsigned xmm_dst )
671 {
672 emit_func_call_dst(
673 func,
674 xmm_dst,
675 ex24f );
676 }
677
/* Convert four floats to ints in place (cvttps2dq truncates toward zero). */
static void
emit_f2it( struct x86_function *func, unsigned xmm )
{
   sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) );
}
688
/* Convert four ints to floats in place (cvtdq2ps). */
static void
emit_i2f( struct x86_function *func, unsigned xmm )
{
   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
}
699
700 static void PIPE_CDECL
701 flr4f(
702 float *store )
703 {
704 store[0] = floorf( store[0] );
705 store[1] = floorf( store[1] );
706 store[2] = floorf( store[2] );
707 store[3] = floorf( store[3] );
708 }
709
710 static void
711 emit_flr(
712 struct x86_function *func,
713 unsigned xmm_dst )
714 {
715 emit_func_call_dst(
716 func,
717 xmm_dst,
718 flr4f );
719 }
720
721 static void PIPE_CDECL
722 frc4f(
723 float *store )
724 {
725 store[0] -= floorf( store[0] );
726 store[1] -= floorf( store[1] );
727 store[2] -= floorf( store[2] );
728 store[3] -= floorf( store[3] );
729 }
730
731 static void
732 emit_frc(
733 struct x86_function *func,
734 unsigned xmm_dst )
735 {
736 emit_func_call_dst(
737 func,
738 xmm_dst,
739 frc4f );
740 }
741
742 static void PIPE_CDECL
743 lg24f(
744 float *store )
745 {
746 store[0] = util_fast_log2( store[0] );
747 store[1] = util_fast_log2( store[1] );
748 store[2] = util_fast_log2( store[2] );
749 store[3] = util_fast_log2( store[3] );
750 }
751
752 static void
753 emit_lg2(
754 struct x86_function *func,
755 unsigned xmm_dst )
756 {
757 emit_func_call_dst(
758 func,
759 xmm_dst,
760 lg24f );
761 }
762
/* Register-to-register copy: xmm_dst = xmm_src. */
static void
emit_MOV( struct x86_function *func, unsigned xmm_dst, unsigned xmm_src )
{
   sse_movups( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
774
/* xmm_dst *= xmm_src, componentwise. */
static void
emit_mul (struct x86_function *func,
          unsigned xmm_dst,
          unsigned xmm_src)
{
   sse_mulps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
785
786 static void
787 emit_neg(
788 struct x86_function *func,
789 unsigned xmm )
790 {
791 sse_xorps(
792 func,
793 make_xmm( xmm ),
794 get_temp(
795 TGSI_EXEC_TEMP_80000000_I,
796 TGSI_EXEC_TEMP_80000000_C ) );
797 }
798
/* Compute store[i] = store[i] ^ store[i+4] for i in 0..3 -- base in the
 * first quad, exponent staged in the second (see emit_func_call_dst_src).
 * With FAST_MATH, uses the approximate util_fast_pow() from u_math.
 */
static void PIPE_CDECL
pow4f(
   float *store )
{
#if FAST_MATH
   store[0] = util_fast_pow( store[0], store[4] );
   store[1] = util_fast_pow( store[1], store[5] );
   store[2] = util_fast_pow( store[2], store[6] );
   store[3] = util_fast_pow( store[3], store[7] );
#else
   store[0] = powf( store[0], store[4] );
   store[1] = powf( store[1], store[5] );
   store[2] = powf( store[2], store[6] );
   store[3] = powf( store[3], store[7] );
#endif
}
815
816 static void
817 emit_pow(
818 struct x86_function *func,
819 unsigned xmm_dst,
820 unsigned xmm_src )
821 {
822 emit_func_call_dst_src(
823 func,
824 xmm_dst,
825 xmm_src,
826 pow4f );
827 }
828
/* xmm_dst = approx 1/xmm_src, componentwise (RCPPS).
 *
 * On Intel CPUs at least, this is only accurate to 12 bits -- not
 * good enough.  Need to either emit a proper divide or use the
 * iterative technique described below in emit_rsqrt().
 */
static void
emit_rcp (
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
   sse2_rcpps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
844
845 static void PIPE_CDECL
846 rnd4f(
847 float *store )
848 {
849 store[0] = floorf( store[0] + 0.5f );
850 store[1] = floorf( store[1] + 0.5f );
851 store[2] = floorf( store[2] + 0.5f );
852 store[3] = floorf( store[3] + 0.5f );
853 }
854
855 static void
856 emit_rnd(
857 struct x86_function *func,
858 unsigned xmm_save,
859 unsigned xmm_dst )
860 {
861 emit_func_call_dst(
862 func,
863 xmm_save,
864 xmm_dst,
865 rnd4f );
866 }
867
/**
 * xmm_dst = approx 1/sqrt(xmm_src), componentwise.
 * NOTE: clobbers xmm_src and (in the HIGH_PRECISION path) xmm2/xmm3.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
    * implementations, it is possible to improve its precision at
    * fairly low cost, using a newton/raphson step, as below:
    *
    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
    *
    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
    */
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg tmp0 = make_xmm( 2 );
      struct x86_reg tmp1 = make_xmm( 3 );

      /* The fixed scratch registers 2 and 3 must not alias the operands. */
      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );   /* dst = 0.5 */
      sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );   /* tmp0 = 3.0 */
      sse_rsqrtps( func, tmp1, src );        /* tmp1 = rsqrtps(a) */
      sse_mulps( func, src, tmp1 );          /* src = a * rsqrtps(a) */
      sse_mulps( func, dst, tmp1 );          /* dst = 0.5 * rsqrtps(a) */
      sse_mulps( func, src, tmp1 );          /* src = a * rsqrtps(a)^2 */
      sse_subps( func, tmp0, src );          /* tmp0 = 3.0 - a * rsqrtps(a)^2 */
      sse_mulps( func, dst, tmp0 );          /* dst = 0.5 * rsqrtps(a) * tmp0 */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps(
      func,
      make_xmm( xmm_dst ),
      make_xmm( xmm_src ) );
#endif
}
913
914 static void
915 emit_setsign(
916 struct x86_function *func,
917 unsigned xmm )
918 {
919 sse_orps(
920 func,
921 make_xmm( xmm ),
922 get_temp(
923 TGSI_EXEC_TEMP_80000000_I,
924 TGSI_EXEC_TEMP_80000000_C ) );
925 }
926
927 static void PIPE_CDECL
928 sgn4f(
929 float *store )
930 {
931 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
932 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
933 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
934 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
935 }
936
937 static void
938 emit_sgn(
939 struct x86_function *func,
940 unsigned xmm_save,
941 unsigned xmm_dst )
942 {
943 emit_func_call_dst(
944 func,
945 xmm_save,
946 xmm_dst,
947 sgn4f );
948 }
949
950 static void PIPE_CDECL
951 sin4f(
952 float *store )
953 {
954 store[0] = sinf( store[0] );
955 store[1] = sinf( store[1] );
956 store[2] = sinf( store[2] );
957 store[3] = sinf( store[3] );
958 }
959
960 static void
961 emit_sin (struct x86_function *func,
962 unsigned xmm_dst)
963 {
964 emit_func_call_dst(
965 func,
966 xmm_dst,
967 sin4f );
968 }
969
/* xmm_dst -= xmm_src, componentwise. */
static void
emit_sub( struct x86_function *func, unsigned xmm_dst, unsigned xmm_src )
{
   sse_subps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
}
981
982 /**
983 * Register fetch.
984 */
985
/**
 * Load one channel of a TGSI source operand into an xmm register,
 * resolving the operand's register file, (extended) swizzle and sign
 * mode.  Scalar files (constants, immediates) are broadcast to all
 * four lanes; SoA files (inputs, temporaries) are loaded as quads.
 */
static void
emit_fetch(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_src_register *reg,
   const unsigned chan_index )
{
   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );

   switch (swizzle) {
   case TGSI_EXTSWIZZLE_X:
   case TGSI_EXTSWIZZLE_Y:
   case TGSI_EXTSWIZZLE_Z:
   case TGSI_EXTSWIZZLE_W:
      /* Ordinary component select: dispatch on the source file. */
      switch (reg->SrcRegister.File) {
      case TGSI_FILE_CONSTANT:
         /* Constants may be indirectly addressed via ADDR[0]. */
         emit_const(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle,
            reg->SrcRegister.Indirect,
            reg->SrcRegisterInd.File,
            reg->SrcRegisterInd.Index );
         break;

      case TGSI_FILE_IMMEDIATE:
         emit_immediate(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_INPUT:
         emit_inputf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      case TGSI_FILE_TEMPORARY:
         emit_tempf(
            func,
            xmm,
            reg->SrcRegister.Index,
            swizzle );
         break;

      default:
         /* unsupported source file */
         assert( 0 );
      }
      break;

   case TGSI_EXTSWIZZLE_ZERO:
      /* Extended swizzle constant 0.0, served from a prepared temp. */
      emit_tempf(
         func,
         xmm,
         TGSI_EXEC_TEMP_00000000_I,
         TGSI_EXEC_TEMP_00000000_C );
      break;

   case TGSI_EXTSWIZZLE_ONE:
      /* Extended swizzle constant 1.0, served from a prepared temp. */
      emit_tempf(
         func,
         xmm,
         TEMP_ONE_I,
         TEMP_ONE_C );
      break;

   default:
      assert( 0 );
   }

   /* Apply the operand's sign mode to the loaded value. */
   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
   case TGSI_UTIL_SIGN_CLEAR:
      emit_abs( func, xmm );
      break;

   case TGSI_UTIL_SIGN_SET:
      emit_setsign( func, xmm );
      break;

   case TGSI_UTIL_SIGN_TOGGLE:
      emit_neg( func, xmm );
      break;

   case TGSI_UTIL_SIGN_KEEP:
      break;
   }
}
1078
1079 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1080 emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
1081
1082 /**
1083 * Register store.
1084 */
1085
/**
 * Store one channel of an xmm register to a TGSI destination operand,
 * dispatching on the destination register file.
 * NOTE(review): saturation is not implemented -- TGSI_SAT_ZERO_ONE is
 * silently accepted (the assert is commented out) and the value is
 * stored unclamped; TGSI_SAT_MINUS_PLUS_ONE asserts.
 */
static void
emit_store(
   struct x86_function *func,
   unsigned xmm,
   const struct tgsi_full_dst_register *reg,
   const struct tgsi_full_instruction *inst,
   unsigned chan_index )
{
   switch( reg->DstRegister.File ) {
   case TGSI_FILE_OUTPUT:
      emit_output(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_TEMPORARY:
      emit_temps(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   case TGSI_FILE_ADDRESS:
      emit_addrs(
         func,
         xmm,
         reg->DstRegister.Index,
         chan_index );
      break;

   default:
      /* unsupported destination file */
      assert( 0 );
   }

   switch( inst->Instruction.Saturate ) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      /* assert( 0 ); */
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      assert( 0 );
      break;
   }
}
1136
1137 #define STORE( FUNC, INST, XMM, INDEX, CHAN )\
1138 emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
1139
1140 /**
1141 * High-level instruction translators.
1142 */
1143
/**
 * Emit code for the KIL instruction: compare each (unique) swizzled
 * source component against zero, OR the per-pixel "less than zero"
 * masks together, and accumulate the result into the kill mask temp.
 * Clobbers xmm registers 0..3 (one per unique component) and uses
 * eax/edx as scratch (saved/restored around the comparison loop).
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned registers[4];
   unsigned nextregister = 0;
   unsigned firstchan = ~0;
   unsigned chan_index;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested. */
   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_extswizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         registers[chan_index] = nextregister;
         emit_fetch(
            func,
            nextregister,
            reg,
            chan_index );
         nextregister++;

         /* mark the first channel used */
         if( firstchan == ~0 ) {
            firstchan = chan_index;
         }
      }
   }

   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   /* NOTE(review): this loop re-tests uniquemask with chan_index while
    * the bits were set above by *swizzle* value, and registers[] was
    * filled per chan_index -- the two only line up when the swizzle is
    * the identity.  Looks suspect for swizzled KIL operands; verify
    * against tgsi_exec behavior before relying on it.
    */
   FOR_EACH_CHANNEL( chan_index ) {
      if( uniquemask & (1 << chan_index) ) {
         /* xmm = (xmm < 0) ? ~0 : 0, per pixel */
         sse_cmpps(
            func,
            make_xmm( registers[chan_index] ),
            get_temp(
               TGSI_EXEC_TEMP_00000000_I,
               TGSI_EXEC_TEMP_00000000_C ),
            cc_LessThan );

         if( chan_index == firstchan ) {
            /* eax = byte mask of the comparison result */
            sse_pmovmskb(
               func,
               x86_make_reg( file_REG32, reg_AX ),
               make_xmm( registers[chan_index] ) );
         }
         else {
            /* OR this component's mask into the accumulator in eax */
            sse_pmovmskb(
               func,
               x86_make_reg( file_REG32, reg_DX ),
               make_xmm( registers[chan_index] ) );
            x86_or(
               func,
               x86_make_reg( file_REG32, reg_AX ),
               x86_make_reg( file_REG32, reg_DX ) );
         }
      }
   }

   /* Merge the accumulated mask into the persistent kill mask temp. */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1238
1239
/**
 * Unconditional (predicated) kill -- unimplemented stub; emits no code.
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1246
1247
/**
 * Emit code for a comparison instruction (SLT, SGE, SEQ, ...):
 * for each enabled dst channel, dst = (src0 cc src1) ? 1.0 : 0.0.
 * cmpps yields an all-ones/all-zeros mask per lane; ANDing with 1.0
 * converts that mask to the float result.  Clobbers xmm0/xmm1.
 */
static void
emit_setcc(
   struct x86_function *func,
   struct tgsi_full_instruction *inst,
   enum sse_cc cc )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      sse_cmpps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ),
         cc );
      sse_andps(
         func,
         make_xmm( 0 ),
         get_temp(
            TEMP_ONE_I,
            TEMP_ONE_C ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}
1273
/**
 * Emit code for the CMP instruction:
 * for each enabled dst channel, dst = (src0 < 0.0) ? src1 : src2.
 * Implemented with a cmpps mask and AND/ANDN/OR blending.
 * Clobbers xmm0/xmm1/xmm2.
 */
static void
emit_cmp(
   struct x86_function *func,
   struct tgsi_full_instruction *inst )
{
   unsigned chan_index;

   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
      FETCH( func, *inst, 0, 0, chan_index );
      FETCH( func, *inst, 1, 1, chan_index );
      FETCH( func, *inst, 2, 2, chan_index );
      /* xmm0 = (src0 < 0) ? ~0 : 0, per lane */
      sse_cmpps(
         func,
         make_xmm( 0 ),
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );
      /* xmm1 = src1 where mask set */
      sse_andps(
         func,
         make_xmm( 1 ),
         make_xmm( 0 ) );
      /* xmm0 = src2 where mask clear */
      sse_andnps(
         func,
         make_xmm( 0 ),
         make_xmm( 2 ) );
      /* xmm0 = blended result */
      sse_orps(
         func,
         make_xmm( 0 ),
         make_xmm( 1 ) );
      STORE( func, *inst, 0, 0, chan_index );
   }
}
1307
1308 static int
1309 emit_instruction(
1310 struct x86_function *func,
1311 struct tgsi_full_instruction *inst )
1312 {
1313 unsigned chan_index;
1314
1315 switch (inst->Instruction.Opcode) {
1316 case TGSI_OPCODE_ARL:
1317 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1318 FETCH( func, *inst, 0, 0, chan_index );
1319 emit_f2it( func, 0 );
1320 STORE( func, *inst, 0, 0, chan_index );
1321 }
1322 break;
1323
1324 case TGSI_OPCODE_MOV:
1325 case TGSI_OPCODE_SWZ:
1326 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1327 FETCH( func, *inst, 0, 0, chan_index );
1328 STORE( func, *inst, 0, 0, chan_index );
1329 }
1330 break;
1331
1332 case TGSI_OPCODE_LIT:
1333 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1334 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1335 emit_tempf(
1336 func,
1337 0,
1338 TEMP_ONE_I,
1339 TEMP_ONE_C);
1340 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1341 STORE( func, *inst, 0, 0, CHAN_X );
1342 }
1343 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1344 STORE( func, *inst, 0, 0, CHAN_W );
1345 }
1346 }
1347 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1348 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1349 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1350 FETCH( func, *inst, 0, 0, CHAN_X );
1351 sse_maxps(
1352 func,
1353 make_xmm( 0 ),
1354 get_temp(
1355 TGSI_EXEC_TEMP_00000000_I,
1356 TGSI_EXEC_TEMP_00000000_C ) );
1357 STORE( func, *inst, 0, 0, CHAN_Y );
1358 }
1359 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1360 /* XMM[1] = SrcReg[0].yyyy */
1361 FETCH( func, *inst, 1, 0, CHAN_Y );
1362 /* XMM[1] = max(XMM[1], 0) */
1363 sse_maxps(
1364 func,
1365 make_xmm( 1 ),
1366 get_temp(
1367 TGSI_EXEC_TEMP_00000000_I,
1368 TGSI_EXEC_TEMP_00000000_C ) );
1369 /* XMM[2] = SrcReg[0].wwww */
1370 FETCH( func, *inst, 2, 0, CHAN_W );
1371 /* XMM[2] = min(XMM[2], 128.0) */
1372 sse_minps(
1373 func,
1374 make_xmm( 2 ),
1375 get_temp(
1376 TGSI_EXEC_TEMP_128_I,
1377 TGSI_EXEC_TEMP_128_C ) );
1378 /* XMM[2] = max(XMM[2], -128.0) */
1379 sse_maxps(
1380 func,
1381 make_xmm( 2 ),
1382 get_temp(
1383 TGSI_EXEC_TEMP_MINUS_128_I,
1384 TGSI_EXEC_TEMP_MINUS_128_C ) );
1385 emit_pow( func, 1, 2 );
1386 FETCH( func, *inst, 0, 0, CHAN_X );
1387 sse_xorps(
1388 func,
1389 make_xmm( 2 ),
1390 make_xmm( 2 ) );
1391 sse_cmpps(
1392 func,
1393 make_xmm( 2 ),
1394 make_xmm( 0 ),
1395 cc_LessThanEqual );
1396 sse_andps(
1397 func,
1398 make_xmm( 2 ),
1399 make_xmm( 1 ) );
1400 STORE( func, *inst, 2, 0, CHAN_Z );
1401 }
1402 }
1403 break;
1404
1405 case TGSI_OPCODE_RCP:
1406 /* TGSI_OPCODE_RECIP */
1407 FETCH( func, *inst, 0, 0, CHAN_X );
1408 emit_rcp( func, 0, 0 );
1409 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1410 STORE( func, *inst, 0, 0, chan_index );
1411 }
1412 break;
1413
1414 case TGSI_OPCODE_RSQ:
1415 /* TGSI_OPCODE_RECIPSQRT */
1416 FETCH( func, *inst, 0, 0, CHAN_X );
1417 emit_rsqrt( func, 1, 0 );
1418 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1419 STORE( func, *inst, 1, 0, chan_index );
1420 }
1421 break;
1422
1423 case TGSI_OPCODE_EXP:
1424 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1425 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1426 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1427 FETCH( func, *inst, 0, 0, CHAN_X );
1428 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1429 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1430 emit_MOV( func, 1, 0 );
1431 emit_flr( func, 1 );
1432 /* dst.x = ex2(floor(src.x)) */
1433 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1434 emit_MOV( func, 2, 1 );
1435 emit_ex2( func, 2 );
1436 STORE( func, *inst, 2, 0, CHAN_X );
1437 }
1438 /* dst.y = src.x - floor(src.x) */
1439 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1440 emit_MOV( func, 2, 0 );
1441 emit_sub( func, 2, 1 );
1442 STORE( func, *inst, 2, 0, CHAN_Y );
1443 }
1444 }
1445 /* dst.z = ex2(src.x) */
1446 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1447 emit_ex2( func, 0 );
1448 STORE( func, *inst, 0, 0, CHAN_Z );
1449 }
1450 }
1451 /* dst.w = 1.0 */
1452 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1453 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1454 STORE( func, *inst, 0, 0, CHAN_W );
1455 }
1456 break;
1457
1458 case TGSI_OPCODE_LOG:
1459 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1460 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1461 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1462 FETCH( func, *inst, 0, 0, CHAN_X );
1463 emit_abs( func, 0 );
1464 emit_MOV( func, 1, 0 );
1465 emit_lg2( func, 1 );
1466 /* dst.z = lg2(abs(src.x)) */
1467 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1468 STORE( func, *inst, 1, 0, CHAN_Z );
1469 }
1470 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1471 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1472 emit_flr( func, 1 );
1473 /* dst.x = floor(lg2(abs(src.x))) */
1474 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1475 STORE( func, *inst, 1, 0, CHAN_X );
1476 }
1477 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1478 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1479 emit_ex2( func, 1 );
1480 emit_rcp( func, 1, 1 );
1481 emit_mul( func, 0, 1 );
1482 STORE( func, *inst, 0, 0, CHAN_Y );
1483 }
1484 }
1485 }
1486 /* dst.w = 1.0 */
1487 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1488 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1489 STORE( func, *inst, 0, 0, CHAN_W );
1490 }
1491 break;
1492
1493 case TGSI_OPCODE_MUL:
1494 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1495 FETCH( func, *inst, 0, 0, chan_index );
1496 FETCH( func, *inst, 1, 1, chan_index );
1497 emit_mul( func, 0, 1 );
1498 STORE( func, *inst, 0, 0, chan_index );
1499 }
1500 break;
1501
1502 case TGSI_OPCODE_ADD:
1503 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1504 FETCH( func, *inst, 0, 0, chan_index );
1505 FETCH( func, *inst, 1, 1, chan_index );
1506 emit_add( func, 0, 1 );
1507 STORE( func, *inst, 0, 0, chan_index );
1508 }
1509 break;
1510
1511 case TGSI_OPCODE_DP3:
1512 /* TGSI_OPCODE_DOT3 */
1513 FETCH( func, *inst, 0, 0, CHAN_X );
1514 FETCH( func, *inst, 1, 1, CHAN_X );
1515 emit_mul( func, 0, 1 );
1516 FETCH( func, *inst, 1, 0, CHAN_Y );
1517 FETCH( func, *inst, 2, 1, CHAN_Y );
1518 emit_mul( func, 1, 2 );
1519 emit_add( func, 0, 1 );
1520 FETCH( func, *inst, 1, 0, CHAN_Z );
1521 FETCH( func, *inst, 2, 1, CHAN_Z );
1522 emit_mul( func, 1, 2 );
1523 emit_add( func, 0, 1 );
1524 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1525 STORE( func, *inst, 0, 0, chan_index );
1526 }
1527 break;
1528
1529 case TGSI_OPCODE_DP4:
1530 /* TGSI_OPCODE_DOT4 */
1531 FETCH( func, *inst, 0, 0, CHAN_X );
1532 FETCH( func, *inst, 1, 1, CHAN_X );
1533 emit_mul( func, 0, 1 );
1534 FETCH( func, *inst, 1, 0, CHAN_Y );
1535 FETCH( func, *inst, 2, 1, CHAN_Y );
1536 emit_mul( func, 1, 2 );
1537 emit_add( func, 0, 1 );
1538 FETCH( func, *inst, 1, 0, CHAN_Z );
1539 FETCH( func, *inst, 2, 1, CHAN_Z );
1540 emit_mul(func, 1, 2 );
1541 emit_add(func, 0, 1 );
1542 FETCH( func, *inst, 1, 0, CHAN_W );
1543 FETCH( func, *inst, 2, 1, CHAN_W );
1544 emit_mul( func, 1, 2 );
1545 emit_add( func, 0, 1 );
1546 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1547 STORE( func, *inst, 0, 0, chan_index );
1548 }
1549 break;
1550
1551 case TGSI_OPCODE_DST:
1552 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1553 emit_tempf(
1554 func,
1555 0,
1556 TEMP_ONE_I,
1557 TEMP_ONE_C );
1558 STORE( func, *inst, 0, 0, CHAN_X );
1559 }
1560 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1561 FETCH( func, *inst, 0, 0, CHAN_Y );
1562 FETCH( func, *inst, 1, 1, CHAN_Y );
1563 emit_mul( func, 0, 1 );
1564 STORE( func, *inst, 0, 0, CHAN_Y );
1565 }
1566 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1567 FETCH( func, *inst, 0, 0, CHAN_Z );
1568 STORE( func, *inst, 0, 0, CHAN_Z );
1569 }
1570 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1571 FETCH( func, *inst, 0, 1, CHAN_W );
1572 STORE( func, *inst, 0, 0, CHAN_W );
1573 }
1574 break;
1575
1576 case TGSI_OPCODE_MIN:
1577 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1578 FETCH( func, *inst, 0, 0, chan_index );
1579 FETCH( func, *inst, 1, 1, chan_index );
1580 sse_minps(
1581 func,
1582 make_xmm( 0 ),
1583 make_xmm( 1 ) );
1584 STORE( func, *inst, 0, 0, chan_index );
1585 }
1586 break;
1587
1588 case TGSI_OPCODE_MAX:
1589 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1590 FETCH( func, *inst, 0, 0, chan_index );
1591 FETCH( func, *inst, 1, 1, chan_index );
1592 sse_maxps(
1593 func,
1594 make_xmm( 0 ),
1595 make_xmm( 1 ) );
1596 STORE( func, *inst, 0, 0, chan_index );
1597 }
1598 break;
1599
1600 case TGSI_OPCODE_SLT:
1601 /* TGSI_OPCODE_SETLT */
1602 emit_setcc( func, inst, cc_LessThan );
1603 break;
1604
1605 case TGSI_OPCODE_SGE:
1606 /* TGSI_OPCODE_SETGE */
1607 emit_setcc( func, inst, cc_NotLessThan );
1608 break;
1609
1610 case TGSI_OPCODE_MAD:
1611 /* TGSI_OPCODE_MADD */
1612 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1613 FETCH( func, *inst, 0, 0, chan_index );
1614 FETCH( func, *inst, 1, 1, chan_index );
1615 FETCH( func, *inst, 2, 2, chan_index );
1616 emit_mul( func, 0, 1 );
1617 emit_add( func, 0, 2 );
1618 STORE( func, *inst, 0, 0, chan_index );
1619 }
1620 break;
1621
1622 case TGSI_OPCODE_SUB:
1623 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1624 FETCH( func, *inst, 0, 0, chan_index );
1625 FETCH( func, *inst, 1, 1, chan_index );
1626 emit_sub( func, 0, 1 );
1627 STORE( func, *inst, 0, 0, chan_index );
1628 }
1629 break;
1630
1631 case TGSI_OPCODE_LERP:
1632 /* TGSI_OPCODE_LRP */
1633 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1634 FETCH( func, *inst, 0, 0, chan_index );
1635 FETCH( func, *inst, 1, 1, chan_index );
1636 FETCH( func, *inst, 2, 2, chan_index );
1637 emit_sub( func, 1, 2 );
1638 emit_mul( func, 0, 1 );
1639 emit_add( func, 0, 2 );
1640 STORE( func, *inst, 0, 0, chan_index );
1641 }
1642 break;
1643
1644 case TGSI_OPCODE_CND:
1645 return 0;
1646 break;
1647
1648 case TGSI_OPCODE_CND0:
1649 return 0;
1650 break;
1651
1652 case TGSI_OPCODE_DOT2ADD:
1653 /* TGSI_OPCODE_DP2A */
1654 return 0;
1655 break;
1656
1657 case TGSI_OPCODE_INDEX:
1658 return 0;
1659 break;
1660
1661 case TGSI_OPCODE_NEGATE:
1662 return 0;
1663 break;
1664
1665 case TGSI_OPCODE_FRAC:
1666 /* TGSI_OPCODE_FRC */
1667 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1668 FETCH( func, *inst, 0, 0, chan_index );
1669 emit_frc( func, 0 );
1670 STORE( func, *inst, 0, 0, chan_index );
1671 }
1672 break;
1673
1674 case TGSI_OPCODE_CLAMP:
1675 return 0;
1676 break;
1677
1678 case TGSI_OPCODE_FLOOR:
1679 /* TGSI_OPCODE_FLR */
1680 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1681 FETCH( func, *inst, 0, 0, chan_index );
1682 emit_flr( func, 0 );
1683 STORE( func, *inst, 0, 0, chan_index );
1684 }
1685 break;
1686
1687 case TGSI_OPCODE_ROUND:
1688 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1689 FETCH( func, *inst, 0, 0, chan_index );
1690 emit_rnd( func, 0, 0 );
1691 STORE( func, *inst, 0, 0, chan_index );
1692 }
1693 break;
1694
1695 case TGSI_OPCODE_EXPBASE2:
1696 /* TGSI_OPCODE_EX2 */
1697 FETCH( func, *inst, 0, 0, CHAN_X );
1698 emit_ex2( func, 0 );
1699 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1700 STORE( func, *inst, 0, 0, chan_index );
1701 }
1702 break;
1703
1704 case TGSI_OPCODE_LOGBASE2:
1705 /* TGSI_OPCODE_LG2 */
1706 FETCH( func, *inst, 0, 0, CHAN_X );
1707 emit_lg2( func, 0 );
1708 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1709 STORE( func, *inst, 0, 0, chan_index );
1710 }
1711 break;
1712
1713 case TGSI_OPCODE_POWER:
1714 /* TGSI_OPCODE_POW */
1715 FETCH( func, *inst, 0, 0, CHAN_X );
1716 FETCH( func, *inst, 1, 1, CHAN_X );
1717 emit_pow( func, 0, 1 );
1718 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1719 STORE( func, *inst, 0, 0, chan_index );
1720 }
1721 break;
1722
1723 case TGSI_OPCODE_CROSSPRODUCT:
1724 /* TGSI_OPCODE_XPD */
1725 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1726 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1727 FETCH( func, *inst, 1, 1, CHAN_Z );
1728 FETCH( func, *inst, 3, 0, CHAN_Z );
1729 }
1730 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1731 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1732 FETCH( func, *inst, 0, 0, CHAN_Y );
1733 FETCH( func, *inst, 4, 1, CHAN_Y );
1734 }
1735 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1736 emit_MOV( func, 2, 0 );
1737 emit_mul( func, 2, 1 );
1738 emit_MOV( func, 5, 3 );
1739 emit_mul( func, 5, 4 );
1740 emit_sub( func, 2, 5 );
1741 STORE( func, *inst, 2, 0, CHAN_X );
1742 }
1743 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1744 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1745 FETCH( func, *inst, 2, 1, CHAN_X );
1746 FETCH( func, *inst, 5, 0, CHAN_X );
1747 }
1748 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1749 emit_mul( func, 3, 2 );
1750 emit_mul( func, 1, 5 );
1751 emit_sub( func, 3, 1 );
1752 STORE( func, *inst, 3, 0, CHAN_Y );
1753 }
1754 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1755 emit_mul( func, 5, 4 );
1756 emit_mul( func, 0, 2 );
1757 emit_sub( func, 5, 0 );
1758 STORE( func, *inst, 5, 0, CHAN_Z );
1759 }
1760 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1761 emit_tempf(
1762 func,
1763 0,
1764 TEMP_ONE_I,
1765 TEMP_ONE_C );
1766 STORE( func, *inst, 0, 0, CHAN_W );
1767 }
1768 break;
1769
1770 case TGSI_OPCODE_MULTIPLYMATRIX:
1771 return 0;
1772 break;
1773
1774 case TGSI_OPCODE_ABS:
1775 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1776 FETCH( func, *inst, 0, 0, chan_index );
1777 emit_abs( func, 0) ;
1778
1779 STORE( func, *inst, 0, 0, chan_index );
1780 }
1781 break;
1782
1783 case TGSI_OPCODE_RCC:
1784 return 0;
1785 break;
1786
1787 case TGSI_OPCODE_DPH:
1788 FETCH( func, *inst, 0, 0, CHAN_X );
1789 FETCH( func, *inst, 1, 1, CHAN_X );
1790 emit_mul( func, 0, 1 );
1791 FETCH( func, *inst, 1, 0, CHAN_Y );
1792 FETCH( func, *inst, 2, 1, CHAN_Y );
1793 emit_mul( func, 1, 2 );
1794 emit_add( func, 0, 1 );
1795 FETCH( func, *inst, 1, 0, CHAN_Z );
1796 FETCH( func, *inst, 2, 1, CHAN_Z );
1797 emit_mul( func, 1, 2 );
1798 emit_add( func, 0, 1 );
1799 FETCH( func, *inst, 1, 1, CHAN_W );
1800 emit_add( func, 0, 1 );
1801 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1802 STORE( func, *inst, 0, 0, chan_index );
1803 }
1804 break;
1805
1806 case TGSI_OPCODE_COS:
1807 FETCH( func, *inst, 0, 0, CHAN_X );
1808 emit_cos( func, 0 );
1809 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1810 STORE( func, *inst, 0, 0, chan_index );
1811 }
1812 break;
1813
1814 case TGSI_OPCODE_DDX:
1815 return 0;
1816 break;
1817
1818 case TGSI_OPCODE_DDY:
1819 return 0;
1820 break;
1821
1822 case TGSI_OPCODE_KILP:
1823 /* predicated kill */
1824 emit_kilp( func );
1825 return 0; /* XXX fix me */
1826 break;
1827
1828 case TGSI_OPCODE_KIL:
1829 /* conditional kill */
1830 emit_kil( func, &inst->FullSrcRegisters[0] );
1831 break;
1832
1833 case TGSI_OPCODE_PK2H:
1834 return 0;
1835 break;
1836
1837 case TGSI_OPCODE_PK2US:
1838 return 0;
1839 break;
1840
1841 case TGSI_OPCODE_PK4B:
1842 return 0;
1843 break;
1844
1845 case TGSI_OPCODE_PK4UB:
1846 return 0;
1847 break;
1848
1849 case TGSI_OPCODE_RFL:
1850 return 0;
1851 break;
1852
1853 case TGSI_OPCODE_SEQ:
1854 return 0;
1855 break;
1856
1857 case TGSI_OPCODE_SFL:
1858 return 0;
1859 break;
1860
1861 case TGSI_OPCODE_SGT:
1862 return 0;
1863 break;
1864
1865 case TGSI_OPCODE_SIN:
1866 FETCH( func, *inst, 0, 0, CHAN_X );
1867 emit_sin( func, 0 );
1868 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1869 STORE( func, *inst, 0, 0, chan_index );
1870 }
1871 break;
1872
1873 case TGSI_OPCODE_SLE:
1874 return 0;
1875 break;
1876
1877 case TGSI_OPCODE_SNE:
1878 return 0;
1879 break;
1880
1881 case TGSI_OPCODE_STR:
1882 return 0;
1883 break;
1884
1885 case TGSI_OPCODE_TEX:
1886 if (0) {
1887 /* Disable dummy texture code:
1888 */
1889 emit_tempf(
1890 func,
1891 0,
1892 TEMP_ONE_I,
1893 TEMP_ONE_C );
1894 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1895 STORE( func, *inst, 0, 0, chan_index );
1896 }
1897 }
1898 else {
1899 return 0;
1900 }
1901 break;
1902
1903 case TGSI_OPCODE_TXD:
1904 return 0;
1905 break;
1906
1907 case TGSI_OPCODE_UP2H:
1908 return 0;
1909 break;
1910
1911 case TGSI_OPCODE_UP2US:
1912 return 0;
1913 break;
1914
1915 case TGSI_OPCODE_UP4B:
1916 return 0;
1917 break;
1918
1919 case TGSI_OPCODE_UP4UB:
1920 return 0;
1921 break;
1922
1923 case TGSI_OPCODE_X2D:
1924 return 0;
1925 break;
1926
1927 case TGSI_OPCODE_ARA:
1928 return 0;
1929 break;
1930
1931 #if 0
1932 case TGSI_OPCODE_ARR:
1933 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1934 FETCH( func, *inst, 0, 0, chan_index );
1935 emit_rnd( func, 0, 0 );
1936 emit_f2it( func, 0 );
1937 STORE( func, *inst, 0, 0, chan_index );
1938 }
1939 break;
1940 #endif
1941 case TGSI_OPCODE_BRA:
1942 return 0;
1943 break;
1944
1945 case TGSI_OPCODE_CAL:
1946 return 0;
1947 break;
1948
1949 case TGSI_OPCODE_RET:
1950 emit_ret( func );
1951 break;
1952
1953 case TGSI_OPCODE_END:
1954 break;
1955
1956 case TGSI_OPCODE_SSG:
1957 /* TGSI_OPCODE_SGN */
1958 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1959 FETCH( func, *inst, 0, 0, chan_index );
1960 emit_sgn( func, 0, 0 );
1961 STORE( func, *inst, 0, 0, chan_index );
1962 }
1963 break;
1964
1965 case TGSI_OPCODE_CMP:
1966 emit_cmp (func, inst);
1967 break;
1968
1969 case TGSI_OPCODE_SCS:
1970 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
1971 FETCH( func, *inst, 0, 0, CHAN_X );
1972 emit_cos( func, 0 );
1973 STORE( func, *inst, 0, 0, CHAN_X );
1974 }
1975 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
1976 FETCH( func, *inst, 0, 0, CHAN_X );
1977 emit_sin( func, 0 );
1978 STORE( func, *inst, 0, 0, CHAN_Y );
1979 }
1980 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
1981 emit_tempf(
1982 func,
1983 0,
1984 TGSI_EXEC_TEMP_00000000_I,
1985 TGSI_EXEC_TEMP_00000000_C );
1986 STORE( func, *inst, 0, 0, CHAN_Z );
1987 }
1988 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
1989 emit_tempf(
1990 func,
1991 0,
1992 TEMP_ONE_I,
1993 TEMP_ONE_C );
1994 STORE( func, *inst, 0, 0, CHAN_W );
1995 }
1996 break;
1997
1998 case TGSI_OPCODE_TXB:
1999 return 0;
2000 break;
2001
2002 case TGSI_OPCODE_NRM:
2003 return 0;
2004 break;
2005
2006 case TGSI_OPCODE_DIV:
2007 return 0;
2008 break;
2009
2010 case TGSI_OPCODE_DP2:
2011 return 0;
2012 break;
2013
2014 case TGSI_OPCODE_TXL:
2015 return 0;
2016 break;
2017
2018 case TGSI_OPCODE_BRK:
2019 return 0;
2020 break;
2021
2022 case TGSI_OPCODE_IF:
2023 return 0;
2024 break;
2025
2026 case TGSI_OPCODE_LOOP:
2027 return 0;
2028 break;
2029
2030 case TGSI_OPCODE_REP:
2031 return 0;
2032 break;
2033
2034 case TGSI_OPCODE_ELSE:
2035 return 0;
2036 break;
2037
2038 case TGSI_OPCODE_ENDIF:
2039 return 0;
2040 break;
2041
2042 case TGSI_OPCODE_ENDLOOP:
2043 return 0;
2044 break;
2045
2046 case TGSI_OPCODE_ENDREP:
2047 return 0;
2048 break;
2049
2050 case TGSI_OPCODE_PUSHA:
2051 return 0;
2052 break;
2053
2054 case TGSI_OPCODE_POPA:
2055 return 0;
2056 break;
2057
2058 case TGSI_OPCODE_CEIL:
2059 return 0;
2060 break;
2061
2062 case TGSI_OPCODE_I2F:
2063 return 0;
2064 break;
2065
2066 case TGSI_OPCODE_NOT:
2067 return 0;
2068 break;
2069
2070 case TGSI_OPCODE_TRUNC:
2071 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2072 FETCH( func, *inst, 0, 0, chan_index );
2073 emit_f2it( func, 0 );
2074 emit_i2f( func, 0 );
2075 STORE( func, *inst, 0, 0, chan_index );
2076 }
2077 break;
2078
2079 case TGSI_OPCODE_SHL:
2080 return 0;
2081 break;
2082
2083 case TGSI_OPCODE_SHR:
2084 return 0;
2085 break;
2086
2087 case TGSI_OPCODE_AND:
2088 return 0;
2089 break;
2090
2091 case TGSI_OPCODE_OR:
2092 return 0;
2093 break;
2094
2095 case TGSI_OPCODE_MOD:
2096 return 0;
2097 break;
2098
2099 case TGSI_OPCODE_XOR:
2100 return 0;
2101 break;
2102
2103 case TGSI_OPCODE_SAD:
2104 return 0;
2105 break;
2106
2107 case TGSI_OPCODE_TXF:
2108 return 0;
2109 break;
2110
2111 case TGSI_OPCODE_TXQ:
2112 return 0;
2113 break;
2114
2115 case TGSI_OPCODE_CONT:
2116 return 0;
2117 break;
2118
2119 case TGSI_OPCODE_EMIT:
2120 return 0;
2121 break;
2122
2123 case TGSI_OPCODE_ENDPRIM:
2124 return 0;
2125 break;
2126
2127 default:
2128 return 0;
2129 }
2130
2131 return 1;
2132 }
2133
/**
 * Emit code to compute the value of one declared shader input register.
 *
 * Only TGSI_FILE_INPUT declarations produce code; other declaration
 * files are ignored here.  For every enabled channel (per UsageMask) of
 * every register in the declaration range, interpolation code is
 * generated according to the declaration's Interpolate mode:
 *
 *  - CONSTANT:    input = a0
 *  - LINEAR:      input = x * dadx + y * dady + a0
 *  - PERSPECTIVE: input = (x * dadx + y * dady + a0) * (1 / w)
 *
 * where x, y, w are fetched from temp 0 (the fragment position, per the
 * emit_tempf(..., 0, TGSI_SWIZZLE_*) calls) and a0/dadx/dady come from
 * the coefficient block.  The result is stored with emit_inputs().
 * XMM registers 0-5 are used as scratch.
 */
static void
emit_declaration(
   struct x86_function *func,
   struct tgsi_full_declaration *decl )
{
   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
      unsigned first, last, mask;
      unsigned i, j;

      first = decl->DeclarationRange.First;
      last = decl->DeclarationRange.Last;
      mask = decl->Declaration.UsageMask;

      for( i = first; i <= last; i++ ) {
         for( j = 0; j < NUM_CHANNELS; j++ ) {
            if( mask & (1 << j) ) {
               switch( decl->Declaration.Interpolate ) {
               case TGSI_INTERPOLATE_CONSTANT:
                  emit_coef_a0( func, 0, i, j );
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_LINEAR:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_coef_a0( func, 4, i, j );
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_inputs( func, 0, i, j );
                  break;

               case TGSI_INTERPOLATE_PERSPECTIVE:
                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
                  emit_coef_dadx( func, 1, i, j );
                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
                  emit_coef_dady( func, 3, i, j );
                  emit_mul( func, 0, 1 );    /* x * dadx */
                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
                  emit_coef_a0( func, 5, i, j );
                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
                  emit_mul( func, 2, 3 );    /* y * dady */
                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
                  emit_inputs( func, 0, i, j );
                  break;

               default:
                  /* Unknown interpolation mode -- should not happen for
                   * well-formed TGSI input.
                   */
                  assert( 0 );
                  break;
               }
            }
         }
      }
   }
}
2194
/**
 * Emit code that converts vertex data from AOS (array-of-structures,
 * one xyzw vector per vertex) layout to SOA (structure-of-arrays,
 * x[4] y[4] z[4] w[4]) layout, i.e. a 4x4 float transpose per group of
 * four vertices.
 *
 * The arg_* parameters are 1-based indices of the generated function's
 * own arguments (fetched with x86_fn_arg at code-generation time):
 *   arg_aos    - pointer to the AOS input vertices
 *   arg_soa    - pointer to the SOA output buffer
 *   arg_num    - number of 4-vertex groups to convert (must be >= 1;
 *                the loop is do/while style and always runs once)
 *   arg_stride - byte stride between consecutive AOS vertices
 *
 * Uses EAX/EBX/ECX/EDX and XMM0-5 as scratch; EBX is callee-saved and
 * therefore pushed/popped around the whole routine.
 */
static void aos_to_soa( struct x86_function *func,
                        uint arg_aos,
                        uint arg_soa,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
   int inner_loop;


   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) );
   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
   x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Preserve the group's base pointer; the loads below walk
       * aos_input forward by one vertex stride at a time.
       */
      x86_push( func, aos_input );
      /* Gather xy pairs into xmm0/xmm1 and zw pairs into xmm3/xmm4:
       * low halves from vertices 0 and 2, high halves from 1 and 3.
       */
      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
      x86_pop( func, aos_input );

      /* Finish the transpose: shuffle even lanes (0x88) into the x/z
       * vectors and odd lanes (0xdd) into the y/w vectors.
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );

      /* Write out x[4], y[4], z[4], w[4] contiguously. */
      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

      /* Advance to next input */
      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
   }
   /* while --num_inputs */
   x86_dec( func, num_inputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, aos_input );
}
2256
/**
 * Emit code that converts shader output data from SOA layout
 * (x[4] y[4] z[4] w[4]) back to AOS layout (one xyzw vector per
 * vertex) -- the inverse transpose of aos_to_soa().
 *
 * aos/soa/num/stride are 1-based indices of the generated function's
 * arguments (fetched via x86_fn_arg at code-generation time):
 *   aos    - pointer to the AOS destination vertices
 *   soa    - pointer to the SOA source buffer
 *   num    - number of 4-vertex groups (must be >= 1; do/while loop)
 *   stride - byte stride between consecutive AOS vertices
 *
 * Uses EAX/EBX/ECX/EDX and XMM0-5 as scratch; EBX is callee-saved and
 * therefore pushed/popped around the whole routine.
 */
static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
{
   struct x86_reg soa_output;
   struct x86_reg aos_output;
   struct x86_reg num_outputs;
   struct x86_reg temp;
   int inner_loop;

   soa_output = x86_make_reg( file_REG32, reg_AX );
   aos_output = x86_make_reg( file_REG32, reg_BX );
   num_outputs = x86_make_reg( file_REG32, reg_CX );
   temp = x86_make_reg( file_REG32, reg_DX );

   /* Save EBX */
   x86_push( func, aos_output );

   x86_mov( func, soa_output, x86_fn_arg( func, soa ) );
   x86_mov( func, aos_output, x86_fn_arg( func, aos ) );
   x86_mov( func, num_outputs, x86_fn_arg( func, num ) );

   /* do */
   inner_loop = x86_get_label( func );
   {
      /* Load the four SOA channel vectors. */
      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );

      /* Interleave x/y into xmm0 (lo) / xmm2 (hi) and z/w into
       * xmm3 (lo) / xmm5 (hi), forming per-vertex xy and zw pairs.
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );

      /* NOTE(review): the stride argument is reloaded on every loop
       * iteration here, while aos_to_soa() loads it once up front;
       * looks loop-invariant (nothing below clobbers EDX) -- confirm
       * before hoisting, since it changes the emitted code.
       */
      x86_mov( func, temp, x86_fn_arg( func, stride ) );
      /* Preserve the group's base pointer; the stores below walk
       * aos_output forward by one vertex stride at a time.
       */
      x86_push( func, aos_output );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
      x86_add( func, aos_output, temp );
      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_add( func, aos_output, temp );
      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
      x86_pop( func, aos_output );

      /* Advance to next output */
      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
   }
   /* while --num_outputs */
   x86_dec( func, num_outputs );
   x86_jcc( func, cc_NE, inner_loop );

   /* Restore EBX */
   x86_pop( func, aos_output );
}
2318
2319 /**
2320 * Translate a TGSI vertex/fragment shader to SSE2 code.
2321 * Slightly different things are done for vertex vs. fragment shaders.
2322 *
2323 * Note that fragment shaders are responsible for interpolating shader
2324 * inputs. Because on x86 we have only 4 GP registers, and here we
2325 * have 5 shader arguments (input, output, const, temp and coef), the
2326 * code is split into two phases -- DECLARATION and INSTRUCTION phase.
2327 * GP register holding the output argument is aliased with the coeff
2328 * argument, as outputs are not needed in the DECLARATION phase.
2329 *
2330 * \param tokens the TGSI input shader
2331 * \param func the output SSE code/function
2332 * \param immediates buffer to place immediates, later passed to SSE func
 * \return 1 for success, 0 if translation failed
2334 */
unsigned
tgsi_emit_sse2(
   const struct tgsi_token *tokens,
   struct x86_function *func,
   float (*immediates)[4],
   boolean do_swizzles )
{
   struct tgsi_parse_context parse;
   boolean instruction_phase = FALSE;   /* fragment shaders: FALSE until the
                                         * first instruction token is seen */
   unsigned ok = 1;
   uint num_immediates = 0;

   util_init_math();

   /* Start emitting at the beginning of the function's code store. */
   func->csr = func->store;

   tgsi_parse_init( &parse, tokens );

   /* Can't just use EDI, EBX without save/restoring them:
    */
   x86_push(
      func,
      get_immediate_base() );

   x86_push(
      func,
      get_temp_base() );


   /*
    * Different function args for vertex/fragment shaders:
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
      /* DECLARATION phase, do not load output argument. */
      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      /* skipping outputs argument here */
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_coef_base(),
         x86_fn_arg( func, 5 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 6 ) );
   }
   else {
      assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);

      /* Vertex shaders optionally transpose the incoming AOS vertex
       * data into the machine's SOA input buffer first.
       */
      if (do_swizzles)
         aos_to_soa( func,
                     6,         /* aos_input */
                     1,         /* machine->input */
                     7,         /* num_inputs */
                     8 );       /* input_stride */

      x86_mov(
         func,
         get_input_base(),
         x86_fn_arg( func, 1 ) );
      x86_mov(
         func,
         get_output_base(),
         x86_fn_arg( func, 2 ) );
      x86_mov(
         func,
         get_const_base(),
         x86_fn_arg( func, 3 ) );
      x86_mov(
         func,
         get_temp_base(),
         x86_fn_arg( func, 4 ) );
      x86_mov(
         func,
         get_immediate_base(),
         x86_fn_arg( func, 5 ) );
   }

   /* Main token loop: stop early if any instruction fails to translate
    * (ok becomes 0) and report the failure to the caller.
    */
   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
      tgsi_parse_token( &parse );

      switch( parse.FullToken.Token.Type ) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         /* Only fragment shaders generate interpolation code for
          * declarations; vertex shader inputs arrive ready to use.
          */
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            emit_declaration(
               func,
               &parse.FullToken.FullDeclaration );
         }
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
            if( !instruction_phase ) {
               /* INSTRUCTION phase, overwrite coeff with output. */
               instruction_phase = TRUE;
               x86_mov(
                  func,
                  get_output_base(),
                  x86_fn_arg( func, 2 ) );
            }
         }

         ok = emit_instruction(
            func,
            &parse.FullToken.FullInstruction );

         if (!ok) {
            debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n",
                         parse.FullToken.FullInstruction.Instruction.Opcode,
                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
                         "vertex shader" : "fragment shader");
         }
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            /* Size counts the immediate header token too, hence -1. */
            const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
            uint i;
            assert(size <= 4);
            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
            for( i = 0; i < size; i++ ) {
               immediates[num_immediates][i] =
		  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
            }
#if 0
            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
                   num_immediates,
                   immediates[num_immediates][0],
                   immediates[num_immediates][1],
                   immediates[num_immediates][2],
                   immediates[num_immediates][3]);
#endif
            num_immediates++;
         }
         break;

      default:
         ok = 0;
         assert( 0 );
      }
   }

   /* Vertex shaders optionally transpose SOA results back to the
    * caller's AOS output layout.
    */
   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
      if (do_swizzles)
         soa_to_aos( func, 9, 2, 10, 11 );
   }

   /* Can't just use EBX, EDI without save/restoring them:
    */
   x86_pop(
      func,
      get_temp_base() );

   x86_pop(
      func,
      get_immediate_base() );

   emit_ret( func );

   tgsi_parse_free( &parse );

   return ok;
}
2509
2510 #endif /* PIPE_ARCH_X86 */
2511