92ba8b8f033225fa008c9c3b558acc4aec4bd894
[mesa.git] / src / gallium / auxiliary / tgsi / tgsi_sse2.c
1 /**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc. All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 #include "pipe/p_config.h"
30
31 #if defined(PIPE_ARCH_X86)
32
33 #include "util/u_debug.h"
34 #include "pipe/p_shader_tokens.h"
35 #include "util/u_math.h"
36 #include "util/u_memory.h"
37 #if defined(PIPE_ARCH_SSE)
38 #include "util/u_sse.h"
39 #endif
40 #include "tgsi/tgsi_info.h"
41 #include "tgsi/tgsi_parse.h"
42 #include "tgsi/tgsi_util.h"
43 #include "tgsi/tgsi_dump.h"
44 #include "tgsi/tgsi_exec.h"
45 #include "tgsi/tgsi_sse2.h"
46
47 #include "rtasm/rtasm_x86sse.h"
48
49 /* for 1/sqrt()
50 *
51 * This costs about 100fps (close to 10%) in gears:
52 */
53 #define HIGH_PRECISION 1
54
55 #define FAST_MATH 1
56
57
58 #define FOR_EACH_CHANNEL( CHAN )\
59 for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
60
61 #define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
62 ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))
63
64 #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
65 if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
66
67 #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
68 FOR_EACH_CHANNEL( CHAN )\
69 IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
70
71 #define CHAN_X 0
72 #define CHAN_Y 1
73 #define CHAN_Z 2
74 #define CHAN_W 3
75
76 #define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I
77 #define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C
78
79 #define TEMP_R0 TGSI_EXEC_TEMP_R0
80 #define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
81 #define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
82 #define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
83
84
85 /**
86 * X86 utility functions.
87 */
88
89 static struct x86_reg
90 make_xmm(
91 unsigned xmm )
92 {
93 return x86_make_reg(
94 file_XMM,
95 (enum x86_reg_name) xmm );
96 }
97
98 /**
99 * X86 register mapping helpers.
100 */
101
102 static struct x86_reg
103 get_const_base( void )
104 {
105 return x86_make_reg(
106 file_REG32,
107 reg_AX );
108 }
109
110 static struct x86_reg
111 get_machine_base( void )
112 {
113 return x86_make_reg(
114 file_REG32,
115 reg_CX );
116 }
117
118 static struct x86_reg
119 get_input_base( void )
120 {
121 return x86_make_disp(
122 get_machine_base(),
123 Offset(struct tgsi_exec_machine, Inputs) );
124 }
125
126 static struct x86_reg
127 get_output_base( void )
128 {
129 return x86_make_disp(
130 get_machine_base(),
131 Offset(struct tgsi_exec_machine, Outputs) );
132 }
133
134 static struct x86_reg
135 get_temp_base( void )
136 {
137 return x86_make_disp(
138 get_machine_base(),
139 Offset(struct tgsi_exec_machine, Temps) );
140 }
141
142 static struct x86_reg
143 get_coef_base( void )
144 {
145 return x86_make_reg(
146 file_REG32,
147 reg_BX );
148 }
149
150 static struct x86_reg
151 get_sampler_base( void )
152 {
153 return x86_make_reg(
154 file_REG32,
155 reg_DI );
156 }
157
158 static struct x86_reg
159 get_immediate_base( void )
160 {
161 return x86_make_reg(
162 file_REG32,
163 reg_DX );
164 }
165
166 static struct x86_reg
167 get_system_value_base( void )
168 {
169 return x86_make_disp(
170 get_machine_base(),
171 Offset(struct tgsi_exec_machine, SystemValue) );
172 }
173
174
175 /**
176 * Data access helpers.
177 */
178
179
180 static struct x86_reg
181 get_immediate(
182 unsigned vec,
183 unsigned chan )
184 {
185 return x86_make_disp(
186 get_immediate_base(),
187 (vec * 4 + chan) * 4 );
188 }
189
190 static struct x86_reg
191 get_const(
192 unsigned vec,
193 unsigned chan )
194 {
195 return x86_make_disp(
196 get_const_base(),
197 (vec * 4 + chan) * 4 );
198 }
199
200 static struct x86_reg
201 get_sampler_ptr(
202 unsigned unit )
203 {
204 return x86_make_disp(
205 get_sampler_base(),
206 unit * sizeof( struct tgsi_sampler * ) );
207 }
208
209 static struct x86_reg
210 get_input(
211 unsigned vec,
212 unsigned chan )
213 {
214 return x86_make_disp(
215 get_input_base(),
216 (vec * 4 + chan) * 16 );
217 }
218
219 static struct x86_reg
220 get_output(
221 unsigned vec,
222 unsigned chan )
223 {
224 return x86_make_disp(
225 get_output_base(),
226 (vec * 4 + chan) * 16 );
227 }
228
229 static struct x86_reg
230 get_temp(
231 unsigned vec,
232 unsigned chan )
233 {
234 return x86_make_disp(
235 get_temp_base(),
236 (vec * 4 + chan) * 16 );
237 }
238
239 static struct x86_reg
240 get_system_value(
241 unsigned vec,
242 unsigned chan )
243 {
244 return x86_make_disp(
245 get_system_value_base(), /* base */
246 (vec * 4 + chan) * 4 ); /* byte offset from base */
247 }
248
249 static struct x86_reg
250 get_coef(
251 unsigned vec,
252 unsigned chan,
253 unsigned member )
254 {
255 return x86_make_disp(
256 get_coef_base(),
257 ((vec * 3 + member) * 4 + chan) * 4 );
258 }
259
260
/** Emit a function return instruction. */
static void
emit_ret( struct x86_function *func )
{
   x86_ret( func );
}
267
268
269 /**
270 * Data fetch helpers.
271 */
272
273 /**
274 * Copy a shader constant to xmm register
275 * \param xmm the destination xmm register
276 * \param vec the src const buffer index
277 * \param chan src channel to fetch (X, Y, Z or W)
278 */
279 static void
280 emit_const(
281 struct x86_function *func,
282 uint xmm,
283 int vec,
284 uint chan,
285 uint indirect,
286 uint indirectFile,
287 int indirectIndex )
288 {
289 if (indirect) {
290 /* 'vec' is the offset from the address register's value.
291 * We're loading CONST[ADDR+vec] into an xmm register.
292 */
293 struct x86_reg r0 = get_immediate_base();
294 struct x86_reg r1 = get_coef_base();
295 uint i;
296
297 assert( indirectFile == TGSI_FILE_ADDRESS );
298 assert( indirectIndex == 0 );
299 assert( r0.mod == mod_REG );
300 assert( r1.mod == mod_REG );
301
302 x86_push( func, r0 );
303 x86_push( func, r1 );
304
305 /*
306 * Loop over the four pixels or vertices in the quad.
307 * Get the value of the address (offset) register for pixel/vertex[i],
308 * add it to the src offset and index into the constant buffer.
309 * Note that we're working on SOA data.
310 * If any of the pixel/vertex execution channels are unused their
311 * values will be garbage. It's very important that we don't use
312 * those garbage values as indexes into the constant buffer since
313 * that'll cause segfaults.
314 * The solution is to bitwise-AND the offset with the execution mask
315 * register whose values are either 0 or ~0.
316 * The caller must setup the execution mask register to indicate
317 * which channels are valid/alive before running the shader.
318 * The execution mask will also figure into loops and conditionals
319 * someday.
320 */
321 for (i = 0; i < QUAD_SIZE; i++) {
322 /* r1 = address register[i] */
323 x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
324 /* r0 = execution mask[i] */
325 x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
326 /* r1 = r1 & r0 */
327 x86_and( func, r1, r0 );
328 /* r0 = 'vec', the offset */
329 x86_lea( func, r0, get_const( vec, chan ) );
330
331 /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
332 */
333 x86_add( func, r1, r1 );
334 x86_add( func, r1, r1 );
335 x86_add( func, r1, r1 );
336 x86_add( func, r1, r1 );
337
338 x86_add( func, r0, r1 ); /* r0 = r0 + r1 */
339 x86_mov( func, r1, x86_deref( r0 ) );
340 x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
341 }
342
343 x86_pop( func, r1 );
344 x86_pop( func, r0 );
345
346 sse_movaps(
347 func,
348 make_xmm( xmm ),
349 get_temp( TEMP_R0, CHAN_X ) );
350 }
351 else {
352 /* 'vec' is the index into the src register file, such as TEMP[vec] */
353 assert( vec >= 0 );
354
355 sse_movss(
356 func,
357 make_xmm( xmm ),
358 get_const( vec, chan ) );
359 sse_shufps(
360 func,
361 make_xmm( xmm ),
362 make_xmm( xmm ),
363 SHUF( 0, 0, 0, 0 ) );
364 }
365 }
366
367 static void
368 emit_immediate(
369 struct x86_function *func,
370 unsigned xmm,
371 unsigned vec,
372 unsigned chan )
373 {
374 sse_movss(
375 func,
376 make_xmm( xmm ),
377 get_immediate( vec, chan ) );
378 sse_shufps(
379 func,
380 make_xmm( xmm ),
381 make_xmm( xmm ),
382 SHUF( 0, 0, 0, 0 ) );
383 }
384
385
386 /**
387 * Copy a shader input to xmm register
388 * \param xmm the destination xmm register
389 * \param vec the src input attrib
390 * \param chan src channel to fetch (X, Y, Z or W)
391 */
/**
 * Load a shader input channel (SOA vector) into an xmm register.
 * \param xmm   destination xmm register
 * \param vec   source input attrib
 * \param chan  channel to fetch (X, Y, Z or W)
 */
static void
emit_inputf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   /* movups: input area alignment is not guaranteed */
   sse_movups( func, make_xmm( xmm ), get_input( vec, chan ) );
}

/**
 * Store an xmm register to a shader output channel.
 * \param xmm   source xmm register
 * \param vec   destination output attrib
 * \param chan  channel to store (X, Y, Z or W)
 */
static void
emit_output(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movups( func, get_output( vec, chan ), make_xmm( xmm ) );
}

/**
 * Load a shader temporary channel (SOA vector, 16-byte aligned) into an
 * xmm register.
 * \param xmm   destination xmm register
 * \param vec   source temp register
 * \param chan  channel to fetch (X, Y, Z or W)
 */
static void
emit_tempf(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   sse_movaps( func, make_xmm( xmm ), get_temp( vec, chan ) );
}
442
443 /**
444 * Copy a system value to xmm register
445 * \param xmm the destination xmm register
446 * \param vec the source system value register
447 * \param chan src channel to fetch (X, Y, Z or W)
448 */
449 static void
450 emit_system_value(
451 struct x86_function *func,
452 unsigned xmm,
453 unsigned vec,
454 unsigned chan )
455 {
456 sse_movss(
457 func,
458 make_xmm( xmm ),
459 get_system_value( vec, chan ) );
460 sse_shufps(
461 func,
462 make_xmm( xmm ),
463 make_xmm( xmm ),
464 SHUF( 0, 0, 0, 0 ) );
465 }
466
467 /**
468 * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
469 * \param xmm the destination xmm register
470 * \param vec the src input/attribute coefficient index
471 * \param chan src channel to fetch (X, Y, Z or W)
472 * \param member 0=a0, 1=dadx, 2=dady
473 */
474 static void
475 emit_coef(
476 struct x86_function *func,
477 unsigned xmm,
478 unsigned vec,
479 unsigned chan,
480 unsigned member )
481 {
482 sse_movss(
483 func,
484 make_xmm( xmm ),
485 get_coef( vec, chan, member ) );
486 sse_shufps(
487 func,
488 make_xmm( xmm ),
489 make_xmm( xmm ),
490 SHUF( 0, 0, 0, 0 ) );
491 }
492
493 /**
494 * Data store helpers.
495 */
496
497 static void
498 emit_inputs(
499 struct x86_function *func,
500 unsigned xmm,
501 unsigned vec,
502 unsigned chan )
503 {
504 sse_movups(
505 func,
506 get_input( vec, chan ),
507 make_xmm( xmm ) );
508 }
509
510 static void
511 emit_temps(
512 struct x86_function *func,
513 unsigned xmm,
514 unsigned vec,
515 unsigned chan )
516 {
517 sse_movaps(
518 func,
519 get_temp( vec, chan ),
520 make_xmm( xmm ) );
521 }
522
523 static void
524 emit_addrs(
525 struct x86_function *func,
526 unsigned xmm,
527 unsigned vec,
528 unsigned chan )
529 {
530 assert( vec == 0 );
531
532 emit_temps(
533 func,
534 xmm,
535 vec + TGSI_EXEC_TEMP_ADDR,
536 chan );
537 }
538
539 /**
540 * Coefficent fetch helpers.
541 */
542
/** Fetch the a0 (constant) interpolation coefficient. */
static void
emit_coef_a0(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 0 );
}

/** Fetch the dadx (x-derivative) interpolation coefficient. */
static void
emit_coef_dadx(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 1 );
}

/** Fetch the dady (y-derivative) interpolation coefficient. */
static void
emit_coef_dady(
   struct x86_function *func,
   unsigned xmm,
   unsigned vec,
   unsigned chan )
{
   emit_coef( func, xmm, vec, chan, 2 );
}
587
588 /**
589 * Function call helpers.
590 */
591
592 /**
593 * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be
594 * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
595 * that the stack pointer is 16 byte aligned, as expected.
596 */
597 static void
598 emit_func_call(
599 struct x86_function *func,
600 unsigned xmm_save_mask,
601 const struct x86_reg *arg,
602 unsigned nr_args,
603 void (PIPE_CDECL *code)() )
604 {
605 struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
606 unsigned i, n;
607
608 x86_push(
609 func,
610 x86_make_reg( file_REG32, reg_AX) );
611 x86_push(
612 func,
613 x86_make_reg( file_REG32, reg_CX) );
614 x86_push(
615 func,
616 x86_make_reg( file_REG32, reg_DX) );
617
618 /* Store XMM regs to the stack
619 */
620 for(i = 0, n = 0; i < 8; ++i)
621 if(xmm_save_mask & (1 << i))
622 ++n;
623
624 x86_sub_imm(
625 func,
626 x86_make_reg( file_REG32, reg_SP ),
627 n*16);
628
629 for(i = 0, n = 0; i < 8; ++i)
630 if(xmm_save_mask & (1 << i)) {
631 sse_movups(
632 func,
633 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
634 make_xmm( i ) );
635 ++n;
636 }
637
638 for (i = 0; i < nr_args; i++) {
639 /* Load the address of the buffer we use for passing arguments and
640 * receiving results:
641 */
642 x86_lea(
643 func,
644 ecx,
645 arg[i] );
646
647 /* Push actual function arguments (currently just the pointer to
648 * the buffer above), and call the function:
649 */
650 x86_push( func, ecx );
651 }
652
653 x86_mov_reg_imm( func, ecx, (unsigned long) code );
654 x86_call( func, ecx );
655
656 /* Pop the arguments (or just add an immediate to esp)
657 */
658 for (i = 0; i < nr_args; i++) {
659 x86_pop(func, ecx );
660 }
661
662 /* Pop the saved XMM regs:
663 */
664 for(i = 0, n = 0; i < 8; ++i)
665 if(xmm_save_mask & (1 << i)) {
666 sse_movups(
667 func,
668 make_xmm( i ),
669 x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
670 ++n;
671 }
672
673 x86_add_imm(
674 func,
675 x86_make_reg( file_REG32, reg_SP ),
676 n*16);
677
678 /* Restore GP registers in a reverse order.
679 */
680 x86_pop(
681 func,
682 x86_make_reg( file_REG32, reg_DX) );
683 x86_pop(
684 func,
685 x86_make_reg( file_REG32, reg_CX) );
686 x86_pop(
687 func,
688 x86_make_reg( file_REG32, reg_AX) );
689 }
690
691 static void
692 emit_func_call_dst_src1(
693 struct x86_function *func,
694 unsigned xmm_save,
695 unsigned xmm_dst,
696 unsigned xmm_src0,
697 void (PIPE_CDECL *code)() )
698 {
699 struct x86_reg store = get_temp( TEMP_R0, 0 );
700 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
701
702 /* Store our input parameters (in xmm regs) to the buffer we use
703 * for passing arguments. We will pass a pointer to this buffer as
704 * the actual function argument.
705 */
706 sse_movaps(
707 func,
708 store,
709 make_xmm( xmm_src0 ) );
710
711 emit_func_call( func,
712 xmm_mask,
713 &store,
714 1,
715 code );
716
717 sse_movaps(
718 func,
719 make_xmm( xmm_dst ),
720 store );
721 }
722
723
724 static void
725 emit_func_call_dst_src2(
726 struct x86_function *func,
727 unsigned xmm_save,
728 unsigned xmm_dst,
729 unsigned xmm_src0,
730 unsigned xmm_src1,
731 void (PIPE_CDECL *code)() )
732 {
733 struct x86_reg store = get_temp( TEMP_R0, 0 );
734 unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
735
736 /* Store two inputs to parameter buffer.
737 */
738 sse_movaps(
739 func,
740 store,
741 make_xmm( xmm_src0 ) );
742
743 sse_movaps(
744 func,
745 x86_make_disp( store, 4 * sizeof(float) ),
746 make_xmm( xmm_src1 ) );
747
748
749 /* Emit the call
750 */
751 emit_func_call( func,
752 xmm_mask,
753 &store,
754 1,
755 code );
756
757 /* Retrieve the results:
758 */
759 sse_movaps(
760 func,
761 make_xmm( xmm_dst ),
762 store );
763 }
764
765
766
767
768
769 #if defined(PIPE_ARCH_SSE)
770
771 /*
772 * Fast SSE2 implementation of special math functions.
773 */
774
775 #define POLY0(x, c0) _mm_set1_ps(c0)
776 #define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
777 #define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
778 #define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
779 #define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
780 #define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
781
782 #define EXP_POLY_DEGREE 3
783 #define LOG_POLY_DEGREE 5
784
785 /**
786 * See http://www.devmaster.net/forums/showthread.php?p=43580
787 */
788 static INLINE __m128
789 exp2f4(__m128 x)
790 {
791 __m128i ipart;
792 __m128 fpart, expipart, expfpart;
793
794 x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
795 x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
796
797 /* ipart = int(x - 0.5) */
798 ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
799
800 /* fpart = x - ipart */
801 fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
802
803 /* expipart = (float) (1 << ipart) */
804 expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
805
806 /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
807 #if EXP_POLY_DEGREE == 5
808 expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
809 #elif EXP_POLY_DEGREE == 4
810 expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
811 #elif EXP_POLY_DEGREE == 3
812 expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
813 #elif EXP_POLY_DEGREE == 2
814 expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
815 #else
816 #error
817 #endif
818
819 return _mm_mul_ps(expipart, expfpart);
820 }
821
822
823 /**
824 * See http://www.devmaster.net/forums/showthread.php?p=43580
825 */
826 static INLINE __m128
827 log2f4(__m128 x)
828 {
829 __m128i expmask = _mm_set1_epi32(0x7f800000);
830 __m128i mantmask = _mm_set1_epi32(0x007fffff);
831 __m128 one = _mm_set1_ps(1.0f);
832
833 __m128i i = _mm_castps_si128(x);
834
835 /* exp = (float) exponent(x) */
836 __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
837
838 /* mant = (float) mantissa(x) */
839 __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
840
841 __m128 logmant;
842
843 /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
844 * These coefficients can be generate with
845 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
846 */
847 #if LOG_POLY_DEGREE == 6
848 logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
849 #elif LOG_POLY_DEGREE == 5
850 logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
851 #elif LOG_POLY_DEGREE == 4
852 logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
853 #elif LOG_POLY_DEGREE == 3
854 logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
855 #else
856 #error
857 #endif
858
859 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
860 logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
861
862 return _mm_add_ps(logmant, exp);
863 }
864
865
866 static INLINE __m128
867 powf4(__m128 x, __m128 y)
868 {
869 return exp2f4(_mm_mul_ps(log2f4(x), y));
870 }
871
872 #endif /* PIPE_ARCH_SSE */
873
874
875
876 /**
877 * Low-level instruction translators.
878 */
879
880 static void
881 emit_abs(
882 struct x86_function *func,
883 unsigned xmm )
884 {
885 sse_andps(
886 func,
887 make_xmm( xmm ),
888 get_temp(
889 TGSI_EXEC_TEMP_7FFFFFFF_I,
890 TGSI_EXEC_TEMP_7FFFFFFF_C ) );
891 }
892
893 static void
894 emit_add(
895 struct x86_function *func,
896 unsigned xmm_dst,
897 unsigned xmm_src )
898 {
899 sse_addps(
900 func,
901 make_xmm( xmm_dst ),
902 make_xmm( xmm_src ) );
903 }
904
905 static void PIPE_CDECL
906 cos4f(
907 float *store )
908 {
909 store[0] = cosf( store[0] );
910 store[1] = cosf( store[1] );
911 store[2] = cosf( store[2] );
912 store[3] = cosf( store[3] );
913 }
914
915 static void
916 emit_cos(
917 struct x86_function *func,
918 unsigned xmm_save,
919 unsigned xmm_dst )
920 {
921 emit_func_call_dst_src1(
922 func,
923 xmm_save,
924 xmm_dst,
925 xmm_dst,
926 cos4f );
927 }
928
929 static void PIPE_CDECL
930 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
931 __attribute__((force_align_arg_pointer))
932 #endif
933 ex24f(
934 float *store )
935 {
936 #if defined(PIPE_ARCH_SSE)
937 _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
938 #else
939 store[0] = util_fast_exp2( store[0] );
940 store[1] = util_fast_exp2( store[1] );
941 store[2] = util_fast_exp2( store[2] );
942 store[3] = util_fast_exp2( store[3] );
943 #endif
944 }
945
946 static void
947 emit_ex2(
948 struct x86_function *func,
949 unsigned xmm_save,
950 unsigned xmm_dst )
951 {
952 emit_func_call_dst_src1(
953 func,
954 xmm_save,
955 xmm_dst,
956 xmm_dst,
957 ex24f );
958 }
959
/** xmm = (int)xmm, packed, truncating toward zero (cvttps2dq). */
static void
emit_f2it(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvttps2dq( func, make_xmm( xmm ), make_xmm( xmm ) );
}

/** xmm = (float)xmm, packed int-to-float (cvtdq2ps). */
static void
emit_i2f(
   struct x86_function *func,
   unsigned xmm )
{
   sse2_cvtdq2ps( func, make_xmm( xmm ), make_xmm( xmm ) );
}
981
982 static void PIPE_CDECL
983 flr4f(
984 float *store )
985 {
986 store[0] = floorf( store[0] );
987 store[1] = floorf( store[1] );
988 store[2] = floorf( store[2] );
989 store[3] = floorf( store[3] );
990 }
991
992 static void
993 emit_flr(
994 struct x86_function *func,
995 unsigned xmm_save,
996 unsigned xmm_dst )
997 {
998 emit_func_call_dst_src1(
999 func,
1000 xmm_save,
1001 xmm_dst,
1002 xmm_dst,
1003 flr4f );
1004 }
1005
1006 static void PIPE_CDECL
1007 frc4f(
1008 float *store )
1009 {
1010 store[0] -= floorf( store[0] );
1011 store[1] -= floorf( store[1] );
1012 store[2] -= floorf( store[2] );
1013 store[3] -= floorf( store[3] );
1014 }
1015
1016 static void
1017 emit_frc(
1018 struct x86_function *func,
1019 unsigned xmm_save,
1020 unsigned xmm_dst )
1021 {
1022 emit_func_call_dst_src1(
1023 func,
1024 xmm_save,
1025 xmm_dst,
1026 xmm_dst,
1027 frc4f );
1028 }
1029
1030 static void PIPE_CDECL
1031 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1032 __attribute__((force_align_arg_pointer))
1033 #endif
1034 lg24f(
1035 float *store )
1036 {
1037 #if defined(PIPE_ARCH_SSE)
1038 _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
1039 #else
1040 store[0] = util_fast_log2( store[0] );
1041 store[1] = util_fast_log2( store[1] );
1042 store[2] = util_fast_log2( store[2] );
1043 store[3] = util_fast_log2( store[3] );
1044 #endif
1045 }
1046
1047 static void
1048 emit_lg2(
1049 struct x86_function *func,
1050 unsigned xmm_save,
1051 unsigned xmm_dst )
1052 {
1053 emit_func_call_dst_src1(
1054 func,
1055 xmm_save,
1056 xmm_dst,
1057 xmm_dst,
1058 lg24f );
1059 }
1060
1061 static void
1062 emit_MOV(
1063 struct x86_function *func,
1064 unsigned xmm_dst,
1065 unsigned xmm_src )
1066 {
1067 sse_movups(
1068 func,
1069 make_xmm( xmm_dst ),
1070 make_xmm( xmm_src ) );
1071 }
1072
1073 static void
1074 emit_mul (struct x86_function *func,
1075 unsigned xmm_dst,
1076 unsigned xmm_src)
1077 {
1078 sse_mulps(
1079 func,
1080 make_xmm( xmm_dst ),
1081 make_xmm( xmm_src ) );
1082 }
1083
1084 static void
1085 emit_neg(
1086 struct x86_function *func,
1087 unsigned xmm )
1088 {
1089 sse_xorps(
1090 func,
1091 make_xmm( xmm ),
1092 get_temp(
1093 TGSI_EXEC_TEMP_80000000_I,
1094 TGSI_EXEC_TEMP_80000000_C ) );
1095 }
1096
1097 static void PIPE_CDECL
1098 #if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
1099 __attribute__((force_align_arg_pointer))
1100 #endif
1101 pow4f(
1102 float *store )
1103 {
1104 #if defined(PIPE_ARCH_SSE)
1105 _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
1106 #else
1107 store[0] = util_fast_pow( store[0], store[4] );
1108 store[1] = util_fast_pow( store[1], store[5] );
1109 store[2] = util_fast_pow( store[2], store[6] );
1110 store[3] = util_fast_pow( store[3], store[7] );
1111 #endif
1112 }
1113
1114 static void
1115 emit_pow(
1116 struct x86_function *func,
1117 unsigned xmm_save,
1118 unsigned xmm_dst,
1119 unsigned xmm_src0,
1120 unsigned xmm_src1 )
1121 {
1122 emit_func_call_dst_src2(
1123 func,
1124 xmm_save,
1125 xmm_dst,
1126 xmm_src0,
1127 xmm_src1,
1128 pow4f );
1129 }
1130
1131 static void
1132 emit_rcp (
1133 struct x86_function *func,
1134 unsigned xmm_dst,
1135 unsigned xmm_src )
1136 {
1137 /* On Intel CPUs at least, this is only accurate to 12 bits -- not
1138 * good enough. Need to either emit a proper divide or use the
1139 * iterative technique described below in emit_rsqrt().
1140 */
1141 sse2_rcpps(
1142 func,
1143 make_xmm( xmm_dst ),
1144 make_xmm( xmm_src ) );
1145 }
1146
1147 static void PIPE_CDECL
1148 rnd4f(
1149 float *store )
1150 {
1151 store[0] = floorf( store[0] + 0.5f );
1152 store[1] = floorf( store[1] + 0.5f );
1153 store[2] = floorf( store[2] + 0.5f );
1154 store[3] = floorf( store[3] + 0.5f );
1155 }
1156
1157 static void
1158 emit_rnd(
1159 struct x86_function *func,
1160 unsigned xmm_save,
1161 unsigned xmm_dst )
1162 {
1163 emit_func_call_dst_src1(
1164 func,
1165 xmm_save,
1166 xmm_dst,
1167 xmm_dst,
1168 rnd4f );
1169 }
1170
/** xmm_dst = 1/sqrt(xmm_src), packed.
 *
 * With HIGH_PRECISION, the ~12-bit rsqrtps estimate is refined by one
 * Newton/Raphson step:
 *
 *   x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a)) * rsqrtps(a)]
 *
 * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
 * Clobbers xmm2 and xmm3; xmm_src is destroyed.
 */
static void
emit_rsqrt(
   struct x86_function *func,
   unsigned xmm_dst,
   unsigned xmm_src )
{
#if HIGH_PRECISION
   {
      struct x86_reg dst = make_xmm( xmm_dst );
      struct x86_reg src = make_xmm( xmm_src );
      struct x86_reg half_three = make_xmm( 2 );  /* holds 3.0 */
      struct x86_reg estimate   = make_xmm( 3 );  /* rsqrtps(a) */

      assert( xmm_dst != xmm_src );
      assert( xmm_dst != 2 && xmm_dst != 3 );
      assert( xmm_src != 2 && xmm_src != 3 );

      sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
      sse_movaps( func, half_three, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
      sse_rsqrtps( func, estimate, src );          /* est = rsqrt(a) */
      sse_mulps( func, src, estimate );            /* src = a*est */
      sse_mulps( func, dst, estimate );            /* dst = 0.5*est */
      sse_mulps( func, src, estimate );            /* src = a*est*est */
      sse_subps( func, half_three, src );          /* t = 3 - a*est*est */
      sse_mulps( func, dst, half_three );          /* dst = 0.5*est*t */
   }
#else
   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
    * good enough.
    */
   sse_rsqrtps( func, make_xmm( xmm_dst ), make_xmm( xmm_src ) );
#endif
}
1216
1217 static void
1218 emit_setsign(
1219 struct x86_function *func,
1220 unsigned xmm )
1221 {
1222 sse_orps(
1223 func,
1224 make_xmm( xmm ),
1225 get_temp(
1226 TGSI_EXEC_TEMP_80000000_I,
1227 TGSI_EXEC_TEMP_80000000_C ) );
1228 }
1229
1230 static void PIPE_CDECL
1231 sgn4f(
1232 float *store )
1233 {
1234 store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
1235 store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
1236 store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
1237 store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
1238 }
1239
1240 static void
1241 emit_sgn(
1242 struct x86_function *func,
1243 unsigned xmm_save,
1244 unsigned xmm_dst )
1245 {
1246 emit_func_call_dst_src1(
1247 func,
1248 xmm_save,
1249 xmm_dst,
1250 xmm_dst,
1251 sgn4f );
1252 }
1253
1254 static void PIPE_CDECL
1255 sin4f(
1256 float *store )
1257 {
1258 store[0] = sinf( store[0] );
1259 store[1] = sinf( store[1] );
1260 store[2] = sinf( store[2] );
1261 store[3] = sinf( store[3] );
1262 }
1263
1264 static void
1265 emit_sin (struct x86_function *func,
1266 unsigned xmm_save,
1267 unsigned xmm_dst)
1268 {
1269 emit_func_call_dst_src1(
1270 func,
1271 xmm_save,
1272 xmm_dst,
1273 xmm_dst,
1274 sin4f );
1275 }
1276
1277 static void
1278 emit_sub(
1279 struct x86_function *func,
1280 unsigned xmm_dst,
1281 unsigned xmm_src )
1282 {
1283 sse_subps(
1284 func,
1285 make_xmm( xmm_dst ),
1286 make_xmm( xmm_src ) );
1287 }
1288
1289 /**
1290 * Register fetch.
1291 */
1292 static void
1293 emit_fetch(
1294 struct x86_function *func,
1295 unsigned xmm,
1296 const struct tgsi_full_src_register *reg,
1297 const unsigned chan_index )
1298 {
1299 unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1300
1301 switch (swizzle) {
1302 case TGSI_SWIZZLE_X:
1303 case TGSI_SWIZZLE_Y:
1304 case TGSI_SWIZZLE_Z:
1305 case TGSI_SWIZZLE_W:
1306 switch (reg->Register.File) {
1307 case TGSI_FILE_CONSTANT:
1308 emit_const(
1309 func,
1310 xmm,
1311 reg->Register.Index,
1312 swizzle,
1313 reg->Register.Indirect,
1314 reg->Indirect.File,
1315 reg->Indirect.Index );
1316 break;
1317
1318 case TGSI_FILE_IMMEDIATE:
1319 emit_immediate(
1320 func,
1321 xmm,
1322 reg->Register.Index,
1323 swizzle );
1324 break;
1325
1326 case TGSI_FILE_SYSTEM_VALUE:
1327 emit_system_value(
1328 func,
1329 xmm,
1330 reg->Register.Index,
1331 swizzle );
1332 break;
1333
1334 case TGSI_FILE_INPUT:
1335 emit_inputf(
1336 func,
1337 xmm,
1338 reg->Register.Index,
1339 swizzle );
1340 break;
1341
1342 case TGSI_FILE_TEMPORARY:
1343 emit_tempf(
1344 func,
1345 xmm,
1346 reg->Register.Index,
1347 swizzle );
1348 break;
1349
1350 default:
1351 assert( 0 );
1352 }
1353 break;
1354
1355 default:
1356 assert( 0 );
1357 }
1358
1359 switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
1360 case TGSI_UTIL_SIGN_CLEAR:
1361 emit_abs( func, xmm );
1362 break;
1363
1364 case TGSI_UTIL_SIGN_SET:
1365 emit_setsign( func, xmm );
1366 break;
1367
1368 case TGSI_UTIL_SIGN_TOGGLE:
1369 emit_neg( func, xmm );
1370 break;
1371
1372 case TGSI_UTIL_SIGN_KEEP:
1373 break;
1374 }
1375 }
1376
1377 #define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
1378 emit_fetch( FUNC, XMM, &(INST).Src[INDEX], CHAN )
1379
1380 /**
1381 * Register store.
1382 */
1383 static void
1384 emit_store(
1385 struct x86_function *func,
1386 unsigned xmm,
1387 const struct tgsi_full_dst_register *reg,
1388 const struct tgsi_full_instruction *inst,
1389 unsigned chan_index )
1390 {
1391 switch( inst->Instruction.Saturate ) {
1392 case TGSI_SAT_NONE:
1393 break;
1394
1395 case TGSI_SAT_ZERO_ONE:
1396 sse_maxps(
1397 func,
1398 make_xmm( xmm ),
1399 get_temp(
1400 TGSI_EXEC_TEMP_00000000_I,
1401 TGSI_EXEC_TEMP_00000000_C ) );
1402
1403 sse_minps(
1404 func,
1405 make_xmm( xmm ),
1406 get_temp(
1407 TGSI_EXEC_TEMP_ONE_I,
1408 TGSI_EXEC_TEMP_ONE_C ) );
1409 break;
1410
1411 case TGSI_SAT_MINUS_PLUS_ONE:
1412 assert( 0 );
1413 break;
1414 }
1415
1416
1417 switch( reg->Register.File ) {
1418 case TGSI_FILE_OUTPUT:
1419 emit_output(
1420 func,
1421 xmm,
1422 reg->Register.Index,
1423 chan_index );
1424 break;
1425
1426 case TGSI_FILE_TEMPORARY:
1427 emit_temps(
1428 func,
1429 xmm,
1430 reg->Register.Index,
1431 chan_index );
1432 break;
1433
1434 case TGSI_FILE_ADDRESS:
1435 emit_addrs(
1436 func,
1437 xmm,
1438 reg->Register.Index,
1439 chan_index );
1440 break;
1441
1442 default:
1443 assert( 0 );
1444 }
1445 }
1446
/* Store xmm register XMM into channel CHAN of dst operand INDEX of INST,
 * applying the instruction's saturate mode (see emit_store).
 */
#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
   emit_store( FUNC, XMM, &(INST).Dst[INDEX], &(INST), CHAN )
1449
1450
1451 static void PIPE_CDECL
1452 fetch_texel( struct tgsi_sampler **sampler,
1453 float *store )
1454 {
1455 #if 0
1456 uint j;
1457
1458 debug_printf("%s sampler: %p (%p) store: %p\n",
1459 __FUNCTION__,
1460 sampler, *sampler,
1461 store );
1462
1463 for (j = 0; j < 4; j++)
1464 debug_printf("sample %d texcoord %f %f %f lodbias %f\n",
1465 j,
1466 store[0+j],
1467 store[4+j],
1468 store[8 + j],
1469 store[12 + j]);
1470 #endif
1471
1472 {
1473 float rgba[NUM_CHANNELS][QUAD_SIZE];
1474 (*sampler)->get_samples(*sampler,
1475 &store[0], /* s */
1476 &store[4], /* t */
1477 &store[8], /* r */
1478 &store[12], /* lodbias */
1479 tgsi_sampler_lod_bias,
1480 rgba); /* results */
1481
1482 memcpy( store, rgba, 16 * sizeof(float));
1483 }
1484
1485 #if 0
1486 for (j = 0; j < 4; j++)
1487 debug_printf("sample %d result %f %f %f %f\n",
1488 j,
1489 store[0+j],
1490 store[4+j],
1491 store[8+j],
1492 store[12+j]);
1493 #endif
1494 }
1495
1496 /**
1497 * High-level instruction translators.
1498 */
1499 static void
1500 emit_tex( struct x86_function *func,
1501 const struct tgsi_full_instruction *inst,
1502 boolean lodbias,
1503 boolean projected)
1504 {
1505 const uint unit = inst->Src[1].Register.Index;
1506 struct x86_reg args[2];
1507 unsigned count;
1508 unsigned i;
1509
1510 assert(inst->Instruction.Texture);
1511 switch (inst->Texture.Texture) {
1512 case TGSI_TEXTURE_1D:
1513 count = 1;
1514 break;
1515 case TGSI_TEXTURE_2D:
1516 case TGSI_TEXTURE_RECT:
1517 case TGSI_TEXTURE_1D_ARRAY:
1518 count = 2;
1519 break;
1520 case TGSI_TEXTURE_SHADOW1D:
1521 case TGSI_TEXTURE_SHADOW2D:
1522 case TGSI_TEXTURE_SHADOWRECT:
1523 case TGSI_TEXTURE_3D:
1524 case TGSI_TEXTURE_CUBE:
1525 case TGSI_TEXTURE_2D_ARRAY:
1526 count = 3;
1527 break;
1528 default:
1529 assert(0);
1530 return;
1531 }
1532
1533 if (lodbias) {
1534 FETCH( func, *inst, 3, 0, 3 );
1535 }
1536 else {
1537 emit_tempf(
1538 func,
1539 3,
1540 TGSI_EXEC_TEMP_00000000_I,
1541 TGSI_EXEC_TEMP_00000000_C );
1542
1543 }
1544
1545 /* store lodbias whether enabled or not -- fetch_texel currently
1546 * respects it always.
1547 */
1548 sse_movaps( func,
1549 get_temp( TEMP_R0, 3 ),
1550 make_xmm( 3 ) );
1551
1552 if (projected) {
1553 FETCH( func, *inst, 3, 0, 3 );
1554
1555 emit_rcp( func, 3, 3 );
1556 }
1557
1558 for (i = 0; i < count; i++) {
1559 FETCH( func, *inst, i, 0, i );
1560
1561 if (projected) {
1562 sse_mulps(
1563 func,
1564 make_xmm( i ),
1565 make_xmm( 3 ) );
1566 }
1567
1568 /* Store in the argument buffer:
1569 */
1570 sse_movaps(
1571 func,
1572 get_temp( TEMP_R0, i ),
1573 make_xmm( i ) );
1574 }
1575
1576 args[0] = get_temp( TEMP_R0, 0 );
1577 args[1] = get_sampler_ptr( unit );
1578
1579 emit_func_call( func,
1580 0,
1581 args,
1582 Elements(args),
1583 fetch_texel );
1584
1585 /* If all four channels are enabled, could use a pointer to
1586 * dst[0].x instead of TEMP_R0 for store?
1587 */
1588 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {
1589
1590 sse_movaps(
1591 func,
1592 make_xmm( 0 ),
1593 get_temp( TEMP_R0, i ) );
1594
1595 STORE( func, *inst, 0, 0, i );
1596 }
1597 }
1598
1599
/**
 * Emit code for the KIL (conditional kill) instruction.
 *
 * For each distinct (post-swizzle) component of the source register, test
 * whether it is less than zero; the per-quad comparison masks are OR'ed
 * together and OR'ed into the execution-kill mask
 * (TGSI_EXEC_TEMP_KILMASK).  EAX/EDX are used as scratch and are
 * saved/restored around the computation.
 */
static void
emit_kil(
   struct x86_function *func,
   const struct tgsi_full_src_register *reg )
{
   unsigned uniquemask;
   unsigned unique_count = 0;
   unsigned chan_index;
   unsigned i;

   /* This mask stores component bits that were already tested. Note that
    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
    * tested.
    */
   uniquemask = 0;

   FOR_EACH_CHANNEL( chan_index ) {
      unsigned swizzle;

      /* unswizzle channel */
      swizzle = tgsi_util_get_full_src_register_swizzle(
         reg,
         chan_index );

      /* check if the component has not been already tested */
      if( !(uniquemask & (1 << swizzle)) ) {
         uniquemask |= 1 << swizzle;

         /* allocate register */
         emit_fetch(
            func,
            unique_count++,
            reg,
            chan_index );
      }
   }

   /* preserve EAX/EDX, which we use as scratch below */
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
   x86_push(
      func,
      x86_make_reg( file_REG32, reg_DX ) );

   /* EAX accumulates the per-quad "component < 0" bit masks */
   for (i = 0 ; i < unique_count; i++ ) {
      struct x86_reg dataXMM = make_xmm(i);

      /* xmm[i] = all-ones lanes where the component is < 0 */
      sse_cmpps(
         func,
         dataXMM,
         get_temp(
            TGSI_EXEC_TEMP_00000000_I,
            TGSI_EXEC_TEMP_00000000_C ),
         cc_LessThan );

      if( i == 0 ) {
         /* first component: EAX = movmskps(xmm[0]) */
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            dataXMM );
      }
      else {
         /* subsequent components: EAX |= movmskps(xmm[i]) via EDX */
         sse_movmskps(
            func,
            x86_make_reg( file_REG32, reg_DX ),
            dataXMM );
         x86_or(
            func,
            x86_make_reg( file_REG32, reg_AX ),
            x86_make_reg( file_REG32, reg_DX ) );
      }
   }

   /* merge the accumulated mask into the kill mask */
   x86_or(
      func,
      get_temp(
         TGSI_EXEC_TEMP_KILMASK_I,
         TGSI_EXEC_TEMP_KILMASK_C ),
      x86_make_reg( file_REG32, reg_AX ) );

   /* restore EAX/EDX (reverse order of the pushes above) */
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_DX ) );
   x86_pop(
      func,
      x86_make_reg( file_REG32, reg_AX ) );
}
1687
1688
/**
 * Emit code for the KILP (predicated kill) instruction.
 *
 * Not implemented: emits nothing.  The caller (emit_instruction) returns
 * 0 for KILP so the interpreter fallback handles it.
 */
static void
emit_kilp(
   struct x86_function *func )
{
   /* XXX todo / fix me */
}
1695
1696
1697 static void
1698 emit_setcc(
1699 struct x86_function *func,
1700 struct tgsi_full_instruction *inst,
1701 enum sse_cc cc )
1702 {
1703 unsigned chan_index;
1704
1705 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1706 FETCH( func, *inst, 0, 0, chan_index );
1707 FETCH( func, *inst, 1, 1, chan_index );
1708 sse_cmpps(
1709 func,
1710 make_xmm( 0 ),
1711 make_xmm( 1 ),
1712 cc );
1713 sse_andps(
1714 func,
1715 make_xmm( 0 ),
1716 get_temp(
1717 TEMP_ONE_I,
1718 TEMP_ONE_C ) );
1719 STORE( func, *inst, 0, 0, chan_index );
1720 }
1721 }
1722
1723 static void
1724 emit_cmp(
1725 struct x86_function *func,
1726 struct tgsi_full_instruction *inst )
1727 {
1728 unsigned chan_index;
1729
1730 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1731 FETCH( func, *inst, 0, 0, chan_index );
1732 FETCH( func, *inst, 1, 1, chan_index );
1733 FETCH( func, *inst, 2, 2, chan_index );
1734 sse_cmpps(
1735 func,
1736 make_xmm( 0 ),
1737 get_temp(
1738 TGSI_EXEC_TEMP_00000000_I,
1739 TGSI_EXEC_TEMP_00000000_C ),
1740 cc_LessThan );
1741 sse_andps(
1742 func,
1743 make_xmm( 1 ),
1744 make_xmm( 0 ) );
1745 sse_andnps(
1746 func,
1747 make_xmm( 0 ),
1748 make_xmm( 2 ) );
1749 sse_orps(
1750 func,
1751 make_xmm( 0 ),
1752 make_xmm( 1 ) );
1753 STORE( func, *inst, 0, 0, chan_index );
1754 }
1755 }
1756
1757
1758 /**
1759 * Check if inst src/dest regs use indirect addressing into temporary,
1760 * input or output register files.
1761 */
1762 static boolean
1763 indirect_reg_reference(const struct tgsi_full_instruction *inst)
1764 {
1765 uint i;
1766 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1767 const struct tgsi_full_src_register *reg = &inst->Src[i];
1768 if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
1769 reg->Register.File == TGSI_FILE_INPUT ||
1770 reg->Register.File == TGSI_FILE_OUTPUT) &&
1771 reg->Register.Indirect)
1772 return TRUE;
1773 }
1774 for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
1775 const struct tgsi_full_dst_register *reg = &inst->Dst[i];
1776 if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
1777 reg->Register.File == TGSI_FILE_INPUT ||
1778 reg->Register.File == TGSI_FILE_OUTPUT) &&
1779 reg->Register.Indirect)
1780 return TRUE;
1781 }
1782 return FALSE;
1783 }
1784
1785
1786 static int
1787 emit_instruction(
1788 struct x86_function *func,
1789 struct tgsi_full_instruction *inst )
1790 {
1791 unsigned chan_index;
1792
1793 /* we can't handle indirect addressing into temp register file yet */
1794 if (indirect_reg_reference(inst))
1795 return FALSE;
1796
1797 switch (inst->Instruction.Opcode) {
1798 case TGSI_OPCODE_ARL:
1799 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1800 FETCH( func, *inst, 0, 0, chan_index );
1801 emit_flr(func, 0, 0);
1802 emit_f2it( func, 0 );
1803 STORE( func, *inst, 0, 0, chan_index );
1804 }
1805 break;
1806
1807 case TGSI_OPCODE_MOV:
1808 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1809 FETCH( func, *inst, 4 + chan_index, 0, chan_index );
1810 }
1811 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1812 STORE( func, *inst, 4 + chan_index, 0, chan_index );
1813 }
1814 break;
1815
1816 case TGSI_OPCODE_LIT:
1817 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1818 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1819 emit_tempf(
1820 func,
1821 0,
1822 TEMP_ONE_I,
1823 TEMP_ONE_C);
1824 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
1825 STORE( func, *inst, 0, 0, CHAN_X );
1826 }
1827 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
1828 STORE( func, *inst, 0, 0, CHAN_W );
1829 }
1830 }
1831 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1832 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1833 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
1834 FETCH( func, *inst, 0, 0, CHAN_X );
1835 sse_maxps(
1836 func,
1837 make_xmm( 0 ),
1838 get_temp(
1839 TGSI_EXEC_TEMP_00000000_I,
1840 TGSI_EXEC_TEMP_00000000_C ) );
1841 STORE( func, *inst, 0, 0, CHAN_Y );
1842 }
1843 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
1844 /* XMM[1] = SrcReg[0].yyyy */
1845 FETCH( func, *inst, 1, 0, CHAN_Y );
1846 /* XMM[1] = max(XMM[1], 0) */
1847 sse_maxps(
1848 func,
1849 make_xmm( 1 ),
1850 get_temp(
1851 TGSI_EXEC_TEMP_00000000_I,
1852 TGSI_EXEC_TEMP_00000000_C ) );
1853 /* XMM[2] = SrcReg[0].wwww */
1854 FETCH( func, *inst, 2, 0, CHAN_W );
1855 /* XMM[2] = min(XMM[2], 128.0) */
1856 sse_minps(
1857 func,
1858 make_xmm( 2 ),
1859 get_temp(
1860 TGSI_EXEC_TEMP_128_I,
1861 TGSI_EXEC_TEMP_128_C ) );
1862 /* XMM[2] = max(XMM[2], -128.0) */
1863 sse_maxps(
1864 func,
1865 make_xmm( 2 ),
1866 get_temp(
1867 TGSI_EXEC_TEMP_MINUS_128_I,
1868 TGSI_EXEC_TEMP_MINUS_128_C ) );
1869 emit_pow( func, 3, 1, 1, 2 );
1870 FETCH( func, *inst, 0, 0, CHAN_X );
1871 sse_xorps(
1872 func,
1873 make_xmm( 2 ),
1874 make_xmm( 2 ) );
1875 sse_cmpps(
1876 func,
1877 make_xmm( 2 ),
1878 make_xmm( 0 ),
1879 cc_LessThan );
1880 sse_andps(
1881 func,
1882 make_xmm( 2 ),
1883 make_xmm( 1 ) );
1884 STORE( func, *inst, 2, 0, CHAN_Z );
1885 }
1886 }
1887 break;
1888
1889 case TGSI_OPCODE_RCP:
1890 FETCH( func, *inst, 0, 0, CHAN_X );
1891 emit_rcp( func, 0, 0 );
1892 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1893 STORE( func, *inst, 0, 0, chan_index );
1894 }
1895 break;
1896
1897 case TGSI_OPCODE_RSQ:
1898 FETCH( func, *inst, 0, 0, CHAN_X );
1899 emit_abs( func, 0 );
1900 emit_rsqrt( func, 1, 0 );
1901 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1902 STORE( func, *inst, 1, 0, chan_index );
1903 }
1904 break;
1905
1906 case TGSI_OPCODE_EXP:
1907 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1908 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1909 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1910 FETCH( func, *inst, 0, 0, CHAN_X );
1911 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1912 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1913 emit_MOV( func, 1, 0 );
1914 emit_flr( func, 2, 1 );
1915 /* dst.x = ex2(floor(src.x)) */
1916 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1917 emit_MOV( func, 2, 1 );
1918 emit_ex2( func, 3, 2 );
1919 STORE( func, *inst, 2, 0, CHAN_X );
1920 }
1921 /* dst.y = src.x - floor(src.x) */
1922 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1923 emit_MOV( func, 2, 0 );
1924 emit_sub( func, 2, 1 );
1925 STORE( func, *inst, 2, 0, CHAN_Y );
1926 }
1927 }
1928 /* dst.z = ex2(src.x) */
1929 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1930 emit_ex2( func, 3, 0 );
1931 STORE( func, *inst, 0, 0, CHAN_Z );
1932 }
1933 }
1934 /* dst.w = 1.0 */
1935 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1936 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1937 STORE( func, *inst, 0, 0, CHAN_W );
1938 }
1939 break;
1940
1941 case TGSI_OPCODE_LOG:
1942 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1943 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
1944 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1945 FETCH( func, *inst, 0, 0, CHAN_X );
1946 emit_abs( func, 0 );
1947 emit_MOV( func, 1, 0 );
1948 emit_lg2( func, 2, 1 );
1949 /* dst.z = lg2(abs(src.x)) */
1950 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1951 STORE( func, *inst, 1, 0, CHAN_Z );
1952 }
1953 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
1954 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1955 emit_flr( func, 2, 1 );
1956 /* dst.x = floor(lg2(abs(src.x))) */
1957 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
1958 STORE( func, *inst, 1, 0, CHAN_X );
1959 }
1960 /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
1961 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1962 emit_ex2( func, 2, 1 );
1963 emit_rcp( func, 1, 1 );
1964 emit_mul( func, 0, 1 );
1965 STORE( func, *inst, 0, 0, CHAN_Y );
1966 }
1967 }
1968 }
1969 /* dst.w = 1.0 */
1970 if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
1971 emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
1972 STORE( func, *inst, 0, 0, CHAN_W );
1973 }
1974 break;
1975
1976 case TGSI_OPCODE_MUL:
1977 /* do all fetches and adds, storing results in temp regs */
1978 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1979 int r = chan_index + 1;
1980 FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
1981 FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
1982 emit_mul( func, r, 0 ); /* xmm[r] = xmm[r] * xmm[0] */
1983 }
1984 /* do all stores of the temp regs */
1985 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1986 int r = chan_index + 1;
1987 STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
1988 }
1989 break;
1990
1991 case TGSI_OPCODE_ADD:
1992 /* do all fetches and adds, storing results in temp regs */
1993 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
1994 int r = chan_index + 1;
1995 FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
1996 FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
1997 emit_add( func, r, 0 ); /* xmm[r] = xmm[r] + xmm[0] */
1998 }
1999 /* do all stores of the temp regs */
2000 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2001 int r = chan_index + 1;
2002 STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
2003 }
2004 break;
2005
2006 case TGSI_OPCODE_DP3:
2007 FETCH( func, *inst, 0, 0, CHAN_X );
2008 FETCH( func, *inst, 1, 1, CHAN_X );
2009 emit_mul( func, 0, 1 );
2010 FETCH( func, *inst, 1, 0, CHAN_Y );
2011 FETCH( func, *inst, 2, 1, CHAN_Y );
2012 emit_mul( func, 1, 2 );
2013 emit_add( func, 0, 1 );
2014 FETCH( func, *inst, 1, 0, CHAN_Z );
2015 FETCH( func, *inst, 2, 1, CHAN_Z );
2016 emit_mul( func, 1, 2 );
2017 emit_add( func, 0, 1 );
2018 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2019 STORE( func, *inst, 0, 0, chan_index );
2020 }
2021 break;
2022
2023 case TGSI_OPCODE_DP4:
2024 FETCH( func, *inst, 0, 0, CHAN_X );
2025 FETCH( func, *inst, 1, 1, CHAN_X );
2026 emit_mul( func, 0, 1 );
2027 FETCH( func, *inst, 1, 0, CHAN_Y );
2028 FETCH( func, *inst, 2, 1, CHAN_Y );
2029 emit_mul( func, 1, 2 );
2030 emit_add( func, 0, 1 );
2031 FETCH( func, *inst, 1, 0, CHAN_Z );
2032 FETCH( func, *inst, 2, 1, CHAN_Z );
2033 emit_mul(func, 1, 2 );
2034 emit_add(func, 0, 1 );
2035 FETCH( func, *inst, 1, 0, CHAN_W );
2036 FETCH( func, *inst, 2, 1, CHAN_W );
2037 emit_mul( func, 1, 2 );
2038 emit_add( func, 0, 1 );
2039 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2040 STORE( func, *inst, 0, 0, chan_index );
2041 }
2042 break;
2043
2044 case TGSI_OPCODE_DST:
2045 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2046 emit_tempf(
2047 func,
2048 0,
2049 TEMP_ONE_I,
2050 TEMP_ONE_C );
2051 STORE( func, *inst, 0, 0, CHAN_X );
2052 }
2053 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2054 FETCH( func, *inst, 0, 0, CHAN_Y );
2055 FETCH( func, *inst, 1, 1, CHAN_Y );
2056 emit_mul( func, 0, 1 );
2057 STORE( func, *inst, 0, 0, CHAN_Y );
2058 }
2059 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2060 FETCH( func, *inst, 0, 0, CHAN_Z );
2061 STORE( func, *inst, 0, 0, CHAN_Z );
2062 }
2063 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2064 FETCH( func, *inst, 0, 1, CHAN_W );
2065 STORE( func, *inst, 0, 0, CHAN_W );
2066 }
2067 break;
2068
2069 case TGSI_OPCODE_MIN:
2070 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2071 FETCH( func, *inst, 0, 0, chan_index );
2072 FETCH( func, *inst, 1, 1, chan_index );
2073 sse_minps(
2074 func,
2075 make_xmm( 0 ),
2076 make_xmm( 1 ) );
2077 STORE( func, *inst, 0, 0, chan_index );
2078 }
2079 break;
2080
2081 case TGSI_OPCODE_MAX:
2082 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2083 FETCH( func, *inst, 0, 0, chan_index );
2084 FETCH( func, *inst, 1, 1, chan_index );
2085 sse_maxps(
2086 func,
2087 make_xmm( 0 ),
2088 make_xmm( 1 ) );
2089 STORE( func, *inst, 0, 0, chan_index );
2090 }
2091 break;
2092
2093 case TGSI_OPCODE_SLT:
2094 emit_setcc( func, inst, cc_LessThan );
2095 break;
2096
2097 case TGSI_OPCODE_SGE:
2098 emit_setcc( func, inst, cc_NotLessThan );
2099 break;
2100
2101 case TGSI_OPCODE_MAD:
2102 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2103 FETCH( func, *inst, 0, 0, chan_index );
2104 FETCH( func, *inst, 1, 1, chan_index );
2105 FETCH( func, *inst, 2, 2, chan_index );
2106 emit_mul( func, 0, 1 );
2107 emit_add( func, 0, 2 );
2108 STORE( func, *inst, 0, 0, chan_index );
2109 }
2110 break;
2111
2112 case TGSI_OPCODE_SUB:
2113 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2114 FETCH( func, *inst, 0, 0, chan_index );
2115 FETCH( func, *inst, 1, 1, chan_index );
2116 emit_sub( func, 0, 1 );
2117 STORE( func, *inst, 0, 0, chan_index );
2118 }
2119 break;
2120
2121 case TGSI_OPCODE_LRP:
2122 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2123 FETCH( func, *inst, 0, 0, chan_index );
2124 FETCH( func, *inst, 1, 1, chan_index );
2125 FETCH( func, *inst, 2, 2, chan_index );
2126 emit_sub( func, 1, 2 );
2127 emit_mul( func, 0, 1 );
2128 emit_add( func, 0, 2 );
2129 STORE( func, *inst, 0, 0, chan_index );
2130 }
2131 break;
2132
2133 case TGSI_OPCODE_CND:
2134 return 0;
2135 break;
2136
2137 case TGSI_OPCODE_DP2A:
2138 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2139 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2140 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2141 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2142 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2143 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2144 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2145 FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */
2146 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2147 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2148 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2149 }
2150 break;
2151
2152 case TGSI_OPCODE_FRC:
2153 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2154 FETCH( func, *inst, 0, 0, chan_index );
2155 emit_frc( func, 0, 0 );
2156 STORE( func, *inst, 0, 0, chan_index );
2157 }
2158 break;
2159
2160 case TGSI_OPCODE_CLAMP:
2161 return 0;
2162 break;
2163
2164 case TGSI_OPCODE_FLR:
2165 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2166 FETCH( func, *inst, 0, 0, chan_index );
2167 emit_flr( func, 0, 0 );
2168 STORE( func, *inst, 0, 0, chan_index );
2169 }
2170 break;
2171
2172 case TGSI_OPCODE_ROUND:
2173 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2174 FETCH( func, *inst, 0, 0, chan_index );
2175 emit_rnd( func, 0, 0 );
2176 STORE( func, *inst, 0, 0, chan_index );
2177 }
2178 break;
2179
2180 case TGSI_OPCODE_EX2:
2181 FETCH( func, *inst, 0, 0, CHAN_X );
2182 emit_ex2( func, 0, 0 );
2183 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2184 STORE( func, *inst, 0, 0, chan_index );
2185 }
2186 break;
2187
2188 case TGSI_OPCODE_LG2:
2189 FETCH( func, *inst, 0, 0, CHAN_X );
2190 emit_lg2( func, 0, 0 );
2191 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2192 STORE( func, *inst, 0, 0, chan_index );
2193 }
2194 break;
2195
2196 case TGSI_OPCODE_POW:
2197 FETCH( func, *inst, 0, 0, CHAN_X );
2198 FETCH( func, *inst, 1, 1, CHAN_X );
2199 emit_pow( func, 0, 0, 0, 1 );
2200 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2201 STORE( func, *inst, 0, 0, chan_index );
2202 }
2203 break;
2204
2205 case TGSI_OPCODE_XPD:
2206 /* Note: we do all stores after all operands have been fetched
2207 * to avoid src/dst register aliasing issues for an instruction
2208 * such as: XPD TEMP[2].xyz, TEMP[0], TEMP[2];
2209 */
2210 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2211 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2212 FETCH( func, *inst, 1, 1, CHAN_Z ); /* xmm[1] = src[1].z */
2213 FETCH( func, *inst, 3, 0, CHAN_Z ); /* xmm[3] = src[0].z */
2214 }
2215 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
2216 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2217 FETCH( func, *inst, 0, 0, CHAN_Y ); /* xmm[0] = src[0].y */
2218 FETCH( func, *inst, 4, 1, CHAN_Y ); /* xmm[4] = src[1].y */
2219 }
2220 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2221 emit_MOV( func, 7, 0 ); /* xmm[7] = xmm[0] */
2222 emit_mul( func, 7, 1 ); /* xmm[7] = xmm[2] * xmm[1] */
2223 emit_MOV( func, 5, 3 ); /* xmm[5] = xmm[3] */
2224 emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2225 emit_sub( func, 7, 5 ); /* xmm[7] = xmm[2] - xmm[5] */
2226 /* store xmm[7] in dst.x below */
2227 }
2228 if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
2229 IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2230 FETCH( func, *inst, 2, 1, CHAN_X ); /* xmm[2] = src[1].x */
2231 FETCH( func, *inst, 5, 0, CHAN_X ); /* xmm[5] = src[0].x */
2232 }
2233 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2234 emit_mul( func, 3, 2 ); /* xmm[3] = xmm[3] * xmm[2] */
2235 emit_mul( func, 1, 5 ); /* xmm[1] = xmm[1] * xmm[5] */
2236 emit_sub( func, 3, 1 ); /* xmm[3] = xmm[3] - xmm[1] */
2237 /* store xmm[3] in dst.y below */
2238 }
2239 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2240 emit_mul( func, 5, 4 ); /* xmm[5] = xmm[5] * xmm[4] */
2241 emit_mul( func, 0, 2 ); /* xmm[0] = xmm[0] * xmm[2] */
2242 emit_sub( func, 5, 0 ); /* xmm[5] = xmm[5] - xmm[0] */
2243 STORE( func, *inst, 5, 0, CHAN_Z ); /* dst.z = xmm[5] */
2244 }
2245 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2246 STORE( func, *inst, 7, 0, CHAN_X ); /* dst.x = xmm[7] */
2247 }
2248 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2249 STORE( func, *inst, 3, 0, CHAN_Y ); /* dst.y = xmm[3] */
2250 }
2251 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2252 emit_tempf(
2253 func,
2254 0,
2255 TEMP_ONE_I,
2256 TEMP_ONE_C );
2257 STORE( func, *inst, 0, 0, CHAN_W );
2258 }
2259 break;
2260
2261 case TGSI_OPCODE_ABS:
2262 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2263 FETCH( func, *inst, 0, 0, chan_index );
2264 emit_abs( func, 0) ;
2265
2266 STORE( func, *inst, 0, 0, chan_index );
2267 }
2268 break;
2269
2270 case TGSI_OPCODE_RCC:
2271 return 0;
2272 break;
2273
2274 case TGSI_OPCODE_DPH:
2275 FETCH( func, *inst, 0, 0, CHAN_X );
2276 FETCH( func, *inst, 1, 1, CHAN_X );
2277 emit_mul( func, 0, 1 );
2278 FETCH( func, *inst, 1, 0, CHAN_Y );
2279 FETCH( func, *inst, 2, 1, CHAN_Y );
2280 emit_mul( func, 1, 2 );
2281 emit_add( func, 0, 1 );
2282 FETCH( func, *inst, 1, 0, CHAN_Z );
2283 FETCH( func, *inst, 2, 1, CHAN_Z );
2284 emit_mul( func, 1, 2 );
2285 emit_add( func, 0, 1 );
2286 FETCH( func, *inst, 1, 1, CHAN_W );
2287 emit_add( func, 0, 1 );
2288 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2289 STORE( func, *inst, 0, 0, chan_index );
2290 }
2291 break;
2292
2293 case TGSI_OPCODE_COS:
2294 FETCH( func, *inst, 0, 0, CHAN_X );
2295 emit_cos( func, 0, 0 );
2296 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2297 STORE( func, *inst, 0, 0, chan_index );
2298 }
2299 break;
2300
2301 case TGSI_OPCODE_DDX:
2302 return 0;
2303 break;
2304
2305 case TGSI_OPCODE_DDY:
2306 return 0;
2307 break;
2308
2309 case TGSI_OPCODE_KILP:
2310 /* predicated kill */
2311 emit_kilp( func );
2312 return 0; /* XXX fix me */
2313 break;
2314
2315 case TGSI_OPCODE_KIL:
2316 /* conditional kill */
2317 emit_kil( func, &inst->Src[0] );
2318 break;
2319
2320 case TGSI_OPCODE_PK2H:
2321 return 0;
2322 break;
2323
2324 case TGSI_OPCODE_PK2US:
2325 return 0;
2326 break;
2327
2328 case TGSI_OPCODE_PK4B:
2329 return 0;
2330 break;
2331
2332 case TGSI_OPCODE_PK4UB:
2333 return 0;
2334 break;
2335
2336 case TGSI_OPCODE_RFL:
2337 return 0;
2338 break;
2339
2340 case TGSI_OPCODE_SEQ:
2341 emit_setcc( func, inst, cc_Equal );
2342 break;
2343
2344 case TGSI_OPCODE_SFL:
2345 return 0;
2346 break;
2347
2348 case TGSI_OPCODE_SGT:
2349 emit_setcc( func, inst, cc_NotLessThanEqual );
2350 break;
2351
2352 case TGSI_OPCODE_SIN:
2353 FETCH( func, *inst, 0, 0, CHAN_X );
2354 emit_sin( func, 0, 0 );
2355 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2356 STORE( func, *inst, 0, 0, chan_index );
2357 }
2358 break;
2359
2360 case TGSI_OPCODE_SLE:
2361 emit_setcc( func, inst, cc_LessThanEqual );
2362 break;
2363
2364 case TGSI_OPCODE_SNE:
2365 emit_setcc( func, inst, cc_NotEqual );
2366 break;
2367
2368 case TGSI_OPCODE_STR:
2369 return 0;
2370 break;
2371
2372 case TGSI_OPCODE_TEX:
2373 emit_tex( func, inst, FALSE, FALSE );
2374 break;
2375
2376 case TGSI_OPCODE_TXD:
2377 return 0;
2378 break;
2379
2380 case TGSI_OPCODE_UP2H:
2381 return 0;
2382 break;
2383
2384 case TGSI_OPCODE_UP2US:
2385 return 0;
2386 break;
2387
2388 case TGSI_OPCODE_UP4B:
2389 return 0;
2390 break;
2391
2392 case TGSI_OPCODE_UP4UB:
2393 return 0;
2394 break;
2395
2396 case TGSI_OPCODE_X2D:
2397 return 0;
2398 break;
2399
2400 case TGSI_OPCODE_ARA:
2401 return 0;
2402 break;
2403
2404 case TGSI_OPCODE_ARR:
2405 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2406 FETCH( func, *inst, 0, 0, chan_index );
2407 emit_rnd( func, 0, 0 );
2408 emit_f2it( func, 0 );
2409 STORE( func, *inst, 0, 0, chan_index );
2410 }
2411 break;
2412
2413 case TGSI_OPCODE_BRA:
2414 return 0;
2415 break;
2416
2417 case TGSI_OPCODE_CAL:
2418 return 0;
2419 break;
2420
2421 case TGSI_OPCODE_RET:
2422 emit_ret( func );
2423 break;
2424
2425 case TGSI_OPCODE_END:
2426 break;
2427
2428 case TGSI_OPCODE_SSG:
2429 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2430 FETCH( func, *inst, 0, 0, chan_index );
2431 emit_sgn( func, 0, 0 );
2432 STORE( func, *inst, 0, 0, chan_index );
2433 }
2434 break;
2435
2436 case TGSI_OPCODE_CMP:
2437 emit_cmp (func, inst);
2438 break;
2439
2440 case TGSI_OPCODE_SCS:
2441 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
2442 FETCH( func, *inst, 0, 0, CHAN_X );
2443 emit_cos( func, 0, 0 );
2444 STORE( func, *inst, 0, 0, CHAN_X );
2445 }
2446 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
2447 FETCH( func, *inst, 0, 0, CHAN_X );
2448 emit_sin( func, 0, 0 );
2449 STORE( func, *inst, 0, 0, CHAN_Y );
2450 }
2451 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
2452 emit_tempf(
2453 func,
2454 0,
2455 TGSI_EXEC_TEMP_00000000_I,
2456 TGSI_EXEC_TEMP_00000000_C );
2457 STORE( func, *inst, 0, 0, CHAN_Z );
2458 }
2459 IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
2460 emit_tempf(
2461 func,
2462 0,
2463 TEMP_ONE_I,
2464 TEMP_ONE_C );
2465 STORE( func, *inst, 0, 0, CHAN_W );
2466 }
2467 break;
2468
2469 case TGSI_OPCODE_TXB:
2470 emit_tex( func, inst, TRUE, FALSE );
2471 break;
2472
2473 case TGSI_OPCODE_NRM:
2474 /* fall-through */
2475 case TGSI_OPCODE_NRM4:
2476 /* 3 or 4-component normalization */
2477 {
2478 uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
2479
2480 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
2481 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2482 IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
2483 (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
2484
2485 /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
2486
2487 /* xmm4 = src.x */
2488 /* xmm0 = src.x * src.x */
2489 FETCH(func, *inst, 0, 0, CHAN_X);
2490 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2491 emit_MOV(func, 4, 0);
2492 }
2493 emit_mul(func, 0, 0);
2494
2495 /* xmm5 = src.y */
2496 /* xmm0 = xmm0 + src.y * src.y */
2497 FETCH(func, *inst, 1, 0, CHAN_Y);
2498 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2499 emit_MOV(func, 5, 1);
2500 }
2501 emit_mul(func, 1, 1);
2502 emit_add(func, 0, 1);
2503
2504 /* xmm6 = src.z */
2505 /* xmm0 = xmm0 + src.z * src.z */
2506 FETCH(func, *inst, 1, 0, CHAN_Z);
2507 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2508 emit_MOV(func, 6, 1);
2509 }
2510 emit_mul(func, 1, 1);
2511 emit_add(func, 0, 1);
2512
2513 if (dims == 4) {
2514 /* xmm7 = src.w */
2515 /* xmm0 = xmm0 + src.w * src.w */
2516 FETCH(func, *inst, 1, 0, CHAN_W);
2517 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
2518 emit_MOV(func, 7, 1);
2519 }
2520 emit_mul(func, 1, 1);
2521 emit_add(func, 0, 1);
2522 }
2523
2524 /* xmm1 = 1 / sqrt(xmm0) */
2525 emit_rsqrt(func, 1, 0);
2526
2527 /* dst.x = xmm1 * src.x */
2528 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
2529 emit_mul(func, 4, 1);
2530 STORE(func, *inst, 4, 0, CHAN_X);
2531 }
2532
2533 /* dst.y = xmm1 * src.y */
2534 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2535 emit_mul(func, 5, 1);
2536 STORE(func, *inst, 5, 0, CHAN_Y);
2537 }
2538
2539 /* dst.z = xmm1 * src.z */
2540 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2541 emit_mul(func, 6, 1);
2542 STORE(func, *inst, 6, 0, CHAN_Z);
2543 }
2544
2545 /* dst.w = xmm1 * src.w */
2546 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
2547 emit_mul(func, 7, 1);
2548 STORE(func, *inst, 7, 0, CHAN_W);
2549 }
2550 }
2551
2552 /* dst0.w = 1.0 */
2553 if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
2554 emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
2555 STORE(func, *inst, 0, 0, CHAN_W);
2556 }
2557 }
2558 break;
2559
2560 case TGSI_OPCODE_DIV:
2561 return 0;
2562 break;
2563
2564 case TGSI_OPCODE_DP2:
2565 FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */
2566 FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */
2567 emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */
2568 FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */
2569 FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */
2570 emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */
2571 emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */
2572 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2573 STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */
2574 }
2575 break;
2576
2577 case TGSI_OPCODE_TXL:
2578 return 0;
2579 break;
2580
2581 case TGSI_OPCODE_TXP:
2582 emit_tex( func, inst, FALSE, TRUE );
2583 break;
2584
2585 case TGSI_OPCODE_BRK:
2586 return 0;
2587 break;
2588
2589 case TGSI_OPCODE_IF:
2590 return 0;
2591 break;
2592
2593 case TGSI_OPCODE_ELSE:
2594 return 0;
2595 break;
2596
2597 case TGSI_OPCODE_ENDIF:
2598 return 0;
2599 break;
2600
2601 case TGSI_OPCODE_PUSHA:
2602 return 0;
2603 break;
2604
2605 case TGSI_OPCODE_POPA:
2606 return 0;
2607 break;
2608
2609 case TGSI_OPCODE_CEIL:
2610 return 0;
2611 break;
2612
2613 case TGSI_OPCODE_I2F:
2614 return 0;
2615 break;
2616
2617 case TGSI_OPCODE_NOT:
2618 return 0;
2619 break;
2620
2621 case TGSI_OPCODE_TRUNC:
2622 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
2623 FETCH( func, *inst, 0, 0, chan_index );
2624 emit_f2it( func, 0 );
2625 emit_i2f( func, 0 );
2626 STORE( func, *inst, 0, 0, chan_index );
2627 }
2628 break;
2629
2630 case TGSI_OPCODE_SHL:
2631 return 0;
2632 break;
2633
2634 case TGSI_OPCODE_ISHR:
2635 return 0;
2636 break;
2637
2638 case TGSI_OPCODE_AND:
2639 return 0;
2640 break;
2641
2642 case TGSI_OPCODE_OR:
2643 return 0;
2644 break;
2645
2646 case TGSI_OPCODE_MOD:
2647 return 0;
2648 break;
2649
2650 case TGSI_OPCODE_XOR:
2651 return 0;
2652 break;
2653
2654 case TGSI_OPCODE_SAD:
2655 return 0;
2656 break;
2657
2658 case TGSI_OPCODE_TXF:
2659 return 0;
2660 break;
2661
2662 case TGSI_OPCODE_TXQ:
2663 return 0;
2664 break;
2665
2666 case TGSI_OPCODE_CONT:
2667 return 0;
2668 break;
2669
2670 case TGSI_OPCODE_EMIT:
2671 return 0;
2672 break;
2673
2674 case TGSI_OPCODE_ENDPRIM:
2675 return 0;
2676 break;
2677
2678 default:
2679 return 0;
2680 }
2681
2682 return 1;
2683 }
2684
2685 static void
2686 emit_declaration(
2687 struct x86_function *func,
2688 struct tgsi_full_declaration *decl )
2689 {
2690 if( decl->Declaration.File == TGSI_FILE_INPUT ) {
2691 unsigned first, last, mask;
2692 unsigned i, j;
2693
2694 first = decl->Range.First;
2695 last = decl->Range.Last;
2696 mask = decl->Declaration.UsageMask;
2697
2698 for( i = first; i <= last; i++ ) {
2699 for( j = 0; j < NUM_CHANNELS; j++ ) {
2700 if( mask & (1 << j) ) {
2701 switch( decl->Declaration.Interpolate ) {
2702 case TGSI_INTERPOLATE_CONSTANT:
2703 emit_coef_a0( func, 0, i, j );
2704 emit_inputs( func, 0, i, j );
2705 break;
2706
2707 case TGSI_INTERPOLATE_LINEAR:
2708 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2709 emit_coef_dadx( func, 1, i, j );
2710 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2711 emit_coef_dady( func, 3, i, j );
2712 emit_mul( func, 0, 1 ); /* x * dadx */
2713 emit_coef_a0( func, 4, i, j );
2714 emit_mul( func, 2, 3 ); /* y * dady */
2715 emit_add( func, 0, 4 ); /* x * dadx + a0 */
2716 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2717 emit_inputs( func, 0, i, j );
2718 break;
2719
2720 case TGSI_INTERPOLATE_PERSPECTIVE:
2721 emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
2722 emit_coef_dadx( func, 1, i, j );
2723 emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
2724 emit_coef_dady( func, 3, i, j );
2725 emit_mul( func, 0, 1 ); /* x * dadx */
2726 emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
2727 emit_coef_a0( func, 5, i, j );
2728 emit_rcp( func, 4, 4 ); /* 1.0 / w */
2729 emit_mul( func, 2, 3 ); /* y * dady */
2730 emit_add( func, 0, 5 ); /* x * dadx + a0 */
2731 emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
2732 emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
2733 emit_inputs( func, 0, i, j );
2734 break;
2735
2736 default:
2737 assert( 0 );
2738 break;
2739 }
2740 }
2741 }
2742 }
2743 }
2744 }
2745
/**
 * Emit x86/SSE code that transposes vertex data from AOS layout
 * (x,y,z,w per vertex, arbitrary byte stride between vertices) into
 * the SOA layout of machine->Inputs (x[4], y[4], z[4], w[4] per
 * attribute), processing 4 vertices per attribute.
 *
 * The generated code reads its pointers/counts from the JIT'd
 * function's own arguments:
 * \param arg_aos      arg index of the AOS input pointer (first vertex)
 * \param arg_machine  arg index of the struct tgsi_exec_machine pointer
 * \param arg_num      arg index of the number of input attributes
 * \param arg_stride   arg index of the byte stride between vertices
 */
static void aos_to_soa( struct x86_function *func,
                        uint arg_aos,
                        uint arg_machine,
                        uint arg_num,
                        uint arg_stride )
{
   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
   int loop_top, loop_exit_fixup;

   /* Save EBX */
   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );

   x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) );
   x86_mov( func, soa_input, x86_fn_arg( func, arg_machine ) );
   /* soa_input = &machine->Inputs */
   x86_lea( func, soa_input,
            x86_make_disp( soa_input,
                           Offset(struct tgsi_exec_machine, Inputs) ) );
   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
   x86_mov( func, stride, x86_fn_arg( func, arg_stride ) );

   /* while (num_inputs != 0) */
   loop_top = x86_get_label( func );
   x86_cmp_imm( func, num_inputs, 0 );
   loop_exit_fixup = x86_jcc_forward( func, cc_E );

   {
      /* Gather 4 vertices: xmm0/xmm3 take v0 (low halves) and v1 (high
       * halves), xmm1/xmm4 take v2 and v3.  aos_input is saved and
       * restored around the stride walk so it can be advanced by a
       * fixed 16 bytes below.
       */
      x86_push( func, aos_input );
      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );  /* xmm0.lo = v0.xy */
      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );  /* xmm3.lo = v0.zw */
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );  /* xmm0.hi = v1.xy */
      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );  /* xmm3.hi = v1.zw */
      x86_add( func, aos_input, stride );
      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );  /* xmm1.lo = v2.xy */
      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );  /* xmm4.lo = v2.zw */
      x86_add( func, aos_input, stride );
      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );  /* xmm1.hi = v3.xy */
      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );  /* xmm4.hi = v3.zw */
      x86_pop( func, aos_input );

      /* Transpose into channel vectors: shuffle 0x88 picks the even
       * lanes (x or z), 0xdd picks the odd lanes (y or w).
       */
      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );  /* xmm0 = x0 x1 x2 x3 */
      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );  /* xmm2 = y0 y1 y2 y3 */
      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );  /* xmm3 = z0 z1 z2 z3 */
      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );  /* xmm5 = w0 w1 w2 w3 */

      /* Store one 4-wide vector per channel into machine->Inputs */
      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );

      /* Advance to next input */
      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
   }
   /* --num_inputs */
   x86_dec( func, num_inputs );
   x86_jmp( func, loop_top );
   x86_fixup_fwd_jump( func, loop_exit_fixup );

   /* Restore EBX */
   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
}
2813
2814 static void soa_to_aos( struct x86_function *func,
2815 uint arg_aos,
2816 uint arg_machine,
2817 uint arg_num,
2818 uint arg_stride )
2819 {
2820 struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
2821 struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
2822 struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
2823 struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
2824 int inner_loop;
2825
2826 /* Save EBX */
2827 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2828
2829 x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
2830 x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
2831 x86_lea( func, soa_output,
2832 x86_make_disp( soa_output,
2833 Offset(struct tgsi_exec_machine, Outputs) ) );
2834 x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
2835
2836 /* do */
2837 inner_loop = x86_get_label( func );
2838 {
2839 sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
2840 sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
2841 sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
2842 sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
2843
2844 sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
2845 sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
2846 sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
2847 sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
2848 sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
2849 sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
2850
2851 x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
2852 x86_push( func, aos_output );
2853 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2854 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2855 x86_add( func, aos_output, temp );
2856 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
2857 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
2858 x86_add( func, aos_output, temp );
2859 sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2860 sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2861 x86_add( func, aos_output, temp );
2862 sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
2863 sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
2864 x86_pop( func, aos_output );
2865
2866 /* Advance to next output */
2867 x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
2868 x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
2869 }
2870 /* while --num_outputs */
2871 x86_dec( func, num_outputs );
2872 x86_jcc( func, cc_NE, inner_loop );
2873
2874 /* Restore EBX */
2875 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
2876 }
2877
2878
2879 /**
2880 * Check if the instructions dst register is the same as any src
2881 * register and warn if there's a posible SOA dependency.
2882 */
2883 static boolean
2884 check_soa_dependencies(const struct tgsi_full_instruction *inst)
2885 {
2886 uint opcode = inst->Instruction.Opcode;
2887
2888 /* XXX: we only handle src/dst aliasing in a few opcodes currently.
2889 * Need to use an additional temporay to hold the result in the
2890 * cases where the code is too opaque to fix.
2891 */
2892
2893 switch (opcode) {
2894 case TGSI_OPCODE_ADD:
2895 case TGSI_OPCODE_MOV:
2896 case TGSI_OPCODE_MUL:
2897 case TGSI_OPCODE_RCP:
2898 case TGSI_OPCODE_RSQ:
2899 case TGSI_OPCODE_EXP:
2900 case TGSI_OPCODE_LOG:
2901 case TGSI_OPCODE_DP3:
2902 case TGSI_OPCODE_DP4:
2903 case TGSI_OPCODE_DP2A:
2904 case TGSI_OPCODE_EX2:
2905 case TGSI_OPCODE_LG2:
2906 case TGSI_OPCODE_POW:
2907 case TGSI_OPCODE_XPD:
2908 case TGSI_OPCODE_DPH:
2909 case TGSI_OPCODE_COS:
2910 case TGSI_OPCODE_SIN:
2911 case TGSI_OPCODE_TEX:
2912 case TGSI_OPCODE_TXB:
2913 case TGSI_OPCODE_TXP:
2914 case TGSI_OPCODE_NRM:
2915 case TGSI_OPCODE_NRM4:
2916 case TGSI_OPCODE_DP2:
2917 /* OK - these opcodes correctly handle SOA dependencies */
2918 return TRUE;
2919 default:
2920 if (!tgsi_check_soa_dependencies(inst))
2921 return TRUE;
2922
2923 debug_printf("Warning: src/dst aliasing in instruction"
2924 " is not handled:\n");
2925 debug_printf("Warning: ");
2926 tgsi_dump_instruction(inst, 1);
2927
2928 return FALSE;
2929 }
2930 }
2931
2932
2933 /**
2934 * Translate a TGSI vertex/fragment shader to SSE2 code.
2935 * Slightly different things are done for vertex vs. fragment shaders.
2936 *
2937 * \param tokens the TGSI input shader
2938 * \param func the output SSE code/function
2939 * \param immediates buffer to place immediates, later passed to SSE func
2940 * \param return 1 for success, 0 if translation failed
2941 */
2942 unsigned
2943 tgsi_emit_sse2(
2944 const struct tgsi_token *tokens,
2945 struct x86_function *func,
2946 float (*immediates)[4],
2947 boolean do_swizzles )
2948 {
2949 struct tgsi_parse_context parse;
2950 unsigned ok = 1;
2951 uint num_immediates = 0;
2952
2953 util_init_math();
2954
2955 func->csr = func->store;
2956
2957 tgsi_parse_init( &parse, tokens );
2958
2959 /* Can't just use EDI, EBX without save/restoring them:
2960 */
2961 x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
2962 x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
2963
2964 /*
2965 * Different function args for vertex/fragment shaders:
2966 */
2967 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
2968 if (do_swizzles)
2969 aos_to_soa( func,
2970 4, /* aos_input */
2971 1, /* machine */
2972 5, /* num_inputs */
2973 6 ); /* input_stride */
2974 }
2975
2976 x86_mov(
2977 func,
2978 get_machine_base(),
2979 x86_fn_arg( func, 1 ) );
2980 x86_mov(
2981 func,
2982 get_const_base(),
2983 x86_fn_arg( func, 2 ) );
2984 x86_mov(
2985 func,
2986 get_immediate_base(),
2987 x86_fn_arg( func, 3 ) );
2988
2989 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
2990 x86_mov(
2991 func,
2992 get_coef_base(),
2993 x86_fn_arg( func, 4 ) );
2994 }
2995
2996 x86_mov(
2997 func,
2998 get_sampler_base(),
2999 x86_make_disp( get_machine_base(),
3000 Offset( struct tgsi_exec_machine, Samplers ) ) );
3001
3002 while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
3003 tgsi_parse_token( &parse );
3004
3005 switch( parse.FullToken.Token.Type ) {
3006 case TGSI_TOKEN_TYPE_DECLARATION:
3007 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
3008 emit_declaration(
3009 func,
3010 &parse.FullToken.FullDeclaration );
3011 }
3012 break;
3013
3014 case TGSI_TOKEN_TYPE_INSTRUCTION:
3015 ok = emit_instruction(
3016 func,
3017 &parse.FullToken.FullInstruction );
3018
3019 if (!ok) {
3020 uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
3021 uint proc = parse.FullHeader.Processor.Processor;
3022 debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n",
3023 opcode,
3024 tgsi_get_opcode_name(opcode),
3025 tgsi_get_processor_name(proc));
3026 }
3027
3028 if (ok)
3029 ok = check_soa_dependencies(&parse.FullToken.FullInstruction);
3030 break;
3031
3032 case TGSI_TOKEN_TYPE_IMMEDIATE:
3033 /* simply copy the immediate values into the next immediates[] slot */
3034 {
3035 const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
3036 uint i;
3037 assert(size <= 4);
3038 assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
3039 for( i = 0; i < size; i++ ) {
3040 immediates[num_immediates][i] =
3041 parse.FullToken.FullImmediate.u[i].Float;
3042 }
3043 #if 0
3044 debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
3045 num_immediates,
3046 immediates[num_immediates][0],
3047 immediates[num_immediates][1],
3048 immediates[num_immediates][2],
3049 immediates[num_immediates][3]);
3050 #endif
3051 num_immediates++;
3052 }
3053 break;
3054 case TGSI_TOKEN_TYPE_PROPERTY:
3055 /* we just ignore them for now */
3056 break;
3057
3058 default:
3059 ok = 0;
3060 assert( 0 );
3061 }
3062 }
3063
3064 if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
3065 if (do_swizzles)
3066 soa_to_aos( func,
3067 7, /* aos_output */
3068 1, /* machine */
3069 8, /* num_outputs */
3070 9 ); /* output_stride */
3071 }
3072
3073 /* Can't just use EBX, EDI without save/restoring them:
3074 */
3075 x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
3076 x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
3077
3078 emit_ret( func );
3079
3080 tgsi_parse_free( &parse );
3081
3082 return ok;
3083 }
3084
3085 #endif /* PIPE_ARCH_X86 */