src/mesa/tnl/t_vb_arbprogram_sse.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.3
   4  *
   5  * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file t_vb_arbprogram_sse.c
  27  *
  28  * Translate simplified vertex_program representation to
  29  * x86/x87/SSE/SSE2 machine code using mesa's rtasm runtime assembler.
  30  *
  31  * This is very much a first attempt - build something that works.
  32  * There are probably better approaches for applying SSE to vertex
  33  * programs, and the whole thing is crying out for static analysis of
  34  * the programs to avoid redundant operations.
  35  *
  36  * \author Keith Whitwell
  37  */
  38
  39 #include "glheader.h"
  40 #include "context.h"
  41 #include "imports.h"
  42 #include "macros.h"
  43 #include "mtypes.h"
  44 #include "arbprogparse.h"
  45 #include "program.h"
  46 #include "program_instruction.h"
  47 #include "math/m_matrix.h"
  48 #include "math/m_translate.h"
  49 #include "t_context.h"
  50 #include "t_vb_arbprogram.h"
  51
  52 #if defined(USE_SSE_ASM)
  53
  54 #include "x86/rtasm/x86sse.h"
  55 #include "x86/common_x86_asm.h"
  56
  57 #define X    0
  58 #define Y    1
  59 #define Z    2
  60 #define W    3
  61
  62 /* Reg usage:
  63  *
  64  * EAX - temp
  65  * EBX - point to 'm->File[0]'
  66  * ECX - point to 'm->File[3]'
  67  * EDX - holds 'm'
  68  * EBP,
  69  * ESI,
  70  * EDI
  71  */
  72
  73 #define DISASSEM 0
  74
  75 #define FAIL                                                            \
  76 do {                                                                    \
  77    _mesa_printf("x86 translation failed in %s\n", __FUNCTION__);        \
  78    return GL_FALSE;                                                     \
  79 } while (0)
  80
  81 struct compilation {
  82    struct x86_function func;
  83    struct tnl_compiled_program *p;
  84    GLuint insn_counter;
  85
  86    struct {
  87       GLuint file:2;
  88       GLuint idx:7;
  89       GLuint dirty:1;
  90       GLuint last_used:10;
  91    } xmm[8];
  92
  93    struct {
  94       struct x86_reg base;
  95    } file[4];
  96
  97    GLboolean have_sse2;
  98    GLshort fpucntl;
  99 };
 100
 101 static INLINE GLboolean eq( struct x86_reg a,
 102                             struct x86_reg b )
 103 {
 104    return (a.file == b.file &&
 105            a.idx == b.idx &&
 106            a.mod == b.mod &&
 107            a.disp == b.disp);
 108 }
 109
 110 static GLint get_offset( const void *a, const void *b )
 111 {
 112    return (const char *)b - (const char *)a;
 113 }
 114
 115
 116 static struct x86_reg get_reg_ptr(GLuint file,
 117                                   GLuint idx )
 118 {
 119    struct x86_reg reg;
 120
 121    switch (file) {
 122    case FILE_REG:
 123       reg = x86_make_reg(file_REG32, reg_BX);
 124       assert(idx != REG_UNDEF);
 125       break;
 126    case FILE_STATE_PARAM:
 127       reg = x86_make_reg(file_REG32, reg_CX);
 128       break;
 129    default:
 130       assert(0);
 131    }
 132
 133    return x86_make_disp(reg, 16 * idx);
 134 }
 135
 136
 137 static void spill( struct compilation *cp, GLuint idx )
 138 {
 139    struct x86_reg oldval = get_reg_ptr(cp->xmm[idx].file,
 140                                        cp->xmm[idx].idx);
 141
 142    assert(cp->xmm[idx].dirty);
 143    sse_movups(&cp->func, oldval, x86_make_reg(file_XMM, idx));
 144    cp->xmm[idx].dirty = 0;
 145 }
 146
 147 static struct x86_reg get_xmm_reg( struct compilation *cp )
 148 {
 149    GLuint i;
 150    GLuint oldest = 0;
 151
 152    for (i = 0; i < 8; i++)
 153       if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
 154          oldest = i;
 155
 156    /* Need to write out the old value?
 157     */
 158    if (cp->xmm[oldest].dirty)
 159       spill(cp, oldest);
 160
 161    assert(cp->xmm[oldest].last_used != cp->insn_counter);
 162
 163    cp->xmm[oldest].file = FILE_REG;
 164    cp->xmm[oldest].idx = REG_UNDEF;
 165    cp->xmm[oldest].last_used = cp->insn_counter;
 166    return x86_make_reg(file_XMM, oldest);
 167 }
 168
 169 static void invalidate_xmm( struct compilation *cp,
 170                             GLuint file, GLuint idx )
 171 {
 172    GLuint i;
 173
 174    /* Invalidate any old copy of this register in XMM0-7.
 175     */
 176    for (i = 0; i < 8; i++) {
 177       if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) {
 178          cp->xmm[i].file = FILE_REG;
 179          cp->xmm[i].idx = REG_UNDEF;
 180          cp->xmm[i].dirty = 0;
 181          break;
 182       }
 183    }
 184 }
 185
 186
 187 /* Return an XMM reg to receive the results of an operation.
 188  */
 189 static struct x86_reg get_dst_xmm_reg( struct compilation *cp,
 190                                        GLuint file, GLuint idx )
 191 {
 192    struct x86_reg reg;
 193
 194    /* Invalidate any old copy of this register in XMM0-7.  Don't reuse
 195     * as this may be one of the arguments.
 196     */
 197    invalidate_xmm( cp, file, idx );
 198
 199    reg = get_xmm_reg( cp );
 200    cp->xmm[reg.idx].file = file;
 201    cp->xmm[reg.idx].idx = idx;
 202    cp->xmm[reg.idx].dirty = 1;
 203    return reg;
 204 }
 205
 206 /* As above, but return a pointer.  Note - this pointer may alias
 207  * those returned by get_arg_ptr().
 208  */
 209 static struct x86_reg get_dst_ptr( struct compilation *cp,
 210                                    GLuint file, GLuint idx )
 211 {
 212    /* Invalidate any old copy of this register in XMM0-7.  Don't reuse
 213     * as this may be one of the arguments.
 214     */
 215    invalidate_xmm( cp, file, idx );
 216
 217    return get_reg_ptr(file, idx);
 218 }
 219
 220
 221
 222 /* Return an XMM reg if the argument is resident, otherwise return a
 223  * base+offset pointer to the saved value.
 224  */
 225 static struct x86_reg get_arg( struct compilation *cp, GLuint file, GLuint idx )
 226 {
 227    GLuint i;
 228
 229    for (i = 0; i < 8; i++) {
 230       if (cp->xmm[i].file == file &&
 231           cp->xmm[i].idx == idx) {
 232          cp->xmm[i].last_used = cp->insn_counter;
 233          return x86_make_reg(file_XMM, i);
 234       }
 235    }
 236
 237    return get_reg_ptr(file, idx);
 238 }
 239
 240 /* As above, but always return a pointer:
 241  */
 242 static struct x86_reg get_arg_ptr( struct compilation *cp, GLuint file, GLuint idx )
 243 {
 244    GLuint i;
 245
 246    /* If there is a modified version of this register in one of the
 247     * XMM regs, write it out to memory.
 248     */
 249    for (i = 0; i < 8; i++) {
 250       if (cp->xmm[i].file == file &&
 251           cp->xmm[i].idx == idx &&
 252           cp->xmm[i].dirty)
 253          spill(cp, i);
 254    }
 255
 256    return get_reg_ptr(file, idx);
 257 }
 258
 259 /* Emulate pshufd insn in regular SSE, if necessary:
 260  */
 261 static void emit_pshufd( struct compilation *cp,
 262                          struct x86_reg dst,
 263                          struct x86_reg arg0,
 264                          GLubyte shuf )
 265 {
 266    if (cp->have_sse2) {
 267       sse2_pshufd(&cp->func, dst, arg0, shuf);
 268       cp->func.fn = 0;
 269    }
 270    else {
 271       if (!eq(dst, arg0))
 272          sse_movups(&cp->func, dst, arg0);
 273
 274       sse_shufps(&cp->func, dst, dst, shuf);
 275    }
 276 }
 277
 278 static void set_fpu_round_neg_inf( struct compilation *cp )
 279 {
 280    if (cp->fpucntl != RND_NEG_FPU) {
 281       struct x86_reg regEDX = x86_make_reg(file_REG32, reg_DX);
 282       struct arb_vp_machine *m = NULL;
 283
 284       cp->fpucntl = RND_NEG_FPU;
 285       x87_fnclex(&cp->func);
 286       x87_fldcw(&cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_rnd_neg)));
 287    }
 288 }
 289
 290
 291 /* Perform a reduced swizzle.
 292  */
 293 static GLboolean emit_RSW( struct compilation *cp, union instruction op )
 294 {
 295    struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
 296    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
 297    GLuint swz = GET_SWZ(op.rsw.swz, 0) | (GET_SWZ(op.rsw.swz, 1) << 2) |
 298                 (GET_SWZ(op.rsw.swz, 2) << 4| (GET_SWZ(op.rsw.swz, 3) << 6));
 299    GLuint neg = op.rsw.neg;
 300
 301    emit_pshufd(cp, dst, arg0, swz);
 302
 303    if (neg) {
 304       struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
 305       struct x86_reg tmp = get_xmm_reg(cp);
 306       /* Load 1,-1,0,0
 307        * Use neg as arg to pshufd
 308        * Multiply
 309        */
 310       /* is the emit_pshufd necessary? only SWZ can negate individual components */
 311       emit_pshufd(cp, tmp, negs,
 312                   SHUF((neg & 1) ? 1 : 0,
 313                        (neg & 2) ? 1 : 0,
 314                        (neg & 4) ? 1 : 0,
 315                        (neg & 8) ? 1 : 0));
 316       sse_mulps(&cp->func, dst, tmp);
 317    }
 318
 319    return GL_TRUE;
 320 }
 321
 322 /* Perform a full swizzle
 323  */
 324 static GLboolean emit_SWZ( struct compilation *cp, union instruction op )
 325 {
 326    struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
 327    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
 328    struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
 329    struct x86_reg tmp = get_xmm_reg(cp);
 330    GLubyte neg = op.rsw.neg;
 331    GLubyte shuf2, swz, savepos, savemask, swizzle[4];
 332
 333    swizzle[0] = GET_SWZ(op.rsw.swz, 0);
 334    swizzle[1] = GET_SWZ(op.rsw.swz, 1);
 335    swizzle[2] = GET_SWZ(op.rsw.swz, 2);
 336    swizzle[3] = GET_SWZ(op.rsw.swz, 3);
 337
 338    swz = SHUF((swizzle[0] & 3), (swizzle[1] & 3),
 339               (swizzle[2] & 3), (swizzle[3] & 3));
 340
 341    emit_pshufd(cp, dst, arg0, swz);
 342
 343    /* can handle negation and replace with zero with the same shuffle/mul */
 344    shuf2 = SHUF(swizzle[0] == 4 ? 2 : (neg & 1),
 345                 swizzle[1] == 4 ? 2 : ((neg & 2) >> 1),
 346                 swizzle[2] == 4 ? 2 : ((neg & 4) >> 2),
 347                 swizzle[3] == 4 ? 2 : ((neg & 8) >> 3));
 348
 349    /* now the hard part is getting those 1's in there... */
 350    savepos = 0;
 351    savemask = 0;
 352    if (swizzle[0] == 5) savepos = 1;
 353    if (swizzle[1] == 5) savepos = 2;
 354    else savemask |= 1 << 2;
 355    if (swizzle[2] == 5) savepos = 3;
 356    else savemask |= 2 << 4;
 357    if (swizzle[3] == 5) savepos = 4;
 358    else savemask |= 3 << 6;
 359    if (savepos) {
 360       /* need a mov first as movss from memory will overwrite high bits of xmm reg */
 361       sse_movups(&cp->func, tmp, negs);
 362       /* can only replace lowest 32bits, thus move away that part first */
 363       emit_pshufd(cp, dst, dst, savemask);
 364       sse_movss(&cp->func, dst, tmp);
 365       emit_pshufd(cp, dst, dst, (savepos - 1) | (savemask & 0xfc));
 366    }
 367
 368    if (shuf2) {
 369       /* Load 1,-1,0,0
 370        * Use neg as arg to pshufd
 371        * Multiply
 372        */
 373       emit_pshufd(cp, tmp, negs, shuf2);
 374       sse_mulps(&cp->func, dst, tmp);
 375    }
 376
 377    return GL_TRUE;
 378 }
 379
 380 /* Helper for writemask:
 381  */
 382 static GLboolean emit_shuf_copy1( struct compilation *cp,
 383                                   struct x86_reg dst,
 384                                   struct x86_reg arg0,
 385                                   struct x86_reg arg1,
 386                                   GLubyte shuf )
 387 {
 388    struct x86_reg tmp = get_xmm_reg(cp);
 389    sse_movups(&cp->func, dst, arg1);
 390    emit_pshufd(cp, dst, dst, shuf);
 391    emit_pshufd(cp, tmp, arg0, shuf);
 392
 393    sse_movss(&cp->func, dst, tmp);
 394
 395    emit_pshufd(cp, dst, dst, shuf);
 396    return GL_TRUE;
 397 }
 398
 399
 400 /* Helper for writemask:
 401  */
 402 static GLboolean emit_shuf_copy2( struct compilation *cp,
 403                                   struct x86_reg dst,
 404                                   struct x86_reg arg0,
 405                                   struct x86_reg arg1,
 406                                   GLubyte shuf )
 407 {
 408    struct x86_reg tmp = get_xmm_reg(cp);
 409    emit_pshufd(cp, dst, arg1, shuf);
 410    emit_pshufd(cp, tmp, arg0, shuf);
 411
 412    sse_shufps(&cp->func, dst, tmp, SHUF(X, Y, Z, W));
 413
 414    emit_pshufd(cp, dst, dst, shuf);
 415    return GL_TRUE;
 416 }
 417
 418
 419 static void emit_x87_ex2( struct compilation *cp )
 420 {
 421    struct x86_reg st0 = x86_make_reg(file_x87, 0);
 422    struct x86_reg st1 = x86_make_reg(file_x87, 1);
 423    struct x86_reg st3 = x86_make_reg(file_x87, 3);
 424
 425    set_fpu_round_neg_inf( cp );
 426
 427    x87_fld(&cp->func, st0); /* a a */
 428    x87_fprndint( &cp->func );   /* int(a) a */
 429    x87_fld(&cp->func, st0); /* int(a) int(a) a */
 430    x87_fstp(&cp->func, st3); /* int(a) a int(a)*/
 431    x87_fsubp(&cp->func, st1); /* frac(a) int(a) */
 432    x87_f2xm1(&cp->func);    /* (2^frac(a))-1 int(a)*/
 433    x87_fld1(&cp->func);    /* 1 (2^frac(a))-1 int(a)*/
 434    x87_faddp(&cp->func, st1);   /* 2^frac(a) int(a) */
 435    x87_fscale(&cp->func);       /* 2^a */
 436 }
 437
 438 #if 0
 439 static GLboolean emit_MSK2( struct compilation *cp, union instruction op )
 440 {
 441    struct x86_reg arg0 = get_arg(cp, op.msk.file, op.msk.arg);
 442    struct x86_reg arg1 = get_arg(cp, FILE_REG, op.msk.dst); /* NOTE! */
 443    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.msk.dst);
 444
 445    /* make full width bitmask in tmp
 446     * dst = ~tmp
 447     * tmp &= arg0
 448     * dst &= arg1
 449     * dst |= tmp
 450     */
 451    emit_pshufd(cp, tmp, get_arg(cp, FILE_REG, REG_NEGS),
 452                SHUF((op.msk.mask & 1) ? 2 : 0,
 453                     (op.msk.mask & 2) ? 2 : 0,
 454                     (op.msk.mask & 4) ? 2 : 0,
 455                     (op.msk.mask & 8) ? 2 : 0));
 456    sse2_pnot(&cp->func, dst, tmp);
 457    sse2_pand(&cp->func, arg0, tmp);
 458    sse2_pand(&cp->func, arg1, dst);
 459    sse2_por(&cp->func, tmp, dst);
 460    return GL_TRUE;
 461 }
 462 #endif
 463
 464
 465 /* Used to implement write masking.  This and most of the other instructions
 466  * here would be easier to implement if there had been a translation
 467  * to a 2 argument format (dst/arg0, arg1) at the shader level before
 468  * attempting to translate to x86/sse code.
 469  */
 470 static GLboolean emit_MSK( struct compilation *cp, union instruction op )
 471 {
 472    struct x86_reg arg = get_arg(cp, op.msk.file, op.msk.idx);
 473    struct x86_reg dst0 = get_arg(cp, FILE_REG, op.msk.dst); /* NOTE! */
 474    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.msk.dst);
 475
 476    /* Note that dst and dst0 refer to the same program variable, but
 477     * will definitely be different XMM registers.  We're effectively
 478     * treating this as a 2 argument SEL now, just one of which happens
 479     * always to be the same register as the destination.
 480     */
 481
 482    switch (op.msk.mask) {
 483    case 0:
 484       sse_movups(&cp->func, dst, dst0);
 485       return GL_TRUE;
 486
 487    case WRITEMASK_X:
 488       if (arg.file == file_XMM) {
 489          sse_movups(&cp->func, dst, dst0);
 490          sse_movss(&cp->func, dst, arg);
 491       }
 492       else {
 493          struct x86_reg tmp = get_xmm_reg(cp);
 494          sse_movups(&cp->func, dst, dst0);
 495          sse_movss(&cp->func, tmp, arg);
 496          sse_movss(&cp->func, dst, tmp);
 497       }
 498       return GL_TRUE;
 499
 500    case WRITEMASK_XY:
 501       sse_movups(&cp->func, dst, dst0);
 502       sse_shufps(&cp->func, dst, arg, SHUF(X, Y, Z, W));
 503       return GL_TRUE;
 504
 505    case WRITEMASK_ZW:
 506       sse_movups(&cp->func, dst, arg);
 507       sse_shufps(&cp->func, dst, dst0, SHUF(X, Y, Z, W));
 508       return GL_TRUE;
 509
 510    case WRITEMASK_YZW:
 511       if (dst0.file == file_XMM) {
 512          sse_movups(&cp->func, dst, arg);
 513          sse_movss(&cp->func, dst, dst0);
 514       }
 515       else {
 516          struct x86_reg tmp = get_xmm_reg(cp);
 517          sse_movups(&cp->func, dst, arg);
 518          sse_movss(&cp->func, tmp, dst0);
 519          sse_movss(&cp->func, dst, tmp);
 520       }
 521       return GL_TRUE;
 522
 523    case WRITEMASK_Y:
 524       emit_shuf_copy1(cp, dst, arg, dst0, SHUF(Y,X,Z,W));
 525       return GL_TRUE;
 526
 527    case WRITEMASK_Z:
 528       emit_shuf_copy1(cp, dst, arg, dst0, SHUF(Z,Y,X,W));
 529       return GL_TRUE;
 530
 531    case WRITEMASK_W:
 532       emit_shuf_copy1(cp, dst, arg, dst0, SHUF(W,Y,Z,X));
 533       return GL_TRUE;
 534
 535    case WRITEMASK_XZ:
 536       emit_shuf_copy2(cp, dst, arg, dst0, SHUF(X,Z,Y,W));
 537       return GL_TRUE;
 538
 539    case WRITEMASK_XW:
 540       emit_shuf_copy2(cp, dst, arg, dst0, SHUF(X,W,Z,Y));
 541
 542    case WRITEMASK_YZ:
 543       emit_shuf_copy2(cp, dst, arg, dst0, SHUF(Z,Y,X,W));
 544       return GL_TRUE;
 545
 546    case WRITEMASK_YW:
 547       emit_shuf_copy2(cp, dst, arg, dst0, SHUF(W,Y,Z,X));
 548       return GL_TRUE;
 549
 550    case WRITEMASK_XZW:
 551       emit_shuf_copy1(cp, dst, dst0, arg, SHUF(Y,X,Z,W));
 552       return GL_TRUE;
 553
 554    case WRITEMASK_XYW:
 555       emit_shuf_copy1(cp, dst, dst0, arg, SHUF(Z,Y,X,W));
 556       return GL_TRUE;
 557
 558    case WRITEMASK_XYZ:
 559       emit_shuf_copy1(cp, dst, dst0, arg, SHUF(W,Y,Z,X));
 560       return GL_TRUE;
 561
 562    case WRITEMASK_XYZW:
 563       sse_movups(&cp->func, dst, arg);
 564       return GL_TRUE;
 565
 566    default:
 567       assert(0);
 568       break;
 569    }
 570 }
 571
 572
 573
 574 static GLboolean emit_PRT( struct compilation *cp, union instruction op )
 575 {
 576    FAIL;
 577 }
 578
 579
 580 /**
 581  * The traditional instructions.  All operate on internal registers
 582  * and ignore write masks and swizzling issues.
 583  */
 584
 585 static GLboolean emit_ABS( struct compilation *cp, union instruction op )
 586 {
 587    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 588    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 589    struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG);
 590
 591    sse_movups(&cp->func, dst, arg0);
 592    sse_mulps(&cp->func, dst, neg);
 593    sse_maxps(&cp->func, dst, arg0);
 594    return GL_TRUE;
 595 }
 596
 597 static GLboolean emit_ADD( struct compilation *cp, union instruction op )
 598 {
 599    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 600    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 601    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 602
 603    sse_movups(&cp->func, dst, arg0);
 604    sse_addps(&cp->func, dst, arg1);
 605    return GL_TRUE;
 606 }
 607
 608
 609 /* The dotproduct instructions don't really do that well in sse:
 610  */
 611 static GLboolean emit_DP3( struct compilation *cp, union instruction op )
 612 {
 613    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 614    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 615    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 616    struct x86_reg tmp = get_xmm_reg(cp);
 617
 618    sse_movups(&cp->func, dst, arg0);
 619    sse_mulps(&cp->func, dst, arg1);
 620
 621    /* Now the hard bit: sum the first 3 values:
 622     */
 623    sse_movhlps(&cp->func, tmp, dst);
 624    sse_addss(&cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
 625    emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
 626    sse_addss(&cp->func, dst, tmp);
 627    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
 628    return GL_TRUE;
 629 }
 630
 631
 632
 633 static GLboolean emit_DP4( struct compilation *cp, union instruction op )
 634 {
 635    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 636    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 637    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 638    struct x86_reg tmp = get_xmm_reg(cp);
 639
 640    sse_movups(&cp->func, dst, arg0);
 641    sse_mulps(&cp->func, dst, arg1);
 642
 643    /* Now the hard bit: sum the values:
 644     */
 645    sse_movhlps(&cp->func, tmp, dst);
 646    sse_addps(&cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
 647    emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
 648    sse_addss(&cp->func, dst, tmp);
 649    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
 650    return GL_TRUE;
 651 }
 652
 653 static GLboolean emit_DPH( struct compilation *cp, union instruction op )
 654 {
 655    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 656    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 657    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 658    struct x86_reg tmp = get_xmm_reg(cp);
 659
 660    sse_movups(&cp->func, dst, arg0);
 661    sse_mulps(&cp->func, dst, arg1);
 662
 663    /* Now the hard bit: sum the values (from DP3):
 664     */
 665    sse_movhlps(&cp->func, tmp, dst);
 666    sse_addss(&cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
 667    emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
 668    sse_addss(&cp->func, dst, tmp);
 669    emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
 670    sse_addss(&cp->func, dst, tmp);
 671    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
 672    return GL_TRUE;
 673 }
 674
 675 #if 0
 676 static GLboolean emit_DST( struct compilation *cp, union instruction op )
 677 {
 678     struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 679     struct x86_reg arg1 = get_arg_ptr(cp, op.alu.file1, op.alu.idx1);
 680     struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 681
 682 /*    dst[0] = 1.0     * 1.0F; */
 683 /*    dst[1] = arg0[1] * arg1[1]; */
 684 /*    dst[2] = arg0[2] * 1.0; */
 685 /*    dst[3] = 1.0     * arg1[3]; */
 686
 687     /* Would rather do some of this with integer regs, but:
 688      *  1) No proper support for immediate values yet
 689      *  2) I'd need to push/pop somewhere to get a free reg.
 690      */
 691     x87_fld1(&cp->func);
 692     x87_fstp(&cp->func, dst); /* would rather do an immediate store... */
 693     x87_fld(&cp->func, x86_make_disp(arg0, 4));
 694     x87_fmul(&cp->func, x86_make_disp(arg1, 4));
 695     x87_fstp(&cp->func, x86_make_disp(dst, 4));
 696
 697     if (!eq(arg0, dst)) {
 698        x86_fld(&cp->func, x86_make_disp(arg0, 8));
 699        x86_stp(&cp->func, x86_make_disp(dst, 8));
 700     }
 701
 702     if (!eq(arg1, dst)) {
 703        x86_fld(&cp->func, x86_make_disp(arg0, 12));
 704        x86_stp(&cp->func, x86_make_disp(dst, 12));
 705     }
 706
 707     return GL_TRUE;
 708 }
 709 #else
 710 static GLboolean emit_DST( struct compilation *cp, union instruction op )
 711 {
 712     struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 713     struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 714     struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 715     struct x86_reg tmp = get_xmm_reg(cp);
 716     struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
 717
 718     emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
 719     emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
 720     sse_mulps(&cp->func, dst, tmp);
 721
 722 /*    dst[0] = 1.0     * 1.0F; */
 723 /*    dst[1] = arg0[1] * arg1[1]; */
 724 /*    dst[2] = arg0[2] * 1.0; */
 725 /*    dst[3] = 1.0     * arg1[3]; */
 726
 727     return GL_TRUE;
 728 }
 729 #endif
 730
 731 static GLboolean emit_LG2( struct compilation *cp, union instruction op )
 732 {
 733    struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 734    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 735
 736    x87_fld1(&cp->func);         /* 1 */
 737    x87_fld(&cp->func, arg0);    /* a0 1 */
 738    x87_fyl2x(&cp->func);        /* log2(a0) */
 739    x87_fst(&cp->func, x86_make_disp(dst, 0));
 740    x87_fst(&cp->func, x86_make_disp(dst, 4));
 741    x87_fst(&cp->func, x86_make_disp(dst, 8));
 742    x87_fstp(&cp->func, x86_make_disp(dst, 12));
 743
 744    return GL_TRUE;
 745 }
 746
 747
 748 static GLboolean emit_EX2( struct compilation *cp, union instruction op )
 749 {
 750    struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 751    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 752
 753    /* CAUTION: dst may alias arg0!
 754     */
 755    x87_fld(&cp->func, arg0);
 756
 757    emit_x87_ex2(cp);
 758
 759    x87_fst(&cp->func, x86_make_disp(dst, 0));
 760    x87_fst(&cp->func, x86_make_disp(dst, 4));
 761    x87_fst(&cp->func, x86_make_disp(dst, 8));
 762    x87_fst(&cp->func, x86_make_disp(dst, 12));
 763    return GL_TRUE;
 764 }
 765
 766 static GLboolean emit_EXP( struct compilation *cp, union instruction op )
 767 {
 768     struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 769     struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 770     struct x86_reg st0 = x86_make_reg(file_x87, 0);
 771     struct x86_reg st1 = x86_make_reg(file_x87, 1);
 772     struct x86_reg st3 = x86_make_reg(file_x87, 3);
 773
 774     /* CAUTION: dst may alias arg0!
 775      */
 776     x87_fld(&cp->func, arg0);   /* arg0.x */
 777     x87_fld(&cp->func, st0); /* arg arg */
 778
 779     /* by default, fpu is setup to round-to-nearest.  We want to
 780      * change this now, and track the state through to the end of the
 781      * generated function so that it isn't repeated unnecessarily.
 782      * Alternately, could subtract .5 to get round to -inf behaviour.
 783      */
 784     set_fpu_round_neg_inf( cp );
 785     x87_fprndint( &cp->func );  /* flr(a) a */
 786     x87_fld(&cp->func, st0); /* flr(a) flr(a) a */
 787     x87_fld1(&cp->func);    /* 1 floor(a) floor(a) a */
 788     x87_fst(&cp->func, x86_make_disp(dst, 12));  /* stack unchanged */
 789     x87_fscale(&cp->func);  /* 2^floor(a) floor(a) a */
 790     x87_fst(&cp->func, st3); /* 2^floor(a) floor(a) a 2^floor(a)*/
 791     x87_fstp(&cp->func, x86_make_disp(dst, 0)); /* flr(a) a 2^flr(a) */
 792     x87_fsubrp(&cp->func, st1); /* frac(a) 2^flr(a) */
 793     x87_fst(&cp->func, x86_make_disp(dst, 4));    /* frac(a) 2^flr(a) */
 794     x87_f2xm1(&cp->func);    /* (2^frac(a))-1 2^flr(a)*/
 795     x87_fld1(&cp->func);    /* 1 (2^frac(a))-1 2^flr(a)*/
 796     x87_faddp(&cp->func, st1);  /* 2^frac(a) 2^flr(a) */
 797     x87_fmulp(&cp->func, st1);  /* 2^a */
 798     x87_fst(&cp->func, x86_make_disp(dst, 8));
 799
 800
 801
 802 /*    dst[0] = 2^floor(tmp); */
 803 /*    dst[1] = frac(tmp); */
 804 /*    dst[2] = 2^floor(tmp) * 2^frac(tmp); */
 805 /*    dst[3] = 1.0F; */
 806     return GL_TRUE;
 807 }
 808
 809 static GLboolean emit_LOG( struct compilation *cp, union instruction op )
 810 {
 811     struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 812     struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 813     struct x86_reg st0 = x86_make_reg(file_x87, 0);
 814     struct x86_reg st1 = x86_make_reg(file_x87, 1);
 815     struct x86_reg st2 = x86_make_reg(file_x87, 2);
 816
 817     /* CAUTION: dst may alias arg0!
 818      */
 819     x87_fld(&cp->func, arg0);   /* arg0.x */
 820     x87_fabs(&cp->func);        /* |arg0.x| */
 821     x87_fxtract(&cp->func);     /* mantissa(arg0.x), exponent(arg0.x) */
 822     x87_fst(&cp->func, st2);    /* mantissa, exponent, mantissa */
 823     x87_fld1(&cp->func);        /* 1, mantissa, exponent, mantissa */
 824     x87_fyl2x(&cp->func);       /* log2(mantissa), exponent, mantissa */
 825     x87_fadd(&cp->func, st0, st1);      /* e+l2(m), e, m  */
 826     x87_fstp(&cp->func, x86_make_disp(dst, 8)); /* e, m */
 827
 828     x87_fld1(&cp->func);        /* 1, e, m */
 829     x87_fsub(&cp->func, st1, st0);      /* 1, e-1, m */
 830     x87_fstp(&cp->func, x86_make_disp(dst, 12)); /* e-1,m */
 831     x87_fstp(&cp->func, dst);   /* m */
 832
 833     x87_fadd(&cp->func, st0, st0);      /* 2m */
 834     x87_fstp(&cp->func, x86_make_disp(dst, 4));
 835
 836     return GL_TRUE;
 837 }
 838
 839 static GLboolean emit_FLR( struct compilation *cp, union instruction op )
 840 {
 841    struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 842    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 843    int i;
 844
 845    set_fpu_round_neg_inf( cp );
 846
 847    for (i = 0; i < 4; i++) {
 848       x87_fld(&cp->func, x86_make_disp(arg0, i*4));
 849       x87_fprndint( &cp->func );
 850       x87_fstp(&cp->func, x86_make_disp(dst, i*4));
 851    }
 852
 853
 854    return GL_TRUE;
 855 }
 856
 857 static GLboolean emit_FRC( struct compilation *cp, union instruction op )
 858 {
 859    struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 860    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 861    struct x86_reg st0 = x86_make_reg(file_x87, 0);
 862    struct x86_reg st1 = x86_make_reg(file_x87, 1);
 863    int i;
 864
 865    set_fpu_round_neg_inf( cp );
 866
 867    /* Knowing liveness info or even just writemask would be useful
 868     * here:
 869     */
 870    for (i = 0; i < 4; i++) {
 871       x87_fld(&cp->func, x86_make_disp(arg0, i*4));
 872       x87_fld(&cp->func, st0);  /* a a */
 873       x87_fprndint( &cp->func );   /* flr(a) a */
 874       x87_fsubrp(&cp->func, st1); /* frc(a) */
 875       x87_fstp(&cp->func, x86_make_disp(dst, i*4));
 876    }
 877
 878    return GL_TRUE;
 879 }
 880
 881
 882
 883 static GLboolean emit_LIT( struct compilation *cp, union instruction op )
 884 {
 885 #if 1
 886    struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 887    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 888    struct x86_reg lit = get_arg(cp, FILE_REG, REG_LIT);
 889    struct x86_reg tmp = get_xmm_reg(cp);
 890    struct x86_reg st1 = x86_make_reg(file_x87, 1);
 891    struct x86_reg regEAX = x86_make_reg(file_REG32, reg_AX);
 892    GLubyte *fixup1, *fixup2;
 893
 894
 895    /* Load the interesting parts of arg0:
 896     */
 897    x87_fld(&cp->func, x86_make_disp(arg0, 12)); /* a3 */
 898    x87_fld(&cp->func, x86_make_disp(arg0, 4)); /* a1 a3 */
 899    x87_fld(&cp->func, x86_make_disp(arg0, 0)); /* a0 a1 a3 */
 900
 901    /* Intialize dst:
 902     */
 903    sse_movaps(&cp->func, tmp, lit);
 904    sse_movaps(&cp->func, dst, tmp);
 905
 906    /* Check arg0[0]:
 907     */
 908    x87_fldz(&cp->func);         /* 0 a0 a1 a3 */
 909    x87_fucomp(&cp->func, st1);  /* a0 a1 a3 */
 910    x87_fnstsw(&cp->func, regEAX);
 911    x86_sahf(&cp->func);
 912    fixup1 = x86_jcc_forward(&cp->func, cc_AE);
 913
 914    x87_fstp(&cp->func, x86_make_disp(dst, 4));  /* a1 a3 */
 915
 916    /* Check arg0[1]:
 917     */
 918    x87_fldz(&cp->func);         /* 0 a1 a3 */
 919    x87_fucomp(&cp->func, st1);  /* a1 a3 */
 920    x87_fnstsw(&cp->func, regEAX);
 921    x86_sahf(&cp->func);
 922    fixup2 = x86_jcc_forward(&cp->func, cc_AE);
 923
 924    /* Compute pow(a1, a3)
 925     */
 926    x87_fyl2x(&cp->func);        /* a3*log2(a1) */
 927
 928    emit_x87_ex2( cp );          /* 2^(a3*log2(a1)) */
 929
 930    x87_fstp(&cp->func, x86_make_disp(dst, 8));
 931
 932    /* Land jumps:
 933     */
 934    x86_fixup_fwd_jump(&cp->func, fixup1);
 935    x86_fixup_fwd_jump(&cp->func, fixup2);
 936 #else
 937    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 938    struct x86_reg ones = get_reg_ptr(FILE_REG, REG_LIT);
 939    sse_movups(&cp->func, dst, ones);
 940 #endif
 941    return GL_TRUE;
 942 }
 943
 944
 945
 946 static GLboolean emit_MAX( struct compilation *cp, union instruction op )
 947 {
 948    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 949    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 950    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 951
 952    sse_movups(&cp->func, dst, arg0);
 953    sse_maxps(&cp->func, dst, arg1);
 954    return GL_TRUE;
 955 }
 956
 957
 958 static GLboolean emit_MIN( struct compilation *cp, union instruction op )
 959 {
 960    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 961    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 962    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 963
 964    sse_movups(&cp->func, dst, arg0);
 965    sse_minps(&cp->func, dst, arg1);
 966    return GL_TRUE;
 967 }
 968
 969 static GLboolean emit_MOV( struct compilation *cp, union instruction op )
 970 {
 971    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 972    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 973
 974    sse_movups(&cp->func, dst, arg0);
 975    return GL_TRUE;
 976 }
 977
 978 static GLboolean emit_MUL( struct compilation *cp, union instruction op )
 979 {
 980    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 981    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 982    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 983
 984    sse_movups(&cp->func, dst, arg0);
 985    sse_mulps(&cp->func, dst, arg1);
 986    return GL_TRUE;
 987 }
 988
 989
 990 static GLboolean emit_POW( struct compilation *cp, union instruction op )
 991 {
 992    struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 993    struct x86_reg arg1 = get_arg_ptr(cp, op.alu.file1, op.alu.idx1);
 994    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 995
 996    x87_fld(&cp->func, arg1);    /* a1 */
 997    x87_fld(&cp->func, arg0);    /* a0 a1 */
 998    x87_fyl2x(&cp->func);        /* a1*log2(a0) */
 999
1000    emit_x87_ex2( cp );          /* 2^(a1*log2(a0)) */
1001
1002    x87_fst(&cp->func, x86_make_disp(dst, 0));
1003    x87_fst(&cp->func, x86_make_disp(dst, 4));
1004    x87_fst(&cp->func, x86_make_disp(dst, 8));
1005    x87_fstp(&cp->func, x86_make_disp(dst, 12));
1006
1007    return GL_TRUE;
1008 }
1009
1010 static GLboolean emit_REL( struct compilation *cp, union instruction op )
1011 {
1012 /*    GLuint idx = (op.alu.idx0 + (GLint)cp->File[0][REG_ADDR][0]) & (MAX_NV_VERTEX_PROGRAM_PARAMS-1); */
1013 /*    GLuint idx = 0; */
1014 /*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, idx); */
1015 /*    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst); */
1016
1017 /*    dst[0] = arg0[0]; */
1018 /*    dst[1] = arg0[1]; */
1019 /*    dst[2] = arg0[2]; */
1020 /*    dst[3] = arg0[3]; */
1021
1022    FAIL;
1023 }
1024
1025 static GLboolean emit_RCP( struct compilation *cp, union instruction op )
1026 {
1027    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1028    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1029
1030    if (cp->have_sse2) {
1031       sse2_rcpss(&cp->func, dst, arg0);
1032    }
1033    else {
1034       struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
1035       sse_movss(&cp->func, dst, ones);
1036       sse_divss(&cp->func, dst, arg0);
1037    }
1038
1039    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
1040    return GL_TRUE;
1041 }
1042
1043 static GLboolean emit_RSQ( struct compilation *cp, union instruction op )
1044 {
1045    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1046    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1047 #if 0
1048    struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG);
1049
1050 /* get abs value first. This STILL doesn't work.
1051    Looks like we get bogus neg values ?
1052 */
1053    sse_movss(&cp->func, dst, arg0);
1054    sse_mulss(&cp->func, dst, neg);
1055    sse_maxss(&cp->func, dst, arg0);
1056
1057    sse_rsqrtss(&cp->func, dst, dst);
1058 #endif
1059    sse_rsqrtss(&cp->func, dst, arg0);
1060    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
1061    return GL_TRUE;
1062 }
1063
1064
1065 static GLboolean emit_SGE( struct compilation *cp, union instruction op )
1066 {
1067    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1068    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
1069    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1070    struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
1071
1072    sse_movups(&cp->func, dst, arg0);
1073    sse_cmpps(&cp->func, dst, arg1, cc_NotLessThan);
1074    sse_andps(&cp->func, dst, ones);
1075    return GL_TRUE;
1076 }
1077
1078
1079 static GLboolean emit_SLT( struct compilation *cp, union instruction op )
1080 {
1081    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1082    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
1083    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1084    struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
1085
1086    sse_movups(&cp->func, dst, arg0);
1087    sse_cmpps(&cp->func, dst, arg1, cc_LessThan);
1088    sse_andps(&cp->func, dst, ones);
1089    return GL_TRUE;
1090 }
1091
1092 static GLboolean emit_SUB( struct compilation *cp, union instruction op )
1093 {
1094    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1095    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
1096    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1097
1098    sse_movups(&cp->func, dst, arg0);
1099    sse_subps(&cp->func, dst, arg1);
1100    return GL_TRUE;
1101 }
1102
1103
1104 static GLboolean emit_XPD( struct compilation *cp, union instruction op )
1105 {
1106    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1107    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
1108    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1109    struct x86_reg tmp0 = get_xmm_reg(cp);
1110    struct x86_reg tmp1 = get_xmm_reg(cp);
1111
1112    /* Could avoid tmp0, tmp1 if we overwrote arg0, arg1.  Need a way
1113     * to invalidate registers.  This will come with better analysis
1114     * (liveness analysis) of the incoming program.
1115     */
1116    emit_pshufd(cp, dst, arg0, SHUF(Y, Z, X, W));
1117    emit_pshufd(cp, tmp1, arg1, SHUF(Z, X, Y, W));
1118    sse_mulps(&cp->func, dst, tmp1);
1119    emit_pshufd(cp, tmp0, arg0, SHUF(Z, X, Y, W));
1120    emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
1121    sse_mulps(&cp->func, tmp0, tmp1);
1122    sse_subps(&cp->func, dst, tmp0);
1123
1124 /*    dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
1125 /*    dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
1126 /*    dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
1127 /*    dst[3] is undef */
1128
1129    return GL_TRUE;
1130 }
1131
1132 static GLboolean emit_NOP( struct compilation *cp, union instruction op )
1133 {
1134    return GL_TRUE;
1135 }
1136
1137
1138 static GLboolean (* const emit_func[])(struct compilation *, union instruction) =
1139 {
1140    emit_ABS,
1141    emit_ADD,
1142    emit_NOP, /* ARA */
1143    emit_NOP, /* ARL */
1144    emit_NOP, /* ARL_NV */
1145    emit_NOP, /* ARR */
1146    emit_NOP, /* BRA */
1147    emit_NOP, /* CAL */
1148    emit_NOP, /* CMP */
1149    emit_NOP, /* COS */
1150    emit_NOP, /* DDX */
1151    emit_NOP, /* DDY */
1152    emit_DP3,
1153    emit_DP4,
1154    emit_DPH,
1155    emit_DST,
1156    emit_NOP, /* END */
1157    emit_EX2,
1158    emit_EXP,
1159    emit_FLR,
1160    emit_FRC,
1161    emit_NOP, /* KIL */
1162    emit_NOP, /* KIL_NV */
1163    emit_LG2,
1164    emit_LIT,
1165    emit_LOG,
1166    emit_NOP, /* LRP */
1167    emit_NOP, /* MAD */
1168    emit_MAX,
1169    emit_MIN,
1170    emit_MOV,
1171    emit_MUL,
1172    emit_NOP, /* PK2H */
1173    emit_NOP, /* PK2US */
1174    emit_NOP, /* PK4B */
1175    emit_NOP, /* PK4UB */
1176    emit_POW,
1177    emit_NOP, /* POPA */
1178    emit_PRT,
1179    emit_NOP, /* PUSHA */
1180    emit_NOP, /* RCC */
1181    emit_RCP,
1182    emit_NOP, /* RET */
1183    emit_NOP, /* RFL */
1184    emit_RSQ,
1185    emit_NOP, /* SCS */
1186    emit_NOP, /* SEQ */
1187    emit_NOP, /* SFL */
1188    emit_SGE,
1189    emit_NOP, /* SGT */
1190    emit_NOP, /* SIN */
1191    emit_NOP, /* SLE */
1192    emit_SLT,
1193    emit_NOP, /* SNE */
1194    emit_NOP, /* SSG */
1195    emit_NOP, /* STR */
1196    emit_SUB,
1197    emit_SWZ, /* SWZ */
1198    emit_NOP, /* TEX */
1199    emit_NOP, /* TXB */
1200    emit_NOP, /* TXD */
1201    emit_NOP, /* TXL */
1202    emit_NOP, /* TXP */
1203    emit_NOP, /* TXP_NV */
1204    emit_NOP, /* UP2H */
1205    emit_NOP, /* UP2US */
1206    emit_NOP, /* UP4B */
1207    emit_NOP, /* UP4UB */
1208    emit_NOP, /* X2D */
1209    emit_XPD,
1210    emit_RSW,
1211    emit_MSK,
1212    emit_REL,
1213 };
1214
1215
1216
1217 static GLboolean build_vertex_program( struct compilation *cp )
1218 {
1219    struct arb_vp_machine *m = NULL;
1220    GLuint j;
1221
1222    struct x86_reg regEBX = x86_make_reg(file_REG32, reg_BX);
1223    struct x86_reg regECX = x86_make_reg(file_REG32, reg_CX);
1224    struct x86_reg regEDX = x86_make_reg(file_REG32, reg_DX);
1225
1226    x86_push(&cp->func, regEBX);
1227
1228    x86_mov(&cp->func, regEDX, x86_fn_arg(&cp->func, 1));
1229    x86_mov(&cp->func, regEBX, x86_make_disp(regEDX, get_offset(m, m->File + FILE_REG)));
1230    x86_mov(&cp->func, regECX, x86_make_disp(regEDX, get_offset(m, m->File + FILE_STATE_PARAM)));
1231
1232    for (j = 0; j < cp->p->nr_instructions; j++) {
1233       union instruction inst = cp->p->instructions[j];
1234       cp->insn_counter = j+1;   /* avoid zero */
1235
1236       if (DISASSEM) {
1237          _mesa_printf("%p: ", cp->func.csr);
1238          _tnl_disassem_vba_insn( inst );
1239       }
1240       cp->func.fn = NULL;
1241
1242       if (!emit_func[inst.alu.opcode]( cp, inst )) {
1243          return GL_FALSE;
1244       }
1245    }
1246
1247    /* TODO: only for outputs:
1248     */
1249    for (j = 0; j < 8; j++) {
1250       if (cp->xmm[j].dirty)
1251          spill(cp, j);
1252    }
1253
1254
1255    /* Exit mmx state?
1256     */
1257    if (cp->func.need_emms)
1258       mmx_emms(&cp->func);
1259
1260    /* Restore FPU control word?
1261     */
1262    if (cp->fpucntl != RESTORE_FPU) {
1263       x87_fnclex(&cp->func);
1264       x87_fldcw(&cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_restore)));
1265    }
1266
1267    x86_pop(&cp->func, regEBX);
1268    x86_ret(&cp->func);
1269
1270    return GL_TRUE;
1271 }
1272
1273 /**
1274  * Execute the given vertex program.
1275  *
1276  * TODO: Integrate the t_vertex.c code here, to build machine vertices
1277  * directly at this point.
1278  *
1279  * TODO: Eliminate the VB struct entirely and just use
1280  * struct arb_vertex_machine.
1281  */
1282 GLboolean
1283 _tnl_sse_codegen_vertex_program(struct tnl_compiled_program *p)
1284 {
1285    struct compilation cp;
1286
1287    /* sanity checks */
1288    assert(emit_func[OPCODE_ABS] == emit_ABS);
1289    assert(emit_func[OPCODE_MUL] == emit_MUL);
1290    assert(emit_func[OPCODE_XPD] == emit_XPD);
1291
1292    _mesa_memset(&cp, 0, sizeof(cp));
1293    cp.p = p;
1294    cp.have_sse2 = 1;
1295
1296    if (p->compiled_func) {
1297       _mesa_free((void *)p->compiled_func);
1298       p->compiled_func = NULL;
1299    }
1300
1301    x86_init_func(&cp.func);
1302
1303    cp.fpucntl = RESTORE_FPU;
1304
1305
1306    /* Note ctx state is not referenced in building the function, so it
1307     * depends only on the list of instructions:
1308     */
1309    if (!build_vertex_program(&cp)) {
1310       x86_release_func( &cp.func );
1311       return GL_FALSE;
1312    }
1313
1314
1315    p->compiled_func = (void (*)(struct arb_vp_machine *))x86_get_func( &cp.func );
1316    return GL_TRUE;
1317 }
1318
1319
1320
1321 #else
1322
1323 GLboolean
1324 _tnl_sse_codegen_vertex_program(struct tnl_compiled_program *p)
1325 {
1326    /* Dummy version for when USE_SSE_ASM not defined */
1327    return GL_FALSE;
1328 }
1329
1330 #endif