src/mesa/tnl/t_vb_arbprogram_sse.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.3
   4  *
   5  * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
 * \file t_vb_arbprogram_sse.c
  27  *
  28  * Translate simplified vertex_program representation to
  29  * x86/x87/SSE/SSE2 machine code using mesa's rtasm runtime assembler.
  30  *
  31  * This is very much a first attempt - build something that works.
  32  * There are probably better approaches for applying SSE to vertex
  33  * programs, and the whole thing is crying out for static analysis of
  34  * the programs to avoid redundant operations.
  35  *
  36  * \author Keith Whitwell
  37  */
  38
  39 #include "glheader.h"
  40 #include "context.h"
  41 #include "imports.h"
  42 #include "macros.h"
  43 #include "mtypes.h"
  44 #include "arbprogparse.h"
  45 #include "program.h"
  46 #include "program_instruction.h"
  47 #include "math/m_matrix.h"
  48 #include "math/m_translate.h"
  49 #include "t_context.h"
  50 #include "t_vb_arbprogram.h"
  51
  52 #if defined(USE_SSE_ASM)
  53
  54 #include "x86/rtasm/x86sse.h"
  55 #include "x86/common_x86_asm.h"
  56
  57 #define X    0
  58 #define Y    1
  59 #define Z    2
  60 #define W    3
  61
  62 /* Reg usage:
  63  *
  64  * EAX - temp
  65  * EBX - point to 'm->File[0]'
  66  * ECX - point to 'm->File[3]'
  67  * EDX - holds 'm'
  68  * EBP,
  69  * ESI,
  70  * EDI
  71  */
  72
  73 #define DISASSEM 0
  74
/* Abort translation of the current instruction: print a diagnostic
 * naming the enclosing function and make that function return GL_FALSE.
 * Because it expands to a 'return', it may only be used inside functions
 * that return GLboolean.
 */
#define FAIL                                                            \
do {                                                                    \
   _mesa_printf("x86 translation failed in %s\n", __FUNCTION__);        \
   return GL_FALSE;                                                     \
} while (0)
  80
/* State carried through the translation of one program into x86 code.
 */
struct compilation {
   struct x86_function func;        /* function being assembled; every emitter in this file appends to it */
   struct tnl_compiled_program *p;  /* not referenced in this part of the file -- presumably the program being translated */
   GLuint insn_counter;             /* stamp copied into xmm[].last_used; presumably advanced per translated instruction by the caller (not shown here) */

   /* Bookkeeping for XMM0-7, used as a small cache of program registers.
    */
   struct {
      GLuint file:2;                /* register file of the cached value */
      GLuint idx:7;                 /* index within that file; REG_UNDEF when the slot holds nothing useful */
      GLuint dirty:1;               /* XMM copy not yet written back to memory; cleared by spill() */
      GLuint last_used:10;          /* insn_counter value at last use.  NOTE(review): only 10 bits wide while insn_counter is a full GLuint */
   } xmm[8];

   struct {
      struct x86_reg base;          /* not referenced in this part of the file; get_reg_ptr() hard-codes EBX/ECX */
   } file[4];

   GLboolean have_sse2;             /* when set, emit_pshufd() emits a real pshufd instead of movups+shufps */
   GLshort fpucntl;                 /* x87 rounding setting the generated code has most recently loaded (see set_fpu_round_neg_inf()) */
};
 100
 101 static INLINE GLboolean eq( struct x86_reg a,
 102                             struct x86_reg b )
 103 {
 104    return (a.file == b.file &&
 105            a.idx == b.idx &&
 106            a.mod == b.mod &&
 107            a.disp == b.disp);
 108 }
 109
 110 static GLint get_offset( const void *a, const void *b )
 111 {
 112    return (const char *)b - (const char *)a;
 113 }
 114
 115
 116 static struct x86_reg get_reg_ptr(GLuint file,
 117                                   GLuint idx )
 118 {
 119    struct x86_reg reg;
 120
 121    switch (file) {
 122    case FILE_REG:
 123       reg = x86_make_reg(file_REG32, reg_BX);
 124       assert(idx != REG_UNDEF);
 125       break;
 126    case FILE_STATE_PARAM:
 127       reg = x86_make_reg(file_REG32, reg_CX);
 128       break;
 129    default:
 130       assert(0);
 131    }
 132
 133    return x86_make_disp(reg, 16 * idx);
 134 }
 135
 136
 137 static void spill( struct compilation *cp, GLuint idx )
 138 {
 139    struct x86_reg oldval = get_reg_ptr(cp->xmm[idx].file,
 140                                        cp->xmm[idx].idx);
 141
 142    assert(cp->xmm[idx].dirty);
 143    sse_movups(&cp->func, oldval, x86_make_reg(file_XMM, idx));
 144    cp->xmm[idx].dirty = 0;
 145 }
 146
 147 static struct x86_reg get_xmm_reg( struct compilation *cp )
 148 {
 149    GLuint i;
 150    GLuint oldest = 0;
 151
 152    for (i = 0; i < 8; i++)
 153       if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
 154          oldest = i;
 155
 156    /* Need to write out the old value?
 157     */
 158    if (cp->xmm[oldest].dirty)
 159       spill(cp, oldest);
 160
 161    assert(cp->xmm[oldest].last_used != cp->insn_counter);
 162
 163    cp->xmm[oldest].file = FILE_REG;
 164    cp->xmm[oldest].idx = REG_UNDEF;
 165    cp->xmm[oldest].last_used = cp->insn_counter;
 166    return x86_make_reg(file_XMM, oldest);
 167 }
 168
 169 static void invalidate_xmm( struct compilation *cp,
 170                             GLuint file, GLuint idx )
 171 {
 172    GLuint i;
 173
 174    /* Invalidate any old copy of this register in XMM0-7.
 175     */
 176    for (i = 0; i < 8; i++) {
 177       if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) {
 178          cp->xmm[i].file = FILE_REG;
 179          cp->xmm[i].idx = REG_UNDEF;
 180          cp->xmm[i].dirty = 0;
 181          break;
 182       }
 183    }
 184 }
 185
 186
 187 /* Return an XMM reg to receive the results of an operation.
 188  */
 189 static struct x86_reg get_dst_xmm_reg( struct compilation *cp,
 190                                        GLuint file, GLuint idx )
 191 {
 192    struct x86_reg reg;
 193
 194    /* Invalidate any old copy of this register in XMM0-7.  Don't reuse
 195     * as this may be one of the arguments.
 196     */
 197    invalidate_xmm( cp, file, idx );
 198
 199    reg = get_xmm_reg( cp );
 200    cp->xmm[reg.idx].file = file;
 201    cp->xmm[reg.idx].idx = idx;
 202    cp->xmm[reg.idx].dirty = 1;
 203    return reg;
 204 }
 205
 206 /* As above, but return a pointer.  Note - this pointer may alias
 207  * those returned by get_arg_ptr().
 208  */
 209 static struct x86_reg get_dst_ptr( struct compilation *cp,
 210                                    GLuint file, GLuint idx )
 211 {
 212    /* Invalidate any old copy of this register in XMM0-7.  Don't reuse
 213     * as this may be one of the arguments.
 214     */
 215    invalidate_xmm( cp, file, idx );
 216
 217    return get_reg_ptr(file, idx);
 218 }
 219
 220
 221
 222 /* Return an XMM reg if the argument is resident, otherwise return a
 223  * base+offset pointer to the saved value.
 224  */
 225 static struct x86_reg get_arg( struct compilation *cp, GLuint file, GLuint idx )
 226 {
 227    GLuint i;
 228
 229    for (i = 0; i < 8; i++) {
 230       if (cp->xmm[i].file == file &&
 231           cp->xmm[i].idx == idx) {
 232          cp->xmm[i].last_used = cp->insn_counter;
 233          return x86_make_reg(file_XMM, i);
 234       }
 235    }
 236
 237    return get_reg_ptr(file, idx);
 238 }
 239
 240 /* As above, but always return a pointer:
 241  */
 242 static struct x86_reg get_arg_ptr( struct compilation *cp, GLuint file, GLuint idx )
 243 {
 244    GLuint i;
 245
 246    /* If there is a modified version of this register in one of the
 247     * XMM regs, write it out to memory.
 248     */
 249    for (i = 0; i < 8; i++) {
 250       if (cp->xmm[i].file == file &&
 251           cp->xmm[i].idx == idx &&
 252           cp->xmm[i].dirty)
 253          spill(cp, i);
 254    }
 255
 256    return get_reg_ptr(file, idx);
 257 }
 258
 259 /* Emulate pshufd insn in regular SSE, if necessary:
 260  */
 261 static void emit_pshufd( struct compilation *cp,
 262                          struct x86_reg dst,
 263                          struct x86_reg arg0,
 264                          GLubyte shuf )
 265 {
 266    if (cp->have_sse2) {
 267       sse2_pshufd(&cp->func, dst, arg0, shuf);
 268       cp->func.fn = 0;
 269    }
 270    else {
 271       if (!eq(dst, arg0))
 272          sse_movups(&cp->func, dst, arg0);
 273
 274       sse_shufps(&cp->func, dst, dst, shuf);
 275    }
 276 }
 277
 278 static void set_fpu_round_neg_inf( struct compilation *cp )
 279 {
 280    if (cp->fpucntl != RND_NEG_FPU) {
 281       struct x86_reg regEDX = x86_make_reg(file_REG32, reg_DX);
 282       struct arb_vp_machine *m = NULL;
 283
 284       cp->fpucntl = RND_NEG_FPU;
 285       x87_fnclex(&cp->func);
 286       x87_fldcw(&cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_rnd_neg)));
 287    }
 288 }
 289
 290
 291 /* Perform a reduced swizzle.
 292  */
 293 static GLboolean emit_RSW( struct compilation *cp, union instruction op )
 294 {
 295    struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
 296    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.rsw.dst);
 297    GLuint swz = op.rsw.swz;
 298    GLuint neg = op.rsw.neg;
 299
 300    emit_pshufd(cp, dst, arg0, swz);
 301
 302    if (neg) {
 303       struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
 304       struct x86_reg tmp = get_xmm_reg(cp);
 305       /* Load 1,-1,0,0
 306        * Use neg as arg to pshufd
 307        * Multiply
 308        */
 309       emit_pshufd(cp, tmp, negs,
 310                   SHUF((neg & 1) ? 1 : 0,
 311                        (neg & 2) ? 1 : 0,
 312                        (neg & 4) ? 1 : 0,
 313                        (neg & 8) ? 1 : 0));
 314       sse_mulps(&cp->func, dst, tmp);
 315    }
 316
 317    return GL_TRUE;
 318 }
 319
 320 /* Helper for writemask:
 321  */
 322 static GLboolean emit_shuf_copy1( struct compilation *cp,
 323                                   struct x86_reg dst,
 324                                   struct x86_reg arg0,
 325                                   struct x86_reg arg1,
 326                                   GLubyte shuf )
 327 {
 328    struct x86_reg tmp = get_xmm_reg(cp);
 329    sse_movups(&cp->func, dst, arg1);
 330    emit_pshufd(cp, dst, dst, shuf);
 331    emit_pshufd(cp, tmp, arg0, shuf);
 332
 333    sse_movss(&cp->func, dst, tmp);
 334
 335    emit_pshufd(cp, dst, dst, shuf);
 336    return GL_TRUE;
 337 }
 338
 339
 340 /* Helper for writemask:
 341  */
 342 static GLboolean emit_shuf_copy2( struct compilation *cp,
 343                                   struct x86_reg dst,
 344                                   struct x86_reg arg0,
 345                                   struct x86_reg arg1,
 346                                   GLubyte shuf )
 347 {
 348    struct x86_reg tmp = get_xmm_reg(cp);
 349    emit_pshufd(cp, dst, arg1, shuf);
 350    emit_pshufd(cp, tmp, arg0, shuf);
 351
 352    sse_shufps(&cp->func, dst, tmp, SHUF(X, Y, Z, W));
 353
 354    emit_pshufd(cp, dst, dst, shuf);
 355    return GL_TRUE;
 356 }
 357
 358
 359 static void emit_x87_ex2( struct compilation *cp )
 360 {
 361    struct x86_reg st0 = x86_make_reg(file_x87, 0);
 362    struct x86_reg st1 = x86_make_reg(file_x87, 1);
 363    struct x86_reg st3 = x86_make_reg(file_x87, 3);
 364
 365    set_fpu_round_neg_inf( cp );
 366
 367    x87_fld(&cp->func, st0); /* a a */
 368    x87_fprndint( &cp->func );   /* int(a) a */
 369    x87_fld(&cp->func, st0); /* int(a) int(a) a */
 370    x87_fstp(&cp->func, st3); /* int(a) a int(a)*/
 371    x87_fsubp(&cp->func, st1); /* frac(a) int(a) */
 372    x87_f2xm1(&cp->func);    /* (2^frac(a))-1 int(a)*/
 373    x87_fld1(&cp->func);    /* 1 (2^frac(a))-1 int(a)*/
 374    x87_faddp(&cp->func, st1);   /* 2^frac(a) int(a) */
 375    x87_fscale(&cp->func);       /* 2^a */
 376 }
 377
/* Disabled alternative write-mask implementation built on a full-width
 * bitmask, kept for reference only.  It is not compiled: as written it
 * uses 'tmp' without declaring it and reads op.msk.arg, whereas
 * emit_MSK() reads op.msk.idx.
 */
#if 0
static GLboolean emit_MSK2( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.msk.file, op.msk.arg);
   struct x86_reg arg1 = get_arg(cp, FILE_REG, op.msk.dst); /* NOTE! */
   struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.msk.dst);

   /* make full width bitmask in tmp
    * dst = ~tmp
    * tmp &= arg0
    * dst &= arg1
    * dst |= tmp
    */
   emit_pshufd(cp, tmp, get_arg(cp, FILE_REG, REG_NEGS),
               SHUF((op.msk.mask & 1) ? 2 : 0,
                    (op.msk.mask & 2) ? 2 : 0,
                    (op.msk.mask & 4) ? 2 : 0,
                    (op.msk.mask & 8) ? 2 : 0));
   sse2_pnot(&cp->func, dst, tmp);
   sse2_pand(&cp->func, arg0, tmp);
   sse2_pand(&cp->func, arg1, dst);
   sse2_por(&cp->func, tmp, dst);
   return GL_TRUE;
}
#endif
 403
 404
 405 /* Used to implement write masking.  This and most of the other instructions
 406  * here would be easier to implement if there had been a translation
 407  * to a 2 argument format (dst/arg0, arg1) at the shader level before
 408  * attempting to translate to x86/sse code.
 409  */
 410 static GLboolean emit_MSK( struct compilation *cp, union instruction op )
 411 {
 412    struct x86_reg arg = get_arg(cp, op.msk.file, op.msk.idx);
 413    struct x86_reg dst0 = get_arg(cp, FILE_REG, op.msk.dst); /* NOTE! */
 414    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.msk.dst);
 415
 416    /* Note that dst and dst0 refer to the same program variable, but
 417     * will definitely be different XMM registers.  We're effectively
 418     * treating this as a 2 argument SEL now, just one of which happens
 419     * always to be the same register as the destination.
 420     */
 421
 422    switch (op.msk.mask) {
 423    case 0:
 424       sse_movups(&cp->func, dst, dst0);
 425       return GL_TRUE;
 426
 427    case WRITEMASK_X:
 428       if (arg.file == file_XMM) {
 429          sse_movups(&cp->func, dst, dst0);
 430          sse_movss(&cp->func, dst, arg);
 431       }
 432       else {
 433          struct x86_reg tmp = get_xmm_reg(cp);
 434          sse_movups(&cp->func, dst, dst0);
 435          sse_movss(&cp->func, tmp, arg);
 436          sse_movss(&cp->func, dst, tmp);
 437       }
 438       return GL_TRUE;
 439
 440    case WRITEMASK_XY:
 441       sse_movups(&cp->func, dst, dst0);
 442       sse_shufps(&cp->func, dst, arg, SHUF(X, Y, Z, W));
 443       return GL_TRUE;
 444
 445    case WRITEMASK_ZW:
 446       sse_movups(&cp->func, dst, arg);
 447       sse_shufps(&cp->func, dst, dst0, SHUF(X, Y, Z, W));
 448       return GL_TRUE;
 449
 450    case WRITEMASK_YZW:
 451       if (dst0.file == file_XMM) {
 452          sse_movups(&cp->func, dst, arg);
 453          sse_movss(&cp->func, dst, dst0);
 454       }
 455       else {
 456          struct x86_reg tmp = get_xmm_reg(cp);
 457          sse_movups(&cp->func, dst, arg);
 458          sse_movss(&cp->func, tmp, dst0);
 459          sse_movss(&cp->func, dst, tmp);
 460       }
 461       return GL_TRUE;
 462
 463    case WRITEMASK_Y:
 464       emit_shuf_copy1(cp, dst, arg, dst0, SHUF(Y,X,Z,W));
 465       return GL_TRUE;
 466
 467    case WRITEMASK_Z:
 468       emit_shuf_copy1(cp, dst, arg, dst0, SHUF(Z,Y,X,W));
 469       return GL_TRUE;
 470
 471    case WRITEMASK_W:
 472       emit_shuf_copy1(cp, dst, arg, dst0, SHUF(W,Y,Z,X));
 473       return GL_TRUE;
 474
 475    case WRITEMASK_XZ:
 476       emit_shuf_copy2(cp, dst, arg, dst0, SHUF(X,Z,Y,W));
 477       return GL_TRUE;
 478
 479    case WRITEMASK_XW:
 480       emit_shuf_copy2(cp, dst, arg, dst0, SHUF(X,W,Z,Y));
 481
 482    case WRITEMASK_YZ:
 483       emit_shuf_copy2(cp, dst, arg, dst0, SHUF(Z,Y,X,W));
 484       return GL_TRUE;
 485
 486    case WRITEMASK_YW:
 487       emit_shuf_copy2(cp, dst, arg, dst0, SHUF(W,Y,Z,X));
 488       return GL_TRUE;
 489
 490    case WRITEMASK_XZW:
 491       emit_shuf_copy1(cp, dst, dst0, arg, SHUF(Y,X,Z,W));
 492       return GL_TRUE;
 493
 494    case WRITEMASK_XYW:
 495       emit_shuf_copy1(cp, dst, dst0, arg, SHUF(Z,Y,X,W));
 496       return GL_TRUE;
 497
 498    case WRITEMASK_XYZ:
 499       emit_shuf_copy1(cp, dst, dst0, arg, SHUF(W,Y,Z,X));
 500       return GL_TRUE;
 501
 502    case WRITEMASK_XYZW:
 503       sse_movups(&cp->func, dst, arg);
 504       return GL_TRUE;
 505
 506    default:
 507       assert(0);
 508       break;
 509    }
 510 }
 511
 512
 513
 514 static GLboolean emit_PRT( struct compilation *cp, union instruction op )
 515 {
 516    FAIL;
 517 }
 518
 519
 520 /**
 521  * The traditional instructions.  All operate on internal registers
 522  * and ignore write masks and swizzling issues.
 523  */
 524
 525 static GLboolean emit_ABS( struct compilation *cp, union instruction op )
 526 {
 527    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 528    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 529    struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG);
 530
 531    sse_movups(&cp->func, dst, arg0);
 532    sse_mulps(&cp->func, dst, neg);
 533    sse_maxps(&cp->func, dst, arg0);
 534    return GL_TRUE;
 535 }
 536
 537 static GLboolean emit_ADD( struct compilation *cp, union instruction op )
 538 {
 539    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 540    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 541    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 542
 543    sse_movups(&cp->func, dst, arg0);
 544    sse_addps(&cp->func, dst, arg1);
 545    return GL_TRUE;
 546 }
 547
 548
 549 /* The dotproduct instructions don't really do that well in sse:
 550  */
 551 static GLboolean emit_DP3( struct compilation *cp, union instruction op )
 552 {
 553    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 554    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 555    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 556    struct x86_reg tmp = get_xmm_reg(cp);
 557
 558    sse_movups(&cp->func, dst, arg0);
 559    sse_mulps(&cp->func, dst, arg1);
 560
 561    /* Now the hard bit: sum the first 3 values:
 562     */
 563    sse_movhlps(&cp->func, tmp, dst);
 564    sse_addss(&cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
 565    emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
 566    sse_addss(&cp->func, dst, tmp);
 567    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
 568    return GL_TRUE;
 569 }
 570
 571
 572
 573 static GLboolean emit_DP4( struct compilation *cp, union instruction op )
 574 {
 575    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 576    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 577    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 578    struct x86_reg tmp = get_xmm_reg(cp);
 579
 580    sse_movups(&cp->func, dst, arg0);
 581    sse_mulps(&cp->func, dst, arg1);
 582
 583    /* Now the hard bit: sum the values:
 584     */
 585    sse_movhlps(&cp->func, tmp, dst);
 586    sse_addps(&cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
 587    emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
 588    sse_addss(&cp->func, dst, tmp);
 589    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
 590    return GL_TRUE;
 591 }
 592
 593 static GLboolean emit_DPH( struct compilation *cp, union instruction op )
 594 {
 595    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 596    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 597    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 598    struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
 599    struct x86_reg tmp = get_xmm_reg(cp);
 600
 601    emit_pshufd(cp, dst, arg0, SHUF(W,X,Y,Z));
 602    sse_movss(&cp->func, dst, ones);
 603    emit_pshufd(cp, dst, dst, SHUF(W,X,Y,Z));
 604    sse_mulps(&cp->func, dst, arg1);
 605
 606    /* Now the hard bit: sum the values (from DP4):
 607     */
 608    sse_movhlps(&cp->func, tmp, dst);
 609    sse_addps(&cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
 610    emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
 611    sse_addss(&cp->func, dst, tmp);
 612    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
 613    return GL_TRUE;
 614 }
 615
 616 #if 0
 617 static GLboolean emit_DST( struct compilation *cp, union instruction op )
 618 {
 619     struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 620     struct x86_reg arg1 = get_arg_ptr(cp, op.alu.file1, op.alu.idx1);
 621     struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 622
 623 /*    dst[0] = 1.0     * 1.0F; */
 624 /*    dst[1] = arg0[1] * arg1[1]; */
 625 /*    dst[2] = arg0[2] * 1.0; */
 626 /*    dst[3] = 1.0     * arg1[3]; */
 627
 628     /* Would rather do some of this with integer regs, but:
 629      *  1) No proper support for immediate values yet
 630      *  2) I'd need to push/pop somewhere to get a free reg.
 631      */
 632     x87_fld1(&cp->func);
 633     x87_fstp(&cp->func, dst); /* would rather do an immediate store... */
 634     x87_fld(&cp->func, x86_make_disp(arg0, 4));
 635     x87_fmul(&cp->func, x86_make_disp(arg1, 4));
 636     x87_fstp(&cp->func, x86_make_disp(dst, 4));
 637
 638     if (!eq(arg0, dst)) {
 639        x86_fld(&cp->func, x86_make_disp(arg0, 8));
 640        x86_stp(&cp->func, x86_make_disp(dst, 8));
 641     }
 642
 643     if (!eq(arg1, dst)) {
 644        x86_fld(&cp->func, x86_make_disp(arg0, 12));
 645        x86_stp(&cp->func, x86_make_disp(dst, 12));
 646     }
 647
 648     return GL_TRUE;
 649 }
 650 #else
 651 static GLboolean emit_DST( struct compilation *cp, union instruction op )
 652 {
 653     struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 654     struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 655     struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 656     struct x86_reg tmp = get_xmm_reg(cp);
 657     struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
 658
 659     emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
 660     emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
 661     sse_mulps(&cp->func, dst, tmp);
 662
 663 /*    dst[0] = 1.0     * 1.0F; */
 664 /*    dst[1] = arg0[1] * arg1[1]; */
 665 /*    dst[2] = arg0[2] * 1.0; */
 666 /*    dst[3] = 1.0     * arg1[3]; */
 667
 668     return GL_TRUE;
 669 }
 670 #endif
 671
 672 static GLboolean emit_LG2( struct compilation *cp, union instruction op )
 673 {
 674    struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 675    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 676
 677    x87_fld1(&cp->func);         /* 1 */
 678    x87_fld(&cp->func, arg0);    /* a0 1 */
 679    x87_fyl2x(&cp->func);        /* log2(a0) */
 680    x87_fst(&cp->func, x86_make_disp(dst, 0));
 681    x87_fst(&cp->func, x86_make_disp(dst, 4));
 682    x87_fst(&cp->func, x86_make_disp(dst, 8));
 683    x87_fstp(&cp->func, x86_make_disp(dst, 12));
 684
 685    return GL_TRUE;
 686 }
 687
 688
 689 static GLboolean emit_EX2( struct compilation *cp, union instruction op )
 690 {
 691    struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 692    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 693
 694    /* CAUTION: dst may alias arg0!
 695     */
 696    x87_fld(&cp->func, arg0);
 697
 698    emit_x87_ex2(cp);
 699
 700    x87_fst(&cp->func, x86_make_disp(dst, 0));
 701    x87_fst(&cp->func, x86_make_disp(dst, 4));
 702    x87_fst(&cp->func, x86_make_disp(dst, 8));
 703    x87_fst(&cp->func, x86_make_disp(dst, 12));
 704    return GL_TRUE;
 705 }
 706
 707 static GLboolean emit_EXP( struct compilation *cp, union instruction op )
 708 {
 709     struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 710     struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 711     struct x86_reg st0 = x86_make_reg(file_x87, 0);
 712     struct x86_reg st1 = x86_make_reg(file_x87, 1);
 713     struct x86_reg st3 = x86_make_reg(file_x87, 3);
 714
 715     /* CAUTION: dst may alias arg0!
 716      */
 717     x87_fld(&cp->func, arg0);   /* arg0.x */
 718     x87_fld(&cp->func, st0); /* arg arg */
 719
 720     /* by default, fpu is setup to round-to-nearest.  We want to
 721      * change this now, and track the state through to the end of the
 722      * generated function so that it isn't repeated unnecessarily.
 723      * Alternately, could subtract .5 to get round to -inf behaviour.
 724      */
 725     set_fpu_round_neg_inf( cp );
 726     x87_fprndint( &cp->func );  /* flr(a) a */
 727     x87_fld(&cp->func, st0); /* flr(a) flr(a) a */
 728     x87_fld1(&cp->func);    /* 1 floor(a) floor(a) a */
 729     x87_fst(&cp->func, x86_make_disp(dst, 12));  /* stack unchanged */
 730     x87_fscale(&cp->func);  /* 2^floor(a) floor(a) a */
 731     x87_fst(&cp->func, st3); /* 2^floor(a) floor(a) a 2^floor(a)*/
 732     x87_fstp(&cp->func, x86_make_disp(dst, 0)); /* flr(a) a 2^flr(a) */
 733     x87_fsubrp(&cp->func, st1); /* frac(a) 2^flr(a) */
 734     x87_fst(&cp->func, x86_make_disp(dst, 4));    /* frac(a) 2^flr(a) */
 735     x87_f2xm1(&cp->func);    /* (2^frac(a))-1 2^flr(a)*/
 736     x87_fld1(&cp->func);    /* 1 (2^frac(a))-1 2^flr(a)*/
 737     x87_faddp(&cp->func, st1);  /* 2^frac(a) 2^flr(a) */
 738     x87_fmulp(&cp->func, st1);  /* 2^a */
 739     x87_fst(&cp->func, x86_make_disp(dst, 8));
 740
 741
 742
 743 /*    dst[0] = 2^floor(tmp); */
 744 /*    dst[1] = frac(tmp); */
 745 /*    dst[2] = 2^floor(tmp) * 2^frac(tmp); */
 746 /*    dst[3] = 1.0F; */
 747     return GL_TRUE;
 748 }
 749
 750 static GLboolean emit_LOG( struct compilation *cp, union instruction op )
 751 {
 752     struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 753     struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 754     struct x86_reg st0 = x86_make_reg(file_x87, 0);
 755     struct x86_reg st1 = x86_make_reg(file_x87, 1);
 756     struct x86_reg st2 = x86_make_reg(file_x87, 2);
 757
 758     /* CAUTION: dst may alias arg0!
 759      */
 760     x87_fld(&cp->func, arg0);   /* arg0.x */
 761     x87_fabs(&cp->func);        /* |arg0.x| */
 762     x87_fxtract(&cp->func);     /* mantissa(arg0.x), exponent(arg0.x) */
 763     x87_fst(&cp->func, st2);    /* mantissa, exponent, mantissa */
 764     x87_fld1(&cp->func);        /* 1, mantissa, exponent, mantissa */
 765     x87_fyl2x(&cp->func);       /* log2(mantissa), exponent, mantissa */
 766     x87_fadd(&cp->func, st0, st1);      /* e+l2(m), e, m  */
 767     x87_fstp(&cp->func, x86_make_disp(dst, 8)); /* e, m */
 768
 769     x87_fld1(&cp->func);        /* 1, e, m */
 770     x87_fsub(&cp->func, st1, st0);      /* 1, e-1, m */
 771     x87_fstp(&cp->func, x86_make_disp(dst, 12)); /* e-1,m */
 772     x87_fstp(&cp->func, dst);   /* m */
 773
 774     x87_fadd(&cp->func, st0, st0);      /* 2m */
 775     x87_fstp(&cp->func, x86_make_disp(dst, 4));
 776
 777     return GL_TRUE;
 778 }
 779
 780 static GLboolean emit_FLR( struct compilation *cp, union instruction op )
 781 {
 782    struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 783    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 784    int i;
 785
 786    set_fpu_round_neg_inf( cp );
 787
 788    for (i = 0; i < 4; i++) {
 789       x87_fld(&cp->func, x86_make_disp(arg0, i*4));
 790       x87_fprndint( &cp->func );
 791       x87_fstp(&cp->func, x86_make_disp(dst, i*4));
 792    }
 793
 794
 795    return GL_TRUE;
 796 }
 797
 798 static GLboolean emit_FRC( struct compilation *cp, union instruction op )
 799 {
 800    struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 801    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 802    struct x86_reg st0 = x86_make_reg(file_x87, 0);
 803    struct x86_reg st1 = x86_make_reg(file_x87, 1);
 804    int i;
 805
 806    set_fpu_round_neg_inf( cp );
 807
 808    /* Knowing liveness info or even just writemask would be useful
 809     * here:
 810     */
 811    for (i = 0; i < 4; i++) {
 812       x87_fld(&cp->func, x86_make_disp(arg0, i*4));
 813       x87_fld(&cp->func, st0);  /* a a */
 814       x87_fprndint( &cp->func );   /* flr(a) a */
 815       x87_fsubrp(&cp->func, st1); /* frc(a) */
 816       x87_fstp(&cp->func, x86_make_disp(dst, i*4));
 817    }
 818
 819    return GL_TRUE;
 820 }
 821
 822
 823
 824 static GLboolean emit_LIT( struct compilation *cp, union instruction op )
 825 {
 826 #if 1
 827    struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 828    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 829    struct x86_reg lit = get_arg(cp, FILE_REG, REG_LIT);
 830    struct x86_reg tmp = get_xmm_reg(cp);
 831    struct x86_reg st1 = x86_make_reg(file_x87, 1);
 832    struct x86_reg regEAX = x86_make_reg(file_REG32, reg_AX);
 833    GLubyte *fixup1, *fixup2;
 834
 835
 836    /* Load the interesting parts of arg0:
 837     */
 838    x87_fld(&cp->func, x86_make_disp(arg0, 12)); /* a3 */
 839    x87_fld(&cp->func, x86_make_disp(arg0, 4)); /* a1 a3 */
 840    x87_fld(&cp->func, x86_make_disp(arg0, 0)); /* a0 a1 a3 */
 841
 842    /* Intialize dst:
 843     */
 844    sse_movaps(&cp->func, tmp, lit);
 845    sse_movaps(&cp->func, dst, tmp);
 846
 847    /* Check arg0[0]:
 848     */
 849    x87_fldz(&cp->func);         /* 0 a0 a1 a3 */
 850    x87_fucomp(&cp->func, st1);  /* a0 a1 a3 */
 851    x87_fnstsw(&cp->func, regEAX);
 852    x86_sahf(&cp->func);
 853    fixup1 = x86_jcc_forward(&cp->func, cc_AE);
 854
 855    x87_fstp(&cp->func, x86_make_disp(dst, 4));  /* a1 a3 */
 856
 857    /* Check arg0[1]:
 858     */
 859    x87_fldz(&cp->func);         /* 0 a1 a3 */
 860    x87_fucomp(&cp->func, st1);  /* a1 a3 */
 861    x87_fnstsw(&cp->func, regEAX);
 862    x86_sahf(&cp->func);
 863    fixup2 = x86_jcc_forward(&cp->func, cc_AE);
 864
 865    /* Compute pow(a1, a3)
 866     */
 867    x87_fyl2x(&cp->func);        /* a3*log2(a1) */
 868
 869    emit_x87_ex2( cp );          /* 2^(a3*log2(a1)) */
 870
 871    x87_fstp(&cp->func, x86_make_disp(dst, 8));
 872
 873    /* Land jumps:
 874     */
 875    x86_fixup_fwd_jump(&cp->func, fixup1);
 876    x86_fixup_fwd_jump(&cp->func, fixup2);
 877 #else
 878    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 879    struct x86_reg ones = get_reg_ptr(FILE_REG, REG_LIT);
 880    sse_movups(&cp->func, dst, ones);
 881 #endif
 882    return GL_TRUE;
 883 }
 884
 885
 886
 887 static GLboolean emit_MAX( struct compilation *cp, union instruction op )
 888 {
 889    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 890    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 891    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 892
 893    sse_movups(&cp->func, dst, arg0);
 894    sse_maxps(&cp->func, dst, arg1);
 895    return GL_TRUE;
 896 }
 897
 898
 899 static GLboolean emit_MIN( struct compilation *cp, union instruction op )
 900 {
 901    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 902    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 903    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 904
 905    sse_movups(&cp->func, dst, arg0);
 906    sse_minps(&cp->func, dst, arg1);
 907    return GL_TRUE;
 908 }
 909
 910 static GLboolean emit_MOV( struct compilation *cp, union instruction op )
 911 {
 912    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 913    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 914
 915    sse_movups(&cp->func, dst, arg0);
 916    return GL_TRUE;
 917 }
 918
 919 static GLboolean emit_MUL( struct compilation *cp, union instruction op )
 920 {
 921    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 922    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
 923    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 924
 925    sse_movups(&cp->func, dst, arg0);
 926    sse_mulps(&cp->func, dst, arg1);
 927    return GL_TRUE;
 928 }
 929
 930
 931 static GLboolean emit_POW( struct compilation *cp, union instruction op )
 932 {
 933    struct x86_reg arg0 = get_arg_ptr(cp, op.alu.file0, op.alu.idx0);
 934    struct x86_reg arg1 = get_arg_ptr(cp, op.alu.file1, op.alu.idx1);
 935    struct x86_reg dst = get_dst_ptr(cp, FILE_REG, op.alu.dst);
 936
 937    x87_fld(&cp->func, arg1);    /* a1 */
 938    x87_fld(&cp->func, arg0);    /* a0 a1 */
 939    x87_fyl2x(&cp->func);        /* a1*log2(a0) */
 940
 941    emit_x87_ex2( cp );          /* 2^(a1*log2(a0)) */
 942
 943    x87_fst(&cp->func, x86_make_disp(dst, 0));
 944    x87_fst(&cp->func, x86_make_disp(dst, 4));
 945    x87_fst(&cp->func, x86_make_disp(dst, 8));
 946    x87_fstp(&cp->func, x86_make_disp(dst, 12));
 947
 948    return GL_TRUE;
 949 }
 950
 951 static GLboolean emit_REL( struct compilation *cp, union instruction op )
 952 {
 953 /*    GLuint idx = (op.alu.idx0 + (GLint)cp->File[0][REG_ADDR][0]) & (MAX_NV_VERTEX_PROGRAM_PARAMS-1); */
 954 /*    GLuint idx = 0; */
 955 /*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, idx); */
 956 /*    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst); */
 957
 958 /*    dst[0] = arg0[0]; */
 959 /*    dst[1] = arg0[1]; */
 960 /*    dst[2] = arg0[2]; */
 961 /*    dst[3] = arg0[3]; */
 962
 963    FAIL;
 964 }
 965
 966 static GLboolean emit_RCP( struct compilation *cp, union instruction op )
 967 {
 968    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 969    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 970
 971    if (cp->have_sse2) {
 972       sse2_rcpss(&cp->func, dst, arg0);
 973    }
 974    else {
 975       struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
 976       sse_movss(&cp->func, dst, ones);
 977       sse_divss(&cp->func, dst, arg0);
 978    }
 979
 980    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
 981    return GL_TRUE;
 982 }
 983
 984 static GLboolean emit_RSQ( struct compilation *cp, union instruction op )
 985 {
 986    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
 987    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
 988
 989    /* TODO: Calculate absolute value
 990     */
 991 #if 0
 992    sse_movss(&cp->func, dst, arg0);
 993    sse_mulss(&cp->func, dst, neg);
 994    sse_maxss(&cp->func, dst, arg0);
 995 #endif
 996
 997    sse_rsqrtss(&cp->func, dst, arg0);
 998    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
 999    return GL_TRUE;
1000 }
1001
1002
1003 static GLboolean emit_SGE( struct compilation *cp, union instruction op )
1004 {
1005    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1006    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
1007    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1008    struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
1009
1010    sse_movups(&cp->func, dst, arg0);
1011    sse_cmpps(&cp->func, dst, arg1, cc_NotLessThan);
1012    sse_andps(&cp->func, dst, ones);
1013    return GL_TRUE;
1014 }
1015
1016
1017 static GLboolean emit_SLT( struct compilation *cp, union instruction op )
1018 {
1019    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1020    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
1021    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1022    struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
1023
1024    sse_movups(&cp->func, dst, arg0);
1025    sse_cmpps(&cp->func, dst, arg1, cc_LessThan);
1026    sse_andps(&cp->func, dst, ones);
1027    return GL_TRUE;
1028 }
1029
1030 static GLboolean emit_SUB( struct compilation *cp, union instruction op )
1031 {
1032    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1033    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
1034    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1035
1036    sse_movups(&cp->func, dst, arg0);
1037    sse_subps(&cp->func, dst, arg1);
1038    return GL_TRUE;
1039 }
1040
1041
1042 static GLboolean emit_XPD( struct compilation *cp, union instruction op )
1043 {
1044    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
1045    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
1046    struct x86_reg dst = get_dst_xmm_reg(cp, FILE_REG, op.alu.dst);
1047    struct x86_reg tmp0 = get_xmm_reg(cp);
1048    struct x86_reg tmp1 = get_xmm_reg(cp);
1049
1050    /* Could avoid tmp0, tmp1 if we overwrote arg0, arg1.  Need a way
1051     * to invalidate registers.  This will come with better analysis
1052     * (liveness analysis) of the incoming program.
1053     */
1054    emit_pshufd(cp, dst, arg0, SHUF(Y, Z, X, W));
1055    emit_pshufd(cp, tmp1, arg1, SHUF(Z, X, Y, W));
1056    sse_mulps(&cp->func, dst, tmp1);
1057    emit_pshufd(cp, tmp0, arg0, SHUF(Z, X, Y, W));
1058    emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
1059    sse_mulps(&cp->func, tmp0, tmp1);
1060    sse_subps(&cp->func, dst, tmp0);
1061
1062 /*    dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
1063 /*    dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
1064 /*    dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
1065 /*    dst[3] is undef */
1066
1067    return GL_TRUE;
1068 }
1069
1070 static GLboolean emit_NOP( struct compilation *cp, union instruction op )
1071 {
1072    return GL_TRUE;
1073 }
1074
1075
1076 static GLboolean (* const emit_func[])(struct compilation *, union instruction) =
1077 {
1078    emit_ABS,
1079    emit_ADD,
1080    emit_NOP, /* ARA */
1081    emit_NOP, /* ARL */
1082    emit_NOP, /* ARL_NV */
1083    emit_NOP, /* ARR */
1084    emit_NOP, /* BRA */
1085    emit_NOP, /* CAL */
1086    emit_NOP, /* CMP */
1087    emit_NOP, /* COS */
1088    emit_NOP, /* DDX */
1089    emit_NOP, /* DDY */
1090    emit_DP3,
1091    emit_DP4,
1092    emit_DPH,
1093    emit_DST,
1094    emit_NOP, /* END */
1095    emit_EX2,
1096    emit_EXP,
1097    emit_FLR,
1098    emit_FRC,
1099    emit_NOP, /* KIL */
1100    emit_NOP, /* KIL_NV */
1101    emit_LG2,
1102    emit_LIT,
1103    emit_LOG,
1104    emit_NOP, /* LRP */
1105    emit_NOP, /* MAD */
1106    emit_MAX,
1107    emit_MIN,
1108    emit_MOV,
1109    emit_MUL,
1110    emit_NOP, /* PK2H */
1111    emit_NOP, /* PK2US */
1112    emit_NOP, /* PK4B */
1113    emit_NOP, /* PK4UB */
1114    emit_POW,
1115    emit_NOP, /* POPA */
1116    emit_PRT,
1117    emit_NOP, /* PUSHA */
1118    emit_NOP, /* RCC */
1119    emit_RCP,
1120    emit_NOP, /* RET */
1121    emit_NOP, /* RFL */
1122    emit_RSQ,
1123    emit_NOP, /* SCS */
1124    emit_NOP, /* SEQ */
1125    emit_NOP, /* SFL */
1126    emit_SGE,
1127    emit_NOP, /* SGT */
1128    emit_NOP, /* SIN */
1129    emit_NOP, /* SLE */
1130    emit_SLT,
1131    emit_NOP, /* SNE */
1132    emit_NOP, /* SSG */
1133    emit_NOP, /* STR */
1134    emit_SUB,
1135    emit_RSW, /* SWZ */
1136    emit_NOP, /* TEX */
1137    emit_NOP, /* TXB */
1138    emit_NOP, /* TXD */
1139    emit_NOP, /* TXL */
1140    emit_NOP, /* TXP */
1141    emit_NOP, /* TXP_NV */
1142    emit_NOP, /* UP2H */
1143    emit_NOP, /* UP2US */
1144    emit_NOP, /* UP4B */
1145    emit_NOP, /* UP4UB */
1146    emit_NOP, /* X2D */
1147    emit_XPD,
1148    emit_RSW,
1149    emit_MSK,
1150    emit_REL,
1151 };
1152
1153
1154
1155 static GLboolean build_vertex_program( struct compilation *cp )
1156 {
1157    struct arb_vp_machine *m = NULL;
1158    GLuint j;
1159
1160    struct x86_reg regEBX = x86_make_reg(file_REG32, reg_BX);
1161    struct x86_reg regECX = x86_make_reg(file_REG32, reg_CX);
1162    struct x86_reg regEDX = x86_make_reg(file_REG32, reg_DX);
1163
1164    x86_push(&cp->func, regEBX);
1165
1166    x86_mov(&cp->func, regEDX, x86_fn_arg(&cp->func, 1));
1167    x86_mov(&cp->func, regEBX, x86_make_disp(regEDX, get_offset(m, m->File + FILE_REG)));
1168    x86_mov(&cp->func, regECX, x86_make_disp(regEDX, get_offset(m, m->File + FILE_STATE_PARAM)));
1169
1170    for (j = 0; j < cp->p->nr_instructions; j++) {
1171       union instruction inst = cp->p->instructions[j];
1172       cp->insn_counter = j+1;   /* avoid zero */
1173
1174       if (DISASSEM) {
1175          _mesa_printf("%p: ", cp->func.csr);
1176          _tnl_disassem_vba_insn( inst );
1177       }
1178       cp->func.fn = NULL;
1179
1180       if (!emit_func[inst.alu.opcode]( cp, inst )) {
1181          return GL_FALSE;
1182       }
1183    }
1184
1185    /* TODO: only for outputs:
1186     */
1187    for (j = 0; j < 8; j++) {
1188       if (cp->xmm[j].dirty)
1189          spill(cp, j);
1190    }
1191
1192
1193    /* Exit mmx state?
1194     */
1195    if (cp->func.need_emms)
1196       mmx_emms(&cp->func);
1197
1198    /* Restore FPU control word?
1199     */
1200    if (cp->fpucntl != RESTORE_FPU) {
1201       x87_fnclex(&cp->func);
1202       x87_fldcw(&cp->func, x86_make_disp(regEDX, get_offset(m, &m->fpucntl_restore)));
1203    }
1204
1205    x86_pop(&cp->func, regEBX);
1206    x86_ret(&cp->func);
1207
1208    return GL_TRUE;
1209 }
1210
1211 /**
1212  * Execute the given vertex program.
1213  *
1214  * TODO: Integrate the t_vertex.c code here, to build machine vertices
1215  * directly at this point.
1216  *
1217  * TODO: Eliminate the VB struct entirely and just use
1218  * struct arb_vertex_machine.
1219  */
1220 GLboolean
1221 _tnl_sse_codegen_vertex_program(struct tnl_compiled_program *p)
1222 {
1223    struct compilation cp;
1224
1225    /* sanity checks */
1226    assert(emit_func[OPCODE_ABS] == emit_ABS);
1227    assert(emit_func[OPCODE_MUL] == emit_MUL);
1228    assert(emit_func[OPCODE_XPD] == emit_XPD);
1229
1230    _mesa_memset(&cp, 0, sizeof(cp));
1231    cp.p = p;
1232    cp.have_sse2 = 1;
1233
1234    if (p->compiled_func) {
1235       _mesa_free((void *)p->compiled_func);
1236       p->compiled_func = NULL;
1237    }
1238
1239    x86_init_func(&cp.func);
1240
1241    cp.fpucntl = RESTORE_FPU;
1242
1243
1244    /* Note ctx state is not referenced in building the function, so it
1245     * depends only on the list of instructions:
1246     */
1247    if (!build_vertex_program(&cp)) {
1248       x86_release_func( &cp.func );
1249       return GL_FALSE;
1250    }
1251
1252
1253    p->compiled_func = (void (*)(struct arb_vp_machine *))x86_get_func( &cp.func );
1254    return GL_TRUE;
1255 }
1256
1257
1258
1259 #else
1260
1261 GLboolean
1262 _tnl_sse_codegen_vertex_program(struct tnl_compiled_program *p)
1263 {
1264    /* Dummy version for when USE_SSE_ASM not defined */
1265    return GL_FALSE;
1266 }
1267
1268 #endif