/*
 * Mesa 3-D graphics library
 * Version:  6.3
 *
 * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file t_vb_arbprogram_sse.c
 *
 * Translate the simplified vertex_program representation to x86/SSE/SSE2
 * machine code using Mesa's rtasm runtime assembler.
 *
 * \author Keith Whitwell
 */

#include "glheader.h"
#include "context.h"
#include "imports.h"
#include "macros.h"
#include "mtypes.h"
#include "arbprogparse.h"
#include "program.h"
#include "math/m_matrix.h"
#include "math/m_translate.h"
#include "t_context.h"
#include "t_vb_arbprogram.h"

#if defined(USE_SSE_ASM)

#include "x86/rtasm/x86sse.h"
#include "x86/common_x86_asm.h"


#define X 0
#define Y 1
#define Z 2
#define W 3

/* Register usage:
 *
 * EAX - points to 'm->File[0]'
 * ECX - points to 'm->File[3]'
 * EDX,
 * EBX,
 * ESP,
 * EBP,
 * ESI,
 * EDI
 */

#define DISASSEM 0

#define FAIL                                                            \
do {                                                                    \
   _mesa_printf("x86 translation failed in %s\n", __FUNCTION__);        \
   return GL_FALSE;                                                     \
} while (0)

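/* Per-XMM-register tracking used by the code below: each hardware XMM
 * register may cache the value of one (file, idx) slot from the machine's
 * register files.  'dirty' means the cached copy must be written back
 * before the register is reused (or at the end of the program), and
 * 'last_used' drives least-recently-used eviction in get_xmm_reg().
 */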
struct compilation {
   struct x86_function func;
   struct arb_vp_machine *m;

   GLuint insn_counter;

   struct {
      GLuint file:2;
      GLuint idx:7;
      GLuint dirty:1;
      GLuint last_used:10;
   } xmm[8];

   struct {
      struct x86_reg base;
   } file[4];

   GLboolean have_sse2;
};

static INLINE GLboolean eq( struct x86_reg a,
                            struct x86_reg b )
{
   return (a.file == b.file &&
           a.idx == b.idx &&
           a.mod == b.mod &&
           a.disp == b.disp);
}



static struct x86_reg get_reg_ptr(GLuint file,
                                  GLuint idx )
{
   struct x86_reg reg;

   switch (file) {
   case FILE_REG:
      reg = x86_make_reg(file_REG32, reg_AX);
      assert(idx != REG_UNDEF);
      break;
   case FILE_STATE_PARAM:
      reg = x86_make_reg(file_REG32, reg_CX);
      break;
   default:
      assert(0);
   }

   return x86_make_disp(reg, 16 * idx);
}


static void spill( struct compilation *cp, GLuint idx )
{
   struct x86_reg oldval = get_reg_ptr(cp->xmm[idx].file,
                                       cp->xmm[idx].idx);

   assert(cp->xmm[idx].dirty);
   sse_movups(&cp->func, oldval, x86_make_reg(file_XMM, idx));
   cp->xmm[idx].dirty = 0;
}

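/* Grab an XMM register for a new value, evicting the least recently used
 * cached value and spilling it back to memory first if it is dirty.
 */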
static struct x86_reg get_xmm_reg( struct compilation *cp )
{
   GLuint i;
   GLuint oldest = 0;

   for (i = 0; i < 8; i++)
      if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
         oldest = i;

   /* Need to write out the old value?
    */
   if (cp->xmm[oldest].dirty)
      spill(cp, oldest);

   assert(cp->xmm[oldest].last_used != cp->insn_counter);

   cp->xmm[oldest].file = FILE_REG;
   cp->xmm[oldest].idx = REG_UNDEF;
   cp->xmm[oldest].last_used = cp->insn_counter;
   return x86_make_reg(file_XMM, oldest);
}




static struct x86_reg get_dst_reg( struct compilation *cp,
                                   GLuint file, GLuint idx )
{
   struct x86_reg reg;
   GLuint i;

   /* Invalidate any old copy of this register in XMM0-7.  Don't reuse
    * it, as it may be one of the arguments.
    */
   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].file == file && cp->xmm[i].idx == idx) {
         cp->xmm[i].file = FILE_REG;
         cp->xmm[i].idx = REG_UNDEF;
         cp->xmm[i].dirty = 0;
         break;
      }
   }

   reg = get_xmm_reg( cp );
   cp->xmm[reg.idx].file = file;
   cp->xmm[reg.idx].idx = idx;
   cp->xmm[reg.idx].dirty = 1;
   return reg;
}


/* Return an XMM reg if the argument is resident, otherwise return a
 * base+offset pointer to the saved value.
 */
static struct x86_reg get_arg( struct compilation *cp, GLuint file, GLuint idx )
{
   GLuint i;

   for (i = 0; i < 8; i++) {
      if (cp->xmm[i].file == file &&
          cp->xmm[i].idx == idx) {
         cp->xmm[i].last_used = cp->insn_counter;
         return x86_make_reg(file_XMM, i);
      }
   }

   return get_reg_ptr(file, idx);
}

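/* Emit a component shuffle of arg0 into dst.  With SSE2 this is a single
 * PSHUFD; without it we fall back to MOVUPS (when dst != arg0) followed by
 * SHUFPS on dst itself.  The 'shuf' immediate comes from the SHUF() macro
 * in the tnl arbprogram headers, assumed here to pack one 2-bit source-lane
 * selector per destination component, as in the raw PSHUFD/SHUFPS immediate.
 */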
static void emit_pshufd( struct compilation *cp,
                         struct x86_reg dst,
                         struct x86_reg arg0,
                         GLubyte shuf )
{
   if (cp->have_sse2) {
      sse2_pshufd(&cp->func, dst, arg0, shuf);
      cp->func.fn = 0;
   }
   else {
      if (!eq(dst, arg0))
         sse_movups(&cp->func, dst, arg0);

      sse_shufps(&cp->func, dst, dst, shuf);
   }
}



/* Perform a reduced swizzle.
 */
static GLboolean emit_RSW( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.rsw.file0, op.rsw.idx0);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.rsw.dst);
   GLuint swz = op.rsw.swz;
   GLuint neg = op.rsw.neg;

   emit_pshufd(cp, dst, arg0, swz);

   if (neg) {
      struct x86_reg negs = get_arg(cp, FILE_REG, REG_SWZ);
      struct x86_reg tmp = get_xmm_reg(cp);
      /* Load 1,-1,0,0
       * Use neg as arg to pshufd
       * Multiply
       */
      emit_pshufd(cp, tmp, negs,
                  SHUF((neg & 1) ? 1 : 0,
                       (neg & 2) ? 1 : 0,
                       (neg & 4) ? 1 : 0,
                       (neg & 8) ? 1 : 0));
      sse_mulps(&cp->func, dst, tmp);
   }

   return GL_TRUE;
}

/* Used to implement write masking.  This and most of the other instructions
 * here would be easier to implement if there had been a translation
 * to a 2 argument format (dst/arg0, arg1) at the shader level before
 * attempting to translate to x86/sse code.
 */
/* Hmm.  I went back to MSK from SEL to make things easier -- was that just BS?
 */
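/* emit_MSK merges the argument into the current value of the destination
 * register according to the writemask.  Single-component masks other than
 * X shuffle the selected component into lane 0 of both operands, merge
 * with MOVSS, and shuffle back; the remaining supported masks use
 * SHUFPS/MOVSS/MOVUPS combinations, and anything else FAILs.
 */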
static GLboolean emit_MSK( struct compilation *cp, union instruction op )
{
   struct x86_reg arg = get_arg(cp, op.msk.file, op.msk.idx);
   struct x86_reg dst0 = get_arg(cp, FILE_REG, op.msk.dst);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.msk.dst);

   sse_movups(&cp->func, dst, dst0);

   switch (op.msk.mask) {
   case 0:
      return GL_TRUE;

   case WRITEMASK_X:
      if (arg.file == file_XMM) {
         sse_movss(&cp->func, dst, arg);
      }
      else {
         struct x86_reg tmp = get_xmm_reg(cp);
         sse_movss(&cp->func, tmp, arg);
         sse_movss(&cp->func, dst, tmp);
      }
      return GL_TRUE;

   case WRITEMASK_Y: {
      struct x86_reg tmp = get_xmm_reg(cp);
      emit_pshufd(cp, dst, dst, SHUF(Y, X, Z, W));
      emit_pshufd(cp, tmp, arg, SHUF(Y, X, Z, W));
      sse_movss(&cp->func, dst, tmp);
      emit_pshufd(cp, dst, dst, SHUF(Y, X, Z, W));
      return GL_TRUE;
   }

   case WRITEMASK_Z: {
      struct x86_reg tmp = get_xmm_reg(cp);
      emit_pshufd(cp, dst, dst, SHUF(Z, Y, X, W));
      emit_pshufd(cp, tmp, arg, SHUF(Z, Y, X, W));
      sse_movss(&cp->func, dst, tmp);
      emit_pshufd(cp, dst, dst, SHUF(Z, Y, X, W));
      return GL_TRUE;
   }

   case WRITEMASK_W: {
      struct x86_reg tmp = get_xmm_reg(cp);
      emit_pshufd(cp, dst, dst, SHUF(W, Y, Z, X));
      emit_pshufd(cp, tmp, arg, SHUF(W, Y, Z, X));
      sse_movss(&cp->func, dst, tmp);
      emit_pshufd(cp, dst, dst, SHUF(W, Y, Z, X));
      return GL_TRUE;
   }

   case WRITEMASK_XY:
      sse_shufps(&cp->func, dst, arg, SHUF(X, Y, Z, W));
      return GL_TRUE;

   case WRITEMASK_ZW: {
      struct x86_reg tmp = get_xmm_reg(cp);
      sse_movups(&cp->func, tmp, dst);
      sse_movups(&cp->func, dst, arg);
      sse_shufps(&cp->func, dst, tmp, SHUF(X, Y, Z, W));
      return GL_TRUE;
   }

   case WRITEMASK_YZW: {
      struct x86_reg tmp = get_xmm_reg(cp);
      sse_movss(&cp->func, tmp, dst);
      sse_movups(&cp->func, dst, arg);
      sse_movss(&cp->func, dst, tmp);
      return GL_TRUE;
   }

   case WRITEMASK_XYZW:
      sse_movups(&cp->func, dst, arg);
      return GL_TRUE;

   default:
      FAIL;
   }

#if 0
   /* The catchall implementation:
    */

   /* make full width bitmask in tmp
    * dst = ~tmp
    * tmp &= arg0
    * dst &= arg1
    * dst |= tmp
    */
   {
      struct x86_reg negs = get_arg(cp, FILE_REG, REG_NEGS);
      emit_pshufd(cp, tmp, negs,
                  SHUF((op.msk.mask & 1) ? 2 : 0,
                       (op.msk.mask & 2) ? 2 : 0,
                       (op.msk.mask & 4) ? 2 : 0,
                       (op.msk.mask & 8) ? 2 : 0));
      sse_mulps(&cp->func, dst, tmp);
   }

   return GL_TRUE;
#endif
   FAIL;
}



static GLboolean emit_PRT( struct compilation *cp, union instruction op )
{
   FAIL;
}


/**
 * The traditional instructions.  All operate on internal registers
 * and ignore write masks and swizzling issues.
 */

static GLboolean emit_ABS( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg neg = get_reg_ptr(FILE_REG, REG_NEG);

   sse_movups(&cp->func, dst, arg0);
   sse_mulps(&cp->func, dst, neg);
   sse_maxps(&cp->func, dst, arg0);
   return GL_TRUE;
}

static GLboolean emit_ADD( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   sse_addps(&cp->func, dst, arg1);
   return GL_TRUE;
}


/* The dot product instructions don't map very well to SSE:
 */
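/* DP3/DP4 multiply the arguments componentwise and then do a horizontal
 * add: MOVHLPS folds the upper two products onto the lower two, a shuffle
 * brings the remaining term into lane 0 for ADDSS, and a final broadcast
 * shuffle replicates the scalar result across all four lanes.
 */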
static GLboolean emit_DP3( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg tmp = get_xmm_reg(cp);

   sse_movups(&cp->func, dst, arg0);
   sse_mulps(&cp->func, dst, arg1);

   /* Now the hard bit: sum the first 3 values:
    */
   sse_movhlps(&cp->func, tmp, dst);
   sse_addss(&cp->func, dst, tmp);   /* a*x+c*z, b*y, ?, ? */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(&cp->func, dst, tmp);
   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   return GL_TRUE;
}



static GLboolean emit_DP4( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg tmp = get_xmm_reg(cp);

   sse_movups(&cp->func, dst, arg0);
   sse_mulps(&cp->func, dst, arg1);

   /* Now the hard bit: sum the values:
    */
   sse_movhlps(&cp->func, tmp, dst);
   sse_addps(&cp->func, dst, tmp);   /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
   sse_addss(&cp->func, dst, tmp);
   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   return GL_TRUE;
}

static GLboolean emit_DPH( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); */
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

/*    dst[0] = (arg0[0] * arg1[0] + */
/*              arg0[1] * arg1[1] + */
/*              arg0[2] * arg1[2] + */
/*              1.0     * arg1[3]); */

   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   FAIL;
}

static GLboolean emit_DST( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    dst[0] = 1.0     * 1.0F; */
/*    dst[1] = arg0[1] * arg1[1]; */
/*    dst[2] = arg0[2] * 1.0; */
/*    dst[3] = 1.0     * arg1[3]; */

   FAIL;
}


static GLboolean emit_EX2( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

/*    dst[0] = (GLfloat)RoughApproxPow2(arg0[0]); */
   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   FAIL;
}

static GLboolean emit_EXP( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    GLfloat tmp = arg0[0]; */
/*    GLfloat flr_tmp = FLOORF(tmp); */
/*    dst[0] = (GLfloat) (1 << (int)flr_tmp); */
/*    dst[1] = tmp - flr_tmp; */
/*    dst[2] = RoughApproxPow2(tmp); */
/*    dst[3] = 1.0F; */
   FAIL;
}

static GLboolean emit_FLR( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    dst[0] = FLOORF(arg0[0]); */
/*    dst[1] = FLOORF(arg0[1]); */
/*    dst[2] = FLOORF(arg0[2]); */
/*    dst[3] = FLOORF(arg0[3]); */
   FAIL;
}

static GLboolean emit_FRC( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    dst[0] = arg0[0] - FLOORF(arg0[0]); */
/*    dst[1] = arg0[1] - FLOORF(arg0[1]); */
/*    dst[2] = arg0[2] - FLOORF(arg0[2]); */
/*    dst[3] = arg0[3] - FLOORF(arg0[3]); */
   FAIL;
}

static GLboolean emit_LG2( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    dst[0] = RoughApproxLog2(arg0[0]); */

/*    sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X)); */
   FAIL;
}



static GLboolean emit_LIT( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    const GLfloat epsilon = 1.0F / 256.0F; */
/*    GLfloat tmp[4]; */

/*    tmp[0] = MAX2(arg0[0], 0.0F); */
/*    tmp[1] = MAX2(arg0[1], 0.0F); */
/*    tmp[3] = CLAMP(arg0[3], -(128.0F - epsilon), (128.0F - epsilon)); */

/*    dst[0] = 1.0; */
/*    dst[1] = tmp[0]; */
/*    dst[2] = (tmp[0] > 0.0) ? RoughApproxPower(tmp[1], tmp[3]) : 0.0F; */
/*    dst[3] = 1.0; */
   FAIL;
}


static GLboolean emit_LOG( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    GLfloat tmp = FABSF(arg0[0]); */
/*    int exponent; */
/*    GLfloat mantissa = FREXPF(tmp, &exponent); */
/*    dst[0] = (GLfloat) (exponent - 1); */
/*    dst[1] = 2.0 * mantissa; // map [.5, 1) -> [1, 2) */
/*    dst[2] = dst[0] + LOG2(dst[1]); */
/*    dst[3] = 1.0; */
   FAIL;
}

static GLboolean emit_MAX( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   sse_maxps(&cp->func, dst, arg1);
   return GL_TRUE;
}


static GLboolean emit_MIN( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   sse_minps(&cp->func, dst, arg1);
   return GL_TRUE;
}

static GLboolean emit_MOV( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   return GL_TRUE;
}

static GLboolean emit_MUL( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   sse_mulps(&cp->func, dst, arg1);
   return GL_TRUE;
}


static GLboolean emit_POW( struct compilation *cp, union instruction op )
{
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0); */
/*    struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1); */
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

/*    dst[0] = (GLfloat)RoughApproxPower(arg0[0], arg1[0]); */

   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   FAIL;
}

static GLboolean emit_REL( struct compilation *cp, union instruction op )
{
/*    GLuint idx = (op.alu.idx0 + (GLint)cp->File[0][REG_ADDR][0]) & (MAX_NV_VERTEX_PROGRAM_PARAMS-1); */
/*    GLuint idx = 0; */
/*    struct x86_reg arg0 = get_arg(cp, op.alu.file0, idx); */
/*    struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst); */

/*    dst[0] = arg0[0]; */
/*    dst[1] = arg0[1]; */
/*    dst[2] = arg0[2]; */
/*    dst[3] = arg0[3]; */

   FAIL;
}

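/* RCP: with SSE2 available, use the sse2_rcpss() rtasm helper on lane 0;
 * otherwise compute 1.0/x explicitly with MOVSS + DIVSS from the REG_ONES
 * constant.  Either way the scalar result is then broadcast to all four
 * lanes.
 */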
static GLboolean emit_RCP( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   if (cp->have_sse2) {
      sse2_rcpss(&cp->func, dst, arg0);
   }
   else {
      struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);
      sse_movss(&cp->func, dst, ones);
      sse_divss(&cp->func, dst, arg0);
   }

   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   return GL_TRUE;
}

static GLboolean emit_RSQ( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   sse_rsqrtss(&cp->func, dst, arg0);
   sse_shufps(&cp->func, dst, dst, SHUF(X, X, X, X));
   return GL_TRUE;
}

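/* SGE/SLT use CMPPS to build an all-ones / all-zeros mask per component
 * and then AND it with the REG_ONES constant vector, yielding 1.0 where
 * the comparison holds and 0.0 elsewhere.
 */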
static GLboolean emit_SGE( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);

   sse_movups(&cp->func, dst, arg0);
   sse_cmpps(&cp->func, dst, arg1, cc_NotLessThan);
   sse_andps(&cp->func, dst, ones);
   return GL_TRUE;
}


static GLboolean emit_SLT( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg ones = get_reg_ptr(FILE_REG, REG_ONES);

   sse_movups(&cp->func, dst, arg0);
   sse_cmpps(&cp->func, dst, arg1, cc_LessThan);
   sse_andps(&cp->func, dst, ones);
   return GL_TRUE;
}

static GLboolean emit_SUB( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);

   sse_movups(&cp->func, dst, arg0);
   sse_subps(&cp->func, dst, arg1);
   return GL_TRUE;
}

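/* Cross product: dst = arg0.yzx * arg1.zxy - arg0.zxy * arg1.yzx,
 * computed with two shuffled multiplies; dst[3] is undefined.
 */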
static GLboolean emit_XPD( struct compilation *cp, union instruction op )
{
   struct x86_reg arg0 = get_arg(cp, op.alu.file0, op.alu.idx0);
   struct x86_reg arg1 = get_arg(cp, op.alu.file1, op.alu.idx1);
   struct x86_reg dst = get_dst_reg(cp, FILE_REG, op.alu.dst);
   struct x86_reg tmp0 = get_xmm_reg(cp);
   struct x86_reg tmp1 = get_xmm_reg(cp);

   /* Could avoid tmp0, tmp1 if we overwrote arg0, arg1.  Need a way
    * to invalidate registers.  This will come with better analysis
    * (liveness analysis) of the incoming program.
    */
   emit_pshufd(cp, dst, arg0, SHUF(Y, Z, X, W));
   emit_pshufd(cp, tmp1, arg1, SHUF(Z, X, Y, W));
   sse_mulps(&cp->func, dst, tmp1);
   emit_pshufd(cp, tmp0, arg0, SHUF(Z, X, Y, W));
   emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
   sse_mulps(&cp->func, tmp0, tmp1);
   sse_subps(&cp->func, dst, tmp0);

/*    dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
/*    dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
/*    dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
/*    dst[3] is undef */

   return GL_TRUE;
}

static GLboolean emit_NOP( struct compilation *cp, union instruction op )
{
   return GL_TRUE;
}

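/* Dispatch table indexed by the opcode packed into union instruction; the
 * entry order must line up with the opcode enumeration used elsewhere in
 * the tnl arbprogram code.  Opcodes not implemented here FAIL, so codegen
 * returns GL_FALSE and the caller is expected to fall back to the
 * interpreted path.
 */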
static GLboolean (* const emit_func[])(struct compilation *, union instruction) =
{
   emit_ABS,
   emit_ADD,
   emit_NOP,
   emit_DP3,
   emit_DP4,
   emit_DPH,
   emit_DST,
   emit_NOP,
   emit_EX2,
   emit_EXP,
   emit_FLR,
   emit_FRC,
   emit_LG2,
   emit_LIT,
   emit_LOG,
   emit_NOP,
   emit_MAX,
   emit_MIN,
   emit_MOV,
   emit_MUL,
   emit_POW,
   emit_PRT,
   emit_NOP,
   emit_RCP,
   emit_RSQ,
   emit_SGE,
   emit_SLT,
   emit_SUB,
   emit_RSW,
   emit_XPD,
   emit_RSW,
   emit_MSK,
   emit_REL,
};

static GLint get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}

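/* Emit the whole program: load EAX/ECX with the base addresses of the
 * FILE_REG and FILE_STATE_PARAM register files (see the register-usage
 * note above), translate each instruction in turn, then spill any dirty
 * XMM registers before returning.
 */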
static GLboolean build_vertex_program( struct compilation *cp )
{
   GLuint j;

   struct x86_reg regEAX = x86_make_reg(file_REG32, reg_AX);
   struct x86_reg parmECX = x86_make_reg(file_REG32, reg_CX);

   x86_mov(&cp->func, regEAX, x86_fn_arg(&cp->func, 1));
   x86_mov(&cp->func, parmECX, regEAX);

   x86_mov(&cp->func, regEAX, x86_make_disp(regEAX, get_offset(cp->m, cp->m->File + FILE_REG)));
   x86_mov(&cp->func, parmECX, x86_make_disp(parmECX, get_offset(cp->m, cp->m->File + FILE_STATE_PARAM)));

   for (j = 0; j < cp->m->nr_instructions; j++) {
      union instruction inst = cp->m->instructions[j];
      cp->insn_counter = j + 1;   /* avoid zero */

      if (DISASSEM) {
         _mesa_printf("%p: ", cp->func.csr);
         _tnl_disassem_vba_insn( inst );
      }
      cp->func.fn = NULL;

      if (!emit_func[inst.alu.opcode]( cp, inst )) {
         return GL_FALSE;
      }
   }

   /* TODO: only for outputs:
    */
   for (j = 0; j < 8; j++) {
      if (cp->xmm[j].dirty)
         spill(cp, j);
   }


   /* Exit mmx state?
    */
   if (cp->func.need_emms)
      mmx_emms(&cp->func);

   x86_ret(&cp->func);

   return GL_TRUE;
}

/**
 * Generate native SSE code for the given vertex program.
 *
 * TODO: Integrate the t_vertex.c code here, to build machine vertices
 * directly at this point.
 *
 * TODO: Eliminate the VB struct entirely and just use
 * struct arb_vertex_machine.
 */
GLboolean
_tnl_sse_codegen_vertex_program(struct arb_vp_machine *m)
{
   struct compilation cp;

   memset(&cp, 0, sizeof(cp));
   cp.m = m;
   cp.have_sse2 = 1;

   if (m->func) {
      free((void *)m->func);
      m->func = NULL;
   }

   x86_init_func(&cp.func);

   if (!build_vertex_program(&cp)) {
      x86_release_func( &cp.func );
      return GL_FALSE;
   }

   m->func = (void (*)(struct arb_vp_machine *))x86_get_func( &cp.func );
   return GL_TRUE;
}



#else

GLboolean
_tnl_sse_codegen_vertex_program( struct arb_vp_machine *m )
{
   /* Dummy version for when USE_SSE_ASM is not defined */
   return GL_FALSE;
}

#endif