src/mesa/tnl/t_vb_arbprogram.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.3
   4  *
   5  * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
 * \file t_vb_arbprogram.c
  27  * Compile vertex programs to an intermediate representation.
  28  * Execute vertex programs over a buffer of vertices.
  29  * \author Keith Whitwell, Brian Paul
  30  */
  31
  32 #include "glheader.h"
  33 #include "context.h"
  34 #include "imports.h"
  35 #include "macros.h"
  36 #include "mtypes.h"
  37 #include "arbprogparse.h"
  38 #include "program.h"
  39 #include "math/m_matrix.h"
  40 #include "math/m_translate.h"
  41 #include "t_context.h"
  42 #include "t_pipeline.h"
  43 #include "t_vp_build.h"
  44
  45 /* Define to see the compiled program on stderr:
  46  */
  47 #define DISASSEM 0
  48
  49
/* New, internal instructions.  Their opcode numbers continue directly
 * after VP_OPCODE_XPD, one per micro-instruction, and must stay in the
 * same order as the trailing entries of the opcode_info[] table.
 */
#define IN1        (VP_OPCODE_XPD+1)
#define IN2        (IN1+1)      /* input-to-reg MOV */
#define IN3        (IN1+2)
#define IN4        (IN1+3)
#define OUT        (IN1+4)      /* reg-to-output MOV */
#define OUM        (IN1+5)      /* reg-to-output MOV with mask */
#define RSW        (IN1+6)      /* reduced swizzle (see do_RSW) */
#define MSK        (IN1+7)      /* reg-to-reg MOV with mask */
#define PAR        (IN1+8)      /* parameter-to-reg MOV */
#define PRL        (IN1+9)      /* parameter-to-reg MOV, offset by AddressReg */
  62
  63
/* Layout of register file:

  0 -- Scratch (Arg0)
  1 -- Scratch (Arg1)
  2 -- Scratch (Arg2)
  3 -- Scratch (Result)
  4 -- Program Temporary 0
  ..
  31 -- Program Temporary 27
  32 -- State/Input/Const shadow 0
  ..
  63 -- State/Input/Const shadow 31

*/



/* Scratch registers used for instruction arguments and the result. */
#define REG_ARG0  0
#define REG_ARG1  1
#define REG_ARG2  2
#define REG_RES   3
/* Program temporaries occupy [REG_TMP0, REG_TMP_MAX). */
#define REG_TMP0  4
#define REG_TMP_MAX 32
#define REG_TMP_NR (REG_TMP_MAX-REG_TMP0)
/* Shadow copies of state/input/constant values occupy [REG_PAR0, REG_PAR_MAX). */
#define REG_PAR0  32
#define REG_PAR_MAX 64
#define REG_PAR_NR (REG_PAR_MAX-REG_PAR0)

/* Total number of registers in the file. */
#define REG_MAX 64
/* The SWZ encoding has a 4-bit destination field, hence 16 addressable
 * destinations.
 */
#define REG_SWZDST_MAX 16
  94
/* ARB_vp instructions are broken down into one or more of the
 * following micro-instructions, each representable in a 32 bit packed
 * structure.
 *
 * Every variant starts with the same 6-bit opcode field and its
 * bitfields add up to 32 bits, so 'dword' can be used to clear or
 * compare a whole instruction at once.
 */


union instruction {
   /* Scalar operations: result is replicated to all components. */
   struct {
      GLuint opcode:6;
      GLuint dst:5;
      GLuint arg0:6;
      GLuint arg1:6;
      GLuint elt:2;             /* x,y,z or w */
      GLuint pad:7;
   } scl;


   /* Vector operations with up to three source registers. */
   struct {
      GLuint opcode:6;
      GLuint dst:5;
      GLuint arg0:6;
      GLuint arg1:6;
      GLuint arg2:6;
      GLuint pad:3;
   } vec;

   /* Full swizzle: 3 bits of select and 1 negate bit per component. */
   struct {
      GLuint opcode:6;
      GLuint dst:4;             /* NOTE!  REG 0..15 only (4 bits)! */
      GLuint arg0:6;
      GLuint neg:4;
      GLuint swz:12;
   } swz;

   /* Reduced swizzle: 2 bits of select per component, one shared negate. */
   struct {
      GLuint opcode:6;
      GLuint dst:6;
      GLuint arg0:6;
      GLuint neg:1;             /* 1 bit only */
      GLuint swz:8;             /* xyzw only */
      GLuint pad:5;
   } rsw;

   /* Loads from an external file (inputs, parameters) into a register. */
   struct {
      GLuint opcode:6;
      GLuint reg:6;
      GLuint file:5;
      GLuint idx:8;             /* plenty? */
      GLuint rel:1;
      GLuint pad:6;
   } inr;


   /* Stores from a register to an output, optionally write-masked. */
   struct {
      GLuint opcode:6;
      GLuint reg:6;
      GLuint file:5;
      GLuint idx:8;             /* plenty? */
      GLuint mask:4;
      GLuint pad:3;
   } out;

   /* Register-to-register move with a 4-bit write mask. */
   struct {
      GLuint opcode:6;
      GLuint dst:5;
      GLuint arg0:6;
      GLuint mask:4;
      GLuint pad:11;
   } msk;

   /* The whole instruction as one 32-bit word. */
   GLuint dword;
};
 167
 168
 169
/* State carried while translating a vertex program into
 * micro-instructions.
 */
struct compilation {
   /* For each shadow register slot: the (file, idx) pair last loaded
    * into it by cvp_load_reg().
    */
   struct {
      GLuint file:5;
      GLuint idx:8;
   } reg[REG_PAR_NR];

   GLuint par_active;           /* bitmask of shadow slots holding a value */
   GLuint par_protected;        /* bitmask of slots that must not be evicted
                                 * until cvp_release_regs() is called */
   GLuint tmp_active;

   union instruction *csr;      /* next instruction slot to be written */

   struct vertex_buffer *VB;    /* for input sizes! */
};
 184
 185 /*--------------------------------------------------------------------------- */
 186
/*!
 * Private storage for the vertex program pipeline stage.
 *
 * Holds the register file, the compiled micro-instruction stream and
 * the per-run output vectors.
 */
struct arb_vp_machine {
   GLfloat reg[REG_MAX][4];     /* Program temporaries, shadowed parameters and inputs,
                                   plus some internal values */

   GLfloat (*File[8])[4];       /* Src/Dest for PAR/PRL instructions. */
   GLint AddressReg;            /* written by ARL, added to the index by PRL */

   union instruction store[1024];
   union instruction *instructions;
   GLint nr_instructions;

   GLvector4f attribs[VERT_RESULT_MAX]; /**< result vectors. */
   GLvector4f ndcCoords;              /**< normalized device coords */
   GLubyte *clipmask;                 /**< clip flags */
   GLubyte ormask, andmask;           /**< for clipping */

   GLuint vtx_nr;               /**< loop counter */

   struct vertex_buffer *VB;
   GLcontext *ctx;
};
 211
 212
 213 /*--------------------------------------------------------------------------- */
 214
/* Per-opcode description: how the instruction is encoded, how to execute
 * it and how to print it.
 */
struct opcode_info {
   GLuint type;                 /* encoding class of the instruction */
   GLuint nr_args;              /* number of source arguments */
   const char *string;          /* mnemonic used when printing */
   void (*func)( struct arb_vp_machine *, union instruction );      /* executor */
   void (*print)( union instruction , const struct opcode_info * ); /* disassembler */
};
 222
 223
 224 #define ARB_VP_MACHINE(stage) ((struct arb_vp_machine *)(stage->privatePtr))
 225
 226
 227
/**
 * Set x to positive or negative infinity.
 *
 * The IEEE/Win32 variant writes the bit pattern through a GLuint pointer,
 * the VMS variant substitutes +/-__MAXFLOAT, and the generic variant uses
 * +/-HUGE_VAL.  Note that IS_INF_OR_NAN is only defined by the VMS branch.
 *
 * The expansions use their argument unparenthesized, so pass a simple
 * lvalue only.
 *
 * XXX: FIXME - type punning.
 */
#if defined(USE_IEEE) || defined(_WIN32)
#define SET_POS_INFINITY(x)  ( *((GLuint *) (void *)&x) = 0x7F800000 )
#define SET_NEG_INFINITY(x)  ( *((GLuint *) (void *)&x) = 0xFF800000 )
#elif defined(VMS)
#define SET_POS_INFINITY(x)  x = __MAXFLOAT
#define SET_NEG_INFINITY(x)  x = -__MAXFLOAT
#define IS_INF_OR_NAN(t)   ((t) == __MAXFLOAT)
#else
#define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
#define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
#endif
 244
/* Thin wrapper around frexpf(). */
#define FREXPF(a,b) frexpf(a,b)

/* Replicate component 0 of a 4-vector into components 1, 2 and 3. */
#define PUFF(x) ((x)[1] = (x)[2] = (x)[3] = (x)[0])

/* Store an integer bit pattern into a float through fi_type.
 * FIXME: more type punning (despite use of fi_type...)
 */
#define SET_FLOAT_BITS(x, bits) ((fi_type *) (void *) &(x))->i = bits
 252
 253
 254 static GLfloat RoughApproxLog2(GLfloat t)
 255 {
 256    return LOG2(t);
 257 }
 258
 259 static GLfloat RoughApproxPow2(GLfloat t)
 260 {
 261    GLfloat q;
 262 #ifdef USE_IEEE
 263    GLint ii = (GLint) t;
 264    ii = (ii < 23) + 0x3f800000;
 265    SET_FLOAT_BITS(q, ii);
 266    q = *((GLfloat *) (void *)&ii);
 267 #else
 268    q = (GLfloat) pow(2.0, floor_t0);
 269 #endif
 270    return q;
 271 }
 272
 273 static GLfloat RoughApproxPower(GLfloat x, GLfloat y)
 274 {
 275 #if 0
 276    return (GLfloat) exp(y * log(x));
 277 #else
 278    return (GLfloat) _mesa_pow(x, y);
 279 #endif
 280 }
 281
 282
/* Constant all-zero 4-vector. */
static const GLfloat ZeroVec[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
 284
 285
 286
 287
 288 /**
 289  * This is probably the least-optimal part of the process, have to
 290  * multiply out the stride to access each incoming input value.
 291  */
 292 static GLfloat *get_input( struct arb_vp_machine *m, GLuint index )
 293 {
 294    return VEC_ELT(m->VB->AttribPtr[index], GLfloat, m->vtx_nr);
 295 }
 296
 297
 298 /**
 299  * Fetch a 4-element float vector from the given source register.
 300  * Deal with the possibility that not all elements are present.
 301  */
 302 static void do_IN1( struct arb_vp_machine *m, union instruction op )
 303 {
 304    GLfloat *result = m->reg[op.inr.reg];
 305    const GLfloat *src = get_input(m, op.inr.idx);
 306
 307    result[0] = src[0];
 308    result[1] = 0;
 309    result[2] = 0;
 310    result[3] = 1;
 311 }
 312
 313 static void do_IN2( struct arb_vp_machine *m, union instruction op )
 314 {
 315    GLfloat *result = m->reg[op.inr.reg];
 316    const GLfloat *src = get_input(m, op.inr.idx);
 317
 318    result[0] = src[0];
 319    result[1] = src[1];
 320    result[2] = 0;
 321    result[3] = 1;
 322 }
 323
 324 static void do_IN3( struct arb_vp_machine *m, union instruction op )
 325 {
 326    GLfloat *result = m->reg[op.inr.reg];
 327    const GLfloat *src = get_input(m, op.inr.idx);
 328
 329    result[0] = src[0];
 330    result[1] = src[1];
 331    result[2] = src[2];
 332    result[3] = 1;
 333 }
 334
 335 static void do_IN4( struct arb_vp_machine *m, union instruction op )
 336 {
 337    GLfloat *result = m->reg[op.inr.reg];
 338    const GLfloat *src = get_input(m, op.inr.idx);
 339
 340    result[0] = src[0];
 341    result[1] = src[1];
 342    result[2] = src[2];
 343    result[3] = src[3];
 344 }
 345
 346 /**
 347  * Perform a reduced swizzle:
 348  */
 349 static void do_RSW( struct arb_vp_machine *m, union instruction op )
 350 {
 351    GLfloat *result = m->reg[op.rsw.dst];
 352    const GLfloat *arg0 = m->reg[op.rsw.arg0];
 353    GLuint swz = op.rsw.swz;
 354    GLuint neg = op.rsw.neg;
 355    GLuint i;
 356
 357    if (neg)
 358       for (i = 0; i < 4; i++, swz >>= 2)
 359          result[i] = -arg0[swz & 0x3];
 360    else
 361       for (i = 0; i < 4; i++, swz >>= 2)
 362          result[i] = arg0[swz & 0x3];
 363 }
 364
 365
 366
 367 /**
 368  * Store 4 floats into an external address.
 369  */
 370 static void do_OUM( struct arb_vp_machine *m, union instruction op )
 371 {
 372    GLfloat *dst = m->attribs[op.out.idx].data[m->vtx_nr];
 373    const GLfloat *value = m->reg[op.out.reg];
 374
 375    if (op.out.mask & 0x1) dst[0] = value[0];
 376    if (op.out.mask & 0x2) dst[1] = value[1];
 377    if (op.out.mask & 0x4) dst[2] = value[2];
 378    if (op.out.mask & 0x8) dst[3] = value[3];
 379 }
 380
 381 static void do_OUT( struct arb_vp_machine *m, union instruction op )
 382 {
 383    GLfloat *dst = m->attribs[op.out.idx].data[m->vtx_nr];
 384    const GLfloat *value = m->reg[op.out.reg];
 385
 386    dst[0] = value[0];
 387    dst[1] = value[1];
 388    dst[2] = value[2];
 389    dst[3] = value[3];
 390 }
 391
 392 /* Register-to-register MOV with writemask.
 393  */
 394 static void do_MSK( struct arb_vp_machine *m, union instruction op )
 395 {
 396    GLfloat *dst = m->reg[op.msk.dst];
 397    const GLfloat *arg0 = m->reg[op.msk.arg0];
 398
 399    if (op.msk.mask & 0x1) dst[0] = arg0[0];
 400    if (op.msk.mask & 0x2) dst[1] = arg0[1];
 401    if (op.msk.mask & 0x4) dst[2] = arg0[2];
 402    if (op.msk.mask & 0x8) dst[3] = arg0[3];
 403 }
 404
 405
 406 /* Retreive parameters and other constant values:
 407  */
 408 static void do_PAR( struct arb_vp_machine *m, union instruction op )
 409 {
 410    GLfloat *result = m->reg[op.inr.reg];
 411    const GLfloat *src = m->File[op.inr.file][op.inr.idx];
 412
 413    result[0] = src[0];
 414    result[1] = src[1];
 415    result[2] = src[2];
 416    result[3] = src[3];
 417 }
 418
 419
 420 #define RELADDR_MASK (MAX_NV_VERTEX_PROGRAM_PARAMS-1)
 421
 422 static void do_PRL( struct arb_vp_machine *m, union instruction op )
 423 {
 424    GLfloat *result = m->reg[op.inr.reg];
 425    GLuint index = (op.inr.idx + m->AddressReg) & RELADDR_MASK;
 426    const GLfloat *src = m->File[op.inr.file][index];
 427
 428    result[0] = src[0];
 429    result[1] = src[1];
 430    result[2] = src[2];
 431    result[3] = src[3];
 432 }
 433
 434 static void do_PRT( struct arb_vp_machine *m, union instruction op )
 435 {
 436    const GLfloat *arg0 = m->reg[op.vec.arg0];
 437
 438    _mesa_printf("%d: %f %f %f %f\n", m->vtx_nr,
 439                 arg0[0], arg0[1], arg0[2], arg0[3]);
 440 }
 441
 442
 443 /**
 444  * The traditional ALU and texturing instructions.  All operate on
 445  * internal registers and ignore write masks and swizzling issues.
 446  */
 447
 448 static void do_ABS( struct arb_vp_machine *m, union instruction op )
 449 {
 450    GLfloat *result = m->reg[op.vec.dst];
 451    const GLfloat *arg0 = m->reg[op.vec.arg0];
 452
 453    result[0] = (arg0[0] < 0.0) ? -arg0[0] : arg0[0];
 454    result[1] = (arg0[1] < 0.0) ? -arg0[1] : arg0[1];
 455    result[2] = (arg0[2] < 0.0) ? -arg0[2] : arg0[2];
 456    result[3] = (arg0[3] < 0.0) ? -arg0[3] : arg0[3];
 457 }
 458
 459 static void do_ADD( struct arb_vp_machine *m, union instruction op )
 460 {
 461    GLfloat *result = m->reg[op.vec.dst];
 462    const GLfloat *arg0 = m->reg[op.vec.arg0];
 463    const GLfloat *arg1 = m->reg[op.vec.arg1];
 464
 465    result[0] = arg0[0] + arg1[0];
 466    result[1] = arg0[1] + arg1[1];
 467    result[2] = arg0[2] + arg1[2];
 468    result[3] = arg0[3] + arg1[3];
 469 }
 470
 471
 472 static void do_ARL( struct arb_vp_machine *m, union instruction op )
 473 {
 474    const GLfloat *arg0 = m->reg[op.out.reg];
 475    m->AddressReg = (GLint) floor(arg0[0]);
 476 }
 477
 478
 479 static void do_DP3( struct arb_vp_machine *m, union instruction op )
 480 {
 481    GLfloat *result = m->reg[op.scl.dst];
 482    const GLfloat *arg0 = m->reg[op.scl.arg0];
 483    const GLfloat *arg1 = m->reg[op.scl.arg1];
 484
 485    result[0] = (arg0[0] * arg1[0] +
 486                 arg0[1] * arg1[1] +
 487                 arg0[2] * arg1[2]);
 488
 489    PUFF(result);
 490 }
 491
/* Disabled sketch of a 4x4 matrix-times-vector instruction.  It is
 * compiled out; as written it would not build, since mat0..mat3 are
 * never declared and 'mat' is initialised from a single register.
 */
#if 0
static void do_MAT4( struct arb_vp_machine *m, union instruction op )
{
   GLfloat *result = m->reg[op.scl.dst];
   const GLfloat *arg0 = m->reg[op.scl.arg0];
   const GLfloat *mat[] = m->reg[op.scl.arg1];

   result[0] = (arg0[0] * mat0[0] + arg0[1] * mat0[1] + arg0[2] * mat0[2] + arg0[3] * mat0[3]);
   result[1] = (arg0[0] * mat1[0] + arg0[1] * mat1[1] + arg0[2] * mat1[2] + arg0[3] * mat1[3]);
   result[2] = (arg0[0] * mat2[0] + arg0[1] * mat2[1] + arg0[2] * mat2[2] + arg0[3] * mat2[3]);
   result[3] = (arg0[0] * mat3[0] + arg0[1] * mat3[1] + arg0[2] * mat3[2] + arg0[3] * mat3[3]);
}
#endif
 505
 506
 507 static void do_DP4( struct arb_vp_machine *m, union instruction op )
 508 {
 509    GLfloat *result = m->reg[op.scl.dst];
 510    const GLfloat *arg0 = m->reg[op.scl.arg0];
 511    const GLfloat *arg1 = m->reg[op.scl.arg1];
 512
 513    result[0] = (arg0[0] * arg1[0] +
 514                 arg0[1] * arg1[1] +
 515                 arg0[2] * arg1[2] +
 516                 arg0[3] * arg1[3]);
 517
 518    PUFF(result);
 519 }
 520
 521 static void do_DPH( struct arb_vp_machine *m, union instruction op )
 522 {
 523    GLfloat *result = m->reg[op.scl.dst];
 524    const GLfloat *arg0 = m->reg[op.scl.arg0];
 525    const GLfloat *arg1 = m->reg[op.scl.arg1];
 526
 527    result[0] = (arg0[0] * arg1[0] +
 528                 arg0[1] * arg1[1] +
 529                 arg0[2] * arg1[2] +
 530                 1.0     * arg1[3]);
 531
 532    PUFF(result);
 533 }
 534
 535 static void do_DST( struct arb_vp_machine *m, union instruction op )
 536 {
 537    GLfloat *result = m->reg[op.vec.dst];
 538    const GLfloat *arg0 = m->reg[op.vec.arg0];
 539    const GLfloat *arg1 = m->reg[op.vec.arg1];
 540
 541    result[0] = 1.0F;
 542    result[1] = arg0[1] * arg1[1];
 543    result[2] = arg0[2];
 544    result[3] = arg1[3];
 545 }
 546
 547
 548 static void do_EX2( struct arb_vp_machine *m, union instruction op )
 549 {
 550    GLfloat *result = m->reg[op.scl.dst];
 551    const GLfloat *arg0 = m->reg[op.scl.arg0];
 552
 553    result[0] = (GLfloat)RoughApproxPow2(arg0[0]);
 554    PUFF(result);
 555 }
 556
 557 static void do_EXP( struct arb_vp_machine *m, union instruction op )
 558 {
 559    GLfloat *result = m->reg[op.vec.dst];
 560    const GLfloat *arg0 = m->reg[op.vec.arg0];
 561    GLfloat tmp = arg0[0];
 562    GLfloat flr_tmp = FLOORF(tmp);
 563
 564    /* KW: nvvertexec has an optimized version of this which is pretty
 565     * hard to understand/validate, but avoids the RoughApproxPow2.
 566     */
 567    result[0] = (GLfloat) (1 << (int)flr_tmp);
 568    result[1] = tmp - flr_tmp;
 569    result[2] = RoughApproxPow2(tmp);
 570    result[3] = 1.0F;
 571 }
 572
 573 static void do_FLR( struct arb_vp_machine *m, union instruction op )
 574 {
 575    GLfloat *result = m->reg[op.vec.dst];
 576    const GLfloat *arg0 = m->reg[op.vec.arg0];
 577
 578    result[0] = FLOORF(arg0[0]);
 579    result[1] = FLOORF(arg0[1]);
 580    result[2] = FLOORF(arg0[2]);
 581    result[3] = FLOORF(arg0[3]);
 582 }
 583
 584 static void do_FRC( struct arb_vp_machine *m, union instruction op )
 585 {
 586    GLfloat *result = m->reg[op.vec.dst];
 587    const GLfloat *arg0 = m->reg[op.vec.arg0];
 588
 589    result[0] = arg0[0] - FLOORF(arg0[0]);
 590    result[1] = arg0[1] - FLOORF(arg0[1]);
 591    result[2] = arg0[2] - FLOORF(arg0[2]);
 592    result[3] = arg0[3] - FLOORF(arg0[3]);
 593 }
 594
 595 static void do_LG2( struct arb_vp_machine *m, union instruction op )
 596 {
 597    GLfloat *result = m->reg[op.scl.dst];
 598    const GLfloat *arg0 = m->reg[op.scl.arg0];
 599
 600    result[0] = RoughApproxLog2(arg0[0]);
 601    PUFF(result);
 602 }
 603
 604
 605
 606 static void do_LIT( struct arb_vp_machine *m, union instruction op )
 607 {
 608    GLfloat *result = m->reg[op.vec.dst];
 609    const GLfloat *arg0 = m->reg[op.vec.arg0];
 610
 611    const GLfloat epsilon = 1.0F / 256.0F; /* per NV spec */
 612    GLfloat tmp[4];
 613
 614    tmp[0] = MAX2(arg0[0], 0.0F);
 615    tmp[1] = MAX2(arg0[1], 0.0F);
 616    tmp[3] = CLAMP(arg0[3], -(128.0F - epsilon), (128.0F - epsilon));
 617
 618    result[0] = 1.0;
 619    result[1] = tmp[0];
 620    result[2] = (tmp[0] > 0.0) ? RoughApproxPower(tmp[1], tmp[3]) : 0.0F;
 621    result[3] = 1.0;
 622 }
 623
 624
 625 static void do_LOG( struct arb_vp_machine *m, union instruction op )
 626 {
 627    GLfloat *result = m->reg[op.vec.dst];
 628    const GLfloat *arg0 = m->reg[op.vec.arg0];
 629    GLfloat tmp = FABSF(arg0[0]);
 630    int exponent;
 631    GLfloat mantissa = FREXPF(tmp, &exponent);
 632
 633    result[0] = (GLfloat) (exponent - 1);
 634    result[1] = 2.0 * mantissa; /* map [.5, 1) -> [1, 2) */
 635    result[2] = result[0] + LOG2(result[1]);
 636    result[3] = 1.0;
 637 }
 638
 639
 640 static void do_MAD( struct arb_vp_machine *m, union instruction op )
 641 {
 642    GLfloat *result = m->reg[op.vec.dst];
 643    const GLfloat *arg0 = m->reg[op.vec.arg0];
 644    const GLfloat *arg1 = m->reg[op.vec.arg1];
 645    const GLfloat *arg2 = m->reg[op.vec.arg2];
 646
 647    result[0] = arg0[0] * arg1[0] + arg2[0];
 648    result[1] = arg0[1] * arg1[1] + arg2[1];
 649    result[2] = arg0[2] * arg1[2] + arg2[2];
 650    result[3] = arg0[3] * arg1[3] + arg2[3];
 651 }
 652
 653 static void do_MAX( struct arb_vp_machine *m, union instruction op )
 654 {
 655    GLfloat *result = m->reg[op.vec.dst];
 656    const GLfloat *arg0 = m->reg[op.vec.arg0];
 657    const GLfloat *arg1 = m->reg[op.vec.arg1];
 658
 659    result[0] = (arg0[0] > arg1[0]) ? arg0[0] : arg1[0];
 660    result[1] = (arg0[1] > arg1[1]) ? arg0[1] : arg1[1];
 661    result[2] = (arg0[2] > arg1[2]) ? arg0[2] : arg1[2];
 662    result[3] = (arg0[3] > arg1[3]) ? arg0[3] : arg1[3];
 663 }
 664
 665
 666 static void do_MIN( struct arb_vp_machine *m, union instruction op )
 667 {
 668    GLfloat *result = m->reg[op.vec.dst];
 669    const GLfloat *arg0 = m->reg[op.vec.arg0];
 670    const GLfloat *arg1 = m->reg[op.vec.arg1];
 671
 672    result[0] = (arg0[0] < arg1[0]) ? arg0[0] : arg1[0];
 673    result[1] = (arg0[1] < arg1[1]) ? arg0[1] : arg1[1];
 674    result[2] = (arg0[2] < arg1[2]) ? arg0[2] : arg1[2];
 675    result[3] = (arg0[3] < arg1[3]) ? arg0[3] : arg1[3];
 676 }
 677
 678 static void do_MOV( struct arb_vp_machine *m, union instruction op )
 679 {
 680    GLfloat *result = m->reg[op.vec.dst];
 681    const GLfloat *arg0 = m->reg[op.vec.arg0];
 682
 683    result[0] = arg0[0];
 684    result[1] = arg0[1];
 685    result[2] = arg0[2];
 686    result[3] = arg0[3];
 687 }
 688
 689 static void do_MUL( struct arb_vp_machine *m, union instruction op )
 690 {
 691    GLfloat *result = m->reg[op.vec.dst];
 692    const GLfloat *arg0 = m->reg[op.vec.arg0];
 693    const GLfloat *arg1 = m->reg[op.vec.arg1];
 694
 695    result[0] = arg0[0] * arg1[0];
 696    result[1] = arg0[1] * arg1[1];
 697    result[2] = arg0[2] * arg1[2];
 698    result[3] = arg0[3] * arg1[3];
 699 }
 700
 701
 702 static void do_POW( struct arb_vp_machine *m, union instruction op )
 703 {
 704    GLfloat *result = m->reg[op.scl.dst];
 705    const GLfloat *arg0 = m->reg[op.scl.arg0];
 706    const GLfloat *arg1 = m->reg[op.scl.arg1];
 707
 708    result[0] = (GLfloat)RoughApproxPower(arg0[0], arg1[0]);
 709    PUFF(result);
 710 }
 711
 712 static void do_RCP( struct arb_vp_machine *m, union instruction op )
 713 {
 714    GLfloat *result = m->reg[op.scl.dst];
 715    const GLfloat *arg0 = m->reg[op.scl.arg0];
 716
 717    result[0] = 1.0F / arg0[0];
 718    PUFF(result);
 719 }
 720
 721 static void do_RSQ( struct arb_vp_machine *m, union instruction op )
 722 {
 723    GLfloat *result = m->reg[op.scl.dst];
 724    const GLfloat *arg0 = m->reg[op.scl.arg0];
 725
 726    result[0] = INV_SQRTF(FABSF(arg0[0]));
 727    PUFF(result);
 728 }
 729
 730
 731 static void do_SGE( struct arb_vp_machine *m, union instruction op )
 732 {
 733    GLfloat *result = m->reg[op.vec.dst];
 734    const GLfloat *arg0 = m->reg[op.vec.arg0];
 735    const GLfloat *arg1 = m->reg[op.vec.arg1];
 736
 737    result[0] = (arg0[0] >= arg1[0]) ? 1.0F : 0.0F;
 738    result[1] = (arg0[1] >= arg1[1]) ? 1.0F : 0.0F;
 739    result[2] = (arg0[2] >= arg1[2]) ? 1.0F : 0.0F;
 740    result[3] = (arg0[3] >= arg1[3]) ? 1.0F : 0.0F;
 741 }
 742
 743
 744 static void do_SLT( struct arb_vp_machine *m, union instruction op )
 745 {
 746    GLfloat *result = m->reg[op.vec.dst];
 747    const GLfloat *arg0 = m->reg[op.vec.arg0];
 748    const GLfloat *arg1 = m->reg[op.vec.arg1];
 749
 750    result[0] = (arg0[0] < arg1[0]) ? 1.0F : 0.0F;
 751    result[1] = (arg0[1] < arg1[1]) ? 1.0F : 0.0F;
 752    result[2] = (arg0[2] < arg1[2]) ? 1.0F : 0.0F;
 753    result[3] = (arg0[3] < arg1[3]) ? 1.0F : 0.0F;
 754 }
 755
 756 static void do_SWZ( struct arb_vp_machine *m, union instruction op )
 757 {
 758    GLfloat *result = m->reg[op.swz.dst];
 759    const GLfloat *arg0 = m->reg[op.swz.arg0];
 760    GLuint swz = op.swz.swz;
 761    GLuint neg = op.swz.neg;
 762    GLuint i;
 763
 764    for (i = 0; i < 4; i++, swz >>= 3, neg >>= 1) {
 765       switch (swz & 0x7) {
 766       case SWIZZLE_ZERO: result[i] = 0.0; break;
 767       case SWIZZLE_ONE:  result[i] = 1.0; break;
 768       default:           result[i] = arg0[swz & 0x7]; break;
 769       }
 770       if (neg & 0x1)     result[i] = -result[i];
 771    }
 772 }
 773
 774 static void do_SUB( struct arb_vp_machine *m, union instruction op )
 775 {
 776    GLfloat *result = m->reg[op.vec.dst];
 777    const GLfloat *arg0 = m->reg[op.vec.arg0];
 778    const GLfloat *arg1 = m->reg[op.vec.arg1];
 779
 780    result[0] = arg0[0] - arg1[0];
 781    result[1] = arg0[1] - arg1[1];
 782    result[2] = arg0[2] - arg1[2];
 783    result[3] = arg0[3] - arg1[3];
 784 }
 785
 786
 787 static void do_XPD( struct arb_vp_machine *m, union instruction op )
 788 {
 789    GLfloat *result = m->reg[op.vec.dst];
 790    const GLfloat *arg0 = m->reg[op.vec.arg0];
 791    const GLfloat *arg1 = m->reg[op.vec.arg1];
 792
 793    result[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1];
 794    result[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2];
 795    result[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0];
 796 }
 797
 798 static void do_NOP( struct arb_vp_machine *m, union instruction op )
 799 {
 800 }
 801
 802 /* Some useful debugging functions:
 803  */
 804 static void print_reg( GLuint reg )
 805 {
 806    if (reg == REG_RES)
 807       _mesa_printf("RES");
 808    else if (reg >= REG_ARG0 && reg <= REG_ARG2)
 809       _mesa_printf("ARG%d", reg - REG_ARG0);
 810    else if (reg >= REG_TMP0 && reg < REG_TMP_MAX)
 811       _mesa_printf("TMP%d", reg - REG_TMP0);
 812    else if (reg >= REG_PAR0 && reg < REG_PAR_MAX)
 813       _mesa_printf("PAR%d", reg - REG_PAR0);
 814    else
 815       _mesa_printf("???");
 816 }
 817
 818 static void print_mask( GLuint mask )
 819 {
 820    _mesa_printf(".");
 821    if (mask&0x1) _mesa_printf("x");
 822    if (mask&0x2) _mesa_printf("y");
 823    if (mask&0x4) _mesa_printf("z");
 824    if (mask&0x8) _mesa_printf("w");
 825 }
 826
 827 static void print_extern( GLuint file, GLuint idx )
 828 {
 829    static const char *reg_file[] = {
 830       "TEMPORARY",
 831       "INPUT",
 832       "OUTPUT",
 833       "LOCAL_PARAM",
 834       "ENV_PARAM",
 835       "NAMED_PARAM",
 836       "STATE_VAR",
 837       "WRITE_ONLY",
 838       "ADDRESS"
 839    };
 840
 841    _mesa_printf("%s:%d", reg_file[file], idx);
 842 }
 843
 844
 845
 846 static void print_SWZ( union instruction op, const struct opcode_info *info )
 847 {
 848    GLuint swz = op.swz.swz;
 849    GLuint neg = op.swz.neg;
 850    GLuint i;
 851
 852    _mesa_printf("%s ", info->string);
 853    print_reg(op.swz.dst);
 854    _mesa_printf(", ");
 855    print_reg(op.swz.arg0);
 856    _mesa_printf(".");
 857    for (i = 0; i < 4; i++, swz >>= 3, neg >>= 1) {
 858       const char *cswz = "xyzw01??";
 859       if (neg & 0x1)
 860          _mesa_printf("-");
 861       _mesa_printf("%c", cswz[swz&0x7]);
 862    }
 863    _mesa_printf("\n");
 864 }
 865
 866 static void print_RSW( union instruction op, const struct opcode_info *info )
 867 {
 868    GLuint swz = op.rsw.swz;
 869    GLuint neg = op.rsw.neg;
 870    GLuint i;
 871
 872    _mesa_printf("%s ", info->string);
 873    print_reg(op.rsw.dst);
 874    _mesa_printf(", ");
 875    print_reg(op.rsw.arg0);
 876    _mesa_printf(".");
 877    for (i = 0; i < 4; i++, swz >>= 2) {
 878       const char *cswz = "xyzw";
 879       if (neg)
 880          _mesa_printf("-");
 881       _mesa_printf("%c", cswz[swz&0x3]);
 882    }
 883    _mesa_printf("\n");
 884 }
 885
 886
 887 static void print_SCL( union instruction op, const struct opcode_info *info )
 888 {
 889    _mesa_printf("%s ", info->string);
 890    print_reg(op.scl.dst);
 891    _mesa_printf(", ");
 892    print_reg(op.scl.arg0);
 893    if (info->nr_args > 1) {
 894       _mesa_printf(", ");
 895       print_reg(op.scl.arg1);
 896    }
 897    _mesa_printf("\n");
 898 }
 899
 900
 901 static void print_VEC( union instruction op, const struct opcode_info *info )
 902 {
 903    _mesa_printf("%s ", info->string);
 904    print_reg(op.vec.dst);
 905    _mesa_printf(", ");
 906    print_reg(op.vec.arg0);
 907    if (info->nr_args > 1) {
 908       _mesa_printf(", ");
 909       print_reg(op.vec.arg1);
 910    }
 911    if (info->nr_args > 2) {
 912       _mesa_printf(", ");
 913       print_reg(op.vec.arg2);
 914    }
 915    _mesa_printf("\n");
 916 }
 917
 918 static void print_MSK( union instruction op, const struct opcode_info *info )
 919 {
 920    _mesa_printf("%s ", info->string);
 921    print_reg(op.msk.dst);
 922    print_mask(op.msk.mask);
 923    _mesa_printf(", ");
 924    print_reg(op.msk.arg0);
 925    _mesa_printf("\n");
 926 }
 927
 928 static void print_IN( union instruction op, const struct opcode_info *info )
 929 {
 930    _mesa_printf("%s ", info->string);
 931    print_reg(op.inr.reg);
 932    _mesa_printf(", ");
 933    print_extern(op.inr.file, op.inr.idx);
 934    _mesa_printf("\n");
 935 }
 936
 937 static void print_OUT( union instruction op, const struct opcode_info *info )
 938 {
 939    _mesa_printf("%s ", info->string);
 940    print_extern(op.out.file, op.out.idx);
 941    if (op.out.opcode == OUM)
 942       print_mask(op.out.mask);
 943    _mesa_printf(", ");
 944    print_reg(op.out.reg);
 945    _mesa_printf("\n");
 946 }
 947
 948 static void print_NOP( union instruction op, const struct opcode_info *info )
 949 {
 950 }
 951
/* Encoding classes stored in opcode_info.type (the internal opcodes IN4,
 * OUT and MSK are also used as type tags below).
 */
#define NOP 0
#define VEC 1
#define SCL 2
#define SWZ 3

/* One entry per opcode: { type, nr_args, mnemonic, executor, printer }.
 * The internal instructions follow the XPD entry in the same order as
 * their IN1+n opcode definitions.
 */
static const struct opcode_info opcode_info[] =
{
   { VEC, 1, "ABS", do_ABS, print_VEC },
   { VEC, 2, "ADD", do_ADD, print_VEC },
   { OUT, 1, "ARL", do_ARL, print_OUT },
   { SCL, 2, "DP3", do_DP3, print_SCL },
   { SCL, 2, "DP4", do_DP4, print_SCL },
   { SCL, 2, "DPH", do_DPH, print_SCL },
   { VEC, 2, "DST", do_DST, print_VEC },
   { NOP, 0, "END", do_NOP, print_NOP },
   { SCL, 1, "EX2", do_EX2, print_VEC },
   { VEC, 1, "EXP", do_EXP, print_VEC },
   { VEC, 1, "FLR", do_FLR, print_VEC },
   { VEC, 1, "FRC", do_FRC, print_VEC },
   { SCL, 1, "LG2", do_LG2, print_VEC },
   { VEC, 1, "LIT", do_LIT, print_VEC },
   { VEC, 1, "LOG", do_LOG, print_VEC },
   { VEC, 3, "MAD", do_MAD, print_VEC },
   { VEC, 2, "MAX", do_MAX, print_VEC },
   { VEC, 2, "MIN", do_MIN, print_VEC },
   { VEC, 1, "MOV", do_MOV, print_VEC },
   { VEC, 2, "MUL", do_MUL, print_VEC },
   { SCL, 2, "POW", do_POW, print_VEC },
   { VEC, 1, "PRT", do_PRT, print_VEC }, /* PRINT */
   { NOP, 1, "RCC", do_NOP, print_NOP }, /* executes as a no-op here */
   { SCL, 1, "RCP", do_RCP, print_VEC },
   { SCL, 1, "RSQ", do_RSQ, print_VEC },
   { VEC, 2, "SGE", do_SGE, print_VEC },
   { VEC, 2, "SLT", do_SLT, print_VEC },
   { VEC, 2, "SUB", do_SUB, print_VEC },
   { SWZ, 1, "SWZ", do_SWZ, print_SWZ },
   { VEC, 2, "XPD", do_XPD, print_VEC },
   { IN4, 1, "IN1", do_IN1, print_IN }, /* Internals */
   { IN4, 1, "IN2", do_IN2, print_IN },
   { IN4, 1, "IN3", do_IN3, print_IN },
   { IN4, 1, "IN4", do_IN4, print_IN },
   { OUT, 1, "OUT", do_OUT, print_OUT },
   { OUT, 1, "OUM", do_OUM, print_OUT },
   { SWZ, 1, "RSW", do_RSW, print_RSW },
   { MSK, 1, "MSK", do_MSK, print_MSK },
   { IN4, 1, "PAR", do_PAR, print_IN },
   { IN4, 1, "PRL", do_PRL, print_IN },
};
1000
1001
1002 static GLuint cvp_load_reg( struct compilation *cp,
1003                             GLuint file,
1004                             GLuint index,
1005                             GLuint rel )
1006 {
1007    GLuint i, op;
1008
1009    if (file == PROGRAM_TEMPORARY)
1010       return index + REG_TMP0;
1011
1012    /* Don't try to cache relatively addressed values yet:
1013     */
1014    if (!rel) {
1015       for (i = 0; i < REG_PAR_NR; i++) {
1016          if ((cp->par_active & (1<<i)) &&
1017              cp->reg[i].file == file &&
1018              cp->reg[i].idx == index) {
1019             cp->par_protected |= (1<<i);
1020             return i + REG_PAR0;
1021          }
1022       }
1023    }
1024
1025    /* Not already loaded, so identify a slot and load it.
1026     * TODO: preload these values once only!
1027     * TODO: better eviction strategy!
1028     */
1029    if (cp->par_active == ~0) {
1030       assert(cp->par_protected != ~0);
1031       cp->par_active = cp->par_protected;
1032    }
1033
1034    i = ffs(~cp->par_active);
1035    assert(i);
1036    i--;
1037
1038
1039    if (file == PROGRAM_INPUT)
1040       op = IN1 + cp->VB->AttribPtr[index]->size - 1;
1041    else if (rel)
1042       op = PRL;
1043    else
1044       op = PAR;
1045
1046    cp->csr->dword = 0;
1047    cp->csr->inr.opcode = op;
1048    cp->csr->inr.reg = i + REG_PAR0;
1049    cp->csr->inr.file = file;
1050    cp->csr->inr.idx = index;
1051    cp->csr++;
1052
1053    cp->reg[i].file = file;
1054    cp->reg[i].idx = index;
1055    cp->par_protected |= (1<<i);
1056    cp->par_active |= (1<<i);
1057    return i + REG_PAR0;
1058 }
1059
1060 static void cvp_release_regs( struct compilation *cp )
1061 {
1062    cp->par_protected = 0;
1063 }
1064
1065
1066
1067 static GLuint cvp_emit_arg( struct compilation *cp,
1068                             const struct vp_src_register *src,
1069                             GLuint arg )
1070 {
1071    GLuint reg = cvp_load_reg( cp, src->File, src->Index, src->RelAddr );
1072    union instruction rsw, noop;
1073
1074    /* Emit any necessary swizzling.
1075     */
1076    rsw.dword = 0;
1077    rsw.rsw.neg = src->Negate ? 1 : 0;
1078    rsw.rsw.swz = ((GET_SWZ(src->Swizzle, 0) << 0) |
1079                   (GET_SWZ(src->Swizzle, 1) << 2) |
1080                   (GET_SWZ(src->Swizzle, 2) << 4) |
1081                   (GET_SWZ(src->Swizzle, 3) << 6));
1082
1083    noop.dword = 0;
1084    noop.rsw.neg = 0;
1085    noop.rsw.swz = ((0<<0) |
1086                    (1<<2) |
1087                    (2<<4) |
1088                    (3<<6));
1089
1090    if (rsw.dword != noop.dword) {
1091       GLuint rsw_reg = arg;
1092       cp->csr->dword = rsw.dword;
1093       cp->csr->rsw.opcode = RSW;
1094       cp->csr->rsw.arg0 = reg;
1095       cp->csr->rsw.dst = rsw_reg;
1096       cp->csr++;
1097       return rsw_reg;
1098    }
1099    else
1100       return reg;
1101 }
1102
1103 static GLuint cvp_choose_result( struct compilation *cp,
1104                                  const struct vp_dst_register *dst,
1105                                  union instruction *fixup,
1106                                  GLuint maxreg)
1107 {
1108    GLuint mask = dst->WriteMask;
1109
1110    if (dst->File == PROGRAM_TEMPORARY) {
1111
1112       /* Optimization: When writing (with a writemask) to an undefined
1113        * value for the first time, the writemask may be ignored.  In
1114        * practise this means that the MSK instruction to implement the
1115        * writemask can be dropped.
1116        */
1117       if (dst->Index < maxreg &&
1118           (mask == 0xf || !(cp->tmp_active & (1<<dst->Index)))) {
1119          fixup->dword = 0;
1120          cp->tmp_active |= (1<<dst->Index);
1121          return REG_TMP0 + dst->Index;
1122       }
1123       else if (mask != 0xf) {
1124          fixup->msk.opcode = MSK;
1125          fixup->msk.arg0 = REG_RES;
1126          fixup->msk.dst = REG_TMP0 + dst->Index;
1127          fixup->msk.mask = mask;
1128          cp->tmp_active |= (1<<dst->Index);
1129          return REG_RES;
1130       }
1131       else {
1132          fixup->vec.opcode = VP_OPCODE_MOV;
1133          fixup->vec.arg0 = REG_RES;
1134          fixup->vec.dst = REG_TMP0 + dst->Index;
1135          cp->tmp_active |= (1<<dst->Index);
1136          return REG_RES;
1137       }
1138    }
1139    else {
1140       assert(dst->File == PROGRAM_OUTPUT);
1141       fixup->out.opcode = (mask == 0xf) ? OUT : OUM;
1142       fixup->out.reg = REG_RES;
1143       fixup->out.file = dst->File;
1144       fixup->out.idx = dst->Index;
1145       fixup->out.mask = mask;
1146       return REG_RES;
1147    }
1148 }
1149
1150
1151 static void cvp_emit_inst( struct compilation *cp,
1152                            const struct vp_instruction *inst )
1153 {
1154    const struct opcode_info *info = &opcode_info[inst->Opcode];
1155    union instruction fixup;
1156    GLuint reg[3];
1157    GLuint result, i;
1158
1159    /* Need to handle SWZ, ARL specially.
1160     */
1161    switch (info->type) {
1162    case OUT:
1163       assert(inst->Opcode == VP_OPCODE_ARL);
1164       reg[0] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 );
1165
1166       cp->csr->dword = 0;
1167       cp->csr->out.opcode = inst->Opcode;
1168       cp->csr->out.reg = reg[0];
1169       cp->csr->out.file = PROGRAM_ADDRESS;
1170       cp->csr->out.idx = 0;
1171       break;
1172    case SWZ:
1173       assert(inst->Opcode == VP_OPCODE_SWZ);
1174       result = cvp_choose_result( cp, &inst->DstReg, &fixup, REG_SWZDST_MAX );
1175
1176       reg[0] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 );
1177
1178       cp->csr->dword = 0;
1179       cp->csr->swz.opcode = VP_OPCODE_SWZ;
1180       cp->csr->swz.arg0 = reg[0];
1181       cp->csr->swz.dst = result;
1182       cp->csr->swz.neg = inst->SrcReg[0].Negate;
1183       cp->csr->swz.swz = inst->SrcReg[0].Swizzle;
1184       cp->csr++;
1185
1186       if (result == REG_RES) {
1187          cp->csr->dword = fixup.dword;
1188          cp->csr++;
1189       }
1190       break;
1191
1192    case VEC:
1193    case SCL:                    /* for now */
1194       result = cvp_choose_result( cp, &inst->DstReg, &fixup, REG_MAX );
1195
1196       reg[0] = reg[1] = reg[2] = 0;
1197
1198       for (i = 0; i < info->nr_args; i++)
1199          reg[i] = cvp_emit_arg( cp, &inst->SrcReg[i], REG_ARG0 + i );
1200
1201       cp->csr->dword = 0;
1202       cp->csr->vec.opcode = inst->Opcode;
1203       cp->csr->vec.arg0 = reg[0];
1204       cp->csr->vec.arg1 = reg[1];
1205       cp->csr->vec.arg2 = reg[2];
1206       cp->csr->vec.dst = result;
1207       cp->csr++;
1208
1209       if (result == REG_RES) {
1210          cp->csr->dword = fixup.dword;
1211          cp->csr++;
1212       }
1213       break;
1214
1215
1216    case NOP:
1217       break;
1218
1219    default:
1220       assert(0);
1221       break;
1222    }
1223
1224    cvp_release_regs( cp );
1225 }
1226
1227
1228 static void compile_vertex_program( struct arb_vp_machine *m,
1229                                     const struct vertex_program *program )
1230 {
1231    struct compilation cp;
1232    GLuint i;
1233
1234    /* Initialize cp:
1235     */
1236    memset(&cp, 0, sizeof(cp));
1237    cp.VB = m->VB;
1238    cp.csr = m->store;
1239
1240    /* Compile instructions:
1241     */
1242    for (i = 0; i < program->Base.NumInstructions; i++) {
1243       cvp_emit_inst(&cp, &program->Instructions[i]);
1244    }
1245
1246    /* Finish up:
1247     */
1248    m->instructions = m->store;
1249    m->nr_instructions = cp.csr - m->store;
1250
1251
1252    /* Print/disassemble:
1253     */
1254    if (DISASSEM) {
1255       for (i = 0; i < m->nr_instructions; i++) {
1256          union instruction insn = m->instructions[i];
1257          const struct opcode_info *info = &opcode_info[insn.vec.opcode];
1258          info->print( insn, info );
1259       }
1260       _mesa_printf("\n\n");
1261    }
1262 }
1263
1264
1265
1266
1267 /* ----------------------------------------------------------------------
1268  * Execution
1269  */
1270 static void userclip( GLcontext *ctx,
1271                       GLvector4f *clip,
1272                       GLubyte *clipmask,
1273                       GLubyte *clipormask,
1274                       GLubyte *clipandmask )
1275 {
1276    GLuint p;
1277
1278    for (p = 0; p < ctx->Const.MaxClipPlanes; p++)
1279       if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
1280          GLuint nr, i;
1281          const GLfloat a = ctx->Transform._ClipUserPlane[p][0];
1282          const GLfloat b = ctx->Transform._ClipUserPlane[p][1];
1283          const GLfloat c = ctx->Transform._ClipUserPlane[p][2];
1284          const GLfloat d = ctx->Transform._ClipUserPlane[p][3];
1285          GLfloat *coord = (GLfloat *)clip->data;
1286          GLuint stride = clip->stride;
1287          GLuint count = clip->count;
1288
1289          for (nr = 0, i = 0 ; i < count ; i++) {
1290             GLfloat dp = (coord[0] * a +
1291                           coord[1] * b +
1292                           coord[2] * c +
1293                           coord[3] * d);
1294
1295             if (dp < 0) {
1296                nr++;
1297                clipmask[i] |= CLIP_USER_BIT;
1298             }
1299
1300             STRIDE_F(coord, stride);
1301          }
1302
1303          if (nr > 0) {
1304             *clipormask |= CLIP_USER_BIT;
1305             if (nr == count) {
1306                *clipandmask |= CLIP_USER_BIT;
1307                return;
1308             }
1309          }
1310       }
1311 }
1312
1313
1314 static GLboolean do_ndc_cliptest( struct arb_vp_machine *m )
1315 {
1316    GLcontext *ctx = m->ctx;
1317    TNLcontext *tnl = TNL_CONTEXT(ctx);
1318    struct vertex_buffer *VB = m->VB;
1319
1320    /* Cliptest and perspective divide.  Clip functions must clear
1321     * the clipmask.
1322     */
1323    m->ormask = 0;
1324    m->andmask = CLIP_ALL_BITS;
1325
1326    if (tnl->NeedNdcCoords) {
1327       VB->NdcPtr =
1328          _mesa_clip_tab[VB->ClipPtr->size]( VB->ClipPtr,
1329                                             &m->ndcCoords,
1330                                             m->clipmask,
1331                                             &m->ormask,
1332                                             &m->andmask );
1333    }
1334    else {
1335       VB->NdcPtr = NULL;
1336       _mesa_clip_np_tab[VB->ClipPtr->size]( VB->ClipPtr,
1337                                             NULL,
1338                                             m->clipmask,
1339                                             &m->ormask,
1340                                             &m->andmask );
1341    }
1342
1343    if (m->andmask) {
1344       /* All vertices are outside the frustum */
1345       return GL_FALSE;
1346    }
1347
1348    /* Test userclip planes.  This contributes to VB->ClipMask.
1349     */
1350    if (ctx->Transform.ClipPlanesEnabled && !ctx->VertexProgram._Enabled) {
1351       userclip( ctx,
1352                 VB->ClipPtr,
1353                 m->clipmask,
1354                 &m->ormask,
1355                 &m->andmask );
1356
1357       if (m->andmask) {
1358          return GL_FALSE;
1359       }
1360    }
1361
1362    VB->ClipAndMask = m->andmask;
1363    VB->ClipOrMask = m->ormask;
1364    VB->ClipMask = m->clipmask;
1365
1366    return GL_TRUE;
1367 }
1368
1369
1370
1371
1372 /**
1373  * Execute the given vertex program.
1374  *
1375  * TODO: Integrate the t_vertex.c code here, to build machine vertices
1376  * directly at this point.
1377  *
1378  * TODO: Eliminate the VB struct entirely and just use
1379  * struct arb_vertex_machine.
1380  */
1381 static GLboolean
1382 run_arb_vertex_program(GLcontext *ctx, struct tnl_pipeline_stage *stage)
1383 {
1384    struct vertex_program *program = (ctx->VertexProgram._Enabled ?
1385                                      ctx->VertexProgram.Current :
1386                                      ctx->_TnlProgram);
1387    struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
1388    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
1389    GLuint i, j, outputs = program->OutputsWritten;
1390
1391    if (program->Parameters) {
1392       _mesa_load_state_parameters(ctx, program->Parameters);
1393       m->File[PROGRAM_STATE_VAR] = program->Parameters->ParameterValues;
1394    }
1395
1396    /* Run the actual program:
1397     */
1398    for (m->vtx_nr = 0; m->vtx_nr < VB->Count; m->vtx_nr++) {
1399       for (j = 0; j < m->nr_instructions; j++) {
1400          union instruction inst = m->instructions[j];
1401          opcode_info[inst.vec.opcode].func( m, inst );
1402       }
1403    }
1404
1405    /* Setup the VB pointers so that the next pipeline stages get
1406     * their data from the right place (the program output arrays).
1407     *
1408     * TODO: 1) Have tnl use these RESULT values for outputs rather
1409     * than trying to shoe-horn inputs and outputs into one set of
1410     * values.
1411     *
1412     * TODO: 2) Integrate t_vertex.c so that we just go straight ahead
1413     * and build machine vertices here.
1414     */
1415    VB->ClipPtr = &m->attribs[VERT_RESULT_HPOS];
1416    VB->ClipPtr->count = VB->Count;
1417
1418    if (outputs & (1<<VERT_RESULT_COL0)) {
1419       VB->ColorPtr[0] = &m->attribs[VERT_RESULT_COL0];
1420       VB->AttribPtr[VERT_ATTRIB_COLOR0] = VB->ColorPtr[0];
1421    }
1422
1423    if (outputs & (1<<VERT_RESULT_BFC0)) {
1424       VB->ColorPtr[1] = &m->attribs[VERT_RESULT_BFC0];
1425    }
1426
1427    if (outputs & (1<<VERT_RESULT_COL1)) {
1428       VB->SecondaryColorPtr[0] = &m->attribs[VERT_RESULT_COL1];
1429       VB->AttribPtr[VERT_ATTRIB_COLOR1] = VB->SecondaryColorPtr[0];
1430    }
1431
1432    if (outputs & (1<<VERT_RESULT_BFC1)) {
1433       VB->SecondaryColorPtr[1] = &m->attribs[VERT_RESULT_BFC1];
1434    }
1435
1436    if (outputs & (1<<VERT_RESULT_FOGC)) {
1437       VB->FogCoordPtr = &m->attribs[VERT_RESULT_FOGC];
1438       VB->AttribPtr[VERT_ATTRIB_FOG] = VB->FogCoordPtr;
1439    }
1440
1441    if (outputs & (1<<VERT_RESULT_PSIZ)) {
1442       VB->PointSizePtr = &m->attribs[VERT_RESULT_PSIZ];
1443       VB->AttribPtr[_TNL_ATTRIB_POINTSIZE] = &m->attribs[VERT_RESULT_PSIZ];
1444    }
1445
1446    for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
1447       if (outputs & (1<<(VERT_RESULT_TEX0+i))) {
1448          VB->TexCoordPtr[i] = &m->attribs[VERT_RESULT_TEX0 + i];
1449          VB->AttribPtr[VERT_ATTRIB_TEX0+i] = VB->TexCoordPtr[i];
1450       }
1451    }
1452
1453 #if 0
1454    for (i = 0; i < VB->Count; i++) {
1455       printf("Out %d: %f %f %f %f %f %f %f %f\n", i,
1456              VEC_ELT(VB->ClipPtr, GLfloat, i)[0],
1457              VEC_ELT(VB->ClipPtr, GLfloat, i)[1],
1458              VEC_ELT(VB->ClipPtr, GLfloat, i)[2],
1459              VEC_ELT(VB->ClipPtr, GLfloat, i)[3],
1460              VEC_ELT(VB->ColorPtr[0], GLfloat, i)[0],
1461              VEC_ELT(VB->ColorPtr[0], GLfloat, i)[1],
1462              VEC_ELT(VB->ColorPtr[0], GLfloat, i)[2],
1463              VEC_ELT(VB->ColorPtr[0], GLfloat, i)[3]);
1464    }
1465 #endif
1466
1467    /* Perform NDC and cliptest operations:
1468     */
1469    return do_ndc_cliptest(m);
1470 }
1471
1472
1473 static void
1474 validate_vertex_program( GLcontext *ctx, struct tnl_pipeline_stage *stage )
1475 {
1476    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
1477    struct vertex_program *program =
1478       (ctx->VertexProgram._Enabled ? ctx->VertexProgram.Current : 0);
1479
1480 #if TNL_FIXED_FUNCTION_PROGRAM
1481    if (!program) {
1482       program = ctx->_TnlProgram;
1483    }
1484 #endif
1485
1486    if (program) {
1487       compile_vertex_program( m, program );
1488
1489       /* Grab the state GL state and put into registers:
1490        */
1491       m->File[PROGRAM_LOCAL_PARAM] = program->Base.LocalParams;
1492       m->File[PROGRAM_ENV_PARAM] = ctx->VertexProgram.Parameters;
1493       m->File[PROGRAM_STATE_VAR] = 0;
1494    }
1495 }
1496
1497
1498
1499
1500
1501
1502
1503 /**
1504  * Called the first time stage->run is called.  In effect, don't
1505  * allocate data until the first time the stage is run.
1506  */
1507 static GLboolean init_vertex_program( GLcontext *ctx,
1508                                       struct tnl_pipeline_stage *stage )
1509 {
1510    TNLcontext *tnl = TNL_CONTEXT(ctx);
1511    struct vertex_buffer *VB = &(tnl->vb);
1512    struct arb_vp_machine *m;
1513    const GLuint size = VB->Size;
1514    GLuint i;
1515
1516    stage->privatePtr = MALLOC(sizeof(*m));
1517    m = ARB_VP_MACHINE(stage);
1518    if (!m)
1519       return GL_FALSE;
1520
1521    /* arb_vertex_machine struct should subsume the VB:
1522     */
1523    m->VB = VB;
1524    m->ctx = ctx;
1525
1526    /* Allocate arrays of vertex output values */
1527    for (i = 0; i < VERT_RESULT_MAX; i++) {
1528       _mesa_vector4f_alloc( &m->attribs[i], 0, size, 32 );
1529       m->attribs[i].size = 4;
1530    }
1531
1532    /* a few other misc allocations */
1533    _mesa_vector4f_alloc( &m->ndcCoords, 0, size, 32 );
1534    m->clipmask = (GLubyte *) ALIGN_MALLOC(sizeof(GLubyte)*size, 32 );
1535
1536
1537 #if TNL_FIXED_FUNCTION_PROGRAM
1538    _mesa_allow_light_in_model( ctx, GL_FALSE );
1539 #endif
1540
1541
1542    return GL_TRUE;
1543 }
1544
1545
1546
1547
1548 /**
1549  * Destructor for this pipeline stage.
1550  */
1551 static void dtr( struct tnl_pipeline_stage *stage )
1552 {
1553    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
1554
1555    if (m) {
1556       GLuint i;
1557
1558       /* free the vertex program result arrays */
1559       for (i = 0; i < VERT_RESULT_MAX; i++)
1560          _mesa_vector4f_free( &m->attribs[i] );
1561
1562       /* free misc arrays */
1563       _mesa_vector4f_free( &m->ndcCoords );
1564       ALIGN_FREE( m->clipmask );
1565
1566       FREE( m );
1567       stage->privatePtr = NULL;
1568    }
1569 }
1570
/**
 * Public description of this pipeline stage.
 *
 * The initializer is positional; the trailing comments name the slot
 * each entry fills.  The private data starts out NULL and is set by
 * init_vertex_program().
 */
const struct tnl_pipeline_stage _tnl_arb_vertex_program_stage =
{
   "vertex-program",
   NULL,                        /* private_data */
   init_vertex_program,         /* create: allocate the machine */
   dtr,                         /* destroy: free the machine */
   validate_vertex_program,     /* validate: compile the current program */
   run_arb_vertex_program       /* run: execute program, then clip test */
};