src/mesa/tnl/t_vb_arbprogram.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.5.1
   4  *
   5  * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
 * \file t_vb_arbprogram.c
  27  * Compile vertex programs to an intermediate representation.
  28  * Execute vertex programs over a buffer of vertices.
  29  * \author Keith Whitwell, Brian Paul
  30  */
  31
  32 #include "glheader.h"
  33 #include "context.h"
  34 #include "imports.h"
  35 #include "macros.h"
  36 #include "mtypes.h"
  37 #include "arbprogparse.h"
  38 #include "light.h"
  39 #include "program.h"
  40 #include "math/m_matrix.h"
  41 #include "t_context.h"
  42 #include "t_pipeline.h"
  43 #include "t_vb_arbprogram.h"
  44 #include "tnl.h"
  45 #include "program_instruction.h"
  46
  47
/* Compile-time switch: when non-zero, compile_vertex_program() passes
 * each compiled instruction to _tnl_disassem_vba_insn() for printing.
 */
#define DISASSEM 0
  49
  50
/* Transient state used while translating a vertex program into the
 * internal instruction stream.
 */
struct compilation {
   GLuint reg_active;         /* one bit per register index, set by cvp_choose_result() once that register has been chosen as a destination */
   union instruction *csr;    /* cursor: next free instruction slot, advanced by cvp_next_instruction() */
};
  55
  56
  57 #define ARB_VP_MACHINE(stage) ((struct arb_vp_machine *)(stage->privatePtr))
  58
/* Replicate component 0 of a 4-vector into components 1, 2 and 3.
 * Note that the argument is evaluated four times.
 */
#define PUFF(x) ((x)[1] = (x)[2] = (x)[3] = (x)[0])
  60
  61
  62
  63 /* Lower precision functions for the EXP, LOG and LIT opcodes.  The
  64  * LOG2() implementation is probably not accurate enough, and the
  65  * attempted optimization for Exp2 is definitely not accurate
  66  * enough - it discards all of t's fractional bits!
  67  */
  68 static GLfloat RoughApproxLog2(GLfloat t)
  69 {
  70    return LOG2(t);
  71 }
  72
  73 static GLfloat RoughApproxExp2(GLfloat t)
  74 {
  75 #if 0
  76    fi_type fi;
  77    fi.i = (GLint) t;
  78    fi.i = (fi.i << 23) + 0x3f800000;
  79    return fi.f;
  80 #else
  81    return (GLfloat) _mesa_pow(2.0, t);
  82 #endif
  83 }
  84
  85 static GLfloat RoughApproxPower(GLfloat x, GLfloat y)
  86 {
  87    if (x == 0.0 && y == 0.0)
  88       return 1.0;  /* spec requires this */
  89    else
  90       return RoughApproxExp2(y * RoughApproxLog2(x));
  91 }
  92
  93
  94 /* Higher precision functions for the EX2, LG2 and POW opcodes:
  95  */
  96 static GLfloat ApproxLog2(GLfloat t)
  97 {
  98    return (GLfloat) (LOGF(t) * 1.442695F);
  99 }
 100
 101 static GLfloat ApproxExp2(GLfloat t)
 102 {
 103    return (GLfloat) _mesa_pow(2.0, t);
 104 }
 105
 106 static GLfloat ApproxPower(GLfloat x, GLfloat y)
 107 {
 108    return (GLfloat) _mesa_pow(x, y);
 109 }
 110
 111
 112 /**
 113  * Perform a reduced swizzle:
 114  */
 115 static void do_RSW( struct arb_vp_machine *m, union instruction op )
 116 {
 117    GLfloat *result = m->File[0][op.rsw.dst];
 118    const GLfloat *arg0 = m->File[op.rsw.file0][op.rsw.idx0];
 119    const GLuint swz = op.rsw.swz;
 120    const GLuint neg = op.rsw.neg;
 121    GLfloat tmp[4];
 122
 123    /* Need a temporary to be correct in the case where result == arg0.
 124     */
 125    COPY_4V(tmp, arg0);
 126
 127    result[0] = tmp[GET_SWZ(swz, 0)];
 128    result[1] = tmp[GET_SWZ(swz, 1)];
 129    result[2] = tmp[GET_SWZ(swz, 2)];
 130    result[3] = tmp[GET_SWZ(swz, 3)];
 131
 132    if (neg) {
 133       if (neg & 0x1) result[0] = -result[0];
 134       if (neg & 0x2) result[1] = -result[1];
 135       if (neg & 0x4) result[2] = -result[2];
 136       if (neg & 0x8) result[3] = -result[3];
 137    }
 138 }
 139
 140 /**
 141  * Perform a full swizzle
 142  */
 143 static void do_SWZ( struct arb_vp_machine *m, union instruction op )
 144 {
 145    GLfloat *result = m->File[0][op.rsw.dst];
 146    const GLfloat *arg0 = m->File[op.rsw.file0][op.rsw.idx0];
 147    const GLuint swz = op.rsw.swz;
 148    const GLuint neg = op.rsw.neg;
 149    GLfloat tmp[6];
 150    tmp[4] = 0.0;
 151    tmp[5] = 1.0;
 152
 153    /* Need a temporary to be correct in the case where result == arg0.
 154     */
 155    COPY_4V(tmp, arg0);
 156
 157    result[0] = tmp[GET_SWZ(swz, 0)];
 158    result[1] = tmp[GET_SWZ(swz, 1)];
 159    result[2] = tmp[GET_SWZ(swz, 2)];
 160    result[3] = tmp[GET_SWZ(swz, 3)];
 161
 162    if (neg) {
 163       if (neg & 0x1) result[0] = -result[0];
 164       if (neg & 0x2) result[1] = -result[1];
 165       if (neg & 0x4) result[2] = -result[2];
 166       if (neg & 0x8) result[3] = -result[3];
 167    }
 168 }
 169
 170 /* Used to implement write masking.  To make things easier for the sse
 171  * generator I've gone back to a 1 argument version of this function
 172  * (dst.msk = arg), rather than the semantically cleaner (dst = SEL
 173  * arg0, arg1, msk)
 174  *
 175  * That means this is the only instruction which doesn't write a full
 176  * 4 dwords out.  This would make such a program harder to analyse,
 177  * but it looks like analysis is going to take place on a higher level
 178  * anyway.
 179  */
 180 static void do_MSK( struct arb_vp_machine *m, union instruction op )
 181 {
 182    GLfloat *dst = m->File[0][op.msk.dst];
 183    const GLfloat *arg = m->File[op.msk.file][op.msk.idx];
 184
 185    if (op.msk.mask & WRITEMASK_X) dst[0] = arg[0];
 186    if (op.msk.mask & WRITEMASK_Y) dst[1] = arg[1];
 187    if (op.msk.mask & WRITEMASK_Z) dst[2] = arg[2];
 188    if (op.msk.mask & WRITEMASK_W) dst[3] = arg[3];
 189 }
 190
 191
 192 static void do_PRT( struct arb_vp_machine *m, union instruction op )
 193 {
 194    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 195
 196    _mesa_printf("%d: %f %f %f %f\n", m->vtx_nr,
 197                 arg0[0], arg0[1], arg0[2], arg0[3]);
 198 }
 199
 200
 201 /**
 202  * The traditional ALU and texturing instructions.  All operate on
 203  * internal registers and ignore write masks and swizzling issues.
 204  */
 205
 206 static void do_ABS( struct arb_vp_machine *m, union instruction op )
 207 {
 208    GLfloat *result = m->File[0][op.alu.dst];
 209    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 210
 211    result[0] = (arg0[0] < 0.0) ? -arg0[0] : arg0[0];
 212    result[1] = (arg0[1] < 0.0) ? -arg0[1] : arg0[1];
 213    result[2] = (arg0[2] < 0.0) ? -arg0[2] : arg0[2];
 214    result[3] = (arg0[3] < 0.0) ? -arg0[3] : arg0[3];
 215 }
 216
 217 static void do_ADD( struct arb_vp_machine *m, union instruction op )
 218 {
 219    GLfloat *result = m->File[0][op.alu.dst];
 220    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 221    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 222
 223    result[0] = arg0[0] + arg1[0];
 224    result[1] = arg0[1] + arg1[1];
 225    result[2] = arg0[2] + arg1[2];
 226    result[3] = arg0[3] + arg1[3];
 227 }
 228
 229
 230 static void do_DP3( struct arb_vp_machine *m, union instruction op )
 231 {
 232    GLfloat *result = m->File[0][op.alu.dst];
 233    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 234    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 235
 236    result[0] = (arg0[0] * arg1[0] +
 237                 arg0[1] * arg1[1] +
 238                 arg0[2] * arg1[2]);
 239
 240    PUFF(result);
 241 }
 242
 243
 244
 245 static void do_DP4( struct arb_vp_machine *m, union instruction op )
 246 {
 247    GLfloat *result = m->File[0][op.alu.dst];
 248    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 249    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 250
 251    result[0] = (arg0[0] * arg1[0] +
 252                 arg0[1] * arg1[1] +
 253                 arg0[2] * arg1[2] +
 254                 arg0[3] * arg1[3]);
 255
 256    PUFF(result);
 257 }
 258
 259 static void do_DPH( struct arb_vp_machine *m, union instruction op )
 260 {
 261    GLfloat *result = m->File[0][op.alu.dst];
 262    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 263    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 264
 265    result[0] = (arg0[0] * arg1[0] +
 266                 arg0[1] * arg1[1] +
 267                 arg0[2] * arg1[2] +
 268                 1.0     * arg1[3]);
 269
 270    PUFF(result);
 271 }
 272
 273 static void do_DST( struct arb_vp_machine *m, union instruction op )
 274 {
 275    GLfloat *result = m->File[0][op.alu.dst];
 276    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 277    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 278
 279    /* This should be ok even if result == arg0 or result == arg1.
 280     */
 281    result[0] = 1.0F;
 282    result[1] = arg0[1] * arg1[1];
 283    result[2] = arg0[2];
 284    result[3] = arg1[3];
 285 }
 286
 287
 288 /* Intended to be high precision:
 289  */
 290 static void do_EX2( struct arb_vp_machine *m, union instruction op )
 291 {
 292    GLfloat *result = m->File[0][op.alu.dst];
 293    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 294
 295    result[0] = (GLfloat)ApproxExp2(arg0[0]);
 296    PUFF(result);
 297 }
 298
 299
 300 /* Allowed to be lower precision:
 301  */
 302 static void do_EXP( struct arb_vp_machine *m, union instruction op )
 303 {
 304    GLfloat *result = m->File[0][op.alu.dst];
 305    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 306    const GLfloat tmp = arg0[0];
 307    const GLfloat flr_tmp = FLOORF(tmp);
 308    const GLfloat frac_tmp = tmp - flr_tmp;
 309
 310    result[0] = LDEXPF(1.0, (int)flr_tmp);
 311    result[1] = frac_tmp;
 312    result[2] = RoughApproxExp2(tmp);
 313    result[3] = 1.0F;
 314 }
 315
 316 static void do_FLR( struct arb_vp_machine *m, union instruction op )
 317 {
 318    GLfloat *result = m->File[0][op.alu.dst];
 319    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 320
 321    result[0] = FLOORF(arg0[0]);
 322    result[1] = FLOORF(arg0[1]);
 323    result[2] = FLOORF(arg0[2]);
 324    result[3] = FLOORF(arg0[3]);
 325 }
 326
 327 static void do_FRC( struct arb_vp_machine *m, union instruction op )
 328 {
 329    GLfloat *result = m->File[0][op.alu.dst];
 330    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 331
 332    result[0] = arg0[0] - FLOORF(arg0[0]);
 333    result[1] = arg0[1] - FLOORF(arg0[1]);
 334    result[2] = arg0[2] - FLOORF(arg0[2]);
 335    result[3] = arg0[3] - FLOORF(arg0[3]);
 336 }
 337
 338 /* High precision log base 2:
 339  */
 340 static void do_LG2( struct arb_vp_machine *m, union instruction op )
 341 {
 342    GLfloat *result = m->File[0][op.alu.dst];
 343    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 344
 345    result[0] = ApproxLog2(arg0[0]);
 346    PUFF(result);
 347 }
 348
 349
 350
 351 static void do_LIT( struct arb_vp_machine *m, union instruction op )
 352 {
 353    GLfloat *result = m->File[0][op.alu.dst];
 354    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 355    GLfloat tmp[4]; /* use temp in case arg0 == result register */
 356
 357    tmp[0] = 1.0;
 358    tmp[1] = arg0[0];
 359    if (arg0[0] > 0.0) {
 360       tmp[2] = RoughApproxPower(arg0[1], arg0[3]);
 361    }
 362    else {
 363       tmp[2] = 0.0;
 364    }
 365    tmp[3] = 1.0;
 366
 367    COPY_4V(result, tmp);
 368 }
 369
 370
 371 /* Intended to allow a lower precision than required for LG2 above.
 372  */
 373 static void do_LOG( struct arb_vp_machine *m, union instruction op )
 374 {
 375    GLfloat *result = m->File[0][op.alu.dst];
 376    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 377    const GLfloat tmp = FABSF(arg0[0]);
 378    int exponent;
 379    const GLfloat mantissa = FREXPF(tmp, &exponent);
 380
 381    result[0] = (GLfloat) (exponent - 1);
 382    result[1] = 2.0 * mantissa; /* map [.5, 1) -> [1, 2) */
 383    result[2] = exponent + LOG2(mantissa);
 384    result[3] = 1.0;
 385 }
 386
 387 static void do_MAX( struct arb_vp_machine *m, union instruction op )
 388 {
 389    GLfloat *result = m->File[0][op.alu.dst];
 390    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 391    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 392
 393    result[0] = (arg0[0] > arg1[0]) ? arg0[0] : arg1[0];
 394    result[1] = (arg0[1] > arg1[1]) ? arg0[1] : arg1[1];
 395    result[2] = (arg0[2] > arg1[2]) ? arg0[2] : arg1[2];
 396    result[3] = (arg0[3] > arg1[3]) ? arg0[3] : arg1[3];
 397 }
 398
 399
 400 static void do_MIN( struct arb_vp_machine *m, union instruction op )
 401 {
 402    GLfloat *result = m->File[0][op.alu.dst];
 403    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 404    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 405
 406    result[0] = (arg0[0] < arg1[0]) ? arg0[0] : arg1[0];
 407    result[1] = (arg0[1] < arg1[1]) ? arg0[1] : arg1[1];
 408    result[2] = (arg0[2] < arg1[2]) ? arg0[2] : arg1[2];
 409    result[3] = (arg0[3] < arg1[3]) ? arg0[3] : arg1[3];
 410 }
 411
 412 static void do_MOV( struct arb_vp_machine *m, union instruction op )
 413 {
 414    GLfloat *result = m->File[0][op.alu.dst];
 415    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 416
 417    result[0] = arg0[0];
 418    result[1] = arg0[1];
 419    result[2] = arg0[2];
 420    result[3] = arg0[3];
 421 }
 422
 423 static void do_MUL( struct arb_vp_machine *m, union instruction op )
 424 {
 425    GLfloat *result = m->File[0][op.alu.dst];
 426    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 427    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 428
 429    result[0] = arg0[0] * arg1[0];
 430    result[1] = arg0[1] * arg1[1];
 431    result[2] = arg0[2] * arg1[2];
 432    result[3] = arg0[3] * arg1[3];
 433 }
 434
 435
 436 /* Intended to be "high" precision
 437  */
 438 static void do_POW( struct arb_vp_machine *m, union instruction op )
 439 {
 440    GLfloat *result = m->File[0][op.alu.dst];
 441    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 442    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 443
 444    result[0] = (GLfloat)ApproxPower(arg0[0], arg1[0]);
 445    PUFF(result);
 446 }
 447
 448 static void do_REL( struct arb_vp_machine *m, union instruction op )
 449 {
 450    GLfloat *result = m->File[0][op.alu.dst];
 451    const GLuint idx = (op.alu.idx0 + (GLint)m->File[0][REG_ADDR][0]) & (MAX_NV_VERTEX_PROGRAM_PARAMS-1);
 452    const GLfloat *arg0 = m->File[op.alu.file0][idx];
 453
 454    result[0] = arg0[0];
 455    result[1] = arg0[1];
 456    result[2] = arg0[2];
 457    result[3] = arg0[3];
 458 }
 459
 460 static void do_RCP( struct arb_vp_machine *m, union instruction op )
 461 {
 462    GLfloat *result = m->File[0][op.alu.dst];
 463    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 464
 465    result[0] = 1.0F / arg0[0];
 466    PUFF(result);
 467 }
 468
 469 static void do_RSQ( struct arb_vp_machine *m, union instruction op )
 470 {
 471    GLfloat *result = m->File[0][op.alu.dst];
 472    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 473
 474    result[0] = INV_SQRTF(FABSF(arg0[0]));
 475    PUFF(result);
 476 }
 477
 478
 479 static void do_SGE( struct arb_vp_machine *m, union instruction op )
 480 {
 481    GLfloat *result = m->File[0][op.alu.dst];
 482    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 483    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 484
 485    result[0] = (arg0[0] >= arg1[0]) ? 1.0F : 0.0F;
 486    result[1] = (arg0[1] >= arg1[1]) ? 1.0F : 0.0F;
 487    result[2] = (arg0[2] >= arg1[2]) ? 1.0F : 0.0F;
 488    result[3] = (arg0[3] >= arg1[3]) ? 1.0F : 0.0F;
 489 }
 490
 491
 492 static void do_SLT( struct arb_vp_machine *m, union instruction op )
 493 {
 494    GLfloat *result = m->File[0][op.alu.dst];
 495    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 496    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 497
 498    result[0] = (arg0[0] < arg1[0]) ? 1.0F : 0.0F;
 499    result[1] = (arg0[1] < arg1[1]) ? 1.0F : 0.0F;
 500    result[2] = (arg0[2] < arg1[2]) ? 1.0F : 0.0F;
 501    result[3] = (arg0[3] < arg1[3]) ? 1.0F : 0.0F;
 502 }
 503
 504 static void do_SUB( struct arb_vp_machine *m, union instruction op )
 505 {
 506    GLfloat *result = m->File[0][op.alu.dst];
 507    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 508    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 509
 510    result[0] = arg0[0] - arg1[0];
 511    result[1] = arg0[1] - arg1[1];
 512    result[2] = arg0[2] - arg1[2];
 513    result[3] = arg0[3] - arg1[3];
 514 }
 515
 516
 517 static void do_XPD( struct arb_vp_machine *m, union instruction op )
 518 {
 519    GLfloat *result = m->File[0][op.alu.dst];
 520    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 521    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 522    GLfloat tmp[3];
 523
 524    tmp[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1];
 525    tmp[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2];
 526    tmp[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0];
 527
 528    /* Need a temporary to be correct in the case where result == arg0
 529     * or result == arg1.
 530     */
 531    result[0] = tmp[0];
 532    result[1] = tmp[1];
 533    result[2] = tmp[2];
 534 }
 535
 536 static void do_NOP( struct arb_vp_machine *m, union instruction op )
 537 {
 538 }
 539
 540 /* Some useful debugging functions:
 541  */
 542 static void print_mask( GLuint mask )
 543 {
 544    _mesa_printf(".");
 545    if (mask & WRITEMASK_X) _mesa_printf("x");
 546    if (mask & WRITEMASK_Y) _mesa_printf("y");
 547    if (mask & WRITEMASK_Z) _mesa_printf("z");
 548    if (mask & WRITEMASK_W) _mesa_printf("w");
 549 }
 550
 551 static void print_reg( GLuint file, GLuint reg )
 552 {
 553    static const char *reg_file[] = {
 554       "REG",
 555       "LOCAL_PARAM",
 556       "ENV_PARAM",
 557       "STATE_VAR",
 558    };
 559
 560    if (file == 0) {
 561       if (reg == REG_RES)
 562          _mesa_printf("RES");
 563       else if (reg >= REG_ARG0 && reg <= REG_ARG1)
 564          _mesa_printf("ARG%d", reg - REG_ARG0);
 565       else if (reg >= REG_TMP0 && reg <= REG_TMP11)
 566          _mesa_printf("TMP%d", reg - REG_TMP0);
 567       else if (reg >= REG_IN0 && reg <= REG_IN31)
 568          _mesa_printf("IN%d", reg - REG_IN0);
 569       else if (reg >= REG_OUT0 && reg <= REG_OUT14)
 570          _mesa_printf("OUT%d", reg - REG_OUT0);
 571       else if (reg == REG_ADDR)
 572          _mesa_printf("ADDR");
 573       else if (reg == REG_ID)
 574          _mesa_printf("ID");
 575       else
 576          _mesa_printf("REG%d", reg);
 577    }
 578    else
 579       _mesa_printf("%s:%d", reg_file[file], reg);
 580 }
 581
 582
 583 static void print_RSW( union instruction op )
 584 {
 585    GLuint swz = op.rsw.swz;
 586    GLuint neg = op.rsw.neg;
 587    GLuint i;
 588
 589    _mesa_printf("RSW ");
 590    print_reg(0, op.rsw.dst);
 591    _mesa_printf(", ");
 592    print_reg(op.rsw.file0, op.rsw.idx0);
 593    _mesa_printf(".");
 594    for (i = 0; i < 4; i++, swz >>= 3) {
 595       const char *cswz = "xyzw01";
 596       if (neg & (1<<i))
 597          _mesa_printf("-");
 598       _mesa_printf("%c", cswz[swz&0x7]);
 599    }
 600    _mesa_printf("\n");
 601 }
 602
 603 static void print_SWZ( union instruction op )
 604 {
 605    GLuint swz = op.rsw.swz;
 606    GLuint neg = op.rsw.neg;
 607    GLuint i;
 608
 609    _mesa_printf("SWZ ");
 610    print_reg(0, op.rsw.dst);
 611    _mesa_printf(", ");
 612    print_reg(op.rsw.file0, op.rsw.idx0);
 613    _mesa_printf(".");
 614    for (i = 0; i < 4; i++, swz >>= 3) {
 615       const char *cswz = "xyzw01";
 616       if (neg & (1<<i))
 617          _mesa_printf("-");
 618       _mesa_printf("%c", cswz[swz&0x7]);
 619    }
 620    _mesa_printf("\n");
 621 }
 622
 623
 624 static void print_ALU( union instruction op )
 625 {
 626    _mesa_printf("%s ", _mesa_opcode_string((enum prog_opcode) op.alu.opcode));
 627    print_reg(0, op.alu.dst);
 628    _mesa_printf(", ");
 629    print_reg(op.alu.file0, op.alu.idx0);
 630    if (_mesa_num_inst_src_regs((enum prog_opcode) op.alu.opcode) > 1) {
 631       _mesa_printf(", ");
 632       print_reg(op.alu.file1, op.alu.idx1);
 633    }
 634    _mesa_printf("\n");
 635 }
 636
 637 static void print_MSK( union instruction op )
 638 {
 639    _mesa_printf("MSK ");
 640    print_reg(0, op.msk.dst);
 641    print_mask(op.msk.mask);
 642    _mesa_printf(", ");
 643    print_reg(op.msk.file, op.msk.idx);
 644    _mesa_printf("\n");
 645 }
 646
 647 static void print_NOP( union instruction op )
 648 {
 649 }
 650
/**
 * Print a one-line disassembly of a compiled instruction.
 *
 * Dispatches on op.alu.opcode to the matching print_*() helper.  Opcodes
 * not listed in any case are reported through _mesa_problem().
 */
void
_tnl_disassem_vba_insn( union instruction op )
{
   switch (op.alu.opcode) {
   /* Instructions printed in the generic ALU format. */
   case OPCODE_ABS:
   case OPCODE_ADD:
   case OPCODE_DP3:
   case OPCODE_DP4:
   case OPCODE_DPH:
   case OPCODE_DST:
   case OPCODE_EX2:
   case OPCODE_EXP:
   case OPCODE_FLR:
   case OPCODE_FRC:
   case OPCODE_LG2:
   case OPCODE_LIT:
   case OPCODE_LOG:
   case OPCODE_MAX:
   case OPCODE_MIN:
   case OPCODE_MOV:
   case OPCODE_MUL:
   case OPCODE_POW:
   case OPCODE_PRINT:
   case OPCODE_RCP:
   case OPCODE_RSQ:
   case OPCODE_SGE:
   case OPCODE_SLT:
   case OPCODE_SUB:
   case OPCODE_XPD:
      print_ALU(op);
      break;
   /* Recognised opcodes for which nothing is printed. */
   case OPCODE_ARA:
   case OPCODE_ARL:
   case OPCODE_ARL_NV:
   case OPCODE_ARR:
   case OPCODE_BRA:
   case OPCODE_CAL:
   case OPCODE_END:
   case OPCODE_MAD:
   case OPCODE_POPA:
   case OPCODE_PUSHA:
   case OPCODE_RCC:
   case OPCODE_RET:
   case OPCODE_SSG:
      print_NOP(op);
      break;
   case OPCODE_SWZ:
      print_SWZ(op);
      break;
   /* Internal opcodes (RSW, MSK, REL) emitted by the compile step. */
   case RSW:
      print_RSW(op);
      break;
   case MSK:
      print_MSK(op);
      break;
   case REL:
      print_ALU(op);
      break;
   default:
      _mesa_problem(NULL, "Bad opcode in _tnl_disassem_vba_insn()");
   }
}
 713
 714
/* Table of instruction implementations, MAX_OPCODE+3 entries long.
 * The final three entries are the internal RSW, MSK and REL operations;
 * entries without an implementation in this file are do_NOP.
 * Presumably indexed by instruction opcode, so the order must match the
 * opcode numbering -- TODO confirm against the executor and the
 * prog_opcode enum, neither of which is visible here.
 */
static void (* const opcode_func[MAX_OPCODE+3])(struct arb_vp_machine *, union instruction) =
{
   do_ABS,
   do_ADD,
   do_NOP,/*ARA*/
   do_NOP,/*ARL*/
   do_NOP,/*ARL_NV*/
   do_NOP,/*ARR*/
   do_NOP,/*BRA*/
   do_NOP,/*CAL*/
   do_NOP,/*CMP*/
   do_NOP,/*COS*/
   do_NOP,/*DDX*/
   do_NOP,/*DDY*/
   do_DP3,
   do_DP4,
   do_DPH,
   do_DST,
   do_NOP,/* unlabelled in the original; presumably END -- TODO confirm */
   do_EX2,
   do_EXP,
   do_FLR,
   do_FRC,
   do_NOP,/*KIL*/
   do_NOP,/*KIL_NV*/
   do_LG2,
   do_LIT,
   do_LOG,
   do_NOP,/*LRP*/
   do_NOP,/*MAD*/
   do_MAX,
   do_MIN,
   do_MOV,
   do_MUL,
   do_NOP,/*PK2H*/
   do_NOP,/*PK2US*/
   do_NOP,/*PK4B*/
   do_NOP,/*PK4UB*/
   do_POW,
   do_NOP,/*POPA*/
   do_PRT,
   do_NOP,/*PUSHA*/
   do_NOP,/*RCC*/
   do_RCP,/*RCP*/
   do_NOP,/*RET*/
   do_NOP,/*RFL*/
   do_RSQ,
   do_NOP,/*SCS*/
   do_NOP,/*SEQ*/
   do_NOP,/*SFL*/
   do_SGE,
   do_NOP,/*SGT*/
   do_NOP,/*SIN*/
   do_NOP,/*SLE*/
   do_SLT,
   do_NOP,/*SNE*/
   do_NOP,/*SSG*/
   do_NOP,/*STR*/
   do_SUB,
   do_SWZ,/*SWZ*/
   do_NOP,/*TEX*/
   do_NOP,/*TXB*/
   do_NOP,/*TXD*/
   do_NOP,/*TXL*/
   do_NOP,/*TXP*/
   do_NOP,/*TXP_NV*/
   do_NOP,/*UP2H*/
   do_NOP,/*UP2US*/
   do_NOP,/*UP4B*/
   do_NOP,/*UP4UB*/
   do_NOP,/*X2D*/
   do_XPD,
   do_RSW,
   do_MSK,
   do_REL,
};
 791
 792 static union instruction *cvp_next_instruction( struct compilation *cp )
 793 {
 794    union instruction *op = cp->csr++;
 795    _mesa_bzero(op, sizeof(*op));
 796    return op;
 797 }
 798
 799 static struct reg cvp_make_reg( GLuint file, GLuint idx )
 800 {
 801    struct reg reg;
 802    reg.file = file;
 803    reg.idx = idx;
 804    return reg;
 805 }
 806
 807 static struct reg cvp_emit_rel( struct compilation *cp,
 808                                 struct reg reg,
 809                                 struct reg tmpreg )
 810 {
 811    union instruction *op = cvp_next_instruction(cp);
 812    op->alu.opcode = REL;
 813    op->alu.file0 = reg.file;
 814    op->alu.idx0 = reg.idx;
 815    op->alu.dst = tmpreg.idx;
 816    return tmpreg;
 817 }
 818
 819
 820 static struct reg cvp_load_reg( struct compilation *cp,
 821                                 GLuint file,
 822                                 GLuint index,
 823                                 GLuint rel,
 824                                 GLuint tmpidx )
 825 {
 826    struct reg tmpreg = cvp_make_reg(FILE_REG, tmpidx);
 827    struct reg reg;
 828
 829    switch (file) {
 830    case PROGRAM_TEMPORARY:
 831       return cvp_make_reg(FILE_REG, REG_TMP0 + index);
 832
 833    case PROGRAM_INPUT:
 834       return cvp_make_reg(FILE_REG, REG_IN0 + index);
 835
 836    case PROGRAM_OUTPUT:
 837       return cvp_make_reg(FILE_REG, REG_OUT0 + index);
 838
 839       /* These two aren't populated by the parser?
 840        */
 841    case PROGRAM_LOCAL_PARAM:
 842       reg = cvp_make_reg(FILE_LOCAL_PARAM, index);
 843       if (rel)
 844          return cvp_emit_rel(cp, reg, tmpreg);
 845       else
 846          return reg;
 847
 848    case PROGRAM_ENV_PARAM:
 849       reg = cvp_make_reg(FILE_ENV_PARAM, index);
 850       if (rel)
 851          return cvp_emit_rel(cp, reg, tmpreg);
 852       else
 853          return reg;
 854
 855    case PROGRAM_STATE_VAR:
 856       reg = cvp_make_reg(FILE_STATE_PARAM, index);
 857       if (rel)
 858          return cvp_emit_rel(cp, reg, tmpreg);
 859       else
 860          return reg;
 861
 862       /* Invalid values:
 863        */
 864    case PROGRAM_WRITE_ONLY:
 865    case PROGRAM_ADDRESS:
 866    default:
 867       _mesa_problem(NULL, "Invalid register file %d in cvp_load_reg()");
 868       assert(0);
 869       return tmpreg;            /* can't happen */
 870    }
 871 }
 872
 873 static struct reg cvp_emit_arg( struct compilation *cp,
 874                                 const struct prog_src_register *src,
 875                                 GLuint arg )
 876 {
 877    struct reg reg = cvp_load_reg( cp, src->File, src->Index, src->RelAddr, arg );
 878    union instruction rsw, noop;
 879
 880    /* Emit any necessary swizzling.
 881     */
 882    _mesa_bzero(&rsw, sizeof(rsw));
 883    rsw.rsw.neg = src->NegateBase ? WRITEMASK_XYZW : 0;
 884
 885    /* we're expecting 2-bit swizzles below... */
 886 #if 1 /* XXX THESE ASSERTIONS CURRENTLY FAIL DURING GLEAN TESTS! */
 887 /* hopefully no longer happens? */
 888    ASSERT(GET_SWZ(src->Swizzle, 0) < 4);
 889    ASSERT(GET_SWZ(src->Swizzle, 1) < 4);
 890    ASSERT(GET_SWZ(src->Swizzle, 2) < 4);
 891    ASSERT(GET_SWZ(src->Swizzle, 3) < 4);
 892 #endif
 893    rsw.rsw.swz = src->Swizzle;
 894
 895    _mesa_bzero(&noop, sizeof(noop));
 896    noop.rsw.neg = 0;
 897    noop.rsw.swz = SWIZZLE_NOOP;
 898
 899    if (_mesa_memcmp(&rsw, &noop, sizeof(rsw)) !=0) {
 900       union instruction *op = cvp_next_instruction(cp);
 901       struct reg rsw_reg = cvp_make_reg(FILE_REG, REG_ARG0 + arg);
 902       *op = rsw;
 903       op->rsw.opcode = RSW;
 904       op->rsw.file0 = reg.file;
 905       op->rsw.idx0 = reg.idx;
 906       op->rsw.dst = rsw_reg.idx;
 907       return rsw_reg;
 908    }
 909    else
 910       return reg;
 911 }
 912
 913 static GLuint cvp_choose_result( struct compilation *cp,
 914                                  const struct prog_dst_register *dst,
 915                                  union instruction *fixup )
 916 {
 917    GLuint mask = dst->WriteMask;
 918    GLuint idx;
 919
 920    switch (dst->File) {
 921    case PROGRAM_TEMPORARY:
 922       idx = REG_TMP0 + dst->Index;
 923       break;
 924    case PROGRAM_OUTPUT:
 925       idx = REG_OUT0 + dst->Index;
 926       break;
 927    default:
 928       assert(0);
 929       return REG_RES;           /* can't happen */
 930    }
 931
 932    /* Optimization: When writing (with a writemask) to an undefined
 933     * value for the first time, the writemask may be ignored.
 934     */
 935    if (mask != WRITEMASK_XYZW && (cp->reg_active & (1 << idx))) {
 936       fixup->msk.opcode = MSK;
 937       fixup->msk.dst = idx;
 938       fixup->msk.file = FILE_REG;
 939       fixup->msk.idx = REG_RES;
 940       fixup->msk.mask = mask;
 941       cp->reg_active |= 1 << idx;
 942       return REG_RES;
 943    }
 944    else {
 945       _mesa_bzero(fixup, sizeof(*fixup));
 946       cp->reg_active |= 1 << idx;
 947       return idx;
 948    }
 949 }
 950
 951
 952 static void cvp_emit_inst( struct compilation *cp,
 953                            const struct prog_instruction *inst )
 954 {
 955    union instruction *op;
 956    union instruction fixup;
 957    struct reg reg[3];
 958    GLuint result, nr_args, i;
 959
 960    /* Need to handle SWZ, ARL specially.
 961     */
 962    switch (inst->Opcode) {
 963       /* Split into mul and add:
 964        */
 965    case OPCODE_MAD:
 966       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
 967       for (i = 0; i < 3; i++)
 968          reg[i] = cvp_emit_arg( cp, &inst->SrcReg[i], REG_ARG0+i );
 969
 970       op = cvp_next_instruction(cp);
 971       op->alu.opcode = OPCODE_MUL;
 972       op->alu.file0 = reg[0].file;
 973       op->alu.idx0 = reg[0].idx;
 974       op->alu.file1 = reg[1].file;
 975       op->alu.idx1 = reg[1].idx;
 976       op->alu.dst = REG_ARG0;
 977
 978       op = cvp_next_instruction(cp);
 979       op->alu.opcode = OPCODE_ADD;
 980       op->alu.file0 = FILE_REG;
 981       op->alu.idx0 = REG_ARG0;
 982       op->alu.file1 = reg[2].file;
 983       op->alu.idx1 = reg[2].idx;
 984       op->alu.dst = result;
 985
 986       if (result == REG_RES) {
 987          op = cvp_next_instruction(cp);
 988          *op = fixup;
 989       }
 990       break;
 991
 992    case OPCODE_ARL:
 993       reg[0] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 );
 994
 995       op = cvp_next_instruction(cp);
 996       op->alu.opcode = OPCODE_FLR;
 997       op->alu.dst = REG_ADDR;
 998       op->alu.file0 = reg[0].file;
 999       op->alu.idx0 = reg[0].idx;
1000       break;
1001
1002    case OPCODE_END:
1003       break;
1004
1005    case OPCODE_SWZ:
1006       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
1007       reg[0] = cvp_load_reg( cp, inst->SrcReg[0].File,
1008                         inst->SrcReg[0].Index, inst->SrcReg[0].RelAddr, REG_ARG0 );
1009       op = cvp_next_instruction(cp);
1010       op->rsw.opcode = inst->Opcode;
1011       op->rsw.file0 = reg[0].file;
1012       op->rsw.idx0 = reg[0].idx;
1013       op->rsw.dst = result;
1014       op->rsw.swz = inst->SrcReg[0].Swizzle;
1015       op->rsw.neg = inst->SrcReg[0].NegateBase;
1016
1017       if (result == REG_RES) {
1018          op = cvp_next_instruction(cp);
1019          *op = fixup;
1020       }
1021       break;
1022
1023    default:
1024       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
1025       nr_args = _mesa_num_inst_src_regs(inst->Opcode);
1026       for (i = 0; i < nr_args; i++)
1027          reg[i] = cvp_emit_arg( cp, &inst->SrcReg[i], REG_ARG0 + i );
1028
1029       op = cvp_next_instruction(cp);
1030       op->alu.opcode = inst->Opcode;
1031       op->alu.file0 = reg[0].file;
1032       op->alu.idx0 = reg[0].idx;
1033       op->alu.file1 = reg[1].file;
1034       op->alu.idx1 = reg[1].idx;
1035       op->alu.dst = result;
1036
1037       if (result == REG_RES) {
1038          op = cvp_next_instruction(cp);
1039          *op = fixup;
1040       }
1041       break;
1042    }
1043 }
1044
1045 static void free_tnl_data( struct gl_vertex_program *program  )
1046 {
1047    struct tnl_compiled_program *p = (struct tnl_compiled_program *) program->TnlData;
1048    if (p->compiled_func)
1049       _mesa_free((void *)p->compiled_func);
1050    _mesa_free(p);
1051    program->TnlData = NULL;
1052 }
1053
1054 static void compile_vertex_program( struct gl_vertex_program *program,
1055                                     GLboolean try_codegen )
1056 {
1057    struct compilation cp;
1058    struct tnl_compiled_program *p = CALLOC_STRUCT(tnl_compiled_program);
1059    GLint i;
1060
1061    if (program->TnlData)
1062       free_tnl_data( program );
1063
1064    program->TnlData = p;
1065
1066    /* Initialize cp.  Note that ctx and VB aren't used in compilation
1067     * so we don't have to worry about statechanges:
1068     */
1069    _mesa_memset(&cp, 0, sizeof(cp));
1070    cp.csr = p->instructions;
1071
1072    /* Compile instructions:
1073     */
1074    for (i = 0; i < program->Base.NumInstructions; i++) {
1075       cvp_emit_inst(&cp, &program->Base.Instructions[i]);
1076    }
1077
1078    /* Finish up:
1079     */
1080    p->nr_instructions = cp.csr - p->instructions;
1081
1082    /* Print/disassemble:
1083     */
1084    if (DISASSEM) {
1085       for (i = 0; i < p->nr_instructions; i++) {
1086          _tnl_disassem_vba_insn(p->instructions[i]);
1087       }
1088       _mesa_printf("\n\n");
1089    }
1090
1091 #ifdef USE_SSE_ASM
1092    if (try_codegen)
1093       _tnl_sse_codegen_vertex_program(p);
1094 #endif
1095
1096 }
1097
1098
1099
1100
1101 /* ----------------------------------------------------------------------
1102  * Execution
1103  */
1104 static void userclip( GLcontext *ctx,
1105                       GLvector4f *clip,
1106                       GLubyte *clipmask,
1107                       GLubyte *clipormask,
1108                       GLubyte *clipandmask )
1109 {
1110    GLuint p;
1111
1112    for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
1113       if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
1114          GLuint nr, i;
1115          const GLfloat a = ctx->Transform._ClipUserPlane[p][0];
1116          const GLfloat b = ctx->Transform._ClipUserPlane[p][1];
1117          const GLfloat c = ctx->Transform._ClipUserPlane[p][2];
1118          const GLfloat d = ctx->Transform._ClipUserPlane[p][3];
1119          GLfloat *coord = (GLfloat *)clip->data;
1120          GLuint stride = clip->stride;
1121          GLuint count = clip->count;
1122
1123          for (nr = 0, i = 0 ; i < count ; i++) {
1124             GLfloat dp = (coord[0] * a +
1125                           coord[1] * b +
1126                           coord[2] * c +
1127                           coord[3] * d);
1128
1129             if (dp < 0) {
1130                nr++;
1131                clipmask[i] |= CLIP_USER_BIT;
1132             }
1133
1134             STRIDE_F(coord, stride);
1135          }
1136
1137          if (nr > 0) {
1138             *clipormask |= CLIP_USER_BIT;
1139             if (nr == count) {
1140                *clipandmask |= CLIP_USER_BIT;
1141                return;
1142             }
1143          }
1144       }
1145    }
1146 }
1147
1148
1149 static GLboolean
1150 do_ndc_cliptest(GLcontext *ctx, struct arb_vp_machine *m)
1151 {
1152    TNLcontext *tnl = TNL_CONTEXT(ctx);
1153    struct vertex_buffer *VB = m->VB;
1154
1155    /* Cliptest and perspective divide.  Clip functions must clear
1156     * the clipmask.
1157     */
1158    m->ormask = 0;
1159    m->andmask = CLIP_FRUSTUM_BITS;
1160
1161    if (tnl->NeedNdcCoords) {
1162       VB->NdcPtr =
1163          _mesa_clip_tab[VB->ClipPtr->size]( VB->ClipPtr,
1164                                             &m->ndcCoords,
1165                                             m->clipmask,
1166                                             &m->ormask,
1167                                             &m->andmask );
1168    }
1169    else {
1170       VB->NdcPtr = NULL;
1171       _mesa_clip_np_tab[VB->ClipPtr->size]( VB->ClipPtr,
1172                                             NULL,
1173                                             m->clipmask,
1174                                             &m->ormask,
1175                                             &m->andmask );
1176    }
1177
1178    if (m->andmask) {
1179       /* All vertices are outside the frustum */
1180       return GL_FALSE;
1181    }
1182
1183    /* Test userclip planes.  This contributes to VB->ClipMask.
1184     */
1185    if (ctx->Transform.ClipPlanesEnabled && (!ctx->VertexProgram._Enabled ||
1186       ctx->VertexProgram.Current->IsPositionInvariant)) {
1187       userclip( ctx,
1188                 VB->ClipPtr,
1189                 m->clipmask,
1190                 &m->ormask,
1191                 &m->andmask );
1192
1193       if (m->andmask) {
1194          return GL_FALSE;
1195       }
1196    }
1197
1198    VB->ClipAndMask = m->andmask;
1199    VB->ClipOrMask = m->ormask;
1200    VB->ClipMask = m->clipmask;
1201
1202    return GL_TRUE;
1203 }
1204
1205
1206 static INLINE void call_func( struct tnl_compiled_program *p,
1207                               struct arb_vp_machine *m )
1208 {
1209    p->compiled_func(m);
1210 }
1211
1212 /**
1213  * Execute the given vertex program.
1214  *
1215  * TODO: Integrate the t_vertex.c code here, to build machine vertices
1216  * directly at this point.
1217  *
1218  * TODO: Eliminate the VB struct entirely and just use
1219  * struct arb_vertex_machine.
1220  */
1221 static GLboolean
1222 run_arb_vertex_program(GLcontext *ctx, struct tnl_pipeline_stage *stage)
1223 {
1224    const struct gl_vertex_program *program;
1225    struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
1226    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
1227    struct tnl_compiled_program *p;
1228    GLuint i, j;
1229    GLbitfield outputs;
1230
1231    if (ctx->ShaderObjects._VertexShaderPresent)
1232       return GL_TRUE;
1233
1234    program = ctx->VertexProgram._Enabled ? ctx->VertexProgram.Current : NULL;
1235    if (!program && ctx->_MaintainTnlProgram) {
1236       program = ctx->_TnlProgram;
1237    }
1238    if (!program || program->IsNVProgram)
1239       return GL_TRUE;
1240
1241    if (program->Base.Parameters) {
1242       _mesa_load_state_parameters(ctx, program->Base.Parameters);
1243    }
1244
1245    p = (struct tnl_compiled_program *)program->TnlData;
1246    assert(p);
1247
1248
1249    m->nr_inputs = m->nr_outputs = 0;
1250
1251    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
1252       if (program->Base.InputsRead & (1<<i) ||
1253           (i == VERT_ATTRIB_POS && program->IsPositionInvariant)) {
1254          GLuint j = m->nr_inputs++;
1255          m->input[j].idx = i;
1256          m->input[j].data = (GLfloat *)m->VB->AttribPtr[i]->data;
1257          m->input[j].stride = m->VB->AttribPtr[i]->stride;
1258          m->input[j].size = m->VB->AttribPtr[i]->size;
1259          ASSIGN_4V(m->File[0][REG_IN0 + i], 0, 0, 0, 1);
1260       }
1261    }
1262
1263    for (i = 0; i < VERT_RESULT_MAX; i++) {
1264       if (program->Base.OutputsWritten & (1 << i) ||
1265           (i == VERT_RESULT_HPOS && program->IsPositionInvariant)) {
1266          GLuint j = m->nr_outputs++;
1267          m->output[j].idx = i;
1268          m->output[j].data = (GLfloat *)m->attribs[i].data;
1269       }
1270    }
1271
1272
1273    /* Run the actual program:
1274     */
1275    for (m->vtx_nr = 0; m->vtx_nr < VB->Count; m->vtx_nr++) {
1276       for (j = 0; j < m->nr_inputs; j++) {
1277          GLuint idx = REG_IN0 + m->input[j].idx;
1278          switch (m->input[j].size) {
1279          case 4: m->File[0][idx][3] = m->input[j].data[3];
1280          case 3: m->File[0][idx][2] = m->input[j].data[2];
1281          case 2: m->File[0][idx][1] = m->input[j].data[1];
1282          case 1: m->File[0][idx][0] = m->input[j].data[0];
1283          }
1284
1285          STRIDE_F(m->input[j].data, m->input[j].stride);
1286       }
1287
1288
1289       if (p->compiled_func) {
1290          call_func( p, m );
1291       }
1292       else {
1293          GLint j;
1294          for (j = 0; j < p->nr_instructions; j++) {
1295             union instruction inst = p->instructions[j];
1296             opcode_func[inst.alu.opcode]( m, inst );
1297          }
1298       }
1299
1300       /* If the program is position invariant, multiply the input position
1301        * by the MVP matrix and store in the vertex position result register.
1302        */
1303       if (program->IsPositionInvariant) {
1304          TRANSFORM_POINT( m->File[0][REG_OUT0+0],
1305                           ctx->_ModelProjectMatrix.m,
1306                           m->File[0][REG_IN0+0]);
1307       }
1308
1309       for (j = 0; j < m->nr_outputs; j++) {
1310          GLuint idx = REG_OUT0 + m->output[j].idx;
1311          m->output[j].data[0] = m->File[0][idx][0];
1312          m->output[j].data[1] = m->File[0][idx][1];
1313          m->output[j].data[2] = m->File[0][idx][2];
1314          m->output[j].data[3] = m->File[0][idx][3];
1315          m->output[j].data += 4;
1316       }
1317
1318    }
1319
1320    /* Setup the VB pointers so that the next pipeline stages get
1321     * their data from the right place (the program output arrays).
1322     *
1323     * TODO: 1) Have tnl use these RESULT values for outputs rather
1324     * than trying to shoe-horn inputs and outputs into one set of
1325     * values.
1326     *
1327     * TODO: 2) Integrate t_vertex.c so that we just go straight ahead
1328     * and build machine vertices here.
1329     */
1330    VB->ClipPtr = &m->attribs[VERT_RESULT_HPOS];
1331    VB->ClipPtr->count = VB->Count;
1332
1333    /* XXX There seems to be confusion between using the VERT_ATTRIB_*
1334     * values vs _TNL_ATTRIB_* tokens here:
1335     */
1336    outputs = program->Base.OutputsWritten;
1337    if (program->IsPositionInvariant)
1338       outputs |= (1<<VERT_RESULT_HPOS);
1339
1340    if (outputs & (1<<VERT_RESULT_COL0)) {
1341       VB->ColorPtr[0] =
1342       VB->AttribPtr[VERT_ATTRIB_COLOR0] = &m->attribs[VERT_RESULT_COL0];
1343    }
1344
1345    if (outputs & (1<<VERT_RESULT_BFC0)) {
1346       VB->ColorPtr[1] = &m->attribs[VERT_RESULT_BFC0];
1347    }
1348
1349    if (outputs & (1<<VERT_RESULT_COL1)) {
1350       VB->SecondaryColorPtr[0] =
1351       VB->AttribPtr[VERT_ATTRIB_COLOR1] = &m->attribs[VERT_RESULT_COL1];
1352    }
1353
1354    if (outputs & (1<<VERT_RESULT_BFC1)) {
1355       VB->SecondaryColorPtr[1] = &m->attribs[VERT_RESULT_BFC1];
1356    }
1357
1358    if (outputs & (1<<VERT_RESULT_FOGC)) {
1359       VB->FogCoordPtr =
1360       VB->AttribPtr[VERT_ATTRIB_FOG] = &m->attribs[VERT_RESULT_FOGC];
1361    }
1362
1363    if (outputs & (1<<VERT_RESULT_PSIZ)) {
1364       VB->AttribPtr[_TNL_ATTRIB_POINTSIZE] = &m->attribs[VERT_RESULT_PSIZ];
1365    }
1366
1367    for (i = 0; i < ctx->Const.MaxTextureCoordUnits; i++) {
1368       if (outputs & (1<<(VERT_RESULT_TEX0+i))) {
1369          VB->TexCoordPtr[i] =
1370          VB->AttribPtr[VERT_ATTRIB_TEX0+i] = &m->attribs[VERT_RESULT_TEX0 + i];
1371       }
1372    }
1373
1374 #if 0
1375    for (i = 0; i < VB->Count; i++) {
1376       printf("Out %d: %f %f %f %f %f %f %f %f\n", i,
1377              VEC_ELT(VB->ClipPtr, GLfloat, i)[0],
1378              VEC_ELT(VB->ClipPtr, GLfloat, i)[1],
1379              VEC_ELT(VB->ClipPtr, GLfloat, i)[2],
1380              VEC_ELT(VB->ClipPtr, GLfloat, i)[3],
1381              VEC_ELT(VB->AttribPtr[VERT_ATTRIB_TEX0], GLfloat, i)[0],
1382              VEC_ELT(VB->AttribPtr[VERT_ATTRIB_TEX0], GLfloat, i)[1],
1383              VEC_ELT(VB->AttribPtr[VERT_ATTRIB_TEX0], GLfloat, i)[2],
1384              VEC_ELT(VB->AttribPtr[VERT_ATTRIB_TEX0], GLfloat, i)[3]);
1385    }
1386 #endif
1387
1388    /* Perform NDC and cliptest operations:
1389     */
1390    return do_ndc_cliptest(ctx, m);
1391 }
1392
1393
1394 static void
1395 validate_vertex_program( GLcontext *ctx, struct tnl_pipeline_stage *stage )
1396 {
1397    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
1398    struct gl_vertex_program *program;
1399
1400    if (ctx->ShaderObjects._VertexShaderPresent)
1401       return;
1402
1403    program = (ctx->VertexProgram._Enabled ? ctx->VertexProgram.Current : 0);
1404    if (!program && ctx->_MaintainTnlProgram) {
1405       program = ctx->_TnlProgram;
1406    }
1407
1408    if (program) {
1409       if (!program->TnlData)
1410          compile_vertex_program( program, m->try_codegen );
1411
1412       /* Grab the state GL state and put into registers:
1413        */
1414       m->File[FILE_LOCAL_PARAM] = program->Base.LocalParams;
1415       m->File[FILE_ENV_PARAM] = ctx->VertexProgram.Parameters;
1416       /* GL_NV_vertex_programs can't reference GL state */
1417       if (program->Base.Parameters)
1418          m->File[FILE_STATE_PARAM] = program->Base.Parameters->ParameterValues;
1419       else
1420          m->File[FILE_STATE_PARAM] = NULL;
1421    }
1422 }
1423
1424
1425
1426
1427
1428
1429
1430 /**
1431  * Called the first time stage->run is called.  In effect, don't
1432  * allocate data until the first time the stage is run.
1433  */
1434 static GLboolean init_vertex_program( GLcontext *ctx,
1435                                       struct tnl_pipeline_stage *stage )
1436 {
1437    TNLcontext *tnl = TNL_CONTEXT(ctx);
1438    struct vertex_buffer *VB = &(tnl->vb);
1439    struct arb_vp_machine *m;
1440    const GLuint size = VB->Size;
1441    GLuint i;
1442
1443    stage->privatePtr = _mesa_calloc(sizeof(*m));
1444    m = ARB_VP_MACHINE(stage);
1445    if (!m)
1446       return GL_FALSE;
1447
1448    /* arb_vertex_machine struct should subsume the VB:
1449     */
1450    m->VB = VB;
1451
1452    m->File[0] = (GLfloat(*)[4])ALIGN_MALLOC(REG_MAX * sizeof(GLfloat) * 4, 16);
1453
1454    /* Initialize regs where necessary:
1455     */
1456    ASSIGN_4V(m->File[0][REG_ID], 0, 0, 0, 1);
1457    ASSIGN_4V(m->File[0][REG_ONES], 1, 1, 1, 1);
1458    ASSIGN_4V(m->File[0][REG_SWZ], 1, -1, 0, 0);
1459    ASSIGN_4V(m->File[0][REG_NEG], -1, -1, -1, -1);
1460    ASSIGN_4V(m->File[0][REG_LIT], 1, 0, 0, 1);
1461    ASSIGN_4V(m->File[0][REG_LIT2], 1, .5, .2, 1); /* debug value */
1462
1463    if (_mesa_getenv("MESA_EXPERIMENTAL"))
1464       m->try_codegen = GL_TRUE;
1465
1466    /* Allocate arrays of vertex output values */
1467    for (i = 0; i < VERT_RESULT_MAX; i++) {
1468       _mesa_vector4f_alloc( &m->attribs[i], 0, size, 32 );
1469       m->attribs[i].size = 4;
1470    }
1471
1472    /* a few other misc allocations */
1473    _mesa_vector4f_alloc( &m->ndcCoords, 0, size, 32 );
1474    m->clipmask = (GLubyte *) ALIGN_MALLOC(sizeof(GLubyte)*size, 32 );
1475
1476    if (ctx->_MaintainTnlProgram)
1477       _mesa_allow_light_in_model( ctx, GL_FALSE );
1478
1479    m->fpucntl_rnd_neg = RND_NEG_FPU; /* const value */
1480    m->fpucntl_restore = RESTORE_FPU; /* const value */
1481
1482    return GL_TRUE;
1483 }
1484
1485
1486
1487
1488 /**
1489  * Destructor for this pipeline stage.
1490  */
1491 static void dtr( struct tnl_pipeline_stage *stage )
1492 {
1493    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
1494
1495    if (m) {
1496       GLuint i;
1497
1498       /* free the vertex program result arrays */
1499       for (i = 0; i < VERT_RESULT_MAX; i++)
1500          _mesa_vector4f_free( &m->attribs[i] );
1501
1502       /* free misc arrays */
1503       _mesa_vector4f_free( &m->ndcCoords );
1504       ALIGN_FREE( m->clipmask );
1505       ALIGN_FREE( m->File[0] );
1506
1507       _mesa_free( m );
1508       stage->privatePtr = NULL;
1509    }
1510 }
1511
/**
 * Public description of this pipeline stage.
 *
 * The initializer is positional; each entry is annotated with the slot it
 * fills.  All hook functions are the static functions defined earlier in
 * this file.
 */
const struct tnl_pipeline_stage _tnl_arb_vertex_program_stage =
{
   "arb-vertex-program",        /* stage name */
   NULL,                        /* private_data */
   init_vertex_program,         /* create */
   dtr,                         /* destroy */
   validate_vertex_program,     /* validate */
   run_arb_vertex_program       /* run */
};
1524
1525
1526 /**
1527  * Called via ctx->Driver.ProgramStringNotify() after a new vertex program
1528  * string has been parsed.
1529  */
1530 void
1531 _tnl_program_string(GLcontext *ctx, GLenum target, struct gl_program *program)
1532 {
1533    if (target == GL_VERTEX_PROGRAM_ARB) {
1534       /* free any existing tnl data hanging off the program */
1535       struct gl_vertex_program *vprog = (struct gl_vertex_program *) program;
1536       if (vprog->TnlData) {
1537          free_tnl_data(vprog);
1538       }
1539    }
1540 }