src/mesa/tnl/t_vb_arbprogram.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.3
   4  *
   5  * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file t_vb_arbprogram.c
  27  * Compile vertex programs to an intermediate representation.
  28  * Execute vertex programs over a buffer of vertices.
  29  * \author Keith Whitwell, Brian Paul
  30  */
  31
  32 #include "glheader.h"
  33 #include "context.h"
  34 #include "imports.h"
  35 #include "macros.h"
  36 #include "mtypes.h"
  37 #include "arbprogparse.h"
  38 #include "program.h"
  39 #include "math/m_matrix.h"
  40 #include "math/m_translate.h"
  41 #include "t_context.h"
  42 #include "t_pipeline.h"
  43 #include "t_vp_build.h"
  44 #include "t_vb_arbprogram.h"
  45
  46 #define DISASSEM 0   /* non-zero: compile_vertex_program() prints every compiled instruction via _tnl_disassem_vba_insn() */
  47
  48 /*--------------------------------------------------------------------------- */
  49
  50 struct opcode_info {   /* one row of the opcode_info[] table, looked up by opcode number */
  51    GLuint nr_args;   /* number of source operands; cvp_emit_inst() loads this many and print_ALU() prints a second one when > 1 */
  52    const char *string;   /* mnemonic printed by the disassembly helpers */
  53    void (*print)( union instruction , const struct opcode_info * );   /* disassembly callback for this opcode */
  54 };
  55
  56 struct compilation {   /* transient state while translating one vertex program */
  57    GLuint reg_active;   /* bit (1 << idx) is set by cvp_choose_result() once register idx has been picked as a destination */
  58    union instruction *csr;   /* cursor: next free output slot, advanced by cvp_next_instruction() */
  59 };
  60
  61
  62 #define ARB_VP_MACHINE(stage) ((struct arb_vp_machine *)(stage->privatePtr))   /* casts stage->privatePtr to the machine struct; `stage` is not parenthesised, so pass a plain identifier */
  63
  64 #define PUFF(x) ((x)[1] = (x)[2] = (x)[3] = (x)[0])   /* replicate x[0] into x[1..3]; evaluates x four times */
  65
  66
  67
  68 /* Lower precision functions for the EXP, LOG and LIT opcodes.  The
  69  * LOG2() implementation is probably not accurate enough, and the
  70  * attempted optimization for Exp2 is definitely not accurate
  71  * enough - it discards all of t's fractional bits!
  72  */
  73 static GLfloat RoughApproxLog2(GLfloat t)
  74 {
  75    return LOG2(t);
  76 }
  77
  78 static GLfloat RoughApproxExp2(GLfloat t)
  79 {
  80 #if 0
  81    fi_type fi;
  82    fi.i = (GLint) t;
  83    fi.i = (fi.i << 23) + 0x3f800000;
  84    return fi.f;
  85 #else
  86    return (GLfloat) _mesa_pow(2.0, t);
  87 #endif
  88 }
  89
  90 static GLfloat RoughApproxPower(GLfloat x, GLfloat y)
  91 {
  92    return RoughApproxExp2(y * RoughApproxLog2(x));
  93 }
  94
  95
  96 /* Higher precision functions for the EX2, LG2 and POW opcodes:
  97  */
  98 static GLfloat ApproxLog2(GLfloat t)
  99 {
 100    return (GLfloat) (log(t) * 1.442695F);
 101 }
 102
 103 static GLfloat ApproxExp2(GLfloat t)
 104 {
 105    return (GLfloat) _mesa_pow(2.0, t);
 106 }
 107
 108 static GLfloat ApproxPower(GLfloat x, GLfloat y)
 109 {
 110    return (GLfloat) _mesa_pow(x, y);
 111 }
 112
 113
 114
 115
 116
 117 /**
 118  * Perform a reduced swizzle:
 119  */
 120 static void do_RSW( struct arb_vp_machine *m, union instruction op )
 121 {
 122    GLfloat *result = m->File[0][op.rsw.dst];
 123    const GLfloat *arg0 = m->File[op.rsw.file0][op.rsw.idx0];
 124    GLuint swz = op.rsw.swz;
 125    GLuint neg = op.rsw.neg;
 126
 127    result[0] = arg0[GET_RSW(swz, 0)];
 128    result[1] = arg0[GET_RSW(swz, 1)];
 129    result[2] = arg0[GET_RSW(swz, 2)];
 130    result[3] = arg0[GET_RSW(swz, 3)];
 131
 132    if (neg) {
 133       if (neg & 0x1) result[0] = -result[0];
 134       if (neg & 0x2) result[1] = -result[1];
 135       if (neg & 0x4) result[2] = -result[2];
 136       if (neg & 0x8) result[3] = -result[3];
 137    }
 138 }
 139
 140 /* Used to implement write masking.  To make things easier for the sse
 141  * generator I've gone back to a 1 argument version of this function
 142  * (dst.msk = arg), rather than the semantically cleaner (dst = SEL
 143  * arg0, arg1, msk)
 144  *
 145  * That means this is the only instruction which doesn't write a full
 146  * 4 dwords out.  This would make such a program harder to analyse,
 147  * but it looks like analysis is going to take place on a higher level
 148  * anyway.
 149  */
 150 static void do_MSK( struct arb_vp_machine *m, union instruction op )
 151 {
 152    GLfloat *dst = m->File[0][op.msk.dst];
 153    const GLfloat *arg = m->File[op.msk.file][op.msk.idx];
 154
 155    if (op.msk.mask & 0x1) dst[0] = arg[0];
 156    if (op.msk.mask & 0x2) dst[1] = arg[1];
 157    if (op.msk.mask & 0x4) dst[2] = arg[2];
 158    if (op.msk.mask & 0x8) dst[3] = arg[3];
 159 }
 160
 161
 162 static void do_PRT( struct arb_vp_machine *m, union instruction op )
 163 {
 164    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 165
 166    _mesa_printf("%d: %f %f %f %f\n", m->vtx_nr,
 167                 arg0[0], arg0[1], arg0[2], arg0[3]);
 168 }
 169
 170
 171 /**
 172  * The traditional ALU and texturing instructions.  All operate on
 173  * internal registers and ignore write masks and swizzling issues.
 174  */
 175
 176 static void do_ABS( struct arb_vp_machine *m, union instruction op )
 177 {
 178    GLfloat *result = m->File[0][op.alu.dst];
 179    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 180
 181    result[0] = (arg0[0] < 0.0) ? -arg0[0] : arg0[0];
 182    result[1] = (arg0[1] < 0.0) ? -arg0[1] : arg0[1];
 183    result[2] = (arg0[2] < 0.0) ? -arg0[2] : arg0[2];
 184    result[3] = (arg0[3] < 0.0) ? -arg0[3] : arg0[3];
 185 }
 186
 187 static void do_ADD( struct arb_vp_machine *m, union instruction op )
 188 {
 189    GLfloat *result = m->File[0][op.alu.dst];
 190    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 191    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 192
 193    result[0] = arg0[0] + arg1[0];
 194    result[1] = arg0[1] + arg1[1];
 195    result[2] = arg0[2] + arg1[2];
 196    result[3] = arg0[3] + arg1[3];
 197 }
 198
 199
 200 static void do_DP3( struct arb_vp_machine *m, union instruction op )
 201 {
 202    GLfloat *result = m->File[0][op.alu.dst];
 203    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 204    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 205
 206    result[0] = (arg0[0] * arg1[0] +
 207                 arg0[1] * arg1[1] +
 208                 arg0[2] * arg1[2]);
 209
 210    PUFF(result);
 211 }
 212
 213
 214
 215 static void do_DP4( struct arb_vp_machine *m, union instruction op )
 216 {
 217    GLfloat *result = m->File[0][op.alu.dst];
 218    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 219    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 220
 221    result[0] = (arg0[0] * arg1[0] +
 222                 arg0[1] * arg1[1] +
 223                 arg0[2] * arg1[2] +
 224                 arg0[3] * arg1[3]);
 225
 226    PUFF(result);
 227 }
 228
 229 static void do_DPH( struct arb_vp_machine *m, union instruction op )
 230 {
 231    GLfloat *result = m->File[0][op.alu.dst];
 232    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 233    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 234
 235    result[0] = (arg0[0] * arg1[0] +
 236                 arg0[1] * arg1[1] +
 237                 arg0[2] * arg1[2] +
 238                 1.0     * arg1[3]);
 239
 240    PUFF(result);
 241 }
 242
 243 static void do_DST( struct arb_vp_machine *m, union instruction op )
 244 {
 245    GLfloat *result = m->File[0][op.alu.dst];
 246    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 247    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 248
 249    result[0] = 1.0F;
 250    result[1] = arg0[1] * arg1[1];
 251    result[2] = arg0[2];
 252    result[3] = arg1[3];
 253 }
 254
 255
 256 /* Intended to be high precision:
 257  */
 258 static void do_EX2( struct arb_vp_machine *m, union instruction op )
 259 {
 260    GLfloat *result = m->File[0][op.alu.dst];
 261    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 262
 263    result[0] = (GLfloat)ApproxExp2(arg0[0]);
 264    PUFF(result);
 265 }
 266
 267
 268 /* Allowed to be lower precision:
 269  */
 270 static void do_EXP( struct arb_vp_machine *m, union instruction op )
 271 {
 272    GLfloat *result = m->File[0][op.alu.dst];
 273    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 274    GLfloat tmp = arg0[0];
 275    GLfloat flr_tmp = FLOORF(tmp);
 276
 277    /* KW: nvvertexec has an optimized version of this which is pretty
 278     * hard to understand/validate, but avoids the RoughApproxExp2.
 279     */
 280    result[0] = (GLfloat) (1 << (int)flr_tmp);
 281    result[1] = tmp - flr_tmp;
 282    result[2] = RoughApproxExp2(tmp);
 283    result[3] = 1.0F;
 284 }
 285
 286 static void do_FLR( struct arb_vp_machine *m, union instruction op )
 287 {
 288    GLfloat *result = m->File[0][op.alu.dst];
 289    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 290
 291    result[0] = FLOORF(arg0[0]);
 292    result[1] = FLOORF(arg0[1]);
 293    result[2] = FLOORF(arg0[2]);
 294    result[3] = FLOORF(arg0[3]);
 295 }
 296
 297 static void do_FRC( struct arb_vp_machine *m, union instruction op )
 298 {
 299    GLfloat *result = m->File[0][op.alu.dst];
 300    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 301
 302    result[0] = arg0[0] - FLOORF(arg0[0]);
 303    result[1] = arg0[1] - FLOORF(arg0[1]);
 304    result[2] = arg0[2] - FLOORF(arg0[2]);
 305    result[3] = arg0[3] - FLOORF(arg0[3]);
 306 }
 307
 308 /* High precision log base 2:
 309  */
 310 static void do_LG2( struct arb_vp_machine *m, union instruction op )
 311 {
 312    GLfloat *result = m->File[0][op.alu.dst];
 313    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 314
 315    result[0] = ApproxLog2(arg0[0]);
 316    PUFF(result);
 317 }
 318
 319
 320
 321 static void do_LIT( struct arb_vp_machine *m, union instruction op )
 322 {
 323    GLfloat *result = m->File[0][op.alu.dst];
 324    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 325
 326    const GLfloat epsilon = 1.0F / 256.0F; /* per NV spec */
 327    GLfloat tmp[4];
 328
 329    tmp[0] = MAX2(arg0[0], 0.0F);
 330    tmp[1] = MAX2(arg0[1], 0.0F);
 331    tmp[3] = CLAMP(arg0[3], -(128.0F - epsilon), (128.0F - epsilon));
 332
 333    result[0] = 1.0;
 334    result[1] = tmp[0];
 335    result[2] = (tmp[0] > 0.0) ? RoughApproxPower(tmp[1], tmp[3]) : 0.0F;
 336    result[3] = 1.0;
 337 }
 338
 339
 340 /* Intended to allow a lower precision than required for LG2 above.
 341  */
 342 static void do_LOG( struct arb_vp_machine *m, union instruction op )
 343 {
 344    GLfloat *result = m->File[0][op.alu.dst];
 345    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 346    GLfloat tmp = FABSF(arg0[0]);
 347    int exponent;
 348    GLfloat mantissa = frexpf(tmp, &exponent);
 349
 350    result[0] = (GLfloat) (exponent - 1);
 351    result[1] = 2.0 * mantissa; /* map [.5, 1) -> [1, 2) */
 352    result[2] = result[0] + LOG2(result[1]);
 353    result[3] = 1.0;
 354 }
 355
 356 static void do_MAX( struct arb_vp_machine *m, union instruction op )
 357 {
 358    GLfloat *result = m->File[0][op.alu.dst];
 359    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 360    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 361
 362    result[0] = (arg0[0] > arg1[0]) ? arg0[0] : arg1[0];
 363    result[1] = (arg0[1] > arg1[1]) ? arg0[1] : arg1[1];
 364    result[2] = (arg0[2] > arg1[2]) ? arg0[2] : arg1[2];
 365    result[3] = (arg0[3] > arg1[3]) ? arg0[3] : arg1[3];
 366 }
 367
 368
 369 static void do_MIN( struct arb_vp_machine *m, union instruction op )
 370 {
 371    GLfloat *result = m->File[0][op.alu.dst];
 372    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 373    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 374
 375    result[0] = (arg0[0] < arg1[0]) ? arg0[0] : arg1[0];
 376    result[1] = (arg0[1] < arg1[1]) ? arg0[1] : arg1[1];
 377    result[2] = (arg0[2] < arg1[2]) ? arg0[2] : arg1[2];
 378    result[3] = (arg0[3] < arg1[3]) ? arg0[3] : arg1[3];
 379 }
 380
 381 static void do_MOV( struct arb_vp_machine *m, union instruction op )
 382 {
 383    GLfloat *result = m->File[0][op.alu.dst];
 384    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 385
 386    result[0] = arg0[0];
 387    result[1] = arg0[1];
 388    result[2] = arg0[2];
 389    result[3] = arg0[3];
 390 }
 391
 392 static void do_MUL( struct arb_vp_machine *m, union instruction op )
 393 {
 394    GLfloat *result = m->File[0][op.alu.dst];
 395    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 396    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 397
 398    result[0] = arg0[0] * arg1[0];
 399    result[1] = arg0[1] * arg1[1];
 400    result[2] = arg0[2] * arg1[2];
 401    result[3] = arg0[3] * arg1[3];
 402 }
 403
 404
 405 /* Intended to be "high" precision
 406  */
 407 static void do_POW( struct arb_vp_machine *m, union instruction op )
 408 {
 409    GLfloat *result = m->File[0][op.alu.dst];
 410    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 411    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 412
 413    result[0] = (GLfloat)ApproxPower(arg0[0], arg1[0]);
 414    PUFF(result);
 415 }
 416
 417 static void do_REL( struct arb_vp_machine *m, union instruction op )
 418 {
 419    GLfloat *result = m->File[0][op.alu.dst];
 420    GLuint idx = (op.alu.idx0 + (GLint)m->File[0][REG_ADDR][0]) & (MAX_NV_VERTEX_PROGRAM_PARAMS-1);
 421    const GLfloat *arg0 = m->File[op.alu.file0][idx];
 422
 423    result[0] = arg0[0];
 424    result[1] = arg0[1];
 425    result[2] = arg0[2];
 426    result[3] = arg0[3];
 427 }
 428
 429 static void do_RCP( struct arb_vp_machine *m, union instruction op )
 430 {
 431    GLfloat *result = m->File[0][op.alu.dst];
 432    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 433
 434    result[0] = 1.0F / arg0[0];
 435    PUFF(result);
 436 }
 437
 438 static void do_RSQ( struct arb_vp_machine *m, union instruction op )
 439 {
 440    GLfloat *result = m->File[0][op.alu.dst];
 441    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 442
 443    result[0] = INV_SQRTF(FABSF(arg0[0]));
 444    PUFF(result);
 445 }
 446
 447
 448 static void do_SGE( struct arb_vp_machine *m, union instruction op )
 449 {
 450    GLfloat *result = m->File[0][op.alu.dst];
 451    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 452    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 453
 454    result[0] = (arg0[0] >= arg1[0]) ? 1.0F : 0.0F;
 455    result[1] = (arg0[1] >= arg1[1]) ? 1.0F : 0.0F;
 456    result[2] = (arg0[2] >= arg1[2]) ? 1.0F : 0.0F;
 457    result[3] = (arg0[3] >= arg1[3]) ? 1.0F : 0.0F;
 458 }
 459
 460
 461 static void do_SLT( struct arb_vp_machine *m, union instruction op )
 462 {
 463    GLfloat *result = m->File[0][op.alu.dst];
 464    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 465    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 466
 467    result[0] = (arg0[0] < arg1[0]) ? 1.0F : 0.0F;
 468    result[1] = (arg0[1] < arg1[1]) ? 1.0F : 0.0F;
 469    result[2] = (arg0[2] < arg1[2]) ? 1.0F : 0.0F;
 470    result[3] = (arg0[3] < arg1[3]) ? 1.0F : 0.0F;
 471 }
 472
 473 static void do_SUB( struct arb_vp_machine *m, union instruction op )
 474 {
 475    GLfloat *result = m->File[0][op.alu.dst];
 476    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 477    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 478
 479    result[0] = arg0[0] - arg1[0];
 480    result[1] = arg0[1] - arg1[1];
 481    result[2] = arg0[2] - arg1[2];
 482    result[3] = arg0[3] - arg1[3];
 483 }
 484
 485
 486 static void do_XPD( struct arb_vp_machine *m, union instruction op )
 487 {
 488    GLfloat *result = m->File[0][op.alu.dst];
 489    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 490    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 491
 492    result[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1];
 493    result[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2];
 494    result[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0];
 495 }
 496
 497 static void do_NOP( struct arb_vp_machine *m, union instruction op )
 498 {
 499 }
 500
 501 /* Some useful debugging functions:
 502  */
 503 static void print_mask( GLuint mask )
 504 {
 505    _mesa_printf(".");
 506    if (mask&0x1) _mesa_printf("x");
 507    if (mask&0x2) _mesa_printf("y");
 508    if (mask&0x4) _mesa_printf("z");
 509    if (mask&0x8) _mesa_printf("w");
 510 }
 511
 512 static void print_reg( GLuint file, GLuint reg )
 513 {
 514    static const char *reg_file[] = {
 515       "REG",
 516       "LOCAL_PARAM",
 517       "ENV_PARAM",
 518       "STATE_VAR",
 519    };
 520
 521    if (file == 0) {
 522       if (reg == REG_RES)
 523          _mesa_printf("RES");
 524       else if (reg >= REG_ARG0 && reg <= REG_ARG1)
 525          _mesa_printf("ARG%d", reg - REG_ARG0);
 526       else if (reg >= REG_TMP0 && reg <= REG_TMP11)
 527          _mesa_printf("TMP%d", reg - REG_TMP0);
 528       else if (reg >= REG_IN0 && reg <= REG_IN31)
 529          _mesa_printf("IN%d", reg - REG_IN0);
 530       else if (reg >= REG_OUT0 && reg <= REG_OUT14)
 531          _mesa_printf("OUT%d", reg - REG_OUT0);
 532       else if (reg == REG_ADDR)
 533          _mesa_printf("ADDR");
 534       else if (reg == REG_ID)
 535          _mesa_printf("ID");
 536       else
 537          _mesa_printf("REG%d", reg);
 538    }
 539    else
 540       _mesa_printf("%s:%d", reg_file[file], reg);
 541 }
 542
 543
 544 static void print_RSW( union instruction op, const struct opcode_info *info )
 545 {
 546    GLuint swz = op.rsw.swz;
 547    GLuint neg = op.rsw.neg;
 548    GLuint i;
 549
 550    _mesa_printf("%s ", info->string);
 551    print_reg(0, op.rsw.dst);
 552    _mesa_printf(", ");
 553    print_reg(op.rsw.file0, op.rsw.idx0);
 554    _mesa_printf(".");
 555    for (i = 0; i < 4; i++, swz >>= 2) {
 556       const char *cswz = "xyzw";
 557       if (neg & (1<<i))
 558          _mesa_printf("-");
 559       _mesa_printf("%c", cswz[swz&0x3]);
 560    }
 561    _mesa_printf("\n");
 562 }
 563
 564
 565 static void print_ALU( union instruction op, const struct opcode_info *info )
 566 {
 567    _mesa_printf("%s ", info->string);
 568    print_reg(0, op.alu.dst);
 569    _mesa_printf(", ");
 570    print_reg(op.alu.file0, op.alu.idx0);
 571    if (info->nr_args > 1) {
 572       _mesa_printf(", ");
 573       print_reg(op.alu.file1, op.alu.idx1);
 574    }
 575    _mesa_printf("\n");
 576 }
 577
 578 static void print_MSK( union instruction op, const struct opcode_info *info )
 579 {
 580    _mesa_printf("%s ", info->string);
 581    print_reg(0, op.msk.dst);
 582    print_mask(op.msk.mask);
 583    _mesa_printf(", ");
 584    print_reg(op.msk.file, op.msk.idx);
 585    _mesa_printf("\n");
 586 }
 587
 588
 589 static void print_NOP( union instruction op, const struct opcode_info *info )
 590 {
 591 }
 592
 593 #define NOP 0   /* NOTE(review): NOP, ALU and SWZ are not referenced in the visible part of this file - confirm they are still needed */
 594 #define ALU 1
 595 #define SWZ 2
 596
 597 static const struct opcode_info opcode_info[] =   /* per-opcode { argument count, mnemonic, print callback }; indexed by opcode in cvp_emit_inst() and _tnl_disassem_vba_insn() */
 598 {
 599    { 1, "ABS", print_ALU },
 600    { 2, "ADD", print_ALU },
 601    { 1, "ARL", print_NOP },   /* cvp_emit_inst() emits FLR into REG_ADDR instead */
 602    { 2, "DP3", print_ALU },
 603    { 2, "DP4", print_ALU },
 604    { 2, "DPH", print_ALU },
 605    { 2, "DST", print_ALU },
 606    { 0, "END", print_NOP },   /* cvp_emit_inst() emits nothing for END */
 607    { 1, "EX2", print_ALU },
 608    { 1, "EXP", print_ALU },
 609    { 1, "FLR", print_ALU },
 610    { 1, "FRC", print_ALU },
 611    { 1, "LG2", print_ALU },
 612    { 1, "LIT", print_ALU },
 613    { 1, "LOG", print_ALU },
 614    { 3, "MAD", print_NOP },   /* cvp_emit_inst() splits MAD into MUL + ADD */
 615    { 2, "MAX", print_ALU },
 616    { 2, "MIN", print_ALU },
 617    { 1, "MOV", print_ALU },
 618    { 2, "MUL", print_ALU },
 619    { 2, "POW", print_ALU },
 620    { 1, "PRT", print_ALU }, /* PRINT */
 621    { 1, "RCC", print_NOP },
 622    { 1, "RCP", print_ALU },
 623    { 1, "RSQ", print_ALU },
 624    { 2, "SGE", print_ALU },
 625    { 2, "SLT", print_ALU },
 626    { 2, "SUB", print_ALU },
 627    { 1, "SWZ", print_NOP },   /* cvp_emit_inst() lowers SWZ to RSW/MOV/MSK */
 628    { 2, "XPD", print_ALU },
 629    { 1, "RSW", print_RSW },   /* RSW, MSK and REL are emitted by the cvp_* helpers in this file */
 630    { 2, "MSK", print_MSK },
 631    { 1, "REL", print_ALU },
 632 };
 633
 634 void _tnl_disassem_vba_insn( union instruction op )
 635 {
 636    const struct opcode_info *info = &opcode_info[op.alu.opcode];
 637    info->print( op, info );
 638 }
 639
 640
 641 static void (* const opcode_func[])(struct arb_vp_machine *, union instruction) =   /* one handler per opcode; NOTE(review): slot names below assume the same ordering as opcode_info[] - confirm */
 642 {
 643    do_ABS,
 644    do_ADD,
 645    do_NOP,   /* ARL slot */
 646    do_DP3,
 647    do_DP4,
 648    do_DPH,
 649    do_DST,
 650    do_NOP,   /* END slot */
 651    do_EX2,
 652    do_EXP,
 653    do_FLR,
 654    do_FRC,
 655    do_LG2,
 656    do_LIT,
 657    do_LOG,
 658    do_NOP,   /* MAD slot */
 659    do_MAX,
 660    do_MIN,
 661    do_MOV,
 662    do_MUL,
 663    do_POW,
 664    do_PRT,
 665    do_NOP,   /* RCC slot */
 666    do_RCP,
 667    do_RSQ,
 668    do_SGE,
 669    do_SLT,
 670    do_SUB,
 671    do_RSW,   /* SWZ slot: handler is do_RSW, whereas opcode_info[] uses print_NOP here */
 672    do_XPD,
 673    do_RSW,
 674    do_MSK,
 675    do_REL,
 676 };
 677
 678 static union instruction *cvp_next_instruction( struct compilation *cp )
 679 {
 680    union instruction *op = cp->csr++;
 681    op->dword = 0;
 682    return op;
 683 }
 684
 685 static struct reg cvp_make_reg( GLuint file, GLuint idx )
 686 {
 687    struct reg reg;
 688    reg.file = file;
 689    reg.idx = idx;
 690    return reg;
 691 }
 692
 693 static struct reg cvp_emit_rel( struct compilation *cp,
 694                                 struct reg reg,
 695                                 struct reg tmpreg )
 696 {
 697    union instruction *op = cvp_next_instruction(cp);
 698    op->alu.opcode = REL;
 699    op->alu.file0 = reg.file;
 700    op->alu.idx0 = reg.idx;
 701    op->alu.dst = tmpreg.idx;
 702    return tmpreg;
 703 }
 704
 705
 706 static struct reg cvp_load_reg( struct compilation *cp,
 707                                 GLuint file,
 708                                 GLuint index,
 709                                 GLuint rel,
 710                                 GLuint tmpidx )
 711 {
 712    struct reg tmpreg = cvp_make_reg(FILE_REG, tmpidx);
 713    struct reg reg;
 714
 715    switch (file) {
 716    case PROGRAM_TEMPORARY:
 717       return cvp_make_reg(FILE_REG, REG_TMP0 + index);
 718
 719    case PROGRAM_INPUT:
 720       return cvp_make_reg(FILE_REG, REG_IN0 + index);
 721
 722    case PROGRAM_OUTPUT:
 723       return cvp_make_reg(FILE_REG, REG_OUT0 + index);
 724
 725       /* These two aren't populated by the parser?
 726        */
 727    case PROGRAM_LOCAL_PARAM:
 728       reg = cvp_make_reg(FILE_LOCAL_PARAM, index);
 729       if (rel)
 730          return cvp_emit_rel(cp, reg, tmpreg);
 731       else
 732          return reg;
 733
 734    case PROGRAM_ENV_PARAM:
 735       reg = cvp_make_reg(FILE_ENV_PARAM, index);
 736       if (rel)
 737          return cvp_emit_rel(cp, reg, tmpreg);
 738       else
 739          return reg;
 740
 741    case PROGRAM_STATE_VAR:
 742       reg = cvp_make_reg(FILE_STATE_PARAM, index);
 743       if (rel)
 744          return cvp_emit_rel(cp, reg, tmpreg);
 745       else
 746          return reg;
 747
 748       /* Invalid values:
 749        */
 750    case PROGRAM_WRITE_ONLY:
 751    case PROGRAM_ADDRESS:
 752    default:
 753       assert(0);
 754       return tmpreg;            /* can't happen */
 755    }
 756 }
 757
 758 static struct reg cvp_emit_arg( struct compilation *cp,
 759                                 const struct vp_src_register *src,
 760                                 GLuint arg )
 761 {
 762    struct reg reg = cvp_load_reg( cp, src->File, src->Index, src->RelAddr, arg );
 763    union instruction rsw, noop;
 764
 765    /* Emit any necessary swizzling.
 766     */
 767    rsw.dword = 0;
 768    rsw.rsw.neg = src->Negate ? WRITEMASK_XYZW : 0;
 769    rsw.rsw.swz = ((GET_SWZ(src->Swizzle, 0) << 0) |
 770                   (GET_SWZ(src->Swizzle, 1) << 2) |
 771                   (GET_SWZ(src->Swizzle, 2) << 4) |
 772                   (GET_SWZ(src->Swizzle, 3) << 6));
 773
 774    noop.dword = 0;
 775    noop.rsw.neg = 0;
 776    noop.rsw.swz = RSW_NOOP;
 777
 778    if (rsw.dword != noop.dword) {
 779       union instruction *op = cvp_next_instruction(cp);
 780       struct reg rsw_reg = cvp_make_reg(FILE_REG, REG_ARG0 + arg);
 781       op->dword = rsw.dword;
 782       op->rsw.opcode = RSW;
 783       op->rsw.file0 = reg.file;
 784       op->rsw.idx0 = reg.idx;
 785       op->rsw.dst = rsw_reg.idx;
 786       return rsw_reg;
 787    }
 788    else
 789       return reg;
 790 }
 791
 792 static GLuint cvp_choose_result( struct compilation *cp,
 793                                  const struct vp_dst_register *dst,
 794                                  union instruction *fixup )
 795 {
 796    GLuint mask = dst->WriteMask;
 797    GLuint idx;
 798
 799    switch (dst->File) {
 800    case PROGRAM_TEMPORARY:
 801       idx = REG_TMP0 + dst->Index;
 802       break;
 803    case PROGRAM_OUTPUT:
 804       idx = REG_OUT0 + dst->Index;
 805       break;
 806    default:
 807       assert(0);
 808       return REG_RES;           /* can't happen */
 809    }
 810
 811    /* Optimization: When writing (with a writemask) to an undefined
 812     * value for the first time, the writemask may be ignored.
 813     */
 814    if (mask != WRITEMASK_XYZW && (cp->reg_active & (1 << idx))) {
 815       fixup->msk.opcode = MSK;
 816       fixup->msk.dst = idx;
 817       fixup->msk.file = FILE_REG;
 818       fixup->msk.idx = REG_RES;
 819       fixup->msk.mask = mask;
 820       cp->reg_active |= 1 << idx;
 821       return REG_RES;
 822    }
 823    else {
 824       fixup->dword = 0;
 825       cp->reg_active |= 1 << idx;
 826       return idx;
 827    }
 828 }
 829
 830 static struct reg cvp_emit_rsw( struct compilation *cp,
 831                                 GLuint dst,
 832                                 struct reg src,
 833                                 GLuint neg,
 834                                 GLuint swz,
 835                                 GLboolean force)
 836 {
 837    struct reg retval;
 838
 839    if (swz != RSW_NOOP || neg != 0) {
 840       union instruction *op = cvp_next_instruction(cp);
 841       op->rsw.opcode = RSW;
 842       op->rsw.dst = dst;
 843       op->rsw.file0 = src.file;
 844       op->rsw.idx0 = src.idx;
 845       op->rsw.neg = neg;
 846       op->rsw.swz = swz;
 847
 848       retval.file = FILE_REG;
 849       retval.idx = dst;
 850       return retval;
 851    }
 852    else if (force) {
 853       /* Oops.  Degenerate case:
 854        */
 855       union instruction *op = cvp_next_instruction(cp);
 856       op->alu.opcode = VP_OPCODE_MOV;
 857       op->alu.dst = dst;
 858       op->alu.file0 = src.file;
 859       op->alu.idx0 = src.idx;
 860
 861       retval.file = FILE_REG;
 862       retval.idx = dst;
 863       return retval;
 864    }
 865    else {
 866       return src;
 867    }
 868 }
 869
 870
 871 static void cvp_emit_inst( struct compilation *cp,
 872                            const struct vp_instruction *inst )
 873 {
 874    const struct opcode_info *info = &opcode_info[inst->Opcode];
 875    union instruction *op;
 876    union instruction fixup;
 877    struct reg reg[3];
 878    GLuint result, i;
 879
 880    assert(sizeof(*op) == sizeof(GLuint));
 881
 882    /* Need to handle SWZ, ARL specially.
 883     */
 884    switch (inst->Opcode) {
 885       /* Split into mul and add:
 886        */
 887    case VP_OPCODE_MAD:
 888       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
 889       for (i = 0; i < 3; i++)
 890          reg[i] = cvp_emit_arg( cp, &inst->SrcReg[i], REG_ARG0+i );
 891
 892       op = cvp_next_instruction(cp);
 893       op->alu.opcode = VP_OPCODE_MUL;
 894       op->alu.file0 = reg[0].file;
 895       op->alu.idx0 = reg[0].idx;
 896       op->alu.file1 = reg[1].file;
 897       op->alu.idx1 = reg[1].idx;
 898       op->alu.dst = REG_ARG0;
 899
 900       op = cvp_next_instruction(cp);
 901       op->alu.opcode = VP_OPCODE_ADD;
 902       op->alu.file0 = FILE_REG;
 903       op->alu.idx0 = REG_ARG0;
 904       op->alu.file1 = reg[2].file;
 905       op->alu.idx1 = reg[2].idx;
 906       op->alu.dst = result;
 907
 908       if (result == REG_RES) {
 909          op = cvp_next_instruction(cp);
 910          op->dword = fixup.dword;
 911       }
 912       break;
 913
 914    case VP_OPCODE_ARL:
 915       reg[0] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 );
 916
 917       op = cvp_next_instruction(cp);
 918       op->alu.opcode = VP_OPCODE_FLR;
 919       op->alu.dst = REG_ADDR;
 920       op->alu.file0 = reg[0].file;
 921       op->alu.idx0 = reg[0].idx;
 922       break;
 923
 924    case VP_OPCODE_SWZ: {
 925       GLuint swz0 = 0, swz1 = 0;
 926       GLuint neg0 = 0, neg1 = 0;
 927       GLuint mask = 0;
 928
 929       /* Translate 3-bit-per-element swizzle into two 2-bit swizzles,
 930        * one from the source register the other from a constant
 931        * {0,0,0,1}.
 932        */
 933       for (i = 0; i < 4; i++) {
 934          GLuint swzelt = GET_SWZ(inst->SrcReg[0].Swizzle, i);
 935          if (swzelt >= SWIZZLE_ZERO) {
 936             neg0 |= inst->SrcReg[0].Negate & (1<<i);
 937             if (swzelt == SWIZZLE_ONE)
 938                swz0 |= SWIZZLE_W << (i*2);
 939             else if (i < SWIZZLE_W)
 940                swz0 |= i << (i*2);
 941          }
 942          else {
 943             mask |= 1<<i;
 944             neg1 |= inst->SrcReg[0].Negate & (1<<i);
 945             swz1 |= swzelt << (i*2);
 946          }
 947       }
 948
 949       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
 950       reg[0].file = FILE_REG;
 951       reg[0].idx = REG_ID;
 952       reg[1] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 );
 953
 954       if (mask == WRITEMASK_XYZW) {
 955          cvp_emit_rsw(cp, result, reg[0], neg0, swz0, GL_TRUE);
 956
 957       }
 958       else if (mask == 0) {
 959          cvp_emit_rsw(cp, result, reg[1], neg1, swz1, GL_TRUE);
 960       }
 961       else {
 962          cvp_emit_rsw(cp, result, reg[0], neg0, swz0, GL_TRUE);
 963          reg[1] = cvp_emit_rsw(cp, REG_ARG0, reg[1], neg1, swz1, GL_FALSE);
 964
 965          op = cvp_next_instruction(cp);
 966          op->msk.opcode = MSK;
 967          op->msk.dst = result;
 968          op->msk.file = reg[1].file;
 969          op->msk.idx = reg[1].idx;
 970          op->msk.mask = mask;
 971       }
 972
 973       if (result == REG_RES) {
 974          op = cvp_next_instruction(cp);
 975          op->dword = fixup.dword;
 976       }
 977       break;
 978    }
 979    case VP_OPCODE_PRINT:
 980    case VP_OPCODE_END:
 981       break;
 982
 983    default:
 984       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
 985       for (i = 0; i < info->nr_args; i++)
 986          reg[i] = cvp_emit_arg( cp, &inst->SrcReg[i], REG_ARG0 + i );
 987
 988       op = cvp_next_instruction(cp);
 989       op->alu.opcode = inst->Opcode;
 990       op->alu.file0 = reg[0].file;
 991       op->alu.idx0 = reg[0].idx;
 992       op->alu.file1 = reg[1].file;
 993       op->alu.idx1 = reg[1].idx;
 994       op->alu.dst = result;
 995
 996       if (result == REG_RES) {
 997          op = cvp_next_instruction(cp);
 998          op->dword = fixup.dword;
 999       }
1000       break;
1001    }
1002 }
1003
1004 static void free_tnl_data( struct vertex_program *program  )
1005 {
1006    struct tnl_compiled_program *p = program->TnlData;
1007    if (p->compiled_func) free((void *)p->compiled_func);
1008    free(p);
1009    program->TnlData = NULL;
1010 }
1011
1012 static void compile_vertex_program( struct vertex_program *program,
1013                                     GLboolean try_codegen )
1014 {
1015    struct compilation cp;
1016    struct tnl_compiled_program *p = CALLOC_STRUCT(tnl_compiled_program);
1017    GLuint i;
1018
1019    _mesa_printf("%s\n", __FUNCTION__);
1020
1021    if (program->TnlData)
1022       free_tnl_data( program );
1023
1024    program->TnlData = p;
1025
1026    /* Initialize cp.  Note that ctx and VB aren't used in compilation
1027     * so we don't have to worry about statechanges:
1028     */
1029    memset(&cp, 0, sizeof(cp));
1030    cp.csr = p->instructions;
1031
1032    /* Compile instructions:
1033     */
1034    for (i = 0; i < program->Base.NumInstructions; i++) {
1035       cvp_emit_inst(&cp, &program->Instructions[i]);
1036    }
1037
1038    /* Finish up:
1039     */
1040    p->nr_instructions = cp.csr - p->instructions;
1041
1042    /* Print/disassemble:
1043     */
1044    if (DISASSEM) {
1045       for (i = 0; i < p->nr_instructions; i++) {
1046          _tnl_disassem_vba_insn(p->instructions[i]);
1047       }
1048       _mesa_printf("\n\n");
1049    }
1050
1051 #ifdef USE_SSE_ASM
1052    if (try_codegen)
1053       _tnl_sse_codegen_vertex_program(p);
1054 #endif
1055
1056 }
1057
1058
1059
1060
1061 /* ----------------------------------------------------------------------
1062  * Execution
1063  */
1064 static void userclip( GLcontext *ctx,
1065                       GLvector4f *clip,
1066                       GLubyte *clipmask,
1067                       GLubyte *clipormask,
1068                       GLubyte *clipandmask )
1069 {
1070    GLuint p;
1071
1072    for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
1073       if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
1074          GLuint nr, i;
1075          const GLfloat a = ctx->Transform._ClipUserPlane[p][0];
1076          const GLfloat b = ctx->Transform._ClipUserPlane[p][1];
1077          const GLfloat c = ctx->Transform._ClipUserPlane[p][2];
1078          const GLfloat d = ctx->Transform._ClipUserPlane[p][3];
1079          GLfloat *coord = (GLfloat *)clip->data;
1080          GLuint stride = clip->stride;
1081          GLuint count = clip->count;
1082
1083          for (nr = 0, i = 0 ; i < count ; i++) {
1084             GLfloat dp = (coord[0] * a +
1085                           coord[1] * b +
1086                           coord[2] * c +
1087                           coord[3] * d);
1088
1089             if (dp < 0) {
1090                nr++;
1091                clipmask[i] |= CLIP_USER_BIT;
1092             }
1093
1094             STRIDE_F(coord, stride);
1095          }
1096
1097          if (nr > 0) {
1098             *clipormask |= CLIP_USER_BIT;
1099             if (nr == count) {
1100                *clipandmask |= CLIP_USER_BIT;
1101                return;
1102             }
1103          }
1104       }
1105    }
1106 }
1107
1108
1109 static GLboolean do_ndc_cliptest( struct arb_vp_machine *m )
1110 {
1111    GLcontext *ctx = m->ctx;
1112    TNLcontext *tnl = TNL_CONTEXT(ctx);
1113    struct vertex_buffer *VB = m->VB;
1114
1115    /* Cliptest and perspective divide.  Clip functions must clear
1116     * the clipmask.
1117     */
1118    m->ormask = 0;
1119    m->andmask = CLIP_ALL_BITS;
1120
1121    if (tnl->NeedNdcCoords) {
1122       VB->NdcPtr =
1123          _mesa_clip_tab[VB->ClipPtr->size]( VB->ClipPtr,
1124                                             &m->ndcCoords,
1125                                             m->clipmask,
1126                                             &m->ormask,
1127                                             &m->andmask );
1128    }
1129    else {
1130       VB->NdcPtr = NULL;
1131       _mesa_clip_np_tab[VB->ClipPtr->size]( VB->ClipPtr,
1132                                             NULL,
1133                                             m->clipmask,
1134                                             &m->ormask,
1135                                             &m->andmask );
1136    }
1137
1138    if (m->andmask) {
1139       /* All vertices are outside the frustum */
1140       return GL_FALSE;
1141    }
1142
1143    /* Test userclip planes.  This contributes to VB->ClipMask.
1144     */
1145    if (ctx->Transform.ClipPlanesEnabled && !ctx->VertexProgram._Enabled) {
1146       userclip( ctx,
1147                 VB->ClipPtr,
1148                 m->clipmask,
1149                 &m->ormask,
1150                 &m->andmask );
1151
1152       if (m->andmask) {
1153          return GL_FALSE;
1154       }
1155    }
1156
1157    VB->ClipAndMask = m->andmask;
1158    VB->ClipOrMask = m->ormask;
1159    VB->ClipMask = m->clipmask;
1160
1161    return GL_TRUE;
1162 }
1163
1164
1165 static INLINE void call_func( struct tnl_compiled_program *p,
1166                               struct arb_vp_machine *m )
1167 {
1168    p->compiled_func(m);
1169 }
1170
1171 /**
1172  * Execute the given vertex program.
1173  *
1174  * TODO: Integrate the t_vertex.c code here, to build machine vertices
1175  * directly at this point.
1176  *
1177  * TODO: Eliminate the VB struct entirely and just use
1178  * struct arb_vertex_machine.
1179  */
1180 static GLboolean
1181 run_arb_vertex_program(GLcontext *ctx, struct tnl_pipeline_stage *stage)
1182 {
1183    struct vertex_program *program = (ctx->VertexProgram._Enabled ?
1184                                      ctx->VertexProgram.Current :
1185                                      ctx->_TnlProgram);
1186    struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
1187    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
1188    struct tnl_compiled_program *p;
1189    GLuint i, j, outputs;
1190
1191    if (!program || program->IsNVProgram)
1192       return GL_TRUE;
1193
1194    if (program->Parameters) {
1195       _mesa_load_state_parameters(ctx, program->Parameters);
1196    }
1197
1198    p = (struct tnl_compiled_program *)program->TnlData;
1199    assert(p);
1200
1201    /* Initialize regs where necessary:
1202     */
1203    ASSIGN_4V(m->File[0][REG_ID], 0, 0, 0, 1);
1204
1205    m->nr_inputs = m->nr_outputs = 0;
1206
1207    for (i = 0; i < _TNL_ATTRIB_MAX; i++) {
1208       if (program->InputsRead & (1<<i)) {
1209          GLuint j = m->nr_inputs++;
1210          m->input[j].idx = i;
1211          m->input[j].data = (GLfloat *)m->VB->AttribPtr[i]->data;
1212          m->input[j].stride = m->VB->AttribPtr[i]->stride;
1213          m->input[j].size = m->VB->AttribPtr[i]->size;
1214          ASSIGN_4V(m->File[0][REG_IN0 + i], 0, 0, 0, 1);
1215       }
1216    }
1217
1218    for (i = 0; i < 15; i++) {
1219       if (program->OutputsWritten & (1<<i)) {
1220          GLuint j = m->nr_outputs++;
1221          m->output[j].idx = i;
1222          m->output[j].data = (GLfloat *)m->attribs[i].data;
1223       }
1224    }
1225
1226
1227    /* Run the actual program:
1228     */
1229    for (m->vtx_nr = 0; m->vtx_nr < VB->Count; m->vtx_nr++) {
1230       for (j = 0; j < m->nr_inputs; j++) {
1231          GLuint idx = REG_IN0 + m->input[j].idx;
1232          switch (m->input[j].size) {
1233          case 4: m->File[0][idx][3] = m->input[j].data[3];
1234          case 3: m->File[0][idx][2] = m->input[j].data[2];
1235          case 2: m->File[0][idx][1] = m->input[j].data[1];
1236          case 1: m->File[0][idx][0] = m->input[j].data[0];
1237          }
1238
1239          STRIDE_F(m->input[j].data, m->input[j].stride);
1240       }
1241
1242       if (p->compiled_func) {
1243          call_func( p, m );
1244       }
1245       else {
1246          for (j = 0; j < p->nr_instructions; j++) {
1247             union instruction inst = p->instructions[j];
1248             opcode_func[inst.alu.opcode]( m, inst );
1249          }
1250       }
1251
1252       for (j = 0; j < m->nr_outputs; j++) {
1253          GLuint idx = REG_OUT0 + m->output[j].idx;
1254          m->output[j].data[0] = m->File[0][idx][0];
1255          m->output[j].data[1] = m->File[0][idx][1];
1256          m->output[j].data[2] = m->File[0][idx][2];
1257          m->output[j].data[3] = m->File[0][idx][3];
1258          m->output[j].data += 4;
1259       }
1260    }
1261
1262    /* Setup the VB pointers so that the next pipeline stages get
1263     * their data from the right place (the program output arrays).
1264     *
1265     * TODO: 1) Have tnl use these RESULT values for outputs rather
1266     * than trying to shoe-horn inputs and outputs into one set of
1267     * values.
1268     *
1269     * TODO: 2) Integrate t_vertex.c so that we just go straight ahead
1270     * and build machine vertices here.
1271     */
1272    VB->ClipPtr = &m->attribs[VERT_RESULT_HPOS];
1273    VB->ClipPtr->count = VB->Count;
1274
1275    outputs = program->OutputsWritten;
1276
1277    if (outputs & (1<<VERT_RESULT_COL0)) {
1278       VB->ColorPtr[0] = &m->attribs[VERT_RESULT_COL0];
1279       VB->AttribPtr[VERT_ATTRIB_COLOR0] = VB->ColorPtr[0];
1280    }
1281
1282    if (outputs & (1<<VERT_RESULT_BFC0)) {
1283       VB->ColorPtr[1] = &m->attribs[VERT_RESULT_BFC0];
1284    }
1285
1286    if (outputs & (1<<VERT_RESULT_COL1)) {
1287       VB->SecondaryColorPtr[0] = &m->attribs[VERT_RESULT_COL1];
1288       VB->AttribPtr[VERT_ATTRIB_COLOR1] = VB->SecondaryColorPtr[0];
1289    }
1290
1291    if (outputs & (1<<VERT_RESULT_BFC1)) {
1292       VB->SecondaryColorPtr[1] = &m->attribs[VERT_RESULT_BFC1];
1293    }
1294
1295    if (outputs & (1<<VERT_RESULT_FOGC)) {
1296       VB->FogCoordPtr = &m->attribs[VERT_RESULT_FOGC];
1297       VB->AttribPtr[VERT_ATTRIB_FOG] = VB->FogCoordPtr;
1298    }
1299
1300    if (outputs & (1<<VERT_RESULT_PSIZ)) {
1301       VB->PointSizePtr = &m->attribs[VERT_RESULT_PSIZ];
1302       VB->AttribPtr[_TNL_ATTRIB_POINTSIZE] = &m->attribs[VERT_RESULT_PSIZ];
1303    }
1304
1305    for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
1306       if (outputs & (1<<(VERT_RESULT_TEX0+i))) {
1307          VB->TexCoordPtr[i] = &m->attribs[VERT_RESULT_TEX0 + i];
1308          VB->AttribPtr[VERT_ATTRIB_TEX0+i] = VB->TexCoordPtr[i];
1309       }
1310    }
1311
1312 #if 0
1313    for (i = 0; i < VB->Count; i++) {
1314       printf("Out %d: %f %f %f %f %f %f %f %f\n", i,
1315              VEC_ELT(VB->ClipPtr, GLfloat, i)[0],
1316              VEC_ELT(VB->ClipPtr, GLfloat, i)[1],
1317              VEC_ELT(VB->ClipPtr, GLfloat, i)[2],
1318              VEC_ELT(VB->ClipPtr, GLfloat, i)[3],
1319              VEC_ELT(VB->TexCoordPtr[0], GLfloat, i)[0],
1320              VEC_ELT(VB->TexCoordPtr[0], GLfloat, i)[1],
1321              VEC_ELT(VB->TexCoordPtr[0], GLfloat, i)[2],
1322              VEC_ELT(VB->TexCoordPtr[0], GLfloat, i)[3]);
1323    }
1324 #endif
1325
1326    /* Perform NDC and cliptest operations:
1327     */
1328    return do_ndc_cliptest(m);
1329 }
1330
1331
1332 static void
1333 validate_vertex_program( GLcontext *ctx, struct tnl_pipeline_stage *stage )
1334 {
1335    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
1336    struct vertex_program *program =
1337       (ctx->VertexProgram._Enabled ? ctx->VertexProgram.Current : 0);
1338
1339    if (!program && ctx->_MaintainTnlProgram) {
1340       program = ctx->_TnlProgram;
1341    }
1342
1343    if (program) {
1344       if (!program->TnlData)
1345          compile_vertex_program( program, m->try_codegen );
1346
1347       /* Grab the state GL state and put into registers:
1348        */
1349       m->File[FILE_LOCAL_PARAM] = program->Base.LocalParams;
1350       m->File[FILE_ENV_PARAM] = ctx->VertexProgram.Parameters;
1351       m->File[FILE_STATE_PARAM] = program->Parameters->ParameterValues;
1352    }
1353 }
1354
1355
1356
1357
1358
1359
1360
1361 /**
1362  * Called the first time stage->run is called.  In effect, don't
1363  * allocate data until the first time the stage is run.
1364  */
1365 static GLboolean init_vertex_program( GLcontext *ctx,
1366                                       struct tnl_pipeline_stage *stage )
1367 {
1368    TNLcontext *tnl = TNL_CONTEXT(ctx);
1369    struct vertex_buffer *VB = &(tnl->vb);
1370    struct arb_vp_machine *m;
1371    const GLuint size = VB->Size;
1372    GLuint i;
1373
1374    stage->privatePtr = MALLOC(sizeof(*m));
1375    m = ARB_VP_MACHINE(stage);
1376    if (!m)
1377       return GL_FALSE;
1378
1379    /* arb_vertex_machine struct should subsume the VB:
1380     */
1381    m->VB = VB;
1382    m->ctx = ctx;
1383
1384    m->File[0] = ALIGN_MALLOC(REG_MAX * sizeof(GLfloat) * 4, 16);
1385
1386    if (_mesa_getenv("MESA_EXPERIMENTAL"))
1387       m->try_codegen = 1;
1388
1389    /* Allocate arrays of vertex output values */
1390    for (i = 0; i < VERT_RESULT_MAX; i++) {
1391       _mesa_vector4f_alloc( &m->attribs[i], 0, size, 32 );
1392       m->attribs[i].size = 4;
1393    }
1394
1395    /* a few other misc allocations */
1396    _mesa_vector4f_alloc( &m->ndcCoords, 0, size, 32 );
1397    m->clipmask = (GLubyte *) ALIGN_MALLOC(sizeof(GLubyte)*size, 32 );
1398
1399    if (ctx->_MaintainTnlProgram)
1400       _mesa_allow_light_in_model( ctx, GL_FALSE );
1401
1402    return GL_TRUE;
1403 }
1404
1405
1406
1407
1408 /**
1409  * Destructor for this pipeline stage.
1410  */
1411 static void dtr( struct tnl_pipeline_stage *stage )
1412 {
1413    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
1414
1415    if (m) {
1416       GLuint i;
1417
1418       /* free the vertex program result arrays */
1419       for (i = 0; i < VERT_RESULT_MAX; i++)
1420          _mesa_vector4f_free( &m->attribs[i] );
1421
1422       /* free misc arrays */
1423       _mesa_vector4f_free( &m->ndcCoords );
1424       ALIGN_FREE( m->clipmask );
1425       ALIGN_FREE( m->File[0] );
1426
1427       FREE( m );
1428       stage->privatePtr = NULL;
1429    }
1430 }
1431
/**
 * Public description of this pipeline stage.
 *
 * Binds the stage's name and its callbacks defined in this file.  The
 * initializer is positional, so the entries must stay in the order of
 * the members of struct tnl_pipeline_stage.
 * NOTE(review): the per-entry labels below are taken from the existing
 * comments; confirm them against the struct declaration.
 */
const struct tnl_pipeline_stage _tnl_arb_vertex_program_stage =
{
   "vertex-program",            /* stage name */
   NULL,                        /* private_data */
   init_vertex_program,         /* create */
   dtr,                         /* destroy */
   validate_vertex_program,     /* validate */
   run_arb_vertex_program       /* run */
};