src/mesa/tnl/t_vb_arbprogram.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.5
   4  *
   5  * Copyright (C) 1999-2005  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
   26  * \file t_vb_arbprogram.c
  27  * Compile vertex programs to an intermediate representation.
  28  * Execute vertex programs over a buffer of vertices.
  29  * \author Keith Whitwell, Brian Paul
  30  */
  31
  32 #include "glheader.h"
  33 #include "context.h"
  34 #include "imports.h"
  35 #include "macros.h"
  36 #include "mtypes.h"
  37 #include "arbprogparse.h"
  38 #include "light.h"
  39 #include "program.h"
  40 #include "math/m_matrix.h"
  41 #include "math/m_translate.h"
  42 #include "t_context.h"
  43 #include "t_pipeline.h"
  44 #include "t_vb_arbprogram.h"
  45 #include "tnl.h"
  46 #include "program_instruction.h"
  47
  48
   49 #define DISASSEM 0 /* NOTE(review): not referenced in the visible part of the file; presumably toggles disassembly output - confirm */
   50
   51
   52 struct compilation { /* State carried while translating a program into union instruction words. */
   53    GLuint reg_active; /* Bitmask: bit (1 << idx) is set by cvp_choose_result() once register idx has been picked as a destination. */
   54    union instruction *csr; /* Cursor: next free instruction slot; cvp_next_instruction() returns it and advances it. */
   55 };
  56
  57
  58 #define ARB_VP_MACHINE(stage) ((struct arb_vp_machine *)(stage->privatePtr))
  59
  60 #define PUFF(x) ((x)[1] = (x)[2] = (x)[3] = (x)[0])
  61
  62
  63
  64 /* Lower precision functions for the EXP, LOG and LIT opcodes.  The
  65  * LOG2() implementation is probably not accurate enough, and the
  66  * attempted optimization for Exp2 is definitely not accurate
  67  * enough - it discards all of t's fractional bits!
  68  */
  69 static GLfloat RoughApproxLog2(GLfloat t)
  70 {
  71    return LOG2(t);
  72 }
  73
  74 static GLfloat RoughApproxExp2(GLfloat t)
  75 {
  76 #if 0
  77    fi_type fi;
  78    fi.i = (GLint) t;
  79    fi.i = (fi.i << 23) + 0x3f800000;
  80    return fi.f;
  81 #else
  82    return (GLfloat) _mesa_pow(2.0, t);
  83 #endif
  84 }
  85
  86 static GLfloat RoughApproxPower(GLfloat x, GLfloat y)
  87 {
  88    if (x == 0.0 && y == 0.0)
  89       return 1.0;  /* spec requires this */
  90    else
  91       return RoughApproxExp2(y * RoughApproxLog2(x));
  92 }
  93
  94
  95 /* Higher precision functions for the EX2, LG2 and POW opcodes:
  96  */
  97 static GLfloat ApproxLog2(GLfloat t)
  98 {
  99    return (GLfloat) (LOGF(t) * 1.442695F);
 100 }
 101
 102 static GLfloat ApproxExp2(GLfloat t)
 103 {
 104    return (GLfloat) _mesa_pow(2.0, t);
 105 }
 106
 107 static GLfloat ApproxPower(GLfloat x, GLfloat y)
 108 {
 109    return (GLfloat) _mesa_pow(x, y);
 110 }
 111
 112 static GLfloat rough_approx_log2_0_1(GLfloat x)
 113 {
 114    return LOG2(x);
 115 }
 116
 117
 118
 119
 120 /**
 121  * Perform a reduced swizzle:
 122  */
 123 static void do_RSW( struct arb_vp_machine *m, union instruction op )
 124 {
 125    GLfloat *result = m->File[0][op.rsw.dst];
 126    const GLfloat *arg0 = m->File[op.rsw.file0][op.rsw.idx0];
 127    GLuint swz = op.rsw.swz;
 128    GLuint neg = op.rsw.neg;
 129    GLfloat tmp[4];
 130
 131    /* Need a temporary to be correct in the case where result == arg0.
 132     */
 133    COPY_4V(tmp, arg0);
 134
 135    result[0] = tmp[GET_RSW(swz, 0)];
 136    result[1] = tmp[GET_RSW(swz, 1)];
 137    result[2] = tmp[GET_RSW(swz, 2)];
 138    result[3] = tmp[GET_RSW(swz, 3)];
 139
 140    if (neg) {
 141       if (neg & 0x1) result[0] = -result[0];
 142       if (neg & 0x2) result[1] = -result[1];
 143       if (neg & 0x4) result[2] = -result[2];
 144       if (neg & 0x8) result[3] = -result[3];
 145    }
 146 }
 147
 148 /* Used to implement write masking.  To make things easier for the sse
 149  * generator I've gone back to a 1 argument version of this function
 150  * (dst.msk = arg), rather than the semantically cleaner (dst = SEL
 151  * arg0, arg1, msk)
 152  *
 153  * That means this is the only instruction which doesn't write a full
 154  * 4 dwords out.  This would make such a program harder to analyse,
 155  * but it looks like analysis is going to take place on a higher level
 156  * anyway.
 157  */
 158 static void do_MSK( struct arb_vp_machine *m, union instruction op )
 159 {
 160    GLfloat *dst = m->File[0][op.msk.dst];
 161    const GLfloat *arg = m->File[op.msk.file][op.msk.idx];
 162
 163    if (op.msk.mask & 0x1) dst[0] = arg[0];
 164    if (op.msk.mask & 0x2) dst[1] = arg[1];
 165    if (op.msk.mask & 0x4) dst[2] = arg[2];
 166    if (op.msk.mask & 0x8) dst[3] = arg[3];
 167 }
 168
 169
 170 static void do_PRT( struct arb_vp_machine *m, union instruction op )
 171 {
 172    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 173
 174    _mesa_printf("%d: %f %f %f %f\n", m->vtx_nr,
 175                 arg0[0], arg0[1], arg0[2], arg0[3]);
 176 }
 177
 178
 179 /**
 180  * The traditional ALU and texturing instructions.  All operate on
 181  * internal registers and ignore write masks and swizzling issues.
 182  */
 183
 184 static void do_ABS( struct arb_vp_machine *m, union instruction op )
 185 {
 186    GLfloat *result = m->File[0][op.alu.dst];
 187    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 188
 189    result[0] = (arg0[0] < 0.0) ? -arg0[0] : arg0[0];
 190    result[1] = (arg0[1] < 0.0) ? -arg0[1] : arg0[1];
 191    result[2] = (arg0[2] < 0.0) ? -arg0[2] : arg0[2];
 192    result[3] = (arg0[3] < 0.0) ? -arg0[3] : arg0[3];
 193 }
 194
 195 static void do_ADD( struct arb_vp_machine *m, union instruction op )
 196 {
 197    GLfloat *result = m->File[0][op.alu.dst];
 198    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 199    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 200
 201    result[0] = arg0[0] + arg1[0];
 202    result[1] = arg0[1] + arg1[1];
 203    result[2] = arg0[2] + arg1[2];
 204    result[3] = arg0[3] + arg1[3];
 205 }
 206
 207
 208 static void do_DP3( struct arb_vp_machine *m, union instruction op )
 209 {
 210    GLfloat *result = m->File[0][op.alu.dst];
 211    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 212    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 213
 214    result[0] = (arg0[0] * arg1[0] +
 215                 arg0[1] * arg1[1] +
 216                 arg0[2] * arg1[2]);
 217
 218    PUFF(result);
 219 }
 220
 221
 222
 223 static void do_DP4( struct arb_vp_machine *m, union instruction op )
 224 {
 225    GLfloat *result = m->File[0][op.alu.dst];
 226    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 227    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 228
 229    result[0] = (arg0[0] * arg1[0] +
 230                 arg0[1] * arg1[1] +
 231                 arg0[2] * arg1[2] +
 232                 arg0[3] * arg1[3]);
 233
 234    PUFF(result);
 235 }
 236
 237 static void do_DPH( struct arb_vp_machine *m, union instruction op )
 238 {
 239    GLfloat *result = m->File[0][op.alu.dst];
 240    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 241    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 242
 243    result[0] = (arg0[0] * arg1[0] +
 244                 arg0[1] * arg1[1] +
 245                 arg0[2] * arg1[2] +
 246                 1.0     * arg1[3]);
 247
 248    PUFF(result);
 249 }
 250
 251 static void do_DST( struct arb_vp_machine *m, union instruction op )
 252 {
 253    GLfloat *result = m->File[0][op.alu.dst];
 254    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 255    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 256
 257    /* This should be ok even if result == arg0 or result == arg1.
 258     */
 259    result[0] = 1.0F;
 260    result[1] = arg0[1] * arg1[1];
 261    result[2] = arg0[2];
 262    result[3] = arg1[3];
 263 }
 264
 265
 266 /* Intended to be high precision:
 267  */
 268 static void do_EX2( struct arb_vp_machine *m, union instruction op )
 269 {
 270    GLfloat *result = m->File[0][op.alu.dst];
 271    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 272
 273    result[0] = (GLfloat)ApproxExp2(arg0[0]);
 274    PUFF(result);
 275 }
 276
 277
 278 /* Allowed to be lower precision:
 279  */
 280 static void do_EXP( struct arb_vp_machine *m, union instruction op )
 281 {
 282    GLfloat *result = m->File[0][op.alu.dst];
 283    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 284    GLfloat tmp = arg0[0];
 285    GLfloat flr_tmp = FLOORF(tmp);
 286    GLfloat frac_tmp = tmp - flr_tmp;
 287
 288    result[0] = LDEXPF(1.0, (int)flr_tmp);
 289    result[1] = frac_tmp;
 290    result[2] = LDEXPF(rough_approx_log2_0_1(frac_tmp), (int)flr_tmp);
 291    result[3] = 1.0F;
 292 }
 293
 294 static void do_FLR( struct arb_vp_machine *m, union instruction op )
 295 {
 296    GLfloat *result = m->File[0][op.alu.dst];
 297    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 298
 299    result[0] = FLOORF(arg0[0]);
 300    result[1] = FLOORF(arg0[1]);
 301    result[2] = FLOORF(arg0[2]);
 302    result[3] = FLOORF(arg0[3]);
 303 }
 304
 305 static void do_FRC( struct arb_vp_machine *m, union instruction op )
 306 {
 307    GLfloat *result = m->File[0][op.alu.dst];
 308    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 309
 310    result[0] = arg0[0] - FLOORF(arg0[0]);
 311    result[1] = arg0[1] - FLOORF(arg0[1]);
 312    result[2] = arg0[2] - FLOORF(arg0[2]);
 313    result[3] = arg0[3] - FLOORF(arg0[3]);
 314 }
 315
 316 /* High precision log base 2:
 317  */
 318 static void do_LG2( struct arb_vp_machine *m, union instruction op )
 319 {
 320    GLfloat *result = m->File[0][op.alu.dst];
 321    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 322
 323    result[0] = ApproxLog2(arg0[0]);
 324    PUFF(result);
 325 }
 326
 327
 328
 329 static void do_LIT( struct arb_vp_machine *m, union instruction op )
 330 {
 331    GLfloat *result = m->File[0][op.alu.dst];
 332    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 333    GLfloat tmp[4];
 334
 335    tmp[0] = 1.0;
 336    tmp[1] = arg0[0];
 337    if (arg0[0] > 0.0) {
 338       tmp[2] = RoughApproxPower(arg0[1], arg0[3]);
 339    }
 340    else {
 341       tmp[2] = 0.0;
 342    }
 343    tmp[3] = 1.0;
 344
 345
 346    COPY_4V(result, tmp);
 347 }
 348
 349
 350 /* Intended to allow a lower precision than required for LG2 above.
 351  */
 352 static void do_LOG( struct arb_vp_machine *m, union instruction op )
 353 {
 354    GLfloat *result = m->File[0][op.alu.dst];
 355    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 356    GLfloat tmp = FABSF(arg0[0]);
 357    int exponent;
 358    GLfloat mantissa = FREXPF(tmp, &exponent);
 359
 360    result[0] = (GLfloat) (exponent - 1);
 361    result[1] = 2.0 * mantissa; /* map [.5, 1) -> [1, 2) */
 362    result[2] = exponent + LOG2(mantissa);
 363    result[3] = 1.0;
 364 }
 365
 366 static void do_MAX( struct arb_vp_machine *m, union instruction op )
 367 {
 368    GLfloat *result = m->File[0][op.alu.dst];
 369    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 370    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 371
 372    result[0] = (arg0[0] > arg1[0]) ? arg0[0] : arg1[0];
 373    result[1] = (arg0[1] > arg1[1]) ? arg0[1] : arg1[1];
 374    result[2] = (arg0[2] > arg1[2]) ? arg0[2] : arg1[2];
 375    result[3] = (arg0[3] > arg1[3]) ? arg0[3] : arg1[3];
 376 }
 377
 378
 379 static void do_MIN( struct arb_vp_machine *m, union instruction op )
 380 {
 381    GLfloat *result = m->File[0][op.alu.dst];
 382    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 383    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 384
 385    result[0] = (arg0[0] < arg1[0]) ? arg0[0] : arg1[0];
 386    result[1] = (arg0[1] < arg1[1]) ? arg0[1] : arg1[1];
 387    result[2] = (arg0[2] < arg1[2]) ? arg0[2] : arg1[2];
 388    result[3] = (arg0[3] < arg1[3]) ? arg0[3] : arg1[3];
 389 }
 390
 391 static void do_MOV( struct arb_vp_machine *m, union instruction op )
 392 {
 393    GLfloat *result = m->File[0][op.alu.dst];
 394    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 395
 396    result[0] = arg0[0];
 397    result[1] = arg0[1];
 398    result[2] = arg0[2];
 399    result[3] = arg0[3];
 400 }
 401
 402 static void do_MUL( struct arb_vp_machine *m, union instruction op )
 403 {
 404    GLfloat *result = m->File[0][op.alu.dst];
 405    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 406    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 407
 408    result[0] = arg0[0] * arg1[0];
 409    result[1] = arg0[1] * arg1[1];
 410    result[2] = arg0[2] * arg1[2];
 411    result[3] = arg0[3] * arg1[3];
 412 }
 413
 414
 415 /* Intended to be "high" precision
 416  */
 417 static void do_POW( struct arb_vp_machine *m, union instruction op )
 418 {
 419    GLfloat *result = m->File[0][op.alu.dst];
 420    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 421    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 422
 423    result[0] = (GLfloat)ApproxPower(arg0[0], arg1[0]);
 424    PUFF(result);
 425 }
 426
 427 static void do_REL( struct arb_vp_machine *m, union instruction op )
 428 {
 429    GLfloat *result = m->File[0][op.alu.dst];
 430    GLuint idx = (op.alu.idx0 + (GLint)m->File[0][REG_ADDR][0]) & (MAX_NV_VERTEX_PROGRAM_PARAMS-1);
 431    const GLfloat *arg0 = m->File[op.alu.file0][idx];
 432
 433    result[0] = arg0[0];
 434    result[1] = arg0[1];
 435    result[2] = arg0[2];
 436    result[3] = arg0[3];
 437 }
 438
 439 static void do_RCP( struct arb_vp_machine *m, union instruction op )
 440 {
 441    GLfloat *result = m->File[0][op.alu.dst];
 442    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 443
 444    result[0] = 1.0F / arg0[0];
 445    PUFF(result);
 446 }
 447
 448 static void do_RSQ( struct arb_vp_machine *m, union instruction op )
 449 {
 450    GLfloat *result = m->File[0][op.alu.dst];
 451    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 452
 453    result[0] = INV_SQRTF(FABSF(arg0[0]));
 454    PUFF(result);
 455 }
 456
 457
 458 static void do_SGE( struct arb_vp_machine *m, union instruction op )
 459 {
 460    GLfloat *result = m->File[0][op.alu.dst];
 461    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 462    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 463
 464    result[0] = (arg0[0] >= arg1[0]) ? 1.0F : 0.0F;
 465    result[1] = (arg0[1] >= arg1[1]) ? 1.0F : 0.0F;
 466    result[2] = (arg0[2] >= arg1[2]) ? 1.0F : 0.0F;
 467    result[3] = (arg0[3] >= arg1[3]) ? 1.0F : 0.0F;
 468 }
 469
 470
 471 static void do_SLT( struct arb_vp_machine *m, union instruction op )
 472 {
 473    GLfloat *result = m->File[0][op.alu.dst];
 474    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 475    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 476
 477    result[0] = (arg0[0] < arg1[0]) ? 1.0F : 0.0F;
 478    result[1] = (arg0[1] < arg1[1]) ? 1.0F : 0.0F;
 479    result[2] = (arg0[2] < arg1[2]) ? 1.0F : 0.0F;
 480    result[3] = (arg0[3] < arg1[3]) ? 1.0F : 0.0F;
 481 }
 482
 483 static void do_SUB( struct arb_vp_machine *m, union instruction op )
 484 {
 485    GLfloat *result = m->File[0][op.alu.dst];
 486    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 487    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 488
 489    result[0] = arg0[0] - arg1[0];
 490    result[1] = arg0[1] - arg1[1];
 491    result[2] = arg0[2] - arg1[2];
 492    result[3] = arg0[3] - arg1[3];
 493 }
 494
 495
 496 static void do_XPD( struct arb_vp_machine *m, union instruction op )
 497 {
 498    GLfloat *result = m->File[0][op.alu.dst];
 499    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 500    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 501    GLfloat tmp[3];
 502
 503    tmp[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1];
 504    tmp[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2];
 505    tmp[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0];
 506
 507    /* Need a temporary to be correct in the case where result == arg0
 508     * or result == arg1.
 509     */
 510    result[0] = tmp[0];
 511    result[1] = tmp[1];
 512    result[2] = tmp[2];
 513 }
 514
 515 static void do_NOP( struct arb_vp_machine *m, union instruction op )
 516 {
 517 }
 518
 519 /* Some useful debugging functions:
 520  */
 521 static void print_mask( GLuint mask )
 522 {
 523    _mesa_printf(".");
 524    if (mask&0x1) _mesa_printf("x");
 525    if (mask&0x2) _mesa_printf("y");
 526    if (mask&0x4) _mesa_printf("z");
 527    if (mask&0x8) _mesa_printf("w");
 528 }
 529
 530 static void print_reg( GLuint file, GLuint reg )
 531 {
 532    static const char *reg_file[] = {
 533       "REG",
 534       "LOCAL_PARAM",
 535       "ENV_PARAM",
 536       "STATE_VAR",
 537    };
 538
 539    if (file == 0) {
 540       if (reg == REG_RES)
 541          _mesa_printf("RES");
 542       else if (reg >= REG_ARG0 && reg <= REG_ARG1)
 543          _mesa_printf("ARG%d", reg - REG_ARG0);
 544       else if (reg >= REG_TMP0 && reg <= REG_TMP11)
 545          _mesa_printf("TMP%d", reg - REG_TMP0);
 546       else if (reg >= REG_IN0 && reg <= REG_IN31)
 547          _mesa_printf("IN%d", reg - REG_IN0);
 548       else if (reg >= REG_OUT0 && reg <= REG_OUT14)
 549          _mesa_printf("OUT%d", reg - REG_OUT0);
 550       else if (reg == REG_ADDR)
 551          _mesa_printf("ADDR");
 552       else if (reg == REG_ID)
 553          _mesa_printf("ID");
 554       else
 555          _mesa_printf("REG%d", reg);
 556    }
 557    else
 558       _mesa_printf("%s:%d", reg_file[file], reg);
 559 }
 560
 561
 562 static void print_RSW( union instruction op )
 563 {
 564    GLuint swz = op.rsw.swz;
 565    GLuint neg = op.rsw.neg;
 566    GLuint i;
 567
 568    _mesa_printf("RSW ");
 569    print_reg(0, op.rsw.dst);
 570    _mesa_printf(", ");
 571    print_reg(op.rsw.file0, op.rsw.idx0);
 572    _mesa_printf(".");
 573    for (i = 0; i < 4; i++, swz >>= 2) {
 574       const char *cswz = "xyzw";
 575       if (neg & (1<<i))
 576          _mesa_printf("-");
 577       _mesa_printf("%c", cswz[swz&0x3]);
 578    }
 579    _mesa_printf("\n");
 580 }
 581
 582
 583 static void print_ALU( union instruction op )
 584 {
 585    _mesa_printf("%s ", _mesa_opcode_string((enum prog_opcode) op.alu.opcode));
 586    print_reg(0, op.alu.dst);
 587    _mesa_printf(", ");
 588    print_reg(op.alu.file0, op.alu.idx0);
 589    if (_mesa_num_inst_src_regs((enum prog_opcode) op.alu.opcode) > 1) {
 590       _mesa_printf(", ");
 591       print_reg(op.alu.file1, op.alu.idx1);
 592    }
 593    _mesa_printf("\n");
 594 }
 595
 596 static void print_MSK( union instruction op )
 597 {
 598    _mesa_printf("MSK ");
 599    print_reg(0, op.msk.dst);
 600    print_mask(op.msk.mask);
 601    _mesa_printf(", ");
 602    print_reg(op.msk.file, op.msk.idx);
 603    _mesa_printf("\n");
 604 }
 605
 606 static void print_NOP( union instruction op )
 607 {
 608 }
 609
 610 void
 611 _tnl_disassem_vba_insn( union instruction op )
 612 {
 613    switch (op.alu.opcode) {
 614    case OPCODE_ABS:
 615    case OPCODE_ADD:
 616    case OPCODE_DP3:
 617    case OPCODE_DP4:
 618    case OPCODE_DPH:
 619    case OPCODE_DST:
 620    case OPCODE_EX2:
 621    case OPCODE_EXP:
 622    case OPCODE_FLR:
 623    case OPCODE_FRC:
 624    case OPCODE_LG2:
 625    case OPCODE_LIT:
 626    case OPCODE_LOG:
 627    case OPCODE_MAX:
 628    case OPCODE_MIN:
 629    case OPCODE_MOV:
 630    case OPCODE_MUL:
 631    case OPCODE_POW:
 632    case OPCODE_PRINT:
 633    case OPCODE_RCP:
 634    case OPCODE_RSQ:
 635    case OPCODE_SGE:
 636    case OPCODE_SLT:
 637    case OPCODE_SUB:
 638    case OPCODE_XPD:
 639       print_ALU(op);
 640       break;
 641    case OPCODE_ARA:
 642    case OPCODE_ARL:
 643    case OPCODE_ARL_NV:
 644    case OPCODE_ARR:
 645    case OPCODE_BRA:
 646    case OPCODE_CAL:
 647    case OPCODE_END:
 648    case OPCODE_MAD:
 649    case OPCODE_POPA:
 650    case OPCODE_PUSHA:
 651    case OPCODE_RCC:
 652    case OPCODE_RET:
 653    case OPCODE_SSG:
 654    case OPCODE_SWZ:
 655       print_NOP(op);
 656       break;
 657    case RSW:
 658       print_RSW(op);
 659       break;
 660    case MSK:
 661       print_MSK(op);
 662       break;
 663    case REL:
 664       print_ALU(op);
 665       break;
 666    default:
 667       _mesa_problem(NULL, "Bad opcode in _tnl_disassem_vba_insn()");
 668    }
 669 }
 670
 671
  672 static void (* const opcode_func[MAX_OPCODE+3])(struct arb_vp_machine *, union instruction) = /* Table of do_*() handlers; opcodes with no implementation in this file use do_NOP. */
  673 { /* NOTE(review): presumably indexed by opcode, so entry order must match enum prog_opcode followed by the three private opcodes - confirm against the headers. */
  674    do_ABS,
  675    do_ADD,
  676    do_NOP,/*ARA*/
  677    do_NOP,/*ARL*/
  678    do_NOP,/*ARL_NV*/
  679    do_NOP,/*ARR*/
  680    do_NOP,/*BRA*/
  681    do_NOP,/*CAL*/
  682    do_NOP,/*CMP*/
  683    do_NOP,/*COS*/
  684    do_NOP,/*DDX*/
  685    do_NOP,/*DDY*/
  686    do_DP3,
  687    do_DP4,
  688    do_DPH,
  689    do_DST,
  690    do_NOP, /* presumably END, given the alphabetical ordering of the neighbours - TODO confirm */
  691    do_EX2,
  692    do_EXP,
  693    do_FLR,
  694    do_FRC,
  695    do_NOP,/*KIL*/
  696    do_NOP,/*KIL_NV*/
  697    do_LG2,
  698    do_LIT,
  699    do_LOG,
  700    do_NOP,/*LRP*/
  701    do_NOP,/*MAD*/
  702    do_MAX,
  703    do_MIN,
  704    do_MOV,
  705    do_MUL,
  706    do_NOP,/*PK2H*/
  707    do_NOP,/*PK2US*/
  708    do_NOP,/*PK4B*/
  709    do_NOP,/*PK4UB*/
  710    do_POW,
  711    do_NOP,/*POPA*/
  712    do_PRT,
  713    do_NOP,/*PUSHA*/
  714    do_NOP,/*RCC*/
  715    do_RCP,/*RCP*/
  716    do_NOP,/*RET*/
  717    do_NOP,/*RFL*/
  718    do_RSQ,
  719    do_NOP,/*SCS*/
  720    do_NOP,/*SEQ*/
  721    do_NOP,/*SFL*/
  722    do_SGE,
  723    do_NOP,/*SGT*/
  724    do_NOP,/*SIN*/
  725    do_NOP,/*SLE*/
  726    do_SLT,
  727    do_NOP,/*SNE*/
  728    do_NOP,/*SSG*/
  729    do_NOP,/*STR*/
  730    do_SUB,
  731    do_RSW,/*SWZ*/
  732    do_NOP,/*TEX*/
  733    do_NOP,/*TXB*/
  734    do_NOP,/*TXD*/
  735    do_NOP,/*TXL*/
  736    do_NOP,/*TXP*/
  737    do_NOP,/*TXP_NV*/
  738    do_NOP,/*UP2H*/
  739    do_NOP,/*UP2US*/
  740    do_NOP,/*UP4B*/
  741    do_NOP,/*UP4UB*/
  742    do_NOP,/*X2D*/
  743    do_XPD,
  744    do_RSW, /* private opcodes from here on; presumably RSW, MSK, REL in that order - TODO confirm */
  745    do_MSK,
  746    do_REL,
  747 };
 748
 749 static union instruction *cvp_next_instruction( struct compilation *cp )
 750 {
 751    union instruction *op = cp->csr++;
 752    op->dword = 0;
 753    return op;
 754 }
 755
 756 static struct reg cvp_make_reg( GLuint file, GLuint idx )
 757 {
 758    struct reg reg;
 759    reg.file = file;
 760    reg.idx = idx;
 761    return reg;
 762 }
 763
 764 static struct reg cvp_emit_rel( struct compilation *cp,
 765                                 struct reg reg,
 766                                 struct reg tmpreg )
 767 {
 768    union instruction *op = cvp_next_instruction(cp);
 769    op->alu.opcode = REL;
 770    op->alu.file0 = reg.file;
 771    op->alu.idx0 = reg.idx;
 772    op->alu.dst = tmpreg.idx;
 773    return tmpreg;
 774 }
 775
 776
 777 static struct reg cvp_load_reg( struct compilation *cp,
 778                                 GLuint file,
 779                                 GLuint index,
 780                                 GLuint rel,
 781                                 GLuint tmpidx )
 782 {
 783    struct reg tmpreg = cvp_make_reg(FILE_REG, tmpidx);
 784    struct reg reg;
 785
 786    switch (file) {
 787    case PROGRAM_TEMPORARY:
 788       return cvp_make_reg(FILE_REG, REG_TMP0 + index);
 789
 790    case PROGRAM_INPUT:
 791       return cvp_make_reg(FILE_REG, REG_IN0 + index);
 792
 793    case PROGRAM_OUTPUT:
 794       return cvp_make_reg(FILE_REG, REG_OUT0 + index);
 795
 796       /* These two aren't populated by the parser?
 797        */
 798    case PROGRAM_LOCAL_PARAM:
 799       reg = cvp_make_reg(FILE_LOCAL_PARAM, index);
 800       if (rel)
 801          return cvp_emit_rel(cp, reg, tmpreg);
 802       else
 803          return reg;
 804
 805    case PROGRAM_ENV_PARAM:
 806       reg = cvp_make_reg(FILE_ENV_PARAM, index);
 807       if (rel)
 808          return cvp_emit_rel(cp, reg, tmpreg);
 809       else
 810          return reg;
 811
 812    case PROGRAM_STATE_VAR:
 813       reg = cvp_make_reg(FILE_STATE_PARAM, index);
 814       if (rel)
 815          return cvp_emit_rel(cp, reg, tmpreg);
 816       else
 817          return reg;
 818
 819       /* Invalid values:
 820        */
 821    case PROGRAM_WRITE_ONLY:
 822    case PROGRAM_ADDRESS:
 823    default:
 824       _mesa_problem(NULL, "Invalid register file %d in cvp_load_reg()");
 825       assert(0);
 826       return tmpreg;            /* can't happen */
 827    }
 828 }
 829
 830 static struct reg cvp_emit_arg( struct compilation *cp,
 831                                 const struct prog_src_register *src,
 832                                 GLuint arg )
 833 {
 834    struct reg reg = cvp_load_reg( cp, src->File, src->Index, src->RelAddr, arg );
 835    union instruction rsw, noop;
 836
 837    /* Emit any necessary swizzling.
 838     */
 839    rsw.dword = 0;
 840    rsw.rsw.neg = src->NegateBase ? WRITEMASK_XYZW : 0;
 841
 842    /* we're expecting 2-bit swizzles below... */
 843    ASSERT(GET_SWZ(src->Swizzle, 0) < 4);
 844    ASSERT(GET_SWZ(src->Swizzle, 1) < 4);
 845    ASSERT(GET_SWZ(src->Swizzle, 2) < 4);
 846    ASSERT(GET_SWZ(src->Swizzle, 3) < 4);
 847
 848    rsw.rsw.swz = ((GET_SWZ(src->Swizzle, 0) << 0) |
 849                   (GET_SWZ(src->Swizzle, 1) << 2) |
 850                   (GET_SWZ(src->Swizzle, 2) << 4) |
 851                   (GET_SWZ(src->Swizzle, 3) << 6));
 852
 853    noop.dword = 0;
 854    noop.rsw.neg = 0;
 855    noop.rsw.swz = RSW_NOOP;
 856
 857    if (rsw.dword != noop.dword) {
 858       union instruction *op = cvp_next_instruction(cp);
 859       struct reg rsw_reg = cvp_make_reg(FILE_REG, REG_ARG0 + arg);
 860       op->dword = rsw.dword;
 861       op->rsw.opcode = RSW;
 862       op->rsw.file0 = reg.file;
 863       op->rsw.idx0 = reg.idx;
 864       op->rsw.dst = rsw_reg.idx;
 865       return rsw_reg;
 866    }
 867    else
 868       return reg;
 869 }
 870
 871 static GLuint cvp_choose_result( struct compilation *cp,
 872                                  const struct prog_dst_register *dst,
 873                                  union instruction *fixup )
 874 {
 875    GLuint mask = dst->WriteMask;
 876    GLuint idx;
 877
 878    switch (dst->File) {
 879    case PROGRAM_TEMPORARY:
 880       idx = REG_TMP0 + dst->Index;
 881       break;
 882    case PROGRAM_OUTPUT:
 883       idx = REG_OUT0 + dst->Index;
 884       break;
 885    default:
 886       assert(0);
 887       return REG_RES;           /* can't happen */
 888    }
 889
 890    /* Optimization: When writing (with a writemask) to an undefined
 891     * value for the first time, the writemask may be ignored.
 892     */
 893    if (mask != WRITEMASK_XYZW && (cp->reg_active & (1 << idx))) {
 894       fixup->msk.opcode = MSK;
 895       fixup->msk.dst = idx;
 896       fixup->msk.file = FILE_REG;
 897       fixup->msk.idx = REG_RES;
 898       fixup->msk.mask = mask;
 899       cp->reg_active |= 1 << idx;
 900       return REG_RES;
 901    }
 902    else {
 903       fixup->dword = 0;
 904       cp->reg_active |= 1 << idx;
 905       return idx;
 906    }
 907 }
 908
 909 static struct reg cvp_emit_rsw( struct compilation *cp,
 910                                 GLuint dst,
 911                                 struct reg src,
 912                                 GLuint neg,
 913                                 GLuint swz,
 914                                 GLboolean force)
 915 {
 916    struct reg retval;
 917
 918    if (swz != RSW_NOOP || neg != 0) {
 919       union instruction *op = cvp_next_instruction(cp);
 920       op->rsw.opcode = RSW;
 921       op->rsw.dst = dst;
 922       op->rsw.file0 = src.file;
 923       op->rsw.idx0 = src.idx;
 924       op->rsw.neg = neg;
 925       op->rsw.swz = swz;
 926
 927       retval.file = FILE_REG;
 928       retval.idx = dst;
 929       return retval;
 930    }
 931    else if (force) {
 932       /* Oops.  Degenerate case:
 933        */
 934       union instruction *op = cvp_next_instruction(cp);
 935       op->alu.opcode = OPCODE_MOV;
 936       op->alu.dst = dst;
 937       op->alu.file0 = src.file;
 938       op->alu.idx0 = src.idx;
 939
 940       retval.file = FILE_REG;
 941       retval.idx = dst;
 942       return retval;
 943    }
 944    else {
 945       return src;
 946    }
 947 }
 948
 949
 950 static void cvp_emit_inst( struct compilation *cp,
 951                            const struct prog_instruction *inst )
 952 {
 953    union instruction *op;
 954    union instruction fixup;
 955    struct reg reg[3];
 956    GLuint result, nr_args, i;
 957
 958    assert(sizeof(*op) == sizeof(long long));
 959
 960    /* Need to handle SWZ, ARL specially.
 961     */
 962    switch (inst->Opcode) {
 963       /* Split into mul and add:
 964        */
 965    case OPCODE_MAD:
 966       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
 967       for (i = 0; i < 3; i++)
 968          reg[i] = cvp_emit_arg( cp, &inst->SrcReg[i], REG_ARG0+i );
 969
 970       op = cvp_next_instruction(cp);
 971       op->alu.opcode = OPCODE_MUL;
 972       op->alu.file0 = reg[0].file;
 973       op->alu.idx0 = reg[0].idx;
 974       op->alu.file1 = reg[1].file;
 975       op->alu.idx1 = reg[1].idx;
 976       op->alu.dst = REG_ARG0;
 977
 978       op = cvp_next_instruction(cp);
 979       op->alu.opcode = OPCODE_ADD;
 980       op->alu.file0 = FILE_REG;
 981       op->alu.idx0 = REG_ARG0;
 982       op->alu.file1 = reg[2].file;
 983       op->alu.idx1 = reg[2].idx;
 984       op->alu.dst = result;
 985
 986       if (result == REG_RES) {
 987          op = cvp_next_instruction(cp);
 988          op->dword = fixup.dword;
 989       }
 990       break;
 991
 992    case OPCODE_ARL:
 993       reg[0] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 );
 994
 995       op = cvp_next_instruction(cp);
 996       op->alu.opcode = OPCODE_FLR;
 997       op->alu.dst = REG_ADDR;
 998       op->alu.file0 = reg[0].file;
 999       op->alu.idx0 = reg[0].idx;
1000       break;
1001
1002    case OPCODE_SWZ: {
1003       GLuint swz0 = 0, swz1 = 0;
1004       GLuint neg0 = 0, neg1 = 0;
1005       GLuint mask = 0;
1006
1007       /* Translate 3-bit-per-element swizzle into two 2-bit swizzles,
1008        * one from the source register the other from a constant
1009        * {0,0,0,1}.
1010        */
1011       for (i = 0; i < 4; i++) {
1012          GLuint swzelt = GET_SWZ(inst->SrcReg[0].Swizzle, i);
1013          if (swzelt >= SWIZZLE_ZERO) {
1014             neg0 |= inst->SrcReg[0].NegateBase & (1<<i);
1015             if (swzelt == SWIZZLE_ONE)
1016                swz0 |= SWIZZLE_W << (i*2);
1017             else if (i < SWIZZLE_W)
1018                swz0 |= i << (i*2);
1019          }
1020          else {
1021             mask |= 1<<i;
1022             neg1 |= inst->SrcReg[0].NegateBase & (1<<i);
1023             swz1 |= swzelt << (i*2);
1024          }
1025       }
1026
1027       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
1028       reg[0].file = FILE_REG;
1029       reg[0].idx = REG_ID;
1030       reg[1] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 );
1031
1032       if (mask == WRITEMASK_XYZW) {
1033          cvp_emit_rsw(cp, result, reg[0], neg0, swz0, GL_TRUE);
1034
1035       }
1036       else if (mask == 0) {
1037          cvp_emit_rsw(cp, result, reg[1], neg1, swz1, GL_TRUE);
1038       }
1039       else {
1040          cvp_emit_rsw(cp, result, reg[0], neg0, swz0, GL_TRUE);
1041          reg[1] = cvp_emit_rsw(cp, REG_ARG0, reg[1], neg1, swz1, GL_FALSE);
1042
1043          op = cvp_next_instruction(cp);
1044          op->msk.opcode = MSK;
1045          op->msk.dst = result;
1046          op->msk.file = reg[1].file;
1047          op->msk.idx = reg[1].idx;
1048          op->msk.mask = mask;
1049       }
1050
1051       if (result == REG_RES) {
1052          op = cvp_next_instruction(cp);
1053          op->dword = fixup.dword;
1054       }
1055       break;
1056    }
1057
1058    case OPCODE_END:
1059       break;
1060
1061    default:
1062       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
1063       nr_args = _mesa_num_inst_src_regs(inst->Opcode);
1064       for (i = 0; i < nr_args; i++)
1065          reg[i] = cvp_emit_arg( cp, &inst->SrcReg[i], REG_ARG0 + i );
1066
1067       op = cvp_next_instruction(cp);
1068       op->alu.opcode = inst->Opcode;
1069       op->alu.file0 = reg[0].file;
1070       op->alu.idx0 = reg[0].idx;
1071       op->alu.file1 = reg[1].file;
1072       op->alu.idx1 = reg[1].idx;
1073       op->alu.dst = result;
1074
1075       if (result == REG_RES) {
1076          op = cvp_next_instruction(cp);
1077          op->dword = fixup.dword;
1078       }
1079       break;
1080    }
1081 }
1082
1083 static void free_tnl_data( struct vertex_program *program  )
1084 {
1085    struct tnl_compiled_program *p = (struct tnl_compiled_program *) program->TnlData;
1086    if (p->compiled_func)
1087       _mesa_free((void *)p->compiled_func);
1088    _mesa_free(p);
1089    program->TnlData = NULL;
1090 }
1091
1092 static void compile_vertex_program( struct vertex_program *program,
1093                                     GLboolean try_codegen )
1094 {
1095    struct compilation cp;
1096    struct tnl_compiled_program *p = CALLOC_STRUCT(tnl_compiled_program);
1097    GLuint i;
1098
1099    if (program->TnlData)
1100       free_tnl_data( program );
1101
1102    program->TnlData = p;
1103
1104    /* Initialize cp.  Note that ctx and VB aren't used in compilation
1105     * so we don't have to worry about statechanges:
1106     */
1107    _mesa_memset(&cp, 0, sizeof(cp));
1108    cp.csr = p->instructions;
1109
1110    /* Compile instructions:
1111     */
1112    for (i = 0; i < program->Base.NumInstructions; i++) {
1113       cvp_emit_inst(&cp, &program->Base.Instructions[i]);
1114    }
1115
1116    /* Finish up:
1117     */
1118    p->nr_instructions = cp.csr - p->instructions;
1119
1120    /* Print/disassemble:
1121     */
1122    if (DISASSEM) {
1123       for (i = 0; i < p->nr_instructions; i++) {
1124          _tnl_disassem_vba_insn(p->instructions[i]);
1125       }
1126       _mesa_printf("\n\n");
1127    }
1128
1129 #ifdef USE_SSE_ASM
1130    if (try_codegen)
1131       _tnl_sse_codegen_vertex_program(p);
1132 #endif
1133
1134 }
1135
1136
1137
1138
1139 /* ----------------------------------------------------------------------
1140  * Execution
1141  */
1142 static void userclip( GLcontext *ctx,
1143                       GLvector4f *clip,
1144                       GLubyte *clipmask,
1145                       GLubyte *clipormask,
1146                       GLubyte *clipandmask )
1147 {
1148    GLuint p;
1149
1150    for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
1151       if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
1152          GLuint nr, i;
1153          const GLfloat a = ctx->Transform._ClipUserPlane[p][0];
1154          const GLfloat b = ctx->Transform._ClipUserPlane[p][1];
1155          const GLfloat c = ctx->Transform._ClipUserPlane[p][2];
1156          const GLfloat d = ctx->Transform._ClipUserPlane[p][3];
1157          GLfloat *coord = (GLfloat *)clip->data;
1158          GLuint stride = clip->stride;
1159          GLuint count = clip->count;
1160
1161          for (nr = 0, i = 0 ; i < count ; i++) {
1162             GLfloat dp = (coord[0] * a +
1163                           coord[1] * b +
1164                           coord[2] * c +
1165                           coord[3] * d);
1166
1167             if (dp < 0) {
1168                nr++;
1169                clipmask[i] |= CLIP_USER_BIT;
1170             }
1171
1172             STRIDE_F(coord, stride);
1173          }
1174
1175          if (nr > 0) {
1176             *clipormask |= CLIP_USER_BIT;
1177             if (nr == count) {
1178                *clipandmask |= CLIP_USER_BIT;
1179                return;
1180             }
1181          }
1182       }
1183    }
1184 }
1185
1186
1187 static GLboolean
1188 do_ndc_cliptest(GLcontext *ctx, struct arb_vp_machine *m)
1189 {
1190    TNLcontext *tnl = TNL_CONTEXT(ctx);
1191    struct vertex_buffer *VB = m->VB;
1192
1193    /* Cliptest and perspective divide.  Clip functions must clear
1194     * the clipmask.
1195     */
1196    m->ormask = 0;
1197    m->andmask = CLIP_ALL_BITS;
1198
1199    if (tnl->NeedNdcCoords) {
1200       VB->NdcPtr =
1201          _mesa_clip_tab[VB->ClipPtr->size]( VB->ClipPtr,
1202                                             &m->ndcCoords,
1203                                             m->clipmask,
1204                                             &m->ormask,
1205                                             &m->andmask );
1206    }
1207    else {
1208       VB->NdcPtr = NULL;
1209       _mesa_clip_np_tab[VB->ClipPtr->size]( VB->ClipPtr,
1210                                             NULL,
1211                                             m->clipmask,
1212                                             &m->ormask,
1213                                             &m->andmask );
1214    }
1215
1216    if (m->andmask) {
1217       /* All vertices are outside the frustum */
1218       return GL_FALSE;
1219    }
1220
1221    /* Test userclip planes.  This contributes to VB->ClipMask.
1222     */
1223    if (ctx->Transform.ClipPlanesEnabled && !ctx->VertexProgram._Enabled) {
1224       userclip( ctx,
1225                 VB->ClipPtr,
1226                 m->clipmask,
1227                 &m->ormask,
1228                 &m->andmask );
1229
1230       if (m->andmask) {
1231          return GL_FALSE;
1232       }
1233    }
1234
1235    VB->ClipAndMask = m->andmask;
1236    VB->ClipOrMask = m->ormask;
1237    VB->ClipMask = m->clipmask;
1238
1239    return GL_TRUE;
1240 }
1241
1242
1243 static INLINE void call_func( struct tnl_compiled_program *p,
1244                               struct arb_vp_machine *m )
1245 {
1246    p->compiled_func(m);
1247 }
1248
1249 /**
1250  * Execute the given vertex program.
1251  *
1252  * TODO: Integrate the t_vertex.c code here, to build machine vertices
1253  * directly at this point.
1254  *
1255  * TODO: Eliminate the VB struct entirely and just use
1256  * struct arb_vertex_machine.
1257  */
1258 static GLboolean
1259 run_arb_vertex_program(GLcontext *ctx, struct tnl_pipeline_stage *stage)
1260 {
1261    struct vertex_program *program = (ctx->VertexProgram._Enabled ?
1262                                      ctx->VertexProgram.Current :
1263                                      ctx->_TnlProgram);
1264    struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
1265    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
1266    struct tnl_compiled_program *p;
1267    GLuint i, j;
1268    GLbitfield outputs;
1269
1270    if (!program || program->IsNVProgram)
1271       return GL_TRUE;
1272
1273    if (program->Base.Parameters) {
1274       _mesa_load_state_parameters(ctx, program->Base.Parameters);
1275    }
1276
1277    p = (struct tnl_compiled_program *)program->TnlData;
1278    assert(p);
1279
1280
1281    m->nr_inputs = m->nr_outputs = 0;
1282
1283    for (i = 0; i < _TNL_ATTRIB_MAX; i++) {
1284       if (program->Base.InputsRead & (1<<i)) {
1285          GLuint j = m->nr_inputs++;
1286          m->input[j].idx = i;
1287          m->input[j].data = (GLfloat *)m->VB->AttribPtr[i]->data;
1288          m->input[j].stride = m->VB->AttribPtr[i]->stride;
1289          m->input[j].size = m->VB->AttribPtr[i]->size;
1290          ASSIGN_4V(m->File[0][REG_IN0 + i], 0, 0, 0, 1);
1291       }
1292    }
1293
1294    for (i = 0; i < VERT_RESULT_MAX; i++) {
1295       if (program->Base.OutputsWritten & (1 << i)) {
1296          GLuint j = m->nr_outputs++;
1297          m->output[j].idx = i;
1298          m->output[j].data = (GLfloat *)m->attribs[i].data;
1299       }
1300    }
1301
1302
1303    /* Run the actual program:
1304     */
1305    for (m->vtx_nr = 0; m->vtx_nr < VB->Count; m->vtx_nr++) {
1306       for (j = 0; j < m->nr_inputs; j++) {
1307          GLuint idx = REG_IN0 + m->input[j].idx;
1308          switch (m->input[j].size) {
1309          case 4: m->File[0][idx][3] = m->input[j].data[3];
1310          case 3: m->File[0][idx][2] = m->input[j].data[2];
1311          case 2: m->File[0][idx][1] = m->input[j].data[1];
1312          case 1: m->File[0][idx][0] = m->input[j].data[0];
1313          }
1314
1315          STRIDE_F(m->input[j].data, m->input[j].stride);
1316       }
1317
1318       if (p->compiled_func) {
1319          call_func( p, m );
1320       }
1321       else {
1322          for (j = 0; j < p->nr_instructions; j++) {
1323             union instruction inst = p->instructions[j];
1324             opcode_func[inst.alu.opcode]( m, inst );
1325          }
1326       }
1327
1328       for (j = 0; j < m->nr_outputs; j++) {
1329          GLuint idx = REG_OUT0 + m->output[j].idx;
1330          m->output[j].data[0] = m->File[0][idx][0];
1331          m->output[j].data[1] = m->File[0][idx][1];
1332          m->output[j].data[2] = m->File[0][idx][2];
1333          m->output[j].data[3] = m->File[0][idx][3];
1334          m->output[j].data += 4;
1335       }
1336    }
1337
1338    /* Setup the VB pointers so that the next pipeline stages get
1339     * their data from the right place (the program output arrays).
1340     *
1341     * TODO: 1) Have tnl use these RESULT values for outputs rather
1342     * than trying to shoe-horn inputs and outputs into one set of
1343     * values.
1344     *
1345     * TODO: 2) Integrate t_vertex.c so that we just go straight ahead
1346     * and build machine vertices here.
1347     */
1348    VB->ClipPtr = &m->attribs[VERT_RESULT_HPOS];
1349    VB->ClipPtr->count = VB->Count;
1350
1351    outputs = program->Base.OutputsWritten;
1352
1353    if (outputs & (1<<VERT_RESULT_COL0)) {
1354       VB->ColorPtr[0] = &m->attribs[VERT_RESULT_COL0];
1355       VB->AttribPtr[VERT_ATTRIB_COLOR0] = VB->ColorPtr[0];
1356    }
1357
1358    if (outputs & (1<<VERT_RESULT_BFC0)) {
1359       VB->ColorPtr[1] = &m->attribs[VERT_RESULT_BFC0];
1360    }
1361
1362    if (outputs & (1<<VERT_RESULT_COL1)) {
1363       VB->SecondaryColorPtr[0] = &m->attribs[VERT_RESULT_COL1];
1364       VB->AttribPtr[VERT_ATTRIB_COLOR1] = VB->SecondaryColorPtr[0];
1365    }
1366
1367    if (outputs & (1<<VERT_RESULT_BFC1)) {
1368       VB->SecondaryColorPtr[1] = &m->attribs[VERT_RESULT_BFC1];
1369    }
1370
1371    if (outputs & (1<<VERT_RESULT_FOGC)) {
1372       VB->FogCoordPtr = &m->attribs[VERT_RESULT_FOGC];
1373       VB->AttribPtr[VERT_ATTRIB_FOG] = VB->FogCoordPtr;
1374    }
1375
1376    if (outputs & (1<<VERT_RESULT_PSIZ)) {
1377       VB->PointSizePtr = &m->attribs[VERT_RESULT_PSIZ];
1378       VB->AttribPtr[_TNL_ATTRIB_POINTSIZE] = &m->attribs[VERT_RESULT_PSIZ];
1379    }
1380
1381    for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
1382       if (outputs & (1<<(VERT_RESULT_TEX0+i))) {
1383          VB->TexCoordPtr[i] = &m->attribs[VERT_RESULT_TEX0 + i];
1384          VB->AttribPtr[VERT_ATTRIB_TEX0+i] = VB->TexCoordPtr[i];
1385       }
1386    }
1387
1388 #if 0
1389    for (i = 0; i < VB->Count; i++) {
1390       printf("Out %d: %f %f %f %f %f %f %f %f\n", i,
1391              VEC_ELT(VB->ClipPtr, GLfloat, i)[0],
1392              VEC_ELT(VB->ClipPtr, GLfloat, i)[1],
1393              VEC_ELT(VB->ClipPtr, GLfloat, i)[2],
1394              VEC_ELT(VB->ClipPtr, GLfloat, i)[3],
1395              VEC_ELT(VB->TexCoordPtr[0], GLfloat, i)[0],
1396              VEC_ELT(VB->TexCoordPtr[0], GLfloat, i)[1],
1397              VEC_ELT(VB->TexCoordPtr[0], GLfloat, i)[2],
1398              VEC_ELT(VB->TexCoordPtr[0], GLfloat, i)[3]);
1399    }
1400 #endif
1401
1402    /* Perform NDC and cliptest operations:
1403     */
1404    return do_ndc_cliptest(ctx, m);
1405 }
1406
1407
1408 static void
1409 validate_vertex_program( GLcontext *ctx, struct tnl_pipeline_stage *stage )
1410 {
1411    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
1412    struct vertex_program *program =
1413       (ctx->VertexProgram._Enabled ? ctx->VertexProgram.Current : 0);
1414
1415    if (!program && ctx->_MaintainTnlProgram) {
1416       program = ctx->_TnlProgram;
1417    }
1418
1419    if (program) {
1420       if (!program->TnlData)
1421          compile_vertex_program( program, m->try_codegen );
1422
1423       /* Grab the state GL state and put into registers:
1424        */
1425       m->File[FILE_LOCAL_PARAM] = program->Base.LocalParams;
1426       m->File[FILE_ENV_PARAM] = ctx->VertexProgram.Parameters;
1427       /* GL_NV_vertex_programs can't reference GL state */
1428       if (program->Base.Parameters)
1429          m->File[FILE_STATE_PARAM] = program->Base.Parameters->ParameterValues;
1430       else
1431          m->File[FILE_STATE_PARAM] = NULL;
1432    }
1433 }
1434
1435
1436
1437
1438
1439
1440
1441 /**
1442  * Called the first time stage->run is called.  In effect, don't
1443  * allocate data until the first time the stage is run.
1444  */
1445 static GLboolean init_vertex_program( GLcontext *ctx,
1446                                       struct tnl_pipeline_stage *stage )
1447 {
1448    TNLcontext *tnl = TNL_CONTEXT(ctx);
1449    struct vertex_buffer *VB = &(tnl->vb);
1450    struct arb_vp_machine *m;
1451    const GLuint size = VB->Size;
1452    GLuint i;
1453
1454    stage->privatePtr = _mesa_malloc(sizeof(*m));
1455    m = ARB_VP_MACHINE(stage);
1456    if (!m)
1457       return GL_FALSE;
1458
1459    /* arb_vertex_machine struct should subsume the VB:
1460     */
1461    m->VB = VB;
1462
1463    m->File[0] = (GLfloat(*)[4])ALIGN_MALLOC(REG_MAX * sizeof(GLfloat) * 4, 16);
1464
1465    /* Initialize regs where necessary:
1466     */
1467    ASSIGN_4V(m->File[0][REG_ID], 0, 0, 0, 1);
1468    ASSIGN_4V(m->File[0][REG_ONES], 1, 1, 1, 1);
1469    ASSIGN_4V(m->File[0][REG_SWZ], -1, 1, 0, 0);
1470    ASSIGN_4V(m->File[0][REG_NEG], -1, -1, -1, -1);
1471    ASSIGN_4V(m->File[0][REG_LIT], 1, 0, 0, 1);
1472    ASSIGN_4V(m->File[0][REG_LIT2], 1, .5, .2, 1); /* debug value */
1473
1474    if (_mesa_getenv("MESA_EXPERIMENTAL"))
1475       m->try_codegen = 1;
1476
1477    /* Allocate arrays of vertex output values */
1478    for (i = 0; i < VERT_RESULT_MAX; i++) {
1479       _mesa_vector4f_alloc( &m->attribs[i], 0, size, 32 );
1480       m->attribs[i].size = 4;
1481    }
1482
1483    /* a few other misc allocations */
1484    _mesa_vector4f_alloc( &m->ndcCoords, 0, size, 32 );
1485    m->clipmask = (GLubyte *) ALIGN_MALLOC(sizeof(GLubyte)*size, 32 );
1486
1487    if (ctx->_MaintainTnlProgram)
1488       _mesa_allow_light_in_model( ctx, GL_FALSE );
1489
1490    m->fpucntl_rnd_neg = RND_NEG_FPU; /* const value */
1491    m->fpucntl_restore = RESTORE_FPU; /* const value */
1492
1493    return GL_TRUE;
1494 }
1495
1496
1497
1498
1499 /**
1500  * Destructor for this pipeline stage.
1501  */
1502 static void dtr( struct tnl_pipeline_stage *stage )
1503 {
1504    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
1505
1506    if (m) {
1507       GLuint i;
1508
1509       /* free the vertex program result arrays */
1510       for (i = 0; i < VERT_RESULT_MAX; i++)
1511          _mesa_vector4f_free( &m->attribs[i] );
1512
1513       /* free misc arrays */
1514       _mesa_vector4f_free( &m->ndcCoords );
1515       ALIGN_FREE( m->clipmask );
1516       ALIGN_FREE( m->File[0] );
1517
1518       _mesa_free( m );
1519       stage->privatePtr = NULL;
1520    }
1521 }
1522
/**
 * Public description of this pipeline stage.
 *
 * Binds the ARB vertex program stage's callbacks defined in this file
 * into a tnl_pipeline_stage.  The initializer is positional: the
 * trailing comments name the field each entry is meant to fill (the
 * struct layout itself is declared outside this file).  The private
 * data pointer starts out NULL; init_vertex_program() fills in
 * stage->privatePtr and dtr() clears it again.
 */
const struct tnl_pipeline_stage _tnl_arb_vertex_program_stage =
{
   "vertex-program",
   NULL,                        /* private_data */
   init_vertex_program,         /* create */
   dtr,                         /* destroy */
   validate_vertex_program,     /* validate */
   run_arb_vertex_program       /* run */
};
1535
1536
1537 /**
1538  * Called via ctx->Driver.ProgramStringNotify() after a new vertex program
1539  * string has been parsed.
1540  */
1541 void
1542 _tnl_program_string(GLcontext *ctx, GLenum target, struct program *program)
1543 {
1544    if (program->Target == GL_VERTEX_PROGRAM_ARB) {
1545       /* free any existing tnl data hanging off the program */
1546       struct vertex_program *vprog = (struct vertex_program *) program;
1547       if (vprog->TnlData) {
1548          free_tnl_data(vprog);
1549       }
1550    }
1551 }