src/mesa/tnl/t_vb_arbprogram.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  6.5
   4  *
   5  * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
 * \file t_vb_arbprogram.c
  27  * Compile vertex programs to an intermediate representation.
  28  * Execute vertex programs over a buffer of vertices.
  29  * \author Keith Whitwell, Brian Paul
  30  */
  31
  32 #include "glheader.h"
  33 #include "context.h"
  34 #include "imports.h"
  35 #include "macros.h"
  36 #include "mtypes.h"
  37 #include "arbprogparse.h"
  38 #include "light.h"
  39 #include "program.h"
  40 #include "math/m_matrix.h"
  41 #include "math/m_translate.h"
  42 #include "t_context.h"
  43 #include "t_pipeline.h"
  44 #include "t_vb_arbprogram.h"
  45 #include "tnl.h"
  46 #include "program_instruction.h"
  47
  48
  49 #define DISASSEM 0
  50
  51
/**
 * Transient state used while translating a vertex program into the
 * internal instruction stream.
 */
struct compilation {
   GLuint reg_active;         /* bitmask: bit (1 << idx) is set once register idx has been picked as a destination */
   union instruction *csr;    /* cursor: next free slot in the instruction buffer being filled */
};
  56
  57
/* Fetch the arb_vp_machine kept in a pipeline stage's privatePtr. */
#define ARB_VP_MACHINE(stage) ((struct arb_vp_machine *)(stage->privatePtr))

/* Replicate element 0 of a 4-vector into elements 1..3.  The argument is
 * expanded several times, so it must be free of side effects.
 */
#define PUFF(x) ((x)[1] = (x)[2] = (x)[3] = (x)[0])
  61
  62
  63
  64 /* Lower precision functions for the EXP, LOG and LIT opcodes.  The
  65  * LOG2() implementation is probably not accurate enough, and the
  66  * attempted optimization for Exp2 is definitely not accurate
  67  * enough - it discards all of t's fractional bits!
  68  */
  69 static GLfloat RoughApproxLog2(GLfloat t)
  70 {
  71    return LOG2(t);
  72 }
  73
  74 static GLfloat RoughApproxExp2(GLfloat t)
  75 {
  76 #if 0
  77    fi_type fi;
  78    fi.i = (GLint) t;
  79    fi.i = (fi.i << 23) + 0x3f800000;
  80    return fi.f;
  81 #else
  82    return (GLfloat) _mesa_pow(2.0, t);
  83 #endif
  84 }
  85
  86 static GLfloat RoughApproxPower(GLfloat x, GLfloat y)
  87 {
  88    if (x == 0.0 && y == 0.0)
  89       return 1.0;  /* spec requires this */
  90    else
  91       return RoughApproxExp2(y * RoughApproxLog2(x));
  92 }
  93
  94
  95 /* Higher precision functions for the EX2, LG2 and POW opcodes:
  96  */
  97 static GLfloat ApproxLog2(GLfloat t)
  98 {
  99    return (GLfloat) (LOGF(t) * 1.442695F);
 100 }
 101
 102 static GLfloat ApproxExp2(GLfloat t)
 103 {
 104    return (GLfloat) _mesa_pow(2.0, t);
 105 }
 106
 107 static GLfloat ApproxPower(GLfloat x, GLfloat y)
 108 {
 109    return (GLfloat) _mesa_pow(x, y);
 110 }
 111
 112 static GLfloat rough_approx_log2_0_1(GLfloat x)
 113 {
 114    return LOG2(x);
 115 }
 116
 117
 118
 119
 120 /**
 121  * Perform a reduced swizzle:
 122  */
 123 static void do_RSW( struct arb_vp_machine *m, union instruction op )
 124 {
 125    GLfloat *result = m->File[0][op.rsw.dst];
 126    const GLfloat *arg0 = m->File[op.rsw.file0][op.rsw.idx0];
 127    GLuint swz = op.rsw.swz;
 128    GLuint neg = op.rsw.neg;
 129    GLfloat tmp[4];
 130
 131    /* Need a temporary to be correct in the case where result == arg0.
 132     */
 133    COPY_4V(tmp, arg0);
 134
 135    result[0] = tmp[GET_RSW(swz, 0)];
 136    result[1] = tmp[GET_RSW(swz, 1)];
 137    result[2] = tmp[GET_RSW(swz, 2)];
 138    result[3] = tmp[GET_RSW(swz, 3)];
 139
 140    if (neg) {
 141       if (neg & 0x1) result[0] = -result[0];
 142       if (neg & 0x2) result[1] = -result[1];
 143       if (neg & 0x4) result[2] = -result[2];
 144       if (neg & 0x8) result[3] = -result[3];
 145    }
 146 }
 147
 148 /* Used to implement write masking.  To make things easier for the sse
 149  * generator I've gone back to a 1 argument version of this function
 150  * (dst.msk = arg), rather than the semantically cleaner (dst = SEL
 151  * arg0, arg1, msk)
 152  *
 153  * That means this is the only instruction which doesn't write a full
 154  * 4 dwords out.  This would make such a program harder to analyse,
 155  * but it looks like analysis is going to take place on a higher level
 156  * anyway.
 157  */
 158 static void do_MSK( struct arb_vp_machine *m, union instruction op )
 159 {
 160    GLfloat *dst = m->File[0][op.msk.dst];
 161    const GLfloat *arg = m->File[op.msk.file][op.msk.idx];
 162
 163    if (op.msk.mask & 0x1) dst[0] = arg[0];
 164    if (op.msk.mask & 0x2) dst[1] = arg[1];
 165    if (op.msk.mask & 0x4) dst[2] = arg[2];
 166    if (op.msk.mask & 0x8) dst[3] = arg[3];
 167 }
 168
 169
 170 static void do_PRT( struct arb_vp_machine *m, union instruction op )
 171 {
 172    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 173
 174    _mesa_printf("%d: %f %f %f %f\n", m->vtx_nr,
 175                 arg0[0], arg0[1], arg0[2], arg0[3]);
 176 }
 177
 178
 179 /**
 180  * The traditional ALU and texturing instructions.  All operate on
 181  * internal registers and ignore write masks and swizzling issues.
 182  */
 183
 184 static void do_ABS( struct arb_vp_machine *m, union instruction op )
 185 {
 186    GLfloat *result = m->File[0][op.alu.dst];
 187    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 188
 189    result[0] = (arg0[0] < 0.0) ? -arg0[0] : arg0[0];
 190    result[1] = (arg0[1] < 0.0) ? -arg0[1] : arg0[1];
 191    result[2] = (arg0[2] < 0.0) ? -arg0[2] : arg0[2];
 192    result[3] = (arg0[3] < 0.0) ? -arg0[3] : arg0[3];
 193 }
 194
 195 static void do_ADD( struct arb_vp_machine *m, union instruction op )
 196 {
 197    GLfloat *result = m->File[0][op.alu.dst];
 198    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 199    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 200
 201    result[0] = arg0[0] + arg1[0];
 202    result[1] = arg0[1] + arg1[1];
 203    result[2] = arg0[2] + arg1[2];
 204    result[3] = arg0[3] + arg1[3];
 205 }
 206
 207
 208 static void do_DP3( struct arb_vp_machine *m, union instruction op )
 209 {
 210    GLfloat *result = m->File[0][op.alu.dst];
 211    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 212    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 213
 214    result[0] = (arg0[0] * arg1[0] +
 215                 arg0[1] * arg1[1] +
 216                 arg0[2] * arg1[2]);
 217
 218    PUFF(result);
 219 }
 220
 221
 222
 223 static void do_DP4( struct arb_vp_machine *m, union instruction op )
 224 {
 225    GLfloat *result = m->File[0][op.alu.dst];
 226    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 227    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 228
 229    result[0] = (arg0[0] * arg1[0] +
 230                 arg0[1] * arg1[1] +
 231                 arg0[2] * arg1[2] +
 232                 arg0[3] * arg1[3]);
 233
 234    PUFF(result);
 235 }
 236
 237 static void do_DPH( struct arb_vp_machine *m, union instruction op )
 238 {
 239    GLfloat *result = m->File[0][op.alu.dst];
 240    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 241    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 242
 243    result[0] = (arg0[0] * arg1[0] +
 244                 arg0[1] * arg1[1] +
 245                 arg0[2] * arg1[2] +
 246                 1.0     * arg1[3]);
 247
 248    PUFF(result);
 249 }
 250
 251 static void do_DST( struct arb_vp_machine *m, union instruction op )
 252 {
 253    GLfloat *result = m->File[0][op.alu.dst];
 254    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 255    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 256
 257    /* This should be ok even if result == arg0 or result == arg1.
 258     */
 259    result[0] = 1.0F;
 260    result[1] = arg0[1] * arg1[1];
 261    result[2] = arg0[2];
 262    result[3] = arg1[3];
 263 }
 264
 265
 266 /* Intended to be high precision:
 267  */
 268 static void do_EX2( struct arb_vp_machine *m, union instruction op )
 269 {
 270    GLfloat *result = m->File[0][op.alu.dst];
 271    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 272
 273    result[0] = (GLfloat)ApproxExp2(arg0[0]);
 274    PUFF(result);
 275 }
 276
 277
 278 /* Allowed to be lower precision:
 279  */
 280 static void do_EXP( struct arb_vp_machine *m, union instruction op )
 281 {
 282    GLfloat *result = m->File[0][op.alu.dst];
 283    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 284    GLfloat tmp = arg0[0];
 285    GLfloat flr_tmp = FLOORF(tmp);
 286    GLfloat frac_tmp = tmp - flr_tmp;
 287
 288    result[0] = LDEXPF(1.0, (int)flr_tmp);
 289    result[1] = frac_tmp;
 290    result[2] = LDEXPF(rough_approx_log2_0_1(frac_tmp), (int)flr_tmp);
 291    result[3] = 1.0F;
 292 }
 293
 294 static void do_FLR( struct arb_vp_machine *m, union instruction op )
 295 {
 296    GLfloat *result = m->File[0][op.alu.dst];
 297    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 298
 299    result[0] = FLOORF(arg0[0]);
 300    result[1] = FLOORF(arg0[1]);
 301    result[2] = FLOORF(arg0[2]);
 302    result[3] = FLOORF(arg0[3]);
 303 }
 304
 305 static void do_FRC( struct arb_vp_machine *m, union instruction op )
 306 {
 307    GLfloat *result = m->File[0][op.alu.dst];
 308    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 309
 310    result[0] = arg0[0] - FLOORF(arg0[0]);
 311    result[1] = arg0[1] - FLOORF(arg0[1]);
 312    result[2] = arg0[2] - FLOORF(arg0[2]);
 313    result[3] = arg0[3] - FLOORF(arg0[3]);
 314 }
 315
 316 /* High precision log base 2:
 317  */
 318 static void do_LG2( struct arb_vp_machine *m, union instruction op )
 319 {
 320    GLfloat *result = m->File[0][op.alu.dst];
 321    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 322
 323    result[0] = ApproxLog2(arg0[0]);
 324    PUFF(result);
 325 }
 326
 327
 328
 329 static void do_LIT( struct arb_vp_machine *m, union instruction op )
 330 {
 331    GLfloat *result = m->File[0][op.alu.dst];
 332    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 333    GLfloat tmp[4];
 334
 335    tmp[0] = 1.0;
 336    tmp[1] = arg0[0];
 337    if (arg0[0] > 0.0) {
 338       tmp[2] = RoughApproxPower(arg0[1], arg0[3]);
 339    }
 340    else {
 341       tmp[2] = 0.0;
 342    }
 343    tmp[3] = 1.0;
 344
 345
 346    COPY_4V(result, tmp);
 347 }
 348
 349
 350 /* Intended to allow a lower precision than required for LG2 above.
 351  */
 352 static void do_LOG( struct arb_vp_machine *m, union instruction op )
 353 {
 354    GLfloat *result = m->File[0][op.alu.dst];
 355    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 356    GLfloat tmp = FABSF(arg0[0]);
 357    int exponent;
 358    GLfloat mantissa = FREXPF(tmp, &exponent);
 359
 360    result[0] = (GLfloat) (exponent - 1);
 361    result[1] = 2.0 * mantissa; /* map [.5, 1) -> [1, 2) */
 362    result[2] = exponent + LOG2(mantissa);
 363    result[3] = 1.0;
 364 }
 365
 366 static void do_MAX( struct arb_vp_machine *m, union instruction op )
 367 {
 368    GLfloat *result = m->File[0][op.alu.dst];
 369    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 370    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 371
 372    result[0] = (arg0[0] > arg1[0]) ? arg0[0] : arg1[0];
 373    result[1] = (arg0[1] > arg1[1]) ? arg0[1] : arg1[1];
 374    result[2] = (arg0[2] > arg1[2]) ? arg0[2] : arg1[2];
 375    result[3] = (arg0[3] > arg1[3]) ? arg0[3] : arg1[3];
 376 }
 377
 378
 379 static void do_MIN( struct arb_vp_machine *m, union instruction op )
 380 {
 381    GLfloat *result = m->File[0][op.alu.dst];
 382    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 383    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 384
 385    result[0] = (arg0[0] < arg1[0]) ? arg0[0] : arg1[0];
 386    result[1] = (arg0[1] < arg1[1]) ? arg0[1] : arg1[1];
 387    result[2] = (arg0[2] < arg1[2]) ? arg0[2] : arg1[2];
 388    result[3] = (arg0[3] < arg1[3]) ? arg0[3] : arg1[3];
 389 }
 390
 391 static void do_MOV( struct arb_vp_machine *m, union instruction op )
 392 {
 393    GLfloat *result = m->File[0][op.alu.dst];
 394    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 395
 396    result[0] = arg0[0];
 397    result[1] = arg0[1];
 398    result[2] = arg0[2];
 399    result[3] = arg0[3];
 400 }
 401
 402 static void do_MUL( struct arb_vp_machine *m, union instruction op )
 403 {
 404    GLfloat *result = m->File[0][op.alu.dst];
 405    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 406    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 407
 408    result[0] = arg0[0] * arg1[0];
 409    result[1] = arg0[1] * arg1[1];
 410    result[2] = arg0[2] * arg1[2];
 411    result[3] = arg0[3] * arg1[3];
 412 }
 413
 414
 415 /* Intended to be "high" precision
 416  */
 417 static void do_POW( struct arb_vp_machine *m, union instruction op )
 418 {
 419    GLfloat *result = m->File[0][op.alu.dst];
 420    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 421    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 422
 423    result[0] = (GLfloat)ApproxPower(arg0[0], arg1[0]);
 424    PUFF(result);
 425 }
 426
 427 static void do_REL( struct arb_vp_machine *m, union instruction op )
 428 {
 429    GLfloat *result = m->File[0][op.alu.dst];
 430    GLuint idx = (op.alu.idx0 + (GLint)m->File[0][REG_ADDR][0]) & (MAX_NV_VERTEX_PROGRAM_PARAMS-1);
 431    const GLfloat *arg0 = m->File[op.alu.file0][idx];
 432
 433    result[0] = arg0[0];
 434    result[1] = arg0[1];
 435    result[2] = arg0[2];
 436    result[3] = arg0[3];
 437 }
 438
 439 static void do_RCP( struct arb_vp_machine *m, union instruction op )
 440 {
 441    GLfloat *result = m->File[0][op.alu.dst];
 442    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 443
 444    result[0] = 1.0F / arg0[0];
 445    PUFF(result);
 446 }
 447
 448 static void do_RSQ( struct arb_vp_machine *m, union instruction op )
 449 {
 450    GLfloat *result = m->File[0][op.alu.dst];
 451    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 452
 453    result[0] = INV_SQRTF(FABSF(arg0[0]));
 454    PUFF(result);
 455 }
 456
 457
 458 static void do_SGE( struct arb_vp_machine *m, union instruction op )
 459 {
 460    GLfloat *result = m->File[0][op.alu.dst];
 461    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 462    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 463
 464    result[0] = (arg0[0] >= arg1[0]) ? 1.0F : 0.0F;
 465    result[1] = (arg0[1] >= arg1[1]) ? 1.0F : 0.0F;
 466    result[2] = (arg0[2] >= arg1[2]) ? 1.0F : 0.0F;
 467    result[3] = (arg0[3] >= arg1[3]) ? 1.0F : 0.0F;
 468 }
 469
 470
 471 static void do_SLT( struct arb_vp_machine *m, union instruction op )
 472 {
 473    GLfloat *result = m->File[0][op.alu.dst];
 474    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 475    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 476
 477    result[0] = (arg0[0] < arg1[0]) ? 1.0F : 0.0F;
 478    result[1] = (arg0[1] < arg1[1]) ? 1.0F : 0.0F;
 479    result[2] = (arg0[2] < arg1[2]) ? 1.0F : 0.0F;
 480    result[3] = (arg0[3] < arg1[3]) ? 1.0F : 0.0F;
 481 }
 482
 483 static void do_SUB( struct arb_vp_machine *m, union instruction op )
 484 {
 485    GLfloat *result = m->File[0][op.alu.dst];
 486    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 487    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 488
 489    result[0] = arg0[0] - arg1[0];
 490    result[1] = arg0[1] - arg1[1];
 491    result[2] = arg0[2] - arg1[2];
 492    result[3] = arg0[3] - arg1[3];
 493 }
 494
 495
 496 static void do_XPD( struct arb_vp_machine *m, union instruction op )
 497 {
 498    GLfloat *result = m->File[0][op.alu.dst];
 499    const GLfloat *arg0 = m->File[op.alu.file0][op.alu.idx0];
 500    const GLfloat *arg1 = m->File[op.alu.file1][op.alu.idx1];
 501    GLfloat tmp[3];
 502
 503    tmp[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1];
 504    tmp[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2];
 505    tmp[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0];
 506
 507    /* Need a temporary to be correct in the case where result == arg0
 508     * or result == arg1.
 509     */
 510    result[0] = tmp[0];
 511    result[1] = tmp[1];
 512    result[2] = tmp[2];
 513 }
 514
 515 static void do_NOP( struct arb_vp_machine *m, union instruction op )
 516 {
 517 }
 518
 519 /* Some useful debugging functions:
 520  */
 521 static void print_mask( GLuint mask )
 522 {
 523    _mesa_printf(".");
 524    if (mask&0x1) _mesa_printf("x");
 525    if (mask&0x2) _mesa_printf("y");
 526    if (mask&0x4) _mesa_printf("z");
 527    if (mask&0x8) _mesa_printf("w");
 528 }
 529
 530 static void print_reg( GLuint file, GLuint reg )
 531 {
 532    static const char *reg_file[] = {
 533       "REG",
 534       "LOCAL_PARAM",
 535       "ENV_PARAM",
 536       "STATE_VAR",
 537    };
 538
 539    if (file == 0) {
 540       if (reg == REG_RES)
 541          _mesa_printf("RES");
 542       else if (reg >= REG_ARG0 && reg <= REG_ARG1)
 543          _mesa_printf("ARG%d", reg - REG_ARG0);
 544       else if (reg >= REG_TMP0 && reg <= REG_TMP11)
 545          _mesa_printf("TMP%d", reg - REG_TMP0);
 546       else if (reg >= REG_IN0 && reg <= REG_IN31)
 547          _mesa_printf("IN%d", reg - REG_IN0);
 548       else if (reg >= REG_OUT0 && reg <= REG_OUT14)
 549          _mesa_printf("OUT%d", reg - REG_OUT0);
 550       else if (reg == REG_ADDR)
 551          _mesa_printf("ADDR");
 552       else if (reg == REG_ID)
 553          _mesa_printf("ID");
 554       else
 555          _mesa_printf("REG%d", reg);
 556    }
 557    else
 558       _mesa_printf("%s:%d", reg_file[file], reg);
 559 }
 560
 561
 562 static void print_RSW( union instruction op )
 563 {
 564    GLuint swz = op.rsw.swz;
 565    GLuint neg = op.rsw.neg;
 566    GLuint i;
 567
 568    _mesa_printf("RSW ");
 569    print_reg(0, op.rsw.dst);
 570    _mesa_printf(", ");
 571    print_reg(op.rsw.file0, op.rsw.idx0);
 572    _mesa_printf(".");
 573    for (i = 0; i < 4; i++, swz >>= 2) {
 574       const char *cswz = "xyzw";
 575       if (neg & (1<<i))
 576          _mesa_printf("-");
 577       _mesa_printf("%c", cswz[swz&0x3]);
 578    }
 579    _mesa_printf("\n");
 580 }
 581
 582
 583 static void print_ALU( union instruction op )
 584 {
 585    _mesa_printf("%s ", _mesa_opcode_string((enum prog_opcode) op.alu.opcode));
 586    print_reg(0, op.alu.dst);
 587    _mesa_printf(", ");
 588    print_reg(op.alu.file0, op.alu.idx0);
 589    if (_mesa_num_inst_src_regs((enum prog_opcode) op.alu.opcode) > 1) {
 590       _mesa_printf(", ");
 591       print_reg(op.alu.file1, op.alu.idx1);
 592    }
 593    _mesa_printf("\n");
 594 }
 595
 596 static void print_MSK( union instruction op )
 597 {
 598    _mesa_printf("MSK ");
 599    print_reg(0, op.msk.dst);
 600    print_mask(op.msk.mask);
 601    _mesa_printf(", ");
 602    print_reg(op.msk.file, op.msk.idx);
 603    _mesa_printf("\n");
 604 }
 605
 606 static void print_NOP( union instruction op )
 607 {
 608 }
 609
 610 void
 611 _tnl_disassem_vba_insn( union instruction op )
 612 {
 613    switch (op.alu.opcode) {
 614    case OPCODE_ABS:
 615    case OPCODE_ADD:
 616    case OPCODE_DP3:
 617    case OPCODE_DP4:
 618    case OPCODE_DPH:
 619    case OPCODE_DST:
 620    case OPCODE_EX2:
 621    case OPCODE_EXP:
 622    case OPCODE_FLR:
 623    case OPCODE_FRC:
 624    case OPCODE_LG2:
 625    case OPCODE_LIT:
 626    case OPCODE_LOG:
 627    case OPCODE_MAX:
 628    case OPCODE_MIN:
 629    case OPCODE_MOV:
 630    case OPCODE_MUL:
 631    case OPCODE_POW:
 632    case OPCODE_PRINT:
 633    case OPCODE_RCP:
 634    case OPCODE_RSQ:
 635    case OPCODE_SGE:
 636    case OPCODE_SLT:
 637    case OPCODE_SUB:
 638    case OPCODE_XPD:
 639       print_ALU(op);
 640       break;
 641    case OPCODE_ARA:
 642    case OPCODE_ARL:
 643    case OPCODE_ARL_NV:
 644    case OPCODE_ARR:
 645    case OPCODE_BRA:
 646    case OPCODE_CAL:
 647    case OPCODE_END:
 648    case OPCODE_MAD:
 649    case OPCODE_POPA:
 650    case OPCODE_PUSHA:
 651    case OPCODE_RCC:
 652    case OPCODE_RET:
 653    case OPCODE_SSG:
 654    case OPCODE_SWZ:
 655       print_NOP(op);
 656       break;
 657    case RSW:
 658       print_RSW(op);
 659       break;
 660    case MSK:
 661       print_MSK(op);
 662       break;
 663    case REL:
 664       print_ALU(op);
 665       break;
 666    default:
 667       _mesa_problem(NULL, "Bad opcode in _tnl_disassem_vba_insn()");
 668    }
 669 }
 670
 671
/* Table of execution routines, one entry per opcode plus three extra
 * entries (RSW, MSK, REL) appended after the regular ones.
 *
 * NOTE(review): the entries are positional, so this list presumably has
 * to stay in the same order as enum prog_opcode -- confirm against
 * program_instruction.h before adding or reordering anything.  Opcodes
 * without an implementation here map to do_NOP.
 */
static void (* const opcode_func[MAX_OPCODE+3])(struct arb_vp_machine *, union instruction) =
{
   do_ABS,
   do_ADD,
   do_NOP,/*ARA*/
   do_NOP,/*ARL*/
   do_NOP,/*ARL_NV*/
   do_NOP,/*ARR*/
   do_NOP,/*BRA*/
   do_NOP,/*CAL*/
   do_NOP,/*CMP*/
   do_NOP,/*COS*/
   do_NOP,/*DDX*/
   do_NOP,/*DDY*/
   do_DP3,
   do_DP4,
   do_DPH,
   do_DST,
   do_NOP, /* unlabelled in the original; position suggests END -- TODO confirm */
   do_EX2,
   do_EXP,
   do_FLR,
   do_FRC,
   do_NOP,/*KIL*/
   do_NOP,/*KIL_NV*/
   do_LG2,
   do_LIT,
   do_LOG,
   do_NOP,/*LRP*/
   do_NOP,/*MAD*/
   do_MAX,
   do_MIN,
   do_MOV,
   do_MUL,
   do_NOP,/*PK2H*/
   do_NOP,/*PK2US*/
   do_NOP,/*PK4B*/
   do_NOP,/*PK4UB*/
   do_POW,
   do_NOP,/*POPA*/
   do_PRT,
   do_NOP,/*PUSHA*/
   do_NOP,/*RCC*/
   do_RCP,/*RCP*/
   do_NOP,/*RET*/
   do_NOP,/*RFL*/
   do_RSQ,
   do_NOP,/*SCS*/
   do_NOP,/*SEQ*/
   do_NOP,/*SFL*/
   do_SGE,
   do_NOP,/*SGT*/
   do_NOP,/*SIN*/
   do_NOP,/*SLE*/
   do_SLT,
   do_NOP,/*SNE*/
   do_NOP,/*SSG*/
   do_NOP,/*STR*/
   do_SUB,
   do_RSW,/*SWZ*/
   do_NOP,/*TEX*/
   do_NOP,/*TXB*/
   do_NOP,/*TXD*/
   do_NOP,/*TXL*/
   do_NOP,/*TXP*/
   do_NOP,/*TXP_NV*/
   do_NOP,/*UP2H*/
   do_NOP,/*UP2US*/
   do_NOP,/*UP4B*/
   do_NOP,/*UP4UB*/
   do_NOP,/*X2D*/
   do_XPD,
   do_RSW, /* first of the three extra internal opcodes */
   do_MSK,
   do_REL,
};
 748
 749 static union instruction *cvp_next_instruction( struct compilation *cp )
 750 {
 751    union instruction *op = cp->csr++;
 752    _mesa_bzero(op, sizeof(*op));
 753    return op;
 754 }
 755
 756 static struct reg cvp_make_reg( GLuint file, GLuint idx )
 757 {
 758    struct reg reg;
 759    reg.file = file;
 760    reg.idx = idx;
 761    return reg;
 762 }
 763
 764 static struct reg cvp_emit_rel( struct compilation *cp,
 765                                 struct reg reg,
 766                                 struct reg tmpreg )
 767 {
 768    union instruction *op = cvp_next_instruction(cp);
 769    op->alu.opcode = REL;
 770    op->alu.file0 = reg.file;
 771    op->alu.idx0 = reg.idx;
 772    op->alu.dst = tmpreg.idx;
 773    return tmpreg;
 774 }
 775
 776
 777 static struct reg cvp_load_reg( struct compilation *cp,
 778                                 GLuint file,
 779                                 GLuint index,
 780                                 GLuint rel,
 781                                 GLuint tmpidx )
 782 {
 783    struct reg tmpreg = cvp_make_reg(FILE_REG, tmpidx);
 784    struct reg reg;
 785
 786    switch (file) {
 787    case PROGRAM_TEMPORARY:
 788       return cvp_make_reg(FILE_REG, REG_TMP0 + index);
 789
 790    case PROGRAM_INPUT:
 791       return cvp_make_reg(FILE_REG, REG_IN0 + index);
 792
 793    case PROGRAM_OUTPUT:
 794       return cvp_make_reg(FILE_REG, REG_OUT0 + index);
 795
 796       /* These two aren't populated by the parser?
 797        */
 798    case PROGRAM_LOCAL_PARAM:
 799       reg = cvp_make_reg(FILE_LOCAL_PARAM, index);
 800       if (rel)
 801          return cvp_emit_rel(cp, reg, tmpreg);
 802       else
 803          return reg;
 804
 805    case PROGRAM_ENV_PARAM:
 806       reg = cvp_make_reg(FILE_ENV_PARAM, index);
 807       if (rel)
 808          return cvp_emit_rel(cp, reg, tmpreg);
 809       else
 810          return reg;
 811
 812    case PROGRAM_STATE_VAR:
 813       reg = cvp_make_reg(FILE_STATE_PARAM, index);
 814       if (rel)
 815          return cvp_emit_rel(cp, reg, tmpreg);
 816       else
 817          return reg;
 818
 819       /* Invalid values:
 820        */
 821    case PROGRAM_WRITE_ONLY:
 822    case PROGRAM_ADDRESS:
 823    default:
 824       _mesa_problem(NULL, "Invalid register file %d in cvp_load_reg()");
 825       assert(0);
 826       return tmpreg;            /* can't happen */
 827    }
 828 }
 829
 830 static struct reg cvp_emit_arg( struct compilation *cp,
 831                                 const struct prog_src_register *src,
 832                                 GLuint arg )
 833 {
 834    struct reg reg = cvp_load_reg( cp, src->File, src->Index, src->RelAddr, arg );
 835    union instruction rsw, noop;
 836
 837    /* Emit any necessary swizzling.
 838     */
 839    _mesa_bzero(&rsw, sizeof(rsw));
 840    rsw.rsw.neg = src->NegateBase ? WRITEMASK_XYZW : 0;
 841
 842    /* we're expecting 2-bit swizzles below... */
 843 #if 1 /* XXX THESE ASSERTIONS CURRENTLY FAIL DURING GLEAN TESTS! */
 844    ASSERT(GET_SWZ(src->Swizzle, 0) < 4);
 845    ASSERT(GET_SWZ(src->Swizzle, 1) < 4);
 846    ASSERT(GET_SWZ(src->Swizzle, 2) < 4);
 847    ASSERT(GET_SWZ(src->Swizzle, 3) < 4);
 848 #endif
 849    rsw.rsw.swz = ((GET_SWZ(src->Swizzle, 0) << 0) |
 850                   (GET_SWZ(src->Swizzle, 1) << 2) |
 851                   (GET_SWZ(src->Swizzle, 2) << 4) |
 852                   (GET_SWZ(src->Swizzle, 3) << 6));
 853
 854    _mesa_bzero(&noop, sizeof(noop));
 855    noop.rsw.neg = 0;
 856    noop.rsw.swz = RSW_NOOP;
 857
 858    if (_mesa_memcmp(&rsw, &noop, sizeof(rsw)) !=0) {
 859       union instruction *op = cvp_next_instruction(cp);
 860       struct reg rsw_reg = cvp_make_reg(FILE_REG, REG_ARG0 + arg);
 861       *op = rsw;
 862       op->rsw.opcode = RSW;
 863       op->rsw.file0 = reg.file;
 864       op->rsw.idx0 = reg.idx;
 865       op->rsw.dst = rsw_reg.idx;
 866       return rsw_reg;
 867    }
 868    else
 869       return reg;
 870 }
 871
 872 static GLuint cvp_choose_result( struct compilation *cp,
 873                                  const struct prog_dst_register *dst,
 874                                  union instruction *fixup )
 875 {
 876    GLuint mask = dst->WriteMask;
 877    GLuint idx;
 878
 879    switch (dst->File) {
 880    case PROGRAM_TEMPORARY:
 881       idx = REG_TMP0 + dst->Index;
 882       break;
 883    case PROGRAM_OUTPUT:
 884       idx = REG_OUT0 + dst->Index;
 885       break;
 886    default:
 887       assert(0);
 888       return REG_RES;           /* can't happen */
 889    }
 890
 891    /* Optimization: When writing (with a writemask) to an undefined
 892     * value for the first time, the writemask may be ignored.
 893     */
 894    if (mask != WRITEMASK_XYZW && (cp->reg_active & (1 << idx))) {
 895       fixup->msk.opcode = MSK;
 896       fixup->msk.dst = idx;
 897       fixup->msk.file = FILE_REG;
 898       fixup->msk.idx = REG_RES;
 899       fixup->msk.mask = mask;
 900       cp->reg_active |= 1 << idx;
 901       return REG_RES;
 902    }
 903    else {
 904       _mesa_bzero(fixup, sizeof(*fixup));
 905       cp->reg_active |= 1 << idx;
 906       return idx;
 907    }
 908 }
 909
 910 static struct reg cvp_emit_rsw( struct compilation *cp,
 911                                 GLuint dst,
 912                                 struct reg src,
 913                                 GLuint neg,
 914                                 GLuint swz,
 915                                 GLboolean force)
 916 {
 917    struct reg retval;
 918
 919    if (swz != RSW_NOOP || neg != 0) {
 920       union instruction *op = cvp_next_instruction(cp);
 921       op->rsw.opcode = RSW;
 922       op->rsw.dst = dst;
 923       op->rsw.file0 = src.file;
 924       op->rsw.idx0 = src.idx;
 925       op->rsw.neg = neg;
 926       op->rsw.swz = swz;
 927
 928       retval.file = FILE_REG;
 929       retval.idx = dst;
 930       return retval;
 931    }
 932    else if (force) {
 933       /* Oops.  Degenerate case:
 934        */
 935       union instruction *op = cvp_next_instruction(cp);
 936       op->alu.opcode = OPCODE_MOV;
 937       op->alu.dst = dst;
 938       op->alu.file0 = src.file;
 939       op->alu.idx0 = src.idx;
 940
 941       retval.file = FILE_REG;
 942       retval.idx = dst;
 943       return retval;
 944    }
 945    else {
 946       return src;
 947    }
 948 }
 949
 950
 951 static void cvp_emit_inst( struct compilation *cp,
 952                            const struct prog_instruction *inst )
 953 {
 954    union instruction *op;
 955    union instruction fixup;
 956    struct reg reg[3];
 957    GLuint result, nr_args, i;
 958
 959    /* Need to handle SWZ, ARL specially.
 960     */
 961    switch (inst->Opcode) {
 962       /* Split into mul and add:
 963        */
 964    case OPCODE_MAD:
 965       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
 966       for (i = 0; i < 3; i++)
 967          reg[i] = cvp_emit_arg( cp, &inst->SrcReg[i], REG_ARG0+i );
 968
 969       op = cvp_next_instruction(cp);
 970       op->alu.opcode = OPCODE_MUL;
 971       op->alu.file0 = reg[0].file;
 972       op->alu.idx0 = reg[0].idx;
 973       op->alu.file1 = reg[1].file;
 974       op->alu.idx1 = reg[1].idx;
 975       op->alu.dst = REG_ARG0;
 976
 977       op = cvp_next_instruction(cp);
 978       op->alu.opcode = OPCODE_ADD;
 979       op->alu.file0 = FILE_REG;
 980       op->alu.idx0 = REG_ARG0;
 981       op->alu.file1 = reg[2].file;
 982       op->alu.idx1 = reg[2].idx;
 983       op->alu.dst = result;
 984
 985       if (result == REG_RES) {
 986          op = cvp_next_instruction(cp);
 987          *op = fixup;
 988       }
 989       break;
 990
 991    case OPCODE_ARL:
 992       reg[0] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 );
 993
 994       op = cvp_next_instruction(cp);
 995       op->alu.opcode = OPCODE_FLR;
 996       op->alu.dst = REG_ADDR;
 997       op->alu.file0 = reg[0].file;
 998       op->alu.idx0 = reg[0].idx;
 999       break;
1000
1001    case OPCODE_SWZ: {
1002       GLuint swz0 = 0, swz1 = 0;
1003       GLuint neg0 = 0, neg1 = 0;
1004       GLuint mask = 0;
1005
1006       /* Translate 3-bit-per-element swizzle into two 2-bit swizzles,
1007        * one from the source register the other from a constant
1008        * {0,0,0,1}.
1009        */
1010       for (i = 0; i < 4; i++) {
1011          GLuint swzelt = GET_SWZ(inst->SrcReg[0].Swizzle, i);
1012          if (swzelt >= SWIZZLE_ZERO) {
1013             neg0 |= inst->SrcReg[0].NegateBase & (1<<i);
1014             if (swzelt == SWIZZLE_ONE)
1015                swz0 |= SWIZZLE_W << (i*2);
1016             else if (i < SWIZZLE_W)
1017                swz0 |= i << (i*2);
1018          }
1019          else {
1020             mask |= 1<<i;
1021             neg1 |= inst->SrcReg[0].NegateBase & (1<<i);
1022             swz1 |= swzelt << (i*2);
1023          }
1024       }
1025
1026       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
1027       reg[0].file = FILE_REG;
1028       reg[0].idx = REG_ID;
1029       reg[1] = cvp_emit_arg( cp, &inst->SrcReg[0], REG_ARG0 );
1030
1031       if (mask == WRITEMASK_XYZW) {
1032          cvp_emit_rsw(cp, result, reg[0], neg0, swz0, GL_TRUE);
1033
1034       }
1035       else if (mask == 0) {
1036          cvp_emit_rsw(cp, result, reg[1], neg1, swz1, GL_TRUE);
1037       }
1038       else {
1039          cvp_emit_rsw(cp, result, reg[0], neg0, swz0, GL_TRUE);
1040          reg[1] = cvp_emit_rsw(cp, REG_ARG0, reg[1], neg1, swz1, GL_FALSE);
1041
1042          op = cvp_next_instruction(cp);
1043          op->msk.opcode = MSK;
1044          op->msk.dst = result;
1045          op->msk.file = reg[1].file;
1046          op->msk.idx = reg[1].idx;
1047          op->msk.mask = mask;
1048       }
1049
1050       if (result == REG_RES) {
1051          op = cvp_next_instruction(cp);
1052          *op = fixup;
1053       }
1054       break;
1055    }
1056
1057    case OPCODE_END:
1058       break;
1059
1060    default:
1061       result = cvp_choose_result( cp, &inst->DstReg, &fixup );
1062       nr_args = _mesa_num_inst_src_regs(inst->Opcode);
1063       for (i = 0; i < nr_args; i++)
1064          reg[i] = cvp_emit_arg( cp, &inst->SrcReg[i], REG_ARG0 + i );
1065
1066       op = cvp_next_instruction(cp);
1067       op->alu.opcode = inst->Opcode;
1068       op->alu.file0 = reg[0].file;
1069       op->alu.idx0 = reg[0].idx;
1070       op->alu.file1 = reg[1].file;
1071       op->alu.idx1 = reg[1].idx;
1072       op->alu.dst = result;
1073
1074       if (result == REG_RES) {
1075          op = cvp_next_instruction(cp);
1076          *op = fixup;
1077       }
1078       break;
1079    }
1080 }
1081
1082 static void free_tnl_data( struct vertex_program *program  )
1083 {
1084    struct tnl_compiled_program *p = (struct tnl_compiled_program *) program->TnlData;
1085    if (p->compiled_func)
1086       _mesa_free((void *)p->compiled_func);
1087    _mesa_free(p);
1088    program->TnlData = NULL;
1089 }
1090
1091 static void compile_vertex_program( struct vertex_program *program,
1092                                     GLboolean try_codegen )
1093 {
1094    struct compilation cp;
1095    struct tnl_compiled_program *p = CALLOC_STRUCT(tnl_compiled_program);
1096    GLuint i;
1097
1098    if (program->TnlData)
1099       free_tnl_data( program );
1100
1101    program->TnlData = p;
1102
1103    /* Initialize cp.  Note that ctx and VB aren't used in compilation
1104     * so we don't have to worry about statechanges:
1105     */
1106    _mesa_memset(&cp, 0, sizeof(cp));
1107    cp.csr = p->instructions;
1108
1109    /* Compile instructions:
1110     */
1111    for (i = 0; i < program->Base.NumInstructions; i++) {
1112       cvp_emit_inst(&cp, &program->Base.Instructions[i]);
1113    }
1114
1115    /* Finish up:
1116     */
1117    p->nr_instructions = cp.csr - p->instructions;
1118
1119    /* Print/disassemble:
1120     */
1121    if (DISASSEM) {
1122       for (i = 0; i < p->nr_instructions; i++) {
1123          _tnl_disassem_vba_insn(p->instructions[i]);
1124       }
1125       _mesa_printf("\n\n");
1126    }
1127
1128 #ifdef USE_SSE_ASM
1129    if (try_codegen)
1130       _tnl_sse_codegen_vertex_program(p);
1131 #endif
1132
1133 }
1134
1135
1136
1137
1138 /* ----------------------------------------------------------------------
1139  * Execution
1140  */
1141 static void userclip( GLcontext *ctx,
1142                       GLvector4f *clip,
1143                       GLubyte *clipmask,
1144                       GLubyte *clipormask,
1145                       GLubyte *clipandmask )
1146 {
1147    GLuint p;
1148
1149    for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
1150       if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
1151          GLuint nr, i;
1152          const GLfloat a = ctx->Transform._ClipUserPlane[p][0];
1153          const GLfloat b = ctx->Transform._ClipUserPlane[p][1];
1154          const GLfloat c = ctx->Transform._ClipUserPlane[p][2];
1155          const GLfloat d = ctx->Transform._ClipUserPlane[p][3];
1156          GLfloat *coord = (GLfloat *)clip->data;
1157          GLuint stride = clip->stride;
1158          GLuint count = clip->count;
1159
1160          for (nr = 0, i = 0 ; i < count ; i++) {
1161             GLfloat dp = (coord[0] * a +
1162                           coord[1] * b +
1163                           coord[2] * c +
1164                           coord[3] * d);
1165
1166             if (dp < 0) {
1167                nr++;
1168                clipmask[i] |= CLIP_USER_BIT;
1169             }
1170
1171             STRIDE_F(coord, stride);
1172          }
1173
1174          if (nr > 0) {
1175             *clipormask |= CLIP_USER_BIT;
1176             if (nr == count) {
1177                *clipandmask |= CLIP_USER_BIT;
1178                return;
1179             }
1180          }
1181       }
1182    }
1183 }
1184
1185
1186 static GLboolean
1187 do_ndc_cliptest(GLcontext *ctx, struct arb_vp_machine *m)
1188 {
1189    TNLcontext *tnl = TNL_CONTEXT(ctx);
1190    struct vertex_buffer *VB = m->VB;
1191
1192    /* Cliptest and perspective divide.  Clip functions must clear
1193     * the clipmask.
1194     */
1195    m->ormask = 0;
1196    m->andmask = CLIP_ALL_BITS;
1197
1198    if (tnl->NeedNdcCoords) {
1199       VB->NdcPtr =
1200          _mesa_clip_tab[VB->ClipPtr->size]( VB->ClipPtr,
1201                                             &m->ndcCoords,
1202                                             m->clipmask,
1203                                             &m->ormask,
1204                                             &m->andmask );
1205    }
1206    else {
1207       VB->NdcPtr = NULL;
1208       _mesa_clip_np_tab[VB->ClipPtr->size]( VB->ClipPtr,
1209                                             NULL,
1210                                             m->clipmask,
1211                                             &m->ormask,
1212                                             &m->andmask );
1213    }
1214
1215    if (m->andmask) {
1216       /* All vertices are outside the frustum */
1217       return GL_FALSE;
1218    }
1219
1220    /* Test userclip planes.  This contributes to VB->ClipMask.
1221     */
1222    if (ctx->Transform.ClipPlanesEnabled && !ctx->VertexProgram._Enabled) {
1223       userclip( ctx,
1224                 VB->ClipPtr,
1225                 m->clipmask,
1226                 &m->ormask,
1227                 &m->andmask );
1228
1229       if (m->andmask) {
1230          return GL_FALSE;
1231       }
1232    }
1233
1234    VB->ClipAndMask = m->andmask;
1235    VB->ClipOrMask = m->ormask;
1236    VB->ClipMask = m->clipmask;
1237
1238    return GL_TRUE;
1239 }
1240
1241
1242 static INLINE void call_func( struct tnl_compiled_program *p,
1243                               struct arb_vp_machine *m )
1244 {
1245    p->compiled_func(m);
1246 }
1247
1248 /**
1249  * Execute the given vertex program.
1250  *
1251  * TODO: Integrate the t_vertex.c code here, to build machine vertices
1252  * directly at this point.
1253  *
1254  * TODO: Eliminate the VB struct entirely and just use
1255  * struct arb_vertex_machine.
1256  */
1257 static GLboolean
1258 run_arb_vertex_program(GLcontext *ctx, struct tnl_pipeline_stage *stage)
1259 {
1260    struct vertex_program *program;
1261    struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
1262    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
1263    struct tnl_compiled_program *p;
1264    GLuint i, j;
1265    GLbitfield outputs;
1266
1267    if (ctx->ShaderObjects.CurrentProgram != NULL)
1268       return GL_TRUE;
1269
1270    program = (ctx->VertexProgram._Enabled ? ctx->VertexProgram.Current : ctx->_TnlProgram);
1271    if (!program || program->IsNVProgram)
1272       return GL_TRUE;
1273
1274    if (program->Base.Parameters) {
1275       _mesa_load_state_parameters(ctx, program->Base.Parameters);
1276    }
1277
1278    p = (struct tnl_compiled_program *)program->TnlData;
1279    assert(p);
1280
1281
1282    m->nr_inputs = m->nr_outputs = 0;
1283
1284    for (i = 0; i < _TNL_ATTRIB_MAX; i++) {
1285       if (program->Base.InputsRead & (1<<i)) {
1286          GLuint j = m->nr_inputs++;
1287          m->input[j].idx = i;
1288          m->input[j].data = (GLfloat *)m->VB->AttribPtr[i]->data;
1289          m->input[j].stride = m->VB->AttribPtr[i]->stride;
1290          m->input[j].size = m->VB->AttribPtr[i]->size;
1291          ASSIGN_4V(m->File[0][REG_IN0 + i], 0, 0, 0, 1);
1292       }
1293    }
1294
1295    for (i = 0; i < VERT_RESULT_MAX; i++) {
1296       if (program->Base.OutputsWritten & (1 << i)) {
1297          GLuint j = m->nr_outputs++;
1298          m->output[j].idx = i;
1299          m->output[j].data = (GLfloat *)m->attribs[i].data;
1300       }
1301    }
1302
1303
1304    /* Run the actual program:
1305     */
1306    for (m->vtx_nr = 0; m->vtx_nr < VB->Count; m->vtx_nr++) {
1307       for (j = 0; j < m->nr_inputs; j++) {
1308          GLuint idx = REG_IN0 + m->input[j].idx;
1309          switch (m->input[j].size) {
1310          case 4: m->File[0][idx][3] = m->input[j].data[3];
1311          case 3: m->File[0][idx][2] = m->input[j].data[2];
1312          case 2: m->File[0][idx][1] = m->input[j].data[1];
1313          case 1: m->File[0][idx][0] = m->input[j].data[0];
1314          }
1315
1316          STRIDE_F(m->input[j].data, m->input[j].stride);
1317       }
1318
1319       if (p->compiled_func) {
1320          call_func( p, m );
1321       }
1322       else {
1323          for (j = 0; j < p->nr_instructions; j++) {
1324             union instruction inst = p->instructions[j];
1325             opcode_func[inst.alu.opcode]( m, inst );
1326          }
1327       }
1328
1329       for (j = 0; j < m->nr_outputs; j++) {
1330          GLuint idx = REG_OUT0 + m->output[j].idx;
1331          m->output[j].data[0] = m->File[0][idx][0];
1332          m->output[j].data[1] = m->File[0][idx][1];
1333          m->output[j].data[2] = m->File[0][idx][2];
1334          m->output[j].data[3] = m->File[0][idx][3];
1335          m->output[j].data += 4;
1336       }
1337    }
1338
1339    /* Setup the VB pointers so that the next pipeline stages get
1340     * their data from the right place (the program output arrays).
1341     *
1342     * TODO: 1) Have tnl use these RESULT values for outputs rather
1343     * than trying to shoe-horn inputs and outputs into one set of
1344     * values.
1345     *
1346     * TODO: 2) Integrate t_vertex.c so that we just go straight ahead
1347     * and build machine vertices here.
1348     */
1349    VB->ClipPtr = &m->attribs[VERT_RESULT_HPOS];
1350    VB->ClipPtr->count = VB->Count;
1351
1352    outputs = program->Base.OutputsWritten;
1353
1354    if (outputs & (1<<VERT_RESULT_COL0)) {
1355       VB->ColorPtr[0] = &m->attribs[VERT_RESULT_COL0];
1356       VB->AttribPtr[VERT_ATTRIB_COLOR0] = VB->ColorPtr[0];
1357    }
1358
1359    if (outputs & (1<<VERT_RESULT_BFC0)) {
1360       VB->ColorPtr[1] = &m->attribs[VERT_RESULT_BFC0];
1361    }
1362
1363    if (outputs & (1<<VERT_RESULT_COL1)) {
1364       VB->SecondaryColorPtr[0] = &m->attribs[VERT_RESULT_COL1];
1365       VB->AttribPtr[VERT_ATTRIB_COLOR1] = VB->SecondaryColorPtr[0];
1366    }
1367
1368    if (outputs & (1<<VERT_RESULT_BFC1)) {
1369       VB->SecondaryColorPtr[1] = &m->attribs[VERT_RESULT_BFC1];
1370    }
1371
1372    if (outputs & (1<<VERT_RESULT_FOGC)) {
1373       VB->FogCoordPtr = &m->attribs[VERT_RESULT_FOGC];
1374       VB->AttribPtr[VERT_ATTRIB_FOG] = VB->FogCoordPtr;
1375    }
1376
1377    if (outputs & (1<<VERT_RESULT_PSIZ)) {
1378       VB->PointSizePtr = &m->attribs[VERT_RESULT_PSIZ];
1379       VB->AttribPtr[_TNL_ATTRIB_POINTSIZE] = &m->attribs[VERT_RESULT_PSIZ];
1380    }
1381
1382    for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
1383       if (outputs & (1<<(VERT_RESULT_TEX0+i))) {
1384          VB->TexCoordPtr[i] = &m->attribs[VERT_RESULT_TEX0 + i];
1385          VB->AttribPtr[VERT_ATTRIB_TEX0+i] = VB->TexCoordPtr[i];
1386       }
1387    }
1388
1389 #if 0
1390    for (i = 0; i < VB->Count; i++) {
1391       printf("Out %d: %f %f %f %f %f %f %f %f\n", i,
1392              VEC_ELT(VB->ClipPtr, GLfloat, i)[0],
1393              VEC_ELT(VB->ClipPtr, GLfloat, i)[1],
1394              VEC_ELT(VB->ClipPtr, GLfloat, i)[2],
1395              VEC_ELT(VB->ClipPtr, GLfloat, i)[3],
1396              VEC_ELT(VB->TexCoordPtr[0], GLfloat, i)[0],
1397              VEC_ELT(VB->TexCoordPtr[0], GLfloat, i)[1],
1398              VEC_ELT(VB->TexCoordPtr[0], GLfloat, i)[2],
1399              VEC_ELT(VB->TexCoordPtr[0], GLfloat, i)[3]);
1400    }
1401 #endif
1402
1403    /* Perform NDC and cliptest operations:
1404     */
1405    return do_ndc_cliptest(ctx, m);
1406 }
1407
1408
1409 static void
1410 validate_vertex_program( GLcontext *ctx, struct tnl_pipeline_stage *stage )
1411 {
1412    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
1413    struct vertex_program *program;
1414
1415    if (ctx->ShaderObjects.CurrentProgram != NULL)
1416       return;
1417
1418    program = (ctx->VertexProgram._Enabled ? ctx->VertexProgram.Current : 0);
1419    if (!program && ctx->_MaintainTnlProgram) {
1420       program = ctx->_TnlProgram;
1421    }
1422
1423    if (program) {
1424       if (!program->TnlData)
1425          compile_vertex_program( program, m->try_codegen );
1426
1427       /* Grab the state GL state and put into registers:
1428        */
1429       m->File[FILE_LOCAL_PARAM] = program->Base.LocalParams;
1430       m->File[FILE_ENV_PARAM] = ctx->VertexProgram.Parameters;
1431       /* GL_NV_vertex_programs can't reference GL state */
1432       if (program->Base.Parameters)
1433          m->File[FILE_STATE_PARAM] = program->Base.Parameters->ParameterValues;
1434       else
1435          m->File[FILE_STATE_PARAM] = NULL;
1436    }
1437 }
1438
1439
1440
1441
1442
1443
1444
1445 /**
1446  * Called the first time stage->run is called.  In effect, don't
1447  * allocate data until the first time the stage is run.
1448  */
1449 static GLboolean init_vertex_program( GLcontext *ctx,
1450                                       struct tnl_pipeline_stage *stage )
1451 {
1452    TNLcontext *tnl = TNL_CONTEXT(ctx);
1453    struct vertex_buffer *VB = &(tnl->vb);
1454    struct arb_vp_machine *m;
1455    const GLuint size = VB->Size;
1456    GLuint i;
1457
1458    stage->privatePtr = _mesa_calloc(sizeof(*m));
1459    m = ARB_VP_MACHINE(stage);
1460    if (!m)
1461       return GL_FALSE;
1462
1463    /* arb_vertex_machine struct should subsume the VB:
1464     */
1465    m->VB = VB;
1466
1467    m->File[0] = (GLfloat(*)[4])ALIGN_MALLOC(REG_MAX * sizeof(GLfloat) * 4, 16);
1468
1469    /* Initialize regs where necessary:
1470     */
1471    ASSIGN_4V(m->File[0][REG_ID], 0, 0, 0, 1);
1472    ASSIGN_4V(m->File[0][REG_ONES], 1, 1, 1, 1);
1473    ASSIGN_4V(m->File[0][REG_SWZ], -1, 1, 0, 0);
1474    ASSIGN_4V(m->File[0][REG_NEG], -1, -1, -1, -1);
1475    ASSIGN_4V(m->File[0][REG_LIT], 1, 0, 0, 1);
1476    ASSIGN_4V(m->File[0][REG_LIT2], 1, .5, .2, 1); /* debug value */
1477
1478    if (_mesa_getenv("MESA_EXPERIMENTAL"))
1479       m->try_codegen = GL_TRUE;
1480
1481    /* Allocate arrays of vertex output values */
1482    for (i = 0; i < VERT_RESULT_MAX; i++) {
1483       _mesa_vector4f_alloc( &m->attribs[i], 0, size, 32 );
1484       m->attribs[i].size = 4;
1485    }
1486
1487    /* a few other misc allocations */
1488    _mesa_vector4f_alloc( &m->ndcCoords, 0, size, 32 );
1489    m->clipmask = (GLubyte *) ALIGN_MALLOC(sizeof(GLubyte)*size, 32 );
1490
1491    if (ctx->_MaintainTnlProgram)
1492       _mesa_allow_light_in_model( ctx, GL_FALSE );
1493
1494    m->fpucntl_rnd_neg = RND_NEG_FPU; /* const value */
1495    m->fpucntl_restore = RESTORE_FPU; /* const value */
1496
1497    return GL_TRUE;
1498 }
1499
1500
1501
1502
1503 /**
1504  * Destructor for this pipeline stage.
1505  */
1506 static void dtr( struct tnl_pipeline_stage *stage )
1507 {
1508    struct arb_vp_machine *m = ARB_VP_MACHINE(stage);
1509
1510    if (m) {
1511       GLuint i;
1512
1513       /* free the vertex program result arrays */
1514       for (i = 0; i < VERT_RESULT_MAX; i++)
1515          _mesa_vector4f_free( &m->attribs[i] );
1516
1517       /* free misc arrays */
1518       _mesa_vector4f_free( &m->ndcCoords );
1519       ALIGN_FREE( m->clipmask );
1520       ALIGN_FREE( m->File[0] );
1521
1522       _mesa_free( m );
1523       stage->privatePtr = NULL;
1524    }
1525 }
1526
1527 /**
1528  * Public description of this pipeline stage.
1529  */
1530 const struct tnl_pipeline_stage _tnl_arb_vertex_program_stage =
1531 {
1532    "vertex-program",
1533    NULL,                        /* private_data */
1534    init_vertex_program,         /* create */
1535    dtr,                         /* destroy */
1536    validate_vertex_program,     /* validate */
1537    run_arb_vertex_program       /* run */
1538 };
1539
1540
1541 /**
1542  * Called via ctx->Driver.ProgramStringNotify() after a new vertex program
1543  * string has been parsed.
1544  */
1545 void
1546 _tnl_program_string(GLcontext *ctx, GLenum target, struct program *program)
1547 {
1548    if (program->Target == GL_VERTEX_PROGRAM_ARB) {
1549       /* free any existing tnl data hanging off the program */
1550       struct vertex_program *vprog = (struct vertex_program *) program;
1551       if (vprog->TnlData) {
1552          free_tnl_data(vprog);
1553       }
1554    }
1555 }