src/mesa/drivers/dri/r300/r300_fragprog.c

   1 /*
   2  * Copyright (C) 2005 Ben Skeggs.
   3  *
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining
   7  * a copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sublicense, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial
  16  * portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  */
  27
  28 /*
  29  * Authors:
  30  *   Ben Skeggs <darktama@iinet.net.au>
  31  *   Jerome Glisse <j.glisse@gmail.com>
  32  */
  33
  34 /*TODO'S
  35  *
  36  * - Depth write, WPOS/FOGC inputs
  37  * - FogOption
  38  * - Verify results of opcodes for accuracy, I've only checked them
  39  *   in specific cases.
  40  * - and more...
  41  */
  42
  43 #include "glheader.h"
  44 #include "macros.h"
  45 #include "enums.h"
  46 #include "shader/prog_instruction.h"
  47 #include "shader/prog_parameter.h"
  48 #include "shader/prog_print.h"
  49
  50 #include "r300_context.h"
  51 #include "r300_fragprog.h"
  52 #include "r300_reg.h"
  53 #include "r300_state.h"
  54
  55 /*
  56  * Usefull macros and values
  57  */
  58 #define ERROR(fmt, args...) do {                        \
  59                 fprintf(stderr, "%s::%s(): " fmt "\n",  \
  60                         __FILE__, __func__, ##args);    \
  61                 rp->error = GL_TRUE;                    \
  62         } while(0)
  63
  64 #define PFS_INVAL 0xFFFFFFFF
  65 #define COMPILE_STATE struct r300_pfs_compile_state *cs = rp->cs
  66
  67 #define SWIZZLE_XYZ             0
  68 #define SWIZZLE_XXX             1
  69 #define SWIZZLE_YYY             2
  70 #define SWIZZLE_ZZZ             3
  71 #define SWIZZLE_WWW             4
  72 #define SWIZZLE_YZX             5
  73 #define SWIZZLE_ZXY             6
  74 #define SWIZZLE_WZY             7
  75 #define SWIZZLE_111             8
  76 #define SWIZZLE_000             9
  77 #define SWIZZLE_HHH             10
  78
  79 #define swizzle(r, x, y, z, w) do_swizzle(rp, r,                \
  80                                           ((SWIZZLE_##x<<0)|    \
  81                                            (SWIZZLE_##y<<3)|    \
  82                                            (SWIZZLE_##z<<6)|    \
  83                                            (SWIZZLE_##w<<9)),   \
  84                                           0)
  85
  86 #define REG_TYPE_INPUT          0
  87 #define REG_TYPE_OUTPUT         1
  88 #define REG_TYPE_TEMP           2
  89 #define REG_TYPE_CONST          3
  90
  91 #define REG_TYPE_SHIFT          0
  92 #define REG_INDEX_SHIFT         2
  93 #define REG_VSWZ_SHIFT          8
  94 #define REG_SSWZ_SHIFT          13
  95 #define REG_NEGV_SHIFT          18
  96 #define REG_NEGS_SHIFT          19
  97 #define REG_ABS_SHIFT           20
  98 #define REG_NO_USE_SHIFT        21 // Hack for refcounting
  99 #define REG_VALID_SHIFT         22 // Does the register contain a defined value?
 100 #define REG_BUILTIN_SHIFT   23 // Is it a builtin (like all zero/all one)?
 101
 102 #define REG_TYPE_MASK           (0x03 << REG_TYPE_SHIFT)
 103 #define REG_INDEX_MASK          (0x3F << REG_INDEX_SHIFT)
 104 #define REG_VSWZ_MASK           (0x1F << REG_VSWZ_SHIFT)
 105 #define REG_SSWZ_MASK           (0x1F << REG_SSWZ_SHIFT)
 106 #define REG_NEGV_MASK           (0x01 << REG_NEGV_SHIFT)
 107 #define REG_NEGS_MASK           (0x01 << REG_NEGS_SHIFT)
 108 #define REG_ABS_MASK            (0x01 << REG_ABS_SHIFT)
 109 #define REG_NO_USE_MASK         (0x01 << REG_NO_USE_SHIFT)
 110 #define REG_VALID_MASK          (0x01 << REG_VALID_SHIFT)
 111 #define REG_BUILTIN_MASK        (0x01 << REG_BUILTIN_SHIFT)
 112
 113 #define REG(type, index, vswz, sswz, nouse, valid, builtin)     \
 114         (((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) |                   \
 115          ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) |                \
 116          ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |              \
 117          ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) |                \
 118          ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |  \
 119          ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |                   \
 120          ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
 121 #define REG_GET_TYPE(reg)                                               \
 122         ((reg & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
 123 #define REG_GET_INDEX(reg)                                              \
 124         ((reg & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
 125 #define REG_GET_VSWZ(reg)                                               \
 126         ((reg & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
 127 #define REG_GET_SSWZ(reg)                                               \
 128         ((reg & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
 129 #define REG_GET_NO_USE(reg)                                             \
 130         ((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
 131 #define REG_GET_VALID(reg)                                              \
 132         ((reg & REG_VALID_MASK) >> REG_VALID_SHIFT)
 133 #define REG_GET_BUILTIN(reg)                                            \
 134         ((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
 135 #define REG_SET_TYPE(reg, type)                                         \
 136         reg = ((reg & ~REG_TYPE_MASK) |                                 \
 137                ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK))
 138 #define REG_SET_INDEX(reg, index)                                       \
 139         reg = ((reg & ~REG_INDEX_MASK) |                                \
 140                ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK))
 141 #define REG_SET_VSWZ(reg, vswz)                                         \
 142         reg = ((reg & ~REG_VSWZ_MASK) |                                 \
 143                ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK))
 144 #define REG_SET_SSWZ(reg, sswz)                                         \
 145         reg = ((reg & ~REG_SSWZ_MASK) |                                 \
 146                ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
 147 #define REG_SET_NO_USE(reg, nouse)                                      \
 148         reg = ((reg & ~REG_NO_USE_MASK) |                               \
 149                ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK))
 150 #define REG_SET_VALID(reg, valid)                                       \
 151         reg = ((reg & ~REG_VALID_MASK) |                                \
 152                ((valid << REG_VALID_SHIFT) & REG_VALID_MASK))
 153 #define REG_SET_BUILTIN(reg, builtin)                                   \
 154         reg = ((reg & ~REG_BUILTIN_MASK) |                              \
 155                ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
 156 #define REG_ABS(reg)                                                    \
 157         reg = (reg | REG_ABS_MASK)
 158 #define REG_NEGV(reg)                                                   \
 159         reg = (reg | REG_NEGV_MASK)
 160 #define REG_NEGS(reg)                                                   \
 161         reg = (reg | REG_NEGS_MASK)
 162
 163
 164 /*
 165  * Datas structures for fragment program generation
 166  */
 167
 168 /* description of r300 native hw instructions */
 169 static const struct {
 170         const char *name;
 171         int argc;
 172         int v_op;
 173         int s_op;
 174 } r300_fpop[] = {
 175         { "MAD", 3, R300_FPI0_OUTC_MAD, R300_FPI2_OUTA_MAD },
 176         { "DP3", 2, R300_FPI0_OUTC_DP3, R300_FPI2_OUTA_DP4 },
 177         { "DP4", 2, R300_FPI0_OUTC_DP4, R300_FPI2_OUTA_DP4 },
 178         { "MIN", 2, R300_FPI0_OUTC_MIN, R300_FPI2_OUTA_MIN },
 179         { "MAX", 2, R300_FPI0_OUTC_MAX, R300_FPI2_OUTA_MAX },
 180         { "CMP", 3, R300_FPI0_OUTC_CMP, R300_FPI2_OUTA_CMP },
 181         { "FRC", 1, R300_FPI0_OUTC_FRC, R300_FPI2_OUTA_FRC },
 182         { "EX2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_EX2 },
 183         { "LG2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_LG2 },
 184         { "RCP", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RCP },
 185         { "RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RSQ },
 186         { "REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA, PFS_INVAL },
 187         { "CMPH", 3, R300_FPI0_OUTC_CMPH, PFS_INVAL },
 188 };
 189
 190
 191 /* vector swizzles r300 can support natively, with a couple of
 192  * cases we handle specially
 193  *
 194  * REG_VSWZ/REG_SSWZ is an index into this table
 195  */
 196
 197 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
 198 #define SWIZZLE_HALF 6
 199
 200 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
 201                                           SWIZZLE_##y, \
 202                                           SWIZZLE_##z, \
 203                                           SWIZZLE_ZERO))
 204 static const struct r300_pfs_swizzle {
 205         GLuint hash;    /* swizzle value this matches */
 206         GLuint base;    /* base value for hw swizzle */
 207         GLuint stride;  /* difference in base between arg0/1/2 */
 208         GLuint flags;
 209 } v_swiz[] = {
 210 /* native swizzles */
 211         { MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR },
 212         { MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR },
 213         { MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR },
 214         { MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR },
 215         { MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A,     1, SLOT_SRC_SCALAR },
 216         { MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR },
 217         { MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR },
 218         { MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH },
 219         { MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0},
 220         { MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0},
 221         { MAKE_SWZ3(HALF, HALF, HALF), R300_FPI0_ARGC_HALF, 0, 0},
 222         { PFS_INVAL, 0, 0, 0},
 223 };
 224
 225 /* used during matching of non-native swizzles */
 226 #define SWZ_X_MASK (7 << 0)
 227 #define SWZ_Y_MASK (7 << 3)
 228 #define SWZ_Z_MASK (7 << 6)
 229 #define SWZ_W_MASK (7 << 9)
 230 static const struct {
 231         GLuint hash;            /* used to mask matching swizzle components */
 232         int mask;               /* actual outmask */
 233         int count;              /* count of components matched */
 234 } s_mask[] = {
 235         { SWZ_X_MASK|SWZ_Y_MASK|SWZ_Z_MASK, 1|2|4, 3},
 236         { SWZ_X_MASK|SWZ_Y_MASK, 1|2, 2},
 237         { SWZ_X_MASK|SWZ_Z_MASK, 1|4, 2},
 238         { SWZ_Y_MASK|SWZ_Z_MASK, 2|4, 2},
 239         { SWZ_X_MASK, 1, 1},
 240         { SWZ_Y_MASK, 2, 1},
 241         { SWZ_Z_MASK, 4, 1},
 242         { PFS_INVAL, PFS_INVAL, PFS_INVAL}
 243 };
 244
 245 static const struct {
 246         int base;       /* hw value of swizzle */
 247         int stride;     /* difference between SRC0/1/2 */
 248         GLuint flags;
 249 } s_swiz[] = {
 250         { R300_FPI2_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR },
 251         { R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR },
 252         { R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR },
 253         { R300_FPI2_ARGA_SRC0A  , 1, SLOT_SRC_SCALAR },
 254         { R300_FPI2_ARGA_ZERO   , 0, 0 },
 255         { R300_FPI2_ARGA_ONE    , 0, 0 },
 256         { R300_FPI2_ARGA_HALF   , 0, 0 }
 257 };
 258
 259 /* boiler-plate reg, for convenience */
 260 static const GLuint undef = REG(REG_TYPE_TEMP,
 261                                 0,
 262                                 SWIZZLE_XYZ,
 263                                 SWIZZLE_W,
 264                                 GL_FALSE,
 265                                 GL_FALSE,
 266                                 GL_FALSE);
 267
 268 /* constant one source */
 269 static const GLuint pfs_one = REG(REG_TYPE_CONST,
 270                                   0,
 271                                   SWIZZLE_111,
 272                                   SWIZZLE_ONE,
 273                                   GL_FALSE,
 274                                   GL_TRUE,
 275                                   GL_TRUE);
 276
 277 /* constant half source */
 278 static const GLuint pfs_half = REG(REG_TYPE_CONST,
 279                                    0,
 280                                    SWIZZLE_HHH,
 281                                    SWIZZLE_HALF,
 282                                    GL_FALSE,
 283                                    GL_TRUE,
 284                                    GL_TRUE);
 285
 286 /* constant zero source */
 287 static const GLuint pfs_zero = REG(REG_TYPE_CONST,
 288                                    0,
 289                                    SWIZZLE_000,
 290                                    SWIZZLE_ZERO,
 291                                    GL_FALSE,
 292                                    GL_TRUE,
 293                                    GL_TRUE);
 294
 295 /*
 296  * Common functions prototypes
 297  */
 298 static void dump_program(struct r300_fragment_program *rp);
 299 static void emit_arith(struct r300_fragment_program *rp, int op,
 300                                 GLuint dest, int mask,
 301                                 GLuint src0, GLuint src1, GLuint src2,
 302                                 int flags);
 303
 304 /**
 305  * Get an R300 temporary that can be written to in the given slot.
 306  */
 307 static int get_hw_temp(struct r300_fragment_program *rp, int slot)
 308 {
 309         COMPILE_STATE;
 310         int r;
 311
 312         for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
 313                 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
 314                         break;
 315         }
 316
 317         if (r >= PFS_NUM_TEMP_REGS) {
 318                 ERROR("Out of hardware temps\n");
 319                 return 0;
 320         }
 321
 322         // Reserved is used to avoid the following scenario:
 323         //  R300 temporary X is first assigned to Mesa temporary Y during vector ops
 324         //  R300 temporary X is then assigned to Mesa temporary Z for further vector ops
 325         //  Then scalar ops on Mesa temporary Z are emitted and move back in time
 326         //  to overwrite the value of temporary Y.
 327         // End scenario.
 328         cs->hwtemps[r].reserved = cs->hwtemps[r].free;
 329         cs->hwtemps[r].free = -1;
 330
 331         // Reset to some value that won't mess things up when the user
 332         // tries to read from a temporary that hasn't been assigned a value yet.
 333         // In the normal case, vector_valid and scalar_valid should be set to
 334         // a sane value by the first emit that writes to this temporary.
 335         cs->hwtemps[r].vector_valid = 0;
 336         cs->hwtemps[r].scalar_valid = 0;
 337
 338         if (r > rp->max_temp_idx)
 339                 rp->max_temp_idx = r;
 340
 341         return r;
 342 }
 343
 344 /**
 345  * Get an R300 temporary that will act as a TEX destination register.
 346  */
 347 static int get_hw_temp_tex(struct r300_fragment_program *rp)
 348 {
 349         COMPILE_STATE;
 350         int r;
 351
 352         for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
 353                 if (cs->used_in_node & (1 << r))
 354                         continue;
 355
 356                 // Note: Be very careful here
 357                 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
 358                         break;
 359         }
 360
 361         if (r >= PFS_NUM_TEMP_REGS)
 362                 return get_hw_temp(rp, 0); /* Will cause an indirection */
 363
 364         cs->hwtemps[r].reserved = cs->hwtemps[r].free;
 365         cs->hwtemps[r].free = -1;
 366
 367         // Reset to some value that won't mess things up when the user
 368         // tries to read from a temporary that hasn't been assigned a value yet.
 369         // In the normal case, vector_valid and scalar_valid should be set to
 370         // a sane value by the first emit that writes to this temporary.
 371         cs->hwtemps[r].vector_valid = cs->nrslots;
 372         cs->hwtemps[r].scalar_valid = cs->nrslots;
 373
 374         if (r > rp->max_temp_idx)
 375                 rp->max_temp_idx = r;
 376
 377         return r;
 378 }
 379
 380 /**
 381  * Mark the given hardware register as free.
 382  */
 383 static void free_hw_temp(struct r300_fragment_program *rp, int idx)
 384 {
 385         COMPILE_STATE;
 386
 387         // Be very careful here. Consider sequences like
 388         //  MAD r0, r1,r2,r3
 389         //  TEX r4, ...
 390         // The TEX instruction may be moved in front of the MAD instruction
 391         // due to the way nodes work. We don't want to alias r1 and r4 in
 392         // this case.
 393         // I'm certain the register allocation could be further sanitized,
 394         // but it's tricky because of stuff that can happen inside emit_tex
 395         // and emit_arith.
 396         cs->hwtemps[idx].free = cs->nrslots+1;
 397 }
 398
 399
 400 /**
 401  * Create a new Mesa temporary register.
 402  */
 403 static GLuint get_temp_reg(struct r300_fragment_program *rp)
 404 {
 405         COMPILE_STATE;
 406         GLuint r = undef;
 407         GLuint index;
 408
 409         index = ffs(~cs->temp_in_use);
 410         if (!index) {
 411                 ERROR("Out of program temps\n");
 412                 return r;
 413         }
 414
 415         cs->temp_in_use |= (1 << --index);
 416         cs->temps[index].refcount = 0xFFFFFFFF;
 417         cs->temps[index].reg = -1;
 418
 419         REG_SET_TYPE(r, REG_TYPE_TEMP);
 420         REG_SET_INDEX(r, index);
 421         REG_SET_VALID(r, GL_TRUE);
 422         return r;
 423 }
 424
 425 /**
 426  * Create a new Mesa temporary register that will act as the destination
 427  * register for a texture read.
 428  */
 429 static GLuint get_temp_reg_tex(struct r300_fragment_program *rp)
 430 {
 431         COMPILE_STATE;
 432         GLuint r = undef;
 433         GLuint index;
 434
 435         index = ffs(~cs->temp_in_use);
 436         if (!index) {
 437                 ERROR("Out of program temps\n");
 438                 return r;
 439         }
 440
 441         cs->temp_in_use |= (1 << --index);
 442         cs->temps[index].refcount = 0xFFFFFFFF;
 443         cs->temps[index].reg = get_hw_temp_tex(rp);
 444
 445         REG_SET_TYPE(r, REG_TYPE_TEMP);
 446         REG_SET_INDEX(r, index);
 447         REG_SET_VALID(r, GL_TRUE);
 448         return r;
 449 }
 450
 451 /**
 452  * Free a Mesa temporary and the associated R300 temporary.
 453  */
 454 static void free_temp(struct r300_fragment_program *rp, GLuint r)
 455 {
 456         COMPILE_STATE;
 457         GLuint index = REG_GET_INDEX(r);
 458
 459         if (!(cs->temp_in_use & (1 << index)))
 460                 return;
 461
 462         if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
 463                 free_hw_temp(rp, cs->temps[index].reg);
 464                 cs->temps[index].reg = -1;
 465                 cs->temp_in_use &= ~(1 << index);
 466         } else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
 467                 free_hw_temp(rp, cs->inputs[index].reg);
 468                 cs->inputs[index].reg = -1;
 469         }
 470 }
 471
 472 /**
 473  * Emit a hardware constant/parameter.
 474  *
 475  * \p cp Stable pointer to an array of 4 floats.
 476  *  The pointer must be stable in the sense that it remains to be valid
 477  *  and hold the contents of the constant/parameter throughout the lifetime
 478  *  of the fragment program (actually, up until the next time the fragment
 479  *  program is translated).
 480  */
 481 static GLuint emit_const4fv(struct r300_fragment_program *rp, const GLfloat* cp)
 482 {
 483         GLuint reg = undef;
 484         int index;
 485
 486         for(index = 0; index < rp->const_nr; ++index) {
 487                 if (rp->constant[index] == cp)
 488                         break;
 489         }
 490
 491         if (index >= rp->const_nr) {
 492                 if (index >= PFS_NUM_CONST_REGS) {
 493                         ERROR("Out of hw constants!\n");
 494                         return reg;
 495                 }
 496
 497                 rp->const_nr++;
 498                 rp->constant[index] = cp;
 499         }
 500
 501         REG_SET_TYPE(reg, REG_TYPE_CONST);
 502         REG_SET_INDEX(reg, index);
 503         REG_SET_VALID(reg, GL_TRUE);
 504         return reg;
 505 }
 506
 507 static inline GLuint negate(GLuint r)
 508 {
 509         REG_NEGS(r);
 510         REG_NEGV(r);
 511         return r;
 512 }
 513
 514 /* Hack, to prevent clobbering sources used multiple times when
 515  * emulating non-native instructions
 516  */
 517 static inline GLuint keep(GLuint r)
 518 {
 519         REG_SET_NO_USE(r, GL_TRUE);
 520         return r;
 521 }
 522
 523 static inline GLuint absolute(GLuint r)
 524 {
 525         REG_ABS(r);
 526         return r;
 527 }
 528
 529 static int swz_native(struct r300_fragment_program *rp,
 530                       GLuint src,
 531                       GLuint *r,
 532                       GLuint arbneg)
 533 {
 534         /* Native swizzle, handle negation */
 535         src = (src & ~REG_NEGS_MASK) |
 536                 (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
 537
 538         if ((arbneg & 0x7) == 0x0) {
 539                 src = src & ~REG_NEGV_MASK;
 540                 *r = src;
 541         } else if ((arbneg & 0x7) == 0x7) {
 542                 src |= REG_NEGV_MASK;
 543                 *r = src;
 544         } else {
 545                 if (!REG_GET_VALID(*r))
 546                         *r = get_temp_reg(rp);
 547                 src |= REG_NEGV_MASK;
 548                 emit_arith(rp,
 549                            PFS_OP_MAD,
 550                            *r,
 551                            arbneg & 0x7,
 552                            keep(src),
 553                            pfs_one,
 554                            pfs_zero,
 555                            0);
 556                 src = src & ~REG_NEGV_MASK;
 557                 emit_arith(rp,
 558                            PFS_OP_MAD,
 559                            *r,
 560                            (arbneg ^ 0x7) | WRITEMASK_W,
 561                            src,
 562                            pfs_one,
 563                            pfs_zero,
 564                            0);
 565         }
 566
 567         return 3;
 568 }
 569
 570 static int swz_emit_partial(struct r300_fragment_program *rp,
 571                             GLuint src,
 572                             GLuint *r,
 573                             int mask,
 574                             int mc,
 575                             GLuint arbneg)
 576 {
 577         GLuint tmp;
 578         GLuint wmask = 0;
 579
 580         if (!REG_GET_VALID(*r))
 581                 *r = get_temp_reg(rp);
 582
 583         /* A partial match, VSWZ/mask define what parts of the
 584          * desired swizzle we match
 585          */
 586         if (mc + s_mask[mask].count == 3) {
 587                 wmask = WRITEMASK_W;
 588                 src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
 589         }
 590
 591         tmp = arbneg & s_mask[mask].mask;
 592         if (tmp) {
 593                 tmp = tmp ^ s_mask[mask].mask;
 594                 if (tmp) {
 595                         emit_arith(rp,
 596                                    PFS_OP_MAD,
 597                                    *r,
 598                                    arbneg & s_mask[mask].mask,
 599                                    keep(src) | REG_NEGV_MASK,
 600                                    pfs_one,
 601                                    pfs_zero,
 602                                    0);
 603                         if (!wmask) {
 604                                 REG_SET_NO_USE(src, GL_TRUE);
 605                         } else {
 606                                 REG_SET_NO_USE(src, GL_FALSE);
 607                         }
 608                         emit_arith(rp,
 609                                    PFS_OP_MAD,
 610                                    *r,
 611                                    tmp | wmask,
 612                                    src,
 613                                    pfs_one,
 614                                    pfs_zero,
 615                                    0);
 616                 } else {
 617                         if (!wmask) {
 618                                 REG_SET_NO_USE(src, GL_TRUE);
 619                         } else {
 620                                 REG_SET_NO_USE(src, GL_FALSE);
 621                         }
 622                         emit_arith(rp,
 623                                    PFS_OP_MAD,
 624                                    *r,
 625                                    (arbneg & s_mask[mask].mask) | wmask,
 626                                    src | REG_NEGV_MASK,
 627                                    pfs_one,
 628                                    pfs_zero,
 629                                    0);
 630                 }
 631         } else {
 632                 if (!wmask) {
 633                         REG_SET_NO_USE(src, GL_TRUE);
 634                 } else {
 635                         REG_SET_NO_USE(src, GL_FALSE);
 636                 }
 637                 emit_arith(rp, PFS_OP_MAD,
 638                            *r,
 639                            s_mask[mask].mask | wmask,
 640                            src,
 641                            pfs_one,
 642                            pfs_zero,
 643                            0);
 644         }
 645
 646         return s_mask[mask].count;
 647 }
 648
 649 static GLuint do_swizzle(struct r300_fragment_program *rp,
 650                          GLuint src,
 651                          GLuint arbswz,
 652                          GLuint arbneg)
 653 {
 654         GLuint r = undef;
 655         GLuint vswz;
 656         int c_mask = 0;
 657         int v_match = 0;
 658
 659         /* If swizzling from something without an XYZW native swizzle,
 660          * emit result to a temp, and do new swizzle from the temp.
 661          */
 662 #if 0
 663         if (REG_GET_VSWZ(src) != SWIZZLE_XYZ ||
 664             REG_GET_SSWZ(src) != SWIZZLE_W) {
 665                 GLuint temp = get_temp_reg(rp);
 666                 emit_arith(rp,
 667                            PFS_OP_MAD,
 668                            temp,
 669                            WRITEMASK_XYZW,
 670                            src,
 671                            pfs_one,
 672                            pfs_zero,
 673                            0);
 674                 src = temp;
 675         }
 676 #endif
 677
 678         if (REG_GET_VSWZ(src) != SWIZZLE_XYZ ||
 679             REG_GET_SSWZ(src) != SWIZZLE_W) {
 680             GLuint vsrcswz = (v_swiz[REG_GET_VSWZ(src)].hash & (SWZ_X_MASK|SWZ_Y_MASK|SWZ_Z_MASK)) | REG_GET_SSWZ(src) << 9;
 681             GLint i;
 682
 683             GLuint newswz = 0;
 684             GLuint offset;
 685             for(i=0; i < 4; ++i){
 686                 offset = GET_SWZ(arbswz, i);
 687
 688                 newswz |= (offset <= 3)?GET_SWZ(vsrcswz, offset) << i*3:offset << i*3;
 689             }
 690
 691             arbswz = newswz & (SWZ_X_MASK|SWZ_Y_MASK|SWZ_Z_MASK);
 692             REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
 693         }
 694         else
 695         {
 696             /* set scalar swizzling */
 697             REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
 698
 699         }
 700         do {
 701                 vswz = REG_GET_VSWZ(src);
 702                 do {
 703                         int chash;
 704
 705                         REG_SET_VSWZ(src, vswz);
 706                         chash = v_swiz[REG_GET_VSWZ(src)].hash &
 707                                 s_mask[c_mask].hash;
 708
 709                         if (chash == (arbswz & s_mask[c_mask].hash)) {
 710                                 if (s_mask[c_mask].count == 3) {
 711                                         v_match += swz_native(rp,
 712                                                                 src,
 713                                                                 &r,
 714                                                                 arbneg);
 715                                 } else {
 716                                         v_match += swz_emit_partial(rp,
 717                                                                     src,
 718                                                                     &r,
 719                                                                     c_mask,
 720                                                                     v_match,
 721                                                                     arbneg);
 722                                 }
 723
 724                                 if (v_match == 3)
 725                                         return r;
 726
 727                                 /* Fill with something invalid.. all 0's was
 728                                  * wrong before, matched SWIZZLE_X.  So all
 729                                  * 1's will be okay for now
 730                                  */
 731                                 arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
 732                         }
 733                 } while(v_swiz[++vswz].hash != PFS_INVAL);
 734                 REG_SET_VSWZ(src, SWIZZLE_XYZ);
 735         } while (s_mask[++c_mask].hash != PFS_INVAL);
 736
 737         ERROR("should NEVER get here\n");
 738         return r;
 739 }
 740
 741 static GLuint t_src(struct r300_fragment_program *rp,
 742                     struct prog_src_register fpsrc)
 743 {
 744         GLuint r = undef;
 745
 746         switch (fpsrc.File) {
 747         case PROGRAM_TEMPORARY:
 748                 REG_SET_INDEX(r, fpsrc.Index);
 749                 REG_SET_VALID(r, GL_TRUE);
 750                 REG_SET_TYPE(r, REG_TYPE_TEMP);
 751                 break;
 752         case PROGRAM_INPUT:
 753                 REG_SET_INDEX(r, fpsrc.Index);
 754                 REG_SET_VALID(r, GL_TRUE);
 755                 REG_SET_TYPE(r, REG_TYPE_INPUT);
 756                 break;
 757         case PROGRAM_LOCAL_PARAM:
 758                 r = emit_const4fv(rp,
 759                                   rp->mesa_program.Base.LocalParams[fpsrc.Index]);
 760                 break;
 761         case PROGRAM_ENV_PARAM:
 762                 r = emit_const4fv(rp,
 763                                   rp->ctx->FragmentProgram.Parameters[fpsrc.Index]);
 764                 break;
 765         case PROGRAM_STATE_VAR:
 766         case PROGRAM_NAMED_PARAM:
 767                 r = emit_const4fv(rp,
 768                                   rp->mesa_program.Base.Parameters->ParameterValues[fpsrc.Index]);
 769                 break;
 770         default:
 771                 ERROR("unknown SrcReg->File %x\n", fpsrc.File);
 772                 return r;
 773         }
 774
 775         /* no point swizzling ONE/ZERO/HALF constants... */
 776         if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
 777                 r = do_swizzle(rp, r, fpsrc.Swizzle, fpsrc.NegateBase);
 778         return r;
 779 }
 780
 781 static GLuint t_scalar_src(struct r300_fragment_program *rp,
 782                            struct prog_src_register fpsrc)
 783 {
 784         struct prog_src_register src = fpsrc;
 785         int sc = GET_SWZ(fpsrc.Swizzle, 0); /* X */
 786
 787         src.Swizzle = ((sc<<0)|(sc<<3)|(sc<<6)|(sc<<9));
 788
 789         return t_src(rp, src);
 790 }
 791
 792 static GLuint t_dst(struct r300_fragment_program *rp,
 793                        struct prog_dst_register dest)
 794 {
 795         GLuint r = undef;
 796
 797         switch (dest.File) {
 798         case PROGRAM_TEMPORARY:
 799                 REG_SET_INDEX(r, dest.Index);
 800                 REG_SET_VALID(r, GL_TRUE);
 801                 REG_SET_TYPE(r, REG_TYPE_TEMP);
 802                 return r;
 803         case PROGRAM_OUTPUT:
 804                 REG_SET_TYPE(r, REG_TYPE_OUTPUT);
 805                 switch (dest.Index) {
 806                 case FRAG_RESULT_COLR:
 807                 case FRAG_RESULT_DEPR:
 808                         REG_SET_INDEX(r, dest.Index);
 809                         REG_SET_VALID(r, GL_TRUE);
 810                         return r;
 811                 default:
 812                         ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
 813                         return r;
 814                 }
 815         default:
 816                 ERROR("Bad DstReg->File 0x%x\n", dest.File);
 817                 return r;
 818         }
 819 }
 820
 821 static int t_hw_src(struct r300_fragment_program *rp,
 822                     GLuint src,
 823                     GLboolean tex)
 824 {
 825         COMPILE_STATE;
 826         int idx;
 827         int index = REG_GET_INDEX(src);
 828
 829         switch(REG_GET_TYPE(src)) {
 830         case REG_TYPE_TEMP:
 831                 /* NOTE: if reg==-1 here, a source is being read that
 832                  *       hasn't been written to. Undefined results.
 833                  */
 834                 if (cs->temps[index].reg == -1)
 835                         cs->temps[index].reg = get_hw_temp(rp, cs->nrslots);
 836
 837                 idx = cs->temps[index].reg;
 838
 839                 if (!REG_GET_NO_USE(src) &&
 840                     (--cs->temps[index].refcount == 0))
 841                         free_temp(rp, src);
 842                 break;
 843         case REG_TYPE_INPUT:
 844                 idx = cs->inputs[index].reg;
 845
 846                 if (!REG_GET_NO_USE(src) &&
 847                     (--cs->inputs[index].refcount == 0))
 848                         free_hw_temp(rp, cs->inputs[index].reg);
 849                 break;
 850         case REG_TYPE_CONST:
 851                 return (index | SRC_CONST);
 852         default:
 853                 ERROR("Invalid type for source reg\n");
 854                 return (0 | SRC_CONST);
 855         }
 856
 857         if (!tex)
 858                 cs->used_in_node |= (1 << idx);
 859
 860         return idx;
 861 }
 862
 863 static int t_hw_dst(struct r300_fragment_program *rp,
 864                     GLuint dest,
 865                     GLboolean tex,
 866                     int slot)
 867 {
 868         COMPILE_STATE;
 869         int idx;
 870         GLuint index = REG_GET_INDEX(dest);
 871         assert(REG_GET_VALID(dest));
 872
 873         switch(REG_GET_TYPE(dest)) {
 874         case REG_TYPE_TEMP:
 875                 if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
 876                         if (!tex) {
 877                                 cs->temps[index].reg = get_hw_temp(rp, slot);
 878                         } else {
 879                                 cs->temps[index].reg = get_hw_temp_tex(rp);
 880                         }
 881                 }
 882                 idx = cs->temps[index].reg;
 883
 884                 if (!REG_GET_NO_USE(dest) &&
 885                     (--cs->temps[index].refcount == 0))
 886                         free_temp(rp, dest);
 887
 888                 cs->dest_in_node |= (1 << idx);
 889                 cs->used_in_node |= (1 << idx);
 890                 break;
 891         case REG_TYPE_OUTPUT:
 892                 switch(index) {
 893                 case FRAG_RESULT_COLR:
 894                         rp->node[rp->cur_node].flags |= R300_PFS_NODE_OUTPUT_COLOR;
 895                         break;
 896                 case FRAG_RESULT_DEPR:
 897                         rp->node[rp->cur_node].flags |= R300_PFS_NODE_OUTPUT_DEPTH;
 898                         break;
 899                 }
 900                 return index;
 901                 break;
 902         default:
 903                 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
 904                 return 0;
 905         }
 906
 907         return idx;
 908 }
 909
 910 static void emit_nop(struct r300_fragment_program *rp)
 911 {
 912         COMPILE_STATE;
 913
 914         if (cs->nrslots >= PFS_MAX_ALU_INST) {
 915                 ERROR("Out of ALU instruction slots\n");
 916                 return;
 917         }
 918
 919         rp->alu.inst[cs->nrslots].inst0 = NOP_INST0;
 920         rp->alu.inst[cs->nrslots].inst1 = NOP_INST1;
 921         rp->alu.inst[cs->nrslots].inst2 = NOP_INST2;
 922         rp->alu.inst[cs->nrslots].inst3 = NOP_INST3;
 923         cs->nrslots++;
 924 }
 925
 926 static void emit_tex(struct r300_fragment_program *rp,
 927                      struct prog_instruction *fpi,
 928                      int opcode)
 929 {
 930         COMPILE_STATE;
 931         GLuint coord = t_src(rp, fpi->SrcReg[0]);
 932         GLuint dest = undef, rdest = undef;
 933         GLuint din = cs->dest_in_node, uin = cs->used_in_node;
 934         int unit = fpi->TexSrcUnit;
 935         int hwsrc, hwdest;
 936
 937         /* Resolve source/dest to hardware registers */
 938         hwsrc = t_hw_src(rp, coord, GL_TRUE);
 939         if (opcode != R300_FPITX_OP_KIL) {
 940                 dest = t_dst(rp, fpi->DstReg);
 941
 942                 /* r300 doesn't seem to be able to do TEX->output reg */
 943                 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
 944                         rdest = dest;
 945                         dest = get_temp_reg_tex(rp);
 946                 }
 947                 hwdest = t_hw_dst(rp, dest, GL_TRUE, rp->node[rp->cur_node].alu_offset);
 948
 949                 /* Use a temp that hasn't been used in this node, rather
 950                  * than causing an indirection
 951                  */
 952                 if (uin & (1 << hwdest)) {
 953                         free_hw_temp(rp, hwdest);
 954                         hwdest = get_hw_temp_tex(rp);
 955                         cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
 956                 }
 957         } else {
 958                 hwdest = 0;
 959                 unit = 0;
 960         }
 961
 962         /* Indirection if source has been written in this node, or if the
 963          * dest has been read/written in this node
 964          */
 965         if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
 966              (din & (1<<hwsrc))) || (uin & (1<<hwdest))) {
 967
 968                 /* Finish off current node */
 969                 if (rp->node[rp->cur_node].alu_offset == cs->nrslots)
 970                         emit_nop(rp);
 971
 972                 rp->node[rp->cur_node].alu_end =
 973                                 cs->nrslots - rp->node[rp->cur_node].alu_offset - 1;
 974                 assert(rp->node[rp->cur_node].alu_end >= 0);
 975
 976                 if (++rp->cur_node >= PFS_MAX_TEX_INDIRECT) {
 977                         ERROR("too many levels of texture indirection\n");
 978                         return;
 979                 }
 980
 981                 /* Start new node */
 982                 rp->node[rp->cur_node].tex_offset = rp->tex.length;
 983                 rp->node[rp->cur_node].alu_offset = cs->nrslots;
 984                 rp->node[rp->cur_node].tex_end = -1;
 985                 rp->node[rp->cur_node].alu_end = -1;
 986                 rp->node[rp->cur_node].flags = 0;
 987                 cs->used_in_node = 0;
 988                 cs->dest_in_node = 0;
 989         }
 990
 991         if (rp->cur_node == 0)
 992                 rp->first_node_has_tex = 1;
 993
 994         rp->tex.inst[rp->tex.length++] = 0
 995                 | (hwsrc << R300_FPITX_SRC_SHIFT)
 996                 | (hwdest << R300_FPITX_DST_SHIFT)
 997                 | (unit << R300_FPITX_IMAGE_SHIFT)
 998                 /* not entirely sure about this */
 999                 | (opcode << R300_FPITX_OPCODE_SHIFT);
1000
1001         cs->dest_in_node |= (1 << hwdest);
1002         if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
1003                 cs->used_in_node |= (1 << hwsrc);
1004
1005         rp->node[rp->cur_node].tex_end++;
1006
1007         /* Copy from temp to output if needed */
1008         if (REG_GET_VALID(rdest)) {
1009                 emit_arith(rp, PFS_OP_MAD, rdest, WRITEMASK_XYZW, dest,
1010                            pfs_one, pfs_zero, 0);
1011                 free_temp(rp, dest);
1012         }
1013 }
1014
1015
1016 /**
1017  * Returns the first slot where we could possibly allow writing to dest,
1018  * according to register allocation.
1019  */
1020 static int get_earliest_allowed_write(
1021                 struct r300_fragment_program* rp,
1022                 GLuint dest, int mask)
1023 {
1024         COMPILE_STATE;
1025         int idx;
1026         int pos;
1027         GLuint index = REG_GET_INDEX(dest);
1028         assert(REG_GET_VALID(dest));
1029
1030         switch(REG_GET_TYPE(dest)) {
1031                 case REG_TYPE_TEMP:
1032                         if (cs->temps[index].reg == -1)
1033                                 return 0;
1034
1035                         idx = cs->temps[index].reg;
1036                         break;
1037                 case REG_TYPE_OUTPUT:
1038                         return 0;
1039                 default:
1040                         ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
1041                         return 0;
1042         }
1043
1044         pos = cs->hwtemps[idx].reserved;
1045         if (mask & WRITEMASK_XYZ) {
1046                 if (pos < cs->hwtemps[idx].vector_lastread)
1047                         pos = cs->hwtemps[idx].vector_lastread;
1048         }
1049         if (mask & WRITEMASK_W) {
1050                 if (pos < cs->hwtemps[idx].scalar_lastread)
1051                         pos = cs->hwtemps[idx].scalar_lastread;
1052         }
1053
1054         return pos;
1055 }
1056
1057
1058 /**
1059  * Allocates a slot for an ALU instruction that can consist of
1060  * a vertex part or a scalar part or both.
1061  *
1062  * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1063  * appropriate position (vector and/or scalar), and their positions are
1064  * recorded in the srcpos array.
1065  *
1066  * This function emits instruction code for the source fetch and the
1067  * argument selection. It does not emit instruction code for the
1068  * opcode or the destination selection.
1069  *
1070  * @return the index of the slot
1071  */
1072 static int find_and_prepare_slot(struct r300_fragment_program* rp,
1073                 GLboolean emit_vop,
1074                 GLboolean emit_sop,
1075                 int argc,
1076                 GLuint* src,
1077                 GLuint dest,
1078                 int mask)
1079 {
1080         COMPILE_STATE;
1081         int hwsrc[3];
1082         int srcpos[3];
1083         unsigned int used;
1084         int tempused;
1085         int tempvsrc[3];
1086         int tempssrc[3];
1087         int pos;
1088         int regnr;
1089         int i,j;
1090
1091         // Determine instruction slots, whether sources are required on
1092         // vector or scalar side, and the smallest slot number where
1093         // all source registers are available
1094         used = 0;
1095         if (emit_vop)
1096                 used |= SLOT_OP_VECTOR;
1097         if (emit_sop)
1098                 used |= SLOT_OP_SCALAR;
1099
1100         pos = get_earliest_allowed_write(rp, dest, mask);
1101
1102         if (rp->node[rp->cur_node].alu_offset > pos)
1103                 pos = rp->node[rp->cur_node].alu_offset;
1104         for(i = 0; i < argc; ++i) {
1105                 if (!REG_GET_BUILTIN(src[i])) {
1106                         if (emit_vop)
1107                                 used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
1108                         if (emit_sop)
1109                                 used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
1110                 }
1111
1112                 hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */
1113                 regnr = hwsrc[i] & 31;
1114
1115                 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1116                         if (used & (SLOT_SRC_VECTOR << i)) {
1117                                 if (cs->hwtemps[regnr].vector_valid > pos)
1118                                         pos = cs->hwtemps[regnr].vector_valid;
1119                         }
1120                         if (used & (SLOT_SRC_SCALAR << i)) {
1121                                 if (cs->hwtemps[regnr].scalar_valid > pos)
1122                                         pos = cs->hwtemps[regnr].scalar_valid;
1123                         }
1124                 }
1125         }
1126
1127         // Find a slot that fits
1128         for(; ; ++pos) {
1129                 if (cs->slot[pos].used & used & SLOT_OP_BOTH)
1130                         continue;
1131
1132                 if (pos >= cs->nrslots) {
1133                         if (cs->nrslots >= PFS_MAX_ALU_INST) {
1134                                 ERROR("Out of ALU instruction slots\n");
1135                                 return -1;
1136                         }
1137
1138                         rp->alu.inst[pos].inst0 = NOP_INST0;
1139                         rp->alu.inst[pos].inst1 = NOP_INST1;
1140                         rp->alu.inst[pos].inst2 = NOP_INST2;
1141                         rp->alu.inst[pos].inst3 = NOP_INST3;
1142
1143                         cs->nrslots++;
1144                 }
1145
1146                 // Note: When we need both parts (vector and scalar) of a source,
1147                 // we always try to put them into the same position. This makes the
1148                 // code easier to read, and it is optimal (i.e. one doesn't gain
1149                 // anything by splitting the parts).
1150                 // It also avoids headaches with swizzles that access both parts (i.e WXY)
1151                 tempused = cs->slot[pos].used;
1152                 for(i = 0; i < 3; ++i) {
1153                         tempvsrc[i] = cs->slot[pos].vsrc[i];
1154                         tempssrc[i] = cs->slot[pos].ssrc[i];
1155                 }
1156
1157                 for(i = 0; i < argc; ++i) {
1158                         int flags = (used >> i) & SLOT_SRC_BOTH;
1159
1160                         if (!flags) {
1161                                 srcpos[i] = 0;
1162                                 continue;
1163                         }
1164
1165                         for(j = 0; j < 3; ++j) {
1166                                 if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
1167                                         if (tempvsrc[j] != hwsrc[i])
1168                                                 continue;
1169                                 }
1170
1171                                 if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
1172                                         if (tempssrc[j] != hwsrc[i])
1173                                                 continue;
1174                                 }
1175
1176                                 break;
1177                         }
1178
1179                         if (j == 3)
1180                                 break;
1181
1182                         srcpos[i] = j;
1183                         tempused |= flags << j;
1184                         if (flags & SLOT_SRC_VECTOR)
1185                                 tempvsrc[j] = hwsrc[i];
1186                         if (flags & SLOT_SRC_SCALAR)
1187                                 tempssrc[j] = hwsrc[i];
1188                 }
1189
1190                 if (i == argc)
1191                         break;
1192         }
1193
1194         // Found a slot, reserve it
1195         cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
1196         for(i = 0; i < 3; ++i) {
1197                 cs->slot[pos].vsrc[i] = tempvsrc[i];
1198                 cs->slot[pos].ssrc[i] = tempssrc[i];
1199         }
1200
1201         for(i = 0; i < argc; ++i) {
1202                 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1203                         int regnr = hwsrc[i] & 31;
1204
1205                         if (used & (SLOT_SRC_VECTOR << i)) {
1206                                 if (cs->hwtemps[regnr].vector_lastread < pos)
1207                                         cs->hwtemps[regnr].vector_lastread = pos;
1208                         }
1209                         if (used & (SLOT_SRC_SCALAR << i)) {
1210                                 if (cs->hwtemps[regnr].scalar_lastread < pos)
1211                                         cs->hwtemps[regnr].scalar_lastread = pos;
1212                         }
1213                 }
1214         }
1215
1216         // Emit the source fetch code
1217         rp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK;
1218         rp->alu.inst[pos].inst1 |=
1219                         ((cs->slot[pos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
1220                          (cs->slot[pos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
1221                          (cs->slot[pos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
1222
1223         rp->alu.inst[pos].inst3 &= ~R300_FPI3_SRC_MASK;
1224         rp->alu.inst[pos].inst3 |=
1225                         ((cs->slot[pos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
1226                          (cs->slot[pos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
1227                          (cs->slot[pos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
1228
1229         // Emit the argument selection code
1230         if (emit_vop) {
1231                 int swz[3];
1232
1233                 for(i = 0; i < 3; ++i) {
1234                         if (i < argc) {
1235                                 swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
1236                                             (srcpos[i] * v_swiz[REG_GET_VSWZ(src[i])].stride)) |
1237                                         ((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) |
1238                                         ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
1239                         } else {
1240                                 swz[i] = R300_FPI0_ARGC_ZERO;
1241                         }
1242                 }
1243
1244                 rp->alu.inst[pos].inst0 &=
1245                                 ~(R300_FPI0_ARG0C_MASK|R300_FPI0_ARG1C_MASK|R300_FPI0_ARG2C_MASK);
1246                 rp->alu.inst[pos].inst0 |=
1247                                 (swz[0] << R300_FPI0_ARG0C_SHIFT) |
1248                                 (swz[1] << R300_FPI0_ARG1C_SHIFT) |
1249                                 (swz[2] << R300_FPI0_ARG2C_SHIFT);
1250         }
1251
1252         if (emit_sop) {
1253                 int swz[3];
1254
1255                 for(i = 0; i < 3; ++i) {
1256                         if (i < argc) {
1257                                 swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
1258                                                 (srcpos[i] * s_swiz[REG_GET_SSWZ(src[i])].stride)) |
1259                                                 ((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) |
1260                                                 ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
1261                         } else {
1262                                 swz[i] = R300_FPI2_ARGA_ZERO;
1263                         }
1264                 }
1265
1266                 rp->alu.inst[pos].inst2 &=
1267                                 ~(R300_FPI2_ARG0A_MASK|R300_FPI2_ARG1A_MASK|R300_FPI2_ARG2A_MASK);
1268                 rp->alu.inst[pos].inst2 |=
1269                                 (swz[0] << R300_FPI2_ARG0A_SHIFT) |
1270                                 (swz[1] << R300_FPI2_ARG1A_SHIFT) |
1271                                 (swz[2] << R300_FPI2_ARG2A_SHIFT);
1272         }
1273
1274         return pos;
1275 }
1276
1277
1278 /**
1279  * Append an ALU instruction to the instruction list.
1280  */
1281 static void emit_arith(struct r300_fragment_program *rp,
1282                        int op,
1283                        GLuint dest,
1284                        int mask,
1285                        GLuint src0,
1286                        GLuint src1,
1287                        GLuint src2,
1288                        int flags)
1289 {
1290         COMPILE_STATE;
1291         GLuint src[3] = { src0, src1, src2 };
1292         int hwdest;
1293         GLboolean emit_vop, emit_sop;
1294         int vop, sop, argc;
1295         int pos;
1296
1297         vop = r300_fpop[op].v_op;
1298         sop = r300_fpop[op].s_op;
1299         argc = r300_fpop[op].argc;
1300
1301         if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
1302             REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
1303                 if (mask & WRITEMASK_Z) {
1304                         mask = WRITEMASK_W;
1305                 } else {
1306                         return;
1307                 }
1308         }
1309
1310         emit_vop = GL_FALSE;
1311         emit_sop = GL_FALSE;
1312         if ((mask & WRITEMASK_XYZ) || vop == R300_FPI0_OUTC_DP3)
1313                 emit_vop = GL_TRUE;
1314         if ((mask & WRITEMASK_W) || vop == R300_FPI0_OUTC_REPL_ALPHA)
1315                 emit_sop = GL_TRUE;
1316
1317         pos = find_and_prepare_slot(rp, emit_vop, emit_sop, argc, src, dest, mask);
1318         if (pos < 0)
1319                 return;
1320
1321         hwdest = t_hw_dst(rp, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */
1322
1323         if (flags & PFS_FLAG_SAT) {
1324                 vop |= R300_FPI0_OUTC_SAT;
1325                 sop |= R300_FPI2_OUTA_SAT;
1326         }
1327
1328         /* Throw the pieces together and get FPI0/1 */
1329         if (emit_vop) {
1330                 rp->alu.inst[pos].inst0 |= vop;
1331
1332                 rp->alu.inst[pos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT;
1333
1334                 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1335                         if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1336                                 rp->alu.inst[pos].inst1 |=
1337                                         (mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_OUTPUT_MASK_SHIFT;
1338                         } else assert(0);
1339                 } else {
1340                         rp->alu.inst[pos].inst1 |=
1341                                         (mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_REG_MASK_SHIFT;
1342
1343                         cs->hwtemps[hwdest].vector_valid = pos+1;
1344                 }
1345         }
1346
1347         /* And now FPI2/3 */
1348         if (emit_sop) {
1349                 rp->alu.inst[pos].inst2 |= sop;
1350
1351                 if (mask & WRITEMASK_W) {
1352                         if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1353                                 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1354                                         rp->alu.inst[pos].inst3 |=
1355                                                         (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_OUTPUT;
1356                                 } else if (REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
1357                                         rp->alu.inst[pos].inst3 |= R300_FPI3_DSTA_DEPTH;
1358                                 } else assert(0);
1359                         } else {
1360                                 rp->alu.inst[pos].inst3 |=
1361                                                 (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_REG;
1362
1363                                 cs->hwtemps[hwdest].scalar_valid = pos+1;
1364                         }
1365                 }
1366         }
1367
1368         return;
1369 }
1370
#if 0
/* Currently compiled out.
 *
 * Build an input register for fragment attribute \p attr. Reports an error
 * and returns the undefined register when the program does not read that
 * attribute.
 */
static GLuint get_attrib(struct r300_fragment_program *rp, GLuint attr)
{
        struct gl_fragment_program *mp = &rp->mesa_program;
        GLuint r = undef;

        if (mp->Base.InputsRead & (1<<attr)) {
                REG_SET_TYPE(r, REG_TYPE_INPUT);
                REG_SET_INDEX(r, attr);
                REG_SET_VALID(r, GL_TRUE);
                return r;
        }

        ERROR("Attribute %d was not provided!\n", attr);
        return undef;
}
#endif
1388
1389 static GLfloat SinCosConsts[2][4] = {
1390         {
1391                 1.273239545,  // 4/PI
1392                 -0.405284735, // -4/(PI*PI)
1393                 3.141592654,  // PI
1394                 0.2225        // weight
1395         },
1396         {
1397                 0.75,
1398                 0.0,
1399                 0.159154943,  // 1/(2*PI)
1400                 6.283185307   // 2*PI
1401         }
1402 };
1403
1404
1405 /**
1406  * Emit a LIT instruction.
1407  * \p flags may be PFS_FLAG_SAT
1408  *
1409  * Definition of LIT (from ARB_fragment_program):
1410  * tmp = VectorLoad(op0);
1411  * if (tmp.x < 0) tmp.x = 0;
1412  * if (tmp.y < 0) tmp.y = 0;
1413  * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
1414  * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
1415  * result.x = 1.0;
1416  * result.y = tmp.x;
1417  * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
1418  * result.w = 1.0;
1419  *
1420  * The longest path of computation is the one leading to result.z,
1421  * consisting of 5 operations. This implementation of LIT takes
1422  * 5 slots. So unless there's some special undocumented opcode,
1423  * this implementation is potentially optimal. Unfortunately,
1424  * emit_arith is a bit too conservative because it doesn't understand
1425  * partial writes to the vector component.
1426  */
1427 static const GLfloat LitConst[4] = { 127.999999, 127.999999, 127.999999, -127.999999 };
1428
1429 static void emit_lit(struct r300_fragment_program *rp,
1430                 GLuint dest,
1431                 int mask,
1432                 GLuint src,
1433                 int flags)
1434 {
1435         COMPILE_STATE;
1436         GLuint cnst;
1437         int needTemporary;
1438         GLuint temp;
1439
1440         cnst = emit_const4fv(rp, LitConst);
1441
1442         needTemporary = 0;
1443         if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
1444                 needTemporary = 1;
1445         } else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1446                 // LIT is typically followed by DP3/DP4, so there's no point
1447                 // in creating special code for this case
1448                 needTemporary = 1;
1449         }
1450
1451         if (needTemporary) {
1452                 temp = keep(get_temp_reg(rp));
1453         } else {
1454                 temp = keep(dest);
1455         }
1456
1457         // Note: The order of emit_arith inside the slots is relevant,
1458         // because emit_arith only looks at scalar vs. vector when resolving
1459         // dependencies, and it does not consider individual vector components,
1460         // so swizzling between the two parts can create fake dependencies.
1461
1462         // First slot
1463         emit_arith(rp, PFS_OP_MAX, temp, WRITEMASK_XY,
1464                    keep(src), pfs_zero, undef, 0);
1465         emit_arith(rp, PFS_OP_MAX, temp, WRITEMASK_W,
1466                    src, cnst, undef, 0);
1467
1468         // Second slot
1469         emit_arith(rp, PFS_OP_MIN, temp, WRITEMASK_Z,
1470                    swizzle(temp, W, W, W, W), cnst, undef, 0);
1471         emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_W,
1472                    swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
1473
1474         // Third slot
1475         // If desired, we saturate the y result here.
1476         // This does not affect the use as a condition variable in the CMP later
1477         emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W,
1478                    temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
1479         emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_Y,
1480                    swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
1481
1482         // Fourth slot
1483         emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X,
1484                    pfs_one, pfs_one, pfs_zero, 0);
1485         emit_arith(rp, PFS_OP_EX2, temp, WRITEMASK_W,
1486                    temp, undef, undef, 0);
1487
1488         // Fifth slot
1489         emit_arith(rp, PFS_OP_CMP, temp, WRITEMASK_Z,
1490                    pfs_zero, swizzle(temp, W, W, W, W), negate(swizzle(temp, Y, Y, Y, Y)), flags);
1491         emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W,
1492                    pfs_one, pfs_one, pfs_zero, 0);
1493
1494         if (needTemporary) {
1495                 emit_arith(rp, PFS_OP_MAD, dest, mask,
1496                                    temp, pfs_one, pfs_zero, flags);
1497                 free_temp(rp, temp);
1498         } else {
1499                 // Decrease refcount of the destination
1500                 t_hw_dst(rp, dest, GL_FALSE, cs->nrslots);
1501         }
1502 }
1503
1504
/*
 * Walk the Mesa fragment program stored in rp->mesa_program and emit the
 * corresponding PFS operations through emit_arith(), emit_tex() and
 * emit_lit().
 *
 * For every instruction up to (not including) OPCODE_END:
 *   - SATURATE_ZERO_ONE is turned into PFS_FLAG_SAT, which is passed to
 *     the instruction that finally writes the destination;
 *   - the destination register and write mask are resolved (except for
 *     KIL, whose case does not use them);
 *   - the opcode is either mapped onto a single PFS op or expanded into a
 *     short sequence using temporaries from get_temp_reg()/free_temp().
 *
 * Returns GL_FALSE when the program is missing/empty or when rp->error is
 * set after handling an instruction, GL_TRUE otherwise.
 */
1505 static GLboolean parse_program(struct r300_fragment_program *rp)
1506 {
1507         struct gl_fragment_program *mp = &rp->mesa_program;
1508         const struct prog_instruction *inst = mp->Base.Instructions;
1509         struct prog_instruction *fpi;
1510         GLuint src[3], dest, temp[2];
1511         int flags, mask = 0;
1512         int const_sin[2];
1513
1514         if (!inst || inst[0].Opcode == OPCODE_END) {
1515                 ERROR("empty program?\n");
1516                 return GL_FALSE;
1517         }
1518
1519         for (fpi=mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
1520                 if (fpi->SaturateMode == SATURATE_ZERO_ONE)
1521                         flags = PFS_FLAG_SAT;
1522                 else
1523                         flags = 0;
1524
                     /* dest/mask are only resolved for opcodes other than
                      * KIL; the KIL case below does not read them. */
1525                 if (fpi->Opcode != OPCODE_KIL) {
1526                         dest = t_dst(rp, fpi->DstReg);
1527                         mask = fpi->DstReg.WriteMask;
1528                 }
1529
1530                 switch (fpi->Opcode) {
1531                 case OPCODE_ABS:
                             /* dest = |src0| * 1 + 0 */
1532                         src[0] = t_src(rp, fpi->SrcReg[0]);
1533                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1534                                    absolute(src[0]), pfs_one, pfs_zero,
1535                                    flags);
1536                         break;
1537                 case OPCODE_ADD:
                             /* dest = src0 * 1 + src1 */
1538                         src[0] = t_src(rp, fpi->SrcReg[0]);
1539                         src[1] = t_src(rp, fpi->SrcReg[1]);
1540                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1541                                    src[0], pfs_one, src[1],
1542                                    flags);
1543                         break;
1544                 case OPCODE_CMP:
1545                         src[0] = t_src(rp, fpi->SrcReg[0]);
1546                         src[1] = t_src(rp, fpi->SrcReg[1]);
1547                         src[2] = t_src(rp, fpi->SrcReg[2]);
1548                         /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1549                          *    r300 - if src2.c < 0.0 ? src1.c : src0.c
                              * hence the first and third operands are swapped.
1550                          */
1551                         emit_arith(rp, PFS_OP_CMP, dest, mask,
1552                                    src[2], src[1], src[0],
1553                                    flags);
1554                         break;
1555                 case OPCODE_COS:
1556                         /*
1557                          * cos using a parabola (see SIN):
1558                          * cos(x):
1559                          *   x = (x/(2*PI))+0.75
1560                          *   x = frac(x)
1561                          *   x = (x*2*PI)-PI
1562                          *   result = sin(x)
1563                          */
1564                         temp[0] = get_temp_reg(rp);
1565                         const_sin[0] = emit_const4fv(rp, SinCosConsts[0]);
1566                         const_sin[1] = emit_const4fv(rp, SinCosConsts[1]);
1567                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1568
1569                         /* add 0.5*PI and do range reduction */
1570
1571                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1572                                    swizzle(src[0], X, X, X, X),
1573                                    swizzle(const_sin[1], Z, Z, Z, Z),
1574                                    swizzle(const_sin[1], X, X, X, X),
1575                                    0);
1576
1577                         emit_arith(rp, PFS_OP_FRC, temp[0], WRITEMASK_X,
1578                                    swizzle(temp[0], X, X, X, X),
1579                                    undef,
1580                                    undef,
1581                                    0);
1582
1583                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Z,
1584                                    swizzle(temp[0], X, X, X, X),
1585                                    swizzle(const_sin[1], W, W, W, W), //2*PI
1586                                    negate(swizzle(const_sin[0], Z, Z, Z, Z)), //-PI
1587                                    0);
1588
1589                         /* SIN */
1590
1591                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X | WRITEMASK_Y,
1592                                    swizzle(temp[0], Z, Z, Z, Z),
1593                                    const_sin[0],
1594                                    pfs_zero,
1595                                    0);
1596
1597                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1598                                    swizzle(temp[0], Y, Y, Y, Y),
1599                                    absolute(swizzle(temp[0], Z, Z, Z, Z)),
1600                                    swizzle(temp[0], X, X, X, X),
1601                                    0);
1602
1603                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1604                                    swizzle(temp[0], X, X, X, X),
1605                                    absolute(swizzle(temp[0], X, X, X, X)),
1606                                    negate(swizzle(temp[0], X, X, X, X)),
1607                                    0);
1608
1609
1610                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1611                                    swizzle(temp[0], Y, Y, Y, Y),
1612                                    swizzle(const_sin[0], W, W, W, W),
1613                                    swizzle(temp[0], X, X, X, X),
1614                                    flags);
1615
1616                         free_temp(rp, temp[0]);
1617                         break;
1618                 case OPCODE_DP3:
1619                         src[0] = t_src(rp, fpi->SrcReg[0]);
1620                         src[1] = t_src(rp, fpi->SrcReg[1]);
1621                         emit_arith(rp, PFS_OP_DP3, dest, mask,
1622                                    src[0], src[1], undef,
1623                                    flags);
1624                         break;
1625                 case OPCODE_DP4:
1626                         src[0] = t_src(rp, fpi->SrcReg[0]);
1627                         src[1] = t_src(rp, fpi->SrcReg[1]);
1628                         emit_arith(rp, PFS_OP_DP4, dest, mask,
1629                                    src[0], src[1], undef,
1630                                    flags);
1631                         break;
1632                 case OPCODE_DPH:
1633                         src[0] = t_src(rp, fpi->SrcReg[0]);
1634                         src[1] = t_src(rp, fpi->SrcReg[1]);
1635                         /* src0.xyz1 -> temp
1636                          * DP4 dest, temp, src1
                              *
                              * The enabled branch skips the temporary and
                              * swizzles src0 to (x, y, z, 1) directly.
1637                          */
1638 #if 0
1639                         temp[0] = get_temp_reg(rp);
1640                         src[0].s_swz = SWIZZLE_ONE;
1641                         emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1642                                    src[0], pfs_one, pfs_zero,
1643                                    0);
1644                         emit_arith(rp, PFS_OP_DP4, dest, mask,
1645                                    temp[0], src[1], undef,
1646                                    flags);
1647                         free_temp(rp, temp[0]);
1648 #else
1649                         emit_arith(rp, PFS_OP_DP4, dest, mask,
1650                                    swizzle(src[0], X, Y, Z, ONE), src[1],
1651                                    undef, flags);
1652 #endif
1653                         break;
1654                 case OPCODE_DST:
1655                         src[0] = t_src(rp, fpi->SrcReg[0]);
1656                         src[1] = t_src(rp, fpi->SrcReg[1]);
1657                         /* dest.y = src0.y * src1.y */
1658                         if (mask & WRITEMASK_Y)
1659                                 emit_arith(rp, PFS_OP_MAD, dest, WRITEMASK_Y,
1660                                            keep(src[0]), keep(src[1]),
1661                                            pfs_zero, flags);
1662                         /* dest.z = src0.z */
1663                         if (mask & WRITEMASK_Z)
1664                                 emit_arith(rp, PFS_OP_MAD, dest, WRITEMASK_Z,
1665                                            src[0], pfs_one, pfs_zero, flags);
1666                         /* result.x = 1.0
1667                          * result.w = src1.w */
1668                         if (mask & WRITEMASK_XW) {
1669                                 REG_SET_VSWZ(src[1], SWIZZLE_111); /*Cheat*/
1670                                 emit_arith(rp, PFS_OP_MAD, dest,
1671                                            mask & WRITEMASK_XW,
1672                                            src[1], pfs_one, pfs_zero,
1673                                            flags);
1674                         }
1675                         break;
1676                 case OPCODE_EX2:
1677                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1678                         emit_arith(rp, PFS_OP_EX2, dest, mask,
1679                                    src[0], undef, undef,
1680                                    flags);
1681                         break;
1682                 case OPCODE_FLR:
1683                         src[0] = t_src(rp, fpi->SrcReg[0]);
1684                         temp[0] = get_temp_reg(rp);
1685                         /* FRC temp, src0
1686                          * MAD dest, src0, 1.0, -temp
1687                          */
1688                         emit_arith(rp, PFS_OP_FRC, temp[0], mask,
1689                                    keep(src[0]), undef, undef,
1690                                    0);
1691                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1692                                    src[0], pfs_one, negate(temp[0]),
1693                                    flags);
1694                         free_temp(rp, temp[0]);
1695                         break;
1696                 case OPCODE_FRC:
1697                         src[0] = t_src(rp, fpi->SrcReg[0]);
1698                         emit_arith(rp, PFS_OP_FRC, dest, mask,
1699                                    src[0], undef, undef,
1700                                    flags);
1701                         break;
1702                 case OPCODE_KIL:
1703                         emit_tex(rp, fpi, R300_FPITX_OP_KIL);
1704                         break;
1705                 case OPCODE_LG2:
1706                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1707                         emit_arith(rp, PFS_OP_LG2, dest, mask,
1708                                    src[0], undef, undef,
1709                                    flags);
1710                         break;
1711                 case OPCODE_LIT:
1712                         src[0] = t_src(rp, fpi->SrcReg[0]);
1713                         emit_lit(rp, dest, mask, src[0], flags);
1714                         break;
1715                 case OPCODE_LRP:
1716                         src[0] = t_src(rp, fpi->SrcReg[0]);
1717                         src[1] = t_src(rp, fpi->SrcReg[1]);
1718                         src[2] = t_src(rp, fpi->SrcReg[2]);
1719                         /* result = tmp0tmp1 + (1 - tmp0)tmp2
1720                          *        = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1721                          *     MAD temp, -tmp0, tmp2, tmp2
1722                          *     MAD result, tmp0, tmp1, temp
1723                          */
1724                         temp[0] = get_temp_reg(rp);
1725                         emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1726                                    negate(keep(src[0])), keep(src[2]), src[2],
1727                                    0);
1728                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1729                                    src[0], src[1], temp[0],
1730                                    flags);
1731                         free_temp(rp, temp[0]);
1732                         break;
1733                 case OPCODE_MAD:
1734                         src[0] = t_src(rp, fpi->SrcReg[0]);
1735                         src[1] = t_src(rp, fpi->SrcReg[1]);
1736                         src[2] = t_src(rp, fpi->SrcReg[2]);
1737                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1738                                    src[0], src[1], src[2],
1739                                    flags);
1740                         break;
1741                 case OPCODE_MAX:
1742                         src[0] = t_src(rp, fpi->SrcReg[0]);
1743                         src[1] = t_src(rp, fpi->SrcReg[1]);
1744                         emit_arith(rp, PFS_OP_MAX, dest, mask,
1745                                    src[0], src[1], undef,
1746                                    flags);
1747                         break;
1748                 case OPCODE_MIN:
1749                         src[0] = t_src(rp, fpi->SrcReg[0]);
1750                         src[1] = t_src(rp, fpi->SrcReg[1]);
1751                         emit_arith(rp, PFS_OP_MIN, dest, mask,
1752                                    src[0], src[1], undef,
1753                                    flags);
1754                         break;
1755                 case OPCODE_MOV:
1756                 case OPCODE_SWZ:
                             /* dest = src0 * 1 + 0 */
1757                         src[0] = t_src(rp, fpi->SrcReg[0]);
1758                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1759                                    src[0], pfs_one, pfs_zero,
1760                                    flags);
1761                         break;
1762                 case OPCODE_MUL:
1763                         src[0] = t_src(rp, fpi->SrcReg[0]);
1764                         src[1] = t_src(rp, fpi->SrcReg[1]);
1765                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1766                                    src[0], src[1], pfs_zero,
1767                                    flags);
1768                         break;
1769                 case OPCODE_POW:
                             /* pow(a, b) = 2^(b * log2(a)):
                              *     LG2 temp.w, src0
                              *     MAD temp.w, temp, src1, 0
                              *     EX2 dest, temp
                              * The saturate flag is applied to the final
                              * EX2, like every other opcode that writes
                              * dest with flags.
                              */
1770                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1771                         src[1] = t_scalar_src(rp, fpi->SrcReg[1]);
1772                         temp[0] = get_temp_reg(rp);
1773                         emit_arith(rp, PFS_OP_LG2, temp[0], WRITEMASK_W,
1774                                    src[0], undef, undef,
1775                                    0);
1776                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_W,
1777                                    temp[0], src[1], pfs_zero,
1778                                    0);
1779                         emit_arith(rp, PFS_OP_EX2, dest, mask,
1780                                    temp[0], undef, undef,
1781                                    flags);
1782                         free_temp(rp, temp[0]);
1783                         break;
1784                 case OPCODE_RCP:
1785                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1786                         emit_arith(rp, PFS_OP_RCP, dest, mask,
1787                                    src[0], undef, undef,
1788                                    flags);
1789                         break;
1790                 case OPCODE_RSQ:
                             /* the operand is wrapped in absolute() before RSQ */
1791                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1792                         emit_arith(rp, PFS_OP_RSQ, dest, mask,
1793                                    absolute(src[0]), pfs_zero, pfs_zero,
1794                                    flags);
1795                         break;
1796                 case OPCODE_SCS:
1797                         /*
1798                          * scs using a parabola :
1799                          * scs(x):
1800                          *   result.x = sin(-abs(x)+0.5*PI)  (cos)
1801                          *   result.y = sin(x)               (sin)
1802                          *
1803                          */
1804                         temp[0] = get_temp_reg(rp);
1805                         temp[1] = get_temp_reg(rp);
1806                         const_sin[0] = emit_const4fv(rp, SinCosConsts[0]);
1807                         const_sin[1] = emit_const4fv(rp, SinCosConsts[1]);
1808                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1809
1810                         /* x = -abs(x)+0.5*PI
                              *
                              * absolute() is the source-operand helper also
                              * used by OPCODE_ABS above; libc abs() would act
                              * on the packed GLuint value instead of marking
                              * the operand.
                              */
1811                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Z,
1812                                    swizzle(const_sin[0], Z, Z, Z, Z), //PI
1813                                    pfs_half,
1814                                    negate(absolute(swizzle(keep(src[0]), X, X, X, X))),
1815                                    0);
1816
1817                         /* C*x (sin) */
1818                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_W,
1819                                    swizzle(const_sin[0], Y, Y, Y, Y),
1820                                    swizzle(keep(src[0]), X, X, X, X),
1821                                    pfs_zero,
1822                                    0);
1823
1824                         /* B*x, C*x (cos) */
1825                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X | WRITEMASK_Y,
1826                                    swizzle(temp[0], Z, Z, Z, Z),
1827                                    const_sin[0],
1828                                    pfs_zero,
1829                                    0);
1830
1831                         /* B*x (sin) */
1832                         emit_arith(rp, PFS_OP_MAD, temp[1], WRITEMASK_W,
1833                                    swizzle(const_sin[0], X, X, X, X),
1834                                    keep(src[0]),
1835                                    pfs_zero,
1836                                    0);
1837
1838                         /* y = B*x + C*x*abs(x) (sin)*/
1839                         emit_arith(rp, PFS_OP_MAD, temp[1], WRITEMASK_Z,
1840                                    absolute(src[0]),
1841                                    swizzle(temp[0], W, W, W, W),
1842                                    swizzle(temp[1], W, W, W, W),
1843                                    0);
1844
1845                         /* y = B*x + C*x*abs(x) (cos)*/
1846                         emit_arith(rp, PFS_OP_MAD, temp[1], WRITEMASK_W,
1847                                    swizzle(temp[0], Y, Y, Y, Y),
1848                                    absolute(swizzle(temp[0], Z, Z, Z, Z)),
1849                                    swizzle(temp[0], X, X, X, X),
1850                                    0);
1851
1852                         /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1853                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X | WRITEMASK_Y,
1854                                    swizzle(temp[1], W, Z, Y, X),
1855                                    absolute(swizzle(temp[1], W, Z, Y, X)),
1856                                    negate(swizzle(temp[1], W, Z, Y, X)),
1857
1858                                    0);
1859
1860                         /* dest.xy = mad(temp.xy, P, temp2.wz) */
1861                         emit_arith(rp, PFS_OP_MAD, dest, mask & (WRITEMASK_X | WRITEMASK_Y),
1862                                    temp[0],
1863                                    swizzle(const_sin[0], W, W, W, W),
1864                                    swizzle(temp[1], W, Z, Y, X),
1865                                    flags);
1866
1867                         free_temp(rp, temp[0]);
1868                         free_temp(rp, temp[1]);
1869                         break;
1870                 case OPCODE_SGE:
1871                         src[0] = t_src(rp, fpi->SrcReg[0]);
1872                         src[1] = t_src(rp, fpi->SrcReg[1]);
1873                         temp[0] = get_temp_reg(rp);
1874                         /* temp = src0 - src1
1875                          * dest.c = (temp.c < 0.0) ? 0 : 1
                              *
                              * The result is always 0 or 1, so no saturate
                              * flag is passed to the CMP.
1876                          */
1877                         emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1878                                    src[0], pfs_one, negate(src[1]),
1879                                    0);
1880                         emit_arith(rp, PFS_OP_CMP, dest, mask,
1881                                    pfs_one, pfs_zero, temp[0],
1882                                    0);
1883                         free_temp(rp, temp[0]);
1884                         break;
1885                 case OPCODE_SIN:
1886                         /*
1887                          *  using a parabola:
1888                          * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1889                          * extra precision is obtained by weighting against
1890                          * itself squared.
1891                          */
1892
1893                         temp[0] = get_temp_reg(rp);
1894                         const_sin[0] = emit_const4fv(rp, SinCosConsts[0]);
1895                         const_sin[1] = emit_const4fv(rp, SinCosConsts[1]);
1896                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1897
1898
1899                         /* do range reduction */
1900
1901                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1902                                    swizzle(keep(src[0]), X, X, X, X),
1903                                    swizzle(const_sin[1], Z, Z, Z, Z),
1904                                    pfs_half,
1905                                    0);
1906
1907                         emit_arith(rp, PFS_OP_FRC, temp[0], WRITEMASK_X,
1908                                    swizzle(temp[0], X, X, X, X),
1909                                    undef,
1910                                    undef,
1911                                    0);
1912
1913                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Z,
1914                                    swizzle(temp[0], X, X, X, X),
1915                                    swizzle(const_sin[1], W, W, W, W), //2*PI
1916                                    negate(swizzle(const_sin[0], Z, Z, Z, Z)), //-PI
1917                                    0);
1918
1919                         /* SIN */
1920
1921                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X | WRITEMASK_Y,
1922                                    swizzle(temp[0], Z, Z, Z, Z),
1923                                    const_sin[0],
1924                                    pfs_zero,
1925                                    0);
1926
1927                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1928                                    swizzle(temp[0], Y, Y, Y, Y),
1929                                    absolute(swizzle(temp[0], Z, Z, Z, Z)),
1930                                    swizzle(temp[0], X, X, X, X),
1931                                    0);
1932
1933                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1934                                    swizzle(temp[0], X, X, X, X),
1935                                    absolute(swizzle(temp[0], X, X, X, X)),
1936                                    negate(swizzle(temp[0], X, X, X, X)),
1937                                    0);
1938
1939
1940                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1941                                    swizzle(temp[0], Y, Y, Y, Y),
1942                                    swizzle(const_sin[0], W, W, W, W),
1943                                    swizzle(temp[0], X, X, X, X),
1944                                    flags);
1945
1946                         free_temp(rp, temp[0]);
1947                         break;
1948                 case OPCODE_SLT:
1949                         src[0] = t_src(rp, fpi->SrcReg[0]);
1950                         src[1] = t_src(rp, fpi->SrcReg[1]);
1951                         temp[0] = get_temp_reg(rp);
1952                         /* temp = src0 - src1
1953                          * dest.c = (temp.c < 0.0) ? 1 : 0
                              *
                              * The result is always 0 or 1, so no saturate
                              * flag is passed to the CMP.
1954                          */
1955                         emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1956                                    src[0], pfs_one, negate(src[1]),
1957                                    0);
1958                         emit_arith(rp, PFS_OP_CMP, dest, mask,
1959                                    pfs_zero, pfs_one, temp[0],
1960                                    0);
1961                         free_temp(rp, temp[0]);
1962                         break;
1963                 case OPCODE_SUB:
                             /* dest = src0 * 1 + (-src1) */
1964                         src[0] = t_src(rp, fpi->SrcReg[0]);
1965                         src[1] = t_src(rp, fpi->SrcReg[1]);
1966                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1967                                    src[0], pfs_one, negate(src[1]),
1968                                    flags);
1969                         break;
1970                 case OPCODE_TEX:
1971                         emit_tex(rp, fpi, R300_FPITX_OP_TEX);
1972                         break;
1973                 case OPCODE_TXB:
1974                         emit_tex(rp, fpi, R300_FPITX_OP_TXB);
1975                         break;
1976                 case OPCODE_TXP:
1977                         emit_tex(rp, fpi, R300_FPITX_OP_TXP);
1978                         break;
1979                 case OPCODE_XPD: {
1980                         src[0] = t_src(rp, fpi->SrcReg[0]);
1981                         src[1] = t_src(rp, fpi->SrcReg[1]);
1982                         temp[0] = get_temp_reg(rp);
1983                         /* temp = src0.zxy * src1.yzx */
1984                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_XYZ,
1985                                    swizzle(keep(src[0]), Z, X, Y, W),
1986                                    swizzle(keep(src[1]), Y, Z, X, W),
1987                                    pfs_zero,
1988                                    0);
1989                         /* dest.xyz = src0.yzx * src1.zxy - temp
1990                          * dest.w       = undefined
1991                          * */
1992                         emit_arith(rp, PFS_OP_MAD, dest, mask & WRITEMASK_XYZ,
1993                                    swizzle(src[0], Y, Z, X, W),
1994                                    swizzle(src[1], Z, X, Y, W),
1995                                    negate(temp[0]),
1996                                    flags);
1997                         /* cleanup */
1998                         free_temp(rp, temp[0]);
1999                         break;
2000                 }
2001                 default:
2002                         ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
2003                         break;
2004                 }
2005
                     /* Stop at the first instruction that left rp->error set. */
2006                 if (rp->error)
2007                         return GL_FALSE;
2008
2009         }
2010
2011         return GL_TRUE;
2012 }
2013
/*
 * Prepend three instructions to prog that compute a window position into
 * a freshly reserved temporary, then redirect every later read of the
 * FRAG_ATTRIB_WPOS input to that temporary.
 *
 * Inserted sequence (T = new temporary, index = old NumTemporaries):
 *     RCP T.w,   WPOS.wwww
 *     MUL T.xyz, WPOS.xyzw, T.wwww
 *     MAD T.xyz, T.xyz0, state.xyz0, state.xyz0
 * where "state" is the parameter returned by _mesa_add_state_reference()
 * for { STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION }.
 *
 * The old instruction array is released and replaced by the new one;
 * prog->NumInstructions grows by 3 and prog->NumTemporaries by 1.
 *
 * NOTE(review): the result of _mesa_alloc_instructions() is used
 * unchecked, and the old array is released with free() - confirm that
 * matches the allocator used for prog->Instructions.
 */
2014 static void insert_wpos(struct gl_program *prog)
2015 {
2016         GLint tokens[6] = { STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0, 0 };
2017         struct prog_instruction *fpi;
2018         GLuint window_index;
2019         int i = 0;
2020         GLuint tempregi = prog->NumTemporaries;
2021         /* should do something else if no temps left... */
2022         prog->NumTemporaries++;
2023
             /* Room for the original program plus the three new instructions. */
2024         fpi = _mesa_alloc_instructions (prog->NumInstructions + 3);
2025         _mesa_init_instructions (fpi, prog->NumInstructions + 3);
2026
2027         /* perspective divide */
             /* fpi[0]: T.w = RCP(WPOS.w) */
2028         fpi[i].Opcode = OPCODE_RCP;
2029
2030         fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2031         fpi[i].DstReg.Index = tempregi;
2032         fpi[i].DstReg.WriteMask = WRITEMASK_W;
2033         fpi[i].DstReg.CondMask = COND_TR;
2034
2035         fpi[i].SrcReg[0].File = PROGRAM_INPUT;
2036         fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
2037         fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
2038         i++;
2039
             /* fpi[1]: T.xyz = WPOS.xyz * T.w */
2040         fpi[i].Opcode = OPCODE_MUL;
2041
2042         fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2043         fpi[i].DstReg.Index = tempregi;
2044         fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
2045         fpi[i].DstReg.CondMask = COND_TR;
2046
2047         fpi[i].SrcReg[0].File = PROGRAM_INPUT;
2048         fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
2049         fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
2050
2051         fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
2052         fpi[i].SrcReg[1].Index = tempregi;
2053         fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
2054         i++;
2055
2056         /* viewport transformation */
2057         window_index = _mesa_add_state_reference(prog->Parameters, tokens);
2058
             /* fpi[2]: T.xyz = T * state + state; the same state parameter
              * is used as both the multiplier and the addend, and every
              * source has its fourth swizzle component set to ZERO. */
2059         fpi[i].Opcode = OPCODE_MAD;
2060
2061         fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2062         fpi[i].DstReg.Index = tempregi;
2063         fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
2064         fpi[i].DstReg.CondMask = COND_TR;
2065
2066         fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
2067         fpi[i].SrcReg[0].Index = tempregi;
2068         fpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2069
2070         fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
2071         fpi[i].SrcReg[1].Index = window_index;
2072         fpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2073
2074         fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
2075         fpi[i].SrcReg[2].Index = window_index;
2076         fpi[i].SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2077         i++;
2078
             /* i == 3 here: append the original program after the prologue. */
2079         _mesa_copy_instructions (&fpi[i], prog->Instructions, prog->NumInstructions);
2080
2081         free(prog->Instructions);
2082
2083         prog->Instructions = fpi;
2084
2085         prog->NumInstructions += i;
2086         fpi = &prog->Instructions[prog->NumInstructions-1];
2087
             /* The copied program must still be terminated by OPCODE_END. */
2088         assert(fpi->Opcode == OPCODE_END);
2089
             /* Start at index 3 so the inserted instructions, which read the
              * real WPOS input, are not rewritten; i is reused as the source
              * operand index from here on. */
2090         for(fpi = &prog->Instructions[3]; fpi->Opcode != OPCODE_END; fpi++){
2091                 for(i=0; i<3; i++)
2092                     if( fpi->SrcReg[i].File == PROGRAM_INPUT &&
2093                         fpi->SrcReg[i].Index == FRAG_ATTRIB_WPOS ){
2094                             fpi->SrcReg[i].File = PROGRAM_TEMPORARY;
2095                             fpi->SrcReg[i].Index = tempregi;
2096                     }
2097         }
2098 }
2099
2100 /* - Init structures
2101  * - Determine what hwregs each input corresponds to
      * - Pre-scan the program, counting references to inputs and temporaries
      *
      * r300 supplies the option cache that "fp_optimization" is read from;
      * rp is the fragment program whose tracking fields and compile state
      * (rp->cs) are reset.
      *
      * Returns early, after ERROR(), if the program has no instruction
      * array; in that case cs->temp_in_use is left at its cleared value.
2102  */
2103 static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
2104 {
2105         struct r300_pfs_compile_state *cs = NULL;
2106         struct gl_fragment_program *mp = &rp->mesa_program;
2107         struct prog_instruction *fpi;
2108         GLuint InputsRead = mp->Base.InputsRead;
2109         GLuint temps_used = 0; /* for rp->temps[] */
2110         int i,j;
2111
2112         /* New compile, reset tracking data */
2113         rp->optimization = driQueryOptioni(&r300->radeon.optionCache, "fp_optimization");
2114         rp->translated = GL_FALSE;
2115         rp->error      = GL_FALSE;
2116         rp->cs = cs        = &(R300_CONTEXT(rp->ctx)->state.pfs_compile);
2117         rp->tex.length = 0;
2118         rp->cur_node   = 0;
2119         rp->first_node_has_tex = 0;
2120         rp->const_nr   = 0;
2121         rp->max_temp_idx = 0;
2122         rp->node[0].alu_end = -1;
2123         rp->node[0].tex_end = -1;
2124
             /* Clear the compile state, then set every ALU slot's vector and
              * scalar sources to SRC_CONST. */
2125         _mesa_memset(cs, 0, sizeof(*rp->cs));
2126         for (i=0;i<PFS_MAX_ALU_INST;i++) {
2127                 for (j=0;j<3;j++) {
2128                         cs->slot[i].vsrc[j] = SRC_CONST;
2129                         cs->slot[i].ssrc[j] = SRC_CONST;
2130                 }
2131         }
2132
2133         /* Work out what temps the Mesa inputs correspond to, this must match
2134          * what setup_rs_unit does, which shouldn't be a problem as rs_unit
2135          * configures itself based on the fragprog's InputsRead
2136          *
2137          * NOTE: this depends on get_hw_temp() allocating registers in order,
2138          * starting from register 0.
              *
              * Each handled group of bits is cleared from the local InputsRead
              * copy so that only unhandled inputs remain at the end.
2139          */
2140
2141         /* Texcoords come first */
2142         for (i=0;i<rp->ctx->Const.MaxTextureUnits;i++) {
2143                 if (InputsRead & (FRAG_BIT_TEX0 << i)) {
2144                         cs->inputs[FRAG_ATTRIB_TEX0+i].refcount = 0;
2145                         cs->inputs[FRAG_ATTRIB_TEX0+i].reg = get_hw_temp(rp, 0);
2146                 }
2147         }
2148         InputsRead &= ~FRAG_BITS_TEX_ANY;
2149
2150         /* fragment position treated as a texcoord */
             /* NOTE(review): insert_wpos() rewrites mp->Base.Instructions and
              * runs before the NULL-instructions check further down; it also
              * runs on every call while FRAG_BIT_WPOS is set in InputsRead.
              * Confirm neither can cause trouble on a recompile. */
2151         if (InputsRead & FRAG_BIT_WPOS) {
2152                 cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
2153                 cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(rp, 0);
2154                 insert_wpos(&mp->Base);
2155         }
2156         InputsRead &= ~FRAG_BIT_WPOS;
2157
2158         /* Then primary colour */
2159         if (InputsRead & FRAG_BIT_COL0) {
2160                 cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
2161                 cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp, 0);
2162         }
2163         InputsRead &= ~FRAG_BIT_COL0;
2164
2165         /* Secondary color */
2166         if (InputsRead & FRAG_BIT_COL1) {
2167                 cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
2168                 cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(rp, 0);
2169         }
2170         InputsRead &= ~FRAG_BIT_COL1;
2171
2172         /* Anything else */
2173         if (InputsRead) {
2174                 WARN_ONCE("Don't know how to handle inputs 0x%x\n",
2175                           InputsRead);
2176                 /* force read from hwreg 0 for now */
2177                 for (i=0;i<32;i++)
2178                         if (InputsRead & (1<<i)) cs->inputs[i].reg = 0;
2179         }
2180
2181         /* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
2182          * That way, we can free up the reg when it's no longer needed
2183          */
2184         if (!mp->Base.Instructions) {
2185                 ERROR("No instructions found in program\n");
2186                 return;
2187         }
2188
2189         for (fpi=mp->Base.Instructions;fpi->Opcode != OPCODE_END; fpi++) {
2190                 int idx;
2191
                     /* Source operands: the first sighting of a temporary sets
                      * reg = -1 and refcount = 1; later sightings increment
                      * the refcount. Input reads only increment the refcount.
                      * NOTE(review): temps_used is a 32-bit mask, so (1<<idx)
                      * assumes temporary indices stay below 32 - confirm. */
2192                 for (i=0;i<3;i++) {
2193                         idx = fpi->SrcReg[i].Index;
2194                         switch (fpi->SrcReg[i].File) {
2195                         case PROGRAM_TEMPORARY:
2196                                 if (!(temps_used & (1<<idx))) {
2197                                         cs->temps[idx].reg = -1;
2198                                         cs->temps[idx].refcount = 1;
2199                                         temps_used |= (1 << idx);
2200                                 } else
2201                                         cs->temps[idx].refcount++;
2202                                 break;
2203                         case PROGRAM_INPUT:
2204                                 cs->inputs[idx].refcount++;
2205                                 break;
2206                         default: break;
2207                         }
2208                 }
2209
                     /* A write to a temporary counts as a reference too. */
2210                 idx = fpi->DstReg.Index;
2211                 if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
2212                         if (!(temps_used & (1<<idx))) {
2213                                 cs->temps[idx].reg = -1;
2214                                 cs->temps[idx].refcount = 1;
2215                                 temps_used |= (1 << idx);
2216                         } else
2217                                 cs->temps[idx].refcount++;
2218                 }
2219         }
             /* Record which Mesa temporaries the program references. */
2220         cs->temp_in_use = temps_used;
2221 }
2222
2223 static void update_params(struct r300_fragment_program *rp)
2224 {
2225         struct gl_fragment_program *mp = &rp->mesa_program;
2226
2227         /* Ask Mesa nicely to fill in ParameterValues for us */
2228         if (mp->Base.Parameters)
2229                 _mesa_load_state_parameters(rp->ctx, mp->Base.Parameters);
2230 }
2231
2232 void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_program *rp)
2233 {
2234         struct r300_pfs_compile_state *cs = NULL;
2235
2236         if (!rp->translated) {
2237
2238                 init_program(r300, rp);
2239                 cs = rp->cs;
2240
2241                 if (parse_program(rp) == GL_FALSE) {
2242                         dump_program(rp);
2243                         return;
2244                 }
2245
2246                 /* Finish off */
2247                 rp->node[rp->cur_node].alu_end =
2248                                 cs->nrslots - rp->node[rp->cur_node].alu_offset - 1;
2249                 if (rp->node[rp->cur_node].tex_end < 0)
2250                         rp->node[rp->cur_node].tex_end = 0;
2251                 rp->alu_offset = 0;
2252                 rp->alu_end    = cs->nrslots - 1;
2253                 rp->tex_offset = 0;
2254                 rp->tex_end    = rp->tex.length ? rp->tex.length - 1 : 0;
2255                 assert(rp->node[rp->cur_node].alu_end >= 0);
2256                 assert(rp->alu_end >= 0);
2257
2258                 rp->translated = GL_TRUE;
2259                 if (RADEON_DEBUG & DEBUG_PIXEL) dump_program(rp);
2260                 r300UpdateStateParameters(rp->ctx, _NEW_PROGRAM);
2261         }
2262
2263         update_params(rp);
2264 }
2265
2266 /* Debug helper: dump a fragment program to stderr.
      *
      * Prints a banner carrying a running dump counter, the Mesa program (via
      * _mesa_print_program), and then, for every node 0..rp->cur_node, the
      * node's offsets/ends followed by a decoded listing of that node's TEX
      * and ALU instructions alongside the raw instruction words.
      */
2267 static void dump_program(struct r300_fragment_program *rp)
2268 {
2269         int n, i, j;
             /* number of dumps printed so far; shown in the banner */
2270         static int pc = 0;
2271
2272         fprintf(stderr, "pc=%d*************************************\n", pc++);
2273
2274         fprintf(stderr, "Mesa program:\n");
2275         fprintf(stderr, "-------------\n");
2276                 _mesa_print_program(&rp->mesa_program.Base);
             /* NOTE(review): stdout is flushed although everything here goes to
              * stderr - presumably _mesa_print_program may write to stdout;
              * confirm. */
2277         fflush(stdout);
2278
2279         fprintf(stderr, "Hardware program\n");
2280         fprintf(stderr, "----------------\n");
2281
2282         for (n = 0; n < (rp->cur_node+1); n++) {
2283                 fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "\
2284                         "alu_end: %d, tex_end: %d\n", n,
2285                         rp->node[n].alu_offset,
2286                         rp->node[n].tex_offset,
2287                         rp->node[n].alu_end,
2288                         rp->node[n].tex_end);
2289
                     /* TEX listing is skipped entirely when the program has no
                      * texture instructions at all.  tex_end is relative to
                      * tex_offset and inclusive. */
2290                 if (rp->tex.length) {
2291                         fprintf(stderr, "  TEX:\n");
2292                         for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_offset+rp->node[n].tex_end; ++i) {
2293                                 const char* instr;
2294
                                     /* 4-bit opcode field of the TEX instruction word */
2295                                 switch((rp->tex.inst[i] >> R300_FPITX_OPCODE_SHIFT) & 15) {
2296                                 case R300_FPITX_OP_TEX:
2297                                         instr = "TEX";
2298                                         break;
2299                                 case R300_FPITX_OP_KIL:
2300                                         instr = "KIL";
2301                                         break;
2302                                 case R300_FPITX_OP_TXP:
2303                                         instr = "TXP";
2304                                         break;
2305                                 case R300_FPITX_OP_TXB:
2306                                         instr = "TXB";
2307                                         break;
2308                                 default:
2309                                         instr = "UNKNOWN";
2310                                 }
2311
                                     /* destination index, source ('c' if the SRC_CONST
                                      * bit is set, else 't') and index, image unit,
                                      * then the raw word */
2312                                 fprintf(stderr, "    %s t%i, %c%i, texture[%i]   (%08x)\n",
2313                                                 instr,
2314                                                 (rp->tex.inst[i] >> R300_FPITX_DST_SHIFT) & 31,
2315                                                 (rp->tex.inst[i] & R300_FPITX_SRC_CONST) ? 'c': 't',
2316                                                 (rp->tex.inst[i] >> R300_FPITX_SRC_SHIFT) & 31,
2317                                                 (rp->tex.inst[i] & R300_FPITX_IMAGE_MASK) >> R300_FPITX_IMAGE_SHIFT,
2318                                                 rp->tex.inst[i]);
2319                         }
2320                 }
2321
                     /* ALU listing; alu_end is relative to alu_offset and inclusive */
2322                 for(i = rp->node[n].alu_offset; i <= rp->node[n].alu_offset+rp->node[n].alu_end; ++i) {
                             /* srcc/srca: names of the three colour/alpha sources;
                              * dstc/dsta: colour/alpha destination text;
                              * argc/arga: decoded colour/alpha operand text */
2323                         char srcc[3][10], dstc[20];
2324                         char srca[3][10], dsta[20];
2325                         char argc[3][20];
2326                         char arga[3][20];
2327                         char flags[5], tmp[10];
2328
                             /* Source selectors: three 6-bit fields in inst1 (colour)
                              * and inst3 (alpha).  Bit 5 picks the 'c' or 't' prefix,
                              * the low five bits are the register index. */
2329                         for(j = 0; j < 3; ++j) {
2330                                 int regc = rp->alu.inst[i].inst1 >> (j*6);
2331                                 int rega = rp->alu.inst[i].inst3 >> (j*6);
2332
2333                                 sprintf(srcc[j], "%c%i", (regc & 32) ? 'c' : 't', regc & 31);
2334                                 sprintf(srca[j], "%c%i", (rega & 32) ? 'c' : 't', rega & 31);
2335                         }
2336
                             /* Colour destination: "tN.<mask> " for the register write
                              * mask followed by "oN.<mask>" for the output write mask;
                              * both use the same index field of inst1. */
2337                         dstc[0] = 0;
2338                         sprintf(flags, "%s%s%s",
2339                                         (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_X) ? "x" : "",
2340                                         (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Y) ? "y" : "",
2341                                         (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Z) ? "z" : "");
2342                         if (flags[0] != 0) {
2343                                 sprintf(dstc, "t%i.%s ",
2344                                                 (rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
2345                                                 flags);
2346                         }
2347                         sprintf(flags, "%s%s%s",
2348                                         (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_X) ? "x" : "",
2349                                         (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? "y" : "",
2350                                         (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? "z" : "");
2351                         if (flags[0] != 0) {
2352                                 sprintf(tmp, "o%i.%s",
2353                                                 (rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
2354                                                 flags);
2355                                 strcat(dstc, tmp);
2356                         }
2357
                             /* Alpha destination: register write, output write and
                              * depth write ("Z") are appended in that order. */
2358                         dsta[0] = 0;
2359                         if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) {
2360                                 sprintf(dsta, "t%i.w ", (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
2361                         }
2362                         if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) {
2363                                 sprintf(tmp, "o%i.w ", (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
2364                                 strcat(dsta, tmp);
2365                         }
2366                         if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) {
2367                                 strcat(dsta, "Z");
2368                         }
2369
2370                         fprintf(stderr, "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
2371                                         "       w: %3s %3s %3s -> %-20s (%08x)\n",
2372                                         i,
2373                                         srcc[0], srcc[1], srcc[2], dstc, rp->alu.inst[i].inst1,
2374                                         srca[0], srca[1], srca[2], dsta, rp->alu.inst[i].inst3);
2375
                             /* Operand selectors: three 7-bit fields in inst0 (colour)
                              * and inst2 (alpha).  The low five bits select the
                              * source/swizzle, bit 5 is printed as a leading "-" and
                              * bit 6 as surrounding "|" bars. */
2376                         for(j = 0; j < 3; ++j) {
2377                                 int regc = rp->alu.inst[i].inst0 >> (j*7);
2378                                 int rega = rp->alu.inst[i].inst2 >> (j*7);
2379                                 int d;
2380                                 char buf[20];
2381
                                     /* Colour operand: 0-11 = source d/4 with swizzle d%4,
                                      * 12-14 = alpha of source d-12, 20/21/22 = constants,
                                      * 23-31 = source (d-23)%3 with swizzle (d-23)/3;
                                      * anything else is printed as the raw number. */
2382                                 d = regc & 31;
2383                                 if (d < 12) {
2384                                         switch(d % 4) {
2385                                                 case R300_FPI0_ARGC_SRC0C_XYZ:
2386                                                         sprintf(buf, "%s.xyz", srcc[d / 4]);
2387                                                         break;
2388                                                 case R300_FPI0_ARGC_SRC0C_XXX:
2389                                                         sprintf(buf, "%s.xxx", srcc[d / 4]);
2390                                                         break;
2391                                                 case R300_FPI0_ARGC_SRC0C_YYY:
2392                                                         sprintf(buf, "%s.yyy", srcc[d / 4]);
2393                                                         break;
2394                                                 case R300_FPI0_ARGC_SRC0C_ZZZ:
2395                                                         sprintf(buf, "%s.zzz", srcc[d / 4]);
2396                                                         break;
2397                                         }
2398                                 } else if (d < 15) {
2399                                         sprintf(buf, "%s.www", srca[d-12]);
2400                                 } else if (d == 20) {
2401                                         sprintf(buf, "0.0");
2402                                 } else if (d == 21) {
2403                                         sprintf(buf, "1.0");
2404                                 } else if (d == 22) {
2405                                         sprintf(buf, "0.5");
2406                                 } else if (d >= 23 && d < 32) {
2407                                         d -= 23;
2408                                         switch(d/3) {
2409                                                 case 0:
2410                                                         sprintf(buf, "%s.yzx", srcc[d % 3]);
2411                                                         break;
2412                                                 case 1:
2413                                                         sprintf(buf, "%s.zxy", srcc[d % 3]);
2414                                                         break;
2415                                                 case 2:
2416                                                         sprintf(buf, "%s.Wzy", srcc[d % 3]);
2417                                                         break;
2418                                         }
2419                                 } else {
2420                                         sprintf(buf, "%i", d);
2421                                 }
2422
2423                                 sprintf(argc[j], "%s%s%s%s",
2424                                                 (regc & 32) ? "-" : "",
2425                                                 (regc & 64) ? "|" : "",
2426                                                 buf,
2427                                                 (regc & 64) ? "|" : "");
2428
                                     /* Alpha operand: 0-8 = component d%3 (x/y/z) of
                                      * colour source d/3, 9-11 = alpha of source d-9,
                                      * 16/17/18 = constants; anything else is printed
                                      * as the raw number. */
2429                                 d = rega & 31;
2430                                 if (d < 9) {
2431                                         sprintf(buf, "%s.%c", srcc[d / 3], 'x' + (char)(d%3));
2432                                 } else if (d < 12) {
2433                                         sprintf(buf, "%s.w", srca[d-9]);
2434                                 } else if (d == 16) {
2435                                         sprintf(buf, "0.0");
2436                                 } else if (d == 17) {
2437                                         sprintf(buf, "1.0");
2438                                 } else if (d == 18) {
2439                                         sprintf(buf, "0.5");
2440                                 } else {
2441                                         sprintf(buf, "%i", d);
2442                                 }
2443
2444                                 sprintf(arga[j], "%s%s%s%s",
2445                                                 (rega & 32) ? "-" : "",
2446                                                 (rega & 64) ? "|" : "",
2447                                                 buf,
2448                                                 (rega & 64) ? "|" : "");
2449                         }
2450
                             /* decoded operands plus the raw inst0/inst2 words */
2451                         fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
2452                                         "       w: %8s %8s %8s    op: %08x\n",
2453                                         argc[0], argc[1], argc[2], rp->alu.inst[i].inst0,
2454                                         arga[0], arga[1], arga[2], rp->alu.inst[i].inst2);
2455                 }
2456         }
2457 }