src/mesa/drivers/dri/r300/r300_fragprog.c

   1 /*
   2  * Copyright (C) 2005 Ben Skeggs.
   3  *
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining
   7  * a copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sublicense, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial
  16  * portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  */
  27
  28 /*
  29  * Authors:
  30  *   Ben Skeggs <darktama@iinet.net.au>
  31  *   Jerome Glisse <j.glisse@gmail.com>
  32  */
  33
  34 /*TODO'S
  35  *
  36  * - Depth write, WPOS/FOGC inputs
  37  * - FogOption
  38  * - Verify results of opcodes for accuracy, I've only checked them
  39  *   in specific cases.
  40  * - and more...
  41  */
  42
  43 #include "glheader.h"
  44 #include "macros.h"
  45 #include "enums.h"
  46 #include "shader/prog_instruction.h"
  47 #include "shader/prog_parameter.h"
  48 #include "shader/prog_print.h"
  49
  50 #include "r300_context.h"
  51 #include "r300_fragprog.h"
  52 #include "r300_reg.h"
  53 #include "r300_state.h"
  54
  55 /*
  56  * Usefull macros and values
  57  */
  58 #define ERROR(fmt, args...) do {                        \
  59                 fprintf(stderr, "%s::%s(): " fmt "\n",  \
  60                         __FILE__, __func__, ##args);    \
  61                 rp->error = GL_TRUE;                    \
  62         } while(0)
  63
  64 #define PFS_INVAL 0xFFFFFFFF
  65 #define COMPILE_STATE struct r300_pfs_compile_state *cs = rp->cs
  66
  67 #define SWIZZLE_XYZ             0
  68 #define SWIZZLE_XXX             1
  69 #define SWIZZLE_YYY             2
  70 #define SWIZZLE_ZZZ             3
  71 #define SWIZZLE_WWW             4
  72 #define SWIZZLE_YZX             5
  73 #define SWIZZLE_ZXY             6
  74 #define SWIZZLE_WZY             7
  75 #define SWIZZLE_111             8
  76 #define SWIZZLE_000             9
  77 #define SWIZZLE_HHH             10
  78
  79 #define swizzle(r, x, y, z, w) do_swizzle(rp, r,                \
  80                                           ((SWIZZLE_##x<<0)|    \
  81                                            (SWIZZLE_##y<<3)|    \
  82                                            (SWIZZLE_##z<<6)|    \
  83                                            (SWIZZLE_##w<<9)),   \
  84                                           0)
  85
  86 #define REG_TYPE_INPUT          0
  87 #define REG_TYPE_OUTPUT         1
  88 #define REG_TYPE_TEMP           2
  89 #define REG_TYPE_CONST          3
  90
  91 #define REG_TYPE_SHIFT          0
  92 #define REG_INDEX_SHIFT         2
  93 #define REG_VSWZ_SHIFT          8
  94 #define REG_SSWZ_SHIFT          13
  95 #define REG_NEGV_SHIFT          18
  96 #define REG_NEGS_SHIFT          19
  97 #define REG_ABS_SHIFT           20
  98 #define REG_NO_USE_SHIFT        21 // Hack for refcounting
  99 #define REG_VALID_SHIFT         22 // Does the register contain a defined value?
 100 #define REG_BUILTIN_SHIFT   23 // Is it a builtin (like all zero/all one)?
 101
 102 #define REG_TYPE_MASK           (0x03 << REG_TYPE_SHIFT)
 103 #define REG_INDEX_MASK          (0x3F << REG_INDEX_SHIFT)
 104 #define REG_VSWZ_MASK           (0x1F << REG_VSWZ_SHIFT)
 105 #define REG_SSWZ_MASK           (0x1F << REG_SSWZ_SHIFT)
 106 #define REG_NEGV_MASK           (0x01 << REG_NEGV_SHIFT)
 107 #define REG_NEGS_MASK           (0x01 << REG_NEGS_SHIFT)
 108 #define REG_ABS_MASK            (0x01 << REG_ABS_SHIFT)
 109 #define REG_NO_USE_MASK         (0x01 << REG_NO_USE_SHIFT)
 110 #define REG_VALID_MASK          (0x01 << REG_VALID_SHIFT)
 111 #define REG_BUILTIN_MASK        (0x01 << REG_BUILTIN_SHIFT)
 112
 113 #define REG(type, index, vswz, sswz, nouse, valid, builtin)     \
 114         (((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) |                   \
 115          ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) |                \
 116          ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |              \
 117          ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) |                \
 118          ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |  \
 119          ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |                   \
 120          ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
 121 #define REG_GET_TYPE(reg)                                               \
 122         ((reg & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
 123 #define REG_GET_INDEX(reg)                                              \
 124         ((reg & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
 125 #define REG_GET_VSWZ(reg)                                               \
 126         ((reg & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
 127 #define REG_GET_SSWZ(reg)                                               \
 128         ((reg & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
 129 #define REG_GET_NO_USE(reg)                                             \
 130         ((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
 131 #define REG_GET_VALID(reg)                                              \
 132         ((reg & REG_VALID_MASK) >> REG_VALID_SHIFT)
 133 #define REG_GET_BUILTIN(reg)                                            \
 134         ((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
 135 #define REG_SET_TYPE(reg, type)                                         \
 136         reg = ((reg & ~REG_TYPE_MASK) |                                 \
 137                ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK))
 138 #define REG_SET_INDEX(reg, index)                                       \
 139         reg = ((reg & ~REG_INDEX_MASK) |                                \
 140                ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK))
 141 #define REG_SET_VSWZ(reg, vswz)                                         \
 142         reg = ((reg & ~REG_VSWZ_MASK) |                                 \
 143                ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK))
 144 #define REG_SET_SSWZ(reg, sswz)                                         \
 145         reg = ((reg & ~REG_SSWZ_MASK) |                                 \
 146                ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
 147 #define REG_SET_NO_USE(reg, nouse)                                      \
 148         reg = ((reg & ~REG_NO_USE_MASK) |                               \
 149                ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK))
 150 #define REG_SET_VALID(reg, valid)                                       \
 151         reg = ((reg & ~REG_VALID_MASK) |                                \
 152                ((valid << REG_VALID_SHIFT) & REG_VALID_MASK))
 153 #define REG_SET_BUILTIN(reg, builtin)                                   \
 154         reg = ((reg & ~REG_BUILTIN_MASK) |                              \
 155                ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
 156 #define REG_ABS(reg)                                                    \
 157         reg = (reg | REG_ABS_MASK)
 158 #define REG_NEGV(reg)                                                   \
 159         reg = (reg | REG_NEGV_MASK)
 160 #define REG_NEGS(reg)                                                   \
 161         reg = (reg | REG_NEGS_MASK)
 162
 163
 164 /*
 165  * Datas structures for fragment program generation
 166  */
 167
 168 /* description of r300 native hw instructions */
 169 static const struct {
 170         const char *name;
 171         int argc;
 172         int v_op;
 173         int s_op;
 174 } r300_fpop[] = {
 175         { "MAD", 3, R300_FPI0_OUTC_MAD, R300_FPI2_OUTA_MAD },
 176         { "DP3", 2, R300_FPI0_OUTC_DP3, R300_FPI2_OUTA_DP4 },
 177         { "DP4", 2, R300_FPI0_OUTC_DP4, R300_FPI2_OUTA_DP4 },
 178         { "MIN", 2, R300_FPI0_OUTC_MIN, R300_FPI2_OUTA_MIN },
 179         { "MAX", 2, R300_FPI0_OUTC_MAX, R300_FPI2_OUTA_MAX },
 180         { "CMP", 3, R300_FPI0_OUTC_CMP, R300_FPI2_OUTA_CMP },
 181         { "FRC", 1, R300_FPI0_OUTC_FRC, R300_FPI2_OUTA_FRC },
 182         { "EX2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_EX2 },
 183         { "LG2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_LG2 },
 184         { "RCP", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RCP },
 185         { "RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RSQ },
 186         { "REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA, PFS_INVAL },
 187         { "CMPH", 3, R300_FPI0_OUTC_CMPH, PFS_INVAL },
 188 };
 189
 190
 191 /* vector swizzles r300 can support natively, with a couple of
 192  * cases we handle specially
 193  *
 194  * REG_VSWZ/REG_SSWZ is an index into this table
 195  */
 196
 197 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
 198 #define SWIZZLE_HALF 6
 199
 200 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
 201                                           SWIZZLE_##y, \
 202                                           SWIZZLE_##z, \
 203                                           SWIZZLE_ZERO))
 204 static const struct r300_pfs_swizzle {
 205         GLuint hash;    /* swizzle value this matches */
 206         GLuint base;    /* base value for hw swizzle */
 207         GLuint stride;  /* difference in base between arg0/1/2 */
 208         GLuint flags;
 209 } v_swiz[] = {
 210 /* native swizzles */
 211         { MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR },
 212         { MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR },
 213         { MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR },
 214         { MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR },
 215         { MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A,     1, SLOT_SRC_SCALAR },
 216         { MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR },
 217         { MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR },
 218         { MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH },
 219         { MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0},
 220         { MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0},
 221         { MAKE_SWZ3(HALF, HALF, HALF), R300_FPI0_ARGC_HALF, 0, 0},
 222         { PFS_INVAL, 0, 0, 0},
 223 };
 224
 225 /* used during matching of non-native swizzles */
 226 #define SWZ_X_MASK (7 << 0)
 227 #define SWZ_Y_MASK (7 << 3)
 228 #define SWZ_Z_MASK (7 << 6)
 229 #define SWZ_W_MASK (7 << 9)
 230 static const struct {
 231         GLuint hash;            /* used to mask matching swizzle components */
 232         int mask;               /* actual outmask */
 233         int count;              /* count of components matched */
 234 } s_mask[] = {
 235         { SWZ_X_MASK|SWZ_Y_MASK|SWZ_Z_MASK, 1|2|4, 3},
 236         { SWZ_X_MASK|SWZ_Y_MASK, 1|2, 2},
 237         { SWZ_X_MASK|SWZ_Z_MASK, 1|4, 2},
 238         { SWZ_Y_MASK|SWZ_Z_MASK, 2|4, 2},
 239         { SWZ_X_MASK, 1, 1},
 240         { SWZ_Y_MASK, 2, 1},
 241         { SWZ_Z_MASK, 4, 1},
 242         { PFS_INVAL, PFS_INVAL, PFS_INVAL}
 243 };
 244
 245 static const struct {
 246         int base;       /* hw value of swizzle */
 247         int stride;     /* difference between SRC0/1/2 */
 248         GLuint flags;
 249 } s_swiz[] = {
 250         { R300_FPI2_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR },
 251         { R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR },
 252         { R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR },
 253         { R300_FPI2_ARGA_SRC0A  , 1, SLOT_SRC_SCALAR },
 254         { R300_FPI2_ARGA_ZERO   , 0, 0 },
 255         { R300_FPI2_ARGA_ONE    , 0, 0 },
 256         { R300_FPI2_ARGA_HALF   , 0, 0 }
 257 };
 258
 259 /* boiler-plate reg, for convenience */
 260 static const GLuint undef = REG(REG_TYPE_TEMP,
 261                                 0,
 262                                 SWIZZLE_XYZ,
 263                                 SWIZZLE_W,
 264                                 GL_FALSE,
 265                                 GL_FALSE,
 266                                 GL_FALSE);
 267
 268 /* constant one source */
 269 static const GLuint pfs_one = REG(REG_TYPE_CONST,
 270                                   0,
 271                                   SWIZZLE_111,
 272                                   SWIZZLE_ONE,
 273                                   GL_FALSE,
 274                                   GL_TRUE,
 275                                   GL_TRUE);
 276
 277 /* constant half source */
 278 static const GLuint pfs_half = REG(REG_TYPE_CONST,
 279                                    0,
 280                                    SWIZZLE_HHH,
 281                                    SWIZZLE_HALF,
 282                                    GL_FALSE,
 283                                    GL_TRUE,
 284                                    GL_TRUE);
 285
 286 /* constant zero source */
 287 static const GLuint pfs_zero = REG(REG_TYPE_CONST,
 288                                    0,
 289                                    SWIZZLE_000,
 290                                    SWIZZLE_ZERO,
 291                                    GL_FALSE,
 292                                    GL_TRUE,
 293                                    GL_TRUE);
 294
 295 /*
 296  * Common functions prototypes
 297  */
 298 static void dump_program(struct r300_fragment_program *rp);
 299 static void emit_arith(struct r300_fragment_program *rp, int op,
 300                                 GLuint dest, int mask,
 301                                 GLuint src0, GLuint src1, GLuint src2,
 302                                 int flags);
 303
 304 /**
 305  * Get an R300 temporary that can be written to in the given slot.
 306  */
 307 static int get_hw_temp(struct r300_fragment_program *rp, int slot)
 308 {
 309         COMPILE_STATE;
 310         int r;
 311
 312         for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
 313                 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
 314                         break;
 315         }
 316
 317         if (r >= PFS_NUM_TEMP_REGS) {
 318                 ERROR("Out of hardware temps\n");
 319                 return 0;
 320         }
 321
 322         // Reserved is used to avoid the following scenario:
 323         //  R300 temporary X is first assigned to Mesa temporary Y during vector ops
 324         //  R300 temporary X is then assigned to Mesa temporary Z for further vector ops
 325         //  Then scalar ops on Mesa temporary Z are emitted and move back in time
 326         //  to overwrite the value of temporary Y.
 327         // End scenario.
 328         cs->hwtemps[r].reserved = cs->hwtemps[r].free;
 329         cs->hwtemps[r].free = -1;
 330
 331         // Reset to some value that won't mess things up when the user
 332         // tries to read from a temporary that hasn't been assigned a value yet.
 333         // In the normal case, vector_valid and scalar_valid should be set to
 334         // a sane value by the first emit that writes to this temporary.
 335         cs->hwtemps[r].vector_valid = 0;
 336         cs->hwtemps[r].scalar_valid = 0;
 337
 338         if (r > rp->max_temp_idx)
 339                 rp->max_temp_idx = r;
 340
 341         return r;
 342 }
 343
 344 /**
 345  * Get an R300 temporary that will act as a TEX destination register.
 346  */
 347 static int get_hw_temp_tex(struct r300_fragment_program *rp)
 348 {
 349         COMPILE_STATE;
 350         int r;
 351
 352         for(r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
 353                 if (cs->used_in_node & (1 << r))
 354                         continue;
 355
 356                 // Note: Be very careful here
 357                 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
 358                         break;
 359         }
 360
 361         if (r >= PFS_NUM_TEMP_REGS)
 362                 return get_hw_temp(rp, 0); /* Will cause an indirection */
 363
 364         cs->hwtemps[r].reserved = cs->hwtemps[r].free;
 365         cs->hwtemps[r].free = -1;
 366
 367         // Reset to some value that won't mess things up when the user
 368         // tries to read from a temporary that hasn't been assigned a value yet.
 369         // In the normal case, vector_valid and scalar_valid should be set to
 370         // a sane value by the first emit that writes to this temporary.
 371         cs->hwtemps[r].vector_valid = cs->nrslots;
 372         cs->hwtemps[r].scalar_valid = cs->nrslots;
 373
 374         if (r > rp->max_temp_idx)
 375                 rp->max_temp_idx = r;
 376
 377         return r;
 378 }
 379
 380 /**
 381  * Mark the given hardware register as free.
 382  */
 383 static void free_hw_temp(struct r300_fragment_program *rp, int idx)
 384 {
 385         COMPILE_STATE;
 386
 387         // Be very careful here. Consider sequences like
 388         //  MAD r0, r1,r2,r3
 389         //  TEX r4, ...
 390         // The TEX instruction may be moved in front of the MAD instruction
 391         // due to the way nodes work. We don't want to alias r1 and r4 in
 392         // this case.
 393         // I'm certain the register allocation could be further sanitized,
 394         // but it's tricky because of stuff that can happen inside emit_tex
 395         // and emit_arith.
 396         cs->hwtemps[idx].free = cs->nrslots+1;
 397 }
 398
 399
 400 /**
 401  * Create a new Mesa temporary register.
 402  */
 403 static GLuint get_temp_reg(struct r300_fragment_program *rp)
 404 {
 405         COMPILE_STATE;
 406         GLuint r = undef;
 407         GLuint index;
 408
 409         index = ffs(~cs->temp_in_use);
 410         if (!index) {
 411                 ERROR("Out of program temps\n");
 412                 return r;
 413         }
 414
 415         cs->temp_in_use |= (1 << --index);
 416         cs->temps[index].refcount = 0xFFFFFFFF;
 417         cs->temps[index].reg = -1;
 418
 419         REG_SET_TYPE(r, REG_TYPE_TEMP);
 420         REG_SET_INDEX(r, index);
 421         REG_SET_VALID(r, GL_TRUE);
 422         return r;
 423 }
 424
 425 /**
 426  * Create a new Mesa temporary register that will act as the destination
 427  * register for a texture read.
 428  */
 429 static GLuint get_temp_reg_tex(struct r300_fragment_program *rp)
 430 {
 431         COMPILE_STATE;
 432         GLuint r = undef;
 433         GLuint index;
 434
 435         index = ffs(~cs->temp_in_use);
 436         if (!index) {
 437                 ERROR("Out of program temps\n");
 438                 return r;
 439         }
 440
 441         cs->temp_in_use |= (1 << --index);
 442         cs->temps[index].refcount = 0xFFFFFFFF;
 443         cs->temps[index].reg = get_hw_temp_tex(rp);
 444
 445         REG_SET_TYPE(r, REG_TYPE_TEMP);
 446         REG_SET_INDEX(r, index);
 447         REG_SET_VALID(r, GL_TRUE);
 448         return r;
 449 }
 450
 451 /**
 452  * Free a Mesa temporary and the associated R300 temporary.
 453  */
 454 static void free_temp(struct r300_fragment_program *rp, GLuint r)
 455 {
 456         COMPILE_STATE;
 457         GLuint index = REG_GET_INDEX(r);
 458
 459         if (!(cs->temp_in_use & (1 << index)))
 460                 return;
 461
 462         if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
 463                 free_hw_temp(rp, cs->temps[index].reg);
 464                 cs->temps[index].reg = -1;
 465                 cs->temp_in_use &= ~(1 << index);
 466         } else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
 467                 free_hw_temp(rp, cs->inputs[index].reg);
 468                 cs->inputs[index].reg = -1;
 469         }
 470 }
 471
 472 /**
 473  * Emit a hardware constant/parameter.
 474  *
 475  * \p cp Stable pointer to an array of 4 floats.
 476  *  The pointer must be stable in the sense that it remains to be valid
 477  *  and hold the contents of the constant/parameter throughout the lifetime
 478  *  of the fragment program (actually, up until the next time the fragment
 479  *  program is translated).
 480  */
 481 static GLuint emit_const4fv(struct r300_fragment_program *rp, const GLfloat* cp)
 482 {
 483         GLuint reg = undef;
 484         int index;
 485
 486         for(index = 0; index < rp->const_nr; ++index) {
 487                 if (rp->constant[index] == cp)
 488                         break;
 489         }
 490
 491         if (index >= rp->const_nr) {
 492                 if (index >= PFS_NUM_CONST_REGS) {
 493                         ERROR("Out of hw constants!\n");
 494                         return reg;
 495                 }
 496
 497                 rp->const_nr++;
 498                 rp->constant[index] = cp;
 499         }
 500
 501         REG_SET_TYPE(reg, REG_TYPE_CONST);
 502         REG_SET_INDEX(reg, index);
 503         REG_SET_VALID(reg, GL_TRUE);
 504         return reg;
 505 }
 506
 507 static inline GLuint negate(GLuint r)
 508 {
 509         REG_NEGS(r);
 510         REG_NEGV(r);
 511         return r;
 512 }
 513
 514 /* Hack, to prevent clobbering sources used multiple times when
 515  * emulating non-native instructions
 516  */
 517 static inline GLuint keep(GLuint r)
 518 {
 519         REG_SET_NO_USE(r, GL_TRUE);
 520         return r;
 521 }
 522
 523 static inline GLuint absolute(GLuint r)
 524 {
 525         REG_ABS(r);
 526         return r;
 527 }
 528
 529 static int swz_native(struct r300_fragment_program *rp,
 530                       GLuint src,
 531                       GLuint *r,
 532                       GLuint arbneg)
 533 {
 534         /* Native swizzle, handle negation */
 535         src = (src & ~REG_NEGS_MASK) |
 536                 (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
 537
 538         if ((arbneg & 0x7) == 0x0) {
 539                 src = src & ~REG_NEGV_MASK;
 540                 *r = src;
 541         } else if ((arbneg & 0x7) == 0x7) {
 542                 src |= REG_NEGV_MASK;
 543                 *r = src;
 544         } else {
 545                 if (!REG_GET_VALID(*r))
 546                         *r = get_temp_reg(rp);
 547                 src |= REG_NEGV_MASK;
 548                 emit_arith(rp,
 549                            PFS_OP_MAD,
 550                            *r,
 551                            arbneg & 0x7,
 552                            keep(src),
 553                            pfs_one,
 554                            pfs_zero,
 555                            0);
 556                 src = src & ~REG_NEGV_MASK;
 557                 emit_arith(rp,
 558                            PFS_OP_MAD,
 559                            *r,
 560                            (arbneg ^ 0x7) | WRITEMASK_W,
 561                            src,
 562                            pfs_one,
 563                            pfs_zero,
 564                            0);
 565         }
 566
 567         return 3;
 568 }
 569
 570 static int swz_emit_partial(struct r300_fragment_program *rp,
 571                             GLuint src,
 572                             GLuint *r,
 573                             int mask,
 574                             int mc,
 575                             GLuint arbneg)
 576 {
 577         GLuint tmp;
 578         GLuint wmask = 0;
 579
 580         if (!REG_GET_VALID(*r))
 581                 *r = get_temp_reg(rp);
 582
 583         /* A partial match, VSWZ/mask define what parts of the
 584          * desired swizzle we match
 585          */
 586         if (mc + s_mask[mask].count == 3) {
 587                 wmask = WRITEMASK_W;
 588                 src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
 589         }
 590
 591         tmp = arbneg & s_mask[mask].mask;
 592         if (tmp) {
 593                 tmp = tmp ^ s_mask[mask].mask;
 594                 if (tmp) {
 595                         emit_arith(rp,
 596                                    PFS_OP_MAD,
 597                                    *r,
 598                                    arbneg & s_mask[mask].mask,
 599                                    keep(src) | REG_NEGV_MASK,
 600                                    pfs_one,
 601                                    pfs_zero,
 602                                    0);
 603                         if (!wmask) {
 604                                 REG_SET_NO_USE(src, GL_TRUE);
 605                         } else {
 606                                 REG_SET_NO_USE(src, GL_FALSE);
 607                         }
 608                         emit_arith(rp,
 609                                    PFS_OP_MAD,
 610                                    *r,
 611                                    tmp | wmask,
 612                                    src,
 613                                    pfs_one,
 614                                    pfs_zero,
 615                                    0);
 616                 } else {
 617                         if (!wmask) {
 618                                 REG_SET_NO_USE(src, GL_TRUE);
 619                         } else {
 620                                 REG_SET_NO_USE(src, GL_FALSE);
 621                         }
 622                         emit_arith(rp,
 623                                    PFS_OP_MAD,
 624                                    *r,
 625                                    (arbneg & s_mask[mask].mask) | wmask,
 626                                    src | REG_NEGV_MASK,
 627                                    pfs_one,
 628                                    pfs_zero,
 629                                    0);
 630                 }
 631         } else {
 632                 if (!wmask) {
 633                         REG_SET_NO_USE(src, GL_TRUE);
 634                 } else {
 635                         REG_SET_NO_USE(src, GL_FALSE);
 636                 }
 637                 emit_arith(rp, PFS_OP_MAD,
 638                            *r,
 639                            s_mask[mask].mask | wmask,
 640                            src,
 641                            pfs_one,
 642                            pfs_zero,
 643                            0);
 644         }
 645
 646         return s_mask[mask].count;
 647 }
 648
 649 static GLuint do_swizzle(struct r300_fragment_program *rp,
 650                          GLuint src,
 651                          GLuint arbswz,
 652                          GLuint arbneg)
 653 {
 654         GLuint r = undef;
 655         GLuint vswz;
 656         int c_mask = 0;
 657         int v_match = 0;
 658
 659         /* If swizzling from something without an XYZW native swizzle,
 660          * emit result to a temp, and do new swizzle from the temp.
 661          */
 662 #if 0
 663         if (REG_GET_VSWZ(src) != SWIZZLE_XYZ ||
 664             REG_GET_SSWZ(src) != SWIZZLE_W) {
 665                 GLuint temp = get_temp_reg(rp);
 666                 emit_arith(rp,
 667                            PFS_OP_MAD,
 668                            temp,
 669                            WRITEMASK_XYZW,
 670                            src,
 671                            pfs_one,
 672                            pfs_zero,
 673                            0);
 674                 src = temp;
 675         }
 676 #endif
 677
 678         if (REG_GET_VSWZ(src) != SWIZZLE_XYZ ||
 679             REG_GET_SSWZ(src) != SWIZZLE_W) {
 680             GLuint vsrcswz = (v_swiz[REG_GET_VSWZ(src)].hash & (SWZ_X_MASK|SWZ_Y_MASK|SWZ_Z_MASK)) | REG_GET_SSWZ(src) << 9;
 681             GLint i;
 682
 683             GLuint newswz = 0;
 684             GLuint offset;
 685             for(i=0; i < 4; ++i){
 686                 offset = GET_SWZ(arbswz, i);
 687
 688                 newswz |= (offset <= 3)?GET_SWZ(vsrcswz, offset) << i*3:offset << i*3;
 689             }
 690
 691             arbswz = newswz & (SWZ_X_MASK|SWZ_Y_MASK|SWZ_Z_MASK);
 692             REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
 693         }
 694         else
 695         {
 696             /* set scalar swizzling */
 697             REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
 698
 699         }
 700         do {
 701                 vswz = REG_GET_VSWZ(src);
 702                 do {
 703                         int chash;
 704
 705                         REG_SET_VSWZ(src, vswz);
 706                         chash = v_swiz[REG_GET_VSWZ(src)].hash &
 707                                 s_mask[c_mask].hash;
 708
 709                         if (chash == (arbswz & s_mask[c_mask].hash)) {
 710                                 if (s_mask[c_mask].count == 3) {
 711                                         v_match += swz_native(rp,
 712                                                                 src,
 713                                                                 &r,
 714                                                                 arbneg);
 715                                 } else {
 716                                         v_match += swz_emit_partial(rp,
 717                                                                     src,
 718                                                                     &r,
 719                                                                     c_mask,
 720                                                                     v_match,
 721                                                                     arbneg);
 722                                 }
 723
 724                                 if (v_match == 3)
 725                                         return r;
 726
 727                                 /* Fill with something invalid.. all 0's was
 728                                  * wrong before, matched SWIZZLE_X.  So all
 729                                  * 1's will be okay for now
 730                                  */
 731                                 arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
 732                         }
 733                 } while(v_swiz[++vswz].hash != PFS_INVAL);
 734                 REG_SET_VSWZ(src, SWIZZLE_XYZ);
 735         } while (s_mask[++c_mask].hash != PFS_INVAL);
 736
 737         ERROR("should NEVER get here\n");
 738         return r;
 739 }
 740
 741 static GLuint t_src(struct r300_fragment_program *rp,
 742                     struct prog_src_register fpsrc)
 743 {
 744         GLuint r = undef;
 745
 746         switch (fpsrc.File) {
 747         case PROGRAM_TEMPORARY:
 748                 REG_SET_INDEX(r, fpsrc.Index);
 749                 REG_SET_VALID(r, GL_TRUE);
 750                 REG_SET_TYPE(r, REG_TYPE_TEMP);
 751                 break;
 752         case PROGRAM_INPUT:
 753                 REG_SET_INDEX(r, fpsrc.Index);
 754                 REG_SET_VALID(r, GL_TRUE);
 755                 REG_SET_TYPE(r, REG_TYPE_INPUT);
 756                 break;
 757         case PROGRAM_LOCAL_PARAM:
 758                 r = emit_const4fv(rp,
 759                                   rp->mesa_program.Base.LocalParams[fpsrc.Index]);
 760                 break;
 761         case PROGRAM_ENV_PARAM:
 762                 r = emit_const4fv(rp,
 763                                   rp->ctx->FragmentProgram.Parameters[fpsrc.Index]);
 764                 break;
 765         case PROGRAM_STATE_VAR:
 766         case PROGRAM_NAMED_PARAM:
 767                 r = emit_const4fv(rp,
 768                                   rp->mesa_program.Base.Parameters->ParameterValues[fpsrc.Index]);
 769                 break;
 770         default:
 771                 ERROR("unknown SrcReg->File %x\n", fpsrc.File);
 772                 return r;
 773         }
 774
 775         /* no point swizzling ONE/ZERO/HALF constants... */
 776         if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
 777                 r = do_swizzle(rp, r, fpsrc.Swizzle, fpsrc.NegateBase);
 778         return r;
 779 }
 780
 781 static GLuint t_scalar_src(struct r300_fragment_program *rp,
 782                            struct prog_src_register fpsrc)
 783 {
 784         struct prog_src_register src = fpsrc;
 785         int sc = GET_SWZ(fpsrc.Swizzle, 0); /* X */
 786
 787         src.Swizzle = ((sc<<0)|(sc<<3)|(sc<<6)|(sc<<9));
 788
 789         return t_src(rp, src);
 790 }
 791
 792 static GLuint t_dst(struct r300_fragment_program *rp,
 793                        struct prog_dst_register dest)
 794 {
 795         GLuint r = undef;
 796
 797         switch (dest.File) {
 798         case PROGRAM_TEMPORARY:
 799                 REG_SET_INDEX(r, dest.Index);
 800                 REG_SET_VALID(r, GL_TRUE);
 801                 REG_SET_TYPE(r, REG_TYPE_TEMP);
 802                 return r;
 803         case PROGRAM_OUTPUT:
 804                 REG_SET_TYPE(r, REG_TYPE_OUTPUT);
 805                 switch (dest.Index) {
 806                 case FRAG_RESULT_COLR:
 807                 case FRAG_RESULT_DEPR:
 808                         REG_SET_INDEX(r, dest.Index);
 809                         REG_SET_VALID(r, GL_TRUE);
 810                         return r;
 811                 default:
 812                         ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
 813                         return r;
 814                 }
 815         default:
 816                 ERROR("Bad DstReg->File 0x%x\n", dest.File);
 817                 return r;
 818         }
 819 }
 820
 821 static int t_hw_src(struct r300_fragment_program *rp,
 822                     GLuint src,
 823                     GLboolean tex)
 824 {
 825         COMPILE_STATE;
 826         int idx;
 827         int index = REG_GET_INDEX(src);
 828
 829         switch(REG_GET_TYPE(src)) {
 830         case REG_TYPE_TEMP:
 831                 /* NOTE: if reg==-1 here, a source is being read that
 832                  *       hasn't been written to. Undefined results.
 833                  */
 834                 if (cs->temps[index].reg == -1)
 835                         cs->temps[index].reg = get_hw_temp(rp, cs->nrslots);
 836
 837                 idx = cs->temps[index].reg;
 838
 839                 if (!REG_GET_NO_USE(src) &&
 840                     (--cs->temps[index].refcount == 0))
 841                         free_temp(rp, src);
 842                 break;
 843         case REG_TYPE_INPUT:
 844                 idx = cs->inputs[index].reg;
 845
 846                 if (!REG_GET_NO_USE(src) &&
 847                     (--cs->inputs[index].refcount == 0))
 848                         free_hw_temp(rp, cs->inputs[index].reg);
 849                 break;
 850         case REG_TYPE_CONST:
 851                 return (index | SRC_CONST);
 852         default:
 853                 ERROR("Invalid type for source reg\n");
 854                 return (0 | SRC_CONST);
 855         }
 856
 857         if (!tex)
 858                 cs->used_in_node |= (1 << idx);
 859
 860         return idx;
 861 }
 862
 863 static int t_hw_dst(struct r300_fragment_program *rp,
 864                     GLuint dest,
 865                     GLboolean tex,
 866                     int slot)
 867 {
 868         COMPILE_STATE;
 869         int idx;
 870         GLuint index = REG_GET_INDEX(dest);
 871         assert(REG_GET_VALID(dest));
 872
 873         switch(REG_GET_TYPE(dest)) {
 874         case REG_TYPE_TEMP:
 875                 if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
 876                         if (!tex) {
 877                                 cs->temps[index].reg = get_hw_temp(rp, slot);
 878                         } else {
 879                                 cs->temps[index].reg = get_hw_temp_tex(rp);
 880                         }
 881                 }
 882                 idx = cs->temps[index].reg;
 883
 884                 if (!REG_GET_NO_USE(dest) &&
 885                     (--cs->temps[index].refcount == 0))
 886                         free_temp(rp, dest);
 887
 888                 cs->dest_in_node |= (1 << idx);
 889                 cs->used_in_node |= (1 << idx);
 890                 break;
 891         case REG_TYPE_OUTPUT:
 892                 switch(index) {
 893                 case FRAG_RESULT_COLR:
 894                         rp->node[rp->cur_node].flags |= R300_PFS_NODE_OUTPUT_COLOR;
 895                         break;
 896                 case FRAG_RESULT_DEPR:
 897                         rp->node[rp->cur_node].flags |= R300_PFS_NODE_OUTPUT_DEPTH;
 898                         break;
 899                 }
 900                 return index;
 901                 break;
 902         default:
 903                 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
 904                 return 0;
 905         }
 906
 907         return idx;
 908 }
 909
 910 static void emit_nop(struct r300_fragment_program *rp)
 911 {
 912         COMPILE_STATE;
 913
 914         if (cs->nrslots >= PFS_MAX_ALU_INST) {
 915                 ERROR("Out of ALU instruction slots\n");
 916                 return;
 917         }
 918
 919         rp->alu.inst[cs->nrslots].inst0 = NOP_INST0;
 920         rp->alu.inst[cs->nrslots].inst1 = NOP_INST1;
 921         rp->alu.inst[cs->nrslots].inst2 = NOP_INST2;
 922         rp->alu.inst[cs->nrslots].inst3 = NOP_INST3;
 923         cs->nrslots++;
 924 }
 925
 926 static void emit_tex(struct r300_fragment_program *rp,
 927                      struct prog_instruction *fpi,
 928                      int opcode)
 929 {
 930         COMPILE_STATE;
 931         GLuint coord = t_src(rp, fpi->SrcReg[0]);
 932         GLuint dest = undef, rdest = undef;
 933         GLuint din, uin;
 934         int unit = fpi->TexSrcUnit;
 935         int hwsrc, hwdest;
 936         GLuint tempreg = 0;
 937
 938         uin = cs->used_in_node;
 939         din = cs->dest_in_node;
 940
 941         /* Resolve source/dest to hardware registers */
 942         if (opcode != R300_FPITX_OP_KIL) {
 943                 if (fpi->TexSrcTarget == TEXTURE_RECT_INDEX) {
 944                         /**
 945                          * Hardware uses [0..1]x[0..1] range for rectangle textures
 946                          * instead of [0..Width]x[0..Height].
 947                          * Add a scaling instruction.
 948                          *
 949                          * \todo Refactor this once we have proper rewriting/optimization
 950                          * support for programs.
 951                          */
 952                         GLint tokens[6] = { STATE_INTERNAL, STATE_R300_TEXRECT_FACTOR, 0, 0, 0, 0 };
 953                         int factor_index;
 954                         GLuint factorreg;
 955
 956                         tokens[2] = unit;
 957                         factor_index = _mesa_add_state_reference(rp->mesa_program.Base.Parameters, tokens);
 958                         factorreg = emit_const4fv(rp,
 959                                         rp->mesa_program.Base.Parameters->ParameterValues[factor_index]);
 960                         tempreg = keep(get_temp_reg(rp));
 961
 962                         emit_arith(rp, PFS_OP_MAD, tempreg, WRITEMASK_XYZW,
 963                                    coord, factorreg, pfs_zero, 0);
 964
 965                         /* Ensure correct node indirection */
 966                         uin = cs->used_in_node;
 967                         din = cs->dest_in_node;
 968
 969                         hwsrc = t_hw_src(rp, tempreg, GL_TRUE);
 970                 } else {
 971                         hwsrc = t_hw_src(rp, coord, GL_TRUE);
 972                 }
 973
 974                 dest = t_dst(rp, fpi->DstReg);
 975
 976                 /* r300 doesn't seem to be able to do TEX->output reg */
 977                 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
 978                         rdest = dest;
 979                         dest = get_temp_reg_tex(rp);
 980                 }
 981                 hwdest = t_hw_dst(rp, dest, GL_TRUE, rp->node[rp->cur_node].alu_offset);
 982
 983                 /* Use a temp that hasn't been used in this node, rather
 984                  * than causing an indirection
 985                  */
 986                 if (uin & (1 << hwdest)) {
 987                         free_hw_temp(rp, hwdest);
 988                         hwdest = get_hw_temp_tex(rp);
 989                         cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
 990                 }
 991         } else {
 992                 hwdest = 0;
 993                 unit = 0;
 994                 hwsrc = t_hw_src(rp, coord, GL_TRUE);
 995         }
 996
 997
 998         /* Indirection if source has been written in this node, or if the
 999          * dest has been read/written in this node
1000          */
1001         if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
1002              (din & (1<<hwsrc))) || (uin & (1<<hwdest))) {
1003
1004                 /* Finish off current node */
1005                 if (rp->node[rp->cur_node].alu_offset == cs->nrslots)
1006                         emit_nop(rp);
1007
1008                 rp->node[rp->cur_node].alu_end =
1009                                 cs->nrslots - rp->node[rp->cur_node].alu_offset - 1;
1010                 assert(rp->node[rp->cur_node].alu_end >= 0);
1011
1012                 if (++rp->cur_node >= PFS_MAX_TEX_INDIRECT) {
1013                         ERROR("too many levels of texture indirection\n");
1014                         return;
1015                 }
1016
1017                 /* Start new node */
1018                 rp->node[rp->cur_node].tex_offset = rp->tex.length;
1019                 rp->node[rp->cur_node].alu_offset = cs->nrslots;
1020                 rp->node[rp->cur_node].tex_end = -1;
1021                 rp->node[rp->cur_node].alu_end = -1;
1022                 rp->node[rp->cur_node].flags = 0;
1023                 cs->used_in_node = 0;
1024                 cs->dest_in_node = 0;
1025         }
1026
1027         if (rp->cur_node == 0)
1028                 rp->first_node_has_tex = 1;
1029
1030         rp->tex.inst[rp->tex.length++] = 0
1031                 | (hwsrc << R300_FPITX_SRC_SHIFT)
1032                 | (hwdest << R300_FPITX_DST_SHIFT)
1033                 | (unit << R300_FPITX_IMAGE_SHIFT)
1034                 /* not entirely sure about this */
1035                 | (opcode << R300_FPITX_OPCODE_SHIFT);
1036
1037         cs->dest_in_node |= (1 << hwdest);
1038         if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
1039                 cs->used_in_node |= (1 << hwsrc);
1040
1041         rp->node[rp->cur_node].tex_end++;
1042
1043         /* Copy from temp to output if needed */
1044         if (REG_GET_VALID(rdest)) {
1045                 emit_arith(rp, PFS_OP_MAD, rdest, WRITEMASK_XYZW, dest,
1046                            pfs_one, pfs_zero, 0);
1047                 free_temp(rp, dest);
1048         }
1049
1050         /* Free temp register */
1051         if (tempreg != 0)
1052                 free_temp(rp, tempreg);
1053 }
1054
1055
1056 /**
1057  * Returns the first slot where we could possibly allow writing to dest,
1058  * according to register allocation.
1059  */
1060 static int get_earliest_allowed_write(
1061                 struct r300_fragment_program* rp,
1062                 GLuint dest, int mask)
1063 {
1064         COMPILE_STATE;
1065         int idx;
1066         int pos;
1067         GLuint index = REG_GET_INDEX(dest);
1068         assert(REG_GET_VALID(dest));
1069
1070         switch(REG_GET_TYPE(dest)) {
1071                 case REG_TYPE_TEMP:
1072                         if (cs->temps[index].reg == -1)
1073                                 return 0;
1074
1075                         idx = cs->temps[index].reg;
1076                         break;
1077                 case REG_TYPE_OUTPUT:
1078                         return 0;
1079                 default:
1080                         ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
1081                         return 0;
1082         }
1083
1084         pos = cs->hwtemps[idx].reserved;
1085         if (mask & WRITEMASK_XYZ) {
1086                 if (pos < cs->hwtemps[idx].vector_lastread)
1087                         pos = cs->hwtemps[idx].vector_lastread;
1088         }
1089         if (mask & WRITEMASK_W) {
1090                 if (pos < cs->hwtemps[idx].scalar_lastread)
1091                         pos = cs->hwtemps[idx].scalar_lastread;
1092         }
1093
1094         return pos;
1095 }
1096
1097
1098 /**
1099  * Allocates a slot for an ALU instruction that can consist of
1100  * a vertex part or a scalar part or both.
1101  *
1102  * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1103  * appropriate position (vector and/or scalar), and their positions are
1104  * recorded in the srcpos array.
1105  *
1106  * This function emits instruction code for the source fetch and the
1107  * argument selection. It does not emit instruction code for the
1108  * opcode or the destination selection.
1109  *
1110  * @return the index of the slot
1111  */
1112 static int find_and_prepare_slot(struct r300_fragment_program* rp,
1113                 GLboolean emit_vop,
1114                 GLboolean emit_sop,
1115                 int argc,
1116                 GLuint* src,
1117                 GLuint dest,
1118                 int mask)
1119 {
1120         COMPILE_STATE;
1121         int hwsrc[3];
1122         int srcpos[3];
1123         unsigned int used;
1124         int tempused;
1125         int tempvsrc[3];
1126         int tempssrc[3];
1127         int pos;
1128         int regnr;
1129         int i,j;
1130
1131         // Determine instruction slots, whether sources are required on
1132         // vector or scalar side, and the smallest slot number where
1133         // all source registers are available
1134         used = 0;
1135         if (emit_vop)
1136                 used |= SLOT_OP_VECTOR;
1137         if (emit_sop)
1138                 used |= SLOT_OP_SCALAR;
1139
1140         pos = get_earliest_allowed_write(rp, dest, mask);
1141
1142         if (rp->node[rp->cur_node].alu_offset > pos)
1143                 pos = rp->node[rp->cur_node].alu_offset;
1144         for(i = 0; i < argc; ++i) {
1145                 if (!REG_GET_BUILTIN(src[i])) {
1146                         if (emit_vop)
1147                                 used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
1148                         if (emit_sop)
1149                                 used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
1150                 }
1151
1152                 hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */
1153                 regnr = hwsrc[i] & 31;
1154
1155                 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1156                         if (used & (SLOT_SRC_VECTOR << i)) {
1157                                 if (cs->hwtemps[regnr].vector_valid > pos)
1158                                         pos = cs->hwtemps[regnr].vector_valid;
1159                         }
1160                         if (used & (SLOT_SRC_SCALAR << i)) {
1161                                 if (cs->hwtemps[regnr].scalar_valid > pos)
1162                                         pos = cs->hwtemps[regnr].scalar_valid;
1163                         }
1164                 }
1165         }
1166
1167         // Find a slot that fits
1168         for(; ; ++pos) {
1169                 if (cs->slot[pos].used & used & SLOT_OP_BOTH)
1170                         continue;
1171
1172                 if (pos >= cs->nrslots) {
1173                         if (cs->nrslots >= PFS_MAX_ALU_INST) {
1174                                 ERROR("Out of ALU instruction slots\n");
1175                                 return -1;
1176                         }
1177
1178                         rp->alu.inst[pos].inst0 = NOP_INST0;
1179                         rp->alu.inst[pos].inst1 = NOP_INST1;
1180                         rp->alu.inst[pos].inst2 = NOP_INST2;
1181                         rp->alu.inst[pos].inst3 = NOP_INST3;
1182
1183                         cs->nrslots++;
1184                 }
1185
1186                 // Note: When we need both parts (vector and scalar) of a source,
1187                 // we always try to put them into the same position. This makes the
1188                 // code easier to read, and it is optimal (i.e. one doesn't gain
1189                 // anything by splitting the parts).
1190                 // It also avoids headaches with swizzles that access both parts (i.e WXY)
1191                 tempused = cs->slot[pos].used;
1192                 for(i = 0; i < 3; ++i) {
1193                         tempvsrc[i] = cs->slot[pos].vsrc[i];
1194                         tempssrc[i] = cs->slot[pos].ssrc[i];
1195                 }
1196
1197                 for(i = 0; i < argc; ++i) {
1198                         int flags = (used >> i) & SLOT_SRC_BOTH;
1199
1200                         if (!flags) {
1201                                 srcpos[i] = 0;
1202                                 continue;
1203                         }
1204
1205                         for(j = 0; j < 3; ++j) {
1206                                 if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
1207                                         if (tempvsrc[j] != hwsrc[i])
1208                                                 continue;
1209                                 }
1210
1211                                 if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
1212                                         if (tempssrc[j] != hwsrc[i])
1213                                                 continue;
1214                                 }
1215
1216                                 break;
1217                         }
1218
1219                         if (j == 3)
1220                                 break;
1221
1222                         srcpos[i] = j;
1223                         tempused |= flags << j;
1224                         if (flags & SLOT_SRC_VECTOR)
1225                                 tempvsrc[j] = hwsrc[i];
1226                         if (flags & SLOT_SRC_SCALAR)
1227                                 tempssrc[j] = hwsrc[i];
1228                 }
1229
1230                 if (i == argc)
1231                         break;
1232         }
1233
1234         // Found a slot, reserve it
1235         cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
1236         for(i = 0; i < 3; ++i) {
1237                 cs->slot[pos].vsrc[i] = tempvsrc[i];
1238                 cs->slot[pos].ssrc[i] = tempssrc[i];
1239         }
1240
1241         for(i = 0; i < argc; ++i) {
1242                 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1243                         int regnr = hwsrc[i] & 31;
1244
1245                         if (used & (SLOT_SRC_VECTOR << i)) {
1246                                 if (cs->hwtemps[regnr].vector_lastread < pos)
1247                                         cs->hwtemps[regnr].vector_lastread = pos;
1248                         }
1249                         if (used & (SLOT_SRC_SCALAR << i)) {
1250                                 if (cs->hwtemps[regnr].scalar_lastread < pos)
1251                                         cs->hwtemps[regnr].scalar_lastread = pos;
1252                         }
1253                 }
1254         }
1255
1256         // Emit the source fetch code
1257         rp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK;
1258         rp->alu.inst[pos].inst1 |=
1259                         ((cs->slot[pos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
1260                          (cs->slot[pos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
1261                          (cs->slot[pos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
1262
1263         rp->alu.inst[pos].inst3 &= ~R300_FPI3_SRC_MASK;
1264         rp->alu.inst[pos].inst3 |=
1265                         ((cs->slot[pos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
1266                          (cs->slot[pos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
1267                          (cs->slot[pos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
1268
1269         // Emit the argument selection code
1270         if (emit_vop) {
1271                 int swz[3];
1272
1273                 for(i = 0; i < 3; ++i) {
1274                         if (i < argc) {
1275                                 swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
1276                                             (srcpos[i] * v_swiz[REG_GET_VSWZ(src[i])].stride)) |
1277                                         ((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) |
1278                                         ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
1279                         } else {
1280                                 swz[i] = R300_FPI0_ARGC_ZERO;
1281                         }
1282                 }
1283
1284                 rp->alu.inst[pos].inst0 &=
1285                                 ~(R300_FPI0_ARG0C_MASK|R300_FPI0_ARG1C_MASK|R300_FPI0_ARG2C_MASK);
1286                 rp->alu.inst[pos].inst0 |=
1287                                 (swz[0] << R300_FPI0_ARG0C_SHIFT) |
1288                                 (swz[1] << R300_FPI0_ARG1C_SHIFT) |
1289                                 (swz[2] << R300_FPI0_ARG2C_SHIFT);
1290         }
1291
1292         if (emit_sop) {
1293                 int swz[3];
1294
1295                 for(i = 0; i < 3; ++i) {
1296                         if (i < argc) {
1297                                 swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
1298                                                 (srcpos[i] * s_swiz[REG_GET_SSWZ(src[i])].stride)) |
1299                                                 ((src[i] & REG_NEGV_MASK) ? ARG_NEG : 0) |
1300                                                 ((src[i] & REG_ABS_MASK) ? ARG_ABS : 0);
1301                         } else {
1302                                 swz[i] = R300_FPI2_ARGA_ZERO;
1303                         }
1304                 }
1305
1306                 rp->alu.inst[pos].inst2 &=
1307                                 ~(R300_FPI2_ARG0A_MASK|R300_FPI2_ARG1A_MASK|R300_FPI2_ARG2A_MASK);
1308                 rp->alu.inst[pos].inst2 |=
1309                                 (swz[0] << R300_FPI2_ARG0A_SHIFT) |
1310                                 (swz[1] << R300_FPI2_ARG1A_SHIFT) |
1311                                 (swz[2] << R300_FPI2_ARG2A_SHIFT);
1312         }
1313
1314         return pos;
1315 }
1316
1317
1318 /**
1319  * Append an ALU instruction to the instruction list.
1320  */
1321 static void emit_arith(struct r300_fragment_program *rp,
1322                        int op,
1323                        GLuint dest,
1324                        int mask,
1325                        GLuint src0,
1326                        GLuint src1,
1327                        GLuint src2,
1328                        int flags)
1329 {
1330         COMPILE_STATE;
1331         GLuint src[3] = { src0, src1, src2 };
1332         int hwdest;
1333         GLboolean emit_vop, emit_sop;
1334         int vop, sop, argc;
1335         int pos;
1336
1337         vop = r300_fpop[op].v_op;
1338         sop = r300_fpop[op].s_op;
1339         argc = r300_fpop[op].argc;
1340
1341         if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
1342             REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
1343                 if (mask & WRITEMASK_Z) {
1344                         mask = WRITEMASK_W;
1345                 } else {
1346                         return;
1347                 }
1348         }
1349
1350         emit_vop = GL_FALSE;
1351         emit_sop = GL_FALSE;
1352         if ((mask & WRITEMASK_XYZ) || vop == R300_FPI0_OUTC_DP3)
1353                 emit_vop = GL_TRUE;
1354         if ((mask & WRITEMASK_W) || vop == R300_FPI0_OUTC_REPL_ALPHA)
1355                 emit_sop = GL_TRUE;
1356
1357         pos = find_and_prepare_slot(rp, emit_vop, emit_sop, argc, src, dest, mask);
1358         if (pos < 0)
1359                 return;
1360
1361         hwdest = t_hw_dst(rp, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */
1362
1363         if (flags & PFS_FLAG_SAT) {
1364                 vop |= R300_FPI0_OUTC_SAT;
1365                 sop |= R300_FPI2_OUTA_SAT;
1366         }
1367
1368         /* Throw the pieces together and get FPI0/1 */
1369         if (emit_vop) {
1370                 rp->alu.inst[pos].inst0 |= vop;
1371
1372                 rp->alu.inst[pos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT;
1373
1374                 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1375                         if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1376                                 rp->alu.inst[pos].inst1 |=
1377                                         (mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_OUTPUT_MASK_SHIFT;
1378                         } else assert(0);
1379                 } else {
1380                         rp->alu.inst[pos].inst1 |=
1381                                         (mask & WRITEMASK_XYZ) << R300_FPI1_DSTC_REG_MASK_SHIFT;
1382
1383                         cs->hwtemps[hwdest].vector_valid = pos+1;
1384                 }
1385         }
1386
1387         /* And now FPI2/3 */
1388         if (emit_sop) {
1389                 rp->alu.inst[pos].inst2 |= sop;
1390
1391                 if (mask & WRITEMASK_W) {
1392                         if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1393                                 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1394                                         rp->alu.inst[pos].inst3 |=
1395                                                         (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_OUTPUT;
1396                                 } else if (REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
1397                                         rp->alu.inst[pos].inst3 |= R300_FPI3_DSTA_DEPTH;
1398                                 } else assert(0);
1399                         } else {
1400                                 rp->alu.inst[pos].inst3 |=
1401                                                 (hwdest << R300_FPI3_DSTA_SHIFT) | R300_FPI3_DSTA_REG;
1402
1403                                 cs->hwtemps[hwdest].scalar_valid = pos+1;
1404                         }
1405                 }
1406         }
1407
1408         return;
1409 }
1410
#if 0
/* Currently compiled out.
 * Builds an input register for fragment attribute attr, or reports an
 * error and returns undef when the program does not read that attribute.
 */
static GLuint get_attrib(struct r300_fragment_program *rp, GLuint attr)
{
        GLuint reg = undef;

        if ((rp->mesa_program.Base.InputsRead & (1<<attr)) == 0) {
                ERROR("Attribute %d was not provided!\n", attr);
                return undef;
        }

        REG_SET_TYPE(reg, REG_TYPE_INPUT);
        REG_SET_INDEX(reg, attr);
        REG_SET_VALID(reg, GL_TRUE);
        return reg;
}
#endif
1428
1429 static GLfloat SinCosConsts[2][4] = {
1430         {
1431                 1.273239545,  // 4/PI
1432                 -0.405284735, // -4/(PI*PI)
1433                 3.141592654,  // PI
1434                 0.2225        // weight
1435         },
1436         {
1437                 0.75,
1438                 0.0,
1439                 0.159154943,  // 1/(2*PI)
1440                 6.283185307   // 2*PI
1441         }
1442 };
1443
1444
1445 /**
1446  * Emit a LIT instruction.
1447  * \p flags may be PFS_FLAG_SAT
1448  *
1449  * Definition of LIT (from ARB_fragment_program):
1450  * tmp = VectorLoad(op0);
1451  * if (tmp.x < 0) tmp.x = 0;
1452  * if (tmp.y < 0) tmp.y = 0;
1453  * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
1454  * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
1455  * result.x = 1.0;
1456  * result.y = tmp.x;
1457  * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
1458  * result.w = 1.0;
1459  *
1460  * The longest path of computation is the one leading to result.z,
1461  * consisting of 5 operations. This implementation of LIT takes
1462  * 5 slots. So unless there's some special undocumented opcode,
1463  * this implementation is potentially optimal. Unfortunately,
1464  * emit_arith is a bit too conservative because it doesn't understand
1465  * partial writes to the vector component.
1466  */
1467 static const GLfloat LitConst[4] = { 127.999999, 127.999999, 127.999999, -127.999999 };
1468
1469 static void emit_lit(struct r300_fragment_program *rp,
1470                 GLuint dest,
1471                 int mask,
1472                 GLuint src,
1473                 int flags)
1474 {
1475         COMPILE_STATE;
1476         GLuint cnst;
1477         int needTemporary;
1478         GLuint temp;
1479
1480         cnst = emit_const4fv(rp, LitConst);
1481
1482         needTemporary = 0;
1483         if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
1484                 needTemporary = 1;
1485         } else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1486                 // LIT is typically followed by DP3/DP4, so there's no point
1487                 // in creating special code for this case
1488                 needTemporary = 1;
1489         }
1490
1491         if (needTemporary) {
1492                 temp = keep(get_temp_reg(rp));
1493         } else {
1494                 temp = keep(dest);
1495         }
1496
1497         // Note: The order of emit_arith inside the slots is relevant,
1498         // because emit_arith only looks at scalar vs. vector when resolving
1499         // dependencies, and it does not consider individual vector components,
1500         // so swizzling between the two parts can create fake dependencies.
1501
1502         // First slot
1503         emit_arith(rp, PFS_OP_MAX, temp, WRITEMASK_XY,
1504                    keep(src), pfs_zero, undef, 0);
1505         emit_arith(rp, PFS_OP_MAX, temp, WRITEMASK_W,
1506                    src, cnst, undef, 0);
1507
1508         // Second slot
1509         emit_arith(rp, PFS_OP_MIN, temp, WRITEMASK_Z,
1510                    swizzle(temp, W, W, W, W), cnst, undef, 0);
1511         emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_W,
1512                    swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
1513
1514         // Third slot
1515         // If desired, we saturate the y result here.
1516         // This does not affect the use as a condition variable in the CMP later
1517         emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W,
1518                    temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
1519         emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_Y,
1520                    swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
1521
1522         // Fourth slot
1523         emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X,
1524                    pfs_one, pfs_one, pfs_zero, 0);
1525         emit_arith(rp, PFS_OP_EX2, temp, WRITEMASK_W,
1526                    temp, undef, undef, 0);
1527
1528         // Fifth slot
1529         emit_arith(rp, PFS_OP_CMP, temp, WRITEMASK_Z,
1530                    pfs_zero, swizzle(temp, W, W, W, W), negate(swizzle(temp, Y, Y, Y, Y)), flags);
1531         emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W,
1532                    pfs_one, pfs_one, pfs_zero, 0);
1533
1534         if (needTemporary) {
1535                 emit_arith(rp, PFS_OP_MAD, dest, mask,
1536                                    temp, pfs_one, pfs_zero, flags);
1537                 free_temp(rp, temp);
1538         } else {
1539                 // Decrease refcount of the destination
1540                 t_hw_dst(rp, dest, GL_FALSE, cs->nrslots);
1541         }
1542 }
1543
1544
1545 static GLboolean parse_program(struct r300_fragment_program *rp)
1546 {
1547         struct gl_fragment_program *mp = &rp->mesa_program;
1548         const struct prog_instruction *inst = mp->Base.Instructions;
1549         struct prog_instruction *fpi;
1550         GLuint src[3], dest, temp[2];
1551         int flags, mask = 0;
1552         int const_sin[2];
1553
1554         if (!inst || inst[0].Opcode == OPCODE_END) {
1555                 ERROR("empty program?\n");
1556                 return GL_FALSE;
1557         }
1558
1559         for (fpi=mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
1560                 if (fpi->SaturateMode == SATURATE_ZERO_ONE)
1561                         flags = PFS_FLAG_SAT;
1562                 else
1563                         flags = 0;
1564
1565                 if (fpi->Opcode != OPCODE_KIL) {
1566                         dest = t_dst(rp, fpi->DstReg);
1567                         mask = fpi->DstReg.WriteMask;
1568                 }
1569
1570                 switch (fpi->Opcode) {
1571                 case OPCODE_ABS:
1572                         src[0] = t_src(rp, fpi->SrcReg[0]);
1573                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1574                                    absolute(src[0]), pfs_one, pfs_zero,
1575                                    flags);
1576                         break;
1577                 case OPCODE_ADD:
1578                         src[0] = t_src(rp, fpi->SrcReg[0]);
1579                         src[1] = t_src(rp, fpi->SrcReg[1]);
1580                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1581                                    src[0], pfs_one, src[1],
1582                                    flags);
1583                         break;
1584                 case OPCODE_CMP:
1585                         src[0] = t_src(rp, fpi->SrcReg[0]);
1586                         src[1] = t_src(rp, fpi->SrcReg[1]);
1587                         src[2] = t_src(rp, fpi->SrcReg[2]);
1588                         /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1589                          *    r300 - if src2.c < 0.0 ? src1.c : src0.c
1590                          */
1591                         emit_arith(rp, PFS_OP_CMP, dest, mask,
1592                                    src[2], src[1], src[0],
1593                                    flags);
1594                         break;
1595                 case OPCODE_COS:
1596                         /*
1597                          * cos using a parabola (see SIN):
1598                          * cos(x):
1599                          *   x = (x/(2*PI))+0.75
1600                          *   x = frac(x)
1601                          *   x = (x*2*PI)-PI
1602                          *   result = sin(x)
1603                          */
1604                         temp[0] = get_temp_reg(rp);
1605                         const_sin[0] = emit_const4fv(rp, SinCosConsts[0]);
1606                         const_sin[1] = emit_const4fv(rp, SinCosConsts[1]);
1607                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1608
1609                         /* add 0.5*PI and do range reduction */
1610
1611                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1612                                    swizzle(src[0], X, X, X, X),
1613                                    swizzle(const_sin[1], Z, Z, Z, Z),
1614                                    swizzle(const_sin[1], X, X, X, X),
1615                                    0);
1616
1617                         emit_arith(rp, PFS_OP_FRC, temp[0], WRITEMASK_X,
1618                                    swizzle(temp[0], X, X, X, X),
1619                                    undef,
1620                                    undef,
1621                                    0);
1622
1623                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Z,
1624                                    swizzle(temp[0], X, X, X, X),
1625                                    swizzle(const_sin[1], W, W, W, W), //2*PI
1626                                    negate(swizzle(const_sin[0], Z, Z, Z, Z)), //-PI
1627                                    0);
1628
1629                         /* SIN */
1630
1631                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X | WRITEMASK_Y,
1632                                    swizzle(temp[0], Z, Z, Z, Z),
1633                                    const_sin[0],
1634                                    pfs_zero,
1635                                    0);
1636
1637                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1638                                    swizzle(temp[0], Y, Y, Y, Y),
1639                                    absolute(swizzle(temp[0], Z, Z, Z, Z)),
1640                                    swizzle(temp[0], X, X, X, X),
1641                                    0);
1642
1643                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1644                                    swizzle(temp[0], X, X, X, X),
1645                                    absolute(swizzle(temp[0], X, X, X, X)),
1646                                    negate(swizzle(temp[0], X, X, X, X)),
1647                                    0);
1648
1649
1650                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1651                                    swizzle(temp[0], Y, Y, Y, Y),
1652                                    swizzle(const_sin[0], W, W, W, W),
1653                                    swizzle(temp[0], X, X, X, X),
1654                                    flags);
1655
1656                         free_temp(rp, temp[0]);
1657                         break;
1658                 case OPCODE_DP3:
1659                         src[0] = t_src(rp, fpi->SrcReg[0]);
1660                         src[1] = t_src(rp, fpi->SrcReg[1]);
1661                         emit_arith(rp, PFS_OP_DP3, dest, mask,
1662                                    src[0], src[1], undef,
1663                                    flags);
1664                         break;
1665                 case OPCODE_DP4:
1666                         src[0] = t_src(rp, fpi->SrcReg[0]);
1667                         src[1] = t_src(rp, fpi->SrcReg[1]);
1668                         emit_arith(rp, PFS_OP_DP4, dest, mask,
1669                                    src[0], src[1], undef,
1670                                    flags);
1671                         break;
1672                 case OPCODE_DPH:
1673                         src[0] = t_src(rp, fpi->SrcReg[0]);
1674                         src[1] = t_src(rp, fpi->SrcReg[1]);
1675                         /* src0.xyz1 -> temp
1676                          * DP4 dest, temp, src1
1677                          */
1678 #if 0
1679                         temp[0] = get_temp_reg(rp);
1680                         src[0].s_swz = SWIZZLE_ONE;
1681                         emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1682                                    src[0], pfs_one, pfs_zero,
1683                                    0);
1684                         emit_arith(rp, PFS_OP_DP4, dest, mask,
1685                                    temp[0], src[1], undef,
1686                                    flags);
1687                         free_temp(rp, temp[0]);
1688 #else
1689                         emit_arith(rp, PFS_OP_DP4, dest, mask,
1690                                    swizzle(src[0], X, Y, Z, ONE), src[1],
1691                                    undef, flags);
1692 #endif
1693                         break;
1694                 case OPCODE_DST:
1695                         src[0] = t_src(rp, fpi->SrcReg[0]);
1696                         src[1] = t_src(rp, fpi->SrcReg[1]);
1697                         /* dest.y = src0.y * src1.y */
1698                         if (mask & WRITEMASK_Y)
1699                                 emit_arith(rp, PFS_OP_MAD, dest, WRITEMASK_Y,
1700                                            keep(src[0]), keep(src[1]),
1701                                            pfs_zero, flags);
1702                         /* dest.z = src0.z */
1703                         if (mask & WRITEMASK_Z)
1704                                 emit_arith(rp, PFS_OP_MAD, dest, WRITEMASK_Z,
1705                                            src[0], pfs_one, pfs_zero, flags);
1706                         /* result.x = 1.0
1707                          * result.w = src1.w */
1708                         if (mask & WRITEMASK_XW) {
1709                                 REG_SET_VSWZ(src[1], SWIZZLE_111); /*Cheat*/
1710                                 emit_arith(rp, PFS_OP_MAD, dest,
1711                                            mask & WRITEMASK_XW,
1712                                            src[1], pfs_one, pfs_zero,
1713                                            flags);
1714                         }
1715                         break;
1716                 case OPCODE_EX2:
1717                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1718                         emit_arith(rp, PFS_OP_EX2, dest, mask,
1719                                    src[0], undef, undef,
1720                                    flags);
1721                         break;
1722                 case OPCODE_FLR:
1723                         src[0] = t_src(rp, fpi->SrcReg[0]);
1724                         temp[0] = get_temp_reg(rp);
1725                         /* FRC temp, src0
1726                          * MAD dest, src0, 1.0, -temp
1727                          */
1728                         emit_arith(rp, PFS_OP_FRC, temp[0], mask,
1729                                    keep(src[0]), undef, undef,
1730                                    0);
1731                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1732                                    src[0], pfs_one, negate(temp[0]),
1733                                    flags);
1734                         free_temp(rp, temp[0]);
1735                         break;
1736                 case OPCODE_FRC:
1737                         src[0] = t_src(rp, fpi->SrcReg[0]);
1738                         emit_arith(rp, PFS_OP_FRC, dest, mask,
1739                                    src[0], undef, undef,
1740                                    flags);
1741                         break;
1742                 case OPCODE_KIL:
1743                         emit_tex(rp, fpi, R300_FPITX_OP_KIL);
1744                         break;
1745                 case OPCODE_LG2:
1746                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1747                         emit_arith(rp, PFS_OP_LG2, dest, mask,
1748                                    src[0], undef, undef,
1749                                    flags);
1750                         break;
1751                 case OPCODE_LIT:
1752                         src[0] = t_src(rp, fpi->SrcReg[0]);
1753                         emit_lit(rp, dest, mask, src[0], flags);
1754                         break;
1755                 case OPCODE_LRP:
1756                         src[0] = t_src(rp, fpi->SrcReg[0]);
1757                         src[1] = t_src(rp, fpi->SrcReg[1]);
1758                         src[2] = t_src(rp, fpi->SrcReg[2]);
1759                         /* result = tmp0tmp1 + (1 - tmp0)tmp2
1760                          *        = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1761                          *     MAD temp, -tmp0, tmp2, tmp2
1762                          *     MAD result, tmp0, tmp1, temp
1763                          */
1764                         temp[0] = get_temp_reg(rp);
1765                         emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1766                                    negate(keep(src[0])), keep(src[2]), src[2],
1767                                    0);
1768                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1769                                    src[0], src[1], temp[0],
1770                                    flags);
1771                         free_temp(rp, temp[0]);
1772                         break;
1773                 case OPCODE_MAD:
1774                         src[0] = t_src(rp, fpi->SrcReg[0]);
1775                         src[1] = t_src(rp, fpi->SrcReg[1]);
1776                         src[2] = t_src(rp, fpi->SrcReg[2]);
1777                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1778                                    src[0], src[1], src[2],
1779                                    flags);
1780                         break;
1781                 case OPCODE_MAX:
1782                         src[0] = t_src(rp, fpi->SrcReg[0]);
1783                         src[1] = t_src(rp, fpi->SrcReg[1]);
1784                         emit_arith(rp, PFS_OP_MAX, dest, mask,
1785                                    src[0], src[1], undef,
1786                                    flags);
1787                         break;
1788                 case OPCODE_MIN:
1789                         src[0] = t_src(rp, fpi->SrcReg[0]);
1790                         src[1] = t_src(rp, fpi->SrcReg[1]);
1791                         emit_arith(rp, PFS_OP_MIN, dest, mask,
1792                                    src[0], src[1], undef,
1793                                    flags);
1794                         break;
1795                 case OPCODE_MOV:
1796                 case OPCODE_SWZ:
1797                         src[0] = t_src(rp, fpi->SrcReg[0]);
1798                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1799                                    src[0], pfs_one, pfs_zero,
1800                                    flags);
1801                         break;
1802                 case OPCODE_MUL:
1803                         src[0] = t_src(rp, fpi->SrcReg[0]);
1804                         src[1] = t_src(rp, fpi->SrcReg[1]);
1805                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1806                                    src[0], src[1], pfs_zero,
1807                                    flags);
1808                         break;
1809                 case OPCODE_POW:
1810                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1811                         src[1] = t_scalar_src(rp, fpi->SrcReg[1]);
1812                         temp[0] = get_temp_reg(rp);
1813                         emit_arith(rp, PFS_OP_LG2, temp[0], WRITEMASK_W,
1814                                    src[0], undef, undef,
1815                                    0);
1816                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_W,
1817                                    temp[0], src[1], pfs_zero,
1818                                    0);
1819                         emit_arith(rp, PFS_OP_EX2, dest, fpi->DstReg.WriteMask,
1820                                    temp[0], undef, undef,
1821                                    0);
1822                         free_temp(rp, temp[0]);
1823                         break;
1824                 case OPCODE_RCP:
1825                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1826                         emit_arith(rp, PFS_OP_RCP, dest, mask,
1827                                    src[0], undef, undef,
1828                                    flags);
1829                         break;
1830                 case OPCODE_RSQ:
1831                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1832                         emit_arith(rp, PFS_OP_RSQ, dest, mask,
1833                                    absolute(src[0]), pfs_zero, pfs_zero,
1834                                    flags);
1835                         break;
1836                 case OPCODE_SCS:
1837                         /*
1838                          * scs using a parabola :
1839                          * scs(x):
1840                          *   result.x = sin(-abs(x)+0.5*PI)  (cos)
1841                          *   result.y = sin(x)               (sin)
1842                          *
1843                          */
1844                         temp[0] = get_temp_reg(rp);
1845                         temp[1] = get_temp_reg(rp);
1846                         const_sin[0] = emit_const4fv(rp, SinCosConsts[0]);
1847                         const_sin[1] = emit_const4fv(rp, SinCosConsts[1]);
1848                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1849
1850                         /* x = -abs(x)+0.5*PI */
1851                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Z,
1852                                    swizzle(const_sin[0], Z, Z, Z, Z), //PI
1853                                    pfs_half,
1854                                    negate(abs(swizzle(keep(src[0]), X, X, X, X))),
1855                                    0);
1856
1857                         /* C*x (sin) */
1858                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_W,
1859                                    swizzle(const_sin[0], Y, Y, Y, Y),
1860                                    swizzle(keep(src[0]), X, X, X, X),
1861                                    pfs_zero,
1862                                    0);
1863
1864                         /* B*x, C*x (cos) */
1865                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X | WRITEMASK_Y,
1866                                    swizzle(temp[0], Z, Z, Z, Z),
1867                                    const_sin[0],
1868                                    pfs_zero,
1869                                    0);
1870
1871                         /* B*x (sin) */
1872                         emit_arith(rp, PFS_OP_MAD, temp[1], WRITEMASK_W,
1873                                    swizzle(const_sin[0], X, X, X, X),
1874                                    keep(src[0]),
1875                                    pfs_zero,
1876                                    0);
1877
1878                         /* y = B*x + C*x*abs(x) (sin)*/
1879                         emit_arith(rp, PFS_OP_MAD, temp[1], WRITEMASK_Z,
1880                                    absolute(src[0]),
1881                                    swizzle(temp[0], W, W, W, W),
1882                                    swizzle(temp[1], W, W, W, W),
1883                                    0);
1884
1885                         /* y = B*x + C*x*abs(x) (cos)*/
1886                         emit_arith(rp, PFS_OP_MAD, temp[1], WRITEMASK_W,
1887                                    swizzle(temp[0], Y, Y, Y, Y),
1888                                    absolute(swizzle(temp[0], Z, Z, Z, Z)),
1889                                    swizzle(temp[0], X, X, X, X),
1890                                    0);
1891
1892                         /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1893                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X | WRITEMASK_Y,
1894                                    swizzle(temp[1], W, Z, Y, X),
1895                                    absolute(swizzle(temp[1], W, Z, Y, X)),
1896                                    negate(swizzle(temp[1], W, Z, Y, X)),
1897
1898                                    0);
1899
1900                         /* dest.xy = mad(temp.xy, P, temp2.wz) */
1901                         emit_arith(rp, PFS_OP_MAD, dest, mask & (WRITEMASK_X | WRITEMASK_Y),
1902                                    temp[0],
1903                                    swizzle(const_sin[0], W, W, W, W),
1904                                    swizzle(temp[1], W, Z, Y, X),
1905                                    flags);
1906
1907                         free_temp(rp, temp[0]);
1908                         free_temp(rp, temp[1]);
1909                         break;
1910                 case OPCODE_SGE:
1911                         src[0] = t_src(rp, fpi->SrcReg[0]);
1912                         src[1] = t_src(rp, fpi->SrcReg[1]);
1913                         temp[0] = get_temp_reg(rp);
1914                         /* temp = src0 - src1
1915                          * dest.c = (temp.c < 0.0) ? 0 : 1
1916                          */
1917                         emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1918                                    src[0], pfs_one, negate(src[1]),
1919                                    0);
1920                         emit_arith(rp, PFS_OP_CMP, dest, mask,
1921                                    pfs_one, pfs_zero, temp[0],
1922                                    0);
1923                         free_temp(rp, temp[0]);
1924                         break;
1925                 case OPCODE_SIN:
1926                         /*
1927                          *  using a parabola:
1928                          * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1929                          * extra precision is obtained by weighting against
1930                          * itself squared.
1931                          */
1932
1933                         temp[0] = get_temp_reg(rp);
1934                         const_sin[0] = emit_const4fv(rp, SinCosConsts[0]);
1935                         const_sin[1] = emit_const4fv(rp, SinCosConsts[1]);
1936                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1937
1938
1939                         /* do range reduction */
1940
1941                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1942                                    swizzle(keep(src[0]), X, X, X, X),
1943                                    swizzle(const_sin[1], Z, Z, Z, Z),
1944                                    pfs_half,
1945                                    0);
1946
1947                         emit_arith(rp, PFS_OP_FRC, temp[0], WRITEMASK_X,
1948                                    swizzle(temp[0], X, X, X, X),
1949                                    undef,
1950                                    undef,
1951                                    0);
1952
1953                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Z,
1954                                    swizzle(temp[0], X, X, X, X),
1955                                    swizzle(const_sin[1], W, W, W, W), //2*PI
1956                                    negate(swizzle(const_sin[0], Z, Z, Z, Z)), //PI
1957                                    0);
1958
1959                         /* SIN */
1960
1961                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X | WRITEMASK_Y,
1962                                    swizzle(temp[0], Z, Z, Z, Z),
1963                                    const_sin[0],
1964                                    pfs_zero,
1965                                    0);
1966
1967                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1968                                    swizzle(temp[0], Y, Y, Y, Y),
1969                                    absolute(swizzle(temp[0], Z, Z, Z, Z)),
1970                                    swizzle(temp[0], X, X, X, X),
1971                                    0);
1972
1973                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1974                                    swizzle(temp[0], X, X, X, X),
1975                                    absolute(swizzle(temp[0], X, X, X, X)),
1976                                    negate(swizzle(temp[0], X, X, X, X)),
1977                                    0);
1978
1979
1980                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1981                                    swizzle(temp[0], Y, Y, Y, Y),
1982                                    swizzle(const_sin[0], W, W, W, W),
1983                                    swizzle(temp[0], X, X, X, X),
1984                                    flags);
1985
1986                         free_temp(rp, temp[0]);
1987                         break;
1988                 case OPCODE_SLT:
1989                         src[0] = t_src(rp, fpi->SrcReg[0]);
1990                         src[1] = t_src(rp, fpi->SrcReg[1]);
1991                         temp[0] = get_temp_reg(rp);
1992                         /* temp = src0 - src1
1993                          * dest.c = (temp.c < 0.0) ? 1 : 0
1994                          */
1995                         emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1996                                    src[0], pfs_one, negate(src[1]),
1997                                    0);
1998                         emit_arith(rp, PFS_OP_CMP, dest, mask,
1999                                    pfs_zero, pfs_one, temp[0],
2000                                    0);
2001                         free_temp(rp, temp[0]);
2002                         break;
2003                 case OPCODE_SUB:
2004                         src[0] = t_src(rp, fpi->SrcReg[0]);
2005                         src[1] = t_src(rp, fpi->SrcReg[1]);
2006                         emit_arith(rp, PFS_OP_MAD, dest, mask,
2007                                    src[0], pfs_one, negate(src[1]),
2008                                    flags);
2009                         break;
2010                 case OPCODE_TEX:
2011                         emit_tex(rp, fpi, R300_FPITX_OP_TEX);
2012                         break;
2013                 case OPCODE_TXB:
2014                         emit_tex(rp, fpi, R300_FPITX_OP_TXB);
2015                         break;
2016                 case OPCODE_TXP:
2017                         emit_tex(rp, fpi, R300_FPITX_OP_TXP);
2018                         break;
2019                 case OPCODE_XPD: {
2020                         src[0] = t_src(rp, fpi->SrcReg[0]);
2021                         src[1] = t_src(rp, fpi->SrcReg[1]);
2022                         temp[0] = get_temp_reg(rp);
2023                         /* temp = src0.zxy * src1.yzx */
2024                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_XYZ,
2025                                    swizzle(keep(src[0]), Z, X, Y, W),
2026                                    swizzle(keep(src[1]), Y, Z, X, W),
2027                                    pfs_zero,
2028                                    0);
2029                         /* dest.xyz = src0.yzx * src1.zxy - temp
2030                          * dest.w       = undefined
2031                          * */
2032                         emit_arith(rp, PFS_OP_MAD, dest, mask & WRITEMASK_XYZ,
2033                                    swizzle(src[0], Y, Z, X, W),
2034                                    swizzle(src[1], Z, X, Y, W),
2035                                    negate(temp[0]),
2036                                    flags);
2037                         /* cleanup */
2038                         free_temp(rp, temp[0]);
2039                         break;
2040                 }
2041                 default:
2042                         ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
2043                         break;
2044                 }
2045
2046                 if (rp->error)
2047                         return GL_FALSE;
2048
2049         }
2050
2051         return GL_TRUE;
2052 }
2053
2054 static void insert_wpos(struct gl_program *prog)
2055 {
2056         GLint tokens[6] = { STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0, 0 };
2057         struct prog_instruction *fpi;
2058         GLuint window_index;
2059         int i = 0;
2060         GLuint tempregi = prog->NumTemporaries;
2061         /* should do something else if no temps left... */
2062         prog->NumTemporaries++;
2063
2064         fpi = _mesa_alloc_instructions (prog->NumInstructions + 3);
2065         _mesa_init_instructions (fpi, prog->NumInstructions + 3);
2066
2067         /* perspective divide */
2068         fpi[i].Opcode = OPCODE_RCP;
2069
2070         fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2071         fpi[i].DstReg.Index = tempregi;
2072         fpi[i].DstReg.WriteMask = WRITEMASK_W;
2073         fpi[i].DstReg.CondMask = COND_TR;
2074
2075         fpi[i].SrcReg[0].File = PROGRAM_INPUT;
2076         fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
2077         fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
2078         i++;
2079
2080         fpi[i].Opcode = OPCODE_MUL;
2081
2082         fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2083         fpi[i].DstReg.Index = tempregi;
2084         fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
2085         fpi[i].DstReg.CondMask = COND_TR;
2086
2087         fpi[i].SrcReg[0].File = PROGRAM_INPUT;
2088         fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
2089         fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
2090
2091         fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
2092         fpi[i].SrcReg[1].Index = tempregi;
2093         fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
2094         i++;
2095
2096         /* viewport transformation */
2097         window_index = _mesa_add_state_reference(prog->Parameters, tokens);
2098
2099         fpi[i].Opcode = OPCODE_MAD;
2100
2101         fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2102         fpi[i].DstReg.Index = tempregi;
2103         fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
2104         fpi[i].DstReg.CondMask = COND_TR;
2105
2106         fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
2107         fpi[i].SrcReg[0].Index = tempregi;
2108         fpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2109
2110         fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
2111         fpi[i].SrcReg[1].Index = window_index;
2112         fpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2113
2114         fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
2115         fpi[i].SrcReg[2].Index = window_index;
2116         fpi[i].SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2117         i++;
2118
2119         _mesa_copy_instructions (&fpi[i], prog->Instructions, prog->NumInstructions);
2120
2121         free(prog->Instructions);
2122
2123         prog->Instructions = fpi;
2124
2125         prog->NumInstructions += i;
2126         fpi = &prog->Instructions[prog->NumInstructions-1];
2127
2128         assert(fpi->Opcode == OPCODE_END);
2129
2130         for(fpi = &prog->Instructions[3]; fpi->Opcode != OPCODE_END; fpi++){
2131                 for(i=0; i<3; i++)
2132                     if( fpi->SrcReg[i].File == PROGRAM_INPUT &&
2133                         fpi->SrcReg[i].Index == FRAG_ATTRIB_WPOS ){
2134                             fpi->SrcReg[i].File = PROGRAM_TEMPORARY;
2135                             fpi->SrcReg[i].Index = tempregi;
2136                     }
2137         }
2138 }
2139
2140 /* - Init structures
2141  * - Determine what hwregs each input corresponds to
2142  */
2143 static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
2144 {
2145         struct r300_pfs_compile_state *cs = NULL;
2146         struct gl_fragment_program *mp = &rp->mesa_program;
2147         struct prog_instruction *fpi;
2148         GLuint InputsRead = mp->Base.InputsRead;
2149         GLuint temps_used = 0; /* for rp->temps[] */
2150         int i,j;
2151
2152         /* New compile, reset tracking data */
2153         rp->optimization = driQueryOptioni(&r300->radeon.optionCache, "fp_optimization");
2154         rp->translated = GL_FALSE;
2155         rp->error      = GL_FALSE;
2156         rp->cs = cs        = &(R300_CONTEXT(rp->ctx)->state.pfs_compile);
2157         rp->tex.length = 0;
2158         rp->cur_node   = 0;
2159         rp->first_node_has_tex = 0;
2160         rp->const_nr   = 0;
2161         rp->max_temp_idx = 0;
2162         rp->node[0].alu_end = -1;
2163         rp->node[0].tex_end = -1;
2164
2165         _mesa_memset(cs, 0, sizeof(*rp->cs));
2166         for (i=0;i<PFS_MAX_ALU_INST;i++) {
2167                 for (j=0;j<3;j++) {
2168                         cs->slot[i].vsrc[j] = SRC_CONST;
2169                         cs->slot[i].ssrc[j] = SRC_CONST;
2170                 }
2171         }
2172
2173         /* Work out what temps the Mesa inputs correspond to, this must match
2174          * what setup_rs_unit does, which shouldn't be a problem as rs_unit
2175          * configures itself based on the fragprog's InputsRead
2176          *
2177          * NOTE: this depends on get_hw_temp() allocating registers in order,
2178          * starting from register 0.
2179          */
2180
2181         /* Texcoords come first */
2182         for (i=0;i<rp->ctx->Const.MaxTextureUnits;i++) {
2183                 if (InputsRead & (FRAG_BIT_TEX0 << i)) {
2184                         cs->inputs[FRAG_ATTRIB_TEX0+i].refcount = 0;
2185                         cs->inputs[FRAG_ATTRIB_TEX0+i].reg = get_hw_temp(rp, 0);
2186                 }
2187         }
2188         InputsRead &= ~FRAG_BITS_TEX_ANY;
2189
2190         /* fragment position treated as a texcoord */
2191         if (InputsRead & FRAG_BIT_WPOS) {
2192                 cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
2193                 cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(rp, 0);
2194                 insert_wpos(&mp->Base);
2195         }
2196         InputsRead &= ~FRAG_BIT_WPOS;
2197
2198         /* Then primary colour */
2199         if (InputsRead & FRAG_BIT_COL0) {
2200                 cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
2201                 cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp, 0);
2202         }
2203         InputsRead &= ~FRAG_BIT_COL0;
2204
2205         /* Secondary color */
2206         if (InputsRead & FRAG_BIT_COL1) {
2207                 cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
2208                 cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(rp, 0);
2209         }
2210         InputsRead &= ~FRAG_BIT_COL1;
2211
2212         /* Anything else */
2213         if (InputsRead) {
2214                 WARN_ONCE("Don't know how to handle inputs 0x%x\n",
2215                           InputsRead);
2216                 /* force read from hwreg 0 for now */
2217                 for (i=0;i<32;i++)
2218                         if (InputsRead & (1<<i)) cs->inputs[i].reg = 0;
2219         }
2220
2221         /* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
2222          * That way, we can free up the reg when it's no longer needed
2223          */
2224         if (!mp->Base.Instructions) {
2225                 ERROR("No instructions found in program\n");
2226                 return;
2227         }
2228
2229         for (fpi=mp->Base.Instructions;fpi->Opcode != OPCODE_END; fpi++) {
2230                 int idx;
2231
2232                 for (i=0;i<3;i++) {
2233                         idx = fpi->SrcReg[i].Index;
2234                         switch (fpi->SrcReg[i].File) {
2235                         case PROGRAM_TEMPORARY:
2236                                 if (!(temps_used & (1<<idx))) {
2237                                         cs->temps[idx].reg = -1;
2238                                         cs->temps[idx].refcount = 1;
2239                                         temps_used |= (1 << idx);
2240                                 } else
2241                                         cs->temps[idx].refcount++;
2242                                 break;
2243                         case PROGRAM_INPUT:
2244                                 cs->inputs[idx].refcount++;
2245                                 break;
2246                         default: break;
2247                         }
2248                 }
2249
2250                 idx = fpi->DstReg.Index;
2251                 if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
2252                         if (!(temps_used & (1<<idx))) {
2253                                 cs->temps[idx].reg = -1;
2254                                 cs->temps[idx].refcount = 1;
2255                                 temps_used |= (1 << idx);
2256                         } else
2257                                 cs->temps[idx].refcount++;
2258                 }
2259         }
2260         cs->temp_in_use = temps_used;
2261 }
2262
2263 static void update_params(struct r300_fragment_program *rp)
2264 {
2265         struct gl_fragment_program *mp = &rp->mesa_program;
2266
2267         /* Ask Mesa nicely to fill in ParameterValues for us */
2268         if (mp->Base.Parameters)
2269                 _mesa_load_state_parameters(rp->ctx, mp->Base.Parameters);
2270 }
2271
2272 void r300_translate_fragment_shader(r300ContextPtr r300, struct r300_fragment_program *rp)
2273 {
2274         struct r300_pfs_compile_state *cs = NULL;
2275
2276         if (!rp->translated) {
2277
2278                 init_program(r300, rp);
2279                 cs = rp->cs;
2280
2281                 if (parse_program(rp) == GL_FALSE) {
2282                         dump_program(rp);
2283                         return;
2284                 }
2285
2286                 /* Finish off */
2287                 rp->node[rp->cur_node].alu_end =
2288                                 cs->nrslots - rp->node[rp->cur_node].alu_offset - 1;
2289                 if (rp->node[rp->cur_node].tex_end < 0)
2290                         rp->node[rp->cur_node].tex_end = 0;
2291                 rp->alu_offset = 0;
2292                 rp->alu_end    = cs->nrslots - 1;
2293                 rp->tex_offset = 0;
2294                 rp->tex_end    = rp->tex.length ? rp->tex.length - 1 : 0;
2295                 assert(rp->node[rp->cur_node].alu_end >= 0);
2296                 assert(rp->alu_end >= 0);
2297
2298                 rp->translated = GL_TRUE;
2299                 if (RADEON_DEBUG & DEBUG_PIXEL) dump_program(rp);
2300                 r300UpdateStateParameters(rp->ctx, _NEW_PROGRAM);
2301         }
2302
2303         update_params(rp);
2304 }
2305
2306 /* just some random things... */
2307 static void dump_program(struct r300_fragment_program *rp)
2308 {
2309         int n, i, j;
2310         static int pc = 0;
2311
2312         fprintf(stderr, "pc=%d*************************************\n", pc++);
2313
2314         fprintf(stderr, "Mesa program:\n");
2315         fprintf(stderr, "-------------\n");
2316                 _mesa_print_program(&rp->mesa_program.Base);
2317         fflush(stdout);
2318
2319         fprintf(stderr, "Hardware program\n");
2320         fprintf(stderr, "----------------\n");
2321
2322         for (n = 0; n < (rp->cur_node+1); n++) {
2323                 fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "\
2324                         "alu_end: %d, tex_end: %d\n", n,
2325                         rp->node[n].alu_offset,
2326                         rp->node[n].tex_offset,
2327                         rp->node[n].alu_end,
2328                         rp->node[n].tex_end);
2329
2330                 if (rp->tex.length) {
2331                         fprintf(stderr, "  TEX:\n");
2332                         for(i = rp->node[n].tex_offset; i <= rp->node[n].tex_offset+rp->node[n].tex_end; ++i) {
2333                                 const char* instr;
2334
2335                                 switch((rp->tex.inst[i] >> R300_FPITX_OPCODE_SHIFT) & 15) {
2336                                 case R300_FPITX_OP_TEX:
2337                                         instr = "TEX";
2338                                         break;
2339                                 case R300_FPITX_OP_KIL:
2340                                         instr = "KIL";
2341                                         break;
2342                                 case R300_FPITX_OP_TXP:
2343                                         instr = "TXP";
2344                                         break;
2345                                 case R300_FPITX_OP_TXB:
2346                                         instr = "TXB";
2347                                         break;
2348                                 default:
2349                                         instr = "UNKNOWN";
2350                                 }
2351
2352                                 fprintf(stderr, "    %s t%i, %c%i, texture[%i]   (%08x)\n",
2353                                                 instr,
2354                                                 (rp->tex.inst[i] >> R300_FPITX_DST_SHIFT) & 31,
2355                                                 (rp->tex.inst[i] & R300_FPITX_SRC_CONST) ? 'c': 't',
2356                                                 (rp->tex.inst[i] >> R300_FPITX_SRC_SHIFT) & 31,
2357                                                 (rp->tex.inst[i] & R300_FPITX_IMAGE_MASK) >> R300_FPITX_IMAGE_SHIFT,
2358                                                 rp->tex.inst[i]);
2359                         }
2360                 }
2361
2362                 for(i = rp->node[n].alu_offset; i <= rp->node[n].alu_offset+rp->node[n].alu_end; ++i) {
2363                         char srcc[3][10], dstc[20];
2364                         char srca[3][10], dsta[20];
2365                         char argc[3][20];
2366                         char arga[3][20];
2367                         char flags[5], tmp[10];
2368
2369                         for(j = 0; j < 3; ++j) {
2370                                 int regc = rp->alu.inst[i].inst1 >> (j*6);
2371                                 int rega = rp->alu.inst[i].inst3 >> (j*6);
2372
2373                                 sprintf(srcc[j], "%c%i", (regc & 32) ? 'c' : 't', regc & 31);
2374                                 sprintf(srca[j], "%c%i", (rega & 32) ? 'c' : 't', rega & 31);
2375                         }
2376
2377                         dstc[0] = 0;
2378                         sprintf(flags, "%s%s%s",
2379                                         (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_X) ? "x" : "",
2380                                         (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Y) ? "y" : "",
2381                                         (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_REG_Z) ? "z" : "");
2382                         if (flags[0] != 0) {
2383                                 sprintf(dstc, "t%i.%s ",
2384                                                 (rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
2385                                                 flags);
2386                         }
2387                         sprintf(flags, "%s%s%s",
2388                                         (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_X) ? "x" : "",
2389                                         (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? "y" : "",
2390                                         (rp->alu.inst[i].inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? "z" : "");
2391                         if (flags[0] != 0) {
2392                                 sprintf(tmp, "o%i.%s",
2393                                                 (rp->alu.inst[i].inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
2394                                                 flags);
2395                                 strcat(dstc, tmp);
2396                         }
2397
2398                         dsta[0] = 0;
2399                         if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) {
2400                                 sprintf(dsta, "t%i.w ", (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
2401                         }
2402                         if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) {
2403                                 sprintf(tmp, "o%i.w ", (rp->alu.inst[i].inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
2404                                 strcat(dsta, tmp);
2405                         }
2406                         if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) {
2407                                 strcat(dsta, "Z");
2408                         }
2409
2410                         fprintf(stderr, "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
2411                                         "       w: %3s %3s %3s -> %-20s (%08x)\n",
2412                                         i,
2413                                         srcc[0], srcc[1], srcc[2], dstc, rp->alu.inst[i].inst1,
2414                                         srca[0], srca[1], srca[2], dsta, rp->alu.inst[i].inst3);
2415
2416                         for(j = 0; j < 3; ++j) {
2417                                 int regc = rp->alu.inst[i].inst0 >> (j*7);
2418                                 int rega = rp->alu.inst[i].inst2 >> (j*7);
2419                                 int d;
2420                                 char buf[20];
2421
2422                                 d = regc & 31;
2423                                 if (d < 12) {
2424                                         switch(d % 4) {
2425                                                 case R300_FPI0_ARGC_SRC0C_XYZ:
2426                                                         sprintf(buf, "%s.xyz", srcc[d / 4]);
2427                                                         break;
2428                                                 case R300_FPI0_ARGC_SRC0C_XXX:
2429                                                         sprintf(buf, "%s.xxx", srcc[d / 4]);
2430                                                         break;
2431                                                 case R300_FPI0_ARGC_SRC0C_YYY:
2432                                                         sprintf(buf, "%s.yyy", srcc[d / 4]);
2433                                                         break;
2434                                                 case R300_FPI0_ARGC_SRC0C_ZZZ:
2435                                                         sprintf(buf, "%s.zzz", srcc[d / 4]);
2436                                                         break;
2437                                         }
2438                                 } else if (d < 15) {
2439                                         sprintf(buf, "%s.www", srca[d-12]);
2440                                 } else if (d == 20) {
2441                                         sprintf(buf, "0.0");
2442                                 } else if (d == 21) {
2443                                         sprintf(buf, "1.0");
2444                                 } else if (d == 22) {
2445                                         sprintf(buf, "0.5");
2446                                 } else if (d >= 23 && d < 32) {
2447                                         d -= 23;
2448                                         switch(d/3) {
2449                                                 case 0:
2450                                                         sprintf(buf, "%s.yzx", srcc[d % 3]);
2451                                                         break;
2452                                                 case 1:
2453                                                         sprintf(buf, "%s.zxy", srcc[d % 3]);
2454                                                         break;
2455                                                 case 2:
2456                                                         sprintf(buf, "%s.Wzy", srcc[d % 3]);
2457                                                         break;
2458                                         }
2459                                 } else {
2460                                         sprintf(buf, "%i", d);
2461                                 }
2462
2463                                 sprintf(argc[j], "%s%s%s%s",
2464                                                 (regc & 32) ? "-" : "",
2465                                                 (regc & 64) ? "|" : "",
2466                                                 buf,
2467                                                 (regc & 64) ? "|" : "");
2468
2469                                 d = rega & 31;
2470                                 if (d < 9) {
2471                                         sprintf(buf, "%s.%c", srcc[d / 3], 'x' + (char)(d%3));
2472                                 } else if (d < 12) {
2473                                         sprintf(buf, "%s.w", srca[d-9]);
2474                                 } else if (d == 16) {
2475                                         sprintf(buf, "0.0");
2476                                 } else if (d == 17) {
2477                                         sprintf(buf, "1.0");
2478                                 } else if (d == 18) {
2479                                         sprintf(buf, "0.5");
2480                                 } else {
2481                                         sprintf(buf, "%i", d);
2482                                 }
2483
2484                                 sprintf(arga[j], "%s%s%s%s",
2485                                                 (rega & 32) ? "-" : "",
2486                                                 (rega & 64) ? "|" : "",
2487                                                 buf,
2488                                                 (rega & 64) ? "|" : "");
2489                         }
2490
2491                         fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
2492                                         "       w: %8s %8s %8s    op: %08x\n",
2493                                         argc[0], argc[1], argc[2], rp->alu.inst[i].inst0,
2494                                         arga[0], arga[1], arga[2], rp->alu.inst[i].inst2);
2495                 }
2496         }
2497 }