src/mesa/drivers/dri/r300/r300_fragprog.c

   1 /*
   2  * Copyright (C) 2005 Ben Skeggs.
   3  *
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining
   7  * a copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sublicense, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial
  16  * portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  */
  27
  28 /*
  29  * Authors:
  30  *   Ben Skeggs <darktama@iinet.net.au>
  31  *   Jerome Glisse <j.glisse@gmail.com>
  32  */
  33
  34 /*TODO'S
  35  *
  36  * - Depth write, WPOS/FOGC inputs
  37  * - FogOption
  38  * - Verify results of opcodes for accuracy, I've only checked them
  39  *   in specific cases.
  40  * - and more...
  41  */
  42
  43 #include "glheader.h"
  44 #include "macros.h"
  45 #include "enums.h"
  46 #include "shader/prog_instruction.h"
  47 #include "shader/prog_parameter.h"
  48 #include "shader/prog_print.h"
  49
  50 #include "r300_context.h"
  51 #include "r300_fragprog.h"
  52 #include "r300_reg.h"
  53 #include "r300_state.h"
  54
  55 /*
  56  * Usefull macros and values
  57  */
  58 #define ERROR(fmt, args...) do {                        \
  59                 fprintf(stderr, "%s::%s(): " fmt "\n",  \
  60                         __FILE__, __func__, ##args);    \
  61                 rp->error = GL_TRUE;                    \
  62         } while(0)
  63
  64 #define PFS_INVAL 0xFFFFFFFF
  65 #define COMPILE_STATE struct r300_pfs_compile_state *cs = rp->cs
  66
  67 #define SWIZZLE_XYZ             0
  68 #define SWIZZLE_XXX             1
  69 #define SWIZZLE_YYY             2
  70 #define SWIZZLE_ZZZ             3
  71 #define SWIZZLE_WWW             4
  72 #define SWIZZLE_YZX             5
  73 #define SWIZZLE_ZXY             6
  74 #define SWIZZLE_WZY             7
  75 #define SWIZZLE_111             8
  76 #define SWIZZLE_000             9
  77 #define SWIZZLE_HHH             10
  78
  79 #define swizzle(r, x, y, z, w) do_swizzle(rp, r,                \
  80                                           ((SWIZZLE_##x<<0)|    \
  81                                            (SWIZZLE_##y<<3)|    \
  82                                            (SWIZZLE_##z<<6)|    \
  83                                            (SWIZZLE_##w<<9)),   \
  84                                           0)
  85
  86 #define REG_TYPE_INPUT          0
  87 #define REG_TYPE_OUTPUT         1
  88 #define REG_TYPE_TEMP           2
  89 #define REG_TYPE_CONST          3
  90
  91 #define REG_TYPE_SHIFT          0
  92 #define REG_INDEX_SHIFT         2
  93 #define REG_VSWZ_SHIFT          8
  94 #define REG_SSWZ_SHIFT          13
  95 #define REG_NEGV_SHIFT          18
  96 #define REG_NEGS_SHIFT          19
  97 #define REG_ABS_SHIFT           20
  98 #define REG_NO_USE_SHIFT        21      // Hack for refcounting
  99 #define REG_VALID_SHIFT         22      // Does the register contain a defined value?
 100 #define REG_BUILTIN_SHIFT   23  // Is it a builtin (like all zero/all one)?
 101
 102 #define REG_TYPE_MASK           (0x03 << REG_TYPE_SHIFT)
 103 #define REG_INDEX_MASK          (0x3F << REG_INDEX_SHIFT)
 104 #define REG_VSWZ_MASK           (0x1F << REG_VSWZ_SHIFT)
 105 #define REG_SSWZ_MASK           (0x1F << REG_SSWZ_SHIFT)
 106 #define REG_NEGV_MASK           (0x01 << REG_NEGV_SHIFT)
 107 #define REG_NEGS_MASK           (0x01 << REG_NEGS_SHIFT)
 108 #define REG_ABS_MASK            (0x01 << REG_ABS_SHIFT)
 109 #define REG_NO_USE_MASK         (0x01 << REG_NO_USE_SHIFT)
 110 #define REG_VALID_MASK          (0x01 << REG_VALID_SHIFT)
 111 #define REG_BUILTIN_MASK        (0x01 << REG_BUILTIN_SHIFT)
 112
 113 #define REG(type, index, vswz, sswz, nouse, valid, builtin)     \
 114         (((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) |                   \
 115          ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) |                \
 116          ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |              \
 117          ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) |                \
 118          ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |  \
 119          ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |                   \
 120          ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
 121 #define REG_GET_TYPE(reg)                                               \
 122         ((reg & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
 123 #define REG_GET_INDEX(reg)                                              \
 124         ((reg & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
 125 #define REG_GET_VSWZ(reg)                                               \
 126         ((reg & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
 127 #define REG_GET_SSWZ(reg)                                               \
 128         ((reg & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
 129 #define REG_GET_NO_USE(reg)                                             \
 130         ((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
 131 #define REG_GET_VALID(reg)                                              \
 132         ((reg & REG_VALID_MASK) >> REG_VALID_SHIFT)
 133 #define REG_GET_BUILTIN(reg)                                            \
 134         ((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
 135 #define REG_SET_TYPE(reg, type)                                         \
 136         reg = ((reg & ~REG_TYPE_MASK) |                                 \
 137                ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK))
 138 #define REG_SET_INDEX(reg, index)                                       \
 139         reg = ((reg & ~REG_INDEX_MASK) |                                \
 140                ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK))
 141 #define REG_SET_VSWZ(reg, vswz)                                         \
 142         reg = ((reg & ~REG_VSWZ_MASK) |                                 \
 143                ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK))
 144 #define REG_SET_SSWZ(reg, sswz)                                         \
 145         reg = ((reg & ~REG_SSWZ_MASK) |                                 \
 146                ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
 147 #define REG_SET_NO_USE(reg, nouse)                                      \
 148         reg = ((reg & ~REG_NO_USE_MASK) |                               \
 149                ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK))
 150 #define REG_SET_VALID(reg, valid)                                       \
 151         reg = ((reg & ~REG_VALID_MASK) |                                \
 152                ((valid << REG_VALID_SHIFT) & REG_VALID_MASK))
 153 #define REG_SET_BUILTIN(reg, builtin)                                   \
 154         reg = ((reg & ~REG_BUILTIN_MASK) |                              \
 155                ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
 156 #define REG_ABS(reg)                                                    \
 157         reg = (reg | REG_ABS_MASK)
 158 #define REG_NEGV(reg)                                                   \
 159         reg = (reg | REG_NEGV_MASK)
 160 #define REG_NEGS(reg)                                                   \
 161         reg = (reg | REG_NEGS_MASK)
 162
 163 /*
 164  * Datas structures for fragment program generation
 165  */
 166
 167 /* description of r300 native hw instructions */
 168 static const struct {
 169         const char *name;
 170         int argc;
 171         int v_op;
 172         int s_op;
 173 } r300_fpop[] = {
 174         /* *INDENT-OFF* */
 175         {"MAD", 3, R300_FPI0_OUTC_MAD, R300_FPI2_OUTA_MAD},
 176         {"DP3", 2, R300_FPI0_OUTC_DP3, R300_FPI2_OUTA_DP4},
 177         {"DP4", 2, R300_FPI0_OUTC_DP4, R300_FPI2_OUTA_DP4},
 178         {"MIN", 2, R300_FPI0_OUTC_MIN, R300_FPI2_OUTA_MIN},
 179         {"MAX", 2, R300_FPI0_OUTC_MAX, R300_FPI2_OUTA_MAX},
 180         {"CMP", 3, R300_FPI0_OUTC_CMP, R300_FPI2_OUTA_CMP},
 181         {"FRC", 1, R300_FPI0_OUTC_FRC, R300_FPI2_OUTA_FRC},
 182         {"EX2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_EX2},
 183         {"LG2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_LG2},
 184         {"RCP", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RCP},
 185         {"RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RSQ},
 186         {"REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA, PFS_INVAL},
 187         {"CMPH", 3, R300_FPI0_OUTC_CMPH, PFS_INVAL},
 188         /* *INDENT-ON* */
 189 };
 190
 191 /* vector swizzles r300 can support natively, with a couple of
 192  * cases we handle specially
 193  *
 194  * REG_VSWZ/REG_SSWZ is an index into this table
 195  */
 196
 197 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
 198 #define SWIZZLE_HALF 6
 199
 200 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
 201                                           SWIZZLE_##y, \
 202                                           SWIZZLE_##z, \
 203                                           SWIZZLE_ZERO))
 204 /* native swizzles */
 205 static const struct r300_pfs_swizzle {
 206         GLuint hash;            /* swizzle value this matches */
 207         GLuint base;            /* base value for hw swizzle */
 208         GLuint stride;          /* difference in base between arg0/1/2 */
 209         GLuint flags;
 210 } v_swiz[] = {
 211         /* *INDENT-OFF* */
 212         {MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
 213         {MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
 214         {MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
 215         {MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
 216         {MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
 217         {MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
 218         {MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
 219         {MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
 220         {MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0},
 221         {MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0},
 222         {MAKE_SWZ3(HALF, HALF, HALF), R300_FPI0_ARGC_HALF, 0, 0},
 223         {PFS_INVAL, 0, 0, 0},
 224         /* *INDENT-ON* */
 225 };
 226
 227 /* used during matching of non-native swizzles */
 228 #define SWZ_X_MASK (7 << 0)
 229 #define SWZ_Y_MASK (7 << 3)
 230 #define SWZ_Z_MASK (7 << 6)
 231 #define SWZ_W_MASK (7 << 9)
 232 static const struct {
 233         GLuint hash;            /* used to mask matching swizzle components */
 234         int mask;               /* actual outmask */
 235         int count;              /* count of components matched */
 236 } s_mask[] = {
 237         /* *INDENT-OFF* */
 238         {SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},
 239         {SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
 240         {SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
 241         {SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
 242         {SWZ_X_MASK, 1, 1},
 243         {SWZ_Y_MASK, 2, 1},
 244         {SWZ_Z_MASK, 4, 1},
 245         {PFS_INVAL, PFS_INVAL, PFS_INVAL}
 246         /* *INDENT-ON* */
 247 };
 248
 249 static const struct {
 250         int base;               /* hw value of swizzle */
 251         int stride;             /* difference between SRC0/1/2 */
 252         GLuint flags;
 253 } s_swiz[] = {
 254         /* *INDENT-OFF* */
 255         {R300_FPI2_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
 256         {R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
 257         {R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
 258         {R300_FPI2_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
 259         {R300_FPI2_ARGA_ZERO, 0, 0},
 260         {R300_FPI2_ARGA_ONE, 0, 0},
 261         {R300_FPI2_ARGA_HALF, 0, 0}
 262         /* *INDENT-ON* */
 263 };
 264
 265 /* boiler-plate reg, for convenience */
 266 static const GLuint undef = REG(REG_TYPE_TEMP,
 267                                 0,
 268                                 SWIZZLE_XYZ,
 269                                 SWIZZLE_W,
 270                                 GL_FALSE,
 271                                 GL_FALSE,
 272                                 GL_FALSE);
 273
 274 /* constant one source */
 275 static const GLuint pfs_one = REG(REG_TYPE_CONST,
 276                                   0,
 277                                   SWIZZLE_111,
 278                                   SWIZZLE_ONE,
 279                                   GL_FALSE,
 280                                   GL_TRUE,
 281                                   GL_TRUE);
 282
 283 /* constant half source */
 284 static const GLuint pfs_half = REG(REG_TYPE_CONST,
 285                                    0,
 286                                    SWIZZLE_HHH,
 287                                    SWIZZLE_HALF,
 288                                    GL_FALSE,
 289                                    GL_TRUE,
 290                                    GL_TRUE);
 291
 292 /* constant zero source */
 293 static const GLuint pfs_zero = REG(REG_TYPE_CONST,
 294                                    0,
 295                                    SWIZZLE_000,
 296                                    SWIZZLE_ZERO,
 297                                    GL_FALSE,
 298                                    GL_TRUE,
 299                                    GL_TRUE);
 300
 301 /*
 302  * Common functions prototypes
 303  */
 304 static void dump_program(struct r300_fragment_program *rp);
 305 static void emit_arith(struct r300_fragment_program *rp, int op,
 306                        GLuint dest, int mask,
 307                        GLuint src0, GLuint src1, GLuint src2, int flags);
 308
 309 /**
 310  * Get an R300 temporary that can be written to in the given slot.
 311  */
 312 static int get_hw_temp(struct r300_fragment_program *rp, int slot)
 313 {
 314         COMPILE_STATE;
 315         int r;
 316
 317         for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
 318                 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
 319                         break;
 320         }
 321
 322         if (r >= PFS_NUM_TEMP_REGS) {
 323                 ERROR("Out of hardware temps\n");
 324                 return 0;
 325         }
 326         // Reserved is used to avoid the following scenario:
 327         //  R300 temporary X is first assigned to Mesa temporary Y during vector ops
 328         //  R300 temporary X is then assigned to Mesa temporary Z for further vector ops
 329         //  Then scalar ops on Mesa temporary Z are emitted and move back in time
 330         //  to overwrite the value of temporary Y.
 331         // End scenario.
 332         cs->hwtemps[r].reserved = cs->hwtemps[r].free;
 333         cs->hwtemps[r].free = -1;
 334
 335         // Reset to some value that won't mess things up when the user
 336         // tries to read from a temporary that hasn't been assigned a value yet.
 337         // In the normal case, vector_valid and scalar_valid should be set to
 338         // a sane value by the first emit that writes to this temporary.
 339         cs->hwtemps[r].vector_valid = 0;
 340         cs->hwtemps[r].scalar_valid = 0;
 341
 342         if (r > rp->max_temp_idx)
 343                 rp->max_temp_idx = r;
 344
 345         return r;
 346 }
 347
 348 /**
 349  * Get an R300 temporary that will act as a TEX destination register.
 350  */
 351 static int get_hw_temp_tex(struct r300_fragment_program *rp)
 352 {
 353         COMPILE_STATE;
 354         int r;
 355
 356         for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
 357                 if (cs->used_in_node & (1 << r))
 358                         continue;
 359
 360                 // Note: Be very careful here
 361                 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
 362                         break;
 363         }
 364
 365         if (r >= PFS_NUM_TEMP_REGS)
 366                 return get_hw_temp(rp, 0);      /* Will cause an indirection */
 367
 368         cs->hwtemps[r].reserved = cs->hwtemps[r].free;
 369         cs->hwtemps[r].free = -1;
 370
 371         // Reset to some value that won't mess things up when the user
 372         // tries to read from a temporary that hasn't been assigned a value yet.
 373         // In the normal case, vector_valid and scalar_valid should be set to
 374         // a sane value by the first emit that writes to this temporary.
 375         cs->hwtemps[r].vector_valid = cs->nrslots;
 376         cs->hwtemps[r].scalar_valid = cs->nrslots;
 377
 378         if (r > rp->max_temp_idx)
 379                 rp->max_temp_idx = r;
 380
 381         return r;
 382 }
 383
 384 /**
 385  * Mark the given hardware register as free.
 386  */
 387 static void free_hw_temp(struct r300_fragment_program *rp, int idx)
 388 {
 389         COMPILE_STATE;
 390
 391         // Be very careful here. Consider sequences like
 392         //  MAD r0, r1,r2,r3
 393         //  TEX r4, ...
 394         // The TEX instruction may be moved in front of the MAD instruction
 395         // due to the way nodes work. We don't want to alias r1 and r4 in
 396         // this case.
 397         // I'm certain the register allocation could be further sanitized,
 398         // but it's tricky because of stuff that can happen inside emit_tex
 399         // and emit_arith.
 400         cs->hwtemps[idx].free = cs->nrslots + 1;
 401 }
 402
 403 /**
 404  * Create a new Mesa temporary register.
 405  */
 406 static GLuint get_temp_reg(struct r300_fragment_program *rp)
 407 {
 408         COMPILE_STATE;
 409         GLuint r = undef;
 410         GLuint index;
 411
 412         index = ffs(~cs->temp_in_use);
 413         if (!index) {
 414                 ERROR("Out of program temps\n");
 415                 return r;
 416         }
 417
 418         cs->temp_in_use |= (1 << --index);
 419         cs->temps[index].refcount = 0xFFFFFFFF;
 420         cs->temps[index].reg = -1;
 421
 422         REG_SET_TYPE(r, REG_TYPE_TEMP);
 423         REG_SET_INDEX(r, index);
 424         REG_SET_VALID(r, GL_TRUE);
 425         return r;
 426 }
 427
 428 /**
 429  * Create a new Mesa temporary register that will act as the destination
 430  * register for a texture read.
 431  */
 432 static GLuint get_temp_reg_tex(struct r300_fragment_program *rp)
 433 {
 434         COMPILE_STATE;
 435         GLuint r = undef;
 436         GLuint index;
 437
 438         index = ffs(~cs->temp_in_use);
 439         if (!index) {
 440                 ERROR("Out of program temps\n");
 441                 return r;
 442         }
 443
 444         cs->temp_in_use |= (1 << --index);
 445         cs->temps[index].refcount = 0xFFFFFFFF;
 446         cs->temps[index].reg = get_hw_temp_tex(rp);
 447
 448         REG_SET_TYPE(r, REG_TYPE_TEMP);
 449         REG_SET_INDEX(r, index);
 450         REG_SET_VALID(r, GL_TRUE);
 451         return r;
 452 }
 453
 454 /**
 455  * Free a Mesa temporary and the associated R300 temporary.
 456  */
 457 static void free_temp(struct r300_fragment_program *rp, GLuint r)
 458 {
 459         COMPILE_STATE;
 460         GLuint index = REG_GET_INDEX(r);
 461
 462         if (!(cs->temp_in_use & (1 << index)))
 463                 return;
 464
 465         if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
 466                 free_hw_temp(rp, cs->temps[index].reg);
 467                 cs->temps[index].reg = -1;
 468                 cs->temp_in_use &= ~(1 << index);
 469         } else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
 470                 free_hw_temp(rp, cs->inputs[index].reg);
 471                 cs->inputs[index].reg = -1;
 472         }
 473 }
 474
 475 /**
 476  * Emit a hardware constant/parameter.
 477  *
 478  * \p cp Stable pointer to an array of 4 floats.
 479  *  The pointer must be stable in the sense that it remains to be valid
 480  *  and hold the contents of the constant/parameter throughout the lifetime
 481  *  of the fragment program (actually, up until the next time the fragment
 482  *  program is translated).
 483  */
 484 static GLuint emit_const4fv(struct r300_fragment_program *rp,
 485                             const GLfloat * cp)
 486 {
 487         GLuint reg = undef;
 488         int index;
 489
 490         for (index = 0; index < rp->const_nr; ++index) {
 491                 if (rp->constant[index] == cp)
 492                         break;
 493         }
 494
 495         if (index >= rp->const_nr) {
 496                 if (index >= PFS_NUM_CONST_REGS) {
 497                         ERROR("Out of hw constants!\n");
 498                         return reg;
 499                 }
 500
 501                 rp->const_nr++;
 502                 rp->constant[index] = cp;
 503         }
 504
 505         REG_SET_TYPE(reg, REG_TYPE_CONST);
 506         REG_SET_INDEX(reg, index);
 507         REG_SET_VALID(reg, GL_TRUE);
 508         return reg;
 509 }
 510
 511 static inline GLuint negate(GLuint r)
 512 {
 513         REG_NEGS(r);
 514         REG_NEGV(r);
 515         return r;
 516 }
 517
 518 /* Hack, to prevent clobbering sources used multiple times when
 519  * emulating non-native instructions
 520  */
 521 static inline GLuint keep(GLuint r)
 522 {
 523         REG_SET_NO_USE(r, GL_TRUE);
 524         return r;
 525 }
 526
 527 static inline GLuint absolute(GLuint r)
 528 {
 529         REG_ABS(r);
 530         return r;
 531 }
 532
 533 static int swz_native(struct r300_fragment_program *rp,
 534                       GLuint src, GLuint * r, GLuint arbneg)
 535 {
 536         /* Native swizzle, handle negation */
 537         src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
 538
 539         if ((arbneg & 0x7) == 0x0) {
 540                 src = src & ~REG_NEGV_MASK;
 541                 *r = src;
 542         } else if ((arbneg & 0x7) == 0x7) {
 543                 src |= REG_NEGV_MASK;
 544                 *r = src;
 545         } else {
 546                 if (!REG_GET_VALID(*r))
 547                         *r = get_temp_reg(rp);
 548                 src |= REG_NEGV_MASK;
 549                 emit_arith(rp,
 550                            PFS_OP_MAD,
 551                            *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
 552                 src = src & ~REG_NEGV_MASK;
 553                 emit_arith(rp,
 554                            PFS_OP_MAD,
 555                            *r,
 556                            (arbneg ^ 0x7) | WRITEMASK_W,
 557                            src, pfs_one, pfs_zero, 0);
 558         }
 559
 560         return 3;
 561 }
 562
 563 static int swz_emit_partial(struct r300_fragment_program *rp,
 564                             GLuint src,
 565                             GLuint * r, int mask, int mc, GLuint arbneg)
 566 {
 567         GLuint tmp;
 568         GLuint wmask = 0;
 569
 570         if (!REG_GET_VALID(*r))
 571                 *r = get_temp_reg(rp);
 572
 573         /* A partial match, VSWZ/mask define what parts of the
 574          * desired swizzle we match
 575          */
 576         if (mc + s_mask[mask].count == 3) {
 577                 wmask = WRITEMASK_W;
 578                 src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
 579         }
 580
 581         tmp = arbneg & s_mask[mask].mask;
 582         if (tmp) {
 583                 tmp = tmp ^ s_mask[mask].mask;
 584                 if (tmp) {
 585                         emit_arith(rp,
 586                                    PFS_OP_MAD,
 587                                    *r,
 588                                    arbneg & s_mask[mask].mask,
 589                                    keep(src) | REG_NEGV_MASK,
 590                                    pfs_one, pfs_zero, 0);
 591                         if (!wmask) {
 592                                 REG_SET_NO_USE(src, GL_TRUE);
 593                         } else {
 594                                 REG_SET_NO_USE(src, GL_FALSE);
 595                         }
 596                         emit_arith(rp,
 597                                    PFS_OP_MAD,
 598                                    *r, tmp | wmask, src, pfs_one, pfs_zero, 0);
 599                 } else {
 600                         if (!wmask) {
 601                                 REG_SET_NO_USE(src, GL_TRUE);
 602                         } else {
 603                                 REG_SET_NO_USE(src, GL_FALSE);
 604                         }
 605                         emit_arith(rp,
 606                                    PFS_OP_MAD,
 607                                    *r,
 608                                    (arbneg & s_mask[mask].mask) | wmask,
 609                                    src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
 610                 }
 611         } else {
 612                 if (!wmask) {
 613                         REG_SET_NO_USE(src, GL_TRUE);
 614                 } else {
 615                         REG_SET_NO_USE(src, GL_FALSE);
 616                 }
 617                 emit_arith(rp, PFS_OP_MAD,
 618                            *r,
 619                            s_mask[mask].mask | wmask,
 620                            src, pfs_one, pfs_zero, 0);
 621         }
 622
 623         return s_mask[mask].count;
 624 }
 625
 626 static GLuint do_swizzle(struct r300_fragment_program *rp,
 627                          GLuint src, GLuint arbswz, GLuint arbneg)
 628 {
 629         GLuint r = undef;
 630         GLuint vswz;
 631         int c_mask = 0;
 632         int v_match = 0;
 633
 634         /* If swizzling from something without an XYZW native swizzle,
 635          * emit result to a temp, and do new swizzle from the temp.
 636          */
 637 #if 0
 638         if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
 639                 GLuint temp = get_temp_reg(rp);
 640                 emit_arith(rp,
 641                            PFS_OP_MAD,
 642                            temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
 643                 src = temp;
 644         }
 645 #endif
 646
 647         if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
 648                 GLuint vsrcswz =
 649                     (v_swiz[REG_GET_VSWZ(src)].
 650                      hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
 651                     REG_GET_SSWZ(src) << 9;
 652                 GLint i;
 653
 654                 GLuint newswz = 0;
 655                 GLuint offset;
 656                 for (i = 0; i < 4; ++i) {
 657                         offset = GET_SWZ(arbswz, i);
 658
 659                         newswz |=
 660                             (offset <= 3) ? GET_SWZ(vsrcswz,
 661                                                     offset) << i *
 662                             3 : offset << i * 3;
 663                 }
 664
 665                 arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
 666                 REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
 667         } else {
 668                 /* set scalar swizzling */
 669                 REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
 670
 671         }
 672         do {
 673                 vswz = REG_GET_VSWZ(src);
 674                 do {
 675                         int chash;
 676
 677                         REG_SET_VSWZ(src, vswz);
 678                         chash = v_swiz[REG_GET_VSWZ(src)].hash &
 679                             s_mask[c_mask].hash;
 680
 681                         if (chash == (arbswz & s_mask[c_mask].hash)) {
 682                                 if (s_mask[c_mask].count == 3) {
 683                                         v_match += swz_native(rp,
 684                                                               src, &r, arbneg);
 685                                 } else {
 686                                         v_match += swz_emit_partial(rp,
 687                                                                     src,
 688                                                                     &r,
 689                                                                     c_mask,
 690                                                                     v_match,
 691                                                                     arbneg);
 692                                 }
 693
 694                                 if (v_match == 3)
 695                                         return r;
 696
 697                                 /* Fill with something invalid.. all 0's was
 698                                  * wrong before, matched SWIZZLE_X.  So all
 699                                  * 1's will be okay for now
 700                                  */
 701                                 arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
 702                         }
 703                 } while (v_swiz[++vswz].hash != PFS_INVAL);
 704                 REG_SET_VSWZ(src, SWIZZLE_XYZ);
 705         } while (s_mask[++c_mask].hash != PFS_INVAL);
 706
 707         ERROR("should NEVER get here\n");
 708         return r;
 709 }
 710
 711 static GLuint t_src(struct r300_fragment_program *rp,
 712                     struct prog_src_register fpsrc)
 713 {
 714         GLuint r = undef;
 715
 716         switch (fpsrc.File) {
 717         case PROGRAM_TEMPORARY:
 718                 REG_SET_INDEX(r, fpsrc.Index);
 719                 REG_SET_VALID(r, GL_TRUE);
 720                 REG_SET_TYPE(r, REG_TYPE_TEMP);
 721                 break;
 722         case PROGRAM_INPUT:
 723                 REG_SET_INDEX(r, fpsrc.Index);
 724                 REG_SET_VALID(r, GL_TRUE);
 725                 REG_SET_TYPE(r, REG_TYPE_INPUT);
 726                 break;
 727         case PROGRAM_LOCAL_PARAM:
 728                 r = emit_const4fv(rp,
 729                                   rp->mesa_program.Base.LocalParams[fpsrc.
 730                                                                     Index]);
 731                 break;
 732         case PROGRAM_ENV_PARAM:
 733                 r = emit_const4fv(rp,
 734                                   rp->ctx->FragmentProgram.Parameters[fpsrc.
 735                                                                       Index]);
 736                 break;
 737         case PROGRAM_STATE_VAR:
 738         case PROGRAM_NAMED_PARAM:
 739                 r = emit_const4fv(rp,
 740                                   rp->mesa_program.Base.Parameters->
 741                                   ParameterValues[fpsrc.Index]);
 742                 break;
 743         default:
 744                 ERROR("unknown SrcReg->File %x\n", fpsrc.File);
 745                 return r;
 746         }
 747
 748         /* no point swizzling ONE/ZERO/HALF constants... */
 749         if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
 750                 r = do_swizzle(rp, r, fpsrc.Swizzle, fpsrc.NegateBase);
 751         return r;
 752 }
 753
 754 static GLuint t_scalar_src(struct r300_fragment_program *rp,
 755                            struct prog_src_register fpsrc)
 756 {
 757         struct prog_src_register src = fpsrc;
 758         int sc = GET_SWZ(fpsrc.Swizzle, 0);     /* X */
 759
 760         src.Swizzle = ((sc << 0) | (sc << 3) | (sc << 6) | (sc << 9));
 761
 762         return t_src(rp, src);
 763 }
 764
 765 static GLuint t_dst(struct r300_fragment_program *rp,
 766                     struct prog_dst_register dest)
 767 {
 768         GLuint r = undef;
 769
 770         switch (dest.File) {
 771         case PROGRAM_TEMPORARY:
 772                 REG_SET_INDEX(r, dest.Index);
 773                 REG_SET_VALID(r, GL_TRUE);
 774                 REG_SET_TYPE(r, REG_TYPE_TEMP);
 775                 return r;
 776         case PROGRAM_OUTPUT:
 777                 REG_SET_TYPE(r, REG_TYPE_OUTPUT);
 778                 switch (dest.Index) {
 779                 case FRAG_RESULT_COLR:
 780                 case FRAG_RESULT_DEPR:
 781                         REG_SET_INDEX(r, dest.Index);
 782                         REG_SET_VALID(r, GL_TRUE);
 783                         return r;
 784                 default:
 785                         ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
 786                         return r;
 787                 }
 788         default:
 789                 ERROR("Bad DstReg->File 0x%x\n", dest.File);
 790                 return r;
 791         }
 792 }
 793
 794 static int t_hw_src(struct r300_fragment_program *rp, GLuint src, GLboolean tex)
 795 {
 796         COMPILE_STATE;
 797         int idx;
 798         int index = REG_GET_INDEX(src);
 799
 800         switch (REG_GET_TYPE(src)) {
 801         case REG_TYPE_TEMP:
 802                 /* NOTE: if reg==-1 here, a source is being read that
 803                  *       hasn't been written to. Undefined results.
 804                  */
 805                 if (cs->temps[index].reg == -1)
 806                         cs->temps[index].reg = get_hw_temp(rp, cs->nrslots);
 807
 808                 idx = cs->temps[index].reg;
 809
 810                 if (!REG_GET_NO_USE(src) && (--cs->temps[index].refcount == 0))
 811                         free_temp(rp, src);
 812                 break;
 813         case REG_TYPE_INPUT:
 814                 idx = cs->inputs[index].reg;
 815
 816                 if (!REG_GET_NO_USE(src) && (--cs->inputs[index].refcount == 0))
 817                         free_hw_temp(rp, cs->inputs[index].reg);
 818                 break;
 819         case REG_TYPE_CONST:
 820                 return (index | SRC_CONST);
 821         default:
 822                 ERROR("Invalid type for source reg\n");
 823                 return (0 | SRC_CONST);
 824         }
 825
 826         if (!tex)
 827                 cs->used_in_node |= (1 << idx);
 828
 829         return idx;
 830 }
 831
 832 static int t_hw_dst(struct r300_fragment_program *rp,
 833                     GLuint dest, GLboolean tex, int slot)
 834 {
 835         COMPILE_STATE;
 836         int idx;
 837         GLuint index = REG_GET_INDEX(dest);
 838         assert(REG_GET_VALID(dest));
 839
 840         switch (REG_GET_TYPE(dest)) {
 841         case REG_TYPE_TEMP:
 842                 if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
 843                         if (!tex) {
 844                                 cs->temps[index].reg = get_hw_temp(rp, slot);
 845                         } else {
 846                                 cs->temps[index].reg = get_hw_temp_tex(rp);
 847                         }
 848                 }
 849                 idx = cs->temps[index].reg;
 850
 851                 if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0))
 852                         free_temp(rp, dest);
 853
 854                 cs->dest_in_node |= (1 << idx);
 855                 cs->used_in_node |= (1 << idx);
 856                 break;
 857         case REG_TYPE_OUTPUT:
 858                 switch (index) {
 859                 case FRAG_RESULT_COLR:
 860                         rp->node[rp->cur_node].flags |=
 861                             R300_PFS_NODE_OUTPUT_COLOR;
 862                         break;
 863                 case FRAG_RESULT_DEPR:
 864                         rp->node[rp->cur_node].flags |=
 865                             R300_PFS_NODE_OUTPUT_DEPTH;
 866                         break;
 867                 }
 868                 return index;
 869                 break;
 870         default:
 871                 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
 872                 return 0;
 873         }
 874
 875         return idx;
 876 }
 877
 878 static void emit_nop(struct r300_fragment_program *rp)
 879 {
 880         COMPILE_STATE;
 881
 882         if (cs->nrslots >= PFS_MAX_ALU_INST) {
 883                 ERROR("Out of ALU instruction slots\n");
 884                 return;
 885         }
 886
 887         rp->alu.inst[cs->nrslots].inst0 = NOP_INST0;
 888         rp->alu.inst[cs->nrslots].inst1 = NOP_INST1;
 889         rp->alu.inst[cs->nrslots].inst2 = NOP_INST2;
 890         rp->alu.inst[cs->nrslots].inst3 = NOP_INST3;
 891         cs->nrslots++;
 892 }
 893
 894 static void emit_tex(struct r300_fragment_program *rp,
 895                      struct prog_instruction *fpi, int opcode)
 896 {
 897         COMPILE_STATE;
 898         GLuint coord = t_src(rp, fpi->SrcReg[0]);
 899         GLuint dest = undef, rdest = undef;
 900         GLuint din, uin;
 901         int unit = fpi->TexSrcUnit;
 902         int hwsrc, hwdest;
 903         GLuint tempreg = 0;
 904
 905         uin = cs->used_in_node;
 906         din = cs->dest_in_node;
 907
 908         /* Resolve source/dest to hardware registers */
 909         if (opcode != R300_FPITX_OP_KIL) {
 910                 if (fpi->TexSrcTarget == TEXTURE_RECT_INDEX) {
 911                         /**
 912                          * Hardware uses [0..1]x[0..1] range for rectangle textures
 913                          * instead of [0..Width]x[0..Height].
 914                          * Add a scaling instruction.
 915                          *
 916                          * \todo Refactor this once we have proper rewriting/optimization
 917                          * support for programs.
 918                          */
 919                         gl_state_index tokens[STATE_LENGTH] = {
 920                                 STATE_INTERNAL, STATE_R300_TEXRECT_FACTOR, 0, 0,
 921                                 0
 922                         };
 923                         int factor_index;
 924                         GLuint factorreg;
 925
 926                         tokens[2] = unit;
 927                         factor_index =
 928                             _mesa_add_state_reference(rp->mesa_program.Base.
 929                                                       Parameters, tokens);
 930                         factorreg =
 931                             emit_const4fv(rp,
 932                                           rp->mesa_program.Base.Parameters->
 933                                           ParameterValues[factor_index]);
 934                         tempreg = keep(get_temp_reg(rp));
 935
 936                         emit_arith(rp, PFS_OP_MAD, tempreg, WRITEMASK_XYZW,
 937                                    coord, factorreg, pfs_zero, 0);
 938
 939                         /* Ensure correct node indirection */
 940                         uin = cs->used_in_node;
 941                         din = cs->dest_in_node;
 942
 943                         hwsrc = t_hw_src(rp, tempreg, GL_TRUE);
 944                 } else {
 945                         hwsrc = t_hw_src(rp, coord, GL_TRUE);
 946                 }
 947
 948                 dest = t_dst(rp, fpi->DstReg);
 949
 950                 /* r300 doesn't seem to be able to do TEX->output reg */
 951                 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
 952                         rdest = dest;
 953                         dest = get_temp_reg_tex(rp);
 954                 }
 955                 hwdest =
 956                     t_hw_dst(rp, dest, GL_TRUE,
 957                              rp->node[rp->cur_node].alu_offset);
 958
 959                 /* Use a temp that hasn't been used in this node, rather
 960                  * than causing an indirection
 961                  */
 962                 if (uin & (1 << hwdest)) {
 963                         free_hw_temp(rp, hwdest);
 964                         hwdest = get_hw_temp_tex(rp);
 965                         cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
 966                 }
 967         } else {
 968                 hwdest = 0;
 969                 unit = 0;
 970                 hwsrc = t_hw_src(rp, coord, GL_TRUE);
 971         }
 972
 973         /* Indirection if source has been written in this node, or if the
 974          * dest has been read/written in this node
 975          */
 976         if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
 977              (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
 978
 979                 /* Finish off current node */
 980                 if (rp->node[rp->cur_node].alu_offset == cs->nrslots)
 981                         emit_nop(rp);
 982
 983                 rp->node[rp->cur_node].alu_end =
 984                     cs->nrslots - rp->node[rp->cur_node].alu_offset - 1;
 985                 assert(rp->node[rp->cur_node].alu_end >= 0);
 986
 987                 if (++rp->cur_node >= PFS_MAX_TEX_INDIRECT) {
 988                         ERROR("too many levels of texture indirection\n");
 989                         return;
 990                 }
 991
 992                 /* Start new node */
 993                 rp->node[rp->cur_node].tex_offset = rp->tex.length;
 994                 rp->node[rp->cur_node].alu_offset = cs->nrslots;
 995                 rp->node[rp->cur_node].tex_end = -1;
 996                 rp->node[rp->cur_node].alu_end = -1;
 997                 rp->node[rp->cur_node].flags = 0;
 998                 cs->used_in_node = 0;
 999                 cs->dest_in_node = 0;
1000         }
1001
1002         if (rp->cur_node == 0)
1003                 rp->first_node_has_tex = 1;
1004
1005         rp->tex.inst[rp->tex.length++] = 0 | (hwsrc << R300_FPITX_SRC_SHIFT)
1006             | (hwdest << R300_FPITX_DST_SHIFT)
1007             | (unit << R300_FPITX_IMAGE_SHIFT)
1008             /* not entirely sure about this */
1009             | (opcode << R300_FPITX_OPCODE_SHIFT);
1010
1011         cs->dest_in_node |= (1 << hwdest);
1012         if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
1013                 cs->used_in_node |= (1 << hwsrc);
1014
1015         rp->node[rp->cur_node].tex_end++;
1016
1017         /* Copy from temp to output if needed */
1018         if (REG_GET_VALID(rdest)) {
1019                 emit_arith(rp, PFS_OP_MAD, rdest, WRITEMASK_XYZW, dest,
1020                            pfs_one, pfs_zero, 0);
1021                 free_temp(rp, dest);
1022         }
1023
1024         /* Free temp register */
1025         if (tempreg != 0)
1026                 free_temp(rp, tempreg);
1027 }
1028
1029 /**
1030  * Returns the first slot where we could possibly allow writing to dest,
1031  * according to register allocation.
1032  */
1033 static int get_earliest_allowed_write(struct r300_fragment_program *rp,
1034                                       GLuint dest, int mask)
1035 {
1036         COMPILE_STATE;
1037         int idx;
1038         int pos;
1039         GLuint index = REG_GET_INDEX(dest);
1040         assert(REG_GET_VALID(dest));
1041
1042         switch (REG_GET_TYPE(dest)) {
1043         case REG_TYPE_TEMP:
1044                 if (cs->temps[index].reg == -1)
1045                         return 0;
1046
1047                 idx = cs->temps[index].reg;
1048                 break;
1049         case REG_TYPE_OUTPUT:
1050                 return 0;
1051         default:
1052                 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
1053                 return 0;
1054         }
1055
1056         pos = cs->hwtemps[idx].reserved;
1057         if (mask & WRITEMASK_XYZ) {
1058                 if (pos < cs->hwtemps[idx].vector_lastread)
1059                         pos = cs->hwtemps[idx].vector_lastread;
1060         }
1061         if (mask & WRITEMASK_W) {
1062                 if (pos < cs->hwtemps[idx].scalar_lastread)
1063                         pos = cs->hwtemps[idx].scalar_lastread;
1064         }
1065
1066         return pos;
1067 }
1068
1069 /**
1070  * Allocates a slot for an ALU instruction that can consist of
1071  * a vertex part or a scalar part or both.
1072  *
1073  * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1074  * appropriate position (vector and/or scalar), and their positions are
1075  * recorded in the srcpos array.
1076  *
1077  * This function emits instruction code for the source fetch and the
1078  * argument selection. It does not emit instruction code for the
1079  * opcode or the destination selection.
1080  *
1081  * @return the index of the slot
1082  */
1083 static int find_and_prepare_slot(struct r300_fragment_program *rp,
1084                                  GLboolean emit_vop,
1085                                  GLboolean emit_sop,
1086                                  int argc, GLuint * src, GLuint dest, int mask)
1087 {
1088         COMPILE_STATE;
1089         int hwsrc[3];
1090         int srcpos[3];
1091         unsigned int used;
1092         int tempused;
1093         int tempvsrc[3];
1094         int tempssrc[3];
1095         int pos;
1096         int regnr;
1097         int i, j;
1098
1099         // Determine instruction slots, whether sources are required on
1100         // vector or scalar side, and the smallest slot number where
1101         // all source registers are available
1102         used = 0;
1103         if (emit_vop)
1104                 used |= SLOT_OP_VECTOR;
1105         if (emit_sop)
1106                 used |= SLOT_OP_SCALAR;
1107
1108         pos = get_earliest_allowed_write(rp, dest, mask);
1109
1110         if (rp->node[rp->cur_node].alu_offset > pos)
1111                 pos = rp->node[rp->cur_node].alu_offset;
1112         for (i = 0; i < argc; ++i) {
1113                 if (!REG_GET_BUILTIN(src[i])) {
1114                         if (emit_vop)
1115                                 used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
1116                         if (emit_sop)
1117                                 used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
1118                 }
1119
1120                 hwsrc[i] = t_hw_src(rp, src[i], GL_FALSE);      /* Note: sideeffects wrt refcounting! */
1121                 regnr = hwsrc[i] & 31;
1122
1123                 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1124                         if (used & (SLOT_SRC_VECTOR << i)) {
1125                                 if (cs->hwtemps[regnr].vector_valid > pos)
1126                                         pos = cs->hwtemps[regnr].vector_valid;
1127                         }
1128                         if (used & (SLOT_SRC_SCALAR << i)) {
1129                                 if (cs->hwtemps[regnr].scalar_valid > pos)
1130                                         pos = cs->hwtemps[regnr].scalar_valid;
1131                         }
1132                 }
1133         }
1134
1135         // Find a slot that fits
1136         for (;; ++pos) {
1137                 if (cs->slot[pos].used & used & SLOT_OP_BOTH)
1138                         continue;
1139
1140                 if (pos >= cs->nrslots) {
1141                         if (cs->nrslots >= PFS_MAX_ALU_INST) {
1142                                 ERROR("Out of ALU instruction slots\n");
1143                                 return -1;
1144                         }
1145
1146                         rp->alu.inst[pos].inst0 = NOP_INST0;
1147                         rp->alu.inst[pos].inst1 = NOP_INST1;
1148                         rp->alu.inst[pos].inst2 = NOP_INST2;
1149                         rp->alu.inst[pos].inst3 = NOP_INST3;
1150
1151                         cs->nrslots++;
1152                 }
1153                 // Note: When we need both parts (vector and scalar) of a source,
1154                 // we always try to put them into the same position. This makes the
1155                 // code easier to read, and it is optimal (i.e. one doesn't gain
1156                 // anything by splitting the parts).
1157                 // It also avoids headaches with swizzles that access both parts (i.e WXY)
1158                 tempused = cs->slot[pos].used;
1159                 for (i = 0; i < 3; ++i) {
1160                         tempvsrc[i] = cs->slot[pos].vsrc[i];
1161                         tempssrc[i] = cs->slot[pos].ssrc[i];
1162                 }
1163
1164                 for (i = 0; i < argc; ++i) {
1165                         int flags = (used >> i) & SLOT_SRC_BOTH;
1166
1167                         if (!flags) {
1168                                 srcpos[i] = 0;
1169                                 continue;
1170                         }
1171
1172                         for (j = 0; j < 3; ++j) {
1173                                 if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
1174                                         if (tempvsrc[j] != hwsrc[i])
1175                                                 continue;
1176                                 }
1177
1178                                 if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
1179                                         if (tempssrc[j] != hwsrc[i])
1180                                                 continue;
1181                                 }
1182
1183                                 break;
1184                         }
1185
1186                         if (j == 3)
1187                                 break;
1188
1189                         srcpos[i] = j;
1190                         tempused |= flags << j;
1191                         if (flags & SLOT_SRC_VECTOR)
1192                                 tempvsrc[j] = hwsrc[i];
1193                         if (flags & SLOT_SRC_SCALAR)
1194                                 tempssrc[j] = hwsrc[i];
1195                 }
1196
1197                 if (i == argc)
1198                         break;
1199         }
1200
1201         // Found a slot, reserve it
1202         cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
1203         for (i = 0; i < 3; ++i) {
1204                 cs->slot[pos].vsrc[i] = tempvsrc[i];
1205                 cs->slot[pos].ssrc[i] = tempssrc[i];
1206         }
1207
1208         for (i = 0; i < argc; ++i) {
1209                 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1210                         int regnr = hwsrc[i] & 31;
1211
1212                         if (used & (SLOT_SRC_VECTOR << i)) {
1213                                 if (cs->hwtemps[regnr].vector_lastread < pos)
1214                                         cs->hwtemps[regnr].vector_lastread =
1215                                             pos;
1216                         }
1217                         if (used & (SLOT_SRC_SCALAR << i)) {
1218                                 if (cs->hwtemps[regnr].scalar_lastread < pos)
1219                                         cs->hwtemps[regnr].scalar_lastread =
1220                                             pos;
1221                         }
1222                 }
1223         }
1224
1225         // Emit the source fetch code
1226         rp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK;
1227         rp->alu.inst[pos].inst1 |=
1228             ((cs->slot[pos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
1229              (cs->slot[pos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
1230              (cs->slot[pos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
1231
1232         rp->alu.inst[pos].inst3 &= ~R300_FPI3_SRC_MASK;
1233         rp->alu.inst[pos].inst3 |=
1234             ((cs->slot[pos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
1235              (cs->slot[pos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
1236              (cs->slot[pos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
1237
1238         // Emit the argument selection code
1239         if (emit_vop) {
1240                 int swz[3];
1241
1242                 for (i = 0; i < 3; ++i) {
1243                         if (i < argc) {
1244                                 swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
1245                                           (srcpos[i] *
1246                                            v_swiz[REG_GET_VSWZ(src[i])].
1247                                            stride)) | ((src[i] & REG_NEGV_MASK)
1248                                                        ? ARG_NEG : 0) | ((src[i]
1249                                                                           &
1250                                                                           REG_ABS_MASK)
1251                                                                          ?
1252                                                                          ARG_ABS
1253                                                                          : 0);
1254                         } else {
1255                                 swz[i] = R300_FPI0_ARGC_ZERO;
1256                         }
1257                 }
1258
1259                 rp->alu.inst[pos].inst0 &=
1260                     ~(R300_FPI0_ARG0C_MASK | R300_FPI0_ARG1C_MASK |
1261                       R300_FPI0_ARG2C_MASK);
1262                 rp->alu.inst[pos].inst0 |=
1263                     (swz[0] << R300_FPI0_ARG0C_SHIFT) | (swz[1] <<
1264                                                          R300_FPI0_ARG1C_SHIFT)
1265                     | (swz[2] << R300_FPI0_ARG2C_SHIFT);
1266         }
1267
1268         if (emit_sop) {
1269                 int swz[3];
1270
1271                 for (i = 0; i < 3; ++i) {
1272                         if (i < argc) {
1273                                 swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
1274                                           (srcpos[i] *
1275                                            s_swiz[REG_GET_SSWZ(src[i])].
1276                                            stride)) | ((src[i] & REG_NEGV_MASK)
1277                                                        ? ARG_NEG : 0) | ((src[i]
1278                                                                           &
1279                                                                           REG_ABS_MASK)
1280                                                                          ?
1281                                                                          ARG_ABS
1282                                                                          : 0);
1283                         } else {
1284                                 swz[i] = R300_FPI2_ARGA_ZERO;
1285                         }
1286                 }
1287
1288                 rp->alu.inst[pos].inst2 &=
1289                     ~(R300_FPI2_ARG0A_MASK | R300_FPI2_ARG1A_MASK |
1290                       R300_FPI2_ARG2A_MASK);
1291                 rp->alu.inst[pos].inst2 |=
1292                     (swz[0] << R300_FPI2_ARG0A_SHIFT) | (swz[1] <<
1293                                                          R300_FPI2_ARG1A_SHIFT)
1294                     | (swz[2] << R300_FPI2_ARG2A_SHIFT);
1295         }
1296
1297         return pos;
1298 }
1299
1300 /**
1301  * Append an ALU instruction to the instruction list.
1302  */
1303 static void emit_arith(struct r300_fragment_program *rp,
1304                        int op,
1305                        GLuint dest,
1306                        int mask,
1307                        GLuint src0, GLuint src1, GLuint src2, int flags)
1308 {
1309         COMPILE_STATE;
1310         GLuint src[3] = { src0, src1, src2 };
1311         int hwdest;
1312         GLboolean emit_vop, emit_sop;
1313         int vop, sop, argc;
1314         int pos;
1315
1316         vop = r300_fpop[op].v_op;
1317         sop = r300_fpop[op].s_op;
1318         argc = r300_fpop[op].argc;
1319
1320         if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
1321             REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
1322                 if (mask & WRITEMASK_Z) {
1323                         mask = WRITEMASK_W;
1324                 } else {
1325                         return;
1326                 }
1327         }
1328
1329         emit_vop = GL_FALSE;
1330         emit_sop = GL_FALSE;
1331         if ((mask & WRITEMASK_XYZ) || vop == R300_FPI0_OUTC_DP3)
1332                 emit_vop = GL_TRUE;
1333         if ((mask & WRITEMASK_W) || vop == R300_FPI0_OUTC_REPL_ALPHA)
1334                 emit_sop = GL_TRUE;
1335
1336         pos =
1337             find_and_prepare_slot(rp, emit_vop, emit_sop, argc, src, dest,
1338                                   mask);
1339         if (pos < 0)
1340                 return;
1341
1342         hwdest = t_hw_dst(rp, dest, GL_FALSE, pos);     /* Note: Side effects wrt register allocation */
1343
1344         if (flags & PFS_FLAG_SAT) {
1345                 vop |= R300_FPI0_OUTC_SAT;
1346                 sop |= R300_FPI2_OUTA_SAT;
1347         }
1348
1349         /* Throw the pieces together and get FPI0/1 */
1350         if (emit_vop) {
1351                 rp->alu.inst[pos].inst0 |= vop;
1352
1353                 rp->alu.inst[pos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT;
1354
1355                 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1356                         if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1357                                 rp->alu.inst[pos].inst1 |=
1358                                     (mask & WRITEMASK_XYZ) <<
1359                                     R300_FPI1_DSTC_OUTPUT_MASK_SHIFT;
1360                         } else
1361                                 assert(0);
1362                 } else {
1363                         rp->alu.inst[pos].inst1 |=
1364                             (mask & WRITEMASK_XYZ) <<
1365                             R300_FPI1_DSTC_REG_MASK_SHIFT;
1366
1367                         cs->hwtemps[hwdest].vector_valid = pos + 1;
1368                 }
1369         }
1370
1371         /* And now FPI2/3 */
1372         if (emit_sop) {
1373                 rp->alu.inst[pos].inst2 |= sop;
1374
1375                 if (mask & WRITEMASK_W) {
1376                         if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1377                                 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1378                                         rp->alu.inst[pos].inst3 |=
1379                                             (hwdest << R300_FPI3_DSTA_SHIFT) |
1380                                             R300_FPI3_DSTA_OUTPUT;
1381                                 } else if (REG_GET_INDEX(dest) ==
1382                                            FRAG_RESULT_DEPR) {
1383                                         rp->alu.inst[pos].inst3 |=
1384                                             R300_FPI3_DSTA_DEPTH;
1385                                 } else
1386                                         assert(0);
1387                         } else {
1388                                 rp->alu.inst[pos].inst3 |=
1389                                     (hwdest << R300_FPI3_DSTA_SHIFT) |
1390                                     R300_FPI3_DSTA_REG;
1391
1392                                 cs->hwtemps[hwdest].scalar_valid = pos + 1;
1393                         }
1394                 }
1395         }
1396
1397         return;
1398 }
1399
#if 0
/* Currently compiled out.
 *
 * Builds the packed input register for fragment attribute attr. Raises
 * an error and returns undef when the program's InputsRead mask does not
 * include that attribute.
 */
static GLuint get_attrib(struct r300_fragment_program *rp, GLuint attr)
{
        GLuint reg = undef;

        if ((rp->mesa_program.Base.InputsRead & (1 << attr)) == 0) {
                ERROR("Attribute %d was not provided!\n", attr);
                return undef;
        }

        REG_SET_TYPE(reg, REG_TYPE_INPUT);
        REG_SET_INDEX(reg, attr);
        REG_SET_VALID(reg, GL_TRUE);

        return reg;
}
#endif
1417
1418 static GLfloat SinCosConsts[2][4] = {
1419         {
1420          1.273239545,           // 4/PI
1421          -0.405284735,          // -4/(PI*PI)
1422          3.141592654,           // PI
1423          0.2225                 // weight
1424          },
1425         {
1426          0.75,
1427          0.0,
1428          0.159154943,           // 1/(2*PI)
1429          6.283185307            // 2*PI
1430          }
1431 };
1432
1433 /**
1434  * Emit a LIT instruction.
1435  * \p flags may be PFS_FLAG_SAT
1436  *
1437  * Definition of LIT (from ARB_fragment_program):
1438  * tmp = VectorLoad(op0);
1439  * if (tmp.x < 0) tmp.x = 0;
1440  * if (tmp.y < 0) tmp.y = 0;
1441  * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
1442  * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
1443  * result.x = 1.0;
1444  * result.y = tmp.x;
1445  * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
1446  * result.w = 1.0;
1447  *
1448  * The longest path of computation is the one leading to result.z,
1449  * consisting of 5 operations. This implementation of LIT takes
1450  * 5 slots. So unless there's some special undocumented opcode,
1451  * this implementation is potentially optimal. Unfortunately,
1452  * emit_arith is a bit too conservative because it doesn't understand
1453  * partial writes to the vector component.
1454  */
1455 static const GLfloat LitConst[4] =
1456     { 127.999999, 127.999999, 127.999999, -127.999999 };
1457
1458 static void emit_lit(struct r300_fragment_program *rp,
1459                      GLuint dest, int mask, GLuint src, int flags)
1460 {
1461         COMPILE_STATE;
1462         GLuint cnst;
1463         int needTemporary;
1464         GLuint temp;
1465
1466         cnst = emit_const4fv(rp, LitConst);
1467
1468         needTemporary = 0;
1469         if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
1470                 needTemporary = 1;
1471         } else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1472                 // LIT is typically followed by DP3/DP4, so there's no point
1473                 // in creating special code for this case
1474                 needTemporary = 1;
1475         }
1476
1477         if (needTemporary) {
1478                 temp = keep(get_temp_reg(rp));
1479         } else {
1480                 temp = keep(dest);
1481         }
1482
1483         // Note: The order of emit_arith inside the slots is relevant,
1484         // because emit_arith only looks at scalar vs. vector when resolving
1485         // dependencies, and it does not consider individual vector components,
1486         // so swizzling between the two parts can create fake dependencies.
1487
1488         // First slot
1489         emit_arith(rp, PFS_OP_MAX, temp, WRITEMASK_XY,
1490                    keep(src), pfs_zero, undef, 0);
1491         emit_arith(rp, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);
1492
1493         // Second slot
1494         emit_arith(rp, PFS_OP_MIN, temp, WRITEMASK_Z,
1495                    swizzle(temp, W, W, W, W), cnst, undef, 0);
1496         emit_arith(rp, PFS_OP_LG2, temp, WRITEMASK_W,
1497                    swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
1498
1499         // Third slot
1500         // If desired, we saturate the y result here.
1501         // This does not affect the use as a condition variable in the CMP later
1502         emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W,
1503                    temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
1504         emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_Y,
1505                    swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
1506
1507         // Fourth slot
1508         emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_X,
1509                    pfs_one, pfs_one, pfs_zero, 0);
1510         emit_arith(rp, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);
1511
1512         // Fifth slot
1513         emit_arith(rp, PFS_OP_CMP, temp, WRITEMASK_Z,
1514                    pfs_zero, swizzle(temp, W, W, W, W),
1515                    negate(swizzle(temp, Y, Y, Y, Y)), flags);
1516         emit_arith(rp, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
1517                    pfs_zero, 0);
1518
1519         if (needTemporary) {
1520                 emit_arith(rp, PFS_OP_MAD, dest, mask,
1521                            temp, pfs_one, pfs_zero, flags);
1522                 free_temp(rp, temp);
1523         } else {
1524                 // Decrease refcount of the destination
1525                 t_hw_dst(rp, dest, GL_FALSE, cs->nrslots);
1526         }
1527 }
1528
1529 static GLboolean parse_program(struct r300_fragment_program *rp)
1530 {
1531         struct gl_fragment_program *mp = &rp->mesa_program;
1532         const struct prog_instruction *inst = mp->Base.Instructions;
1533         struct prog_instruction *fpi;
1534         GLuint src[3], dest, temp[2];
1535         int flags, mask = 0;
1536         int const_sin[2];
1537
1538         if (!inst || inst[0].Opcode == OPCODE_END) {
1539                 ERROR("empty program?\n");
1540                 return GL_FALSE;
1541         }
1542
1543         for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
1544                 if (fpi->SaturateMode == SATURATE_ZERO_ONE)
1545                         flags = PFS_FLAG_SAT;
1546                 else
1547                         flags = 0;
1548
1549                 if (fpi->Opcode != OPCODE_KIL) {
1550                         dest = t_dst(rp, fpi->DstReg);
1551                         mask = fpi->DstReg.WriteMask;
1552                 }
1553
1554                 switch (fpi->Opcode) {
1555                 case OPCODE_ABS:
1556                         src[0] = t_src(rp, fpi->SrcReg[0]);
1557                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1558                                    absolute(src[0]), pfs_one, pfs_zero, flags);
1559                         break;
1560                 case OPCODE_ADD:
1561                         src[0] = t_src(rp, fpi->SrcReg[0]);
1562                         src[1] = t_src(rp, fpi->SrcReg[1]);
1563                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1564                                    src[0], pfs_one, src[1], flags);
1565                         break;
1566                 case OPCODE_CMP:
1567                         src[0] = t_src(rp, fpi->SrcReg[0]);
1568                         src[1] = t_src(rp, fpi->SrcReg[1]);
1569                         src[2] = t_src(rp, fpi->SrcReg[2]);
1570                         /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1571                          *    r300 - if src2.c < 0.0 ? src1.c : src0.c
1572                          */
1573                         emit_arith(rp, PFS_OP_CMP, dest, mask,
1574                                    src[2], src[1], src[0], flags);
1575                         break;
1576                 case OPCODE_COS:
1577                         /*
1578                          * cos using a parabola (see SIN):
1579                          * cos(x):
1580                          *   x = (x/(2*PI))+0.75
1581                          *   x = frac(x)
1582                          *   x = (x*2*PI)-PI
1583                          *   result = sin(x)
1584                          */
1585                         temp[0] = get_temp_reg(rp);
1586                         const_sin[0] = emit_const4fv(rp, SinCosConsts[0]);
1587                         const_sin[1] = emit_const4fv(rp, SinCosConsts[1]);
1588                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1589
1590                         /* add 0.5*PI and do range reduction */
1591
1592                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1593                                    swizzle(src[0], X, X, X, X),
1594                                    swizzle(const_sin[1], Z, Z, Z, Z),
1595                                    swizzle(const_sin[1], X, X, X, X), 0);
1596
1597                         emit_arith(rp, PFS_OP_FRC, temp[0], WRITEMASK_X,
1598                                    swizzle(temp[0], X, X, X, X),
1599                                    undef, undef, 0);
1600
1601                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),       //2*PI
1602                                    negate(swizzle(const_sin[0], Z, Z, Z, Z)),   //-PI
1603                                    0);
1604
1605                         /* SIN */
1606
1607                         emit_arith(rp, PFS_OP_MAD, temp[0],
1608                                    WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1609                                                                       Z, Z, Z,
1610                                                                       Z),
1611                                    const_sin[0], pfs_zero, 0);
1612
1613                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1614                                    swizzle(temp[0], Y, Y, Y, Y),
1615                                    absolute(swizzle(temp[0], Z, Z, Z, Z)),
1616                                    swizzle(temp[0], X, X, X, X), 0);
1617
1618                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1619                                    swizzle(temp[0], X, X, X, X),
1620                                    absolute(swizzle(temp[0], X, X, X, X)),
1621                                    negate(swizzle(temp[0], X, X, X, X)), 0);
1622
1623                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1624                                    swizzle(temp[0], Y, Y, Y, Y),
1625                                    swizzle(const_sin[0], W, W, W, W),
1626                                    swizzle(temp[0], X, X, X, X), flags);
1627
1628                         free_temp(rp, temp[0]);
1629                         break;
1630                 case OPCODE_DP3:
1631                         src[0] = t_src(rp, fpi->SrcReg[0]);
1632                         src[1] = t_src(rp, fpi->SrcReg[1]);
1633                         emit_arith(rp, PFS_OP_DP3, dest, mask,
1634                                    src[0], src[1], undef, flags);
1635                         break;
1636                 case OPCODE_DP4:
1637                         src[0] = t_src(rp, fpi->SrcReg[0]);
1638                         src[1] = t_src(rp, fpi->SrcReg[1]);
1639                         emit_arith(rp, PFS_OP_DP4, dest, mask,
1640                                    src[0], src[1], undef, flags);
1641                         break;
1642                 case OPCODE_DPH:
1643                         src[0] = t_src(rp, fpi->SrcReg[0]);
1644                         src[1] = t_src(rp, fpi->SrcReg[1]);
1645                         /* src0.xyz1 -> temp
1646                          * DP4 dest, temp, src1
1647                          */
1648 #if 0
1649                         temp[0] = get_temp_reg(rp);
1650                         src[0].s_swz = SWIZZLE_ONE;
1651                         emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1652                                    src[0], pfs_one, pfs_zero, 0);
1653                         emit_arith(rp, PFS_OP_DP4, dest, mask,
1654                                    temp[0], src[1], undef, flags);
1655                         free_temp(rp, temp[0]);
1656 #else
1657                         emit_arith(rp, PFS_OP_DP4, dest, mask,
1658                                    swizzle(src[0], X, Y, Z, ONE), src[1],
1659                                    undef, flags);
1660 #endif
1661                         break;
1662                 case OPCODE_DST:
1663                         src[0] = t_src(rp, fpi->SrcReg[0]);
1664                         src[1] = t_src(rp, fpi->SrcReg[1]);
1665                         /* dest.y = src0.y * src1.y */
1666                         if (mask & WRITEMASK_Y)
1667                                 emit_arith(rp, PFS_OP_MAD, dest, WRITEMASK_Y,
1668                                            keep(src[0]), keep(src[1]),
1669                                            pfs_zero, flags);
1670                         /* dest.z = src0.z */
1671                         if (mask & WRITEMASK_Z)
1672                                 emit_arith(rp, PFS_OP_MAD, dest, WRITEMASK_Z,
1673                                            src[0], pfs_one, pfs_zero, flags);
1674                         /* result.x = 1.0
1675                          * result.w = src1.w */
1676                         if (mask & WRITEMASK_XW) {
1677                                 REG_SET_VSWZ(src[1], SWIZZLE_111);      /*Cheat */
1678                                 emit_arith(rp, PFS_OP_MAD, dest,
1679                                            mask & WRITEMASK_XW,
1680                                            src[1], pfs_one, pfs_zero, flags);
1681                         }
1682                         break;
1683                 case OPCODE_EX2:
1684                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1685                         emit_arith(rp, PFS_OP_EX2, dest, mask,
1686                                    src[0], undef, undef, flags);
1687                         break;
1688                 case OPCODE_FLR:
1689                         src[0] = t_src(rp, fpi->SrcReg[0]);
1690                         temp[0] = get_temp_reg(rp);
1691                         /* FRC temp, src0
1692                          * MAD dest, src0, 1.0, -temp
1693                          */
1694                         emit_arith(rp, PFS_OP_FRC, temp[0], mask,
1695                                    keep(src[0]), undef, undef, 0);
1696                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1697                                    src[0], pfs_one, negate(temp[0]), flags);
1698                         free_temp(rp, temp[0]);
1699                         break;
1700                 case OPCODE_FRC:
1701                         src[0] = t_src(rp, fpi->SrcReg[0]);
1702                         emit_arith(rp, PFS_OP_FRC, dest, mask,
1703                                    src[0], undef, undef, flags);
1704                         break;
1705                 case OPCODE_KIL:
1706                         emit_tex(rp, fpi, R300_FPITX_OP_KIL);
1707                         break;
1708                 case OPCODE_LG2:
1709                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1710                         emit_arith(rp, PFS_OP_LG2, dest, mask,
1711                                    src[0], undef, undef, flags);
1712                         break;
1713                 case OPCODE_LIT:
1714                         src[0] = t_src(rp, fpi->SrcReg[0]);
1715                         emit_lit(rp, dest, mask, src[0], flags);
1716                         break;
1717                 case OPCODE_LRP:
1718                         src[0] = t_src(rp, fpi->SrcReg[0]);
1719                         src[1] = t_src(rp, fpi->SrcReg[1]);
1720                         src[2] = t_src(rp, fpi->SrcReg[2]);
1721                         /* result = tmp0tmp1 + (1 - tmp0)tmp2
1722                          *        = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1723                          *     MAD temp, -tmp0, tmp2, tmp2
1724                          *     MAD result, tmp0, tmp1, temp
1725                          */
1726                         temp[0] = get_temp_reg(rp);
1727                         emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1728                                    negate(keep(src[0])), keep(src[2]), src[2],
1729                                    0);
1730                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1731                                    src[0], src[1], temp[0], flags);
1732                         free_temp(rp, temp[0]);
1733                         break;
1734                 case OPCODE_MAD:
1735                         src[0] = t_src(rp, fpi->SrcReg[0]);
1736                         src[1] = t_src(rp, fpi->SrcReg[1]);
1737                         src[2] = t_src(rp, fpi->SrcReg[2]);
1738                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1739                                    src[0], src[1], src[2], flags);
1740                         break;
1741                 case OPCODE_MAX:
1742                         src[0] = t_src(rp, fpi->SrcReg[0]);
1743                         src[1] = t_src(rp, fpi->SrcReg[1]);
1744                         emit_arith(rp, PFS_OP_MAX, dest, mask,
1745                                    src[0], src[1], undef, flags);
1746                         break;
1747                 case OPCODE_MIN:
1748                         src[0] = t_src(rp, fpi->SrcReg[0]);
1749                         src[1] = t_src(rp, fpi->SrcReg[1]);
1750                         emit_arith(rp, PFS_OP_MIN, dest, mask,
1751                                    src[0], src[1], undef, flags);
1752                         break;
1753                 case OPCODE_MOV:
1754                 case OPCODE_SWZ:
1755                         src[0] = t_src(rp, fpi->SrcReg[0]);
1756                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1757                                    src[0], pfs_one, pfs_zero, flags);
1758                         break;
1759                 case OPCODE_MUL:
1760                         src[0] = t_src(rp, fpi->SrcReg[0]);
1761                         src[1] = t_src(rp, fpi->SrcReg[1]);
1762                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1763                                    src[0], src[1], pfs_zero, flags);
1764                         break;
1765                 case OPCODE_POW:
1766                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1767                         src[1] = t_scalar_src(rp, fpi->SrcReg[1]);
1768                         temp[0] = get_temp_reg(rp);
1769                         emit_arith(rp, PFS_OP_LG2, temp[0], WRITEMASK_W,
1770                                    src[0], undef, undef, 0);
1771                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_W,
1772                                    temp[0], src[1], pfs_zero, 0);
1773                         emit_arith(rp, PFS_OP_EX2, dest, fpi->DstReg.WriteMask,
1774                                    temp[0], undef, undef, 0);
1775                         free_temp(rp, temp[0]);
1776                         break;
1777                 case OPCODE_RCP:
1778                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1779                         emit_arith(rp, PFS_OP_RCP, dest, mask,
1780                                    src[0], undef, undef, flags);
1781                         break;
1782                 case OPCODE_RSQ:
1783                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1784                         emit_arith(rp, PFS_OP_RSQ, dest, mask,
1785                                    absolute(src[0]), pfs_zero, pfs_zero, flags);
1786                         break;
1787                 case OPCODE_SCS:
1788                         /*
1789                          * scs using a parabola :
1790                          * scs(x):
1791                          *   result.x = sin(-abs(x)+0.5*PI)  (cos)
1792                          *   result.y = sin(x)               (sin)
1793                          *
1794                          */
1795                         temp[0] = get_temp_reg(rp);
1796                         temp[1] = get_temp_reg(rp);
1797                         const_sin[0] = emit_const4fv(rp, SinCosConsts[0]);
1798                         const_sin[1] = emit_const4fv(rp, SinCosConsts[1]);
1799                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1800
1801                         /* x = -abs(x)+0.5*PI */
1802                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(const_sin[0], Z, Z, Z, Z),     //PI
1803                                    pfs_half,
1804                                    negate(abs
1805                                           (swizzle(keep(src[0]), X, X, X, X))),
1806                                    0);
1807
1808                         /* C*x (sin) */
1809                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_W,
1810                                    swizzle(const_sin[0], Y, Y, Y, Y),
1811                                    swizzle(keep(src[0]), X, X, X, X),
1812                                    pfs_zero, 0);
1813
1814                         /* B*x, C*x (cos) */
1815                         emit_arith(rp, PFS_OP_MAD, temp[0],
1816                                    WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1817                                                                       Z, Z, Z,
1818                                                                       Z),
1819                                    const_sin[0], pfs_zero, 0);
1820
1821                         /* B*x (sin) */
1822                         emit_arith(rp, PFS_OP_MAD, temp[1], WRITEMASK_W,
1823                                    swizzle(const_sin[0], X, X, X, X),
1824                                    keep(src[0]), pfs_zero, 0);
1825
1826                         /* y = B*x + C*x*abs(x) (sin) */
1827                         emit_arith(rp, PFS_OP_MAD, temp[1], WRITEMASK_Z,
1828                                    absolute(src[0]),
1829                                    swizzle(temp[0], W, W, W, W),
1830                                    swizzle(temp[1], W, W, W, W), 0);
1831
1832                         /* y = B*x + C*x*abs(x) (cos) */
1833                         emit_arith(rp, PFS_OP_MAD, temp[1], WRITEMASK_W,
1834                                    swizzle(temp[0], Y, Y, Y, Y),
1835                                    absolute(swizzle(temp[0], Z, Z, Z, Z)),
1836                                    swizzle(temp[0], X, X, X, X), 0);
1837
1838                         /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1839                         emit_arith(rp, PFS_OP_MAD, temp[0],
1840                                    WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1],
1841                                                                       W, Z, Y,
1842                                                                       X),
1843                                    absolute(swizzle(temp[1], W, Z, Y, X)),
1844                                    negate(swizzle(temp[1], W, Z, Y, X)), 0);
1845
1846                         /* dest.xy = mad(temp.xy, P, temp2.wz) */
1847                         emit_arith(rp, PFS_OP_MAD, dest,
1848                                    mask & (WRITEMASK_X | WRITEMASK_Y), temp[0],
1849                                    swizzle(const_sin[0], W, W, W, W),
1850                                    swizzle(temp[1], W, Z, Y, X), flags);
1851
1852                         free_temp(rp, temp[0]);
1853                         free_temp(rp, temp[1]);
1854                         break;
1855                 case OPCODE_SGE:
1856                         src[0] = t_src(rp, fpi->SrcReg[0]);
1857                         src[1] = t_src(rp, fpi->SrcReg[1]);
1858                         temp[0] = get_temp_reg(rp);
1859                         /* temp = src0 - src1
1860                          * dest.c = (temp.c < 0.0) ? 0 : 1
1861                          */
1862                         emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1863                                    src[0], pfs_one, negate(src[1]), 0);
1864                         emit_arith(rp, PFS_OP_CMP, dest, mask,
1865                                    pfs_one, pfs_zero, temp[0], 0);
1866                         free_temp(rp, temp[0]);
1867                         break;
1868                 case OPCODE_SIN:
1869                         /*
1870                          *  using a parabola:
1871                          * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1872                          * extra precision is obtained by weighting against
1873                          * itself squared.
1874                          */
1875
1876                         temp[0] = get_temp_reg(rp);
1877                         const_sin[0] = emit_const4fv(rp, SinCosConsts[0]);
1878                         const_sin[1] = emit_const4fv(rp, SinCosConsts[1]);
1879                         src[0] = t_scalar_src(rp, fpi->SrcReg[0]);
1880
1881                         /* do range reduction */
1882
1883                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1884                                    swizzle(keep(src[0]), X, X, X, X),
1885                                    swizzle(const_sin[1], Z, Z, Z, Z),
1886                                    pfs_half, 0);
1887
1888                         emit_arith(rp, PFS_OP_FRC, temp[0], WRITEMASK_X,
1889                                    swizzle(temp[0], X, X, X, X),
1890                                    undef, undef, 0);
1891
1892                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),       //2*PI
1893                                    negate(swizzle(const_sin[0], Z, Z, Z, Z)),   //PI
1894                                    0);
1895
1896                         /* SIN */
1897
1898                         emit_arith(rp, PFS_OP_MAD, temp[0],
1899                                    WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1900                                                                       Z, Z, Z,
1901                                                                       Z),
1902                                    const_sin[0], pfs_zero, 0);
1903
1904                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1905                                    swizzle(temp[0], Y, Y, Y, Y),
1906                                    absolute(swizzle(temp[0], Z, Z, Z, Z)),
1907                                    swizzle(temp[0], X, X, X, X), 0);
1908
1909                         emit_arith(rp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1910                                    swizzle(temp[0], X, X, X, X),
1911                                    absolute(swizzle(temp[0], X, X, X, X)),
1912                                    negate(swizzle(temp[0], X, X, X, X)), 0);
1913
1914                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1915                                    swizzle(temp[0], Y, Y, Y, Y),
1916                                    swizzle(const_sin[0], W, W, W, W),
1917                                    swizzle(temp[0], X, X, X, X), flags);
1918
1919                         free_temp(rp, temp[0]);
1920                         break;
1921                 case OPCODE_SLT:
1922                         src[0] = t_src(rp, fpi->SrcReg[0]);
1923                         src[1] = t_src(rp, fpi->SrcReg[1]);
1924                         temp[0] = get_temp_reg(rp);
1925                         /* temp = src0 - src1
1926                          * dest.c = (temp.c < 0.0) ? 1 : 0
1927                          */
1928                         emit_arith(rp, PFS_OP_MAD, temp[0], mask,
1929                                    src[0], pfs_one, negate(src[1]), 0);
1930                         emit_arith(rp, PFS_OP_CMP, dest, mask,
1931                                    pfs_zero, pfs_one, temp[0], 0);
1932                         free_temp(rp, temp[0]);
1933                         break;
1934                 case OPCODE_SUB:
1935                         src[0] = t_src(rp, fpi->SrcReg[0]);
1936                         src[1] = t_src(rp, fpi->SrcReg[1]);
1937                         emit_arith(rp, PFS_OP_MAD, dest, mask,
1938                                    src[0], pfs_one, negate(src[1]), flags);
1939                         break;
1940                 case OPCODE_TEX:
1941                         emit_tex(rp, fpi, R300_FPITX_OP_TEX);
1942                         break;
1943                 case OPCODE_TXB:
1944                         emit_tex(rp, fpi, R300_FPITX_OP_TXB);
1945                         break;
1946                 case OPCODE_TXP:
1947                         emit_tex(rp, fpi, R300_FPITX_OP_TXP);
1948                         break;
1949                 case OPCODE_XPD:{
1950                                 src[0] = t_src(rp, fpi->SrcReg[0]);
1951                                 src[1] = t_src(rp, fpi->SrcReg[1]);
1952                                 temp[0] = get_temp_reg(rp);
1953                                 /* temp = src0.zxy * src1.yzx */
1954                                 emit_arith(rp, PFS_OP_MAD, temp[0],
1955                                            WRITEMASK_XYZ, swizzle(keep(src[0]),
1956                                                                   Z, X, Y, W),
1957                                            swizzle(keep(src[1]), Y, Z, X, W),
1958                                            pfs_zero, 0);
1959                                 /* dest.xyz = src0.yzx * src1.zxy - temp
1960                                  * dest.w       = undefined
1961                                  * */
1962                                 emit_arith(rp, PFS_OP_MAD, dest,
1963                                            mask & WRITEMASK_XYZ, swizzle(src[0],
1964                                                                          Y, Z,
1965                                                                          X, W),
1966                                            swizzle(src[1], Z, X, Y, W),
1967                                            negate(temp[0]), flags);
1968                                 /* cleanup */
1969                                 free_temp(rp, temp[0]);
1970                                 break;
1971                         }
1972                 default:
1973                         ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
1974                         break;
1975                 }
1976
1977                 if (rp->error)
1978                         return GL_FALSE;
1979
1980         }
1981
1982         return GL_TRUE;
1983 }
1984
1985 static void insert_wpos(struct gl_program *prog)
1986 {
1987         static gl_state_index tokens[STATE_LENGTH] = {
1988                 STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
1989         };
1990         struct prog_instruction *fpi;
1991         GLuint window_index;
1992         int i = 0;
1993         GLuint tempregi = prog->NumTemporaries;
1994         /* should do something else if no temps left... */
1995         prog->NumTemporaries++;
1996
1997         fpi = _mesa_alloc_instructions(prog->NumInstructions + 3);
1998         _mesa_init_instructions(fpi, prog->NumInstructions + 3);
1999
2000         /* perspective divide */
2001         fpi[i].Opcode = OPCODE_RCP;
2002
2003         fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2004         fpi[i].DstReg.Index = tempregi;
2005         fpi[i].DstReg.WriteMask = WRITEMASK_W;
2006         fpi[i].DstReg.CondMask = COND_TR;
2007
2008         fpi[i].SrcReg[0].File = PROGRAM_INPUT;
2009         fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
2010         fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
2011         i++;
2012
2013         fpi[i].Opcode = OPCODE_MUL;
2014
2015         fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2016         fpi[i].DstReg.Index = tempregi;
2017         fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
2018         fpi[i].DstReg.CondMask = COND_TR;
2019
2020         fpi[i].SrcReg[0].File = PROGRAM_INPUT;
2021         fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
2022         fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
2023
2024         fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
2025         fpi[i].SrcReg[1].Index = tempregi;
2026         fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
2027         i++;
2028
2029         /* viewport transformation */
2030         window_index = _mesa_add_state_reference(prog->Parameters, tokens);
2031
2032         fpi[i].Opcode = OPCODE_MAD;
2033
2034         fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2035         fpi[i].DstReg.Index = tempregi;
2036         fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
2037         fpi[i].DstReg.CondMask = COND_TR;
2038
2039         fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
2040         fpi[i].SrcReg[0].Index = tempregi;
2041         fpi[i].SrcReg[0].Swizzle =
2042             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2043
2044         fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
2045         fpi[i].SrcReg[1].Index = window_index;
2046         fpi[i].SrcReg[1].Swizzle =
2047             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2048
2049         fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
2050         fpi[i].SrcReg[2].Index = window_index;
2051         fpi[i].SrcReg[2].Swizzle =
2052             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2053         i++;
2054
2055         _mesa_copy_instructions(&fpi[i], prog->Instructions,
2056                                 prog->NumInstructions);
2057
2058         free(prog->Instructions);
2059
2060         prog->Instructions = fpi;
2061
2062         prog->NumInstructions += i;
2063         fpi = &prog->Instructions[prog->NumInstructions - 1];
2064
2065         assert(fpi->Opcode == OPCODE_END);
2066
2067         for (fpi = &prog->Instructions[3]; fpi->Opcode != OPCODE_END; fpi++) {
2068                 for (i = 0; i < 3; i++)
2069                         if (fpi->SrcReg[i].File == PROGRAM_INPUT &&
2070                             fpi->SrcReg[i].Index == FRAG_ATTRIB_WPOS) {
2071                                 fpi->SrcReg[i].File = PROGRAM_TEMPORARY;
2072                                 fpi->SrcReg[i].Index = tempregi;
2073                         }
2074         }
2075 }
2076
2077 /* - Init structures
2078  * - Determine what hwregs each input corresponds to
2079  */
2080 static void init_program(r300ContextPtr r300, struct r300_fragment_program *rp)
2081 {
2082         struct r300_pfs_compile_state *cs = NULL;
2083         struct gl_fragment_program *mp = &rp->mesa_program;
2084         struct prog_instruction *fpi;
2085         GLuint InputsRead = mp->Base.InputsRead;
2086         GLuint temps_used = 0;  /* for rp->temps[] */
2087         int i, j;
2088
2089         /* New compile, reset tracking data */
2090         rp->optimization =
2091             driQueryOptioni(&r300->radeon.optionCache, "fp_optimization");
2092         rp->translated = GL_FALSE;
2093         rp->error = GL_FALSE;
2094         rp->cs = cs = &(R300_CONTEXT(rp->ctx)->state.pfs_compile);
2095         rp->tex.length = 0;
2096         rp->cur_node = 0;
2097         rp->first_node_has_tex = 0;
2098         rp->const_nr = 0;
2099         rp->max_temp_idx = 0;
2100         rp->node[0].alu_end = -1;
2101         rp->node[0].tex_end = -1;
2102
2103         _mesa_memset(cs, 0, sizeof(*rp->cs));
2104         for (i = 0; i < PFS_MAX_ALU_INST; i++) {
2105                 for (j = 0; j < 3; j++) {
2106                         cs->slot[i].vsrc[j] = SRC_CONST;
2107                         cs->slot[i].ssrc[j] = SRC_CONST;
2108                 }
2109         }
2110
2111         /* Work out what temps the Mesa inputs correspond to, this must match
2112          * what setup_rs_unit does, which shouldn't be a problem as rs_unit
2113          * configures itself based on the fragprog's InputsRead
2114          *
2115          * NOTE: this depends on get_hw_temp() allocating registers in order,
2116          * starting from register 0.
2117          */
2118
2119         /* Texcoords come first */
2120         for (i = 0; i < rp->ctx->Const.MaxTextureUnits; i++) {
2121                 if (InputsRead & (FRAG_BIT_TEX0 << i)) {
2122                         cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
2123                         cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
2124                             get_hw_temp(rp, 0);
2125                 }
2126         }
2127         InputsRead &= ~FRAG_BITS_TEX_ANY;
2128
2129         /* fragment position treated as a texcoord */
2130         if (InputsRead & FRAG_BIT_WPOS) {
2131                 cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
2132                 cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(rp, 0);
2133                 insert_wpos(&mp->Base);
2134         }
2135         InputsRead &= ~FRAG_BIT_WPOS;
2136
2137         /* Then primary colour */
2138         if (InputsRead & FRAG_BIT_COL0) {
2139                 cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
2140                 cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(rp, 0);
2141         }
2142         InputsRead &= ~FRAG_BIT_COL0;
2143
2144         /* Secondary color */
2145         if (InputsRead & FRAG_BIT_COL1) {
2146                 cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
2147                 cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(rp, 0);
2148         }
2149         InputsRead &= ~FRAG_BIT_COL1;
2150
2151         /* Anything else */
2152         if (InputsRead) {
2153                 WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
2154                 /* force read from hwreg 0 for now */
2155                 for (i = 0; i < 32; i++)
2156                         if (InputsRead & (1 << i))
2157                                 cs->inputs[i].reg = 0;
2158         }
2159
2160         /* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
2161          * That way, we can free up the reg when it's no longer needed
2162          */
2163         if (!mp->Base.Instructions) {
2164                 ERROR("No instructions found in program\n");
2165                 return;
2166         }
2167
2168         for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
2169                 int idx;
2170
2171                 for (i = 0; i < 3; i++) {
2172                         idx = fpi->SrcReg[i].Index;
2173                         switch (fpi->SrcReg[i].File) {
2174                         case PROGRAM_TEMPORARY:
2175                                 if (!(temps_used & (1 << idx))) {
2176                                         cs->temps[idx].reg = -1;
2177                                         cs->temps[idx].refcount = 1;
2178                                         temps_used |= (1 << idx);
2179                                 } else
2180                                         cs->temps[idx].refcount++;
2181                                 break;
2182                         case PROGRAM_INPUT:
2183                                 cs->inputs[idx].refcount++;
2184                                 break;
2185                         default:
2186                                 break;
2187                         }
2188                 }
2189
2190                 idx = fpi->DstReg.Index;
2191                 if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
2192                         if (!(temps_used & (1 << idx))) {
2193                                 cs->temps[idx].reg = -1;
2194                                 cs->temps[idx].refcount = 1;
2195                                 temps_used |= (1 << idx);
2196                         } else
2197                                 cs->temps[idx].refcount++;
2198                 }
2199         }
2200         cs->temp_in_use = temps_used;
2201 }
2202
2203 static void update_params(struct r300_fragment_program *rp)
2204 {
2205         struct gl_fragment_program *mp = &rp->mesa_program;
2206
2207         /* Ask Mesa nicely to fill in ParameterValues for us */
2208         if (mp->Base.Parameters)
2209                 _mesa_load_state_parameters(rp->ctx, mp->Base.Parameters);
2210 }
2211
2212 void r300TranslateFragmentShader(r300ContextPtr r300,
2213                                     struct r300_fragment_program *rp)
2214 {
2215         struct r300_pfs_compile_state *cs = NULL;
2216
2217         if (!rp->translated) {
2218
2219                 init_program(r300, rp);
2220                 cs = rp->cs;
2221
2222                 if (parse_program(rp) == GL_FALSE) {
2223                         dump_program(rp);
2224                         return;
2225                 }
2226
2227                 /* Finish off */
2228                 rp->node[rp->cur_node].alu_end =
2229                     cs->nrslots - rp->node[rp->cur_node].alu_offset - 1;
2230                 if (rp->node[rp->cur_node].tex_end < 0)
2231                         rp->node[rp->cur_node].tex_end = 0;
2232                 rp->alu_offset = 0;
2233                 rp->alu_end = cs->nrslots - 1;
2234                 rp->tex_offset = 0;
2235                 rp->tex_end = rp->tex.length ? rp->tex.length - 1 : 0;
2236                 assert(rp->node[rp->cur_node].alu_end >= 0);
2237                 assert(rp->alu_end >= 0);
2238
2239                 rp->translated = GL_TRUE;
2240                 if (RADEON_DEBUG & DEBUG_PIXEL)
2241                         dump_program(rp);
2242                 r300UpdateStateParameters(rp->ctx, _NEW_PROGRAM);
2243         }
2244
2245         update_params(rp);
2246 }
2247
2248 /* just some random things... */
2249 static void dump_program(struct r300_fragment_program *rp)
2250 {
2251         int n, i, j;
2252         static int pc = 0;
2253
2254         fprintf(stderr, "pc=%d*************************************\n", pc++);
2255
2256         fprintf(stderr, "Mesa program:\n");
2257         fprintf(stderr, "-------------\n");
2258         _mesa_print_program(&rp->mesa_program.Base);
2259         fflush(stdout);
2260
2261         fprintf(stderr, "Hardware program\n");
2262         fprintf(stderr, "----------------\n");
2263
2264         for (n = 0; n < (rp->cur_node + 1); n++) {
2265                 fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "
2266                         "alu_end: %d, tex_end: %d\n", n,
2267                         rp->node[n].alu_offset,
2268                         rp->node[n].tex_offset,
2269                         rp->node[n].alu_end, rp->node[n].tex_end);
2270
2271                 if (rp->tex.length) {
2272                         fprintf(stderr, "  TEX:\n");
2273                         for (i = rp->node[n].tex_offset;
2274                              i <= rp->node[n].tex_offset + rp->node[n].tex_end;
2275                              ++i) {
2276                                 const char *instr;
2277
2278                                 switch ((rp->tex.
2279                                          inst[i] >> R300_FPITX_OPCODE_SHIFT) &
2280                                         15) {
2281                                 case R300_FPITX_OP_TEX:
2282                                         instr = "TEX";
2283                                         break;
2284                                 case R300_FPITX_OP_KIL:
2285                                         instr = "KIL";
2286                                         break;
2287                                 case R300_FPITX_OP_TXP:
2288                                         instr = "TXP";
2289                                         break;
2290                                 case R300_FPITX_OP_TXB:
2291                                         instr = "TXB";
2292                                         break;
2293                                 default:
2294                                         instr = "UNKNOWN";
2295                                 }
2296
2297                                 fprintf(stderr,
2298                                         "    %s t%i, %c%i, texture[%i]   (%08x)\n",
2299                                         instr,
2300                                         (rp->tex.
2301                                          inst[i] >> R300_FPITX_DST_SHIFT) & 31,
2302                                         (rp->tex.
2303                                          inst[i] & R300_FPITX_SRC_CONST) ? 'c' :
2304                                         't',
2305                                         (rp->tex.
2306                                          inst[i] >> R300_FPITX_SRC_SHIFT) & 31,
2307                                         (rp->tex.
2308                                          inst[i] & R300_FPITX_IMAGE_MASK) >>
2309                                         R300_FPITX_IMAGE_SHIFT,
2310                                         rp->tex.inst[i]);
2311                         }
2312                 }
2313
2314                 for (i = rp->node[n].alu_offset;
2315                      i <= rp->node[n].alu_offset + rp->node[n].alu_end; ++i) {
2316                         char srcc[3][10], dstc[20];
2317                         char srca[3][10], dsta[20];
2318                         char argc[3][20];
2319                         char arga[3][20];
2320                         char flags[5], tmp[10];
2321
2322                         for (j = 0; j < 3; ++j) {
2323                                 int regc = rp->alu.inst[i].inst1 >> (j * 6);
2324                                 int rega = rp->alu.inst[i].inst3 >> (j * 6);
2325
2326                                 sprintf(srcc[j], "%c%i",
2327                                         (regc & 32) ? 'c' : 't', regc & 31);
2328                                 sprintf(srca[j], "%c%i",
2329                                         (rega & 32) ? 'c' : 't', rega & 31);
2330                         }
2331
2332                         dstc[0] = 0;
2333                         sprintf(flags, "%s%s%s",
2334                                 (rp->alu.inst[i].
2335                                  inst1 & R300_FPI1_DSTC_REG_X) ? "x" : "",
2336                                 (rp->alu.inst[i].
2337                                  inst1 & R300_FPI1_DSTC_REG_Y) ? "y" : "",
2338                                 (rp->alu.inst[i].
2339                                  inst1 & R300_FPI1_DSTC_REG_Z) ? "z" : "");
2340                         if (flags[0] != 0) {
2341                                 sprintf(dstc, "t%i.%s ",
2342                                         (rp->alu.inst[i].
2343                                          inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
2344                                         flags);
2345                         }
2346                         sprintf(flags, "%s%s%s",
2347                                 (rp->alu.inst[i].
2348                                  inst1 & R300_FPI1_DSTC_OUTPUT_X) ? "x" : "",
2349                                 (rp->alu.inst[i].
2350                                  inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? "y" : "",
2351                                 (rp->alu.inst[i].
2352                                  inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? "z" : "");
2353                         if (flags[0] != 0) {
2354                                 sprintf(tmp, "o%i.%s",
2355                                         (rp->alu.inst[i].
2356                                          inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
2357                                         flags);
2358                                 strcat(dstc, tmp);
2359                         }
2360
2361                         dsta[0] = 0;
2362                         if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) {
2363                                 sprintf(dsta, "t%i.w ",
2364                                         (rp->alu.inst[i].
2365                                          inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
2366                         }
2367                         if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) {
2368                                 sprintf(tmp, "o%i.w ",
2369                                         (rp->alu.inst[i].
2370                                          inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
2371                                 strcat(dsta, tmp);
2372                         }
2373                         if (rp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) {
2374                                 strcat(dsta, "Z");
2375                         }
2376
2377                         fprintf(stderr,
2378                                 "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
2379                                 "       w: %3s %3s %3s -> %-20s (%08x)\n", i,
2380                                 srcc[0], srcc[1], srcc[2], dstc,
2381                                 rp->alu.inst[i].inst1, srca[0], srca[1],
2382                                 srca[2], dsta, rp->alu.inst[i].inst3);
2383
2384                         for (j = 0; j < 3; ++j) {
2385                                 int regc = rp->alu.inst[i].inst0 >> (j * 7);
2386                                 int rega = rp->alu.inst[i].inst2 >> (j * 7);
2387                                 int d;
2388                                 char buf[20];
2389
2390                                 d = regc & 31;
2391                                 if (d < 12) {
2392                                         switch (d % 4) {
2393                                         case R300_FPI0_ARGC_SRC0C_XYZ:
2394                                                 sprintf(buf, "%s.xyz",
2395                                                         srcc[d / 4]);
2396                                                 break;
2397                                         case R300_FPI0_ARGC_SRC0C_XXX:
2398                                                 sprintf(buf, "%s.xxx",
2399                                                         srcc[d / 4]);
2400                                                 break;
2401                                         case R300_FPI0_ARGC_SRC0C_YYY:
2402                                                 sprintf(buf, "%s.yyy",
2403                                                         srcc[d / 4]);
2404                                                 break;
2405                                         case R300_FPI0_ARGC_SRC0C_ZZZ:
2406                                                 sprintf(buf, "%s.zzz",
2407                                                         srcc[d / 4]);
2408                                                 break;
2409                                         }
2410                                 } else if (d < 15) {
2411                                         sprintf(buf, "%s.www", srca[d - 12]);
2412                                 } else if (d == 20) {
2413                                         sprintf(buf, "0.0");
2414                                 } else if (d == 21) {
2415                                         sprintf(buf, "1.0");
2416                                 } else if (d == 22) {
2417                                         sprintf(buf, "0.5");
2418                                 } else if (d >= 23 && d < 32) {
2419                                         d -= 23;
2420                                         switch (d / 3) {
2421                                         case 0:
2422                                                 sprintf(buf, "%s.yzx",
2423                                                         srcc[d % 3]);
2424                                                 break;
2425                                         case 1:
2426                                                 sprintf(buf, "%s.zxy",
2427                                                         srcc[d % 3]);
2428                                                 break;
2429                                         case 2:
2430                                                 sprintf(buf, "%s.Wzy",
2431                                                         srcc[d % 3]);
2432                                                 break;
2433                                         }
2434                                 } else {
2435                                         sprintf(buf, "%i", d);
2436                                 }
2437
2438                                 sprintf(argc[j], "%s%s%s%s",
2439                                         (regc & 32) ? "-" : "",
2440                                         (regc & 64) ? "|" : "",
2441                                         buf, (regc & 64) ? "|" : "");
2442
2443                                 d = rega & 31;
2444                                 if (d < 9) {
2445                                         sprintf(buf, "%s.%c", srcc[d / 3],
2446                                                 'x' + (char)(d % 3));
2447                                 } else if (d < 12) {
2448                                         sprintf(buf, "%s.w", srca[d - 9]);
2449                                 } else if (d == 16) {
2450                                         sprintf(buf, "0.0");
2451                                 } else if (d == 17) {
2452                                         sprintf(buf, "1.0");
2453                                 } else if (d == 18) {
2454                                         sprintf(buf, "0.5");
2455                                 } else {
2456                                         sprintf(buf, "%i", d);
2457                                 }
2458
2459                                 sprintf(arga[j], "%s%s%s%s",
2460                                         (rega & 32) ? "-" : "",
2461                                         (rega & 64) ? "|" : "",
2462                                         buf, (rega & 64) ? "|" : "");
2463                         }
2464
2465                         fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
2466                                 "       w: %8s %8s %8s    op: %08x\n",
2467                                 argc[0], argc[1], argc[2],
2468                                 rp->alu.inst[i].inst0, arga[0], arga[1],
2469                                 arga[2], rp->alu.inst[i].inst2);
2470                 }
2471         }
2472 }