src/mesa/drivers/dri/r300/r300_fragprog.c

   1 /*
   2  * Copyright (C) 2005 Ben Skeggs.
   3  *
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining
   7  * a copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sublicense, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial
  16  * portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  */
  27
  28 /**
  29  * \file
  30  *
  31  * \author Ben Skeggs <darktama@iinet.net.au>
  32  *
  33  * \author Jerome Glisse <j.glisse@gmail.com>
  34  *
  35  * \todo Depth write, WPOS/FOGC inputs
  36  *
  37  * \todo FogOption
  38  *
  39  * \todo Verify results of opcodes for accuracy, I've only checked them in
  40  * specific cases.
  41  */
  42
  43 #include "glheader.h"
  44 #include "macros.h"
  45 #include "enums.h"
  46 #include "shader/prog_instruction.h"
  47 #include "shader/prog_parameter.h"
  48 #include "shader/prog_print.h"
  49
  50 #include "r300_context.h"
  51 #include "r300_fragprog.h"
  52 #include "r300_reg.h"
  53 #include "r300_state.h"
  54
  55 /*
  56  * Usefull macros and values
  57  */
  58 #define ERROR(fmt, args...) do {                        \
  59                 fprintf(stderr, "%s::%s(): " fmt "\n",  \
  60                         __FILE__, __FUNCTION__, ##args);        \
  61                 fp->error = GL_TRUE;                    \
  62         } while(0)
  63
  64 #define PFS_INVAL 0xFFFFFFFF
  65 #define COMPILE_STATE struct r300_pfs_compile_state *cs = fp->cs
  66
/* Vector (xyz) swizzle selectors.  Each value is the row index of the
 * matching entry in the v_swiz[] table, so the numbering here and the row
 * order there must stay in step.
 */
#define SWIZZLE_XYZ             0
#define SWIZZLE_XXX             1
#define SWIZZLE_YYY             2
#define SWIZZLE_ZZZ             3
#define SWIZZLE_WWW             4
#define SWIZZLE_YZX             5
#define SWIZZLE_ZXY             6
#define SWIZZLE_WZY             7
#define SWIZZLE_111             8
#define SWIZZLE_000             9
#define SWIZZLE_HHH             10

/* Shorthand for do_swizzle(): pastes each of x/y/z/w onto SWIZZLE_, packs
 * the four selectors into 3-bit fields (x at bit 0, y at bit 3, z at bit 6,
 * w at bit 9) and requests no negation.  `fp` must be in scope at the call
 * site.
 * NOTE(review): the arguments are presumably the single-component names
 * (X, Y, Z, W, ZERO, ONE) from Mesa's headers -- confirm against callers.
 */
#define swizzle(r, x, y, z, w) do_swizzle(fp, r,                \
                                          ((SWIZZLE_##x<<0)|    \
                                           (SWIZZLE_##y<<3)|    \
                                           (SWIZZLE_##z<<6)|    \
                                           (SWIZZLE_##w<<9)),   \
                                          0)
  85
  86 #define REG_TYPE_INPUT          0
  87 #define REG_TYPE_OUTPUT         1
  88 #define REG_TYPE_TEMP           2
  89 #define REG_TYPE_CONST          3
  90
  91 #define REG_TYPE_SHIFT          0
  92 #define REG_INDEX_SHIFT         2
  93 #define REG_VSWZ_SHIFT          8
  94 #define REG_SSWZ_SHIFT          13
  95 #define REG_NEGV_SHIFT          18
  96 #define REG_NEGS_SHIFT          19
  97 #define REG_ABS_SHIFT           20
  98 #define REG_NO_USE_SHIFT        21      // Hack for refcounting
  99 #define REG_VALID_SHIFT         22      // Does the register contain a defined value?
 100 #define REG_BUILTIN_SHIFT   23  // Is it a builtin (like all zero/all one)?
 101
 102 #define REG_TYPE_MASK           (0x03 << REG_TYPE_SHIFT)
 103 #define REG_INDEX_MASK          (0x3F << REG_INDEX_SHIFT)
 104 #define REG_VSWZ_MASK           (0x1F << REG_VSWZ_SHIFT)
 105 #define REG_SSWZ_MASK           (0x1F << REG_SSWZ_SHIFT)
 106 #define REG_NEGV_MASK           (0x01 << REG_NEGV_SHIFT)
 107 #define REG_NEGS_MASK           (0x01 << REG_NEGS_SHIFT)
 108 #define REG_ABS_MASK            (0x01 << REG_ABS_SHIFT)
 109 #define REG_NO_USE_MASK         (0x01 << REG_NO_USE_SHIFT)
 110 #define REG_VALID_MASK          (0x01 << REG_VALID_SHIFT)
 111 #define REG_BUILTIN_MASK        (0x01 << REG_BUILTIN_SHIFT)
 112
 113 #define REG(type, index, vswz, sswz, nouse, valid, builtin)     \
 114         (((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) |                   \
 115          ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) |                \
 116          ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |              \
 117          ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) |                \
 118          ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |  \
 119          ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |                   \
 120          ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
 121 #define REG_GET_TYPE(reg)                                               \
 122         ((reg & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
 123 #define REG_GET_INDEX(reg)                                              \
 124         ((reg & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
 125 #define REG_GET_VSWZ(reg)                                               \
 126         ((reg & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
 127 #define REG_GET_SSWZ(reg)                                               \
 128         ((reg & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
 129 #define REG_GET_NO_USE(reg)                                             \
 130         ((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
 131 #define REG_GET_VALID(reg)                                              \
 132         ((reg & REG_VALID_MASK) >> REG_VALID_SHIFT)
 133 #define REG_GET_BUILTIN(reg)                                            \
 134         ((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
 135 #define REG_SET_TYPE(reg, type)                                         \
 136         reg = ((reg & ~REG_TYPE_MASK) |                                 \
 137                ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK))
 138 #define REG_SET_INDEX(reg, index)                                       \
 139         reg = ((reg & ~REG_INDEX_MASK) |                                \
 140                ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK))
 141 #define REG_SET_VSWZ(reg, vswz)                                         \
 142         reg = ((reg & ~REG_VSWZ_MASK) |                                 \
 143                ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK))
 144 #define REG_SET_SSWZ(reg, sswz)                                         \
 145         reg = ((reg & ~REG_SSWZ_MASK) |                                 \
 146                ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
 147 #define REG_SET_NO_USE(reg, nouse)                                      \
 148         reg = ((reg & ~REG_NO_USE_MASK) |                               \
 149                ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK))
 150 #define REG_SET_VALID(reg, valid)                                       \
 151         reg = ((reg & ~REG_VALID_MASK) |                                \
 152                ((valid << REG_VALID_SHIFT) & REG_VALID_MASK))
 153 #define REG_SET_BUILTIN(reg, builtin)                                   \
 154         reg = ((reg & ~REG_BUILTIN_MASK) |                              \
 155                ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
 156 #define REG_ABS(reg)                                                    \
 157         reg = (reg | REG_ABS_MASK)
 158 #define REG_NEGV(reg)                                                   \
 159         reg = (reg | REG_NEGV_MASK)
 160 #define REG_NEGS(reg)                                                   \
 161         reg = (reg | REG_NEGS_MASK)
 162
/*
 * Data structures for fragment program generation
 */

/* Description of the r300 native hardware instructions.
 *
 *   name  - mnemonic of the operation
 *   argc  - number of source operands the operation takes
 *   v_op  - opcode for the vector part (R300_FPI0_OUTC_* value)
 *   s_op  - opcode for the scalar part (R300_FPI2_OUTA_* value), or
 *           PFS_INVAL for entries that list no scalar opcode
 *
 * NOTE(review): rows are presumably indexed by the PFS_OP_* values, so the
 * order must not change -- confirm against r300_fragprog.h.
 * NOTE(review): "DP3" pairs the vector DP3 opcode with the scalar DP4
 * opcode; this looks deliberate but should be confirmed against the
 * hardware documentation.
 */
static const struct {
        const char *name;
        int argc;
        int v_op;
        int s_op;
} r300_fpop[] = {
        /* *INDENT-OFF* */
        {"MAD", 3, R300_FPI0_OUTC_MAD, R300_FPI2_OUTA_MAD},
        {"DP3", 2, R300_FPI0_OUTC_DP3, R300_FPI2_OUTA_DP4},
        {"DP4", 2, R300_FPI0_OUTC_DP4, R300_FPI2_OUTA_DP4},
        {"MIN", 2, R300_FPI0_OUTC_MIN, R300_FPI2_OUTA_MIN},
        {"MAX", 2, R300_FPI0_OUTC_MAX, R300_FPI2_OUTA_MAX},
        {"CMP", 3, R300_FPI0_OUTC_CMP, R300_FPI2_OUTA_CMP},
        {"FRC", 1, R300_FPI0_OUTC_FRC, R300_FPI2_OUTA_FRC},
        {"EX2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_EX2},
        {"LG2", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_LG2},
        {"RCP", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RCP},
        {"RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA, R300_FPI2_OUTA_RSQ},
        {"REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA, PFS_INVAL},
        {"CMPH", 3, R300_FPI0_OUTC_CMPH, PFS_INVAL},
        /* *INDENT-ON* */
};
 190
/* Vector swizzles the r300 can support natively, with a couple of
 * cases we handle specially.
 *
 * The REG_VSWZ field of a register descriptor is a row index into
 * v_swiz[]; the row order matches the SWIZZLE_XYZ .. SWIZZLE_HHH numbering.
 */

/* Scalar selector for the constant 0.5; it is the row index of the HALF
 * entry in the s_swiz[] table.
 */
#define SWIZZLE_HALF 6

/* Build the hash of a three-component swizzle.  The fourth (w) component
 * is always filled with SWIZZLE_ZERO; do_swizzle() only ever compares the
 * x/y/z fields of these hashes.
 */
#define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
                                          SWIZZLE_##y, \
                                          SWIZZLE_##z, \
                                          SWIZZLE_ZERO))
/* Native swizzles.  The table ends with a PFS_INVAL hash, which is the
 * stop condition of the inner search loop in do_swizzle().
 */
static const struct r300_pfs_swizzle {
        GLuint hash;            /* swizzle value this matches */
        GLuint base;            /* base value for hw swizzle */
        GLuint stride;          /* difference in base between arg0/1/2 */
        GLuint flags;           /* SLOT_SRC_* flags, 0 for the constant rows */
} v_swiz[] = {
        /* *INDENT-OFF* */
        {MAKE_SWZ3(X, Y, Z), R300_FPI0_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
        {MAKE_SWZ3(X, X, X), R300_FPI0_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
        {MAKE_SWZ3(Y, Y, Y), R300_FPI0_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
        {MAKE_SWZ3(Z, Z, Z), R300_FPI0_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
        {MAKE_SWZ3(W, W, W), R300_FPI0_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
        {MAKE_SWZ3(Y, Z, X), R300_FPI0_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
        {MAKE_SWZ3(Z, X, Y), R300_FPI0_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
        {MAKE_SWZ3(W, Z, Y), R300_FPI0_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
        {MAKE_SWZ3(ONE, ONE, ONE), R300_FPI0_ARGC_ONE, 0, 0},
        {MAKE_SWZ3(ZERO, ZERO, ZERO), R300_FPI0_ARGC_ZERO, 0, 0},
        {MAKE_SWZ3(HALF, HALF, HALF), R300_FPI0_ARGC_HALF, 0, 0},
        {PFS_INVAL, 0, 0, 0},
        /* *INDENT-ON* */
};
 226
/* Used during matching of non-native swizzles: each SWZ_*_MASK covers the
 * 3-bit field of one component inside a packed swizzle value.
 */
#define SWZ_X_MASK (7 << 0)
#define SWZ_Y_MASK (7 << 3)
#define SWZ_Z_MASK (7 << 6)
#define SWZ_W_MASK (7 << 9)
/* Component subsets tried by do_swizzle(), in table order: all of xyz
 * first, then every pair, then the single components.  The table ends with
 * a PFS_INVAL hash, which is the stop condition of the outer loop in
 * do_swizzle().
 */
static const struct {
        GLuint hash;            /* used to mask matching swizzle components */
        int mask;               /* actual outmask (x = 1, y = 2, z = 4) */
        int count;              /* count of components matched */
} s_mask[] = {
        /* *INDENT-OFF* */
        {SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},
        {SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
        {SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
        {SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
        {SWZ_X_MASK, 1, 1},
        {SWZ_Y_MASK, 2, 1},
        {SWZ_Z_MASK, 4, 1},
        {PFS_INVAL, PFS_INVAL, PFS_INVAL}
        /* *INDENT-ON* */
};
 248
/* Mapping from scalar swizzle selectors to r300 native values for scalar
 * instructions.  Row 6 is the HALF entry addressed by SWIZZLE_HALF.
 * NOTE(review): rows 0-5 presumably line up with Mesa's SWIZZLE_X, _Y, _Z,
 * _W, _ZERO and _ONE values, and the table is presumably indexed by the
 * REG_SSWZ field -- confirm against the code that reads it.
 */
static const struct {
        int base;               /* hw value of swizzle */
        int stride;             /* difference between SRC0/1/2 */
        GLuint flags;           /* SLOT_SRC_* flags, 0 for the constant rows */
} s_swiz[] = {
        /* *INDENT-OFF* */
        {R300_FPI2_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
        {R300_FPI2_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
        {R300_FPI2_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
        {R300_FPI2_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
        {R300_FPI2_ARGA_ZERO, 0, 0},
        {R300_FPI2_ARGA_ONE, 0, 0},
        {R300_FPI2_ARGA_HALF, 0, 0}
        /* *INDENT-ON* */
};
 264
/* Boiler-plate register, for convenience: a temporary with index 0, the
 * identity swizzle (xyz / w) and the "valid" flag cleared.  Functions start
 * from this value and fill in the fields they need; it is also what they
 * return on error.
 */
static const GLuint undef = REG(REG_TYPE_TEMP,
                                0,
                                SWIZZLE_XYZ,
                                SWIZZLE_W,
                                GL_FALSE,
                                GL_FALSE,
                                GL_FALSE);

/* Constant one source: valid builtin whose vector and scalar swizzles both
 * select the constant 1.
 */
static const GLuint pfs_one = REG(REG_TYPE_CONST,
                                  0,
                                  SWIZZLE_111,
                                  SWIZZLE_ONE,
                                  GL_FALSE,
                                  GL_TRUE,
                                  GL_TRUE);

/* Constant half source: valid builtin whose vector and scalar swizzles both
 * select the constant 0.5.
 */
static const GLuint pfs_half = REG(REG_TYPE_CONST,
                                   0,
                                   SWIZZLE_HHH,
                                   SWIZZLE_HALF,
                                   GL_FALSE,
                                   GL_TRUE,
                                   GL_TRUE);

/* Constant zero source: valid builtin whose vector and scalar swizzles both
 * select the constant 0.
 */
static const GLuint pfs_zero = REG(REG_TYPE_CONST,
                                   0,
                                   SWIZZLE_000,
                                   SWIZZLE_ZERO,
                                   GL_FALSE,
                                   GL_TRUE,
                                   GL_TRUE);
 300
/*
 * Common function prototypes
 *
 * Forward declarations.  emit_arith() is needed by the swizzle helpers
 * (swz_native, swz_emit_partial), which call it ahead of its definition.
 */
static void dump_program(struct r300_fragment_program *fp);
static void emit_arith(struct r300_fragment_program *fp, int op,
                       GLuint dest, int mask,
                       GLuint src0, GLuint src1, GLuint src2, int flags);
 308
 309 /**
 310  * Get an R300 temporary that can be written to in the given slot.
 311  */
 312 static int get_hw_temp(struct r300_fragment_program *fp, int slot)
 313 {
 314         COMPILE_STATE;
 315         int r;
 316
 317         for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
 318                 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
 319                         break;
 320         }
 321
 322         if (r >= PFS_NUM_TEMP_REGS) {
 323                 ERROR("Out of hardware temps\n");
 324                 return 0;
 325         }
 326         // Reserved is used to avoid the following scenario:
 327         //  R300 temporary X is first assigned to Mesa temporary Y during vector ops
 328         //  R300 temporary X is then assigned to Mesa temporary Z for further vector ops
 329         //  Then scalar ops on Mesa temporary Z are emitted and move back in time
 330         //  to overwrite the value of temporary Y.
 331         // End scenario.
 332         cs->hwtemps[r].reserved = cs->hwtemps[r].free;
 333         cs->hwtemps[r].free = -1;
 334
 335         // Reset to some value that won't mess things up when the user
 336         // tries to read from a temporary that hasn't been assigned a value yet.
 337         // In the normal case, vector_valid and scalar_valid should be set to
 338         // a sane value by the first emit that writes to this temporary.
 339         cs->hwtemps[r].vector_valid = 0;
 340         cs->hwtemps[r].scalar_valid = 0;
 341
 342         if (r > fp->max_temp_idx)
 343                 fp->max_temp_idx = r;
 344
 345         return r;
 346 }
 347
 348 /**
 349  * Get an R300 temporary that will act as a TEX destination register.
 350  */
 351 static int get_hw_temp_tex(struct r300_fragment_program *fp)
 352 {
 353         COMPILE_STATE;
 354         int r;
 355
 356         for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
 357                 if (cs->used_in_node & (1 << r))
 358                         continue;
 359
 360                 // Note: Be very careful here
 361                 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
 362                         break;
 363         }
 364
 365         if (r >= PFS_NUM_TEMP_REGS)
 366                 return get_hw_temp(fp, 0);      /* Will cause an indirection */
 367
 368         cs->hwtemps[r].reserved = cs->hwtemps[r].free;
 369         cs->hwtemps[r].free = -1;
 370
 371         // Reset to some value that won't mess things up when the user
 372         // tries to read from a temporary that hasn't been assigned a value yet.
 373         // In the normal case, vector_valid and scalar_valid should be set to
 374         // a sane value by the first emit that writes to this temporary.
 375         cs->hwtemps[r].vector_valid = cs->nrslots;
 376         cs->hwtemps[r].scalar_valid = cs->nrslots;
 377
 378         if (r > fp->max_temp_idx)
 379                 fp->max_temp_idx = r;
 380
 381         return r;
 382 }
 383
 384 /**
 385  * Mark the given hardware register as free.
 386  */
 387 static void free_hw_temp(struct r300_fragment_program *fp, int idx)
 388 {
 389         COMPILE_STATE;
 390
 391         // Be very careful here. Consider sequences like
 392         //  MAD r0, r1,r2,r3
 393         //  TEX r4, ...
 394         // The TEX instruction may be moved in front of the MAD instruction
 395         // due to the way nodes work. We don't want to alias r1 and r4 in
 396         // this case.
 397         // I'm certain the register allocation could be further sanitized,
 398         // but it's tricky because of stuff that can happen inside emit_tex
 399         // and emit_arith.
 400         cs->hwtemps[idx].free = cs->nrslots + 1;
 401 }
 402
 403 /**
 404  * Create a new Mesa temporary register.
 405  */
 406 static GLuint get_temp_reg(struct r300_fragment_program *fp)
 407 {
 408         COMPILE_STATE;
 409         GLuint r = undef;
 410         GLuint index;
 411
 412         index = ffs(~cs->temp_in_use);
 413         if (!index) {
 414                 ERROR("Out of program temps\n");
 415                 return r;
 416         }
 417
 418         cs->temp_in_use |= (1 << --index);
 419         cs->temps[index].refcount = 0xFFFFFFFF;
 420         cs->temps[index].reg = -1;
 421
 422         REG_SET_TYPE(r, REG_TYPE_TEMP);
 423         REG_SET_INDEX(r, index);
 424         REG_SET_VALID(r, GL_TRUE);
 425         return r;
 426 }
 427
 428 /**
 429  * Create a new Mesa temporary register that will act as the destination
 430  * register for a texture read.
 431  */
 432 static GLuint get_temp_reg_tex(struct r300_fragment_program *fp)
 433 {
 434         COMPILE_STATE;
 435         GLuint r = undef;
 436         GLuint index;
 437
 438         index = ffs(~cs->temp_in_use);
 439         if (!index) {
 440                 ERROR("Out of program temps\n");
 441                 return r;
 442         }
 443
 444         cs->temp_in_use |= (1 << --index);
 445         cs->temps[index].refcount = 0xFFFFFFFF;
 446         cs->temps[index].reg = get_hw_temp_tex(fp);
 447
 448         REG_SET_TYPE(r, REG_TYPE_TEMP);
 449         REG_SET_INDEX(r, index);
 450         REG_SET_VALID(r, GL_TRUE);
 451         return r;
 452 }
 453
 454 /**
 455  * Free a Mesa temporary and the associated R300 temporary.
 456  */
 457 static void free_temp(struct r300_fragment_program *fp, GLuint r)
 458 {
 459         COMPILE_STATE;
 460         GLuint index = REG_GET_INDEX(r);
 461
 462         if (!(cs->temp_in_use & (1 << index)))
 463                 return;
 464
 465         if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
 466                 free_hw_temp(fp, cs->temps[index].reg);
 467                 cs->temps[index].reg = -1;
 468                 cs->temp_in_use &= ~(1 << index);
 469         } else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
 470                 free_hw_temp(fp, cs->inputs[index].reg);
 471                 cs->inputs[index].reg = -1;
 472         }
 473 }
 474
 475 /**
 476  * Emit a hardware constant/parameter.
 477  *
 478  * \p cp Stable pointer to an array of 4 floats.
 479  *  The pointer must be stable in the sense that it remains to be valid
 480  *  and hold the contents of the constant/parameter throughout the lifetime
 481  *  of the fragment program (actually, up until the next time the fragment
 482  *  program is translated).
 483  */
 484 static GLuint emit_const4fv(struct r300_fragment_program *fp,
 485                             const GLfloat * cp)
 486 {
 487         GLuint reg = undef;
 488         int index;
 489
 490         for (index = 0; index < fp->const_nr; ++index) {
 491                 if (fp->constant[index] == cp)
 492                         break;
 493         }
 494
 495         if (index >= fp->const_nr) {
 496                 if (index >= PFS_NUM_CONST_REGS) {
 497                         ERROR("Out of hw constants!\n");
 498                         return reg;
 499                 }
 500
 501                 fp->const_nr++;
 502                 fp->constant[index] = cp;
 503         }
 504
 505         REG_SET_TYPE(reg, REG_TYPE_CONST);
 506         REG_SET_INDEX(reg, index);
 507         REG_SET_VALID(reg, GL_TRUE);
 508         return reg;
 509 }
 510
 511 static inline GLuint negate(GLuint r)
 512 {
 513         REG_NEGS(r);
 514         REG_NEGV(r);
 515         return r;
 516 }
 517
 518 /* Hack, to prevent clobbering sources used multiple times when
 519  * emulating non-native instructions
 520  */
 521 static inline GLuint keep(GLuint r)
 522 {
 523         REG_SET_NO_USE(r, GL_TRUE);
 524         return r;
 525 }
 526
 527 static inline GLuint absolute(GLuint r)
 528 {
 529         REG_ABS(r);
 530         return r;
 531 }
 532
 533 static int swz_native(struct r300_fragment_program *fp,
 534                       GLuint src, GLuint * r, GLuint arbneg)
 535 {
 536         /* Native swizzle, handle negation */
 537         src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
 538
 539         if ((arbneg & 0x7) == 0x0) {
 540                 src = src & ~REG_NEGV_MASK;
 541                 *r = src;
 542         } else if ((arbneg & 0x7) == 0x7) {
 543                 src |= REG_NEGV_MASK;
 544                 *r = src;
 545         } else {
 546                 if (!REG_GET_VALID(*r))
 547                         *r = get_temp_reg(fp);
 548                 src |= REG_NEGV_MASK;
 549                 emit_arith(fp,
 550                            PFS_OP_MAD,
 551                            *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
 552                 src = src & ~REG_NEGV_MASK;
 553                 emit_arith(fp,
 554                            PFS_OP_MAD,
 555                            *r,
 556                            (arbneg ^ 0x7) | WRITEMASK_W,
 557                            src, pfs_one, pfs_zero, 0);
 558         }
 559
 560         return 3;
 561 }
 562
 563 static int swz_emit_partial(struct r300_fragment_program *fp,
 564                             GLuint src,
 565                             GLuint * r, int mask, int mc, GLuint arbneg)
 566 {
 567         GLuint tmp;
 568         GLuint wmask = 0;
 569
 570         if (!REG_GET_VALID(*r))
 571                 *r = get_temp_reg(fp);
 572
 573         /* A partial match, VSWZ/mask define what parts of the
 574          * desired swizzle we match
 575          */
 576         if (mc + s_mask[mask].count == 3) {
 577                 wmask = WRITEMASK_W;
 578                 src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
 579         }
 580
 581         tmp = arbneg & s_mask[mask].mask;
 582         if (tmp) {
 583                 tmp = tmp ^ s_mask[mask].mask;
 584                 if (tmp) {
 585                         emit_arith(fp,
 586                                    PFS_OP_MAD,
 587                                    *r,
 588                                    arbneg & s_mask[mask].mask,
 589                                    keep(src) | REG_NEGV_MASK,
 590                                    pfs_one, pfs_zero, 0);
 591                         if (!wmask) {
 592                                 REG_SET_NO_USE(src, GL_TRUE);
 593                         } else {
 594                                 REG_SET_NO_USE(src, GL_FALSE);
 595                         }
 596                         emit_arith(fp,
 597                                    PFS_OP_MAD,
 598                                    *r, tmp | wmask, src, pfs_one, pfs_zero, 0);
 599                 } else {
 600                         if (!wmask) {
 601                                 REG_SET_NO_USE(src, GL_TRUE);
 602                         } else {
 603                                 REG_SET_NO_USE(src, GL_FALSE);
 604                         }
 605                         emit_arith(fp,
 606                                    PFS_OP_MAD,
 607                                    *r,
 608                                    (arbneg & s_mask[mask].mask) | wmask,
 609                                    src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
 610                 }
 611         } else {
 612                 if (!wmask) {
 613                         REG_SET_NO_USE(src, GL_TRUE);
 614                 } else {
 615                         REG_SET_NO_USE(src, GL_FALSE);
 616                 }
 617                 emit_arith(fp, PFS_OP_MAD,
 618                            *r,
 619                            s_mask[mask].mask | wmask,
 620                            src, pfs_one, pfs_zero, 0);
 621         }
 622
 623         return s_mask[mask].count;
 624 }
 625
 626 static GLuint do_swizzle(struct r300_fragment_program *fp,
 627                          GLuint src, GLuint arbswz, GLuint arbneg)
 628 {
 629         GLuint r = undef;
 630         GLuint vswz;
 631         int c_mask = 0;
 632         int v_match = 0;
 633
 634         /* If swizzling from something without an XYZW native swizzle,
 635          * emit result to a temp, and do new swizzle from the temp.
 636          */
 637 #if 0
 638         if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
 639                 GLuint temp = get_temp_reg(fp);
 640                 emit_arith(fp,
 641                            PFS_OP_MAD,
 642                            temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
 643                 src = temp;
 644         }
 645 #endif
 646
 647         if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
 648                 GLuint vsrcswz =
 649                     (v_swiz[REG_GET_VSWZ(src)].
 650                      hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
 651                     REG_GET_SSWZ(src) << 9;
 652                 GLint i;
 653
 654                 GLuint newswz = 0;
 655                 GLuint offset;
 656                 for (i = 0; i < 4; ++i) {
 657                         offset = GET_SWZ(arbswz, i);
 658
 659                         newswz |=
 660                             (offset <= 3) ? GET_SWZ(vsrcswz,
 661                                                     offset) << i *
 662                             3 : offset << i * 3;
 663                 }
 664
 665                 arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
 666                 REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
 667         } else {
 668                 /* set scalar swizzling */
 669                 REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
 670
 671         }
 672         do {
 673                 vswz = REG_GET_VSWZ(src);
 674                 do {
 675                         int chash;
 676
 677                         REG_SET_VSWZ(src, vswz);
 678                         chash = v_swiz[REG_GET_VSWZ(src)].hash &
 679                             s_mask[c_mask].hash;
 680
 681                         if (chash == (arbswz & s_mask[c_mask].hash)) {
 682                                 if (s_mask[c_mask].count == 3) {
 683                                         v_match += swz_native(fp,
 684                                                               src, &r, arbneg);
 685                                 } else {
 686                                         v_match += swz_emit_partial(fp,
 687                                                                     src,
 688                                                                     &r,
 689                                                                     c_mask,
 690                                                                     v_match,
 691                                                                     arbneg);
 692                                 }
 693
 694                                 if (v_match == 3)
 695                                         return r;
 696
 697                                 /* Fill with something invalid.. all 0's was
 698                                  * wrong before, matched SWIZZLE_X.  So all
 699                                  * 1's will be okay for now
 700                                  */
 701                                 arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
 702                         }
 703                 } while (v_swiz[++vswz].hash != PFS_INVAL);
 704                 REG_SET_VSWZ(src, SWIZZLE_XYZ);
 705         } while (s_mask[++c_mask].hash != PFS_INVAL);
 706
 707         ERROR("should NEVER get here\n");
 708         return r;
 709 }
 710
 711 static GLuint t_src(struct r300_fragment_program *fp,
 712                     struct prog_src_register fpsrc)
 713 {
 714         GLuint r = undef;
 715
 716         switch (fpsrc.File) {
 717         case PROGRAM_TEMPORARY:
 718                 REG_SET_INDEX(r, fpsrc.Index);
 719                 REG_SET_VALID(r, GL_TRUE);
 720                 REG_SET_TYPE(r, REG_TYPE_TEMP);
 721                 break;
 722         case PROGRAM_INPUT:
 723                 REG_SET_INDEX(r, fpsrc.Index);
 724                 REG_SET_VALID(r, GL_TRUE);
 725                 REG_SET_TYPE(r, REG_TYPE_INPUT);
 726                 break;
 727         case PROGRAM_LOCAL_PARAM:
 728                 r = emit_const4fv(fp,
 729                                   fp->mesa_program.Base.LocalParams[fpsrc.
 730                                                                     Index]);
 731                 break;
 732         case PROGRAM_ENV_PARAM:
 733                 r = emit_const4fv(fp,
 734                                   fp->ctx->FragmentProgram.Parameters[fpsrc.
 735                                                                       Index]);
 736                 break;
 737         case PROGRAM_STATE_VAR:
 738         case PROGRAM_NAMED_PARAM:
 739                 r = emit_const4fv(fp,
 740                                   fp->mesa_program.Base.Parameters->
 741                                   ParameterValues[fpsrc.Index]);
 742                 break;
 743         default:
 744                 ERROR("unknown SrcReg->File %x\n", fpsrc.File);
 745                 return r;
 746         }
 747
 748         /* no point swizzling ONE/ZERO/HALF constants... */
 749         if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
 750                 r = do_swizzle(fp, r, fpsrc.Swizzle, fpsrc.NegateBase);
 751         return r;
 752 }
 753
 754 static GLuint t_scalar_src(struct r300_fragment_program *fp,
 755                            struct prog_src_register fpsrc)
 756 {
 757         struct prog_src_register src = fpsrc;
 758         int sc = GET_SWZ(fpsrc.Swizzle, 0);     /* X */
 759
 760         src.Swizzle = ((sc << 0) | (sc << 3) | (sc << 6) | (sc << 9));
 761
 762         return t_src(fp, src);
 763 }
 764
 765 static GLuint t_dst(struct r300_fragment_program *fp,
 766                     struct prog_dst_register dest)
 767 {
 768         GLuint r = undef;
 769
 770         switch (dest.File) {
 771         case PROGRAM_TEMPORARY:
 772                 REG_SET_INDEX(r, dest.Index);
 773                 REG_SET_VALID(r, GL_TRUE);
 774                 REG_SET_TYPE(r, REG_TYPE_TEMP);
 775                 return r;
 776         case PROGRAM_OUTPUT:
 777                 REG_SET_TYPE(r, REG_TYPE_OUTPUT);
 778                 switch (dest.Index) {
 779                 case FRAG_RESULT_COLR:
 780                 case FRAG_RESULT_DEPR:
 781                         REG_SET_INDEX(r, dest.Index);
 782                         REG_SET_VALID(r, GL_TRUE);
 783                         return r;
 784                 default:
 785                         ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
 786                         return r;
 787                 }
 788         default:
 789                 ERROR("Bad DstReg->File 0x%x\n", dest.File);
 790                 return r;
 791         }
 792 }
 793
 794 static int t_hw_src(struct r300_fragment_program *fp, GLuint src, GLboolean tex)
 795 {
 796         COMPILE_STATE;
 797         int idx;
 798         int index = REG_GET_INDEX(src);
 799
 800         switch (REG_GET_TYPE(src)) {
 801         case REG_TYPE_TEMP:
 802                 /* NOTE: if reg==-1 here, a source is being read that
 803                  *       hasn't been written to. Undefined results.
 804                  */
 805                 if (cs->temps[index].reg == -1)
 806                         cs->temps[index].reg = get_hw_temp(fp, cs->nrslots);
 807
 808                 idx = cs->temps[index].reg;
 809
 810                 if (!REG_GET_NO_USE(src) && (--cs->temps[index].refcount == 0))
 811                         free_temp(fp, src);
 812                 break;
 813         case REG_TYPE_INPUT:
 814                 idx = cs->inputs[index].reg;
 815
 816                 if (!REG_GET_NO_USE(src) && (--cs->inputs[index].refcount == 0))
 817                         free_hw_temp(fp, cs->inputs[index].reg);
 818                 break;
 819         case REG_TYPE_CONST:
 820                 return (index | SRC_CONST);
 821         default:
 822                 ERROR("Invalid type for source reg\n");
 823                 return (0 | SRC_CONST);
 824         }
 825
 826         if (!tex)
 827                 cs->used_in_node |= (1 << idx);
 828
 829         return idx;
 830 }
 831
 832 static int t_hw_dst(struct r300_fragment_program *fp,
 833                     GLuint dest, GLboolean tex, int slot)
 834 {
 835         COMPILE_STATE;
 836         int idx;
 837         GLuint index = REG_GET_INDEX(dest);
 838         assert(REG_GET_VALID(dest));
 839
 840         switch (REG_GET_TYPE(dest)) {
 841         case REG_TYPE_TEMP:
 842                 if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
 843                         if (!tex) {
 844                                 cs->temps[index].reg = get_hw_temp(fp, slot);
 845                         } else {
 846                                 cs->temps[index].reg = get_hw_temp_tex(fp);
 847                         }
 848                 }
 849                 idx = cs->temps[index].reg;
 850
 851                 if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0))
 852                         free_temp(fp, dest);
 853
 854                 cs->dest_in_node |= (1 << idx);
 855                 cs->used_in_node |= (1 << idx);
 856                 break;
 857         case REG_TYPE_OUTPUT:
 858                 switch (index) {
 859                 case FRAG_RESULT_COLR:
 860                         fp->node[fp->cur_node].flags |=
 861                             R300_PFS_NODE_OUTPUT_COLOR;
 862                         break;
 863                 case FRAG_RESULT_DEPR:
 864                         fp->node[fp->cur_node].flags |=
 865                             R300_PFS_NODE_OUTPUT_DEPTH;
 866                         break;
 867                 }
 868                 return index;
 869                 break;
 870         default:
 871                 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
 872                 return 0;
 873         }
 874
 875         return idx;
 876 }
 877
 878 static void emit_nop(struct r300_fragment_program *fp)
 879 {
 880         COMPILE_STATE;
 881
 882         if (cs->nrslots >= PFS_MAX_ALU_INST) {
 883                 ERROR("Out of ALU instruction slots\n");
 884                 return;
 885         }
 886
 887         fp->alu.inst[cs->nrslots].inst0 = NOP_INST0;
 888         fp->alu.inst[cs->nrslots].inst1 = NOP_INST1;
 889         fp->alu.inst[cs->nrslots].inst2 = NOP_INST2;
 890         fp->alu.inst[cs->nrslots].inst3 = NOP_INST3;
 891         cs->nrslots++;
 892 }
 893
 894 static void emit_tex(struct r300_fragment_program *fp,
 895                      struct prog_instruction *fpi, int opcode)
 896 {
 897         COMPILE_STATE;
 898         GLuint coord = t_src(fp, fpi->SrcReg[0]);
 899         GLuint dest = undef, rdest = undef;
 900         GLuint din, uin;
 901         int unit = fpi->TexSrcUnit;
 902         int hwsrc, hwdest;
 903         GLuint tempreg = 0;
 904
 905         uin = cs->used_in_node;
 906         din = cs->dest_in_node;
 907
 908         /* Resolve source/dest to hardware registers */
 909         if (opcode != R300_FPITX_OP_KIL) {
 910                 if (fpi->TexSrcTarget == TEXTURE_RECT_INDEX) {
 911                         /**
 912                          * Hardware uses [0..1]x[0..1] range for rectangle textures
 913                          * instead of [0..Width]x[0..Height].
 914                          * Add a scaling instruction.
 915                          *
 916                          * \todo Refactor this once we have proper rewriting/optimization
 917                          * support for programs.
 918                          */
 919                         gl_state_index tokens[STATE_LENGTH] = {
 920                                 STATE_INTERNAL, STATE_R300_TEXRECT_FACTOR, 0, 0,
 921                                 0
 922                         };
 923                         int factor_index;
 924                         GLuint factorreg;
 925
 926                         tokens[2] = unit;
 927                         factor_index =
 928                             _mesa_add_state_reference(fp->mesa_program.Base.
 929                                                       Parameters, tokens);
 930                         factorreg =
 931                             emit_const4fv(fp,
 932                                           fp->mesa_program.Base.Parameters->
 933                                           ParameterValues[factor_index]);
 934                         tempreg = keep(get_temp_reg(fp));
 935
 936                         emit_arith(fp, PFS_OP_MAD, tempreg, WRITEMASK_XYZW,
 937                                    coord, factorreg, pfs_zero, 0);
 938
 939                         /* Ensure correct node indirection */
 940                         uin = cs->used_in_node;
 941                         din = cs->dest_in_node;
 942
 943                         hwsrc = t_hw_src(fp, tempreg, GL_TRUE);
 944                 } else {
 945                         hwsrc = t_hw_src(fp, coord, GL_TRUE);
 946                 }
 947
 948                 dest = t_dst(fp, fpi->DstReg);
 949
 950                 /* r300 doesn't seem to be able to do TEX->output reg */
 951                 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
 952                         rdest = dest;
 953                         dest = get_temp_reg_tex(fp);
 954                 } else if (fpi->DstReg.WriteMask != WRITEMASK_XYZW) {
 955                         /* in case write mask isn't XYZW */
 956                         rdest = dest;
 957                         dest = get_temp_reg_tex(fp);
 958                 }
 959                 hwdest =
 960                     t_hw_dst(fp, dest, GL_TRUE,
 961                              fp->node[fp->cur_node].alu_offset);
 962
 963                 /* Use a temp that hasn't been used in this node, rather
 964                  * than causing an indirection
 965                  */
 966                 if (uin & (1 << hwdest)) {
 967                         free_hw_temp(fp, hwdest);
 968                         hwdest = get_hw_temp_tex(fp);
 969                         cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
 970                 }
 971         } else {
 972                 hwdest = 0;
 973                 unit = 0;
 974                 hwsrc = t_hw_src(fp, coord, GL_TRUE);
 975         }
 976
 977         /* Indirection if source has been written in this node, or if the
 978          * dest has been read/written in this node
 979          */
 980         if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
 981              (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
 982
 983                 /* Finish off current node */
 984                 if (fp->node[fp->cur_node].alu_offset == cs->nrslots)
 985                         emit_nop(fp);
 986
 987                 fp->node[fp->cur_node].alu_end =
 988                     cs->nrslots - fp->node[fp->cur_node].alu_offset - 1;
 989                 assert(fp->node[fp->cur_node].alu_end >= 0);
 990
 991                 if (++fp->cur_node >= PFS_MAX_TEX_INDIRECT) {
 992                         ERROR("too many levels of texture indirection\n");
 993                         return;
 994                 }
 995
 996                 /* Start new node */
 997                 fp->node[fp->cur_node].tex_offset = fp->tex.length;
 998                 fp->node[fp->cur_node].alu_offset = cs->nrslots;
 999                 fp->node[fp->cur_node].tex_end = -1;
1000                 fp->node[fp->cur_node].alu_end = -1;
1001                 fp->node[fp->cur_node].flags = 0;
1002                 cs->used_in_node = 0;
1003                 cs->dest_in_node = 0;
1004         }
1005
1006         if (fp->cur_node == 0)
1007                 fp->first_node_has_tex = 1;
1008
1009         fp->tex.inst[fp->tex.length++] = 0 | (hwsrc << R300_FPITX_SRC_SHIFT)
1010             | (hwdest << R300_FPITX_DST_SHIFT)
1011             | (unit << R300_FPITX_IMAGE_SHIFT)
1012             /* not entirely sure about this */
1013             | (opcode << R300_FPITX_OPCODE_SHIFT);
1014
1015         cs->dest_in_node |= (1 << hwdest);
1016         if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
1017                 cs->used_in_node |= (1 << hwsrc);
1018
1019         fp->node[fp->cur_node].tex_end++;
1020
1021         /* Copy from temp to output if needed */
1022         if (REG_GET_VALID(rdest)) {
1023                 emit_arith(fp, PFS_OP_MAD, rdest, fpi->DstReg.WriteMask, dest,
1024                            pfs_one, pfs_zero, 0);
1025                 free_temp(fp, dest);
1026         }
1027
1028         /* Free temp register */
1029         if (tempreg != 0)
1030                 free_temp(fp, tempreg);
1031 }
1032
1033 /**
1034  * Returns the first slot where we could possibly allow writing to dest,
1035  * according to register allocation.
1036  */
1037 static int get_earliest_allowed_write(struct r300_fragment_program *fp,
1038                                       GLuint dest, int mask)
1039 {
1040         COMPILE_STATE;
1041         int idx;
1042         int pos;
1043         GLuint index = REG_GET_INDEX(dest);
1044         assert(REG_GET_VALID(dest));
1045
1046         switch (REG_GET_TYPE(dest)) {
1047         case REG_TYPE_TEMP:
1048                 if (cs->temps[index].reg == -1)
1049                         return 0;
1050
1051                 idx = cs->temps[index].reg;
1052                 break;
1053         case REG_TYPE_OUTPUT:
1054                 return 0;
1055         default:
1056                 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
1057                 return 0;
1058         }
1059
1060         pos = cs->hwtemps[idx].reserved;
1061         if (mask & WRITEMASK_XYZ) {
1062                 if (pos < cs->hwtemps[idx].vector_lastread)
1063                         pos = cs->hwtemps[idx].vector_lastread;
1064         }
1065         if (mask & WRITEMASK_W) {
1066                 if (pos < cs->hwtemps[idx].scalar_lastread)
1067                         pos = cs->hwtemps[idx].scalar_lastread;
1068         }
1069
1070         return pos;
1071 }
1072
1073 /**
1074  * Allocates a slot for an ALU instruction that can consist of
1075  * a vertex part or a scalar part or both.
1076  *
1077  * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1078  * appropriate position (vector and/or scalar), and their positions are
1079  * recorded in the srcpos array.
1080  *
1081  * This function emits instruction code for the source fetch and the
1082  * argument selection. It does not emit instruction code for the
1083  * opcode or the destination selection.
1084  *
1085  * @return the index of the slot
1086  */
1087 static int find_and_prepare_slot(struct r300_fragment_program *fp,
1088                                  GLboolean emit_vop,
1089                                  GLboolean emit_sop,
1090                                  int argc, GLuint * src, GLuint dest, int mask)
1091 {
1092         COMPILE_STATE;
1093         int hwsrc[3];
1094         int srcpos[3];
1095         unsigned int used;
1096         int tempused;
1097         int tempvsrc[3];
1098         int tempssrc[3];
1099         int pos;
1100         int regnr;
1101         int i, j;
1102
1103         // Determine instruction slots, whether sources are required on
1104         // vector or scalar side, and the smallest slot number where
1105         // all source registers are available
1106         used = 0;
1107         if (emit_vop)
1108                 used |= SLOT_OP_VECTOR;
1109         if (emit_sop)
1110                 used |= SLOT_OP_SCALAR;
1111
1112         pos = get_earliest_allowed_write(fp, dest, mask);
1113
1114         if (fp->node[fp->cur_node].alu_offset > pos)
1115                 pos = fp->node[fp->cur_node].alu_offset;
1116         for (i = 0; i < argc; ++i) {
1117                 if (!REG_GET_BUILTIN(src[i])) {
1118                         if (emit_vop)
1119                                 used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
1120                         if (emit_sop)
1121                                 used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
1122                 }
1123
1124                 hwsrc[i] = t_hw_src(fp, src[i], GL_FALSE);      /* Note: sideeffects wrt refcounting! */
1125                 regnr = hwsrc[i] & 31;
1126
1127                 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1128                         if (used & (SLOT_SRC_VECTOR << i)) {
1129                                 if (cs->hwtemps[regnr].vector_valid > pos)
1130                                         pos = cs->hwtemps[regnr].vector_valid;
1131                         }
1132                         if (used & (SLOT_SRC_SCALAR << i)) {
1133                                 if (cs->hwtemps[regnr].scalar_valid > pos)
1134                                         pos = cs->hwtemps[regnr].scalar_valid;
1135                         }
1136                 }
1137         }
1138
1139         // Find a slot that fits
1140         for (;; ++pos) {
1141                 if (cs->slot[pos].used & used & SLOT_OP_BOTH)
1142                         continue;
1143
1144                 if (pos >= cs->nrslots) {
1145                         if (cs->nrslots >= PFS_MAX_ALU_INST) {
1146                                 ERROR("Out of ALU instruction slots\n");
1147                                 return -1;
1148                         }
1149
1150                         fp->alu.inst[pos].inst0 = NOP_INST0;
1151                         fp->alu.inst[pos].inst1 = NOP_INST1;
1152                         fp->alu.inst[pos].inst2 = NOP_INST2;
1153                         fp->alu.inst[pos].inst3 = NOP_INST3;
1154
1155                         cs->nrslots++;
1156                 }
1157                 // Note: When we need both parts (vector and scalar) of a source,
1158                 // we always try to put them into the same position. This makes the
1159                 // code easier to read, and it is optimal (i.e. one doesn't gain
1160                 // anything by splitting the parts).
1161                 // It also avoids headaches with swizzles that access both parts (i.e WXY)
1162                 tempused = cs->slot[pos].used;
1163                 for (i = 0; i < 3; ++i) {
1164                         tempvsrc[i] = cs->slot[pos].vsrc[i];
1165                         tempssrc[i] = cs->slot[pos].ssrc[i];
1166                 }
1167
1168                 for (i = 0; i < argc; ++i) {
1169                         int flags = (used >> i) & SLOT_SRC_BOTH;
1170
1171                         if (!flags) {
1172                                 srcpos[i] = 0;
1173                                 continue;
1174                         }
1175
1176                         for (j = 0; j < 3; ++j) {
1177                                 if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
1178                                         if (tempvsrc[j] != hwsrc[i])
1179                                                 continue;
1180                                 }
1181
1182                                 if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
1183                                         if (tempssrc[j] != hwsrc[i])
1184                                                 continue;
1185                                 }
1186
1187                                 break;
1188                         }
1189
1190                         if (j == 3)
1191                                 break;
1192
1193                         srcpos[i] = j;
1194                         tempused |= flags << j;
1195                         if (flags & SLOT_SRC_VECTOR)
1196                                 tempvsrc[j] = hwsrc[i];
1197                         if (flags & SLOT_SRC_SCALAR)
1198                                 tempssrc[j] = hwsrc[i];
1199                 }
1200
1201                 if (i == argc)
1202                         break;
1203         }
1204
1205         // Found a slot, reserve it
1206         cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
1207         for (i = 0; i < 3; ++i) {
1208                 cs->slot[pos].vsrc[i] = tempvsrc[i];
1209                 cs->slot[pos].ssrc[i] = tempssrc[i];
1210         }
1211
1212         for (i = 0; i < argc; ++i) {
1213                 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1214                         int regnr = hwsrc[i] & 31;
1215
1216                         if (used & (SLOT_SRC_VECTOR << i)) {
1217                                 if (cs->hwtemps[regnr].vector_lastread < pos)
1218                                         cs->hwtemps[regnr].vector_lastread =
1219                                             pos;
1220                         }
1221                         if (used & (SLOT_SRC_SCALAR << i)) {
1222                                 if (cs->hwtemps[regnr].scalar_lastread < pos)
1223                                         cs->hwtemps[regnr].scalar_lastread =
1224                                             pos;
1225                         }
1226                 }
1227         }
1228
1229         // Emit the source fetch code
1230         fp->alu.inst[pos].inst1 &= ~R300_FPI1_SRC_MASK;
1231         fp->alu.inst[pos].inst1 |=
1232             ((cs->slot[pos].vsrc[0] << R300_FPI1_SRC0C_SHIFT) |
1233              (cs->slot[pos].vsrc[1] << R300_FPI1_SRC1C_SHIFT) |
1234              (cs->slot[pos].vsrc[2] << R300_FPI1_SRC2C_SHIFT));
1235
1236         fp->alu.inst[pos].inst3 &= ~R300_FPI3_SRC_MASK;
1237         fp->alu.inst[pos].inst3 |=
1238             ((cs->slot[pos].ssrc[0] << R300_FPI3_SRC0A_SHIFT) |
1239              (cs->slot[pos].ssrc[1] << R300_FPI3_SRC1A_SHIFT) |
1240              (cs->slot[pos].ssrc[2] << R300_FPI3_SRC2A_SHIFT));
1241
1242         // Emit the argument selection code
1243         if (emit_vop) {
1244                 int swz[3];
1245
1246                 for (i = 0; i < 3; ++i) {
1247                         if (i < argc) {
1248                                 swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
1249                                           (srcpos[i] *
1250                                            v_swiz[REG_GET_VSWZ(src[i])].
1251                                            stride)) | ((src[i] & REG_NEGV_MASK)
1252                                                        ? ARG_NEG : 0) | ((src[i]
1253                                                                           &
1254                                                                           REG_ABS_MASK)
1255                                                                          ?
1256                                                                          ARG_ABS
1257                                                                          : 0);
1258                         } else {
1259                                 swz[i] = R300_FPI0_ARGC_ZERO;
1260                         }
1261                 }
1262
1263                 fp->alu.inst[pos].inst0 &=
1264                     ~(R300_FPI0_ARG0C_MASK | R300_FPI0_ARG1C_MASK |
1265                       R300_FPI0_ARG2C_MASK);
1266                 fp->alu.inst[pos].inst0 |=
1267                     (swz[0] << R300_FPI0_ARG0C_SHIFT) | (swz[1] <<
1268                                                          R300_FPI0_ARG1C_SHIFT)
1269                     | (swz[2] << R300_FPI0_ARG2C_SHIFT);
1270         }
1271
1272         if (emit_sop) {
1273                 int swz[3];
1274
1275                 for (i = 0; i < 3; ++i) {
1276                         if (i < argc) {
1277                                 swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
1278                                           (srcpos[i] *
1279                                            s_swiz[REG_GET_SSWZ(src[i])].
1280                                            stride)) | ((src[i] & REG_NEGV_MASK)
1281                                                        ? ARG_NEG : 0) | ((src[i]
1282                                                                           &
1283                                                                           REG_ABS_MASK)
1284                                                                          ?
1285                                                                          ARG_ABS
1286                                                                          : 0);
1287                         } else {
1288                                 swz[i] = R300_FPI2_ARGA_ZERO;
1289                         }
1290                 }
1291
1292                 fp->alu.inst[pos].inst2 &=
1293                     ~(R300_FPI2_ARG0A_MASK | R300_FPI2_ARG1A_MASK |
1294                       R300_FPI2_ARG2A_MASK);
1295                 fp->alu.inst[pos].inst2 |=
1296                     (swz[0] << R300_FPI2_ARG0A_SHIFT) | (swz[1] <<
1297                                                          R300_FPI2_ARG1A_SHIFT)
1298                     | (swz[2] << R300_FPI2_ARG2A_SHIFT);
1299         }
1300
1301         return pos;
1302 }
1303
1304 /**
1305  * Append an ALU instruction to the instruction list.
1306  */
1307 static void emit_arith(struct r300_fragment_program *fp,
1308                        int op,
1309                        GLuint dest,
1310                        int mask,
1311                        GLuint src0, GLuint src1, GLuint src2, int flags)
1312 {
1313         COMPILE_STATE;
1314         GLuint src[3] = { src0, src1, src2 };
1315         int hwdest;
1316         GLboolean emit_vop, emit_sop;
1317         int vop, sop, argc;
1318         int pos;
1319
1320         vop = r300_fpop[op].v_op;
1321         sop = r300_fpop[op].s_op;
1322         argc = r300_fpop[op].argc;
1323
1324         if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
1325             REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
1326                 if (mask & WRITEMASK_Z) {
1327                         mask = WRITEMASK_W;
1328                 } else {
1329                         return;
1330                 }
1331         }
1332
1333         emit_vop = GL_FALSE;
1334         emit_sop = GL_FALSE;
1335         if ((mask & WRITEMASK_XYZ) || vop == R300_FPI0_OUTC_DP3)
1336                 emit_vop = GL_TRUE;
1337         if ((mask & WRITEMASK_W) || vop == R300_FPI0_OUTC_REPL_ALPHA)
1338                 emit_sop = GL_TRUE;
1339
1340         pos =
1341             find_and_prepare_slot(fp, emit_vop, emit_sop, argc, src, dest,
1342                                   mask);
1343         if (pos < 0)
1344                 return;
1345
1346         hwdest = t_hw_dst(fp, dest, GL_FALSE, pos);     /* Note: Side effects wrt register allocation */
1347
1348         if (flags & PFS_FLAG_SAT) {
1349                 vop |= R300_FPI0_OUTC_SAT;
1350                 sop |= R300_FPI2_OUTA_SAT;
1351         }
1352
1353         /* Throw the pieces together and get FPI0/1 */
1354         if (emit_vop) {
1355                 fp->alu.inst[pos].inst0 |= vop;
1356
1357                 fp->alu.inst[pos].inst1 |= hwdest << R300_FPI1_DSTC_SHIFT;
1358
1359                 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1360                         if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1361                                 fp->alu.inst[pos].inst1 |=
1362                                     (mask & WRITEMASK_XYZ) <<
1363                                     R300_FPI1_DSTC_OUTPUT_MASK_SHIFT;
1364                         } else
1365                                 assert(0);
1366                 } else {
1367                         fp->alu.inst[pos].inst1 |=
1368                             (mask & WRITEMASK_XYZ) <<
1369                             R300_FPI1_DSTC_REG_MASK_SHIFT;
1370
1371                         cs->hwtemps[hwdest].vector_valid = pos + 1;
1372                 }
1373         }
1374
1375         /* And now FPI2/3 */
1376         if (emit_sop) {
1377                 fp->alu.inst[pos].inst2 |= sop;
1378
1379                 if (mask & WRITEMASK_W) {
1380                         if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1381                                 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1382                                         fp->alu.inst[pos].inst3 |=
1383                                             (hwdest << R300_FPI3_DSTA_SHIFT) |
1384                                             R300_FPI3_DSTA_OUTPUT;
1385                                 } else if (REG_GET_INDEX(dest) ==
1386                                            FRAG_RESULT_DEPR) {
1387                                         fp->alu.inst[pos].inst3 |=
1388                                             R300_FPI3_DSTA_DEPTH;
1389                                 } else
1390                                         assert(0);
1391                         } else {
1392                                 fp->alu.inst[pos].inst3 |=
1393                                     (hwdest << R300_FPI3_DSTA_SHIFT) |
1394                                     R300_FPI3_DSTA_REG;
1395
1396                                 cs->hwtemps[hwdest].scalar_valid = pos + 1;
1397                         }
1398                 }
1399         }
1400
1401         return;
1402 }
1403
#if 0
/* Currently compiled out.
 *
 * Builds an internal input register for fragment attribute \p attr.
 * Reports an error and returns undef when the program's InputsRead
 * mask does not contain the attribute.
 */
static GLuint get_attrib(struct r300_fragment_program *fp, GLuint attr)
{
        GLuint reg = undef;

        if (!(fp->mesa_program.Base.InputsRead & (1 << attr))) {
                ERROR("Attribute %d was not provided!\n", attr);
                return undef;
        }

        REG_SET_TYPE(reg, REG_TYPE_INPUT);
        REG_SET_INDEX(reg, attr);
        REG_SET_VALID(reg, GL_TRUE);

        return reg;
}
#endif
1421
1422 static GLfloat SinCosConsts[2][4] = {
1423         {
1424          1.273239545,           // 4/PI
1425          -0.405284735,          // -4/(PI*PI)
1426          3.141592654,           // PI
1427          0.2225                 // weight
1428          },
1429         {
1430          0.75,
1431          0.0,
1432          0.159154943,           // 1/(2*PI)
1433          6.283185307            // 2*PI
1434          }
1435 };
1436
1437 /**
1438  * Emit a LIT instruction.
1439  * \p flags may be PFS_FLAG_SAT
1440  *
1441  * Definition of LIT (from ARB_fragment_program):
1442  * tmp = VectorLoad(op0);
1443  * if (tmp.x < 0) tmp.x = 0;
1444  * if (tmp.y < 0) tmp.y = 0;
1445  * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
1446  * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
1447  * result.x = 1.0;
1448  * result.y = tmp.x;
1449  * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
1450  * result.w = 1.0;
1451  *
1452  * The longest path of computation is the one leading to result.z,
1453  * consisting of 5 operations. This implementation of LIT takes
1454  * 5 slots. So unless there's some special undocumented opcode,
1455  * this implementation is potentially optimal. Unfortunately,
1456  * emit_arith is a bit too conservative because it doesn't understand
1457  * partial writes to the vector component.
1458  */
1459 static const GLfloat LitConst[4] =
1460     { 127.999999, 127.999999, 127.999999, -127.999999 };
1461
1462 static void emit_lit(struct r300_fragment_program *fp,
1463                      GLuint dest, int mask, GLuint src, int flags)
1464 {
1465         COMPILE_STATE;
1466         GLuint cnst;
1467         int needTemporary;
1468         GLuint temp;
1469
1470         cnst = emit_const4fv(fp, LitConst);
1471
1472         needTemporary = 0;
1473         if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
1474                 needTemporary = 1;
1475         } else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1476                 // LIT is typically followed by DP3/DP4, so there's no point
1477                 // in creating special code for this case
1478                 needTemporary = 1;
1479         }
1480
1481         if (needTemporary) {
1482                 temp = keep(get_temp_reg(fp));
1483         } else {
1484                 temp = keep(dest);
1485         }
1486
1487         // Note: The order of emit_arith inside the slots is relevant,
1488         // because emit_arith only looks at scalar vs. vector when resolving
1489         // dependencies, and it does not consider individual vector components,
1490         // so swizzling between the two parts can create fake dependencies.
1491
1492         // First slot
1493         emit_arith(fp, PFS_OP_MAX, temp, WRITEMASK_XY,
1494                    keep(src), pfs_zero, undef, 0);
1495         emit_arith(fp, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);
1496
1497         // Second slot
1498         emit_arith(fp, PFS_OP_MIN, temp, WRITEMASK_Z,
1499                    swizzle(temp, W, W, W, W), cnst, undef, 0);
1500         emit_arith(fp, PFS_OP_LG2, temp, WRITEMASK_W,
1501                    swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
1502
1503         // Third slot
1504         // If desired, we saturate the y result here.
1505         // This does not affect the use as a condition variable in the CMP later
1506         emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_W,
1507                    temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
1508         emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_Y,
1509                    swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
1510
1511         // Fourth slot
1512         emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_X,
1513                    pfs_one, pfs_one, pfs_zero, 0);
1514         emit_arith(fp, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);
1515
1516         // Fifth slot
1517         emit_arith(fp, PFS_OP_CMP, temp, WRITEMASK_Z,
1518                    pfs_zero, swizzle(temp, W, W, W, W),
1519                    negate(swizzle(temp, Y, Y, Y, Y)), flags);
1520         emit_arith(fp, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
1521                    pfs_zero, 0);
1522
1523         if (needTemporary) {
1524                 emit_arith(fp, PFS_OP_MAD, dest, mask,
1525                            temp, pfs_one, pfs_zero, flags);
1526                 free_temp(fp, temp);
1527         } else {
1528                 // Decrease refcount of the destination
1529                 t_hw_dst(fp, dest, GL_FALSE, cs->nrslots);
1530         }
1531 }
1532
1533 static GLboolean parse_program(struct r300_fragment_program *fp)
1534 {
1535         struct gl_fragment_program *mp = &fp->mesa_program;
1536         const struct prog_instruction *inst = mp->Base.Instructions;
1537         struct prog_instruction *fpi;
1538         GLuint src[3], dest, temp[2];
1539         int flags, mask = 0;
1540         int const_sin[2];
1541
1542         if (!inst || inst[0].Opcode == OPCODE_END) {
1543                 ERROR("empty program?\n");
1544                 return GL_FALSE;
1545         }
1546
1547         for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
1548                 if (fpi->SaturateMode == SATURATE_ZERO_ONE)
1549                         flags = PFS_FLAG_SAT;
1550                 else
1551                         flags = 0;
1552
1553                 if (fpi->Opcode != OPCODE_KIL) {
1554                         dest = t_dst(fp, fpi->DstReg);
1555                         mask = fpi->DstReg.WriteMask;
1556                 }
1557
1558                 switch (fpi->Opcode) {
1559                 case OPCODE_ABS:
1560                         src[0] = t_src(fp, fpi->SrcReg[0]);
1561                         emit_arith(fp, PFS_OP_MAD, dest, mask,
1562                                    absolute(src[0]), pfs_one, pfs_zero, flags);
1563                         break;
1564                 case OPCODE_ADD:
1565                         src[0] = t_src(fp, fpi->SrcReg[0]);
1566                         src[1] = t_src(fp, fpi->SrcReg[1]);
1567                         emit_arith(fp, PFS_OP_MAD, dest, mask,
1568                                    src[0], pfs_one, src[1], flags);
1569                         break;
1570                 case OPCODE_CMP:
1571                         src[0] = t_src(fp, fpi->SrcReg[0]);
1572                         src[1] = t_src(fp, fpi->SrcReg[1]);
1573                         src[2] = t_src(fp, fpi->SrcReg[2]);
1574                         /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1575                          *    r300 - if src2.c < 0.0 ? src1.c : src0.c
1576                          */
1577                         emit_arith(fp, PFS_OP_CMP, dest, mask,
1578                                    src[2], src[1], src[0], flags);
1579                         break;
1580                 case OPCODE_COS:
1581                         /*
1582                          * cos using a parabola (see SIN):
1583                          * cos(x):
1584                          *   x = (x/(2*PI))+0.75
1585                          *   x = frac(x)
1586                          *   x = (x*2*PI)-PI
1587                          *   result = sin(x)
1588                          */
1589                         temp[0] = get_temp_reg(fp);
1590                         const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
1591                         const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
1592                         src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
1593
1594                         /* add 0.5*PI and do range reduction */
1595
1596                         emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1597                                    swizzle(src[0], X, X, X, X),
1598                                    swizzle(const_sin[1], Z, Z, Z, Z),
1599                                    swizzle(const_sin[1], X, X, X, X), 0);
1600
1601                         emit_arith(fp, PFS_OP_FRC, temp[0], WRITEMASK_X,
1602                                    swizzle(temp[0], X, X, X, X),
1603                                    undef, undef, 0);
1604
1605                         emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),       //2*PI
1606                                    negate(swizzle(const_sin[0], Z, Z, Z, Z)),   //-PI
1607                                    0);
1608
1609                         /* SIN */
1610
1611                         emit_arith(fp, PFS_OP_MAD, temp[0],
1612                                    WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1613                                                                       Z, Z, Z,
1614                                                                       Z),
1615                                    const_sin[0], pfs_zero, 0);
1616
1617                         emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1618                                    swizzle(temp[0], Y, Y, Y, Y),
1619                                    absolute(swizzle(temp[0], Z, Z, Z, Z)),
1620                                    swizzle(temp[0], X, X, X, X), 0);
1621
1622                         emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1623                                    swizzle(temp[0], X, X, X, X),
1624                                    absolute(swizzle(temp[0], X, X, X, X)),
1625                                    negate(swizzle(temp[0], X, X, X, X)), 0);
1626
1627                         emit_arith(fp, PFS_OP_MAD, dest, mask,
1628                                    swizzle(temp[0], Y, Y, Y, Y),
1629                                    swizzle(const_sin[0], W, W, W, W),
1630                                    swizzle(temp[0], X, X, X, X), flags);
1631
1632                         free_temp(fp, temp[0]);
1633                         break;
1634                 case OPCODE_DP3:
1635                         src[0] = t_src(fp, fpi->SrcReg[0]);
1636                         src[1] = t_src(fp, fpi->SrcReg[1]);
1637                         emit_arith(fp, PFS_OP_DP3, dest, mask,
1638                                    src[0], src[1], undef, flags);
1639                         break;
1640                 case OPCODE_DP4:
1641                         src[0] = t_src(fp, fpi->SrcReg[0]);
1642                         src[1] = t_src(fp, fpi->SrcReg[1]);
1643                         emit_arith(fp, PFS_OP_DP4, dest, mask,
1644                                    src[0], src[1], undef, flags);
1645                         break;
1646                 case OPCODE_DPH:
1647                         src[0] = t_src(fp, fpi->SrcReg[0]);
1648                         src[1] = t_src(fp, fpi->SrcReg[1]);
1649                         /* src0.xyz1 -> temp
1650                          * DP4 dest, temp, src1
1651                          */
1652 #if 0
1653                         temp[0] = get_temp_reg(fp);
1654                         src[0].s_swz = SWIZZLE_ONE;
1655                         emit_arith(fp, PFS_OP_MAD, temp[0], mask,
1656                                    src[0], pfs_one, pfs_zero, 0);
1657                         emit_arith(fp, PFS_OP_DP4, dest, mask,
1658                                    temp[0], src[1], undef, flags);
1659                         free_temp(fp, temp[0]);
1660 #else
1661                         emit_arith(fp, PFS_OP_DP4, dest, mask,
1662                                    swizzle(src[0], X, Y, Z, ONE), src[1],
1663                                    undef, flags);
1664 #endif
1665                         break;
1666                 case OPCODE_DST:
1667                         src[0] = t_src(fp, fpi->SrcReg[0]);
1668                         src[1] = t_src(fp, fpi->SrcReg[1]);
1669                         /* dest.y = src0.y * src1.y */
1670                         if (mask & WRITEMASK_Y)
1671                                 emit_arith(fp, PFS_OP_MAD, dest, WRITEMASK_Y,
1672                                            keep(src[0]), keep(src[1]),
1673                                            pfs_zero, flags);
1674                         /* dest.z = src0.z */
1675                         if (mask & WRITEMASK_Z)
1676                                 emit_arith(fp, PFS_OP_MAD, dest, WRITEMASK_Z,
1677                                            src[0], pfs_one, pfs_zero, flags);
1678                         /* result.x = 1.0
1679                          * result.w = src1.w */
1680                         if (mask & WRITEMASK_XW) {
1681                                 REG_SET_VSWZ(src[1], SWIZZLE_111);      /*Cheat */
1682                                 emit_arith(fp, PFS_OP_MAD, dest,
1683                                            mask & WRITEMASK_XW,
1684                                            src[1], pfs_one, pfs_zero, flags);
1685                         }
1686                         break;
1687                 case OPCODE_EX2:
1688                         src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
1689                         emit_arith(fp, PFS_OP_EX2, dest, mask,
1690                                    src[0], undef, undef, flags);
1691                         break;
1692                 case OPCODE_FLR:
1693                         src[0] = t_src(fp, fpi->SrcReg[0]);
1694                         temp[0] = get_temp_reg(fp);
1695                         /* FRC temp, src0
1696                          * MAD dest, src0, 1.0, -temp
1697                          */
1698                         emit_arith(fp, PFS_OP_FRC, temp[0], mask,
1699                                    keep(src[0]), undef, undef, 0);
1700                         emit_arith(fp, PFS_OP_MAD, dest, mask,
1701                                    src[0], pfs_one, negate(temp[0]), flags);
1702                         free_temp(fp, temp[0]);
1703                         break;
1704                 case OPCODE_FRC:
1705                         src[0] = t_src(fp, fpi->SrcReg[0]);
1706                         emit_arith(fp, PFS_OP_FRC, dest, mask,
1707                                    src[0], undef, undef, flags);
1708                         break;
1709                 case OPCODE_KIL:
1710                         emit_tex(fp, fpi, R300_FPITX_OP_KIL);
1711                         break;
1712                 case OPCODE_LG2:
1713                         src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
1714                         emit_arith(fp, PFS_OP_LG2, dest, mask,
1715                                    src[0], undef, undef, flags);
1716                         break;
1717                 case OPCODE_LIT:
1718                         src[0] = t_src(fp, fpi->SrcReg[0]);
1719                         emit_lit(fp, dest, mask, src[0], flags);
1720                         break;
1721                 case OPCODE_LRP:
1722                         src[0] = t_src(fp, fpi->SrcReg[0]);
1723                         src[1] = t_src(fp, fpi->SrcReg[1]);
1724                         src[2] = t_src(fp, fpi->SrcReg[2]);
1725                         /* result = tmp0tmp1 + (1 - tmp0)tmp2
1726                          *        = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1727                          *     MAD temp, -tmp0, tmp2, tmp2
1728                          *     MAD result, tmp0, tmp1, temp
1729                          */
1730                         temp[0] = get_temp_reg(fp);
1731                         emit_arith(fp, PFS_OP_MAD, temp[0], mask,
1732                                    negate(keep(src[0])), keep(src[2]), src[2],
1733                                    0);
1734                         emit_arith(fp, PFS_OP_MAD, dest, mask,
1735                                    src[0], src[1], temp[0], flags);
1736                         free_temp(fp, temp[0]);
1737                         break;
1738                 case OPCODE_MAD:
1739                         src[0] = t_src(fp, fpi->SrcReg[0]);
1740                         src[1] = t_src(fp, fpi->SrcReg[1]);
1741                         src[2] = t_src(fp, fpi->SrcReg[2]);
1742                         emit_arith(fp, PFS_OP_MAD, dest, mask,
1743                                    src[0], src[1], src[2], flags);
1744                         break;
1745                 case OPCODE_MAX:
1746                         src[0] = t_src(fp, fpi->SrcReg[0]);
1747                         src[1] = t_src(fp, fpi->SrcReg[1]);
1748                         emit_arith(fp, PFS_OP_MAX, dest, mask,
1749                                    src[0], src[1], undef, flags);
1750                         break;
1751                 case OPCODE_MIN:
1752                         src[0] = t_src(fp, fpi->SrcReg[0]);
1753                         src[1] = t_src(fp, fpi->SrcReg[1]);
1754                         emit_arith(fp, PFS_OP_MIN, dest, mask,
1755                                    src[0], src[1], undef, flags);
1756                         break;
1757                 case OPCODE_MOV:
1758                 case OPCODE_SWZ:
1759                         src[0] = t_src(fp, fpi->SrcReg[0]);
1760                         emit_arith(fp, PFS_OP_MAD, dest, mask,
1761                                    src[0], pfs_one, pfs_zero, flags);
1762                         break;
1763                 case OPCODE_MUL:
1764                         src[0] = t_src(fp, fpi->SrcReg[0]);
1765                         src[1] = t_src(fp, fpi->SrcReg[1]);
1766                         emit_arith(fp, PFS_OP_MAD, dest, mask,
1767                                    src[0], src[1], pfs_zero, flags);
1768                         break;
1769                 case OPCODE_POW:
1770                         src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
1771                         src[1] = t_scalar_src(fp, fpi->SrcReg[1]);
1772                         temp[0] = get_temp_reg(fp);
1773                         emit_arith(fp, PFS_OP_LG2, temp[0], WRITEMASK_W,
1774                                    src[0], undef, undef, 0);
1775                         emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_W,
1776                                    temp[0], src[1], pfs_zero, 0);
1777                         emit_arith(fp, PFS_OP_EX2, dest, fpi->DstReg.WriteMask,
1778                                    temp[0], undef, undef, 0);
1779                         free_temp(fp, temp[0]);
1780                         break;
1781                 case OPCODE_RCP:
1782                         src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
1783                         emit_arith(fp, PFS_OP_RCP, dest, mask,
1784                                    src[0], undef, undef, flags);
1785                         break;
1786                 case OPCODE_RSQ:
1787                         src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
1788                         emit_arith(fp, PFS_OP_RSQ, dest, mask,
1789                                    absolute(src[0]), pfs_zero, pfs_zero, flags);
1790                         break;
1791                 case OPCODE_SCS:
1792                         /*
1793                          * scs using a parabola :
1794                          * scs(x):
1795                          *   result.x = sin(-abs(x)+0.5*PI)  (cos)
1796                          *   result.y = sin(x)               (sin)
1797                          *
1798                          */
1799                         temp[0] = get_temp_reg(fp);
1800                         temp[1] = get_temp_reg(fp);
1801                         const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
1802                         const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
1803                         src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
1804
1805                         /* x = -abs(x)+0.5*PI */
1806                         emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(const_sin[0], Z, Z, Z, Z),     //PI
1807                                    pfs_half,
1808                                    negate(abs
1809                                           (swizzle(keep(src[0]), X, X, X, X))),
1810                                    0);
1811
1812                         /* C*x (sin) */
1813                         emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_W,
1814                                    swizzle(const_sin[0], Y, Y, Y, Y),
1815                                    swizzle(keep(src[0]), X, X, X, X),
1816                                    pfs_zero, 0);
1817
1818                         /* B*x, C*x (cos) */
1819                         emit_arith(fp, PFS_OP_MAD, temp[0],
1820                                    WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1821                                                                       Z, Z, Z,
1822                                                                       Z),
1823                                    const_sin[0], pfs_zero, 0);
1824
1825                         /* B*x (sin) */
1826                         emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_W,
1827                                    swizzle(const_sin[0], X, X, X, X),
1828                                    keep(src[0]), pfs_zero, 0);
1829
1830                         /* y = B*x + C*x*abs(x) (sin) */
1831                         emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_Z,
1832                                    absolute(src[0]),
1833                                    swizzle(temp[0], W, W, W, W),
1834                                    swizzle(temp[1], W, W, W, W), 0);
1835
1836                         /* y = B*x + C*x*abs(x) (cos) */
1837                         emit_arith(fp, PFS_OP_MAD, temp[1], WRITEMASK_W,
1838                                    swizzle(temp[0], Y, Y, Y, Y),
1839                                    absolute(swizzle(temp[0], Z, Z, Z, Z)),
1840                                    swizzle(temp[0], X, X, X, X), 0);
1841
1842                         /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1843                         emit_arith(fp, PFS_OP_MAD, temp[0],
1844                                    WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1],
1845                                                                       W, Z, Y,
1846                                                                       X),
1847                                    absolute(swizzle(temp[1], W, Z, Y, X)),
1848                                    negate(swizzle(temp[1], W, Z, Y, X)), 0);
1849
1850                         /* dest.xy = mad(temp.xy, P, temp2.wz) */
1851                         emit_arith(fp, PFS_OP_MAD, dest,
1852                                    mask & (WRITEMASK_X | WRITEMASK_Y), temp[0],
1853                                    swizzle(const_sin[0], W, W, W, W),
1854                                    swizzle(temp[1], W, Z, Y, X), flags);
1855
1856                         free_temp(fp, temp[0]);
1857                         free_temp(fp, temp[1]);
1858                         break;
1859                 case OPCODE_SGE:
1860                         src[0] = t_src(fp, fpi->SrcReg[0]);
1861                         src[1] = t_src(fp, fpi->SrcReg[1]);
1862                         temp[0] = get_temp_reg(fp);
1863                         /* temp = src0 - src1
1864                          * dest.c = (temp.c < 0.0) ? 0 : 1
1865                          */
1866                         emit_arith(fp, PFS_OP_MAD, temp[0], mask,
1867                                    src[0], pfs_one, negate(src[1]), 0);
1868                         emit_arith(fp, PFS_OP_CMP, dest, mask,
1869                                    pfs_one, pfs_zero, temp[0], 0);
1870                         free_temp(fp, temp[0]);
1871                         break;
1872                 case OPCODE_SIN:
1873                         /*
1874                          *  using a parabola:
1875                          * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1876                          * extra precision is obtained by weighting against
1877                          * itself squared.
1878                          */
1879
1880                         temp[0] = get_temp_reg(fp);
1881                         const_sin[0] = emit_const4fv(fp, SinCosConsts[0]);
1882                         const_sin[1] = emit_const4fv(fp, SinCosConsts[1]);
1883                         src[0] = t_scalar_src(fp, fpi->SrcReg[0]);
1884
1885                         /* do range reduction */
1886
1887                         emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1888                                    swizzle(keep(src[0]), X, X, X, X),
1889                                    swizzle(const_sin[1], Z, Z, Z, Z),
1890                                    pfs_half, 0);
1891
1892                         emit_arith(fp, PFS_OP_FRC, temp[0], WRITEMASK_X,
1893                                    swizzle(temp[0], X, X, X, X),
1894                                    undef, undef, 0);
1895
1896                         emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),       //2*PI
1897                                    negate(swizzle(const_sin[0], Z, Z, Z, Z)),   //PI
1898                                    0);
1899
1900                         /* SIN */
1901
1902                         emit_arith(fp, PFS_OP_MAD, temp[0],
1903                                    WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1904                                                                       Z, Z, Z,
1905                                                                       Z),
1906                                    const_sin[0], pfs_zero, 0);
1907
1908                         emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_X,
1909                                    swizzle(temp[0], Y, Y, Y, Y),
1910                                    absolute(swizzle(temp[0], Z, Z, Z, Z)),
1911                                    swizzle(temp[0], X, X, X, X), 0);
1912
1913                         emit_arith(fp, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1914                                    swizzle(temp[0], X, X, X, X),
1915                                    absolute(swizzle(temp[0], X, X, X, X)),
1916                                    negate(swizzle(temp[0], X, X, X, X)), 0);
1917
1918                         emit_arith(fp, PFS_OP_MAD, dest, mask,
1919                                    swizzle(temp[0], Y, Y, Y, Y),
1920                                    swizzle(const_sin[0], W, W, W, W),
1921                                    swizzle(temp[0], X, X, X, X), flags);
1922
1923                         free_temp(fp, temp[0]);
1924                         break;
1925                 case OPCODE_SLT:
1926                         src[0] = t_src(fp, fpi->SrcReg[0]);
1927                         src[1] = t_src(fp, fpi->SrcReg[1]);
1928                         temp[0] = get_temp_reg(fp);
1929                         /* temp = src0 - src1
1930                          * dest.c = (temp.c < 0.0) ? 1 : 0
1931                          */
1932                         emit_arith(fp, PFS_OP_MAD, temp[0], mask,
1933                                    src[0], pfs_one, negate(src[1]), 0);
1934                         emit_arith(fp, PFS_OP_CMP, dest, mask,
1935                                    pfs_zero, pfs_one, temp[0], 0);
1936                         free_temp(fp, temp[0]);
1937                         break;
1938                 case OPCODE_SUB:
1939                         src[0] = t_src(fp, fpi->SrcReg[0]);
1940                         src[1] = t_src(fp, fpi->SrcReg[1]);
1941                         emit_arith(fp, PFS_OP_MAD, dest, mask,
1942                                    src[0], pfs_one, negate(src[1]), flags);
1943                         break;
1944                 case OPCODE_TEX:
1945                         emit_tex(fp, fpi, R300_FPITX_OP_TEX);
1946                         break;
1947                 case OPCODE_TXB:
1948                         emit_tex(fp, fpi, R300_FPITX_OP_TXB);
1949                         break;
1950                 case OPCODE_TXP:
1951                         emit_tex(fp, fpi, R300_FPITX_OP_TXP);
1952                         break;
1953                 case OPCODE_XPD:{
1954                                 src[0] = t_src(fp, fpi->SrcReg[0]);
1955                                 src[1] = t_src(fp, fpi->SrcReg[1]);
1956                                 temp[0] = get_temp_reg(fp);
1957                                 /* temp = src0.zxy * src1.yzx */
1958                                 emit_arith(fp, PFS_OP_MAD, temp[0],
1959                                            WRITEMASK_XYZ, swizzle(keep(src[0]),
1960                                                                   Z, X, Y, W),
1961                                            swizzle(keep(src[1]), Y, Z, X, W),
1962                                            pfs_zero, 0);
1963                                 /* dest.xyz = src0.yzx * src1.zxy - temp
1964                                  * dest.w       = undefined
1965                                  * */
1966                                 emit_arith(fp, PFS_OP_MAD, dest,
1967                                            mask & WRITEMASK_XYZ, swizzle(src[0],
1968                                                                          Y, Z,
1969                                                                          X, W),
1970                                            swizzle(src[1], Z, X, Y, W),
1971                                            negate(temp[0]), flags);
1972                                 /* cleanup */
1973                                 free_temp(fp, temp[0]);
1974                                 break;
1975                         }
1976                 default:
1977                         ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
1978                         break;
1979                 }
1980
1981                 if (fp->error)
1982                         return GL_FALSE;
1983
1984         }
1985
1986         return GL_TRUE;
1987 }
1988
1989 static void insert_wpos(struct gl_program *prog)
1990 {
1991         static gl_state_index tokens[STATE_LENGTH] = {
1992                 STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
1993         };
1994         struct prog_instruction *fpi;
1995         GLuint window_index;
1996         int i = 0;
1997         GLuint tempregi = prog->NumTemporaries;
1998         /* should do something else if no temps left... */
1999         prog->NumTemporaries++;
2000
2001         fpi = _mesa_alloc_instructions(prog->NumInstructions + 3);
2002         _mesa_init_instructions(fpi, prog->NumInstructions + 3);
2003
2004         /* perspective divide */
2005         fpi[i].Opcode = OPCODE_RCP;
2006
2007         fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2008         fpi[i].DstReg.Index = tempregi;
2009         fpi[i].DstReg.WriteMask = WRITEMASK_W;
2010         fpi[i].DstReg.CondMask = COND_TR;
2011
2012         fpi[i].SrcReg[0].File = PROGRAM_INPUT;
2013         fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
2014         fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
2015         i++;
2016
2017         fpi[i].Opcode = OPCODE_MUL;
2018
2019         fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2020         fpi[i].DstReg.Index = tempregi;
2021         fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
2022         fpi[i].DstReg.CondMask = COND_TR;
2023
2024         fpi[i].SrcReg[0].File = PROGRAM_INPUT;
2025         fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
2026         fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
2027
2028         fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
2029         fpi[i].SrcReg[1].Index = tempregi;
2030         fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
2031         i++;
2032
2033         /* viewport transformation */
2034         window_index = _mesa_add_state_reference(prog->Parameters, tokens);
2035
2036         fpi[i].Opcode = OPCODE_MAD;
2037
2038         fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2039         fpi[i].DstReg.Index = tempregi;
2040         fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
2041         fpi[i].DstReg.CondMask = COND_TR;
2042
2043         fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
2044         fpi[i].SrcReg[0].Index = tempregi;
2045         fpi[i].SrcReg[0].Swizzle =
2046             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2047
2048         fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
2049         fpi[i].SrcReg[1].Index = window_index;
2050         fpi[i].SrcReg[1].Swizzle =
2051             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2052
2053         fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
2054         fpi[i].SrcReg[2].Index = window_index;
2055         fpi[i].SrcReg[2].Swizzle =
2056             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2057         i++;
2058
2059         _mesa_copy_instructions(&fpi[i], prog->Instructions,
2060                                 prog->NumInstructions);
2061
2062         free(prog->Instructions);
2063
2064         prog->Instructions = fpi;
2065
2066         prog->NumInstructions += i;
2067         fpi = &prog->Instructions[prog->NumInstructions - 1];
2068
2069         assert(fpi->Opcode == OPCODE_END);
2070
2071         for (fpi = &prog->Instructions[3]; fpi->Opcode != OPCODE_END; fpi++) {
2072                 for (i = 0; i < 3; i++)
2073                         if (fpi->SrcReg[i].File == PROGRAM_INPUT &&
2074                             fpi->SrcReg[i].Index == FRAG_ATTRIB_WPOS) {
2075                                 fpi->SrcReg[i].File = PROGRAM_TEMPORARY;
2076                                 fpi->SrcReg[i].Index = tempregi;
2077                         }
2078         }
2079 }
2080
2081 /* - Init structures
2082  * - Determine what hwregs each input corresponds to
2083  */
2084 static void init_program(r300ContextPtr r300, struct r300_fragment_program *fp)
2085 {
2086         struct r300_pfs_compile_state *cs = NULL;
2087         struct gl_fragment_program *mp = &fp->mesa_program;
2088         struct prog_instruction *fpi;
2089         GLuint InputsRead = mp->Base.InputsRead;
2090         GLuint temps_used = 0;  /* for fp->temps[] */
2091         int i, j;
2092
2093         /* New compile, reset tracking data */
2094         fp->optimization =
2095             driQueryOptioni(&r300->radeon.optionCache, "fp_optimization");
2096         fp->translated = GL_FALSE;
2097         fp->error = GL_FALSE;
2098         fp->cs = cs = &(R300_CONTEXT(fp->ctx)->state.pfs_compile);
2099         fp->tex.length = 0;
2100         fp->cur_node = 0;
2101         fp->first_node_has_tex = 0;
2102         fp->const_nr = 0;
2103         fp->max_temp_idx = 0;
2104         fp->node[0].alu_end = -1;
2105         fp->node[0].tex_end = -1;
2106
2107         _mesa_memset(cs, 0, sizeof(*fp->cs));
2108         for (i = 0; i < PFS_MAX_ALU_INST; i++) {
2109                 for (j = 0; j < 3; j++) {
2110                         cs->slot[i].vsrc[j] = SRC_CONST;
2111                         cs->slot[i].ssrc[j] = SRC_CONST;
2112                 }
2113         }
2114
2115         /* Work out what temps the Mesa inputs correspond to, this must match
2116          * what setup_rs_unit does, which shouldn't be a problem as rs_unit
2117          * configures itself based on the fragprog's InputsRead
2118          *
2119          * NOTE: this depends on get_hw_temp() allocating registers in order,
2120          * starting from register 0.
2121          */
2122
2123         /* Texcoords come first */
2124         for (i = 0; i < fp->ctx->Const.MaxTextureUnits; i++) {
2125                 if (InputsRead & (FRAG_BIT_TEX0 << i)) {
2126                         cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
2127                         cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
2128                             get_hw_temp(fp, 0);
2129                 }
2130         }
2131         InputsRead &= ~FRAG_BITS_TEX_ANY;
2132
2133         /* fragment position treated as a texcoord */
2134         if (InputsRead & FRAG_BIT_WPOS) {
2135                 cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
2136                 cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(fp, 0);
2137                 insert_wpos(&mp->Base);
2138         }
2139         InputsRead &= ~FRAG_BIT_WPOS;
2140
2141         /* Then primary colour */
2142         if (InputsRead & FRAG_BIT_COL0) {
2143                 cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
2144                 cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(fp, 0);
2145         }
2146         InputsRead &= ~FRAG_BIT_COL0;
2147
2148         /* Secondary color */
2149         if (InputsRead & FRAG_BIT_COL1) {
2150                 cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
2151                 cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(fp, 0);
2152         }
2153         InputsRead &= ~FRAG_BIT_COL1;
2154
2155         /* Anything else */
2156         if (InputsRead) {
2157                 WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
2158                 /* force read from hwreg 0 for now */
2159                 for (i = 0; i < 32; i++)
2160                         if (InputsRead & (1 << i))
2161                                 cs->inputs[i].reg = 0;
2162         }
2163
2164         /* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
2165          * That way, we can free up the reg when it's no longer needed
2166          */
2167         if (!mp->Base.Instructions) {
2168                 ERROR("No instructions found in program\n");
2169                 return;
2170         }
2171
2172         for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
2173                 int idx;
2174
2175                 for (i = 0; i < 3; i++) {
2176                         idx = fpi->SrcReg[i].Index;
2177                         switch (fpi->SrcReg[i].File) {
2178                         case PROGRAM_TEMPORARY:
2179                                 if (!(temps_used & (1 << idx))) {
2180                                         cs->temps[idx].reg = -1;
2181                                         cs->temps[idx].refcount = 1;
2182                                         temps_used |= (1 << idx);
2183                                 } else
2184                                         cs->temps[idx].refcount++;
2185                                 break;
2186                         case PROGRAM_INPUT:
2187                                 cs->inputs[idx].refcount++;
2188                                 break;
2189                         default:
2190                                 break;
2191                         }
2192                 }
2193
2194                 idx = fpi->DstReg.Index;
2195                 if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
2196                         if (!(temps_used & (1 << idx))) {
2197                                 cs->temps[idx].reg = -1;
2198                                 cs->temps[idx].refcount = 1;
2199                                 temps_used |= (1 << idx);
2200                         } else
2201                                 cs->temps[idx].refcount++;
2202                 }
2203         }
2204         cs->temp_in_use = temps_used;
2205 }
2206
2207 static void update_params(struct r300_fragment_program *fp)
2208 {
2209         struct gl_fragment_program *mp = &fp->mesa_program;
2210
2211         /* Ask Mesa nicely to fill in ParameterValues for us */
2212         if (mp->Base.Parameters)
2213                 _mesa_load_state_parameters(fp->ctx, mp->Base.Parameters);
2214 }
2215
2216 void r300TranslateFragmentShader(r300ContextPtr r300,
2217                                  struct r300_fragment_program *fp)
2218 {
2219         struct r300_pfs_compile_state *cs = NULL;
2220
2221         if (!fp->translated) {
2222
2223                 init_program(r300, fp);
2224                 cs = fp->cs;
2225
2226                 if (parse_program(fp) == GL_FALSE) {
2227                         dump_program(fp);
2228                         return;
2229                 }
2230
2231                 /* Finish off */
2232                 fp->node[fp->cur_node].alu_end =
2233                     cs->nrslots - fp->node[fp->cur_node].alu_offset - 1;
2234                 if (fp->node[fp->cur_node].tex_end < 0)
2235                         fp->node[fp->cur_node].tex_end = 0;
2236                 fp->alu_offset = 0;
2237                 fp->alu_end = cs->nrslots - 1;
2238                 fp->tex_offset = 0;
2239                 fp->tex_end = fp->tex.length ? fp->tex.length - 1 : 0;
2240                 assert(fp->node[fp->cur_node].alu_end >= 0);
2241                 assert(fp->alu_end >= 0);
2242
2243                 fp->translated = GL_TRUE;
2244                 if (RADEON_DEBUG & DEBUG_PIXEL)
2245                         dump_program(fp);
2246                 r300UpdateStateParameters(fp->ctx, _NEW_PROGRAM);
2247         }
2248
2249         update_params(fp);
2250 }
2251
2252 /* just some random things... */
2253 static void dump_program(struct r300_fragment_program *fp)
2254 {
2255         int n, i, j;
2256         static int pc = 0;
2257
2258         fprintf(stderr, "pc=%d*************************************\n", pc++);
2259
2260         fprintf(stderr, "Mesa program:\n");
2261         fprintf(stderr, "-------------\n");
2262         _mesa_print_program(&fp->mesa_program.Base);
2263         fflush(stdout);
2264
2265         fprintf(stderr, "Hardware program\n");
2266         fprintf(stderr, "----------------\n");
2267
2268         for (n = 0; n < (fp->cur_node + 1); n++) {
2269                 fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "
2270                         "alu_end: %d, tex_end: %d\n", n,
2271                         fp->node[n].alu_offset,
2272                         fp->node[n].tex_offset,
2273                         fp->node[n].alu_end, fp->node[n].tex_end);
2274
2275                 if (fp->tex.length) {
2276                         fprintf(stderr, "  TEX:\n");
2277                         for (i = fp->node[n].tex_offset;
2278                              i <= fp->node[n].tex_offset + fp->node[n].tex_end;
2279                              ++i) {
2280                                 const char *instr;
2281
2282                                 switch ((fp->tex.
2283                                          inst[i] >> R300_FPITX_OPCODE_SHIFT) &
2284                                         15) {
2285                                 case R300_FPITX_OP_TEX:
2286                                         instr = "TEX";
2287                                         break;
2288                                 case R300_FPITX_OP_KIL:
2289                                         instr = "KIL";
2290                                         break;
2291                                 case R300_FPITX_OP_TXP:
2292                                         instr = "TXP";
2293                                         break;
2294                                 case R300_FPITX_OP_TXB:
2295                                         instr = "TXB";
2296                                         break;
2297                                 default:
2298                                         instr = "UNKNOWN";
2299                                 }
2300
2301                                 fprintf(stderr,
2302                                         "    %s t%i, %c%i, texture[%i]   (%08x)\n",
2303                                         instr,
2304                                         (fp->tex.
2305                                          inst[i] >> R300_FPITX_DST_SHIFT) & 31,
2306                                         (fp->tex.
2307                                          inst[i] & R300_FPITX_SRC_CONST) ? 'c' :
2308                                         't',
2309                                         (fp->tex.
2310                                          inst[i] >> R300_FPITX_SRC_SHIFT) & 31,
2311                                         (fp->tex.
2312                                          inst[i] & R300_FPITX_IMAGE_MASK) >>
2313                                         R300_FPITX_IMAGE_SHIFT,
2314                                         fp->tex.inst[i]);
2315                         }
2316                 }
2317
2318                 for (i = fp->node[n].alu_offset;
2319                      i <= fp->node[n].alu_offset + fp->node[n].alu_end; ++i) {
2320                         char srcc[3][10], dstc[20];
2321                         char srca[3][10], dsta[20];
2322                         char argc[3][20];
2323                         char arga[3][20];
2324                         char flags[5], tmp[10];
2325
2326                         for (j = 0; j < 3; ++j) {
2327                                 int regc = fp->alu.inst[i].inst1 >> (j * 6);
2328                                 int rega = fp->alu.inst[i].inst3 >> (j * 6);
2329
2330                                 sprintf(srcc[j], "%c%i",
2331                                         (regc & 32) ? 'c' : 't', regc & 31);
2332                                 sprintf(srca[j], "%c%i",
2333                                         (rega & 32) ? 'c' : 't', rega & 31);
2334                         }
2335
2336                         dstc[0] = 0;
2337                         sprintf(flags, "%s%s%s",
2338                                 (fp->alu.inst[i].
2339                                  inst1 & R300_FPI1_DSTC_REG_X) ? "x" : "",
2340                                 (fp->alu.inst[i].
2341                                  inst1 & R300_FPI1_DSTC_REG_Y) ? "y" : "",
2342                                 (fp->alu.inst[i].
2343                                  inst1 & R300_FPI1_DSTC_REG_Z) ? "z" : "");
2344                         if (flags[0] != 0) {
2345                                 sprintf(dstc, "t%i.%s ",
2346                                         (fp->alu.inst[i].
2347                                          inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
2348                                         flags);
2349                         }
2350                         sprintf(flags, "%s%s%s",
2351                                 (fp->alu.inst[i].
2352                                  inst1 & R300_FPI1_DSTC_OUTPUT_X) ? "x" : "",
2353                                 (fp->alu.inst[i].
2354                                  inst1 & R300_FPI1_DSTC_OUTPUT_Y) ? "y" : "",
2355                                 (fp->alu.inst[i].
2356                                  inst1 & R300_FPI1_DSTC_OUTPUT_Z) ? "z" : "");
2357                         if (flags[0] != 0) {
2358                                 sprintf(tmp, "o%i.%s",
2359                                         (fp->alu.inst[i].
2360                                          inst1 >> R300_FPI1_DSTC_SHIFT) & 31,
2361                                         flags);
2362                                 strcat(dstc, tmp);
2363                         }
2364
2365                         dsta[0] = 0;
2366                         if (fp->alu.inst[i].inst3 & R300_FPI3_DSTA_REG) {
2367                                 sprintf(dsta, "t%i.w ",
2368                                         (fp->alu.inst[i].
2369                                          inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
2370                         }
2371                         if (fp->alu.inst[i].inst3 & R300_FPI3_DSTA_OUTPUT) {
2372                                 sprintf(tmp, "o%i.w ",
2373                                         (fp->alu.inst[i].
2374                                          inst3 >> R300_FPI3_DSTA_SHIFT) & 31);
2375                                 strcat(dsta, tmp);
2376                         }
2377                         if (fp->alu.inst[i].inst3 & R300_FPI3_DSTA_DEPTH) {
2378                                 strcat(dsta, "Z");
2379                         }
2380
2381                         fprintf(stderr,
2382                                 "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
2383                                 "       w: %3s %3s %3s -> %-20s (%08x)\n", i,
2384                                 srcc[0], srcc[1], srcc[2], dstc,
2385                                 fp->alu.inst[i].inst1, srca[0], srca[1],
2386                                 srca[2], dsta, fp->alu.inst[i].inst3);
2387
2388                         for (j = 0; j < 3; ++j) {
2389                                 int regc = fp->alu.inst[i].inst0 >> (j * 7);
2390                                 int rega = fp->alu.inst[i].inst2 >> (j * 7);
2391                                 int d;
2392                                 char buf[20];
2393
2394                                 d = regc & 31;
2395                                 if (d < 12) {
2396                                         switch (d % 4) {
2397                                         case R300_FPI0_ARGC_SRC0C_XYZ:
2398                                                 sprintf(buf, "%s.xyz",
2399                                                         srcc[d / 4]);
2400                                                 break;
2401                                         case R300_FPI0_ARGC_SRC0C_XXX:
2402                                                 sprintf(buf, "%s.xxx",
2403                                                         srcc[d / 4]);
2404                                                 break;
2405                                         case R300_FPI0_ARGC_SRC0C_YYY:
2406                                                 sprintf(buf, "%s.yyy",
2407                                                         srcc[d / 4]);
2408                                                 break;
2409                                         case R300_FPI0_ARGC_SRC0C_ZZZ:
2410                                                 sprintf(buf, "%s.zzz",
2411                                                         srcc[d / 4]);
2412                                                 break;
2413                                         }
2414                                 } else if (d < 15) {
2415                                         sprintf(buf, "%s.www", srca[d - 12]);
2416                                 } else if (d == 20) {
2417                                         sprintf(buf, "0.0");
2418                                 } else if (d == 21) {
2419                                         sprintf(buf, "1.0");
2420                                 } else if (d == 22) {
2421                                         sprintf(buf, "0.5");
2422                                 } else if (d >= 23 && d < 32) {
2423                                         d -= 23;
2424                                         switch (d / 3) {
2425                                         case 0:
2426                                                 sprintf(buf, "%s.yzx",
2427                                                         srcc[d % 3]);
2428                                                 break;
2429                                         case 1:
2430                                                 sprintf(buf, "%s.zxy",
2431                                                         srcc[d % 3]);
2432                                                 break;
2433                                         case 2:
2434                                                 sprintf(buf, "%s.Wzy",
2435                                                         srcc[d % 3]);
2436                                                 break;
2437                                         }
2438                                 } else {
2439                                         sprintf(buf, "%i", d);
2440                                 }
2441
2442                                 sprintf(argc[j], "%s%s%s%s",
2443                                         (regc & 32) ? "-" : "",
2444                                         (regc & 64) ? "|" : "",
2445                                         buf, (regc & 64) ? "|" : "");
2446
2447                                 d = rega & 31;
2448                                 if (d < 9) {
2449                                         sprintf(buf, "%s.%c", srcc[d / 3],
2450                                                 'x' + (char)(d % 3));
2451                                 } else if (d < 12) {
2452                                         sprintf(buf, "%s.w", srca[d - 9]);
2453                                 } else if (d == 16) {
2454                                         sprintf(buf, "0.0");
2455                                 } else if (d == 17) {
2456                                         sprintf(buf, "1.0");
2457                                 } else if (d == 18) {
2458                                         sprintf(buf, "0.5");
2459                                 } else {
2460                                         sprintf(buf, "%i", d);
2461                                 }
2462
2463                                 sprintf(arga[j], "%s%s%s%s",
2464                                         (rega & 32) ? "-" : "",
2465                                         (rega & 64) ? "|" : "",
2466                                         buf, (rega & 64) ? "|" : "");
2467                         }
2468
2469                         fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
2470                                 "       w: %8s %8s %8s    op: %08x\n",
2471                                 argc[0], argc[1], argc[2],
2472                                 fp->alu.inst[i].inst0, arga[0], arga[1],
2473                                 arga[2], fp->alu.inst[i].inst2);
2474                 }
2475         }
2476 }