src/mesa/drivers/dri/r300/r300_fragprog.c

   1 /*
   2  * Copyright (C) 2005 Ben Skeggs.
   3  *
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining
   7  * a copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sublicense, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial
  16  * portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  */
  27
  28 /**
  29  * \file
  30  *
  31  * \author Ben Skeggs <darktama@iinet.net.au>
  32  *
  33  * \author Jerome Glisse <j.glisse@gmail.com>
  34  *
  35  * \todo Depth write, WPOS/FOGC inputs
  36  *
  37  * \todo FogOption
  38  *
  39  * \todo Verify results of opcodes for accuracy, I've only checked them in
  40  * specific cases.
  41  */
  42
  43 #include "glheader.h"
  44 #include "macros.h"
  45 #include "enums.h"
  46 #include "shader/prog_instruction.h"
  47 #include "shader/prog_parameter.h"
  48 #include "shader/prog_print.h"
  49
  50 #include "r300_context.h"
  51 #include "r300_fragprog.h"
  52 #include "r300_reg.h"
  53 #include "r300_state.h"
  54
  /**
   * Binding of one Mesa register (temporary or input) to an R300 hardware
   * temporary.
   */
  struct reg_acc {
          /* Index of the assigned hardware temp; -1 while none is assigned */
          int reg;

          /* Number of uses of this register by the Mesa program */
          unsigned int refcount;
  };
  60
  /**
   * Lifetime bookkeeping for a single R300 hardware temporary.
   *
   * All members are ALU slot indices.
   */
  struct reg_lifetime {
          /* First slot in which the register may be picked as a new
           * destination. Set to -1 once the register is bound to a Mesa
           * register whose last access has not been emitted yet. */
          int free;

          /* First slot in which the register is currently reserved. Keeps
           * e.g. a scalar operation from being scheduled before the point
           * where the register was first allocated for a vector operation. */
          int reserved;

          /* First slot in which the vector / scalar part may be read as a
           * source and still yield the value written by the most recently
           * emitted instruction that writes the register. */
          int vector_valid;
          int scalar_valid;

          /* Slot of the most recent read of the vector / scalar part; this
           * is also the earliest slot in which it may be written again. */
          int vector_lastread;
          int scalar_lastread;
  };
  88
  /*
   * SLOT_* bits: which parts of an ALU instruction slot are occupied.
   * SLOT_SRC_* describe the vector/scalar source selectors, SLOT_OP_* the
   * vector/scalar operations. They are stored in
   * r300_pfs_compile_slot::used and in the `flags` columns of the swizzle
   * tables.
   */
  #define SLOT_SRC_VECTOR  (1<<0)
  #define SLOT_SRC_SCALAR  (1<<3)
  #define SLOT_SRC_BOTH    (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
  #define SLOT_OP_VECTOR   (1<<16)
  #define SLOT_OP_SCALAR   (1<<17)
  #define SLOT_OP_BOTH     (SLOT_OP_VECTOR | SLOT_OP_SCALAR)

  /**
   * Usage information about one ALU instruction slot, kept while a
   * fragment program is being compiled.
   */
  struct r300_pfs_compile_slot {
          /* Bitmask of SLOT_* constants: the parts of the slot already used */
          unsigned int used;

          /* Sources selected for the vector (vsrc) and scalar (ssrc) halves */
          int vsrc[3];
          int ssrc[3];
  };
 109
 110 /**
 111  * Store information during compilation of fragment programs.
 112  */
 113 struct r300_pfs_compile_state {
 114         r300ContextPtr r300;
 115         struct r300_fragment_program *fp;
 116
 117         int nrslots;            /* number of ALU slots used so far */
 118
 119         /* Track which (parts of) slots are already filled with instructions */
 120         struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST];
 121
 122         /* Track the validity of R300 temporaries */
 123         struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
 124
 125         /* Used to map Mesa's inputs/temps onto hardware temps */
 126         int temp_in_use;
 127         struct reg_acc temps[PFS_NUM_TEMP_REGS];
 128         struct reg_acc inputs[32];      /* don't actually need 32... */
 129
 130         /* Track usage of hardware temps, for register allocation,
 131          * indirection detection, etc. */
 132         GLuint used_in_node;
 133         GLuint dest_in_node;
 134 };
 135
 136
 /*
  * Useful macros and values
  */

 /*
  * Report a compile failure: prints "<file>::<function>(): <message>" to
  * stderr and sets fp->error. A variable named `fp` must be in scope at the
  * call site (COMPILE_STATE declares one).
  */
 #define ERROR(fmt, ...) do {                                    \
                 fprintf(stderr, "%s::%s(): " fmt "\n",          \
                         __FILE__, __FUNCTION__, ##__VA_ARGS__); \
                 fp->error = GL_TRUE;                            \
         } while(0)

 /* Marker for "no valid value" in the tables of this file */
 #define PFS_INVAL 0xFFFFFFFF

 /*
  * Function prologue: derives the local shortcuts `fp` and `code` from a
  * compile state pointer named `cs`. The (void) cast silences the
  * unused-variable warning in functions that never touch `code`.
  */
 #define COMPILE_STATE                                           \
         struct r300_fragment_program *fp = cs->fp;              \
         struct r300_fragment_program_code *code = &fp->code;    \
         (void)code
 151
 /*
  * Vector swizzle selectors. Each value is the position of the matching
  * row in the v_swiz[] table.
  */
 #define SWIZZLE_XYZ             0
 #define SWIZZLE_XXX             1
 #define SWIZZLE_YYY             2
 #define SWIZZLE_ZZZ             3
 #define SWIZZLE_WWW             4
 #define SWIZZLE_YZX             5
 #define SWIZZLE_ZXY             6
 #define SWIZZLE_WZY             7
 #define SWIZZLE_111             8
 #define SWIZZLE_000             9
 #define SWIZZLE_HHH             10

 /*
  * Shorthand for do_swizzle() without negation. x/y/z/w are pasted onto
  * SWIZZLE_ and packed three bits per component, so each must name a
  * per-component selector that fits in three bits. A variable named `cs`
  * must be in scope at the call site.
  */
 #define swizzle(r, x, y, z, w)                          \
         do_swizzle(cs, r,                               \
                    ((SWIZZLE_##x<<0)|                   \
                     (SWIZZLE_##y<<3)|                   \
                     (SWIZZLE_##z<<6)|                   \
                     (SWIZZLE_##w<<9)),                  \
                    0)
 170
 /*
  * Packed register descriptor.
  *
  * A register reference is carried around as a single GLuint with these
  * bit fields:
  *
  *   bits  0-1   register type (REG_TYPE_*)
  *   bits  2-7   register index
  *   bits  8-12  vector swizzle
  *   bits 13-17  scalar swizzle
  *   bit  18     negate vector part
  *   bit  19     negate scalar part
  *   bit  20     absolute value
  *   bit  21     "no use" (refcounting hack)
  *   bit  22     register contains a defined value
  *   bit  23     builtin (like all zero/all one)
  *
  * Every macro parameter is parenthesized so that compound arguments such
  * as `a | b` are packed/extracted as a whole instead of being regrouped
  * by operator precedence.
  */
 #define REG_TYPE_INPUT          0
 #define REG_TYPE_OUTPUT         1
 #define REG_TYPE_TEMP           2
 #define REG_TYPE_CONST          3

 #define REG_TYPE_SHIFT          0
 #define REG_INDEX_SHIFT         2
 #define REG_VSWZ_SHIFT          8
 #define REG_SSWZ_SHIFT          13
 #define REG_NEGV_SHIFT          18
 #define REG_NEGS_SHIFT          19
 #define REG_ABS_SHIFT           20
 #define REG_NO_USE_SHIFT        21      /* Hack for refcounting */
 #define REG_VALID_SHIFT         22      /* Does the register contain a defined value? */
 #define REG_BUILTIN_SHIFT       23      /* Is it a builtin (like all zero/all one)? */

 #define REG_TYPE_MASK           (0x03 << REG_TYPE_SHIFT)
 #define REG_INDEX_MASK          (0x3F << REG_INDEX_SHIFT)
 #define REG_VSWZ_MASK           (0x1F << REG_VSWZ_SHIFT)
 #define REG_SSWZ_MASK           (0x1F << REG_SSWZ_SHIFT)
 #define REG_NEGV_MASK           (0x01 << REG_NEGV_SHIFT)
 #define REG_NEGS_MASK           (0x01 << REG_NEGS_SHIFT)
 #define REG_ABS_MASK            (0x01 << REG_ABS_SHIFT)
 #define REG_NO_USE_MASK         (0x01 << REG_NO_USE_SHIFT)
 #define REG_VALID_MASK          (0x01 << REG_VALID_SHIFT)
 #define REG_BUILTIN_MASK        (0x01 << REG_BUILTIN_SHIFT)

 /* Build a descriptor from its fields; negate/abs bits start out clear. */
 #define REG(type, index, vswz, sswz, nouse, valid, builtin)             \
         ((((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK) |                 \
          (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK) |              \
          (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |            \
          (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK) |              \
          (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |        \
          (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |                 \
          (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))

 /* Field extraction: each yields the field value shifted down to bit 0. */
 #define REG_GET_TYPE(reg)                                               \
         (((reg) & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
 #define REG_GET_INDEX(reg)                                              \
         (((reg) & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
 #define REG_GET_VSWZ(reg)                                               \
         (((reg) & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
 #define REG_GET_SSWZ(reg)                                               \
         (((reg) & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
 #define REG_GET_NO_USE(reg)                                             \
         (((reg) & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
 #define REG_GET_VALID(reg)                                              \
         (((reg) & REG_VALID_MASK) >> REG_VALID_SHIFT)
 #define REG_GET_BUILTIN(reg)                                            \
         (((reg) & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)

 /* Field update: each assigns to `reg`, which must be a modifiable lvalue. */
 #define REG_SET_TYPE(reg, type)                                         \
         ((reg) = (((reg) & ~REG_TYPE_MASK) |                            \
                   (((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK)))
 #define REG_SET_INDEX(reg, index)                                       \
         ((reg) = (((reg) & ~REG_INDEX_MASK) |                           \
                   (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK)))
 #define REG_SET_VSWZ(reg, vswz)                                         \
         ((reg) = (((reg) & ~REG_VSWZ_MASK) |                            \
                   (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK)))
 #define REG_SET_SSWZ(reg, sswz)                                         \
         ((reg) = (((reg) & ~REG_SSWZ_MASK) |                            \
                   (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK)))
 #define REG_SET_NO_USE(reg, nouse)                                      \
         ((reg) = (((reg) & ~REG_NO_USE_MASK) |                          \
                   (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK)))
 #define REG_SET_VALID(reg, valid)                                       \
         ((reg) = (((reg) & ~REG_VALID_MASK) |                           \
                   (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK)))
 #define REG_SET_BUILTIN(reg, builtin)                                   \
         ((reg) = (((reg) & ~REG_BUILTIN_MASK) |                         \
                   (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK)))

 /* Modifier flags: each sets one bit in `reg` (a modifiable lvalue). */
 #define REG_ABS(reg)                                                    \
         ((reg) = ((reg) | REG_ABS_MASK))
 #define REG_NEGV(reg)                                                   \
         ((reg) = ((reg) | REG_NEGV_MASK))
 #define REG_NEGS(reg)                                                   \
         ((reg) = ((reg) | REG_NEGS_MASK))
 247
 /*
  * The four instruction words of a "do nothing" ALU instruction: a MAD
  * whose three arguments are all ZERO, in both the colour (C) and alpha (A)
  * halves, with every source field pointing at constant 0.
  */

 /* colour opcode/arguments */
 #define NOP_INST0                                               \
         ((R300_ALU_OUTC_MAD) |                                  \
          (R300_ALU_ARGC_ZERO << R300_ALU_ARG0C_SHIFT) |         \
          (R300_ALU_ARGC_ZERO << R300_ALU_ARG1C_SHIFT) |         \
          (R300_ALU_ARGC_ZERO << R300_ALU_ARG2C_SHIFT))

 /* colour source selection */
 #define NOP_INST1                                               \
         (((0 | SRC_CONST) << R300_ALU_SRC0C_SHIFT) |            \
          ((0 | SRC_CONST) << R300_ALU_SRC1C_SHIFT) |            \
          ((0 | SRC_CONST) << R300_ALU_SRC2C_SHIFT))

 /* alpha opcode/arguments */
 #define NOP_INST2                                               \
         ((R300_ALU_OUTA_MAD) |                                  \
          (R300_ALU_ARGA_ZERO << R300_ALU_ARG0A_SHIFT) |         \
          (R300_ALU_ARGA_ZERO << R300_ALU_ARG1A_SHIFT) |         \
          (R300_ALU_ARGA_ZERO << R300_ALU_ARG2A_SHIFT))

 /* alpha source selection */
 #define NOP_INST3                                               \
         (((0 | SRC_CONST) << R300_ALU_SRC0A_SHIFT) |            \
          ((0 | SRC_CONST) << R300_ALU_SRC1A_SHIFT) |            \
          ((0 | SRC_CONST) << R300_ALU_SRC2A_SHIFT))
 266
 267
 268 /*
 269  * Datas structures for fragment program generation
 270  */
 271
 272 /* description of r300 native hw instructions */
 273 static const struct {
 274         const char *name;
 275         int argc;
 276         int v_op;
 277         int s_op;
 278 } r300_fpop[] = {
 279         /* *INDENT-OFF* */
 280         {"MAD", 3, R300_ALU_OUTC_MAD, R300_ALU_OUTA_MAD},
 281         {"DP3", 2, R300_ALU_OUTC_DP3, R300_ALU_OUTA_DP4},
 282         {"DP4", 2, R300_ALU_OUTC_DP4, R300_ALU_OUTA_DP4},
 283         {"MIN", 2, R300_ALU_OUTC_MIN, R300_ALU_OUTA_MIN},
 284         {"MAX", 2, R300_ALU_OUTC_MAX, R300_ALU_OUTA_MAX},
 285         {"CMP", 3, R300_ALU_OUTC_CMP, R300_ALU_OUTA_CMP},
 286         {"FRC", 1, R300_ALU_OUTC_FRC, R300_ALU_OUTA_FRC},
 287         {"EX2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_EX2},
 288         {"LG2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_LG2},
 289         {"RCP", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RCP},
 290         {"RSQ", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RSQ},
 291         {"REPL_ALPHA", 1, R300_ALU_OUTC_REPL_ALPHA, PFS_INVAL},
 292         {"CMPH", 3, R300_ALU_OUTC_CMPH, PFS_INVAL},
 293         /* *INDENT-ON* */
 294 };
 295
 296 /* vector swizzles r300 can support natively, with a couple of
 297  * cases we handle specially
 298  *
 299  * REG_VSWZ/REG_SSWZ is an index into this table
 300  */
 301
 302 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
 303 #define SWIZZLE_HALF 6
 304
 305 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
 306                                           SWIZZLE_##y, \
 307                                           SWIZZLE_##z, \
 308                                           SWIZZLE_ZERO))
 309 /* native swizzles */
 310 static const struct r300_pfs_swizzle {
 311         GLuint hash;            /* swizzle value this matches */
 312         GLuint base;            /* base value for hw swizzle */
 313         GLuint stride;          /* difference in base between arg0/1/2 */
 314         GLuint flags;
 315 } v_swiz[] = {
 316         /* *INDENT-OFF* */
 317         {MAKE_SWZ3(X, Y, Z), R300_ALU_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
 318         {MAKE_SWZ3(X, X, X), R300_ALU_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
 319         {MAKE_SWZ3(Y, Y, Y), R300_ALU_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
 320         {MAKE_SWZ3(Z, Z, Z), R300_ALU_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
 321         {MAKE_SWZ3(W, W, W), R300_ALU_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
 322         {MAKE_SWZ3(Y, Z, X), R300_ALU_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
 323         {MAKE_SWZ3(Z, X, Y), R300_ALU_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
 324         {MAKE_SWZ3(W, Z, Y), R300_ALU_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
 325         {MAKE_SWZ3(ONE, ONE, ONE), R300_ALU_ARGC_ONE, 0, 0},
 326         {MAKE_SWZ3(ZERO, ZERO, ZERO), R300_ALU_ARGC_ZERO, 0, 0},
 327         {MAKE_SWZ3(HALF, HALF, HALF), R300_ALU_ARGC_HALF, 0, 0},
 328         {PFS_INVAL, 0, 0, 0},
 329         /* *INDENT-ON* */
 330 };
 331
 332 /* used during matching of non-native swizzles */
 333 #define SWZ_X_MASK (7 << 0)
 334 #define SWZ_Y_MASK (7 << 3)
 335 #define SWZ_Z_MASK (7 << 6)
 336 #define SWZ_W_MASK (7 << 9)
 337 static const struct {
 338         GLuint hash;            /* used to mask matching swizzle components */
 339         int mask;               /* actual outmask */
 340         int count;              /* count of components matched */
 341 } s_mask[] = {
 342         /* *INDENT-OFF* */
 343         {SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},
 344         {SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
 345         {SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
 346         {SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
 347         {SWZ_X_MASK, 1, 1},
 348         {SWZ_Y_MASK, 2, 1},
 349         {SWZ_Z_MASK, 4, 1},
 350         {PFS_INVAL, PFS_INVAL, PFS_INVAL}
 351         /* *INDENT-ON* */
 352 };
 353
 354 static const struct {
 355         int base;               /* hw value of swizzle */
 356         int stride;             /* difference between SRC0/1/2 */
 357         GLuint flags;
 358 } s_swiz[] = {
 359         /* *INDENT-OFF* */
 360         {R300_ALU_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
 361         {R300_ALU_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
 362         {R300_ALU_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
 363         {R300_ALU_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
 364         {R300_ALU_ARGA_ZERO, 0, 0},
 365         {R300_ALU_ARGA_ONE, 0, 0},
 366         {R300_ALU_ARGA_HALF, 0, 0}
 367         /* *INDENT-ON* */
 368 };
 369
 370 /* boiler-plate reg, for convenience */
 371 static const GLuint undef = REG(REG_TYPE_TEMP,
 372                                 0,
 373                                 SWIZZLE_XYZ,
 374                                 SWIZZLE_W,
 375                                 GL_FALSE,
 376                                 GL_FALSE,
 377                                 GL_FALSE);
 378
 379 /* constant one source */
 380 static const GLuint pfs_one = REG(REG_TYPE_CONST,
 381                                   0,
 382                                   SWIZZLE_111,
 383                                   SWIZZLE_ONE,
 384                                   GL_FALSE,
 385                                   GL_TRUE,
 386                                   GL_TRUE);
 387
 388 /* constant half source */
 389 static const GLuint pfs_half = REG(REG_TYPE_CONST,
 390                                    0,
 391                                    SWIZZLE_HHH,
 392                                    SWIZZLE_HALF,
 393                                    GL_FALSE,
 394                                    GL_TRUE,
 395                                    GL_TRUE);
 396
 397 /* constant zero source */
 398 static const GLuint pfs_zero = REG(REG_TYPE_CONST,
 399                                    0,
 400                                    SWIZZLE_000,
 401                                    SWIZZLE_ZERO,
 402                                    GL_FALSE,
 403                                    GL_TRUE,
 404                                    GL_TRUE);
 405
 406 /*
 407  * Common functions prototypes
 408  */
 409 static void dump_program(struct r300_fragment_program *fp,
 410                          struct r300_fragment_program_code *code);
 411 static void emit_arith(struct r300_pfs_compile_state *cs, int op,
 412                        GLuint dest, int mask,
 413                        GLuint src0, GLuint src1, GLuint src2, int flags);
 414
 415 /**
 416  * Get an R300 temporary that can be written to in the given slot.
 417  */
 418 static int get_hw_temp(struct r300_pfs_compile_state *cs, int slot)
 419 {
 420         COMPILE_STATE;
 421         int r;
 422
 423         for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
 424                 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
 425                         break;
 426         }
 427
 428         if (r >= PFS_NUM_TEMP_REGS) {
 429                 ERROR("Out of hardware temps\n");
 430                 return 0;
 431         }
 432         // Reserved is used to avoid the following scenario:
 433         //  R300 temporary X is first assigned to Mesa temporary Y during vector ops
 434         //  R300 temporary X is then assigned to Mesa temporary Z for further vector ops
 435         //  Then scalar ops on Mesa temporary Z are emitted and move back in time
 436         //  to overwrite the value of temporary Y.
 437         // End scenario.
 438         cs->hwtemps[r].reserved = cs->hwtemps[r].free;
 439         cs->hwtemps[r].free = -1;
 440
 441         // Reset to some value that won't mess things up when the user
 442         // tries to read from a temporary that hasn't been assigned a value yet.
 443         // In the normal case, vector_valid and scalar_valid should be set to
 444         // a sane value by the first emit that writes to this temporary.
 445         cs->hwtemps[r].vector_valid = 0;
 446         cs->hwtemps[r].scalar_valid = 0;
 447
 448         if (r > fp->code.max_temp_idx)
 449                 fp->code.max_temp_idx = r;
 450
 451         return r;
 452 }
 453
 454 /**
 455  * Get an R300 temporary that will act as a TEX destination register.
 456  */
 457 static int get_hw_temp_tex(struct r300_pfs_compile_state *cs)
 458 {
 459         COMPILE_STATE;
 460         int r;
 461
 462         for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
 463                 if (cs->used_in_node & (1 << r))
 464                         continue;
 465
 466                 // Note: Be very careful here
 467                 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
 468                         break;
 469         }
 470
 471         if (r >= PFS_NUM_TEMP_REGS)
 472                 return get_hw_temp(cs, 0);      /* Will cause an indirection */
 473
 474         cs->hwtemps[r].reserved = cs->hwtemps[r].free;
 475         cs->hwtemps[r].free = -1;
 476
 477         // Reset to some value that won't mess things up when the user
 478         // tries to read from a temporary that hasn't been assigned a value yet.
 479         // In the normal case, vector_valid and scalar_valid should be set to
 480         // a sane value by the first emit that writes to this temporary.
 481         cs->hwtemps[r].vector_valid = cs->nrslots;
 482         cs->hwtemps[r].scalar_valid = cs->nrslots;
 483
 484         if (r > code->max_temp_idx)
 485                 code->max_temp_idx = r;
 486
 487         return r;
 488 }
 489
 490 /**
 491  * Mark the given hardware register as free.
 492  */
 493 static void free_hw_temp(struct r300_pfs_compile_state *cs, int idx)
 494 {
 495         // Be very careful here. Consider sequences like
 496         //  MAD r0, r1,r2,r3
 497         //  TEX r4, ...
 498         // The TEX instruction may be moved in front of the MAD instruction
 499         // due to the way nodes work. We don't want to alias r1 and r4 in
 500         // this case.
 501         // I'm certain the register allocation could be further sanitized,
 502         // but it's tricky because of stuff that can happen inside emit_tex
 503         // and emit_arith.
 504         cs->hwtemps[idx].free = cs->nrslots + 1;
 505 }
 506
 507 /**
 508  * Create a new Mesa temporary register.
 509  */
 510 static GLuint get_temp_reg(struct r300_pfs_compile_state *cs)
 511 {
 512         COMPILE_STATE;
 513         GLuint r = undef;
 514         GLuint index;
 515
 516         index = ffs(~cs->temp_in_use);
 517         if (!index) {
 518                 ERROR("Out of program temps\n");
 519                 return r;
 520         }
 521
 522         cs->temp_in_use |= (1 << --index);
 523         cs->temps[index].refcount = 0xFFFFFFFF;
 524         cs->temps[index].reg = -1;
 525
 526         REG_SET_TYPE(r, REG_TYPE_TEMP);
 527         REG_SET_INDEX(r, index);
 528         REG_SET_VALID(r, GL_TRUE);
 529         return r;
 530 }
 531
 532 /**
 533  * Create a new Mesa temporary register that will act as the destination
 534  * register for a texture read.
 535  */
 536 static GLuint get_temp_reg_tex(struct r300_pfs_compile_state *cs)
 537 {
 538         COMPILE_STATE;
 539         GLuint r = undef;
 540         GLuint index;
 541
 542         index = ffs(~cs->temp_in_use);
 543         if (!index) {
 544                 ERROR("Out of program temps\n");
 545                 return r;
 546         }
 547
 548         cs->temp_in_use |= (1 << --index);
 549         cs->temps[index].refcount = 0xFFFFFFFF;
 550         cs->temps[index].reg = get_hw_temp_tex(cs);
 551
 552         REG_SET_TYPE(r, REG_TYPE_TEMP);
 553         REG_SET_INDEX(r, index);
 554         REG_SET_VALID(r, GL_TRUE);
 555         return r;
 556 }
 557
 558 /**
 559  * Free a Mesa temporary and the associated R300 temporary.
 560  */
 561 static void free_temp(struct r300_pfs_compile_state *cs, GLuint r)
 562 {
 563         GLuint index = REG_GET_INDEX(r);
 564
 565         if (!(cs->temp_in_use & (1 << index)))
 566                 return;
 567
 568         if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
 569                 free_hw_temp(cs, cs->temps[index].reg);
 570                 cs->temps[index].reg = -1;
 571                 cs->temp_in_use &= ~(1 << index);
 572         } else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
 573                 free_hw_temp(cs, cs->inputs[index].reg);
 574                 cs->inputs[index].reg = -1;
 575         }
 576 }
 577
 578 /**
 579  * Emit a hardware constant/parameter.
 580  *
 581  * \p cp Stable pointer to an array of 4 floats.
 582  *  The pointer must be stable in the sense that it remains to be valid
 583  *  and hold the contents of the constant/parameter throughout the lifetime
 584  *  of the fragment program (actually, up until the next time the fragment
 585  *  program is translated).
 586  */
 587 static GLuint emit_const4fv(struct r300_pfs_compile_state *cs,
 588                             const GLfloat * cp)
 589 {
 590         COMPILE_STATE;
 591         GLuint reg = undef;
 592         int index;
 593
 594         for (index = 0; index < code->const_nr; ++index) {
 595                 if (code->constant[index] == cp)
 596                         break;
 597         }
 598
 599         if (index >= code->const_nr) {
 600                 if (index >= PFS_NUM_CONST_REGS) {
 601                         ERROR("Out of hw constants!\n");
 602                         return reg;
 603                 }
 604
 605                 code->const_nr++;
 606                 code->constant[index] = cp;
 607         }
 608
 609         REG_SET_TYPE(reg, REG_TYPE_CONST);
 610         REG_SET_INDEX(reg, index);
 611         REG_SET_VALID(reg, GL_TRUE);
 612         return reg;
 613 }
 614
 615 static inline GLuint negate(GLuint r)
 616 {
 617         REG_NEGS(r);
 618         REG_NEGV(r);
 619         return r;
 620 }
 621
 622 /* Hack, to prevent clobbering sources used multiple times when
 623  * emulating non-native instructions
 624  */
 625 static inline GLuint keep(GLuint r)
 626 {
 627         REG_SET_NO_USE(r, GL_TRUE);
 628         return r;
 629 }
 630
 631 static inline GLuint absolute(GLuint r)
 632 {
 633         REG_ABS(r);
 634         return r;
 635 }
 636
 637 static int swz_native(struct r300_pfs_compile_state *cs,
 638                       GLuint src, GLuint * r, GLuint arbneg)
 639 {
 640         COMPILE_STATE;
 641
 642         /* Native swizzle, handle negation */
 643         src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
 644
 645         if ((arbneg & 0x7) == 0x0) {
 646                 src = src & ~REG_NEGV_MASK;
 647                 *r = src;
 648         } else if ((arbneg & 0x7) == 0x7) {
 649                 src |= REG_NEGV_MASK;
 650                 *r = src;
 651         } else {
 652                 if (!REG_GET_VALID(*r))
 653                         *r = get_temp_reg(cs);
 654                 src |= REG_NEGV_MASK;
 655                 emit_arith(cs,
 656                            PFS_OP_MAD,
 657                            *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
 658                 src = src & ~REG_NEGV_MASK;
 659                 emit_arith(cs,
 660                            PFS_OP_MAD,
 661                            *r,
 662                            (arbneg ^ 0x7) | WRITEMASK_W,
 663                            src, pfs_one, pfs_zero, 0);
 664         }
 665
 666         return 3;
 667 }
 668
 669 static int swz_emit_partial(struct r300_pfs_compile_state *cs,
 670                             GLuint src,
 671                             GLuint * r, int mask, int mc, GLuint arbneg)
 672 {
 673         COMPILE_STATE;
 674         GLuint tmp;
 675         GLuint wmask = 0;
 676
 677         if (!REG_GET_VALID(*r))
 678                 *r = get_temp_reg(cs);
 679
 680         /* A partial match, VSWZ/mask define what parts of the
 681          * desired swizzle we match
 682          */
 683         if (mc + s_mask[mask].count == 3) {
 684                 wmask = WRITEMASK_W;
 685                 src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
 686         }
 687
 688         tmp = arbneg & s_mask[mask].mask;
 689         if (tmp) {
 690                 tmp = tmp ^ s_mask[mask].mask;
 691                 if (tmp) {
 692                         emit_arith(cs,
 693                                    PFS_OP_MAD,
 694                                    *r,
 695                                    arbneg & s_mask[mask].mask,
 696                                    keep(src) | REG_NEGV_MASK,
 697                                    pfs_one, pfs_zero, 0);
 698                         if (!wmask) {
 699                                 REG_SET_NO_USE(src, GL_TRUE);
 700                         } else {
 701                                 REG_SET_NO_USE(src, GL_FALSE);
 702                         }
 703                         emit_arith(cs,
 704                                    PFS_OP_MAD,
 705                                    *r, tmp | wmask, src, pfs_one, pfs_zero, 0);
 706                 } else {
 707                         if (!wmask) {
 708                                 REG_SET_NO_USE(src, GL_TRUE);
 709                         } else {
 710                                 REG_SET_NO_USE(src, GL_FALSE);
 711                         }
 712                         emit_arith(cs,
 713                                    PFS_OP_MAD,
 714                                    *r,
 715                                    (arbneg & s_mask[mask].mask) | wmask,
 716                                    src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
 717                 }
 718         } else {
 719                 if (!wmask) {
 720                         REG_SET_NO_USE(src, GL_TRUE);
 721                 } else {
 722                         REG_SET_NO_USE(src, GL_FALSE);
 723                 }
 724                 emit_arith(cs, PFS_OP_MAD,
 725                            *r,
 726                            s_mask[mask].mask | wmask,
 727                            src, pfs_one, pfs_zero, 0);
 728         }
 729
 730         return s_mask[mask].count;
 731 }
 732
 733 static GLuint do_swizzle(struct r300_pfs_compile_state *cs,
 734                          GLuint src, GLuint arbswz, GLuint arbneg)
 735 {
 736         COMPILE_STATE;
 737         GLuint r = undef;
 738         GLuint vswz;
 739         int c_mask = 0;
 740         int v_match = 0;
 741
 742         /* If swizzling from something without an XYZW native swizzle,
 743          * emit result to a temp, and do new swizzle from the temp.
 744          */
 745 #if 0
 746         if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
 747                 GLuint temp = get_temp_reg(fp);
 748                 emit_arith(fp,
 749                            PFS_OP_MAD,
 750                            temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
 751                 src = temp;
 752         }
 753 #endif
 754
 755         if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
 756                 GLuint vsrcswz =
 757                     (v_swiz[REG_GET_VSWZ(src)].
 758                      hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
 759                     REG_GET_SSWZ(src) << 9;
 760                 GLint i;
 761
 762                 GLuint newswz = 0;
 763                 GLuint offset;
 764                 for (i = 0; i < 4; ++i) {
 765                         offset = GET_SWZ(arbswz, i);
 766
 767                         newswz |=
 768                             (offset <= 3) ? GET_SWZ(vsrcswz,
 769                                                     offset) << i *
 770                             3 : offset << i * 3;
 771                 }
 772
 773                 arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
 774                 REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
 775         } else {
 776                 /* set scalar swizzling */
 777                 REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
 778
 779         }
 780         do {
 781                 vswz = REG_GET_VSWZ(src);
 782                 do {
 783                         int chash;
 784
 785                         REG_SET_VSWZ(src, vswz);
 786                         chash = v_swiz[REG_GET_VSWZ(src)].hash &
 787                             s_mask[c_mask].hash;
 788
 789                         if (chash == (arbswz & s_mask[c_mask].hash)) {
 790                                 if (s_mask[c_mask].count == 3) {
 791                                         v_match += swz_native(cs,
 792                                                               src, &r, arbneg);
 793                                 } else {
 794                                         v_match += swz_emit_partial(cs,
 795                                                                     src,
 796                                                                     &r,
 797                                                                     c_mask,
 798                                                                     v_match,
 799                                                                     arbneg);
 800                                 }
 801
 802                                 if (v_match == 3)
 803                                         return r;
 804
 805                                 /* Fill with something invalid.. all 0's was
 806                                  * wrong before, matched SWIZZLE_X.  So all
 807                                  * 1's will be okay for now
 808                                  */
 809                                 arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
 810                         }
 811                 } while (v_swiz[++vswz].hash != PFS_INVAL);
 812                 REG_SET_VSWZ(src, SWIZZLE_XYZ);
 813         } while (s_mask[++c_mask].hash != PFS_INVAL);
 814
 815         ERROR("should NEVER get here\n");
 816         return r;
 817 }
 818
 819 static GLuint t_src(struct r300_pfs_compile_state *cs,
 820                     struct prog_src_register fpsrc)
 821 {
 822         COMPILE_STATE;
 823         GLuint r = undef;
 824
 825         switch (fpsrc.File) {
 826         case PROGRAM_TEMPORARY:
 827                 REG_SET_INDEX(r, fpsrc.Index);
 828                 REG_SET_VALID(r, GL_TRUE);
 829                 REG_SET_TYPE(r, REG_TYPE_TEMP);
 830                 break;
 831         case PROGRAM_INPUT:
 832                 REG_SET_INDEX(r, fpsrc.Index);
 833                 REG_SET_VALID(r, GL_TRUE);
 834                 REG_SET_TYPE(r, REG_TYPE_INPUT);
 835                 break;
 836         case PROGRAM_LOCAL_PARAM:
 837                 r = emit_const4fv(cs,
 838                                   fp->mesa_program.Base.LocalParams[fpsrc.
 839                                                                     Index]);
 840                 break;
 841         case PROGRAM_ENV_PARAM:
 842                 r = emit_const4fv(cs,
 843                         cs->r300->radeon.glCtx->FragmentProgram.Parameters[fpsrc.Index]);
 844                 break;
 845         case PROGRAM_STATE_VAR:
 846         case PROGRAM_NAMED_PARAM:
 847         case PROGRAM_CONSTANT:
 848                 r = emit_const4fv(cs,
 849                                   fp->mesa_program.Base.Parameters->
 850                                   ParameterValues[fpsrc.Index]);
 851                 break;
 852         default:
 853                 ERROR("unknown SrcReg->File %x\n", fpsrc.File);
 854                 return r;
 855         }
 856
 857         /* no point swizzling ONE/ZERO/HALF constants... */
 858         if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
 859                 r = do_swizzle(cs, r, fpsrc.Swizzle, fpsrc.NegateBase);
 860         return r;
 861 }
 862
 863 static GLuint t_scalar_src(struct r300_pfs_compile_state *cs,
 864                            struct prog_src_register fpsrc)
 865 {
 866         struct prog_src_register src = fpsrc;
 867         int sc = GET_SWZ(fpsrc.Swizzle, 0);     /* X */
 868
 869         src.Swizzle = ((sc << 0) | (sc << 3) | (sc << 6) | (sc << 9));
 870
 871         return t_src(cs, src);
 872 }
 873
 874 static GLuint t_dst(struct r300_pfs_compile_state *cs,
 875                     struct prog_dst_register dest)
 876 {
 877         COMPILE_STATE;
 878         GLuint r = undef;
 879
 880         switch (dest.File) {
 881         case PROGRAM_TEMPORARY:
 882                 REG_SET_INDEX(r, dest.Index);
 883                 REG_SET_VALID(r, GL_TRUE);
 884                 REG_SET_TYPE(r, REG_TYPE_TEMP);
 885                 return r;
 886         case PROGRAM_OUTPUT:
 887                 REG_SET_TYPE(r, REG_TYPE_OUTPUT);
 888                 switch (dest.Index) {
 889                 case FRAG_RESULT_COLR:
 890                 case FRAG_RESULT_DEPR:
 891                         REG_SET_INDEX(r, dest.Index);
 892                         REG_SET_VALID(r, GL_TRUE);
 893                         return r;
 894                 default:
 895                         ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
 896                         return r;
 897                 }
 898         default:
 899                 ERROR("Bad DstReg->File 0x%x\n", dest.File);
 900                 return r;
 901         }
 902 }
 903
 904 static int t_hw_src(struct r300_pfs_compile_state *cs, GLuint src, GLboolean tex)
 905 {
 906         COMPILE_STATE;
 907         int idx;
 908         int index = REG_GET_INDEX(src);
 909
 910         switch (REG_GET_TYPE(src)) {
 911         case REG_TYPE_TEMP:
 912                 /* NOTE: if reg==-1 here, a source is being read that
 913                  *       hasn't been written to. Undefined results.
 914                  */
 915                 if (cs->temps[index].reg == -1)
 916                         cs->temps[index].reg = get_hw_temp(cs, cs->nrslots);
 917
 918                 idx = cs->temps[index].reg;
 919
 920                 if (!REG_GET_NO_USE(src) && (--cs->temps[index].refcount == 0))
 921                         free_temp(cs, src);
 922                 break;
 923         case REG_TYPE_INPUT:
 924                 idx = cs->inputs[index].reg;
 925
 926                 if (!REG_GET_NO_USE(src) && (--cs->inputs[index].refcount == 0))
 927                         free_hw_temp(cs, cs->inputs[index].reg);
 928                 break;
 929         case REG_TYPE_CONST:
 930                 return (index | SRC_CONST);
 931         default:
 932                 ERROR("Invalid type for source reg\n");
 933                 return (0 | SRC_CONST);
 934         }
 935
 936         if (!tex)
 937                 cs->used_in_node |= (1 << idx);
 938
 939         return idx;
 940 }
 941
 942 static int t_hw_dst(struct r300_pfs_compile_state *cs,
 943                     GLuint dest, GLboolean tex, int slot)
 944 {
 945         COMPILE_STATE;
 946         int idx;
 947         GLuint index = REG_GET_INDEX(dest);
 948         assert(REG_GET_VALID(dest));
 949
 950         switch (REG_GET_TYPE(dest)) {
 951         case REG_TYPE_TEMP:
 952                 if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
 953                         if (!tex) {
 954                                 cs->temps[index].reg = get_hw_temp(cs, slot);
 955                         } else {
 956                                 cs->temps[index].reg = get_hw_temp_tex(cs);
 957                         }
 958                 }
 959                 idx = cs->temps[index].reg;
 960
 961                 if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0))
 962                         free_temp(cs, dest);
 963
 964                 cs->dest_in_node |= (1 << idx);
 965                 cs->used_in_node |= (1 << idx);
 966                 break;
 967         case REG_TYPE_OUTPUT:
 968                 switch (index) {
 969                 case FRAG_RESULT_COLR:
 970                         code->node[code->cur_node].flags |= R300_RGBA_OUT;
 971                         break;
 972                 case FRAG_RESULT_DEPR:
 973                         fp->WritesDepth = GL_TRUE;
 974                         code->node[code->cur_node].flags |= R300_W_OUT;
 975                         break;
 976                 }
 977                 return index;
 978                 break;
 979         default:
 980                 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
 981                 return 0;
 982         }
 983
 984         return idx;
 985 }
 986
 987 static void emit_nop(struct r300_pfs_compile_state *cs)
 988 {
 989         COMPILE_STATE;
 990
 991         if (cs->nrslots >= PFS_MAX_ALU_INST) {
 992                 ERROR("Out of ALU instruction slots\n");
 993                 return;
 994         }
 995
 996         code->alu.inst[cs->nrslots].inst0 = NOP_INST0;
 997         code->alu.inst[cs->nrslots].inst1 = NOP_INST1;
 998         code->alu.inst[cs->nrslots].inst2 = NOP_INST2;
 999         code->alu.inst[cs->nrslots].inst3 = NOP_INST3;
1000         cs->nrslots++;
1001 }
1002
1003 static void emit_tex(struct r300_pfs_compile_state *cs,
1004                      struct prog_instruction *fpi, int opcode)
1005 {
1006         COMPILE_STATE;
1007         GLuint coord = t_src(cs, fpi->SrcReg[0]);
1008         GLuint dest = undef, rdest = undef;
1009         GLuint din, uin;
1010         int unit = fpi->TexSrcUnit;
1011         int hwsrc, hwdest;
1012         GLuint tempreg = 0;
1013
1014         /**
1015          * Hardware uses [0..1]x[0..1] range for rectangle textures
1016          * instead of [0..Width]x[0..Height].
1017          * Add a scaling instruction.
1018          *
1019          * \todo Refactor this once we have proper rewriting/optimization
1020          * support for programs.
1021          */
1022         if (opcode != R300_TEX_OP_KIL && fpi->TexSrcTarget == TEXTURE_RECT_INDEX) {
1023                 gl_state_index tokens[STATE_LENGTH] = {
1024                         STATE_INTERNAL, STATE_R300_TEXRECT_FACTOR, 0, 0,
1025                         0
1026                 };
1027                 int factor_index;
1028                 GLuint factorreg;
1029
1030                 tokens[2] = unit;
1031                 factor_index =
1032                         _mesa_add_state_reference(cs->fp->mesa_program.Base.
1033                                                 Parameters, tokens);
1034                 factorreg =
1035                         emit_const4fv(cs,
1036                                 cs->fp->mesa_program.Base.Parameters->
1037                                 ParameterValues[factor_index]);
1038                 tempreg = keep(get_temp_reg(cs));
1039
1040                 emit_arith(cs, PFS_OP_MAD, tempreg, WRITEMASK_XYZW,
1041                         coord, factorreg, pfs_zero, 0);
1042
1043                 coord = tempreg;
1044         }
1045
1046         /* Texture operations do not support swizzles etc. in hardware,
1047          * so emit an additional arithmetic operation if necessary.
1048          */
1049         if (REG_GET_VSWZ(coord) != SWIZZLE_XYZ ||
1050             REG_GET_SSWZ(coord) != SWIZZLE_W ||
1051             coord & (REG_NEGV_MASK | REG_NEGS_MASK | REG_ABS_MASK)) {
1052                 assert(tempreg == 0);
1053                 tempreg = keep(get_temp_reg(cs));
1054                 emit_arith(cs, PFS_OP_MAD, tempreg, WRITEMASK_XYZW,
1055                         coord, pfs_one, pfs_zero, 0);
1056                 coord = tempreg;
1057         }
1058
1059         /* Ensure correct node indirection */
1060         uin = cs->used_in_node;
1061         din = cs->dest_in_node;
1062
1063         /* Resolve source/dest to hardware registers */
1064         hwsrc = t_hw_src(cs, coord, GL_TRUE);
1065
1066         if (opcode != R300_TEX_OP_KIL) {
1067                 dest = t_dst(cs, fpi->DstReg);
1068
1069                 /* r300 doesn't seem to be able to do TEX->output reg */
1070                 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1071                         rdest = dest;
1072                         dest = get_temp_reg_tex(cs);
1073                 } else if (fpi->DstReg.WriteMask != WRITEMASK_XYZW) {
1074                         /* in case write mask isn't XYZW */
1075                         rdest = dest;
1076                         dest = get_temp_reg_tex(cs);
1077                 }
1078                 hwdest =
1079                     t_hw_dst(cs, dest, GL_TRUE,
1080                              code->node[code->cur_node].alu_offset);
1081
1082                 /* Use a temp that hasn't been used in this node, rather
1083                  * than causing an indirection
1084                  */
1085                 if (uin & (1 << hwdest)) {
1086                         free_hw_temp(cs, hwdest);
1087                         hwdest = get_hw_temp_tex(cs);
1088                         cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
1089                 }
1090         } else {
1091                 hwdest = 0;
1092                 unit = 0;
1093         }
1094
1095         /* Indirection if source has been written in this node, or if the
1096          * dest has been read/written in this node
1097          */
1098         if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
1099              (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
1100
1101                 /* Finish off current node */
1102                 if (code->node[code->cur_node].alu_offset == cs->nrslots)
1103                         emit_nop(cs);
1104
1105                 code->node[code->cur_node].alu_end =
1106                     cs->nrslots - code->node[code->cur_node].alu_offset - 1;
1107                 assert(code->node[code->cur_node].alu_end >= 0);
1108
1109                 if (++code->cur_node >= PFS_MAX_TEX_INDIRECT) {
1110                         ERROR("too many levels of texture indirection\n");
1111                         return;
1112                 }
1113
1114                 /* Start new node */
1115                 code->node[code->cur_node].tex_offset = code->tex.length;
1116                 code->node[code->cur_node].alu_offset = cs->nrslots;
1117                 code->node[code->cur_node].tex_end = -1;
1118                 code->node[code->cur_node].alu_end = -1;
1119                 code->node[code->cur_node].flags = 0;
1120                 cs->used_in_node = 0;
1121                 cs->dest_in_node = 0;
1122         }
1123
1124         if (code->cur_node == 0)
1125                 code->first_node_has_tex = 1;
1126
1127         code->tex.inst[code->tex.length++] = 0 | (hwsrc << R300_SRC_ADDR_SHIFT)
1128             | (hwdest << R300_DST_ADDR_SHIFT)
1129             | (unit << R300_TEX_ID_SHIFT)
1130             | (opcode << R300_TEX_INST_SHIFT);
1131
1132         cs->dest_in_node |= (1 << hwdest);
1133         if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
1134                 cs->used_in_node |= (1 << hwsrc);
1135
1136         code->node[code->cur_node].tex_end++;
1137
1138         /* Copy from temp to output if needed */
1139         if (REG_GET_VALID(rdest)) {
1140                 emit_arith(cs, PFS_OP_MAD, rdest, fpi->DstReg.WriteMask, dest,
1141                            pfs_one, pfs_zero, 0);
1142                 free_temp(cs, dest);
1143         }
1144
1145         /* Free temp register */
1146         if (tempreg != 0)
1147                 free_temp(cs, tempreg);
1148 }
1149
1150 /**
1151  * Returns the first slot where we could possibly allow writing to dest,
1152  * according to register allocation.
1153  */
1154 static int get_earliest_allowed_write(struct r300_pfs_compile_state *cs,
1155                                       GLuint dest, int mask)
1156 {
1157         COMPILE_STATE;
1158         int idx;
1159         int pos;
1160         GLuint index = REG_GET_INDEX(dest);
1161         assert(REG_GET_VALID(dest));
1162
1163         switch (REG_GET_TYPE(dest)) {
1164         case REG_TYPE_TEMP:
1165                 if (cs->temps[index].reg == -1)
1166                         return 0;
1167
1168                 idx = cs->temps[index].reg;
1169                 break;
1170         case REG_TYPE_OUTPUT:
1171                 return 0;
1172         default:
1173                 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
1174                 return 0;
1175         }
1176
1177         pos = cs->hwtemps[idx].reserved;
1178         if (mask & WRITEMASK_XYZ) {
1179                 if (pos < cs->hwtemps[idx].vector_lastread)
1180                         pos = cs->hwtemps[idx].vector_lastread;
1181         }
1182         if (mask & WRITEMASK_W) {
1183                 if (pos < cs->hwtemps[idx].scalar_lastread)
1184                         pos = cs->hwtemps[idx].scalar_lastread;
1185         }
1186
1187         return pos;
1188 }
1189
1190 /**
1191  * Allocates a slot for an ALU instruction that can consist of
1192  * a vertex part or a scalar part or both.
1193  *
1194  * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1195  * appropriate position (vector and/or scalar), and their positions are
1196  * recorded in the srcpos array.
1197  *
1198  * This function emits instruction code for the source fetch and the
1199  * argument selection. It does not emit instruction code for the
1200  * opcode or the destination selection.
1201  *
1202  * @return the index of the slot
1203  */
1204 static int find_and_prepare_slot(struct r300_pfs_compile_state *cs,
1205                                  GLboolean emit_vop,
1206                                  GLboolean emit_sop,
1207                                  int argc, GLuint * src, GLuint dest, int mask)
1208 {
1209         COMPILE_STATE;
1210         int hwsrc[3];
1211         int srcpos[3];
1212         unsigned int used;
1213         int tempused;
1214         int tempvsrc[3];
1215         int tempssrc[3];
1216         int pos;
1217         int regnr;
1218         int i, j;
1219
1220         // Determine instruction slots, whether sources are required on
1221         // vector or scalar side, and the smallest slot number where
1222         // all source registers are available
1223         used = 0;
1224         if (emit_vop)
1225                 used |= SLOT_OP_VECTOR;
1226         if (emit_sop)
1227                 used |= SLOT_OP_SCALAR;
1228
1229         pos = get_earliest_allowed_write(cs, dest, mask);
1230
1231         if (code->node[code->cur_node].alu_offset > pos)
1232                 pos = code->node[code->cur_node].alu_offset;
1233         for (i = 0; i < argc; ++i) {
1234                 if (!REG_GET_BUILTIN(src[i])) {
1235                         if (emit_vop)
1236                                 used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
1237                         if (emit_sop)
1238                                 used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
1239                 }
1240
1241                 hwsrc[i] = t_hw_src(cs, src[i], GL_FALSE);      /* Note: sideeffects wrt refcounting! */
1242                 regnr = hwsrc[i] & 31;
1243
1244                 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1245                         if (used & (SLOT_SRC_VECTOR << i)) {
1246                                 if (cs->hwtemps[regnr].vector_valid > pos)
1247                                         pos = cs->hwtemps[regnr].vector_valid;
1248                         }
1249                         if (used & (SLOT_SRC_SCALAR << i)) {
1250                                 if (cs->hwtemps[regnr].scalar_valid > pos)
1251                                         pos = cs->hwtemps[regnr].scalar_valid;
1252                         }
1253                 }
1254         }
1255
1256         // Find a slot that fits
1257         for (;; ++pos) {
1258                 if (cs->slot[pos].used & used & SLOT_OP_BOTH)
1259                         continue;
1260
1261                 if (pos >= cs->nrslots) {
1262                         if (cs->nrslots >= PFS_MAX_ALU_INST) {
1263                                 ERROR("Out of ALU instruction slots\n");
1264                                 return -1;
1265                         }
1266
1267                         fp->code.alu.inst[pos].inst0 = NOP_INST0;
1268                         fp->code.alu.inst[pos].inst1 = NOP_INST1;
1269                         fp->code.alu.inst[pos].inst2 = NOP_INST2;
1270                         fp->code.alu.inst[pos].inst3 = NOP_INST3;
1271
1272                         cs->nrslots++;
1273                 }
1274                 // Note: When we need both parts (vector and scalar) of a source,
1275                 // we always try to put them into the same position. This makes the
1276                 // code easier to read, and it is optimal (i.e. one doesn't gain
1277                 // anything by splitting the parts).
1278                 // It also avoids headaches with swizzles that access both parts (i.e WXY)
1279                 tempused = cs->slot[pos].used;
1280                 for (i = 0; i < 3; ++i) {
1281                         tempvsrc[i] = cs->slot[pos].vsrc[i];
1282                         tempssrc[i] = cs->slot[pos].ssrc[i];
1283                 }
1284
1285                 for (i = 0; i < argc; ++i) {
1286                         int flags = (used >> i) & SLOT_SRC_BOTH;
1287
1288                         if (!flags) {
1289                                 srcpos[i] = 0;
1290                                 continue;
1291                         }
1292
1293                         for (j = 0; j < 3; ++j) {
1294                                 if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
1295                                         if (tempvsrc[j] != hwsrc[i])
1296                                                 continue;
1297                                 }
1298
1299                                 if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
1300                                         if (tempssrc[j] != hwsrc[i])
1301                                                 continue;
1302                                 }
1303
1304                                 break;
1305                         }
1306
1307                         if (j == 3)
1308                                 break;
1309
1310                         srcpos[i] = j;
1311                         tempused |= flags << j;
1312                         if (flags & SLOT_SRC_VECTOR)
1313                                 tempvsrc[j] = hwsrc[i];
1314                         if (flags & SLOT_SRC_SCALAR)
1315                                 tempssrc[j] = hwsrc[i];
1316                 }
1317
1318                 if (i == argc)
1319                         break;
1320         }
1321
1322         // Found a slot, reserve it
1323         cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
1324         for (i = 0; i < 3; ++i) {
1325                 cs->slot[pos].vsrc[i] = tempvsrc[i];
1326                 cs->slot[pos].ssrc[i] = tempssrc[i];
1327         }
1328
1329         for (i = 0; i < argc; ++i) {
1330                 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1331                         int regnr = hwsrc[i] & 31;
1332
1333                         if (used & (SLOT_SRC_VECTOR << i)) {
1334                                 if (cs->hwtemps[regnr].vector_lastread < pos)
1335                                         cs->hwtemps[regnr].vector_lastread =
1336                                             pos;
1337                         }
1338                         if (used & (SLOT_SRC_SCALAR << i)) {
1339                                 if (cs->hwtemps[regnr].scalar_lastread < pos)
1340                                         cs->hwtemps[regnr].scalar_lastread =
1341                                             pos;
1342                         }
1343                 }
1344         }
1345
1346         // Emit the source fetch code
1347         code->alu.inst[pos].inst1 &= ~R300_ALU_SRC_MASK;
1348         code->alu.inst[pos].inst1 |=
1349             ((cs->slot[pos].vsrc[0] << R300_ALU_SRC0C_SHIFT) |
1350              (cs->slot[pos].vsrc[1] << R300_ALU_SRC1C_SHIFT) |
1351              (cs->slot[pos].vsrc[2] << R300_ALU_SRC2C_SHIFT));
1352
1353         code->alu.inst[pos].inst3 &= ~R300_ALU_SRC_MASK;
1354         code->alu.inst[pos].inst3 |=
1355             ((cs->slot[pos].ssrc[0] << R300_ALU_SRC0A_SHIFT) |
1356              (cs->slot[pos].ssrc[1] << R300_ALU_SRC1A_SHIFT) |
1357              (cs->slot[pos].ssrc[2] << R300_ALU_SRC2A_SHIFT));
1358
1359         // Emit the argument selection code
1360         if (emit_vop) {
1361                 int swz[3];
1362
1363                 for (i = 0; i < 3; ++i) {
1364                         if (i < argc) {
1365                                 swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
1366                                           (srcpos[i] *
1367                                            v_swiz[REG_GET_VSWZ(src[i])].
1368                                            stride)) | ((src[i] & REG_NEGV_MASK)
1369                                                        ? ARG_NEG : 0) | ((src[i]
1370                                                                           &
1371                                                                           REG_ABS_MASK)
1372                                                                          ?
1373                                                                          ARG_ABS
1374                                                                          : 0);
1375                         } else {
1376                                 swz[i] = R300_ALU_ARGC_ZERO;
1377                         }
1378                 }
1379
1380                 code->alu.inst[pos].inst0 &=
1381                     ~(R300_ALU_ARG0C_MASK | R300_ALU_ARG1C_MASK |
1382                       R300_ALU_ARG2C_MASK);
1383                 code->alu.inst[pos].inst0 |=
1384                     (swz[0] << R300_ALU_ARG0C_SHIFT) | (swz[1] <<
1385                                                          R300_ALU_ARG1C_SHIFT)
1386                     | (swz[2] << R300_ALU_ARG2C_SHIFT);
1387         }
1388
1389         if (emit_sop) {
1390                 int swz[3];
1391
1392                 for (i = 0; i < 3; ++i) {
1393                         if (i < argc) {
1394                                 swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
1395                                           (srcpos[i] *
1396                                            s_swiz[REG_GET_SSWZ(src[i])].
1397                                            stride)) | ((src[i] & REG_NEGV_MASK)
1398                                                        ? ARG_NEG : 0) | ((src[i]
1399                                                                           &
1400                                                                           REG_ABS_MASK)
1401                                                                          ?
1402                                                                          ARG_ABS
1403                                                                          : 0);
1404                         } else {
1405                                 swz[i] = R300_ALU_ARGA_ZERO;
1406                         }
1407                 }
1408
1409                 code->alu.inst[pos].inst2 &=
1410                     ~(R300_ALU_ARG0A_MASK | R300_ALU_ARG1A_MASK |
1411                       R300_ALU_ARG2A_MASK);
1412                 code->alu.inst[pos].inst2 |=
1413                     (swz[0] << R300_ALU_ARG0A_SHIFT) | (swz[1] <<
1414                                                          R300_ALU_ARG1A_SHIFT)
1415                     | (swz[2] << R300_ALU_ARG2A_SHIFT);
1416         }
1417
1418         return pos;
1419 }
1420
1421 /**
1422  * Append an ALU instruction to the instruction list.
1423  */
1424 static void emit_arith(struct r300_pfs_compile_state *cs,
1425                        int op,
1426                        GLuint dest,
1427                        int mask,
1428                        GLuint src0, GLuint src1, GLuint src2, int flags)
1429 {
1430         COMPILE_STATE;
1431         GLuint src[3] = { src0, src1, src2 };
1432         int hwdest;
1433         GLboolean emit_vop, emit_sop;
1434         int vop, sop, argc;
1435         int pos;
1436
1437         vop = r300_fpop[op].v_op;
1438         sop = r300_fpop[op].s_op;
1439         argc = r300_fpop[op].argc;
1440
1441         if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
1442             REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
1443                 if (mask & WRITEMASK_Z) {
1444                         mask = WRITEMASK_W;
1445                 } else {
1446                         return;
1447                 }
1448         }
1449
1450         emit_vop = GL_FALSE;
1451         emit_sop = GL_FALSE;
1452         if ((mask & WRITEMASK_XYZ) || vop == R300_ALU_OUTC_DP3)
1453                 emit_vop = GL_TRUE;
1454         if ((mask & WRITEMASK_W) || vop == R300_ALU_OUTC_REPL_ALPHA)
1455                 emit_sop = GL_TRUE;
1456
1457         pos =
1458             find_and_prepare_slot(cs, emit_vop, emit_sop, argc, src, dest,
1459                                   mask);
1460         if (pos < 0)
1461                 return;
1462
1463         hwdest = t_hw_dst(cs, dest, GL_FALSE, pos);     /* Note: Side effects wrt register allocation */
1464
1465         if (flags & PFS_FLAG_SAT) {
1466                 vop |= R300_ALU_OUTC_CLAMP;
1467                 sop |= R300_ALU_OUTA_CLAMP;
1468         }
1469
1470         /* Throw the pieces together and get ALU/1 */
1471         if (emit_vop) {
1472                 code->alu.inst[pos].inst0 |= vop;
1473
1474                 code->alu.inst[pos].inst1 |= hwdest << R300_ALU_DSTC_SHIFT;
1475
1476                 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1477                         if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1478                                 code->alu.inst[pos].inst1 |=
1479                                     (mask & WRITEMASK_XYZ) <<
1480                                     R300_ALU_DSTC_OUTPUT_MASK_SHIFT;
1481                         } else
1482                                 assert(0);
1483                 } else {
1484                         code->alu.inst[pos].inst1 |=
1485                             (mask & WRITEMASK_XYZ) <<
1486                             R300_ALU_DSTC_REG_MASK_SHIFT;
1487
1488                         cs->hwtemps[hwdest].vector_valid = pos + 1;
1489                 }
1490         }
1491
1492         /* And now ALU/3 */
1493         if (emit_sop) {
1494                 code->alu.inst[pos].inst2 |= sop;
1495
1496                 if (mask & WRITEMASK_W) {
1497                         if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1498                                 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1499                                         code->alu.inst[pos].inst3 |=
1500                                             (hwdest << R300_ALU_DSTA_SHIFT) |
1501                                             R300_ALU_DSTA_OUTPUT;
1502                                 } else if (REG_GET_INDEX(dest) ==
1503                                            FRAG_RESULT_DEPR) {
1504                                         code->alu.inst[pos].inst3 |=
1505                                             R300_ALU_DSTA_DEPTH;
1506                                 } else
1507                                         assert(0);
1508                         } else {
1509                                 code->alu.inst[pos].inst3 |=
1510                                     (hwdest << R300_ALU_DSTA_SHIFT) |
1511                                     R300_ALU_DSTA_REG;
1512
1513                                 cs->hwtemps[hwdest].scalar_valid = pos + 1;
1514                         }
1515                 }
1516         }
1517
1518         return;
1519 }
1520
1521 #if 0
     /*
      * Compiled out.  Builds a register handle of type REG_TYPE_INPUT for
      * fragment attribute attr.  If the program's InputsRead mask does not
      * contain the attribute, the problem is reported through ERROR and
      * undef is returned instead.
      */
1522 static GLuint get_attrib(struct r300_fragment_program *fp, GLuint attr)
1523 {
1524         struct gl_fragment_program *prog = &fp->mesa_program;
1525         GLuint reg = undef;
1526
             /* Refuse attributes the program never reads. */
1527         if ((prog->Base.InputsRead & (1 << attr)) == 0) {
1528                 ERROR("Attribute %d was not provided!\n", attr);
1529                 return undef;
1530         }
1531
             /* Fill in the handle: input file, attribute index, valid flag. */
1532         REG_SET_TYPE(reg, REG_TYPE_INPUT);
1533         REG_SET_INDEX(reg, attr);
1534         REG_SET_VALID(reg, GL_TRUE);
1535         return reg;
1536 }
1537 #endif
1538
     /**
      * Constants for the parabola-based SIN/COS/SCS expansions in
      * parse_program().  Each row is uploaded as one constant register
      * through emit_const4fv() and addressed by swizzle.
      *
      * The table is only ever read, so it is const like LitConst.
      */
1539 static const GLfloat SinCosConsts[2][4] = {
1540         {
1541          1.273239545,           // 4/PI       (linear coefficient of the parabola)
1542          -0.405284735,          // -4/(PI*PI) (quadratic coefficient of the parabola)
1543          3.141592654,           // PI
1544          0.2225                 // weight used for the extra-precision step
1545          },
1546         {
1547          0.75,                  // offset added by COS before taking the fraction
1548          0.0,                   // not referenced by the SIN/COS/SCS code
1549          0.159154943,           // 1/(2*PI)
1550          6.283185307            // 2*PI
1551          }
1552 };
1553
1554 /**
1555  * Emit a LIT instruction (implemented by emit_lit() below).
1556  * \p flags may be PFS_FLAG_SAT
1557  *
1558  * Definition of LIT (from ARB_fragment_program):
1559  * tmp = VectorLoad(op0);
1560  * if (tmp.x < 0) tmp.x = 0;
1561  * if (tmp.y < 0) tmp.y = 0;
1562  * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
1563  * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
1564  * result.x = 1.0;
1565  * result.y = tmp.x;
1566  * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
1567  * result.w = 1.0;
1568  *
1569  * The longest path of computation is the one leading to result.z,
1570  * consisting of 5 operations. This implementation of LIT takes
1571  * 5 slots. So unless there's some special undocumented opcode,
1572  * this implementation is potentially optimal. Unfortunately,
1573  * emit_arith is a bit too conservative because it doesn't understand
1574  * partial writes to the vector component.
      *
      * LitConst holds the +/-(128 - epsilon) bounds used to clamp tmp.w:
      * the MAX in the first slot writes W (lower bound, LitConst.w) and
      * the MIN in the second slot writes Z (upper bound, LitConst.z).
1575  */
1576 static const GLfloat LitConst[4] =
1577     { 127.999999, 127.999999, 127.999999, -127.999999 };
1578
1579 static void emit_lit(struct r300_pfs_compile_state *cs,
1580                      GLuint dest, int mask, GLuint src, int flags)
1581 {
1582         COMPILE_STATE;
1583         GLuint cnst;
1584         int needTemporary;
1585         GLuint temp;
1586
1587         cnst = emit_const4fv(cs, LitConst);
1588
             // The sequence below writes all four components of its working
             // register, so a scratch register is used whenever the caller
             // asked for a partial write or the destination is an output.
1589         needTemporary = 0;
1590         if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
1591                 needTemporary = 1;
1592         } else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1593                 // LIT is typically followed by DP3/DP4, so there's no point
1594                 // in creating special code for this case
1595                 needTemporary = 1;
1596         }
1597
1598         if (needTemporary) {
1599                 temp = keep(get_temp_reg(cs));
1600         } else {
1601                 temp = keep(dest);
1602         }
1603
1604         // Note: The order of emit_arith inside the slots is relevant,
1605         // because emit_arith only looks at scalar vs. vector when resolving
1606         // dependencies, and it does not consider individual vector components,
1607         // so swizzling between the two parts can create fake dependencies.
1608
1609         // First slot
             //   temp.xy = max(src.xy, 0)
             //   temp.w  = max(src.w, lower bound)
1610         emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_XY,
1611                    keep(src), pfs_zero, undef, 0);
1612         emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);
1613
1614         // Second slot
             //   temp.z = min(temp.w, upper bound)   (the clamped exponent)
             //   temp.w = log2(temp.y)
1615         emit_arith(cs, PFS_OP_MIN, temp, WRITEMASK_Z,
1616                    swizzle(temp, W, W, W, W), cnst, undef, 0);
1617         emit_arith(cs, PFS_OP_LG2, temp, WRITEMASK_W,
1618                    swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
1619
1620         // Third slot
             //   temp.w = log2(y) * exponent   (presumably the W lane of each operand)
             //   temp.y = temp.x               (result.y)
1621         // If desired, we saturate the y result here.
1622         // This does not affect the use as a condition variable in the CMP later
1623         emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W,
1624                    temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
1625         emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_Y,
1626                    swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
1627
1628         // Fourth slot
             //   temp.x = 1.0                  (result.x)
             //   temp.w = 2^temp.w             (the approximated power)
1629         emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_X,
1630                    pfs_one, pfs_one, pfs_zero, 0);
1631         emit_arith(cs, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);
1632
1633         // Fifth slot
             //   temp.z = (-temp.y < 0) ? temp.w : 0   (result.z)
             //   temp.w = 1.0                          (result.w)
1634         emit_arith(cs, PFS_OP_CMP, temp, WRITEMASK_Z,
1635                    pfs_zero, swizzle(temp, W, W, W, W),
1636                    negate(swizzle(temp, Y, Y, Y, Y)), flags);
1637         emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
1638                    pfs_zero, 0);
1639
1640         if (needTemporary) {
                     // Copy the requested components to the real destination
                     // and release the scratch register.
1641                 emit_arith(cs, PFS_OP_MAD, dest, mask,
1642                            temp, pfs_one, pfs_zero, flags);
1643                 free_temp(cs, temp);
1644         } else {
1645                 // Decrease refcount of the destination
1646                 t_hw_dst(cs, dest, GL_FALSE, cs->nrslots);
1647         }
1648 }
1649
     /**
      * Translate the Mesa fragment program into R300 operations.
      *
      * Walks mp->Base.Instructions up to OPCODE_END and expands every
      * instruction through emit_arith(), emit_lit() or emit_tex().
      * SaturateMode == SATURATE_ZERO_ONE is forwarded as PFS_FLAG_SAT; for
      * all opcodes except KIL the destination register and write mask are
      * translated before the opcode is dispatched.
      *
      * \param cs compile state; COMPILE_STATE derives the locals (such as
      *           fp) that the code below relies on.
      * \return GL_FALSE if the program is empty or fp->error is set after an
      *         instruction has been processed, GL_TRUE otherwise.
      */
1650 static GLboolean parse_program(struct r300_pfs_compile_state *cs)
1651 {
1652         COMPILE_STATE;
1653         struct gl_fragment_program *mp = &fp->mesa_program;
1654         const struct prog_instruction *inst = mp->Base.Instructions;
1655         struct prog_instruction *fpi;
1656         GLuint src[3], dest, temp[2];
1657         int flags, mask = 0;
1658         int const_sin[2];
1659
1660         if (!inst || inst[0].Opcode == OPCODE_END) {
1661                 ERROR("empty program?\n");
1662                 return GL_FALSE;
1663         }
1664
1665         for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
1666                 if (fpi->SaturateMode == SATURATE_ZERO_ONE)
1667                         flags = PFS_FLAG_SAT;
1668                 else
1669                         flags = 0;
1670
                     /* KIL is handed to emit_tex() below and uses neither
                      * dest nor mask. */
1671                 if (fpi->Opcode != OPCODE_KIL) {
1672                         dest = t_dst(cs, fpi->DstReg);
1673                         mask = fpi->DstReg.WriteMask;
1674                 }
1675
1676                 switch (fpi->Opcode) {
1677                 case OPCODE_ABS:
1678                         src[0] = t_src(cs, fpi->SrcReg[0]);
1679                         emit_arith(cs, PFS_OP_MAD, dest, mask,
1680                                    absolute(src[0]), pfs_one, pfs_zero, flags);
1681                         break;
1682                 case OPCODE_ADD:
1683                         src[0] = t_src(cs, fpi->SrcReg[0]);
1684                         src[1] = t_src(cs, fpi->SrcReg[1]);
1685                         emit_arith(cs, PFS_OP_MAD, dest, mask,
1686                                    src[0], pfs_one, src[1], flags);
1687                         break;
1688                 case OPCODE_CMP:
1689                         src[0] = t_src(cs, fpi->SrcReg[0]);
1690                         src[1] = t_src(cs, fpi->SrcReg[1]);
1691                         src[2] = t_src(cs, fpi->SrcReg[2]);
1692                         /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1693                          *    r300 - if src2.c < 0.0 ? src1.c : src0.c
1694                          */
1695                         emit_arith(cs, PFS_OP_CMP, dest, mask,
1696                                    src[2], src[1], src[0], flags);
1697                         break;
1698                 case OPCODE_COS:
1699                         /*
1700                          * cos using a parabola (see SIN):
1701                          * cos(x):
1702                          *   x = (x/(2*PI))+0.75
1703                          *   x = frac(x)
1704                          *   x = (x*2*PI)-PI
1705                          *   result = sin(x)
1706                          */
1707                         temp[0] = get_temp_reg(cs);
1708                         const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1709                         const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1710                         src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1711
1712                         /* add 0.5*PI and do range reduction */
1713
1714                         emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1715                                    swizzle(src[0], X, X, X, X),
1716                                    swizzle(const_sin[1], Z, Z, Z, Z),
1717                                    swizzle(const_sin[1], X, X, X, X), 0);
1718
1719                         emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
1720                                    swizzle(temp[0], X, X, X, X),
1721                                    undef, undef, 0);
1722
1723                         emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),       //2*PI
1724                                    negate(swizzle(const_sin[0], Z, Z, Z, Z)),   //-PI
1725                                    0);
1726
1727                         /* SIN */
1728
1729                         emit_arith(cs, PFS_OP_MAD, temp[0],
1730                                    WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1731                                                                       Z, Z, Z,
1732                                                                       Z),
1733                                    const_sin[0], pfs_zero, 0);
1734
1735                         emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1736                                    swizzle(temp[0], Y, Y, Y, Y),
1737                                    absolute(swizzle(temp[0], Z, Z, Z, Z)),
1738                                    swizzle(temp[0], X, X, X, X), 0);
1739
1740                         emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1741                                    swizzle(temp[0], X, X, X, X),
1742                                    absolute(swizzle(temp[0], X, X, X, X)),
1743                                    negate(swizzle(temp[0], X, X, X, X)), 0);
1744
1745                         emit_arith(cs, PFS_OP_MAD, dest, mask,
1746                                    swizzle(temp[0], Y, Y, Y, Y),
1747                                    swizzle(const_sin[0], W, W, W, W),
1748                                    swizzle(temp[0], X, X, X, X), flags);
1749
1750                         free_temp(cs, temp[0]);
1751                         break;
1752                 case OPCODE_DP3:
1753                         src[0] = t_src(cs, fpi->SrcReg[0]);
1754                         src[1] = t_src(cs, fpi->SrcReg[1]);
1755                         emit_arith(cs, PFS_OP_DP3, dest, mask,
1756                                    src[0], src[1], undef, flags);
1757                         break;
1758                 case OPCODE_DP4:
1759                         src[0] = t_src(cs, fpi->SrcReg[0]);
1760                         src[1] = t_src(cs, fpi->SrcReg[1]);
1761                         emit_arith(cs, PFS_OP_DP4, dest, mask,
1762                                    src[0], src[1], undef, flags);
1763                         break;
1764                 case OPCODE_DPH:
1765                         src[0] = t_src(cs, fpi->SrcReg[0]);
1766                         src[1] = t_src(cs, fpi->SrcReg[1]);
1767                         /* src0.xyz1 -> temp
1768                          * DP4 dest, temp, src1
1769                          */
1770                         emit_arith(cs, PFS_OP_DP4, dest, mask,
1771                                    swizzle(src[0], X, Y, Z, ONE), src[1],
1772                                    undef, flags);
1773                         break;
1774                 case OPCODE_DST:
1775                         src[0] = t_src(cs, fpi->SrcReg[0]);
1776                         src[1] = t_src(cs, fpi->SrcReg[1]);
1777                         /* dest.y = src0.y * src1.y */
1778                         if (mask & WRITEMASK_Y)
1779                                 emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Y,
1780                                            keep(src[0]), keep(src[1]),
1781                                            pfs_zero, flags);
1782                         /* dest.z = src0.z */
1783                         if (mask & WRITEMASK_Z)
1784                                 emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Z,
1785                                            src[0], pfs_one, pfs_zero, flags);
1786                         /* result.x = 1.0
1787                          * result.w = src1.w */
1788                         if (mask & WRITEMASK_XW) {
1789                                 REG_SET_VSWZ(src[1], SWIZZLE_111);      /*Cheat */
1790                                 emit_arith(cs, PFS_OP_MAD, dest,
1791                                            mask & WRITEMASK_XW,
1792                                            src[1], pfs_one, pfs_zero, flags);
1793                         }
1794                         break;
1795                 case OPCODE_EX2:
1796                         src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1797                         emit_arith(cs, PFS_OP_EX2, dest, mask,
1798                                    src[0], undef, undef, flags);
1799                         break;
1800                 case OPCODE_FLR:
1801                         src[0] = t_src(cs, fpi->SrcReg[0]);
1802                         temp[0] = get_temp_reg(cs);
1803                         /* FRC temp, src0
1804                          * MAD dest, src0, 1.0, -temp
1805                          */
1806                         emit_arith(cs, PFS_OP_FRC, temp[0], mask,
1807                                    keep(src[0]), undef, undef, 0);
1808                         emit_arith(cs, PFS_OP_MAD, dest, mask,
1809                                    src[0], pfs_one, negate(temp[0]), flags);
1810                         free_temp(cs, temp[0]);
1811                         break;
1812                 case OPCODE_FRC:
1813                         src[0] = t_src(cs, fpi->SrcReg[0]);
1814                         emit_arith(cs, PFS_OP_FRC, dest, mask,
1815                                    src[0], undef, undef, flags);
1816                         break;
1817                 case OPCODE_KIL:
1818                         emit_tex(cs, fpi, R300_TEX_OP_KIL);
1819                         break;
1820                 case OPCODE_LG2:
1821                         src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1822                         emit_arith(cs, PFS_OP_LG2, dest, mask,
1823                                    src[0], undef, undef, flags);
1824                         break;
1825                 case OPCODE_LIT:
1826                         src[0] = t_src(cs, fpi->SrcReg[0]);
1827                         emit_lit(cs, dest, mask, src[0], flags);
1828                         break;
1829                 case OPCODE_LRP:
1830                         src[0] = t_src(cs, fpi->SrcReg[0]);
1831                         src[1] = t_src(cs, fpi->SrcReg[1]);
1832                         src[2] = t_src(cs, fpi->SrcReg[2]);
1833                         /* result = tmp0tmp1 + (1 - tmp0)tmp2
1834                          *        = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1835                          *     MAD temp, -tmp0, tmp2, tmp2
1836                          *     MAD result, tmp0, tmp1, temp
1837                          */
1838                         temp[0] = get_temp_reg(cs);
1839                         emit_arith(cs, PFS_OP_MAD, temp[0], mask,
1840                                    negate(keep(src[0])), keep(src[2]), src[2],
1841                                    0);
1842                         emit_arith(cs, PFS_OP_MAD, dest, mask,
1843                                    src[0], src[1], temp[0], flags);
1844                         free_temp(cs, temp[0]);
1845                         break;
1846                 case OPCODE_MAD:
1847                         src[0] = t_src(cs, fpi->SrcReg[0]);
1848                         src[1] = t_src(cs, fpi->SrcReg[1]);
1849                         src[2] = t_src(cs, fpi->SrcReg[2]);
1850                         emit_arith(cs, PFS_OP_MAD, dest, mask,
1851                                    src[0], src[1], src[2], flags);
1852                         break;
1853                 case OPCODE_MAX:
1854                         src[0] = t_src(cs, fpi->SrcReg[0]);
1855                         src[1] = t_src(cs, fpi->SrcReg[1]);
1856                         emit_arith(cs, PFS_OP_MAX, dest, mask,
1857                                    src[0], src[1], undef, flags);
1858                         break;
1859                 case OPCODE_MIN:
1860                         src[0] = t_src(cs, fpi->SrcReg[0]);
1861                         src[1] = t_src(cs, fpi->SrcReg[1]);
1862                         emit_arith(cs, PFS_OP_MIN, dest, mask,
1863                                    src[0], src[1], undef, flags);
1864                         break;
1865                 case OPCODE_MOV:
1866                 case OPCODE_SWZ:
1867                         src[0] = t_src(cs, fpi->SrcReg[0]);
1868                         emit_arith(cs, PFS_OP_MAD, dest, mask,
1869                                    src[0], pfs_one, pfs_zero, flags);
1870                         break;
1871                 case OPCODE_MUL:
1872                         src[0] = t_src(cs, fpi->SrcReg[0]);
1873                         src[1] = t_src(cs, fpi->SrcReg[1]);
1874                         emit_arith(cs, PFS_OP_MAD, dest, mask,
1875                                    src[0], src[1], pfs_zero, flags);
1876                         break;
1877                 case OPCODE_POW:
                             /* pow(x, y) = 2^(y * log2(x)).  The intermediate
                              * value lives in temp.w; saturation is applied
                              * only to the final EX2 that writes dest.
                              */
1878                         src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1879                         src[1] = t_scalar_src(cs, fpi->SrcReg[1]);
1880                         temp[0] = get_temp_reg(cs);
1881                         emit_arith(cs, PFS_OP_LG2, temp[0], WRITEMASK_W,
1882                                    src[0], undef, undef, 0);
1883                         emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_W,
1884                                    temp[0], src[1], pfs_zero, 0);
1885                         emit_arith(cs, PFS_OP_EX2, dest, mask,
1886                                    temp[0], undef, undef, flags);
1887                         free_temp(cs, temp[0]);
1888                         break;
1889                 case OPCODE_RCP:
1890                         src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1891                         emit_arith(cs, PFS_OP_RCP, dest, mask,
1892                                    src[0], undef, undef, flags);
1893                         break;
1894                 case OPCODE_RSQ:
1895                         src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1896                         emit_arith(cs, PFS_OP_RSQ, dest, mask,
1897                                    absolute(src[0]), pfs_zero, pfs_zero, flags);
1898                         break;
1899                 case OPCODE_SCS:
1900                         /*
1901                          * scs using a parabola :
1902                          * scs(x):
1903                          *   result.x = sin(-abs(x)+0.5*PI)  (cos)
1904                          *   result.y = sin(x)               (sin)
1905                          *
                              * Unlike SIN/COS, no range reduction is emitted here.
1906                          */
1907                         temp[0] = get_temp_reg(cs);
1908                         temp[1] = get_temp_reg(cs);
1909                         const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1910                         const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1911                         src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1912
1913                         /* x = -abs(x)+0.5*PI */
                             /* absolute() is the register modifier helper used
                              * everywhere else in this function; the C library
                              * abs() must not be used on a register handle. */
1914                         emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(const_sin[0], Z, Z, Z, Z),     //PI
1915                                    pfs_half,
1916                                    negate(absolute
1917                                           (swizzle(keep(src[0]), X, X, X, X))),
1918                                    0);
1919
1920                         /* C*x (sin) */
1921                         emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_W,
1922                                    swizzle(const_sin[0], Y, Y, Y, Y),
1923                                    swizzle(keep(src[0]), X, X, X, X),
1924                                    pfs_zero, 0);
1925
1926                         /* B*x, C*x (cos) */
1927                         emit_arith(cs, PFS_OP_MAD, temp[0],
1928                                    WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1929                                                                       Z, Z, Z,
1930                                                                       Z),
1931                                    const_sin[0], pfs_zero, 0);
1932
1933                         /* B*x (sin) */
1934                         emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
1935                                    swizzle(const_sin[0], X, X, X, X),
1936                                    keep(src[0]), pfs_zero, 0);
1937
1938                         /* y = B*x + C*x*abs(x) (sin) */
1939                         emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_Z,
1940                                    absolute(src[0]),
1941                                    swizzle(temp[0], W, W, W, W),
1942                                    swizzle(temp[1], W, W, W, W), 0);
1943
1944                         /* y = B*x + C*x*abs(x) (cos) */
1945                         emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
1946                                    swizzle(temp[0], Y, Y, Y, Y),
1947                                    absolute(swizzle(temp[0], Z, Z, Z, Z)),
1948                                    swizzle(temp[0], X, X, X, X), 0);
1949
1950                         /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1951                         emit_arith(cs, PFS_OP_MAD, temp[0],
1952                                    WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1],
1953                                                                       W, Z, Y,
1954                                                                       X),
1955                                    absolute(swizzle(temp[1], W, Z, Y, X)),
1956                                    negate(swizzle(temp[1], W, Z, Y, X)), 0);
1957
1958                         /* dest.xy = mad(temp.xy, P, temp2.wz) */
1959                         emit_arith(cs, PFS_OP_MAD, dest,
1960                                    mask & (WRITEMASK_X | WRITEMASK_Y), temp[0],
1961                                    swizzle(const_sin[0], W, W, W, W),
1962                                    swizzle(temp[1], W, Z, Y, X), flags);
1963
1964                         free_temp(cs, temp[0]);
1965                         free_temp(cs, temp[1]);
1966                         break;
1967                 case OPCODE_SGE:
1968                         src[0] = t_src(cs, fpi->SrcReg[0]);
1969                         src[1] = t_src(cs, fpi->SrcReg[1]);
1970                         temp[0] = get_temp_reg(cs);
1971                         /* temp = src0 - src1
1972                          * dest.c = (temp.c < 0.0) ? 0 : 1
1973                          */
1974                         emit_arith(cs, PFS_OP_MAD, temp[0], mask,
1975                                    src[0], pfs_one, negate(src[1]), 0);
1976                         emit_arith(cs, PFS_OP_CMP, dest, mask,
1977                                    pfs_one, pfs_zero, temp[0], 0);
1978                         free_temp(cs, temp[0]);
1979                         break;
1980                 case OPCODE_SIN:
1981                         /*
1982                          *  using a parabola:
1983                          * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1984                          * extra precision is obtained by weighting against
1985                          * itself squared.
1986                          */
1987
1988                         temp[0] = get_temp_reg(cs);
1989                         const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1990                         const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1991                         src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1992
1993                         /* do range reduction */
1994
1995                         emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1996                                    swizzle(keep(src[0]), X, X, X, X),
1997                                    swizzle(const_sin[1], Z, Z, Z, Z),
1998                                    pfs_half, 0);
1999
2000                         emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
2001                                    swizzle(temp[0], X, X, X, X),
2002                                    undef, undef, 0);
2003
2004                         emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),       //2*PI
2005                                    negate(swizzle(const_sin[0], Z, Z, Z, Z)),   //-PI
2006                                    0);
2007
2008                         /* SIN */
2009
2010                         emit_arith(cs, PFS_OP_MAD, temp[0],
2011                                    WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
2012                                                                       Z, Z, Z,
2013                                                                       Z),
2014                                    const_sin[0], pfs_zero, 0);
2015
2016                         emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
2017                                    swizzle(temp[0], Y, Y, Y, Y),
2018                                    absolute(swizzle(temp[0], Z, Z, Z, Z)),
2019                                    swizzle(temp[0], X, X, X, X), 0);
2020
2021                         emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
2022                                    swizzle(temp[0], X, X, X, X),
2023                                    absolute(swizzle(temp[0], X, X, X, X)),
2024                                    negate(swizzle(temp[0], X, X, X, X)), 0);
2025
2026                         emit_arith(cs, PFS_OP_MAD, dest, mask,
2027                                    swizzle(temp[0], Y, Y, Y, Y),
2028                                    swizzle(const_sin[0], W, W, W, W),
2029                                    swizzle(temp[0], X, X, X, X), flags);
2030
2031                         free_temp(cs, temp[0]);
2032                         break;
2033                 case OPCODE_SLT:
2034                         src[0] = t_src(cs, fpi->SrcReg[0]);
2035                         src[1] = t_src(cs, fpi->SrcReg[1]);
2036                         temp[0] = get_temp_reg(cs);
2037                         /* temp = src0 - src1
2038                          * dest.c = (temp.c < 0.0) ? 1 : 0
2039                          */
2040                         emit_arith(cs, PFS_OP_MAD, temp[0], mask,
2041                                    src[0], pfs_one, negate(src[1]), 0);
2042                         emit_arith(cs, PFS_OP_CMP, dest, mask,
2043                                    pfs_zero, pfs_one, temp[0], 0);
2044                         free_temp(cs, temp[0]);
2045                         break;
2046                 case OPCODE_SUB:
2047                         src[0] = t_src(cs, fpi->SrcReg[0]);
2048                         src[1] = t_src(cs, fpi->SrcReg[1]);
2049                         emit_arith(cs, PFS_OP_MAD, dest, mask,
2050                                    src[0], pfs_one, negate(src[1]), flags);
2051                         break;
2052                 case OPCODE_TEX:
2053                         emit_tex(cs, fpi, R300_TEX_OP_LD);
2054                         break;
2055                 case OPCODE_TXB:
2056                         emit_tex(cs, fpi, R300_TEX_OP_TXB);
2057                         break;
2058                 case OPCODE_TXP:
2059                         emit_tex(cs, fpi, R300_TEX_OP_TXP);
2060                         break;
2061                 case OPCODE_XPD:{
2062                                 src[0] = t_src(cs, fpi->SrcReg[0]);
2063                                 src[1] = t_src(cs, fpi->SrcReg[1]);
2064                                 temp[0] = get_temp_reg(cs);
2065                                 /* temp = src0.zxy * src1.yzx */
2066                                 emit_arith(cs, PFS_OP_MAD, temp[0],
2067                                            WRITEMASK_XYZ, swizzle(keep(src[0]),
2068                                                                   Z, X, Y, W),
2069                                            swizzle(keep(src[1]), Y, Z, X, W),
2070                                            pfs_zero, 0);
2071                                 /* dest.xyz = src0.yzx * src1.zxy - temp
2072                                  * dest.w       = undefined
2073                                  * */
2074                                 emit_arith(cs, PFS_OP_MAD, dest,
2075                                            mask & WRITEMASK_XYZ, swizzle(src[0],
2076                                                                          Y, Z,
2077                                                                          X, W),
2078                                            swizzle(src[1], Z, X, Y, W),
2079                                            negate(temp[0]), flags);
2080                                 /* cleanup */
2081                                 free_temp(cs, temp[0]);
2082                                 break;
2083                         }
2084                 default:
2085                         ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
2086                         break;
2087                 }
2088
                     /* Stop at the first instruction that left the program
                      * in an error state. */
2089                 if (fp->error)
2090                         return GL_FALSE;
2091
2092         }
2093
2094         return GL_TRUE;
2095 }
2096
2097 static void insert_wpos(struct gl_program *prog)
2098 {
2099         static gl_state_index tokens[STATE_LENGTH] = {
2100                 STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
2101         };
2102         struct prog_instruction *fpi;
2103         GLuint window_index;
2104         int i = 0;
2105         GLuint tempregi = prog->NumTemporaries;
2106         /* should do something else if no temps left... */
2107         prog->NumTemporaries++;
2108
2109         fpi = _mesa_alloc_instructions(prog->NumInstructions + 3);
2110         _mesa_init_instructions(fpi, prog->NumInstructions + 3);
2111
2112         /* perspective divide */
2113         fpi[i].Opcode = OPCODE_RCP;
2114
2115         fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2116         fpi[i].DstReg.Index = tempregi;
2117         fpi[i].DstReg.WriteMask = WRITEMASK_W;
2118         fpi[i].DstReg.CondMask = COND_TR;
2119
2120         fpi[i].SrcReg[0].File = PROGRAM_INPUT;
2121         fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
2122         fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
2123         i++;
2124
2125         fpi[i].Opcode = OPCODE_MUL;
2126
2127         fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2128         fpi[i].DstReg.Index = tempregi;
2129         fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
2130         fpi[i].DstReg.CondMask = COND_TR;
2131
2132         fpi[i].SrcReg[0].File = PROGRAM_INPUT;
2133         fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
2134         fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
2135
2136         fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
2137         fpi[i].SrcReg[1].Index = tempregi;
2138         fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
2139         i++;
2140
2141         /* viewport transformation */
2142         window_index = _mesa_add_state_reference(prog->Parameters, tokens);
2143
2144         fpi[i].Opcode = OPCODE_MAD;
2145
2146         fpi[i].DstReg.File = PROGRAM_TEMPORARY;
2147         fpi[i].DstReg.Index = tempregi;
2148         fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
2149         fpi[i].DstReg.CondMask = COND_TR;
2150
2151         fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
2152         fpi[i].SrcReg[0].Index = tempregi;
2153         fpi[i].SrcReg[0].Swizzle =
2154             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2155
2156         fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
2157         fpi[i].SrcReg[1].Index = window_index;
2158         fpi[i].SrcReg[1].Swizzle =
2159             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2160
2161         fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
2162         fpi[i].SrcReg[2].Index = window_index;
2163         fpi[i].SrcReg[2].Swizzle =
2164             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
2165         i++;
2166
2167         _mesa_copy_instructions(&fpi[i], prog->Instructions,
2168                                 prog->NumInstructions);
2169
2170         free(prog->Instructions);
2171
2172         prog->Instructions = fpi;
2173
2174         prog->NumInstructions += i;
2175         fpi = &prog->Instructions[prog->NumInstructions - 1];
2176
2177         assert(fpi->Opcode == OPCODE_END);
2178
2179         for (fpi = &prog->Instructions[3]; fpi->Opcode != OPCODE_END; fpi++) {
2180                 for (i = 0; i < 3; i++)
2181                         if (fpi->SrcReg[i].File == PROGRAM_INPUT &&
2182                             fpi->SrcReg[i].Index == FRAG_ATTRIB_WPOS) {
2183                                 fpi->SrcReg[i].File = PROGRAM_TEMPORARY;
2184                                 fpi->SrcReg[i].Index = tempregi;
2185                         }
2186         }
2187 }
2188
2189 /* - Init structures
2190  * - Determine what hwregs each input corresponds to
2191  */
2192 static void init_program(struct r300_pfs_compile_state *cs)
2193 {
2194         COMPILE_STATE;
2195         struct gl_fragment_program *mp = &fp->mesa_program;
2196         struct prog_instruction *fpi;
2197         GLuint InputsRead = mp->Base.InputsRead;
2198         GLuint temps_used = 0;  /* for fp->temps[] */
2199         int i, j;
2200
2201         /* New compile, reset tracking data */
2202         fp->optimization =
2203             driQueryOptioni(&cs->r300->radeon.optionCache, "fp_optimization");
2204         fp->translated = GL_FALSE;
2205         fp->error = GL_FALSE;
2206         fp->WritesDepth = GL_FALSE;
2207         code->tex.length = 0;
2208         code->cur_node = 0;
2209         code->first_node_has_tex = 0;
2210         code->const_nr = 0;
2211         code->max_temp_idx = 0;
2212         code->node[0].alu_end = -1;
2213         code->node[0].tex_end = -1;
2214
2215         for (i = 0; i < PFS_MAX_ALU_INST; i++) {
2216                 for (j = 0; j < 3; j++) {
2217                         cs->slot[i].vsrc[j] = SRC_CONST;
2218                         cs->slot[i].ssrc[j] = SRC_CONST;
2219                 }
2220         }
2221
2222         /* Work out what temps the Mesa inputs correspond to, this must match
2223          * what setup_rs_unit does, which shouldn't be a problem as rs_unit
2224          * configures itself based on the fragprog's InputsRead
2225          *
2226          * NOTE: this depends on get_hw_temp() allocating registers in order,
2227          * starting from register 0.
2228          */
2229
2230         /* Texcoords come first */
2231         for (i = 0; i < cs->r300->radeon.glCtx->Const.MaxTextureUnits; i++) {
2232                 if (InputsRead & (FRAG_BIT_TEX0 << i)) {
2233                         cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
2234                         cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
2235                             get_hw_temp(cs, 0);
2236                 }
2237         }
2238         InputsRead &= ~FRAG_BITS_TEX_ANY;
2239
2240         /* fragment position treated as a texcoord */
2241         if (InputsRead & FRAG_BIT_WPOS) {
2242                 cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
2243                 cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(cs, 0);
2244                 insert_wpos(&mp->Base);
2245         }
2246         InputsRead &= ~FRAG_BIT_WPOS;
2247
2248         /* Then primary colour */
2249         if (InputsRead & FRAG_BIT_COL0) {
2250                 cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
2251                 cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(cs, 0);
2252         }
2253         InputsRead &= ~FRAG_BIT_COL0;
2254
2255         /* Secondary color */
2256         if (InputsRead & FRAG_BIT_COL1) {
2257                 cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
2258                 cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(cs, 0);
2259         }
2260         InputsRead &= ~FRAG_BIT_COL1;
2261
2262         /* Anything else */
2263         if (InputsRead) {
2264                 WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
2265                 /* force read from hwreg 0 for now */
2266                 for (i = 0; i < 32; i++)
2267                         if (InputsRead & (1 << i))
2268                                 cs->inputs[i].reg = 0;
2269         }
2270
2271         /* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
2272          * That way, we can free up the reg when it's no longer needed
2273          */
2274         if (!mp->Base.Instructions) {
2275                 ERROR("No instructions found in program\n");
2276                 return;
2277         }
2278
2279         for (fpi = mp->Base.Instructions; fpi->Opcode != OPCODE_END; fpi++) {
2280                 int idx;
2281
2282                 for (i = 0; i < 3; i++) {
2283                         idx = fpi->SrcReg[i].Index;
2284                         switch (fpi->SrcReg[i].File) {
2285                         case PROGRAM_TEMPORARY:
2286                                 if (!(temps_used & (1 << idx))) {
2287                                         cs->temps[idx].reg = -1;
2288                                         cs->temps[idx].refcount = 1;
2289                                         temps_used |= (1 << idx);
2290                                 } else
2291                                         cs->temps[idx].refcount++;
2292                                 break;
2293                         case PROGRAM_INPUT:
2294                                 cs->inputs[idx].refcount++;
2295                                 break;
2296                         default:
2297                                 break;
2298                         }
2299                 }
2300
2301                 idx = fpi->DstReg.Index;
2302                 if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
2303                         if (!(temps_used & (1 << idx))) {
2304                                 cs->temps[idx].reg = -1;
2305                                 cs->temps[idx].refcount = 1;
2306                                 temps_used |= (1 << idx);
2307                         } else
2308                                 cs->temps[idx].refcount++;
2309                 }
2310         }
2311         cs->temp_in_use = temps_used;
2312 }
2313
2314 static void update_params(r300ContextPtr r300, struct r300_fragment_program *fp)
2315 {
2316         struct gl_fragment_program *mp = &fp->mesa_program;
2317
2318         /* Ask Mesa nicely to fill in ParameterValues for us */
2319         if (mp->Base.Parameters)
2320                 _mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
2321 }
2322
2323 void r300TranslateFragmentShader(r300ContextPtr r300,
2324                                  struct r300_fragment_program *fp)
2325 {
2326         if (!fp->translated) {
2327                 struct r300_pfs_compile_state cs;
2328
2329                 _mesa_memset(&cs, 0, sizeof(cs));
2330                 cs.r300 = r300;
2331                 cs.fp = fp;
2332                 init_program(&cs);
2333
2334                 if (parse_program(&cs) == GL_FALSE) {
2335                         dump_program(fp, &fp->code);
2336                         return;
2337                 }
2338
2339                 /* Finish off */
2340                 fp->code.node[fp->code.cur_node].alu_end =
2341                     cs.nrslots - fp->code.node[fp->code.cur_node].alu_offset - 1;
2342                 if (fp->code.node[fp->code.cur_node].tex_end < 0)
2343                         fp->code.node[fp->code.cur_node].tex_end = 0;
2344                 fp->code.alu_offset = 0;
2345                 fp->code.alu_end = cs.nrslots - 1;
2346                 fp->code.tex_offset = 0;
2347                 fp->code.tex_end = fp->code.tex.length ? fp->code.tex.length - 1 : 0;
2348                 assert(fp->code.node[fp->code.cur_node].alu_end >= 0);
2349                 assert(fp->code.alu_end >= 0);
2350
2351                 fp->translated = GL_TRUE;
2352                 if (RADEON_DEBUG & DEBUG_PIXEL)
2353                         dump_program(fp, &fp->code);
2354                 r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
2355         }
2356
2357         update_params(r300, fp);
2358 }
2359
2360 /* just some random things... */
2361 static void dump_program(struct r300_fragment_program *fp,
2362                          struct r300_fragment_program_code *code)
2363 {
2364         int n, i, j;
2365         static int pc = 0;
2366
2367         fprintf(stderr, "pc=%d*************************************\n", pc++);
2368
2369         fprintf(stderr, "Mesa program:\n");
2370         fprintf(stderr, "-------------\n");
2371         _mesa_print_program(&fp->mesa_program.Base);
2372         fflush(stdout);
2373
2374         fprintf(stderr, "Hardware program\n");
2375         fprintf(stderr, "----------------\n");
2376
2377         for (n = 0; n < (code->cur_node + 1); n++) {
2378                 fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "
2379                         "alu_end: %d, tex_end: %d\n", n,
2380                         code->node[n].alu_offset,
2381                         code->node[n].tex_offset,
2382                         code->node[n].alu_end, code->node[n].tex_end);
2383
2384                 if (code->tex.length) {
2385                         fprintf(stderr, "  TEX:\n");
2386                         for (i = code->node[n].tex_offset;
2387                              i <= code->node[n].tex_offset + code->node[n].tex_end;
2388                              ++i) {
2389                                 const char *instr;
2390
2391                                 switch ((code->tex.
2392                                          inst[i] >> R300_TEX_INST_SHIFT) &
2393                                         15) {
2394                                 case R300_TEX_OP_LD:
2395                                         instr = "TEX";
2396                                         break;
2397                                 case R300_TEX_OP_KIL:
2398                                         instr = "KIL";
2399                                         break;
2400                                 case R300_TEX_OP_TXP:
2401                                         instr = "TXP";
2402                                         break;
2403                                 case R300_TEX_OP_TXB:
2404                                         instr = "TXB";
2405                                         break;
2406                                 default:
2407                                         instr = "UNKNOWN";
2408                                 }
2409
2410                                 fprintf(stderr,
2411                                         "    %s t%i, %c%i, texture[%i]   (%08x)\n",
2412                                         instr,
2413                                         (code->tex.
2414                                          inst[i] >> R300_DST_ADDR_SHIFT) & 31,
2415                                         't',
2416                                         (code->tex.
2417                                          inst[i] >> R300_SRC_ADDR_SHIFT) & 31,
2418                                         (code->tex.
2419                                          inst[i] & R300_TEX_ID_MASK) >>
2420                                         R300_TEX_ID_SHIFT,
2421                                         code->tex.inst[i]);
2422                         }
2423                 }
2424
2425                 for (i = code->node[n].alu_offset;
2426                      i <= code->node[n].alu_offset + code->node[n].alu_end; ++i) {
2427                         char srcc[3][10], dstc[20];
2428                         char srca[3][10], dsta[20];
2429                         char argc[3][20];
2430                         char arga[3][20];
2431                         char flags[5], tmp[10];
2432
2433                         for (j = 0; j < 3; ++j) {
2434                                 int regc = code->alu.inst[i].inst1 >> (j * 6);
2435                                 int rega = code->alu.inst[i].inst3 >> (j * 6);
2436
2437                                 sprintf(srcc[j], "%c%i",
2438                                         (regc & 32) ? 'c' : 't', regc & 31);
2439                                 sprintf(srca[j], "%c%i",
2440                                         (rega & 32) ? 'c' : 't', rega & 31);
2441                         }
2442
2443                         dstc[0] = 0;
2444                         sprintf(flags, "%s%s%s",
2445                                 (code->alu.inst[i].
2446                                  inst1 & R300_ALU_DSTC_REG_X) ? "x" : "",
2447                                 (code->alu.inst[i].
2448                                  inst1 & R300_ALU_DSTC_REG_Y) ? "y" : "",
2449                                 (code->alu.inst[i].
2450                                  inst1 & R300_ALU_DSTC_REG_Z) ? "z" : "");
2451                         if (flags[0] != 0) {
2452                                 sprintf(dstc, "t%i.%s ",
2453                                         (code->alu.inst[i].
2454                                          inst1 >> R300_ALU_DSTC_SHIFT) & 31,
2455                                         flags);
2456                         }
2457                         sprintf(flags, "%s%s%s",
2458                                 (code->alu.inst[i].
2459                                  inst1 & R300_ALU_DSTC_OUTPUT_X) ? "x" : "",
2460                                 (code->alu.inst[i].
2461                                  inst1 & R300_ALU_DSTC_OUTPUT_Y) ? "y" : "",
2462                                 (code->alu.inst[i].
2463                                  inst1 & R300_ALU_DSTC_OUTPUT_Z) ? "z" : "");
2464                         if (flags[0] != 0) {
2465                                 sprintf(tmp, "o%i.%s",
2466                                         (code->alu.inst[i].
2467                                          inst1 >> R300_ALU_DSTC_SHIFT) & 31,
2468                                         flags);
2469                                 strcat(dstc, tmp);
2470                         }
2471
2472                         dsta[0] = 0;
2473                         if (code->alu.inst[i].inst3 & R300_ALU_DSTA_REG) {
2474                                 sprintf(dsta, "t%i.w ",
2475                                         (code->alu.inst[i].
2476                                          inst3 >> R300_ALU_DSTA_SHIFT) & 31);
2477                         }
2478                         if (code->alu.inst[i].inst3 & R300_ALU_DSTA_OUTPUT) {
2479                                 sprintf(tmp, "o%i.w ",
2480                                         (code->alu.inst[i].
2481                                          inst3 >> R300_ALU_DSTA_SHIFT) & 31);
2482                                 strcat(dsta, tmp);
2483                         }
2484                         if (code->alu.inst[i].inst3 & R300_ALU_DSTA_DEPTH) {
2485                                 strcat(dsta, "Z");
2486                         }
2487
2488                         fprintf(stderr,
2489                                 "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
2490                                 "       w: %3s %3s %3s -> %-20s (%08x)\n", i,
2491                                 srcc[0], srcc[1], srcc[2], dstc,
2492                                 code->alu.inst[i].inst1, srca[0], srca[1],
2493                                 srca[2], dsta, code->alu.inst[i].inst3);
2494
2495                         for (j = 0; j < 3; ++j) {
2496                                 int regc = code->alu.inst[i].inst0 >> (j * 7);
2497                                 int rega = code->alu.inst[i].inst2 >> (j * 7);
2498                                 int d;
2499                                 char buf[20];
2500
2501                                 d = regc & 31;
2502                                 if (d < 12) {
2503                                         switch (d % 4) {
2504                                         case R300_ALU_ARGC_SRC0C_XYZ:
2505                                                 sprintf(buf, "%s.xyz",
2506                                                         srcc[d / 4]);
2507                                                 break;
2508                                         case R300_ALU_ARGC_SRC0C_XXX:
2509                                                 sprintf(buf, "%s.xxx",
2510                                                         srcc[d / 4]);
2511                                                 break;
2512                                         case R300_ALU_ARGC_SRC0C_YYY:
2513                                                 sprintf(buf, "%s.yyy",
2514                                                         srcc[d / 4]);
2515                                                 break;
2516                                         case R300_ALU_ARGC_SRC0C_ZZZ:
2517                                                 sprintf(buf, "%s.zzz",
2518                                                         srcc[d / 4]);
2519                                                 break;
2520                                         }
2521                                 } else if (d < 15) {
2522                                         sprintf(buf, "%s.www", srca[d - 12]);
2523                                 } else if (d == 20) {
2524                                         sprintf(buf, "0.0");
2525                                 } else if (d == 21) {
2526                                         sprintf(buf, "1.0");
2527                                 } else if (d == 22) {
2528                                         sprintf(buf, "0.5");
2529                                 } else if (d >= 23 && d < 32) {
2530                                         d -= 23;
2531                                         switch (d / 3) {
2532                                         case 0:
2533                                                 sprintf(buf, "%s.yzx",
2534                                                         srcc[d % 3]);
2535                                                 break;
2536                                         case 1:
2537                                                 sprintf(buf, "%s.zxy",
2538                                                         srcc[d % 3]);
2539                                                 break;
2540                                         case 2:
2541                                                 sprintf(buf, "%s.Wzy",
2542                                                         srcc[d % 3]);
2543                                                 break;
2544                                         }
2545                                 } else {
2546                                         sprintf(buf, "%i", d);
2547                                 }
2548
2549                                 sprintf(argc[j], "%s%s%s%s",
2550                                         (regc & 32) ? "-" : "",
2551                                         (regc & 64) ? "|" : "",
2552                                         buf, (regc & 64) ? "|" : "");
2553
2554                                 d = rega & 31;
2555                                 if (d < 9) {
2556                                         sprintf(buf, "%s.%c", srcc[d / 3],
2557                                                 'x' + (char)(d % 3));
2558                                 } else if (d < 12) {
2559                                         sprintf(buf, "%s.w", srca[d - 9]);
2560                                 } else if (d == 16) {
2561                                         sprintf(buf, "0.0");
2562                                 } else if (d == 17) {
2563                                         sprintf(buf, "1.0");
2564                                 } else if (d == 18) {
2565                                         sprintf(buf, "0.5");
2566                                 } else {
2567                                         sprintf(buf, "%i", d);
2568                                 }
2569
2570                                 sprintf(arga[j], "%s%s%s%s",
2571                                         (rega & 32) ? "-" : "",
2572                                         (rega & 64) ? "|" : "",
2573                                         buf, (rega & 64) ? "|" : "");
2574                         }
2575
2576                         fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
2577                                 "       w: %8s %8s %8s    op: %08x\n",
2578                                 argc[0], argc[1], argc[2],
2579                                 code->alu.inst[i].inst0, arga[0], arga[1],
2580                                 arga[2], code->alu.inst[i].inst2);
2581                 }
2582         }
2583 }