r300: Add radeonTransformALU and fix a bug in r300_fragprog DPH
[mesa.git] / src / mesa / drivers / dri / r300 / r300_fragprog_emit.c
1 /*
2 * Copyright (C) 2005 Ben Skeggs.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28 /**
29 * \file
30 *
31 * Emit the r300_fragment_program_code that can be understood by the hardware.
32 * Input is a pre-transformed radeon_program.
33 *
34 * \author Ben Skeggs <darktama@iinet.net.au>
35 *
36 * \author Jerome Glisse <j.glisse@gmail.com>
37 *
38 * \todo FogOption
39 *
40 * \todo Verify results of opcodes for accuracy; I've only checked them in
41 * specific cases.
42 */
43
44 #include "glheader.h"
45 #include "macros.h"
46 #include "enums.h"
47 #include "shader/prog_instruction.h"
48 #include "shader/prog_parameter.h"
49 #include "shader/prog_print.h"
50
51 #include "r300_context.h"
52 #include "r300_fragprog.h"
53 #include "r300_reg.h"
54 #include "r300_state.h"
55
56 /* Mapping Mesa registers to R300 temporaries */
57 struct reg_acc {
58 int reg; /* Assigned hw temp */
59 unsigned int refcount; /* Number of uses by mesa program */
60 };
61
62 /**
63 * Describe the current lifetime information for an R300 temporary
64 */
65 struct reg_lifetime {
66 /* Index of the first slot where this register is free in the sense
67 that it can be used as a new destination register.
68 This is -1 if the register has been assigned to a Mesa register
69 and the last access to the register has not yet been emitted */
70 int free;
71
72 /* Index of the first slot where this register is currently reserved.
73 This is used to stop e.g. a scalar operation from being moved
74 before the allocation time of a register that was first allocated
75 for a vector operation. */
76 int reserved;
77
78 /* Index of the first slot in which the register can be used as a
79 source without losing the value that is written by the last
80 emitted instruction that writes to the register */
81 int vector_valid;
82 int scalar_valid;
83
84 /* Index to the slot where the register was last read.
85 This is also the first slot in which the register may be written again */
86 int vector_lastread;
87 int scalar_lastread;
88 };
89
90 /**
91 * Store usage information about an ALU instruction slot during the
92 * compilation of a fragment program.
93 */
94 #define SLOT_SRC_VECTOR (1<<0)
95 #define SLOT_SRC_SCALAR (1<<3)
96 #define SLOT_SRC_BOTH (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
97 #define SLOT_OP_VECTOR (1<<16)
98 #define SLOT_OP_SCALAR (1<<17)
99 #define SLOT_OP_BOTH (SLOT_OP_VECTOR | SLOT_OP_SCALAR)
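/* The SRC flags are per-source and are shifted left by the source index
 * when a slot is filled (see find_and_prepare_slot), so bits 0..2 track
 * vector reads of src0..src2 and bits 3..5 track the scalar reads.
 * The OP flags record whether the vector (RGB) and/or scalar (alpha)
 * half of the slot already carries an opcode.  For example, a source
 * in position 1 that feeds only the scalar unit contributes
 * (SLOT_SRC_SCALAR << 1) to the slot's 'used' mask.
 */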
100
101 struct r300_pfs_compile_slot {
102 /* Bitmask indicating which parts of the slot are used, using SLOT_ constants
103 defined above */
104 unsigned int used;
105
106 /* Selected sources */
107 int vsrc[3];
108 int ssrc[3];
109 };
110
111 /**
112 * Store information during compilation of fragment programs.
113 */
114 struct r300_pfs_compile_state {
115 struct r300_fragment_program_compiler *compiler;
116
117 int nrslots; /* number of ALU slots used so far */
118
119 /* Track which (parts of) slots are already filled with instructions */
120 struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST];
121
122 /* Track the validity of R300 temporaries */
123 struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
124
125 /* Used to map Mesa's inputs/temps onto hardware temps */
126 int temp_in_use;
127 struct reg_acc temps[PFS_NUM_TEMP_REGS];
128 struct reg_acc inputs[32]; /* don't actually need 32... */
129
130 /* Track usage of hardware temps, for register allocation,
131 * indirection detection, etc. */
132 GLuint used_in_node;
133 GLuint dest_in_node;
134 };
135
136
137 /*
138 * Useful macros and values
139 */
140 #define ERROR(fmt, args...) do { \
141 fprintf(stderr, "%s::%s(): " fmt "\n", \
142 __FILE__, __FUNCTION__, ##args); \
143 fp->error = GL_TRUE; \
144 } while(0)
145
146 #define PFS_INVAL 0xFFFFFFFF
147 #define COMPILE_STATE \
148 struct r300_fragment_program *fp = cs->compiler->fp; \
149 struct r300_fragment_program_code *code = cs->compiler->code; \
150 (void)code; (void)fp
151
152 #define SWIZZLE_XYZ 0
153 #define SWIZZLE_XXX 1
154 #define SWIZZLE_YYY 2
155 #define SWIZZLE_ZZZ 3
156 #define SWIZZLE_WWW 4
157 #define SWIZZLE_YZX 5
158 #define SWIZZLE_ZXY 6
159 #define SWIZZLE_WZY 7
160 #define SWIZZLE_111 8
161 #define SWIZZLE_000 9
162 #define SWIZZLE_HHH 10
163
164 #define swizzle(r, x, y, z, w) do_swizzle(cs, r, \
165 ((SWIZZLE_##x<<0)| \
166 (SWIZZLE_##y<<3)| \
167 (SWIZZLE_##z<<6)| \
168 (SWIZZLE_##w<<9)), \
169 0)
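/* Convenience wrapper around do_swizzle(): the token-pasted SWIZZLE_##x
 * arguments expand to Mesa's component indices (SWIZZLE_X..SWIZZLE_W),
 * so the second argument is an ARB-style packed swizzle with 3 bits per
 * component and no negation.  For example, swizzle(temp, W, W, W, W)
 * replicates the scalar (W) component across all four channels.
 */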
170
171 #define REG_TYPE_INPUT 0
172 #define REG_TYPE_OUTPUT 1
173 #define REG_TYPE_TEMP 2
174 #define REG_TYPE_CONST 3
175
176 #define REG_TYPE_SHIFT 0
177 #define REG_INDEX_SHIFT 2
178 #define REG_VSWZ_SHIFT 8
179 #define REG_SSWZ_SHIFT 13
180 #define REG_NEGV_SHIFT 18
181 #define REG_NEGS_SHIFT 19
182 #define REG_ABS_SHIFT 20
183 #define REG_NO_USE_SHIFT 21 // Hack for refcounting
184 #define REG_VALID_SHIFT 22 // Does the register contain a defined value?
185 #define REG_BUILTIN_SHIFT 23 // Is it a builtin (like all zero/all one)?
186
187 #define REG_TYPE_MASK (0x03 << REG_TYPE_SHIFT)
188 #define REG_INDEX_MASK (0x3F << REG_INDEX_SHIFT)
189 #define REG_VSWZ_MASK (0x1F << REG_VSWZ_SHIFT)
190 #define REG_SSWZ_MASK (0x1F << REG_SSWZ_SHIFT)
191 #define REG_NEGV_MASK (0x01 << REG_NEGV_SHIFT)
192 #define REG_NEGS_MASK (0x01 << REG_NEGS_SHIFT)
193 #define REG_ABS_MASK (0x01 << REG_ABS_SHIFT)
194 #define REG_NO_USE_MASK (0x01 << REG_NO_USE_SHIFT)
195 #define REG_VALID_MASK (0x01 << REG_VALID_SHIFT)
196 #define REG_BUILTIN_MASK (0x01 << REG_BUILTIN_SHIFT)
197
198 #define REG(type, index, vswz, sswz, nouse, valid, builtin) \
199 (((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) | \
200 ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) | \
201 ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) | \
202 ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) | \
203 ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) | \
204 ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) | \
205 ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
206 #define REG_GET_TYPE(reg) \
207 ((reg & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
208 #define REG_GET_INDEX(reg) \
209 ((reg & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
210 #define REG_GET_VSWZ(reg) \
211 ((reg & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
212 #define REG_GET_SSWZ(reg) \
213 ((reg & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
214 #define REG_GET_NO_USE(reg) \
215 ((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
216 #define REG_GET_VALID(reg) \
217 ((reg & REG_VALID_MASK) >> REG_VALID_SHIFT)
218 #define REG_GET_BUILTIN(reg) \
219 ((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
220 #define REG_SET_TYPE(reg, type) \
221 reg = ((reg & ~REG_TYPE_MASK) | \
222 ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK))
223 #define REG_SET_INDEX(reg, index) \
224 reg = ((reg & ~REG_INDEX_MASK) | \
225 ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK))
226 #define REG_SET_VSWZ(reg, vswz) \
227 reg = ((reg & ~REG_VSWZ_MASK) | \
228 ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK))
229 #define REG_SET_SSWZ(reg, sswz) \
230 reg = ((reg & ~REG_SSWZ_MASK) | \
231 ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
232 #define REG_SET_NO_USE(reg, nouse) \
233 reg = ((reg & ~REG_NO_USE_MASK) | \
234 ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK))
235 #define REG_SET_VALID(reg, valid) \
236 reg = ((reg & ~REG_VALID_MASK) | \
237 ((valid << REG_VALID_SHIFT) & REG_VALID_MASK))
238 #define REG_SET_BUILTIN(reg, builtin) \
239 reg = ((reg & ~REG_BUILTIN_MASK) | \
240 ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
241 #define REG_ABS(reg) \
242 reg = (reg | REG_ABS_MASK)
243 #define REG_NEGV(reg) \
244 reg = (reg | REG_NEGV_MASK)
245 #define REG_NEGS(reg) \
246 reg = (reg | REG_NEGS_MASK)
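/* Illustrative example of the packing above (the accessors simply undo
 * REG()):
 *
 *   GLuint r = REG(REG_TYPE_TEMP, 5, SWIZZLE_XYZ, SWIZZLE_W,
 *                  GL_FALSE, GL_TRUE, GL_FALSE);
 *   REG_GET_TYPE(r)  == REG_TYPE_TEMP
 *   REG_GET_INDEX(r) == 5
 *   REG_GET_VALID(r) == 1, REG_GET_BUILTIN(r) == 0
 *
 * REG_VSWZ indexes the v_swiz[] table below, while REG_SSWZ holds a
 * Mesa SWIZZLE_* component index used with s_swiz[].
 */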
247
248 #define NOP_INST0 ( \
249 (R300_ALU_OUTC_MAD) | \
250 (R300_ALU_ARGC_ZERO << R300_ALU_ARG0C_SHIFT) | \
251 (R300_ALU_ARGC_ZERO << R300_ALU_ARG1C_SHIFT) | \
252 (R300_ALU_ARGC_ZERO << R300_ALU_ARG2C_SHIFT))
253 #define NOP_INST1 ( \
254 ((0 | SRC_CONST) << R300_ALU_SRC0C_SHIFT) | \
255 ((0 | SRC_CONST) << R300_ALU_SRC1C_SHIFT) | \
256 ((0 | SRC_CONST) << R300_ALU_SRC2C_SHIFT))
257 #define NOP_INST2 ( \
258 (R300_ALU_OUTA_MAD) | \
259 (R300_ALU_ARGA_ZERO << R300_ALU_ARG0A_SHIFT) | \
260 (R300_ALU_ARGA_ZERO << R300_ALU_ARG1A_SHIFT) | \
261 (R300_ALU_ARGA_ZERO << R300_ALU_ARG2A_SHIFT))
262 #define NOP_INST3 ( \
263 ((0 | SRC_CONST) << R300_ALU_SRC0A_SHIFT) | \
264 ((0 | SRC_CONST) << R300_ALU_SRC1A_SHIFT) | \
265 ((0 | SRC_CONST) << R300_ALU_SRC2A_SHIFT))
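/* A NOP slot is a MAD of all-zero arguments reading constant 0 on both
 * the vector and scalar side; since no destination write-mask bits are
 * set in inst1/inst3, it writes nothing.  emit_nop() and
 * find_and_prepare_slot() use these values to pad the ALU stream.
 */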
266
267
268 /*
269 * Data structures for fragment program generation
270 */
271
272 /* description of r300 native hw instructions */
273 static const struct {
274 const char *name;
275 int argc;
276 int v_op;
277 int s_op;
278 } r300_fpop[] = {
279 /* *INDENT-OFF* */
280 {"MAD", 3, R300_ALU_OUTC_MAD, R300_ALU_OUTA_MAD},
281 {"DP3", 2, R300_ALU_OUTC_DP3, R300_ALU_OUTA_DP4},
282 {"DP4", 2, R300_ALU_OUTC_DP4, R300_ALU_OUTA_DP4},
283 {"MIN", 2, R300_ALU_OUTC_MIN, R300_ALU_OUTA_MIN},
284 {"MAX", 2, R300_ALU_OUTC_MAX, R300_ALU_OUTA_MAX},
285 {"CMP", 3, R300_ALU_OUTC_CMP, R300_ALU_OUTA_CMP},
286 {"FRC", 1, R300_ALU_OUTC_FRC, R300_ALU_OUTA_FRC},
287 {"EX2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_EX2},
288 {"LG2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_LG2},
289 {"RCP", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RCP},
290 {"RSQ", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RSQ},
291 {"REPL_ALPHA", 1, R300_ALU_OUTC_REPL_ALPHA, PFS_INVAL},
292 {"CMPH", 3, R300_ALU_OUTC_CMPH, PFS_INVAL},
293 /* *INDENT-ON* */
294 };
295
296 /* vector swizzles r300 can support natively, with a couple of
297 * cases we handle specially
298 *
299 * REG_VSWZ/REG_SSWZ is an index into this table
300 */
301
302 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
303 #define SWIZZLE_HALF 6
304
305 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
306 SWIZZLE_##y, \
307 SWIZZLE_##z, \
308 SWIZZLE_ZERO))
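/* MAKE_SWZ3 packs three Mesa component selects (3 bits each, with W
 * forced to SWIZZLE_ZERO) in the same layout as a Mesa source swizzle,
 * so the hashes below can be compared directly against the (masked)
 * ARB swizzle in do_swizzle().  For example, MAKE_SWZ3(Y, Z, X) ==
 * (SWIZZLE_Y << 0) | (SWIZZLE_Z << 3) | (SWIZZLE_X << 6) |
 * (SWIZZLE_ZERO << 9).
 */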
309 /* native swizzles */
310 static const struct r300_pfs_swizzle {
311 GLuint hash; /* swizzle value this matches */
312 GLuint base; /* base value for hw swizzle */
313 GLuint stride; /* difference in base between arg0/1/2 */
314 GLuint flags;
315 } v_swiz[] = {
316 /* *INDENT-OFF* */
317 {MAKE_SWZ3(X, Y, Z), R300_ALU_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
318 {MAKE_SWZ3(X, X, X), R300_ALU_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
319 {MAKE_SWZ3(Y, Y, Y), R300_ALU_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
320 {MAKE_SWZ3(Z, Z, Z), R300_ALU_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
321 {MAKE_SWZ3(W, W, W), R300_ALU_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
322 {MAKE_SWZ3(Y, Z, X), R300_ALU_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
323 {MAKE_SWZ3(Z, X, Y), R300_ALU_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
324 {MAKE_SWZ3(W, Z, Y), R300_ALU_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
325 {MAKE_SWZ3(ONE, ONE, ONE), R300_ALU_ARGC_ONE, 0, 0},
326 {MAKE_SWZ3(ZERO, ZERO, ZERO), R300_ALU_ARGC_ZERO, 0, 0},
327 {MAKE_SWZ3(HALF, HALF, HALF), R300_ALU_ARGC_HALF, 0, 0},
328 {PFS_INVAL, 0, 0, 0},
329 /* *INDENT-ON* */
330 };
331
332 /* used during matching of non-native swizzles */
333 #define SWZ_X_MASK (7 << 0)
334 #define SWZ_Y_MASK (7 << 3)
335 #define SWZ_Z_MASK (7 << 6)
336 #define SWZ_W_MASK (7 << 9)
337 static const struct {
338 GLuint hash; /* used to mask matching swizzle components */
339 int mask; /* actual outmask */
340 int count; /* count of components matched */
341 } s_mask[] = {
342 /* *INDENT-OFF* */
343 {SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},
344 {SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
345 {SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
346 {SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
347 {SWZ_X_MASK, 1, 1},
348 {SWZ_Y_MASK, 2, 1},
349 {SWZ_Z_MASK, 4, 1},
350 {PFS_INVAL, PFS_INVAL, PFS_INVAL}
351 /* *INDENT-ON* */
352 };
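/* do_swizzle() walks s_mask[] from the full XYZ match downwards: each
 * entry masks off the components it covers, and the matched components
 * are emitted (natively, or via swz_emit_partial for partial matches)
 * until all three vector components of the requested swizzle have been
 * produced.
 */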
353
354 static const struct {
355 int base; /* hw value of swizzle */
356 int stride; /* difference between SRC0/1/2 */
357 GLuint flags;
358 } s_swiz[] = {
359 /* *INDENT-OFF* */
360 {R300_ALU_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
361 {R300_ALU_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
362 {R300_ALU_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
363 {R300_ALU_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
364 {R300_ALU_ARGA_ZERO, 0, 0},
365 {R300_ALU_ARGA_ONE, 0, 0},
366 {R300_ALU_ARGA_HALF, 0, 0}
367 /* *INDENT-ON* */
368 };
369
370 /* boiler-plate reg, for convenience */
371 static const GLuint undef = REG(REG_TYPE_TEMP,
372 0,
373 SWIZZLE_XYZ,
374 SWIZZLE_W,
375 GL_FALSE,
376 GL_FALSE,
377 GL_FALSE);
378
379 /* constant one source */
380 static const GLuint pfs_one = REG(REG_TYPE_CONST,
381 0,
382 SWIZZLE_111,
383 SWIZZLE_ONE,
384 GL_FALSE,
385 GL_TRUE,
386 GL_TRUE);
387
388 /* constant half source */
389 static const GLuint pfs_half = REG(REG_TYPE_CONST,
390 0,
391 SWIZZLE_HHH,
392 SWIZZLE_HALF,
393 GL_FALSE,
394 GL_TRUE,
395 GL_TRUE);
396
397 /* constant zero source */
398 static const GLuint pfs_zero = REG(REG_TYPE_CONST,
399 0,
400 SWIZZLE_000,
401 SWIZZLE_ZERO,
402 GL_FALSE,
403 GL_TRUE,
404 GL_TRUE);
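/* These builtin sources (REG_BUILTIN set) never occupy one of the three
 * per-slot source selects: find_and_prepare_slot() skips them when
 * computing the slot's 'used' mask, and their values come from the
 * ONE/ZERO/HALF argument selects rather than from a fetched register.
 */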
405
406 /*
407 * Common functions prototypes
408 */
409 static void emit_arith(struct r300_pfs_compile_state *cs, int op,
410 GLuint dest, int mask,
411 GLuint src0, GLuint src1, GLuint src2, int flags);
412
413 /**
414 * Get an R300 temporary that can be written to in the given slot.
415 */
416 static int get_hw_temp(struct r300_pfs_compile_state *cs, int slot)
417 {
418 COMPILE_STATE;
419 int r;
420
421 for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
422 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
423 break;
424 }
425
426 if (r >= PFS_NUM_TEMP_REGS) {
427 ERROR("Out of hardware temps\n");
428 return 0;
429 }
430 // Reserved is used to avoid the following scenario:
431 // R300 temporary X is first assigned to Mesa temporary Y during vector ops
432 // R300 temporary X is then assigned to Mesa temporary Z for further vector ops
433 // Then scalar ops on Mesa temporary Z are emitted and move back in time
434 // to overwrite the value of temporary Y.
435 // End scenario.
436 cs->hwtemps[r].reserved = cs->hwtemps[r].free;
437 cs->hwtemps[r].free = -1;
438
439 // Reset to some value that won't mess things up when the user
440 // tries to read from a temporary that hasn't been assigned a value yet.
441 // In the normal case, vector_valid and scalar_valid should be set to
442 // a sane value by the first emit that writes to this temporary.
443 cs->hwtemps[r].vector_valid = 0;
444 cs->hwtemps[r].scalar_valid = 0;
445
446 if (r > code->max_temp_idx)
447 code->max_temp_idx = r;
448
449 return r;
450 }
451
452 /**
453 * Get an R300 temporary that will act as a TEX destination register.
454 */
455 static int get_hw_temp_tex(struct r300_pfs_compile_state *cs)
456 {
457 COMPILE_STATE;
458 int r;
459
460 for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
461 if (cs->used_in_node & (1 << r))
462 continue;
463
464 // Note: Be very careful here
465 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
466 break;
467 }
468
469 if (r >= PFS_NUM_TEMP_REGS)
470 return get_hw_temp(cs, 0); /* Will cause an indirection */
471
472 cs->hwtemps[r].reserved = cs->hwtemps[r].free;
473 cs->hwtemps[r].free = -1;
474
475 // Reset to some value that won't mess things up when the user
476 // tries to read from a temporary that hasn't been assigned a value yet.
477 // In the normal case, vector_valid and scalar_valid should be set to
478 // a sane value by the first emit that writes to this temporary.
479 cs->hwtemps[r].vector_valid = cs->nrslots;
480 cs->hwtemps[r].scalar_valid = cs->nrslots;
481
482 if (r > code->max_temp_idx)
483 code->max_temp_idx = r;
484
485 return r;
486 }
487
488 /**
489 * Mark the given hardware register as free.
490 */
491 static void free_hw_temp(struct r300_pfs_compile_state *cs, int idx)
492 {
493 // Be very careful here. Consider sequences like
494 // MAD r0, r1,r2,r3
495 // TEX r4, ...
496 // The TEX instruction may be moved in front of the MAD instruction
497 // due to the way nodes work. We don't want to alias r1 and r4 in
498 // this case.
499 // I'm certain the register allocation could be further sanitized,
500 // but it's tricky because of stuff that can happen inside emit_tex
501 // and emit_arith.
502 cs->hwtemps[idx].free = cs->nrslots + 1;
503 }
504
505 /**
506 * Create a new Mesa temporary register.
507 */
508 static GLuint get_temp_reg(struct r300_pfs_compile_state *cs)
509 {
510 COMPILE_STATE;
511 GLuint r = undef;
512 GLuint index;
513
514 index = ffs(~cs->temp_in_use);
515 if (!index) {
516 ERROR("Out of program temps\n");
517 return r;
518 }
519
520 cs->temp_in_use |= (1 << --index);
521 cs->temps[index].refcount = 0xFFFFFFFF;
522 cs->temps[index].reg = -1;
523
524 REG_SET_TYPE(r, REG_TYPE_TEMP);
525 REG_SET_INDEX(r, index);
526 REG_SET_VALID(r, GL_TRUE);
527 return r;
528 }
529
530 /**
531 * Free a Mesa temporary and the associated R300 temporary.
532 */
533 static void free_temp(struct r300_pfs_compile_state *cs, GLuint r)
534 {
535 GLuint index = REG_GET_INDEX(r);
536
537 if (!(cs->temp_in_use & (1 << index)))
538 return;
539
540 if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
541 free_hw_temp(cs, cs->temps[index].reg);
542 cs->temps[index].reg = -1;
543 cs->temp_in_use &= ~(1 << index);
544 } else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
545 free_hw_temp(cs, cs->inputs[index].reg);
546 cs->inputs[index].reg = -1;
547 }
548 }
549
550 /**
551 * Emit a hardware constant/parameter.
552 *
553 * \p cp Stable pointer to an array of 4 floats.
554 * The pointer must be stable in the sense that it remains valid
555 * and holds the contents of the constant/parameter throughout the lifetime
556 * of the fragment program (actually, up until the next time the fragment
557 * program is translated).
558 */
559 static GLuint emit_const4fv(struct r300_pfs_compile_state *cs,
560 const GLfloat * cp)
561 {
562 COMPILE_STATE;
563 GLuint reg = undef;
564 int index;
565
566 for (index = 0; index < code->const_nr; ++index) {
567 if (code->constant[index] == cp)
568 break;
569 }
570
571 if (index >= code->const_nr) {
572 if (index >= PFS_NUM_CONST_REGS) {
573 ERROR("Out of hw constants!\n");
574 return reg;
575 }
576
577 code->const_nr++;
578 code->constant[index] = cp;
579 }
580
581 REG_SET_TYPE(reg, REG_TYPE_CONST);
582 REG_SET_INDEX(reg, index);
583 REG_SET_VALID(reg, GL_TRUE);
584 return reg;
585 }
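/* Typical callers (see t_src below) pass pointers into Mesa's parameter
 * arrays, e.g. Base.Parameters->ParameterValues[index], so the stored
 * pointer keeps tracking the current parameter value; the floats are
 * presumably only dereferenced when the constants are uploaded to the
 * hardware.
 */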
586
587 static inline GLuint negate(GLuint r)
588 {
589 REG_NEGS(r);
590 REG_NEGV(r);
591 return r;
592 }
593
594 /* Hack, to prevent clobbering sources used multiple times when
595 * emulating non-native instructions
596 */
597 static inline GLuint keep(GLuint r)
598 {
599 REG_SET_NO_USE(r, GL_TRUE);
600 return r;
601 }
602
603 static inline GLuint absolute(GLuint r)
604 {
605 REG_ABS(r);
606 return r;
607 }
608
609 static int swz_native(struct r300_pfs_compile_state *cs,
610 GLuint src, GLuint * r, GLuint arbneg)
611 {
612 COMPILE_STATE;
613
614 /* Native swizzle, handle negation */
615 src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
616
617 if ((arbneg & 0x7) == 0x0) {
618 src = src & ~REG_NEGV_MASK;
619 *r = src;
620 } else if ((arbneg & 0x7) == 0x7) {
621 src |= REG_NEGV_MASK;
622 *r = src;
623 } else {
624 if (!REG_GET_VALID(*r))
625 *r = get_temp_reg(cs);
626 src |= REG_NEGV_MASK;
627 emit_arith(cs,
628 PFS_OP_MAD,
629 *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
630 src = src & ~REG_NEGV_MASK;
631 emit_arith(cs,
632 PFS_OP_MAD,
633 *r,
634 (arbneg ^ 0x7) | WRITEMASK_W,
635 src, pfs_one, pfs_zero, 0);
636 }
637
638 return 3;
639 }
640
641 static int swz_emit_partial(struct r300_pfs_compile_state *cs,
642 GLuint src,
643 GLuint * r, int mask, int mc, GLuint arbneg)
644 {
645 COMPILE_STATE;
646 GLuint tmp;
647 GLuint wmask = 0;
648
649 if (!REG_GET_VALID(*r))
650 *r = get_temp_reg(cs);
651
652 /* A partial match, VSWZ/mask define what parts of the
653 * desired swizzle we match
654 */
655 if (mc + s_mask[mask].count == 3) {
656 wmask = WRITEMASK_W;
657 src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
658 }
659
660 tmp = arbneg & s_mask[mask].mask;
661 if (tmp) {
662 tmp = tmp ^ s_mask[mask].mask;
663 if (tmp) {
664 emit_arith(cs,
665 PFS_OP_MAD,
666 *r,
667 arbneg & s_mask[mask].mask,
668 keep(src) | REG_NEGV_MASK,
669 pfs_one, pfs_zero, 0);
670 if (!wmask) {
671 REG_SET_NO_USE(src, GL_TRUE);
672 } else {
673 REG_SET_NO_USE(src, GL_FALSE);
674 }
675 emit_arith(cs,
676 PFS_OP_MAD,
677 *r, tmp | wmask, src, pfs_one, pfs_zero, 0);
678 } else {
679 if (!wmask) {
680 REG_SET_NO_USE(src, GL_TRUE);
681 } else {
682 REG_SET_NO_USE(src, GL_FALSE);
683 }
684 emit_arith(cs,
685 PFS_OP_MAD,
686 *r,
687 (arbneg & s_mask[mask].mask) | wmask,
688 src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
689 }
690 } else {
691 if (!wmask) {
692 REG_SET_NO_USE(src, GL_TRUE);
693 } else {
694 REG_SET_NO_USE(src, GL_FALSE);
695 }
696 emit_arith(cs, PFS_OP_MAD,
697 *r,
698 s_mask[mask].mask | wmask,
699 src, pfs_one, pfs_zero, 0);
700 }
701
702 return s_mask[mask].count;
703 }
704
705 static GLuint do_swizzle(struct r300_pfs_compile_state *cs,
706 GLuint src, GLuint arbswz, GLuint arbneg)
707 {
708 COMPILE_STATE;
709 GLuint r = undef;
710 GLuint vswz;
711 int c_mask = 0;
712 int v_match = 0;
713
714 /* If swizzling from something without an XYZW native swizzle,
715 * emit result to a temp, and do new swizzle from the temp.
716 */
717 #if 0
718 if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
719 GLuint temp = get_temp_reg(fp);
720 emit_arith(fp,
721 PFS_OP_MAD,
722 temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
723 src = temp;
724 }
725 #endif
726
727 if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
728 GLuint vsrcswz =
729 (v_swiz[REG_GET_VSWZ(src)].
730 hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
731 REG_GET_SSWZ(src) << 9;
732 GLint i;
733
734 GLuint newswz = 0;
735 GLuint offset;
736 for (i = 0; i < 4; ++i) {
737 offset = GET_SWZ(arbswz, i);
738
739 newswz |=
740 (offset <= 3) ? GET_SWZ(vsrcswz,
741 offset) << i *
742 3 : offset << i * 3;
743 }
744
745 arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
746 REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
747 } else {
748 /* set scalar swizzling */
749 REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
750
751 }
752 do {
753 vswz = REG_GET_VSWZ(src);
754 do {
755 int chash;
756
757 REG_SET_VSWZ(src, vswz);
758 chash = v_swiz[REG_GET_VSWZ(src)].hash &
759 s_mask[c_mask].hash;
760
761 if (chash == (arbswz & s_mask[c_mask].hash)) {
762 if (s_mask[c_mask].count == 3) {
763 v_match += swz_native(cs,
764 src, &r, arbneg);
765 } else {
766 v_match += swz_emit_partial(cs,
767 src,
768 &r,
769 c_mask,
770 v_match,
771 arbneg);
772 }
773
774 if (v_match == 3)
775 return r;
776
777 /* Fill with something invalid: all 0's was
778 * wrong before because it matched SWIZZLE_X,
779 * so all 1's will do for now
780 */
781 arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
782 }
783 } while (v_swiz[++vswz].hash != PFS_INVAL);
784 REG_SET_VSWZ(src, SWIZZLE_XYZ);
785 } while (s_mask[++c_mask].hash != PFS_INVAL);
786
787 ERROR("should NEVER get here\n");
788 return r;
789 }
790
791 static GLuint t_src(struct r300_pfs_compile_state *cs,
792 struct prog_src_register fpsrc)
793 {
794 COMPILE_STATE;
795 GLuint r = undef;
796
797 switch (fpsrc.File) {
798 case PROGRAM_TEMPORARY:
799 REG_SET_INDEX(r, fpsrc.Index);
800 REG_SET_VALID(r, GL_TRUE);
801 REG_SET_TYPE(r, REG_TYPE_TEMP);
802 break;
803 case PROGRAM_INPUT:
804 REG_SET_INDEX(r, fpsrc.Index);
805 REG_SET_VALID(r, GL_TRUE);
806 REG_SET_TYPE(r, REG_TYPE_INPUT);
807 break;
808 case PROGRAM_LOCAL_PARAM:
809 r = emit_const4fv(cs,
810 fp->mesa_program.Base.LocalParams[fpsrc.
811 Index]);
812 break;
813 case PROGRAM_ENV_PARAM:
814 r = emit_const4fv(cs,
815 cs->compiler->r300->radeon.glCtx->FragmentProgram.Parameters[fpsrc.Index]);
816 break;
817 case PROGRAM_STATE_VAR:
818 case PROGRAM_NAMED_PARAM:
819 case PROGRAM_CONSTANT:
820 r = emit_const4fv(cs,
821 fp->mesa_program.Base.Parameters->
822 ParameterValues[fpsrc.Index]);
823 break;
824 case PROGRAM_BUILTIN:
825 switch(fpsrc.Swizzle) {
826 case SWIZZLE_1111: r = pfs_one; break;
827 case SWIZZLE_0000: r = pfs_zero; break;
828 default:
829 ERROR("bad PROGRAM_BUILTIN swizzle %u\n", fpsrc.Swizzle);
830 break;
831 }
832 break;
833 default:
834 ERROR("unknown SrcReg->File %x\n", fpsrc.File);
835 return r;
836 }
837
838 /* no point swizzling ONE/ZERO/HALF constants... */
839 if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
840 r = do_swizzle(cs, r, fpsrc.Swizzle, fpsrc.NegateBase);
841 if (fpsrc.Abs)
842 r = absolute(r);
843 if (fpsrc.NegateAbs)
844 r = negate(r);
845 return r;
846 }
847
848 static GLuint t_scalar_src(struct r300_pfs_compile_state *cs,
849 struct prog_src_register fpsrc)
850 {
851 struct prog_src_register src = fpsrc;
852 int sc = GET_SWZ(fpsrc.Swizzle, 0); /* X */
853
854 src.Swizzle = ((sc << 0) | (sc << 3) | (sc << 6) | (sc << 9));
855
856 return t_src(cs, src);
857 }
858
859 static GLuint t_dst(struct r300_pfs_compile_state *cs,
860 struct prog_dst_register dest)
861 {
862 COMPILE_STATE;
863 GLuint r = undef;
864
865 switch (dest.File) {
866 case PROGRAM_TEMPORARY:
867 REG_SET_INDEX(r, dest.Index);
868 REG_SET_VALID(r, GL_TRUE);
869 REG_SET_TYPE(r, REG_TYPE_TEMP);
870 return r;
871 case PROGRAM_OUTPUT:
872 REG_SET_TYPE(r, REG_TYPE_OUTPUT);
873 switch (dest.Index) {
874 case FRAG_RESULT_COLR:
875 case FRAG_RESULT_DEPR:
876 REG_SET_INDEX(r, dest.Index);
877 REG_SET_VALID(r, GL_TRUE);
878 return r;
879 default:
880 ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
881 return r;
882 }
883 default:
884 ERROR("Bad DstReg->File 0x%x\n", dest.File);
885 return r;
886 }
887 }
888
889 static int t_hw_src(struct r300_pfs_compile_state *cs, GLuint src, GLboolean tex)
890 {
891 COMPILE_STATE;
892 int idx;
893 int index = REG_GET_INDEX(src);
894
895 switch (REG_GET_TYPE(src)) {
896 case REG_TYPE_TEMP:
897 /* NOTE: if reg==-1 here, a source is being read that
898 * hasn't been written to. Undefined results.
899 */
900 if (cs->temps[index].reg == -1)
901 cs->temps[index].reg = get_hw_temp(cs, cs->nrslots);
902
903 idx = cs->temps[index].reg;
904
905 if (!REG_GET_NO_USE(src) && (--cs->temps[index].refcount == 0))
906 free_temp(cs, src);
907 break;
908 case REG_TYPE_INPUT:
909 idx = cs->inputs[index].reg;
910
911 if (!REG_GET_NO_USE(src) && (--cs->inputs[index].refcount == 0))
912 free_hw_temp(cs, cs->inputs[index].reg);
913 break;
914 case REG_TYPE_CONST:
915 return (index | SRC_CONST);
916 default:
917 ERROR("Invalid type for source reg\n");
918 return (0 | SRC_CONST);
919 }
920
921 if (!tex)
922 cs->used_in_node |= (1 << idx);
923
924 return idx;
925 }
926
927 static int t_hw_dst(struct r300_pfs_compile_state *cs,
928 GLuint dest, GLboolean tex, int slot)
929 {
930 COMPILE_STATE;
931 int idx;
932 GLuint index = REG_GET_INDEX(dest);
933 assert(REG_GET_VALID(dest));
934
935 switch (REG_GET_TYPE(dest)) {
936 case REG_TYPE_TEMP:
937 if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
938 if (!tex) {
939 cs->temps[index].reg = get_hw_temp(cs, slot);
940 } else {
941 cs->temps[index].reg = get_hw_temp_tex(cs);
942 }
943 }
944 idx = cs->temps[index].reg;
945
946 if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0))
947 free_temp(cs, dest);
948
949 cs->dest_in_node |= (1 << idx);
950 cs->used_in_node |= (1 << idx);
951 break;
952 case REG_TYPE_OUTPUT:
953 switch (index) {
954 case FRAG_RESULT_COLR:
955 code->node[code->cur_node].flags |= R300_RGBA_OUT;
956 break;
957 case FRAG_RESULT_DEPR:
958 fp->WritesDepth = GL_TRUE;
959 code->node[code->cur_node].flags |= R300_W_OUT;
960 break;
961 }
962 return index;
963 break;
964 default:
965 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
966 return 0;
967 }
968
969 return idx;
970 }
971
972 static void emit_nop(struct r300_pfs_compile_state *cs)
973 {
974 COMPILE_STATE;
975
976 if (cs->nrslots >= PFS_MAX_ALU_INST) {
977 ERROR("Out of ALU instruction slots\n");
978 return;
979 }
980
981 code->alu.inst[cs->nrslots].inst0 = NOP_INST0;
982 code->alu.inst[cs->nrslots].inst1 = NOP_INST1;
983 code->alu.inst[cs->nrslots].inst2 = NOP_INST2;
984 code->alu.inst[cs->nrslots].inst3 = NOP_INST3;
985 cs->nrslots++;
986 }
987
988 static void emit_tex(struct r300_pfs_compile_state *cs,
989 struct prog_instruction *fpi, int opcode)
990 {
991 COMPILE_STATE;
992 GLuint coord = t_src(cs, fpi->SrcReg[0]);
993 GLuint dest = undef;
994 GLuint din, uin;
995 int unit = fpi->TexSrcUnit;
996 int hwsrc, hwdest;
997
998 /* Ensure correct node indirection */
999 uin = cs->used_in_node;
1000 din = cs->dest_in_node;
1001
1002 /* Resolve source/dest to hardware registers */
1003 hwsrc = t_hw_src(cs, coord, GL_TRUE);
1004
1005 if (opcode != R300_TEX_OP_KIL) {
1006 dest = t_dst(cs, fpi->DstReg);
1007
1008 hwdest =
1009 t_hw_dst(cs, dest, GL_TRUE,
1010 code->node[code->cur_node].alu_offset);
1011
1012 /* Use a temp that hasn't been used in this node, rather
1013 * than causing an indirection
1014 */
1015 if (uin & (1 << hwdest)) {
1016 free_hw_temp(cs, hwdest);
1017 hwdest = get_hw_temp_tex(cs);
1018 cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
1019 }
1020 } else {
1021 hwdest = 0;
1022 unit = 0;
1023 }
1024
1025 /* Indirection if source has been written in this node, or if the
1026 * dest has been read/written in this node
1027 */
1028 if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
1029 (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
1030
1031 /* Finish off current node */
1032 if (code->node[code->cur_node].alu_offset == cs->nrslots)
1033 emit_nop(cs);
1034
1035 code->node[code->cur_node].alu_end =
1036 cs->nrslots - code->node[code->cur_node].alu_offset - 1;
1037 assert(code->node[code->cur_node].alu_end >= 0);
1038
1039 if (++code->cur_node >= PFS_MAX_TEX_INDIRECT) {
1040 ERROR("too many levels of texture indirection\n");
1041 return;
1042 }
1043
1044 /* Start new node */
1045 code->node[code->cur_node].tex_offset = code->tex.length;
1046 code->node[code->cur_node].alu_offset = cs->nrslots;
1047 code->node[code->cur_node].tex_end = -1;
1048 code->node[code->cur_node].alu_end = -1;
1049 code->node[code->cur_node].flags = 0;
1050 cs->used_in_node = 0;
1051 cs->dest_in_node = 0;
1052 }
1053
1054 if (code->cur_node == 0)
1055 code->first_node_has_tex = 1;
1056
1057 code->tex.inst[code->tex.length++] = 0 | (hwsrc << R300_SRC_ADDR_SHIFT)
1058 | (hwdest << R300_DST_ADDR_SHIFT)
1059 | (unit << R300_TEX_ID_SHIFT)
1060 | (opcode << R300_TEX_INST_SHIFT);
1061
1062 cs->dest_in_node |= (1 << hwdest);
1063 if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
1064 cs->used_in_node |= (1 << hwsrc);
1065
1066 code->node[code->cur_node].tex_end++;
1067 }
1068
1069 /**
1070 * Returns the first slot where we could possibly allow writing to dest,
1071 * according to register allocation.
1072 */
1073 static int get_earliest_allowed_write(struct r300_pfs_compile_state *cs,
1074 GLuint dest, int mask)
1075 {
1076 COMPILE_STATE;
1077 int idx;
1078 int pos;
1079 GLuint index = REG_GET_INDEX(dest);
1080 assert(REG_GET_VALID(dest));
1081
1082 switch (REG_GET_TYPE(dest)) {
1083 case REG_TYPE_TEMP:
1084 if (cs->temps[index].reg == -1)
1085 return 0;
1086
1087 idx = cs->temps[index].reg;
1088 break;
1089 case REG_TYPE_OUTPUT:
1090 return 0;
1091 default:
1092 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
1093 return 0;
1094 }
1095
1096 pos = cs->hwtemps[idx].reserved;
1097 if (mask & WRITEMASK_XYZ) {
1098 if (pos < cs->hwtemps[idx].vector_lastread)
1099 pos = cs->hwtemps[idx].vector_lastread;
1100 }
1101 if (mask & WRITEMASK_W) {
1102 if (pos < cs->hwtemps[idx].scalar_lastread)
1103 pos = cs->hwtemps[idx].scalar_lastread;
1104 }
1105
1106 return pos;
1107 }
1108
1109 /**
1110 * Allocates a slot for an ALU instruction that can consist of
1111 * a vector part or a scalar part or both.
1112 *
1113 * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1114 * appropriate position (vector and/or scalar), and their positions are
1115 * recorded in the srcpos array.
1116 *
1117 * This function emits instruction code for the source fetch and the
1118 * argument selection. It does not emit instruction code for the
1119 * opcode or the destination selection.
1120 *
1121 * @return the index of the slot
1122 */
1123 static int find_and_prepare_slot(struct r300_pfs_compile_state *cs,
1124 GLboolean emit_vop,
1125 GLboolean emit_sop,
1126 int argc, GLuint * src, GLuint dest, int mask)
1127 {
1128 COMPILE_STATE;
1129 int hwsrc[3];
1130 int srcpos[3];
1131 unsigned int used;
1132 int tempused;
1133 int tempvsrc[3];
1134 int tempssrc[3];
1135 int pos;
1136 int regnr;
1137 int i, j;
1138
1139 // Determine instruction slots, whether sources are required on
1140 // vector or scalar side, and the smallest slot number where
1141 // all source registers are available
1142 used = 0;
1143 if (emit_vop)
1144 used |= SLOT_OP_VECTOR;
1145 if (emit_sop)
1146 used |= SLOT_OP_SCALAR;
1147
1148 pos = get_earliest_allowed_write(cs, dest, mask);
1149
1150 if (code->node[code->cur_node].alu_offset > pos)
1151 pos = code->node[code->cur_node].alu_offset;
1152 for (i = 0; i < argc; ++i) {
1153 if (!REG_GET_BUILTIN(src[i])) {
1154 if (emit_vop)
1155 used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
1156 if (emit_sop)
1157 used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
1158 }
1159
1160 hwsrc[i] = t_hw_src(cs, src[i], GL_FALSE); /* Note: sideeffects wrt refcounting! */
1161 regnr = hwsrc[i] & 31;
1162
1163 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1164 if (used & (SLOT_SRC_VECTOR << i)) {
1165 if (cs->hwtemps[regnr].vector_valid > pos)
1166 pos = cs->hwtemps[regnr].vector_valid;
1167 }
1168 if (used & (SLOT_SRC_SCALAR << i)) {
1169 if (cs->hwtemps[regnr].scalar_valid > pos)
1170 pos = cs->hwtemps[regnr].scalar_valid;
1171 }
1172 }
1173 }
1174
1175 // Find a slot that fits
1176 for (;; ++pos) {
1177 if (cs->slot[pos].used & used & SLOT_OP_BOTH)
1178 continue;
1179
1180 if (pos >= cs->nrslots) {
1181 if (cs->nrslots >= PFS_MAX_ALU_INST) {
1182 ERROR("Out of ALU instruction slots\n");
1183 return -1;
1184 }
1185
1186 code->alu.inst[pos].inst0 = NOP_INST0;
1187 code->alu.inst[pos].inst1 = NOP_INST1;
1188 code->alu.inst[pos].inst2 = NOP_INST2;
1189 code->alu.inst[pos].inst3 = NOP_INST3;
1190
1191 cs->nrslots++;
1192 }
1193 // Note: When we need both parts (vector and scalar) of a source,
1194 // we always try to put them into the same position. This makes the
1195 // code easier to read, and it is optimal (i.e. one doesn't gain
1196 // anything by splitting the parts).
1197 // It also avoids headaches with swizzles that access both parts (e.g. WXY)
1198 tempused = cs->slot[pos].used;
1199 for (i = 0; i < 3; ++i) {
1200 tempvsrc[i] = cs->slot[pos].vsrc[i];
1201 tempssrc[i] = cs->slot[pos].ssrc[i];
1202 }
1203
1204 for (i = 0; i < argc; ++i) {
1205 int flags = (used >> i) & SLOT_SRC_BOTH;
1206
1207 if (!flags) {
1208 srcpos[i] = 0;
1209 continue;
1210 }
1211
1212 for (j = 0; j < 3; ++j) {
1213 if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
1214 if (tempvsrc[j] != hwsrc[i])
1215 continue;
1216 }
1217
1218 if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
1219 if (tempssrc[j] != hwsrc[i])
1220 continue;
1221 }
1222
1223 break;
1224 }
1225
1226 if (j == 3)
1227 break;
1228
1229 srcpos[i] = j;
1230 tempused |= flags << j;
1231 if (flags & SLOT_SRC_VECTOR)
1232 tempvsrc[j] = hwsrc[i];
1233 if (flags & SLOT_SRC_SCALAR)
1234 tempssrc[j] = hwsrc[i];
1235 }
1236
1237 if (i == argc)
1238 break;
1239 }
1240
1241 // Found a slot, reserve it
1242 cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
1243 for (i = 0; i < 3; ++i) {
1244 cs->slot[pos].vsrc[i] = tempvsrc[i];
1245 cs->slot[pos].ssrc[i] = tempssrc[i];
1246 }
1247
1248 for (i = 0; i < argc; ++i) {
1249 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1250 int regnr = hwsrc[i] & 31;
1251
1252 if (used & (SLOT_SRC_VECTOR << i)) {
1253 if (cs->hwtemps[regnr].vector_lastread < pos)
1254 cs->hwtemps[regnr].vector_lastread =
1255 pos;
1256 }
1257 if (used & (SLOT_SRC_SCALAR << i)) {
1258 if (cs->hwtemps[regnr].scalar_lastread < pos)
1259 cs->hwtemps[regnr].scalar_lastread =
1260 pos;
1261 }
1262 }
1263 }
1264
1265 // Emit the source fetch code
1266 code->alu.inst[pos].inst1 &= ~R300_ALU_SRC_MASK;
1267 code->alu.inst[pos].inst1 |=
1268 ((cs->slot[pos].vsrc[0] << R300_ALU_SRC0C_SHIFT) |
1269 (cs->slot[pos].vsrc[1] << R300_ALU_SRC1C_SHIFT) |
1270 (cs->slot[pos].vsrc[2] << R300_ALU_SRC2C_SHIFT));
1271
1272 code->alu.inst[pos].inst3 &= ~R300_ALU_SRC_MASK;
1273 code->alu.inst[pos].inst3 |=
1274 ((cs->slot[pos].ssrc[0] << R300_ALU_SRC0A_SHIFT) |
1275 (cs->slot[pos].ssrc[1] << R300_ALU_SRC1A_SHIFT) |
1276 (cs->slot[pos].ssrc[2] << R300_ALU_SRC2A_SHIFT));
1277
1278 // Emit the argument selection code
1279 if (emit_vop) {
1280 int swz[3];
1281
1282 for (i = 0; i < 3; ++i) {
1283 if (i < argc) {
1284 swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
1285 (srcpos[i] *
1286 v_swiz[REG_GET_VSWZ(src[i])].
1287 stride)) | ((src[i] & REG_NEGV_MASK)
1288 ? ARG_NEG : 0) | ((src[i]
1289 &
1290 REG_ABS_MASK)
1291 ?
1292 ARG_ABS
1293 : 0);
1294 } else {
1295 swz[i] = R300_ALU_ARGC_ZERO;
1296 }
1297 }
1298
1299 code->alu.inst[pos].inst0 &=
1300 ~(R300_ALU_ARG0C_MASK | R300_ALU_ARG1C_MASK |
1301 R300_ALU_ARG2C_MASK);
1302 code->alu.inst[pos].inst0 |=
1303 (swz[0] << R300_ALU_ARG0C_SHIFT) | (swz[1] <<
1304 R300_ALU_ARG1C_SHIFT)
1305 | (swz[2] << R300_ALU_ARG2C_SHIFT);
1306 }
1307
1308 if (emit_sop) {
1309 int swz[3];
1310
1311 for (i = 0; i < 3; ++i) {
1312 if (i < argc) {
1313 swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
1314 (srcpos[i] *
1315 s_swiz[REG_GET_SSWZ(src[i])].
1316 stride)) | ((src[i] & REG_NEGS_MASK)
1317 ? ARG_NEG : 0) | ((src[i]
1318 &
1319 REG_ABS_MASK)
1320 ?
1321 ARG_ABS
1322 : 0);
1323 } else {
1324 swz[i] = R300_ALU_ARGA_ZERO;
1325 }
1326 }
1327
1328 code->alu.inst[pos].inst2 &=
1329 ~(R300_ALU_ARG0A_MASK | R300_ALU_ARG1A_MASK |
1330 R300_ALU_ARG2A_MASK);
1331 code->alu.inst[pos].inst2 |=
1332 (swz[0] << R300_ALU_ARG0A_SHIFT) | (swz[1] <<
1333 R300_ALU_ARG1A_SHIFT)
1334 | (swz[2] << R300_ALU_ARG2A_SHIFT);
1335 }
1336
1337 return pos;
1338 }
1339
1340 /**
1341 * Append an ALU instruction to the instruction list.
1342 */
1343 static void emit_arith(struct r300_pfs_compile_state *cs,
1344 int op,
1345 GLuint dest,
1346 int mask,
1347 GLuint src0, GLuint src1, GLuint src2, int flags)
1348 {
1349 COMPILE_STATE;
1350 GLuint src[3] = { src0, src1, src2 };
1351 int hwdest;
1352 GLboolean emit_vop, emit_sop;
1353 int vop, sop, argc;
1354 int pos;
1355
1356 vop = r300_fpop[op].v_op;
1357 sop = r300_fpop[op].s_op;
1358 argc = r300_fpop[op].argc;
1359
1360 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
1361 REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
1362 if (mask & WRITEMASK_Z) {
1363 mask = WRITEMASK_W;
1364 } else {
1365 return;
1366 }
1367 }
1368
1369 emit_vop = GL_FALSE;
1370 emit_sop = GL_FALSE;
1371 if ((mask & WRITEMASK_XYZ) || vop == R300_ALU_OUTC_DP3)
1372 emit_vop = GL_TRUE;
1373 if ((mask & WRITEMASK_W) || vop == R300_ALU_OUTC_REPL_ALPHA)
1374 emit_sop = GL_TRUE;
1375
1376 pos =
1377 find_and_prepare_slot(cs, emit_vop, emit_sop, argc, src, dest,
1378 mask);
1379 if (pos < 0)
1380 return;
1381
1382 hwdest = t_hw_dst(cs, dest, GL_FALSE, pos); /* Note: Side effects wrt register allocation */
1383
1384 if (flags & PFS_FLAG_SAT) {
1385 vop |= R300_ALU_OUTC_CLAMP;
1386 sop |= R300_ALU_OUTA_CLAMP;
1387 }
1388
1389 /* Throw the pieces together and get ALU/1 */
1390 if (emit_vop) {
1391 code->alu.inst[pos].inst0 |= vop;
1392
1393 code->alu.inst[pos].inst1 |= hwdest << R300_ALU_DSTC_SHIFT;
1394
1395 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1396 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1397 code->alu.inst[pos].inst1 |=
1398 (mask & WRITEMASK_XYZ) <<
1399 R300_ALU_DSTC_OUTPUT_MASK_SHIFT;
1400 } else
1401 assert(0);
1402 } else {
1403 code->alu.inst[pos].inst1 |=
1404 (mask & WRITEMASK_XYZ) <<
1405 R300_ALU_DSTC_REG_MASK_SHIFT;
1406
1407 cs->hwtemps[hwdest].vector_valid = pos + 1;
1408 }
1409 }
1410
1411 /* And now ALU/3 */
1412 if (emit_sop) {
1413 code->alu.inst[pos].inst2 |= sop;
1414
1415 if (mask & WRITEMASK_W) {
1416 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1417 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1418 code->alu.inst[pos].inst3 |=
1419 (hwdest << R300_ALU_DSTA_SHIFT) |
1420 R300_ALU_DSTA_OUTPUT;
1421 } else if (REG_GET_INDEX(dest) ==
1422 FRAG_RESULT_DEPR) {
1423 code->alu.inst[pos].inst3 |=
1424 R300_ALU_DSTA_DEPTH;
1425 } else
1426 assert(0);
1427 } else {
1428 code->alu.inst[pos].inst3 |=
1429 (hwdest << R300_ALU_DSTA_SHIFT) |
1430 R300_ALU_DSTA_REG;
1431
1432 cs->hwtemps[hwdest].scalar_valid = pos + 1;
1433 }
1434 }
1435 }
1436
1437 return;
1438 }
1439
1440 static GLfloat SinCosConsts[2][4] = {
1441 {
1442 1.273239545, // 4/PI
1443 -0.405284735, // -4/(PI*PI)
1444 3.141592654, // PI
1445 0.2225 // weight
1446 },
1447 {
1448 0.75,
1449 0.0,
1450 0.159154943, // 1/(2*PI)
1451 6.283185307 // 2*PI
1452 }
1453 };
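/* The parabolic approximation used by SIN/COS/SCS below is
 *
 *   y  = (4/PI)*x + (-4/(PI*PI))*x*|x|      for x in [-PI, PI]
 *   y' = 0.2225*(y*|y| - y) + y             (precision weighting)
 *
 * SinCosConsts[0] holds the two polynomial coefficients, PI and the
 * weight; SinCosConsts[1] holds the range-reduction helpers: the 0.75
 * phase offset used by COS, 1/(2*PI) and 2*PI.
 */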
1454
1455 /**
1456 * Emit a LIT instruction.
1457 * \p flags may be PFS_FLAG_SAT
1458 *
1459 * Definition of LIT (from ARB_fragment_program):
1460 * tmp = VectorLoad(op0);
1461 * if (tmp.x < 0) tmp.x = 0;
1462 * if (tmp.y < 0) tmp.y = 0;
1463 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
1464 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
1465 * result.x = 1.0;
1466 * result.y = tmp.x;
1467 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
1468 * result.w = 1.0;
1469 *
1470 * The longest path of computation is the one leading to result.z,
1471 * consisting of 5 operations. This implementation of LIT takes
1472 * 5 slots. So unless there's some special undocumented opcode,
1473 * this implementation is potentially optimal. Unfortunately,
1474 * emit_arith is a bit too conservative because it doesn't understand
1475 * partial writes to the vector component.
1476 */
1477 static const GLfloat LitConst[4] =
1478 { 127.999999, 127.999999, 127.999999, -127.999999 };
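/* Approximate mapping of emit_lit() onto the five slots
 * (vector op / scalar op per slot):
 *   1: MAX.xy  clamp x,y to >= 0           MAX.w  clamp w against -128+eps
 *   2: MIN.z   clamp w against 128-eps     LG2.w  log2(tmp.y)
 *   3: MAD.y   copy (and saturate) tmp.x   MAD.w  log2(y) * clamped w
 *   4: MAD.x   result.x = 1.0              EX2.w  y^w via exp2
 *   5: CMP.z   select pow or 0 by sign     MAD.w  result.w = 1.0
 */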
1479
1480 static void emit_lit(struct r300_pfs_compile_state *cs,
1481 GLuint dest, int mask, GLuint src, int flags)
1482 {
1483 COMPILE_STATE;
1484 GLuint cnst;
1485 int needTemporary;
1486 GLuint temp;
1487
1488 cnst = emit_const4fv(cs, LitConst);
1489
1490 needTemporary = 0;
1491 if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
1492 needTemporary = 1;
1493 } else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1494 // LIT is typically followed by DP3/DP4, so there's no point
1495 // in creating special code for this case
1496 needTemporary = 1;
1497 }
1498
1499 if (needTemporary) {
1500 temp = keep(get_temp_reg(cs));
1501 } else {
1502 temp = keep(dest);
1503 }
1504
1505 // Note: The order of emit_arith inside the slots is relevant,
1506 // because emit_arith only looks at scalar vs. vector when resolving
1507 // dependencies, and it does not consider individual vector components,
1508 // so swizzling between the two parts can create fake dependencies.
1509
1510 // First slot
1511 emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_XY,
1512 keep(src), pfs_zero, undef, 0);
1513 emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);
1514
1515 // Second slot
1516 emit_arith(cs, PFS_OP_MIN, temp, WRITEMASK_Z,
1517 swizzle(temp, W, W, W, W), cnst, undef, 0);
1518 emit_arith(cs, PFS_OP_LG2, temp, WRITEMASK_W,
1519 swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
1520
1521 // Third slot
1522 // If desired, we saturate the y result here.
1523 // This does not affect the use as a condition variable in the CMP later
1524 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W,
1525 temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
1526 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_Y,
1527 swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
1528
1529 // Fourth slot
1530 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_X,
1531 pfs_one, pfs_one, pfs_zero, 0);
1532 emit_arith(cs, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);
1533
1534 // Fifth slot
1535 emit_arith(cs, PFS_OP_CMP, temp, WRITEMASK_Z,
1536 pfs_zero, swizzle(temp, W, W, W, W),
1537 negate(swizzle(temp, Y, Y, Y, Y)), flags);
1538 emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
1539 pfs_zero, 0);
1540
1541 if (needTemporary) {
1542 emit_arith(cs, PFS_OP_MAD, dest, mask,
1543 temp, pfs_one, pfs_zero, flags);
1544 free_temp(cs, temp);
1545 } else {
1546 // Decrease refcount of the destination
1547 t_hw_dst(cs, dest, GL_FALSE, cs->nrslots);
1548 }
1549 }
1550
1551 static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_instruction *fpi)
1552 {
1553 COMPILE_STATE;
1554 GLuint src[3], dest, temp[2];
1555 int flags, mask = 0;
1556 int const_sin[2];
1557
1558 if (fpi->SaturateMode == SATURATE_ZERO_ONE)
1559 flags = PFS_FLAG_SAT;
1560 else
1561 flags = 0;
1562
1563 if (fpi->Opcode != OPCODE_KIL) {
1564 dest = t_dst(cs, fpi->DstReg);
1565 mask = fpi->DstReg.WriteMask;
1566 }
1567
1568 switch (fpi->Opcode) {
1569 case OPCODE_ADD:
1570 src[0] = t_src(cs, fpi->SrcReg[0]);
1571 src[1] = t_src(cs, fpi->SrcReg[1]);
1572 emit_arith(cs, PFS_OP_MAD, dest, mask,
1573 src[0], pfs_one, src[1], flags);
1574 break;
1575 case OPCODE_CMP:
1576 src[0] = t_src(cs, fpi->SrcReg[0]);
1577 src[1] = t_src(cs, fpi->SrcReg[1]);
1578 src[2] = t_src(cs, fpi->SrcReg[2]);
1579 /* ARB_fp:   result.c = (src0.c < 0.0) ? src1.c : src2.c
1580 * r300 CMP: result.c = (src2.c < 0.0) ? src1.c : src0.c, hence src0/src2 are swapped below
1581 */
1582 emit_arith(cs, PFS_OP_CMP, dest, mask,
1583 src[2], src[1], src[0], flags);
1584 break;
1585 case OPCODE_COS:
1586 /*
1587 * cos using a parabola (see SIN):
1588 * cos(x):
1589 * x = (x/(2*PI))+0.75
1590 * x = frac(x)
1591 * x = (x*2*PI)-PI
1592 * result = sin(x)
1593 */
1594 temp[0] = get_temp_reg(cs);
1595 const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1596 const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1597 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1598
1599 /* add 0.5*PI and do range reduction */
1600
1601 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1602 swizzle(src[0], X, X, X, X),
1603 swizzle(const_sin[1], Z, Z, Z, Z),
1604 swizzle(const_sin[1], X, X, X, X), 0);
1605
1606 emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
1607 swizzle(temp[0], X, X, X, X),
1608 undef, undef, 0);
1609
1610 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W), //2*PI
1611 negate(swizzle(const_sin[0], Z, Z, Z, Z)), //-PI
1612 0);
1613
1614 /* SIN */
1615
1616 emit_arith(cs, PFS_OP_MAD, temp[0],
1617 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1618 Z, Z, Z,
1619 Z),
1620 const_sin[0], pfs_zero, 0);
1621
1622 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1623 swizzle(temp[0], Y, Y, Y, Y),
1624 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1625 swizzle(temp[0], X, X, X, X), 0);
1626
1627 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1628 swizzle(temp[0], X, X, X, X),
1629 absolute(swizzle(temp[0], X, X, X, X)),
1630 negate(swizzle(temp[0], X, X, X, X)), 0);
1631
1632 emit_arith(cs, PFS_OP_MAD, dest, mask,
1633 swizzle(temp[0], Y, Y, Y, Y),
1634 swizzle(const_sin[0], W, W, W, W),
1635 swizzle(temp[0], X, X, X, X), flags);
1636
1637 free_temp(cs, temp[0]);
1638 break;
1639 case OPCODE_DP3:
1640 src[0] = t_src(cs, fpi->SrcReg[0]);
1641 src[1] = t_src(cs, fpi->SrcReg[1]);
1642 emit_arith(cs, PFS_OP_DP3, dest, mask,
1643 src[0], src[1], undef, flags);
1644 break;
1645 case OPCODE_DP4:
1646 src[0] = t_src(cs, fpi->SrcReg[0]);
1647 src[1] = t_src(cs, fpi->SrcReg[1]);
1648 emit_arith(cs, PFS_OP_DP4, dest, mask,
1649 src[0], src[1], undef, flags);
1650 break;
1651 case OPCODE_DST:
1652 src[0] = t_src(cs, fpi->SrcReg[0]);
1653 src[1] = t_src(cs, fpi->SrcReg[1]);
1654 /* dest.y = src0.y * src1.y */
1655 if (mask & WRITEMASK_Y)
1656 emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Y,
1657 keep(src[0]), keep(src[1]),
1658 pfs_zero, flags);
1659 /* dest.z = src0.z */
1660 if (mask & WRITEMASK_Z)
1661 emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Z,
1662 src[0], pfs_one, pfs_zero, flags);
1663 /* result.x = 1.0
1664 * result.w = src1.w */
1665 if (mask & WRITEMASK_XW) {
1666 REG_SET_VSWZ(src[1], SWIZZLE_111); /*Cheat */
1667 emit_arith(cs, PFS_OP_MAD, dest,
1668 mask & WRITEMASK_XW,
1669 src[1], pfs_one, pfs_zero, flags);
1670 }
1671 break;
1672 case OPCODE_EX2:
1673 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1674 emit_arith(cs, PFS_OP_EX2, dest, mask,
1675 src[0], undef, undef, flags);
1676 break;
1677 case OPCODE_FRC:
1678 src[0] = t_src(cs, fpi->SrcReg[0]);
1679 emit_arith(cs, PFS_OP_FRC, dest, mask,
1680 src[0], undef, undef, flags);
1681 break;
1682 case OPCODE_KIL:
1683 emit_tex(cs, fpi, R300_TEX_OP_KIL);
1684 break;
1685 case OPCODE_LG2:
1686 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1687 emit_arith(cs, PFS_OP_LG2, dest, mask,
1688 src[0], undef, undef, flags);
1689 break;
1690 case OPCODE_LIT:
1691 src[0] = t_src(cs, fpi->SrcReg[0]);
1692 emit_lit(cs, dest, mask, src[0], flags);
1693 break;
1694 case OPCODE_LRP:
1695 src[0] = t_src(cs, fpi->SrcReg[0]);
1696 src[1] = t_src(cs, fpi->SrcReg[1]);
1697 src[2] = t_src(cs, fpi->SrcReg[2]);
1698 /* result = tmp0*tmp1 + (1 - tmp0)*tmp2
1699 * = tmp0*tmp1 + tmp2 + (-tmp0)*tmp2
1700 * MAD temp, -tmp0, tmp2, tmp2
1701 * MAD result, tmp0, tmp1, temp
1702 */
1703 temp[0] = get_temp_reg(cs);
1704 emit_arith(cs, PFS_OP_MAD, temp[0], mask,
1705 negate(keep(src[0])), keep(src[2]), src[2],
1706 0);
1707 emit_arith(cs, PFS_OP_MAD, dest, mask,
1708 src[0], src[1], temp[0], flags);
1709 free_temp(cs, temp[0]);
1710 break;
1711 case OPCODE_MAD:
1712 src[0] = t_src(cs, fpi->SrcReg[0]);
1713 src[1] = t_src(cs, fpi->SrcReg[1]);
1714 src[2] = t_src(cs, fpi->SrcReg[2]);
1715 emit_arith(cs, PFS_OP_MAD, dest, mask,
1716 src[0], src[1], src[2], flags);
1717 break;
1718 case OPCODE_MAX:
1719 src[0] = t_src(cs, fpi->SrcReg[0]);
1720 src[1] = t_src(cs, fpi->SrcReg[1]);
1721 emit_arith(cs, PFS_OP_MAX, dest, mask,
1722 src[0], src[1], undef, flags);
1723 break;
1724 case OPCODE_MIN:
1725 src[0] = t_src(cs, fpi->SrcReg[0]);
1726 src[1] = t_src(cs, fpi->SrcReg[1]);
1727 emit_arith(cs, PFS_OP_MIN, dest, mask,
1728 src[0], src[1], undef, flags);
1729 break;
1730 case OPCODE_MOV:
1731 src[0] = t_src(cs, fpi->SrcReg[0]);
1732 emit_arith(cs, PFS_OP_MAD, dest, mask,
1733 src[0], pfs_one, pfs_zero, flags);
1734 break;
1735 case OPCODE_MUL:
1736 src[0] = t_src(cs, fpi->SrcReg[0]);
1737 src[1] = t_src(cs, fpi->SrcReg[1]);
1738 emit_arith(cs, PFS_OP_MAD, dest, mask,
1739 src[0], src[1], pfs_zero, flags);
1740 break;
1741 case OPCODE_RCP:
1742 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1743 emit_arith(cs, PFS_OP_RCP, dest, mask,
1744 src[0], undef, undef, flags);
1745 break;
1746 case OPCODE_RSQ:
1747 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1748 emit_arith(cs, PFS_OP_RSQ, dest, mask,
1749 absolute(src[0]), pfs_zero, pfs_zero, flags);
1750 break;
1751 case OPCODE_SCS:
1752 /*
1753 * scs using a parabola:
1754 * scs(x):
1755 * result.x = sin(-abs(x)+0.5*PI) (cos)
1756 * result.y = sin(x) (sin)
1757 *
1758 */
1759 temp[0] = get_temp_reg(cs);
1760 temp[1] = get_temp_reg(cs);
1761 const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1762 const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1763 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1764
1765 /* x = -abs(x)+0.5*PI */
1766 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(const_sin[0], Z, Z, Z, Z), //PI
1767 pfs_half,
1768 negate(abs
1769 (swizzle(keep(src[0]), X, X, X, X))),
1770 0);
1771
1772 /* C*x (sin) */
1773 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_W,
1774 swizzle(const_sin[0], Y, Y, Y, Y),
1775 swizzle(keep(src[0]), X, X, X, X),
1776 pfs_zero, 0);
1777
1778 /* B*x, C*x (cos) */
1779 emit_arith(cs, PFS_OP_MAD, temp[0],
1780 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1781 Z, Z, Z,
1782 Z),
1783 const_sin[0], pfs_zero, 0);
1784
1785 /* B*x (sin) */
1786 emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
1787 swizzle(const_sin[0], X, X, X, X),
1788 keep(src[0]), pfs_zero, 0);
1789
1790 /* y = B*x + C*x*abs(x) (sin) */
1791 emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_Z,
1792 absolute(src[0]),
1793 swizzle(temp[0], W, W, W, W),
1794 swizzle(temp[1], W, W, W, W), 0);
1795
1796 /* y = B*x + C*x*abs(x) (cos) */
1797 emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
1798 swizzle(temp[0], Y, Y, Y, Y),
1799 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1800 swizzle(temp[0], X, X, X, X), 0);
1801
1802 /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1803 emit_arith(cs, PFS_OP_MAD, temp[0],
1804 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1],
1805 W, Z, Y,
1806 X),
1807 absolute(swizzle(temp[1], W, Z, Y, X)),
1808 negate(swizzle(temp[1], W, Z, Y, X)), 0);
1809
1810 /* dest.xy = mad(temp.xy, P, temp2.wz) */
1811 emit_arith(cs, PFS_OP_MAD, dest,
1812 mask & (WRITEMASK_X | WRITEMASK_Y), temp[0],
1813 swizzle(const_sin[0], W, W, W, W),
1814 swizzle(temp[1], W, Z, Y, X), flags);
1815
1816 free_temp(cs, temp[0]);
1817 free_temp(cs, temp[1]);
1818 break;
1819 case OPCODE_SIN:
1820 /*
1821 * using a parabola:
1822 * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1823 * extra precision is obtained by weighting against
1824 * itself squared.
1825 */
1826
1827 temp[0] = get_temp_reg(cs);
1828 const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1829 const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1830 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1831
1832 /* do range reduction */
1833
1834 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1835 swizzle(keep(src[0]), X, X, X, X),
1836 swizzle(const_sin[1], Z, Z, Z, Z),
1837 pfs_half, 0);
1838
1839 emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
1840 swizzle(temp[0], X, X, X, X),
1841 undef, undef, 0);
1842
1843 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W), //2*PI
1844 negate(swizzle(const_sin[0], Z, Z, Z, Z)), //-PI
1845 0);
1846
1847 /* SIN */
1848
1849 emit_arith(cs, PFS_OP_MAD, temp[0],
1850 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1851 Z, Z, Z,
1852 Z),
1853 const_sin[0], pfs_zero, 0);
1854
1855 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1856 swizzle(temp[0], Y, Y, Y, Y),
1857 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1858 swizzle(temp[0], X, X, X, X), 0);
1859
1860 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1861 swizzle(temp[0], X, X, X, X),
1862 absolute(swizzle(temp[0], X, X, X, X)),
1863 negate(swizzle(temp[0], X, X, X, X)), 0);
1864
1865 emit_arith(cs, PFS_OP_MAD, dest, mask,
1866 swizzle(temp[0], Y, Y, Y, Y),
1867 swizzle(const_sin[0], W, W, W, W),
1868 swizzle(temp[0], X, X, X, X), flags);
1869
1870 free_temp(cs, temp[0]);
1871 break;
1872 case OPCODE_TEX:
1873 emit_tex(cs, fpi, R300_TEX_OP_LD);
1874 break;
1875 case OPCODE_TXB:
1876 emit_tex(cs, fpi, R300_TEX_OP_TXB);
1877 break;
1878 case OPCODE_TXP:
1879 emit_tex(cs, fpi, R300_TEX_OP_TXP);
1880 break;
1881 default:
1882 ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
1883 break;
1884 }
1885 }
1886
1887 static GLboolean parse_program(struct r300_pfs_compile_state *cs)
1888 {
1889 COMPILE_STATE;
1890 int clauseidx;
1891
1892 for (clauseidx = 0; clauseidx < cs->compiler->compiler.NumClauses; ++clauseidx) {
1893 struct radeon_clause* clause = &cs->compiler->compiler.Clauses[clauseidx];
1894 int ip;
1895
1896 for(ip = 0; ip < clause->NumInstructions; ++ip) {
1897 emit_instruction(cs, clause->Instructions + ip);
1898
1899 if (fp->error)
1900 return GL_FALSE;
1901 }
1902 }
1903
1904 return GL_TRUE;
1905 }
1906
1907
1908 /* - Init structures
1909 * - Determine what hwregs each input corresponds to
1910 */
1911 static void init_program(struct r300_pfs_compile_state *cs)
1912 {
1913 COMPILE_STATE;
1914 struct gl_fragment_program *mp = &fp->mesa_program;
1915 GLuint InputsRead = mp->Base.InputsRead;
1916 GLuint temps_used = 0; /* for fp->temps[] */
1917 int i, j;
1918
1919 /* New compile, reset tracking data */
1920 fp->optimization =
1921 driQueryOptioni(&cs->compiler->r300->radeon.optionCache, "fp_optimization");
1922 fp->translated = GL_FALSE;
1923 fp->error = GL_FALSE;
1924 fp->WritesDepth = GL_FALSE;
1925 code->tex.length = 0;
1926 code->cur_node = 0;
1927 code->first_node_has_tex = 0;
1928 code->const_nr = 0;
1929 code->max_temp_idx = 0;
1930 code->node[0].alu_end = -1;
1931 code->node[0].tex_end = -1;
1932
1933 for (i = 0; i < PFS_MAX_ALU_INST; i++) {
1934 for (j = 0; j < 3; j++) {
1935 cs->slot[i].vsrc[j] = SRC_CONST;
1936 cs->slot[i].ssrc[j] = SRC_CONST;
1937 }
1938 }
1939
1940 /* Work out what temps the Mesa inputs correspond to; this must match
1941 * what setup_rs_unit does, which shouldn't be a problem since the RS unit
1942 * configures itself based on the fragprog's InputsRead
1943 *
1944 * NOTE: this depends on get_hw_temp() allocating registers in order,
1945 * starting from register 0.
1946 */
1947
1948 /* Texcoords come first */
1949 for (i = 0; i < cs->compiler->r300->radeon.glCtx->Const.MaxTextureUnits; i++) {
1950 if (InputsRead & (FRAG_BIT_TEX0 << i)) {
1951 cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
1952 cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
1953 get_hw_temp(cs, 0);
1954 }
1955 }
1956 InputsRead &= ~FRAG_BITS_TEX_ANY;
1957
1958 /* fragment position treated as a texcoord */
1959 if (InputsRead & FRAG_BIT_WPOS) {
1960 cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
1961 cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(cs, 0);
1962 }
1963 InputsRead &= ~FRAG_BIT_WPOS;
1964
1965 /* Then primary colour */
1966 if (InputsRead & FRAG_BIT_COL0) {
1967 cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
1968 cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(cs, 0);
1969 }
1970 InputsRead &= ~FRAG_BIT_COL0;
1971
1972 /* Secondary color */
1973 if (InputsRead & FRAG_BIT_COL1) {
1974 cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
1975 cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(cs, 0);
1976 }
1977 InputsRead &= ~FRAG_BIT_COL1;
1978
1979 /* Anything else */
1980 if (InputsRead) {
1981 WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
1982 /* force read from hwreg 0 for now */
1983 for (i = 0; i < 32; i++)
1984 if (InputsRead & (1 << i))
1985 cs->inputs[i].reg = 0;
1986 }
1987
1988 /* Pre-parse the program, grabbing refcounts on input/temp regs.
1989 * That way, we can free up the reg when it's no longer needed
1990 */
1991 for (i = 0; i < cs->compiler->compiler.Clauses[0].NumInstructions; ++i) {
1992 struct prog_instruction *fpi = cs->compiler->compiler.Clauses[0].Instructions + i;
1993 int idx;
1994
1995 for (j = 0; j < 3; j++) {
1996 idx = fpi->SrcReg[j].Index;
1997 switch (fpi->SrcReg[j].File) {
1998 case PROGRAM_TEMPORARY:
1999 if (!(temps_used & (1 << idx))) {
2000 cs->temps[idx].reg = -1;
2001 cs->temps[idx].refcount = 1;
2002 temps_used |= (1 << idx);
2003 } else
2004 cs->temps[idx].refcount++;
2005 break;
2006 case PROGRAM_INPUT:
2007 cs->inputs[idx].refcount++;
2008 break;
2009 default:
2010 break;
2011 }
2012 }
2013
2014 idx = fpi->DstReg.Index;
2015 if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
2016 if (!(temps_used & (1 << idx))) {
2017 cs->temps[idx].reg = -1;
2018 cs->temps[idx].refcount = 1;
2019 temps_used |= (1 << idx);
2020 } else
2021 cs->temps[idx].refcount++;
2022 }
2023 }
2024 cs->temp_in_use = temps_used;
2025 }
2026
2027
2028 /**
2029 * Final compilation step: Turn the intermediate radeon_program into
2030 * machine-readable instructions.
2031 */
2032 GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler)
2033 {
2034 struct r300_pfs_compile_state cs;
2035 struct r300_fragment_program_code *code = compiler->code;
2036
2037 _mesa_memset(&cs, 0, sizeof(cs));
2038 cs.compiler = compiler;
2039 init_program(&cs);
2040
2041 if (!parse_program(&cs))
2042 return GL_FALSE;
2043
2044 /* Finish off */
2045 code->node[code->cur_node].alu_end =
2046 cs.nrslots - code->node[code->cur_node].alu_offset - 1;
2047 if (code->node[code->cur_node].tex_end < 0)
2048 code->node[code->cur_node].tex_end = 0;
2049 code->alu_offset = 0;
2050 code->alu_end = cs.nrslots - 1;
2051 code->tex_offset = 0;
2052 code->tex_end = code->tex.length ? code->tex.length - 1 : 0;
2053 assert(code->node[code->cur_node].alu_end >= 0);
2054 assert(code->alu_end >= 0);
2055
2056 return GL_TRUE;
2057 }
2058