src/mesa/drivers/dri/r300/r300_fragprog_emit.c

   1 /*
   2  * Copyright (C) 2005 Ben Skeggs.
   3  *
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining
   7  * a copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sublicense, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial
  16  * portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  19  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  21  * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
  22  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  23  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  24  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  */
  27
  28 /**
  29  * \file
  30  *
  31  * Emit the r300_fragment_program_code that can be understood by the hardware.
  32  * Input is a pre-transformed radeon_program.
  33  *
  34  * \author Ben Skeggs <darktama@iinet.net.au>
  35  *
  36  * \author Jerome Glisse <j.glisse@gmail.com>
  37  *
  38  * \todo FogOption
  39  *
  40  * \todo Verify results of opcodes for accuracy, I've only checked them in
  41  * specific cases.
  42  */
  43
  44 #include "glheader.h"
  45 #include "macros.h"
  46 #include "enums.h"
  47 #include "shader/prog_instruction.h"
  48 #include "shader/prog_parameter.h"
  49 #include "shader/prog_print.h"
  50
  51 #include "r300_context.h"
  52 #include "r300_fragprog.h"
  53 #include "r300_reg.h"
  54 #include "r300_state.h"
  55
  56 /* Mapping Mesa registers to R300 temporaries */
  57 struct reg_acc {
  58         int reg;                /* Assigned hw temp */
  59         unsigned int refcount;  /* Number of uses by mesa program */
  60 };
  61
  62 /**
  63  * Describe the current lifetime information for an R300 temporary
  64  */
  65 struct reg_lifetime {
  66         /* Index of the first slot where this register is free in the sense
  67            that it can be used as a new destination register.
  68            This is -1 if the register has been assigned to a Mesa register
  69            and the last access to the register has not yet been emitted */
  70         int free;
  71
  72         /* Index of the first slot where this register is currently reserved.
  73            This is used to stop e.g. a scalar operation from being moved
  74            before the allocation time of a register that was first allocated
  75            for a vector operation. */
  76         int reserved;
  77
  78         /* Index of the first slot in which the register can be used as a
  79            source without losing the value that is written by the last
  80            emitted instruction that writes to the register */
  81         int vector_valid;
  82         int scalar_valid;
  83
  84         /* Index to the slot where the register was last read.
  85            This is also the first slot in which the register may be written again */
  86         int vector_lastread;
  87         int scalar_lastread;
  88 };
  89
  90 /**
  91  * Store usage information about an ALU instruction slot during the
  92  * compilation of a fragment program.
  93  */
  94 #define SLOT_SRC_VECTOR  (1<<0)
  95 #define SLOT_SRC_SCALAR  (1<<3)
  96 #define SLOT_SRC_BOTH    (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
  97 #define SLOT_OP_VECTOR   (1<<16)
  98 #define SLOT_OP_SCALAR   (1<<17)
  99 #define SLOT_OP_BOTH     (SLOT_OP_VECTOR | SLOT_OP_SCALAR)
 100
 101 struct r300_pfs_compile_slot {
 102         /* Bitmask indicating which parts of the slot are used, using SLOT_ constants
 103            defined above */
 104         unsigned int used;
 105
 106         /* Selected sources */
 107         int vsrc[3];
 108         int ssrc[3];
 109 };
 110
 111 /**
 112  * Store information during compilation of fragment programs.
 113  */
 114 struct r300_pfs_compile_state {
 115         struct r300_fragment_program_compiler *compiler;
 116
 117         int nrslots;            /* number of ALU slots used so far */
 118
 119         /* Track which (parts of) slots are already filled with instructions */
 120         struct r300_pfs_compile_slot slot[PFS_MAX_ALU_INST];
 121
 122         /* Track the validity of R300 temporaries */
 123         struct reg_lifetime hwtemps[PFS_NUM_TEMP_REGS];
 124
 125         /* Used to map Mesa's inputs/temps onto hardware temps */
 126         int temp_in_use;
 127         struct reg_acc temps[PFS_NUM_TEMP_REGS];
 128         struct reg_acc inputs[32];      /* don't actually need 32... */
 129
 130         /* Track usage of hardware temps, for register allocation,
 131          * indirection detection, etc. */
 132         GLuint used_in_node;
 133         GLuint dest_in_node;
 134 };
 135
 136
 137 /*
 138  * Usefull macros and values
 139  */
 140 #define ERROR(fmt, args...) do {                        \
 141                 fprintf(stderr, "%s::%s(): " fmt "\n",  \
 142                         __FILE__, __FUNCTION__, ##args);        \
 143                 fp->error = GL_TRUE;                    \
 144         } while(0)
 145
 146 #define PFS_INVAL 0xFFFFFFFF
 147 #define COMPILE_STATE \
 148         struct r300_fragment_program *fp = cs->compiler->fp; \
 149         struct r300_fragment_program_code *code = cs->compiler->code; \
 150         (void)code; (void)fp
 151
 152 #define SWIZZLE_XYZ             0
 153 #define SWIZZLE_XXX             1
 154 #define SWIZZLE_YYY             2
 155 #define SWIZZLE_ZZZ             3
 156 #define SWIZZLE_WWW             4
 157 #define SWIZZLE_YZX             5
 158 #define SWIZZLE_ZXY             6
 159 #define SWIZZLE_WZY             7
 160 #define SWIZZLE_111             8
 161 #define SWIZZLE_000             9
 162 #define SWIZZLE_HHH             10
 163
 164 #define swizzle(r, x, y, z, w) do_swizzle(cs, r,                \
 165                                           ((SWIZZLE_##x<<0)|    \
 166                                            (SWIZZLE_##y<<3)|    \
 167                                            (SWIZZLE_##z<<6)|    \
 168                                            (SWIZZLE_##w<<9)),   \
 169                                           0)
 170
 171 #define REG_TYPE_INPUT          0
 172 #define REG_TYPE_OUTPUT         1
 173 #define REG_TYPE_TEMP           2
 174 #define REG_TYPE_CONST          3
 175
 176 #define REG_TYPE_SHIFT          0
 177 #define REG_INDEX_SHIFT         2
 178 #define REG_VSWZ_SHIFT          8
 179 #define REG_SSWZ_SHIFT          13
 180 #define REG_NEGV_SHIFT          18
 181 #define REG_NEGS_SHIFT          19
 182 #define REG_ABS_SHIFT           20
 183 #define REG_NO_USE_SHIFT        21      // Hack for refcounting
 184 #define REG_VALID_SHIFT         22      // Does the register contain a defined value?
 185 #define REG_BUILTIN_SHIFT   23  // Is it a builtin (like all zero/all one)?
 186
 187 #define REG_TYPE_MASK           (0x03 << REG_TYPE_SHIFT)
 188 #define REG_INDEX_MASK          (0x3F << REG_INDEX_SHIFT)
 189 #define REG_VSWZ_MASK           (0x1F << REG_VSWZ_SHIFT)
 190 #define REG_SSWZ_MASK           (0x1F << REG_SSWZ_SHIFT)
 191 #define REG_NEGV_MASK           (0x01 << REG_NEGV_SHIFT)
 192 #define REG_NEGS_MASK           (0x01 << REG_NEGS_SHIFT)
 193 #define REG_ABS_MASK            (0x01 << REG_ABS_SHIFT)
 194 #define REG_NO_USE_MASK         (0x01 << REG_NO_USE_SHIFT)
 195 #define REG_VALID_MASK          (0x01 << REG_VALID_SHIFT)
 196 #define REG_BUILTIN_MASK        (0x01 << REG_BUILTIN_SHIFT)
 197
 198 #define REG(type, index, vswz, sswz, nouse, valid, builtin)     \
 199         (((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) |                   \
 200          ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) |                \
 201          ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |              \
 202          ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) |                \
 203          ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |  \
 204          ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |                   \
 205          ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
 206 #define REG_GET_TYPE(reg)                                               \
 207         ((reg & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
 208 #define REG_GET_INDEX(reg)                                              \
 209         ((reg & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
 210 #define REG_GET_VSWZ(reg)                                               \
 211         ((reg & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
 212 #define REG_GET_SSWZ(reg)                                               \
 213         ((reg & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
 214 #define REG_GET_NO_USE(reg)                                             \
 215         ((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
 216 #define REG_GET_VALID(reg)                                              \
 217         ((reg & REG_VALID_MASK) >> REG_VALID_SHIFT)
 218 #define REG_GET_BUILTIN(reg)                                            \
 219         ((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
 220 #define REG_SET_TYPE(reg, type)                                         \
 221         reg = ((reg & ~REG_TYPE_MASK) |                                 \
 222                ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK))
 223 #define REG_SET_INDEX(reg, index)                                       \
 224         reg = ((reg & ~REG_INDEX_MASK) |                                \
 225                ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK))
 226 #define REG_SET_VSWZ(reg, vswz)                                         \
 227         reg = ((reg & ~REG_VSWZ_MASK) |                                 \
 228                ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK))
 229 #define REG_SET_SSWZ(reg, sswz)                                         \
 230         reg = ((reg & ~REG_SSWZ_MASK) |                                 \
 231                ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
 232 #define REG_SET_NO_USE(reg, nouse)                                      \
 233         reg = ((reg & ~REG_NO_USE_MASK) |                               \
 234                ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK))
 235 #define REG_SET_VALID(reg, valid)                                       \
 236         reg = ((reg & ~REG_VALID_MASK) |                                \
 237                ((valid << REG_VALID_SHIFT) & REG_VALID_MASK))
 238 #define REG_SET_BUILTIN(reg, builtin)                                   \
 239         reg = ((reg & ~REG_BUILTIN_MASK) |                              \
 240                ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
 241 #define REG_ABS(reg)                                                    \
 242         reg = (reg | REG_ABS_MASK)
 243 #define REG_NEGV(reg)                                                   \
 244         reg = (reg | REG_NEGV_MASK)
 245 #define REG_NEGS(reg)                                                   \
 246         reg = (reg | REG_NEGS_MASK)
 247
 248 #define NOP_INST0 (                                              \
 249                 (R300_ALU_OUTC_MAD) |                            \
 250                 (R300_ALU_ARGC_ZERO << R300_ALU_ARG0C_SHIFT) | \
 251                 (R300_ALU_ARGC_ZERO << R300_ALU_ARG1C_SHIFT) | \
 252                 (R300_ALU_ARGC_ZERO << R300_ALU_ARG2C_SHIFT))
 253 #define NOP_INST1 (                                          \
 254                 ((0 | SRC_CONST) << R300_ALU_SRC0C_SHIFT) | \
 255                 ((0 | SRC_CONST) << R300_ALU_SRC1C_SHIFT) | \
 256                 ((0 | SRC_CONST) << R300_ALU_SRC2C_SHIFT))
 257 #define NOP_INST2 ( \
 258                 (R300_ALU_OUTA_MAD) |                            \
 259                 (R300_ALU_ARGA_ZERO << R300_ALU_ARG0A_SHIFT) | \
 260                 (R300_ALU_ARGA_ZERO << R300_ALU_ARG1A_SHIFT) | \
 261                 (R300_ALU_ARGA_ZERO << R300_ALU_ARG2A_SHIFT))
 262 #define NOP_INST3 (                                          \
 263                 ((0 | SRC_CONST) << R300_ALU_SRC0A_SHIFT) | \
 264                 ((0 | SRC_CONST) << R300_ALU_SRC1A_SHIFT) | \
 265                 ((0 | SRC_CONST) << R300_ALU_SRC2A_SHIFT))
 266
 267
 268 /*
 269  * Datas structures for fragment program generation
 270  */
 271
 272 /* description of r300 native hw instructions */
 273 static const struct {
 274         const char *name;
 275         int argc;
 276         int v_op;
 277         int s_op;
 278 } r300_fpop[] = {
 279         /* *INDENT-OFF* */
 280         {"MAD", 3, R300_ALU_OUTC_MAD, R300_ALU_OUTA_MAD},
 281         {"DP3", 2, R300_ALU_OUTC_DP3, R300_ALU_OUTA_DP4},
 282         {"DP4", 2, R300_ALU_OUTC_DP4, R300_ALU_OUTA_DP4},
 283         {"MIN", 2, R300_ALU_OUTC_MIN, R300_ALU_OUTA_MIN},
 284         {"MAX", 2, R300_ALU_OUTC_MAX, R300_ALU_OUTA_MAX},
 285         {"CMP", 3, R300_ALU_OUTC_CMP, R300_ALU_OUTA_CMP},
 286         {"FRC", 1, R300_ALU_OUTC_FRC, R300_ALU_OUTA_FRC},
 287         {"EX2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_EX2},
 288         {"LG2", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_LG2},
 289         {"RCP", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RCP},
 290         {"RSQ", 1, R300_ALU_OUTC_REPL_ALPHA, R300_ALU_OUTA_RSQ},
 291         {"REPL_ALPHA", 1, R300_ALU_OUTC_REPL_ALPHA, PFS_INVAL},
 292         {"CMPH", 3, R300_ALU_OUTC_CMPH, PFS_INVAL},
 293         /* *INDENT-ON* */
 294 };
 295
 296 /* vector swizzles r300 can support natively, with a couple of
 297  * cases we handle specially
 298  *
 299  * REG_VSWZ/REG_SSWZ is an index into this table
 300  */
 301
 302 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
 303 #define SWIZZLE_HALF 6
 304
 305 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
 306                                           SWIZZLE_##y, \
 307                                           SWIZZLE_##z, \
 308                                           SWIZZLE_ZERO))
 309 /* native swizzles */
 310 static const struct r300_pfs_swizzle {
 311         GLuint hash;            /* swizzle value this matches */
 312         GLuint base;            /* base value for hw swizzle */
 313         GLuint stride;          /* difference in base between arg0/1/2 */
 314         GLuint flags;
 315 } v_swiz[] = {
 316         /* *INDENT-OFF* */
 317         {MAKE_SWZ3(X, Y, Z), R300_ALU_ARGC_SRC0C_XYZ, 4, SLOT_SRC_VECTOR},
 318         {MAKE_SWZ3(X, X, X), R300_ALU_ARGC_SRC0C_XXX, 4, SLOT_SRC_VECTOR},
 319         {MAKE_SWZ3(Y, Y, Y), R300_ALU_ARGC_SRC0C_YYY, 4, SLOT_SRC_VECTOR},
 320         {MAKE_SWZ3(Z, Z, Z), R300_ALU_ARGC_SRC0C_ZZZ, 4, SLOT_SRC_VECTOR},
 321         {MAKE_SWZ3(W, W, W), R300_ALU_ARGC_SRC0A, 1, SLOT_SRC_SCALAR},
 322         {MAKE_SWZ3(Y, Z, X), R300_ALU_ARGC_SRC0C_YZX, 1, SLOT_SRC_VECTOR},
 323         {MAKE_SWZ3(Z, X, Y), R300_ALU_ARGC_SRC0C_ZXY, 1, SLOT_SRC_VECTOR},
 324         {MAKE_SWZ3(W, Z, Y), R300_ALU_ARGC_SRC0CA_WZY, 1, SLOT_SRC_BOTH},
 325         {MAKE_SWZ3(ONE, ONE, ONE), R300_ALU_ARGC_ONE, 0, 0},
 326         {MAKE_SWZ3(ZERO, ZERO, ZERO), R300_ALU_ARGC_ZERO, 0, 0},
 327         {MAKE_SWZ3(HALF, HALF, HALF), R300_ALU_ARGC_HALF, 0, 0},
 328         {PFS_INVAL, 0, 0, 0},
 329         /* *INDENT-ON* */
 330 };
 331
 332 /* used during matching of non-native swizzles */
 333 #define SWZ_X_MASK (7 << 0)
 334 #define SWZ_Y_MASK (7 << 3)
 335 #define SWZ_Z_MASK (7 << 6)
 336 #define SWZ_W_MASK (7 << 9)
 337 static const struct {
 338         GLuint hash;            /* used to mask matching swizzle components */
 339         int mask;               /* actual outmask */
 340         int count;              /* count of components matched */
 341 } s_mask[] = {
 342         /* *INDENT-OFF* */
 343         {SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK, 1 | 2 | 4, 3},
 344         {SWZ_X_MASK | SWZ_Y_MASK, 1 | 2, 2},
 345         {SWZ_X_MASK | SWZ_Z_MASK, 1 | 4, 2},
 346         {SWZ_Y_MASK | SWZ_Z_MASK, 2 | 4, 2},
 347         {SWZ_X_MASK, 1, 1},
 348         {SWZ_Y_MASK, 2, 1},
 349         {SWZ_Z_MASK, 4, 1},
 350         {PFS_INVAL, PFS_INVAL, PFS_INVAL}
 351         /* *INDENT-ON* */
 352 };
 353
 354 static const struct {
 355         int base;               /* hw value of swizzle */
 356         int stride;             /* difference between SRC0/1/2 */
 357         GLuint flags;
 358 } s_swiz[] = {
 359         /* *INDENT-OFF* */
 360         {R300_ALU_ARGA_SRC0C_X, 3, SLOT_SRC_VECTOR},
 361         {R300_ALU_ARGA_SRC0C_Y, 3, SLOT_SRC_VECTOR},
 362         {R300_ALU_ARGA_SRC0C_Z, 3, SLOT_SRC_VECTOR},
 363         {R300_ALU_ARGA_SRC0A, 1, SLOT_SRC_SCALAR},
 364         {R300_ALU_ARGA_ZERO, 0, 0},
 365         {R300_ALU_ARGA_ONE, 0, 0},
 366         {R300_ALU_ARGA_HALF, 0, 0}
 367         /* *INDENT-ON* */
 368 };
 369
 370 /* boiler-plate reg, for convenience */
 371 static const GLuint undef = REG(REG_TYPE_TEMP,
 372                                 0,
 373                                 SWIZZLE_XYZ,
 374                                 SWIZZLE_W,
 375                                 GL_FALSE,
 376                                 GL_FALSE,
 377                                 GL_FALSE);
 378
 379 /* constant one source */
 380 static const GLuint pfs_one = REG(REG_TYPE_CONST,
 381                                   0,
 382                                   SWIZZLE_111,
 383                                   SWIZZLE_ONE,
 384                                   GL_FALSE,
 385                                   GL_TRUE,
 386                                   GL_TRUE);
 387
 388 /* constant half source */
 389 static const GLuint pfs_half = REG(REG_TYPE_CONST,
 390                                    0,
 391                                    SWIZZLE_HHH,
 392                                    SWIZZLE_HALF,
 393                                    GL_FALSE,
 394                                    GL_TRUE,
 395                                    GL_TRUE);
 396
 397 /* constant zero source */
 398 static const GLuint pfs_zero = REG(REG_TYPE_CONST,
 399                                    0,
 400                                    SWIZZLE_000,
 401                                    SWIZZLE_ZERO,
 402                                    GL_FALSE,
 403                                    GL_TRUE,
 404                                    GL_TRUE);
 405
 406 /*
 407  * Common functions prototypes
 408  */
 409 static void emit_arith(struct r300_pfs_compile_state *cs, int op,
 410                        GLuint dest, int mask,
 411                        GLuint src0, GLuint src1, GLuint src2, int flags);
 412
 413 /**
 414  * Get an R300 temporary that can be written to in the given slot.
 415  */
 416 static int get_hw_temp(struct r300_pfs_compile_state *cs, int slot)
 417 {
 418         COMPILE_STATE;
 419         int r;
 420
 421         for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
 422                 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= slot)
 423                         break;
 424         }
 425
 426         if (r >= PFS_NUM_TEMP_REGS) {
 427                 ERROR("Out of hardware temps\n");
 428                 return 0;
 429         }
 430         // Reserved is used to avoid the following scenario:
 431         //  R300 temporary X is first assigned to Mesa temporary Y during vector ops
 432         //  R300 temporary X is then assigned to Mesa temporary Z for further vector ops
 433         //  Then scalar ops on Mesa temporary Z are emitted and move back in time
 434         //  to overwrite the value of temporary Y.
 435         // End scenario.
 436         cs->hwtemps[r].reserved = cs->hwtemps[r].free;
 437         cs->hwtemps[r].free = -1;
 438
 439         // Reset to some value that won't mess things up when the user
 440         // tries to read from a temporary that hasn't been assigned a value yet.
 441         // In the normal case, vector_valid and scalar_valid should be set to
 442         // a sane value by the first emit that writes to this temporary.
 443         cs->hwtemps[r].vector_valid = 0;
 444         cs->hwtemps[r].scalar_valid = 0;
 445
 446         if (r > code->max_temp_idx)
 447                 code->max_temp_idx = r;
 448
 449         return r;
 450 }
 451
 452 /**
 453  * Get an R300 temporary that will act as a TEX destination register.
 454  */
 455 static int get_hw_temp_tex(struct r300_pfs_compile_state *cs)
 456 {
 457         COMPILE_STATE;
 458         int r;
 459
 460         for (r = 0; r < PFS_NUM_TEMP_REGS; ++r) {
 461                 if (cs->used_in_node & (1 << r))
 462                         continue;
 463
 464                 // Note: Be very careful here
 465                 if (cs->hwtemps[r].free >= 0 && cs->hwtemps[r].free <= 0)
 466                         break;
 467         }
 468
 469         if (r >= PFS_NUM_TEMP_REGS)
 470                 return get_hw_temp(cs, 0);      /* Will cause an indirection */
 471
 472         cs->hwtemps[r].reserved = cs->hwtemps[r].free;
 473         cs->hwtemps[r].free = -1;
 474
 475         // Reset to some value that won't mess things up when the user
 476         // tries to read from a temporary that hasn't been assigned a value yet.
 477         // In the normal case, vector_valid and scalar_valid should be set to
 478         // a sane value by the first emit that writes to this temporary.
 479         cs->hwtemps[r].vector_valid = cs->nrslots;
 480         cs->hwtemps[r].scalar_valid = cs->nrslots;
 481
 482         if (r > code->max_temp_idx)
 483                 code->max_temp_idx = r;
 484
 485         return r;
 486 }
 487
 488 /**
 489  * Mark the given hardware register as free.
 490  */
 491 static void free_hw_temp(struct r300_pfs_compile_state *cs, int idx)
 492 {
 493         // Be very careful here. Consider sequences like
 494         //  MAD r0, r1,r2,r3
 495         //  TEX r4, ...
 496         // The TEX instruction may be moved in front of the MAD instruction
 497         // due to the way nodes work. We don't want to alias r1 and r4 in
 498         // this case.
 499         // I'm certain the register allocation could be further sanitized,
 500         // but it's tricky because of stuff that can happen inside emit_tex
 501         // and emit_arith.
 502         cs->hwtemps[idx].free = cs->nrslots + 1;
 503 }
 504
 505 /**
 506  * Create a new Mesa temporary register.
 507  */
 508 static GLuint get_temp_reg(struct r300_pfs_compile_state *cs)
 509 {
 510         COMPILE_STATE;
 511         GLuint r = undef;
 512         GLuint index;
 513
 514         index = ffs(~cs->temp_in_use);
 515         if (!index) {
 516                 ERROR("Out of program temps\n");
 517                 return r;
 518         }
 519
 520         cs->temp_in_use |= (1 << --index);
 521         cs->temps[index].refcount = 0xFFFFFFFF;
 522         cs->temps[index].reg = -1;
 523
 524         REG_SET_TYPE(r, REG_TYPE_TEMP);
 525         REG_SET_INDEX(r, index);
 526         REG_SET_VALID(r, GL_TRUE);
 527         return r;
 528 }
 529
 530 /**
 531  * Free a Mesa temporary and the associated R300 temporary.
 532  */
 533 static void free_temp(struct r300_pfs_compile_state *cs, GLuint r)
 534 {
 535         GLuint index = REG_GET_INDEX(r);
 536
 537         if (!(cs->temp_in_use & (1 << index)))
 538                 return;
 539
 540         if (REG_GET_TYPE(r) == REG_TYPE_TEMP) {
 541                 free_hw_temp(cs, cs->temps[index].reg);
 542                 cs->temps[index].reg = -1;
 543                 cs->temp_in_use &= ~(1 << index);
 544         } else if (REG_GET_TYPE(r) == REG_TYPE_INPUT) {
 545                 free_hw_temp(cs, cs->inputs[index].reg);
 546                 cs->inputs[index].reg = -1;
 547         }
 548 }
 549
 550 /**
 551  * Emit a hardware constant/parameter.
 552  *
 553  * \p cp Stable pointer to an array of 4 floats.
 554  *  The pointer must be stable in the sense that it remains to be valid
 555  *  and hold the contents of the constant/parameter throughout the lifetime
 556  *  of the fragment program (actually, up until the next time the fragment
 557  *  program is translated).
 558  */
 559 static GLuint emit_const4fv(struct r300_pfs_compile_state *cs,
 560                             const GLfloat * cp)
 561 {
 562         COMPILE_STATE;
 563         GLuint reg = undef;
 564         int index;
 565
 566         for (index = 0; index < code->const_nr; ++index) {
 567                 if (code->constant[index] == cp)
 568                         break;
 569         }
 570
 571         if (index >= code->const_nr) {
 572                 if (index >= PFS_NUM_CONST_REGS) {
 573                         ERROR("Out of hw constants!\n");
 574                         return reg;
 575                 }
 576
 577                 code->const_nr++;
 578                 code->constant[index] = cp;
 579         }
 580
 581         REG_SET_TYPE(reg, REG_TYPE_CONST);
 582         REG_SET_INDEX(reg, index);
 583         REG_SET_VALID(reg, GL_TRUE);
 584         return reg;
 585 }
 586
 587 static inline GLuint negate(GLuint r)
 588 {
 589         REG_NEGS(r);
 590         REG_NEGV(r);
 591         return r;
 592 }
 593
 594 /* Hack, to prevent clobbering sources used multiple times when
 595  * emulating non-native instructions
 596  */
 597 static inline GLuint keep(GLuint r)
 598 {
 599         REG_SET_NO_USE(r, GL_TRUE);
 600         return r;
 601 }
 602
 603 static inline GLuint absolute(GLuint r)
 604 {
 605         REG_ABS(r);
 606         return r;
 607 }
 608
 609 static int swz_native(struct r300_pfs_compile_state *cs,
 610                       GLuint src, GLuint * r, GLuint arbneg)
 611 {
 612         COMPILE_STATE;
 613
 614         /* Native swizzle, handle negation */
 615         src = (src & ~REG_NEGS_MASK) | (((arbneg >> 3) & 1) << REG_NEGS_SHIFT);
 616
 617         if ((arbneg & 0x7) == 0x0) {
 618                 src = src & ~REG_NEGV_MASK;
 619                 *r = src;
 620         } else if ((arbneg & 0x7) == 0x7) {
 621                 src |= REG_NEGV_MASK;
 622                 *r = src;
 623         } else {
 624                 if (!REG_GET_VALID(*r))
 625                         *r = get_temp_reg(cs);
 626                 src |= REG_NEGV_MASK;
 627                 emit_arith(cs,
 628                            PFS_OP_MAD,
 629                            *r, arbneg & 0x7, keep(src), pfs_one, pfs_zero, 0);
 630                 src = src & ~REG_NEGV_MASK;
 631                 emit_arith(cs,
 632                            PFS_OP_MAD,
 633                            *r,
 634                            (arbneg ^ 0x7) | WRITEMASK_W,
 635                            src, pfs_one, pfs_zero, 0);
 636         }
 637
 638         return 3;
 639 }
 640
 641 static int swz_emit_partial(struct r300_pfs_compile_state *cs,
 642                             GLuint src,
 643                             GLuint * r, int mask, int mc, GLuint arbneg)
 644 {
 645         COMPILE_STATE;
 646         GLuint tmp;
 647         GLuint wmask = 0;
 648
 649         if (!REG_GET_VALID(*r))
 650                 *r = get_temp_reg(cs);
 651
 652         /* A partial match, VSWZ/mask define what parts of the
 653          * desired swizzle we match
 654          */
 655         if (mc + s_mask[mask].count == 3) {
 656                 wmask = WRITEMASK_W;
 657                 src |= ((arbneg >> 3) & 1) << REG_NEGS_SHIFT;
 658         }
 659
 660         tmp = arbneg & s_mask[mask].mask;
 661         if (tmp) {
 662                 tmp = tmp ^ s_mask[mask].mask;
 663                 if (tmp) {
 664                         emit_arith(cs,
 665                                    PFS_OP_MAD,
 666                                    *r,
 667                                    arbneg & s_mask[mask].mask,
 668                                    keep(src) | REG_NEGV_MASK,
 669                                    pfs_one, pfs_zero, 0);
 670                         if (!wmask) {
 671                                 REG_SET_NO_USE(src, GL_TRUE);
 672                         } else {
 673                                 REG_SET_NO_USE(src, GL_FALSE);
 674                         }
 675                         emit_arith(cs,
 676                                    PFS_OP_MAD,
 677                                    *r, tmp | wmask, src, pfs_one, pfs_zero, 0);
 678                 } else {
 679                         if (!wmask) {
 680                                 REG_SET_NO_USE(src, GL_TRUE);
 681                         } else {
 682                                 REG_SET_NO_USE(src, GL_FALSE);
 683                         }
 684                         emit_arith(cs,
 685                                    PFS_OP_MAD,
 686                                    *r,
 687                                    (arbneg & s_mask[mask].mask) | wmask,
 688                                    src | REG_NEGV_MASK, pfs_one, pfs_zero, 0);
 689                 }
 690         } else {
 691                 if (!wmask) {
 692                         REG_SET_NO_USE(src, GL_TRUE);
 693                 } else {
 694                         REG_SET_NO_USE(src, GL_FALSE);
 695                 }
 696                 emit_arith(cs, PFS_OP_MAD,
 697                            *r,
 698                            s_mask[mask].mask | wmask,
 699                            src, pfs_one, pfs_zero, 0);
 700         }
 701
 702         return s_mask[mask].count;
 703 }
 704
 705 static GLuint do_swizzle(struct r300_pfs_compile_state *cs,
 706                          GLuint src, GLuint arbswz, GLuint arbneg)
 707 {
 708         COMPILE_STATE;
 709         GLuint r = undef;
 710         GLuint vswz;
 711         int c_mask = 0;
 712         int v_match = 0;
 713
 714         /* If swizzling from something without an XYZW native swizzle,
 715          * emit result to a temp, and do new swizzle from the temp.
 716          */
 717 #if 0
 718         if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
 719                 GLuint temp = get_temp_reg(fp);
 720                 emit_arith(fp,
 721                            PFS_OP_MAD,
 722                            temp, WRITEMASK_XYZW, src, pfs_one, pfs_zero, 0);
 723                 src = temp;
 724         }
 725 #endif
 726
 727         if (REG_GET_VSWZ(src) != SWIZZLE_XYZ || REG_GET_SSWZ(src) != SWIZZLE_W) {
 728                 GLuint vsrcswz =
 729                     (v_swiz[REG_GET_VSWZ(src)].
 730                      hash & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK)) |
 731                     REG_GET_SSWZ(src) << 9;
 732                 GLint i;
 733
 734                 GLuint newswz = 0;
 735                 GLuint offset;
 736                 for (i = 0; i < 4; ++i) {
 737                         offset = GET_SWZ(arbswz, i);
 738
 739                         newswz |=
 740                             (offset <= 3) ? GET_SWZ(vsrcswz,
 741                                                     offset) << i *
 742                             3 : offset << i * 3;
 743                 }
 744
 745                 arbswz = newswz & (SWZ_X_MASK | SWZ_Y_MASK | SWZ_Z_MASK);
 746                 REG_SET_SSWZ(src, GET_SWZ(newswz, 3));
 747         } else {
 748                 /* set scalar swizzling */
 749                 REG_SET_SSWZ(src, GET_SWZ(arbswz, 3));
 750
 751         }
 752         do {
 753                 vswz = REG_GET_VSWZ(src);
 754                 do {
 755                         int chash;
 756
 757                         REG_SET_VSWZ(src, vswz);
 758                         chash = v_swiz[REG_GET_VSWZ(src)].hash &
 759                             s_mask[c_mask].hash;
 760
 761                         if (chash == (arbswz & s_mask[c_mask].hash)) {
 762                                 if (s_mask[c_mask].count == 3) {
 763                                         v_match += swz_native(cs,
 764                                                               src, &r, arbneg);
 765                                 } else {
 766                                         v_match += swz_emit_partial(cs,
 767                                                                     src,
 768                                                                     &r,
 769                                                                     c_mask,
 770                                                                     v_match,
 771                                                                     arbneg);
 772                                 }
 773
 774                                 if (v_match == 3)
 775                                         return r;
 776
 777                                 /* Fill with something invalid.. all 0's was
 778                                  * wrong before, matched SWIZZLE_X.  So all
 779                                  * 1's will be okay for now
 780                                  */
 781                                 arbswz |= (PFS_INVAL & s_mask[c_mask].hash);
 782                         }
 783                 } while (v_swiz[++vswz].hash != PFS_INVAL);
 784                 REG_SET_VSWZ(src, SWIZZLE_XYZ);
 785         } while (s_mask[++c_mask].hash != PFS_INVAL);
 786
 787         ERROR("should NEVER get here\n");
 788         return r;
 789 }
 790
 791 static GLuint t_src(struct r300_pfs_compile_state *cs,
 792                     struct prog_src_register fpsrc)
 793 {
 794         COMPILE_STATE;
 795         GLuint r = undef;
 796
 797         switch (fpsrc.File) {
 798         case PROGRAM_TEMPORARY:
 799                 REG_SET_INDEX(r, fpsrc.Index);
 800                 REG_SET_VALID(r, GL_TRUE);
 801                 REG_SET_TYPE(r, REG_TYPE_TEMP);
 802                 break;
 803         case PROGRAM_INPUT:
 804                 REG_SET_INDEX(r, fpsrc.Index);
 805                 REG_SET_VALID(r, GL_TRUE);
 806                 REG_SET_TYPE(r, REG_TYPE_INPUT);
 807                 break;
 808         case PROGRAM_LOCAL_PARAM:
 809                 r = emit_const4fv(cs,
 810                                   fp->mesa_program.Base.LocalParams[fpsrc.
 811                                                                     Index]);
 812                 break;
 813         case PROGRAM_ENV_PARAM:
 814                 r = emit_const4fv(cs,
 815                         cs->compiler->r300->radeon.glCtx->FragmentProgram.Parameters[fpsrc.Index]);
 816                 break;
 817         case PROGRAM_STATE_VAR:
 818         case PROGRAM_NAMED_PARAM:
 819         case PROGRAM_CONSTANT:
 820                 r = emit_const4fv(cs,
 821                                   fp->mesa_program.Base.Parameters->
 822                                   ParameterValues[fpsrc.Index]);
 823                 break;
 824         case PROGRAM_BUILTIN:
 825                 switch(fpsrc.Swizzle) {
 826                 case SWIZZLE_1111: r = pfs_one; break;
 827                 case SWIZZLE_0000: r = pfs_zero; break;
 828                 default:
 829                         ERROR("bad PROGRAM_BUILTIN swizzle %u\n", fpsrc.Swizzle);
 830                         break;
 831                 }
 832                 break;
 833         default:
 834                 ERROR("unknown SrcReg->File %x\n", fpsrc.File);
 835                 return r;
 836         }
 837
 838         /* no point swizzling ONE/ZERO/HALF constants... */
 839         if (REG_GET_VSWZ(r) < SWIZZLE_111 || REG_GET_SSWZ(r) < SWIZZLE_ZERO)
 840                 r = do_swizzle(cs, r, fpsrc.Swizzle, fpsrc.NegateBase);
 841         return r;
 842 }
 843
 844 static GLuint t_scalar_src(struct r300_pfs_compile_state *cs,
 845                            struct prog_src_register fpsrc)
 846 {
 847         struct prog_src_register src = fpsrc;
 848         int sc = GET_SWZ(fpsrc.Swizzle, 0);     /* X */
 849
 850         src.Swizzle = ((sc << 0) | (sc << 3) | (sc << 6) | (sc << 9));
 851
 852         return t_src(cs, src);
 853 }
 854
 855 static GLuint t_dst(struct r300_pfs_compile_state *cs,
 856                     struct prog_dst_register dest)
 857 {
 858         COMPILE_STATE;
 859         GLuint r = undef;
 860
 861         switch (dest.File) {
 862         case PROGRAM_TEMPORARY:
 863                 REG_SET_INDEX(r, dest.Index);
 864                 REG_SET_VALID(r, GL_TRUE);
 865                 REG_SET_TYPE(r, REG_TYPE_TEMP);
 866                 return r;
 867         case PROGRAM_OUTPUT:
 868                 REG_SET_TYPE(r, REG_TYPE_OUTPUT);
 869                 switch (dest.Index) {
 870                 case FRAG_RESULT_COLR:
 871                 case FRAG_RESULT_DEPR:
 872                         REG_SET_INDEX(r, dest.Index);
 873                         REG_SET_VALID(r, GL_TRUE);
 874                         return r;
 875                 default:
 876                         ERROR("Bad DstReg->Index 0x%x\n", dest.Index);
 877                         return r;
 878                 }
 879         default:
 880                 ERROR("Bad DstReg->File 0x%x\n", dest.File);
 881                 return r;
 882         }
 883 }
 884
 885 static int t_hw_src(struct r300_pfs_compile_state *cs, GLuint src, GLboolean tex)
 886 {
 887         COMPILE_STATE;
 888         int idx;
 889         int index = REG_GET_INDEX(src);
 890
 891         switch (REG_GET_TYPE(src)) {
 892         case REG_TYPE_TEMP:
 893                 /* NOTE: if reg==-1 here, a source is being read that
 894                  *       hasn't been written to. Undefined results.
 895                  */
 896                 if (cs->temps[index].reg == -1)
 897                         cs->temps[index].reg = get_hw_temp(cs, cs->nrslots);
 898
 899                 idx = cs->temps[index].reg;
 900
 901                 if (!REG_GET_NO_USE(src) && (--cs->temps[index].refcount == 0))
 902                         free_temp(cs, src);
 903                 break;
 904         case REG_TYPE_INPUT:
 905                 idx = cs->inputs[index].reg;
 906
 907                 if (!REG_GET_NO_USE(src) && (--cs->inputs[index].refcount == 0))
 908                         free_hw_temp(cs, cs->inputs[index].reg);
 909                 break;
 910         case REG_TYPE_CONST:
 911                 return (index | SRC_CONST);
 912         default:
 913                 ERROR("Invalid type for source reg\n");
 914                 return (0 | SRC_CONST);
 915         }
 916
 917         if (!tex)
 918                 cs->used_in_node |= (1 << idx);
 919
 920         return idx;
 921 }
 922
 923 static int t_hw_dst(struct r300_pfs_compile_state *cs,
 924                     GLuint dest, GLboolean tex, int slot)
 925 {
 926         COMPILE_STATE;
 927         int idx;
 928         GLuint index = REG_GET_INDEX(dest);
 929         assert(REG_GET_VALID(dest));
 930
 931         switch (REG_GET_TYPE(dest)) {
 932         case REG_TYPE_TEMP:
 933                 if (cs->temps[REG_GET_INDEX(dest)].reg == -1) {
 934                         if (!tex) {
 935                                 cs->temps[index].reg = get_hw_temp(cs, slot);
 936                         } else {
 937                                 cs->temps[index].reg = get_hw_temp_tex(cs);
 938                         }
 939                 }
 940                 idx = cs->temps[index].reg;
 941
 942                 if (!REG_GET_NO_USE(dest) && (--cs->temps[index].refcount == 0))
 943                         free_temp(cs, dest);
 944
 945                 cs->dest_in_node |= (1 << idx);
 946                 cs->used_in_node |= (1 << idx);
 947                 break;
 948         case REG_TYPE_OUTPUT:
 949                 switch (index) {
 950                 case FRAG_RESULT_COLR:
 951                         code->node[code->cur_node].flags |= R300_RGBA_OUT;
 952                         break;
 953                 case FRAG_RESULT_DEPR:
 954                         fp->WritesDepth = GL_TRUE;
 955                         code->node[code->cur_node].flags |= R300_W_OUT;
 956                         break;
 957                 }
 958                 return index;
 959                 break;
 960         default:
 961                 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
 962                 return 0;
 963         }
 964
 965         return idx;
 966 }
 967
 968 static void emit_nop(struct r300_pfs_compile_state *cs)
 969 {
 970         COMPILE_STATE;
 971
 972         if (cs->nrslots >= PFS_MAX_ALU_INST) {
 973                 ERROR("Out of ALU instruction slots\n");
 974                 return;
 975         }
 976
 977         code->alu.inst[cs->nrslots].inst0 = NOP_INST0;
 978         code->alu.inst[cs->nrslots].inst1 = NOP_INST1;
 979         code->alu.inst[cs->nrslots].inst2 = NOP_INST2;
 980         code->alu.inst[cs->nrslots].inst3 = NOP_INST3;
 981         cs->nrslots++;
 982 }
 983
 984 static void emit_tex(struct r300_pfs_compile_state *cs,
 985                      struct prog_instruction *fpi, int opcode)
 986 {
 987         COMPILE_STATE;
 988         GLuint coord = t_src(cs, fpi->SrcReg[0]);
 989         GLuint dest = undef;
 990         GLuint din, uin;
 991         int unit = fpi->TexSrcUnit;
 992         int hwsrc, hwdest;
 993
 994         /* Ensure correct node indirection */
 995         uin = cs->used_in_node;
 996         din = cs->dest_in_node;
 997
 998         /* Resolve source/dest to hardware registers */
 999         hwsrc = t_hw_src(cs, coord, GL_TRUE);
1000
1001         if (opcode != R300_TEX_OP_KIL) {
1002                 dest = t_dst(cs, fpi->DstReg);
1003
1004                 hwdest =
1005                     t_hw_dst(cs, dest, GL_TRUE,
1006                              code->node[code->cur_node].alu_offset);
1007
1008                 /* Use a temp that hasn't been used in this node, rather
1009                  * than causing an indirection
1010                  */
1011                 if (uin & (1 << hwdest)) {
1012                         free_hw_temp(cs, hwdest);
1013                         hwdest = get_hw_temp_tex(cs);
1014                         cs->temps[REG_GET_INDEX(dest)].reg = hwdest;
1015                 }
1016         } else {
1017                 hwdest = 0;
1018                 unit = 0;
1019         }
1020
1021         /* Indirection if source has been written in this node, or if the
1022          * dest has been read/written in this node
1023          */
1024         if ((REG_GET_TYPE(coord) != REG_TYPE_CONST &&
1025              (din & (1 << hwsrc))) || (uin & (1 << hwdest))) {
1026
1027                 /* Finish off current node */
1028                 if (code->node[code->cur_node].alu_offset == cs->nrslots)
1029                         emit_nop(cs);
1030
1031                 code->node[code->cur_node].alu_end =
1032                     cs->nrslots - code->node[code->cur_node].alu_offset - 1;
1033                 assert(code->node[code->cur_node].alu_end >= 0);
1034
1035                 if (++code->cur_node >= PFS_MAX_TEX_INDIRECT) {
1036                         ERROR("too many levels of texture indirection\n");
1037                         return;
1038                 }
1039
1040                 /* Start new node */
1041                 code->node[code->cur_node].tex_offset = code->tex.length;
1042                 code->node[code->cur_node].alu_offset = cs->nrslots;
1043                 code->node[code->cur_node].tex_end = -1;
1044                 code->node[code->cur_node].alu_end = -1;
1045                 code->node[code->cur_node].flags = 0;
1046                 cs->used_in_node = 0;
1047                 cs->dest_in_node = 0;
1048         }
1049
1050         if (code->cur_node == 0)
1051                 code->first_node_has_tex = 1;
1052
1053         code->tex.inst[code->tex.length++] = 0 | (hwsrc << R300_SRC_ADDR_SHIFT)
1054             | (hwdest << R300_DST_ADDR_SHIFT)
1055             | (unit << R300_TEX_ID_SHIFT)
1056             | (opcode << R300_TEX_INST_SHIFT);
1057
1058         cs->dest_in_node |= (1 << hwdest);
1059         if (REG_GET_TYPE(coord) != REG_TYPE_CONST)
1060                 cs->used_in_node |= (1 << hwsrc);
1061
1062         code->node[code->cur_node].tex_end++;
1063 }
1064
1065 /**
1066  * Returns the first slot where we could possibly allow writing to dest,
1067  * according to register allocation.
1068  */
1069 static int get_earliest_allowed_write(struct r300_pfs_compile_state *cs,
1070                                       GLuint dest, int mask)
1071 {
1072         COMPILE_STATE;
1073         int idx;
1074         int pos;
1075         GLuint index = REG_GET_INDEX(dest);
1076         assert(REG_GET_VALID(dest));
1077
1078         switch (REG_GET_TYPE(dest)) {
1079         case REG_TYPE_TEMP:
1080                 if (cs->temps[index].reg == -1)
1081                         return 0;
1082
1083                 idx = cs->temps[index].reg;
1084                 break;
1085         case REG_TYPE_OUTPUT:
1086                 return 0;
1087         default:
1088                 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest));
1089                 return 0;
1090         }
1091
1092         pos = cs->hwtemps[idx].reserved;
1093         if (mask & WRITEMASK_XYZ) {
1094                 if (pos < cs->hwtemps[idx].vector_lastread)
1095                         pos = cs->hwtemps[idx].vector_lastread;
1096         }
1097         if (mask & WRITEMASK_W) {
1098                 if (pos < cs->hwtemps[idx].scalar_lastread)
1099                         pos = cs->hwtemps[idx].scalar_lastread;
1100         }
1101
1102         return pos;
1103 }
1104
1105 /**
1106  * Allocates a slot for an ALU instruction that can consist of
1107  * a vertex part or a scalar part or both.
1108  *
1109  * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1110  * appropriate position (vector and/or scalar), and their positions are
1111  * recorded in the srcpos array.
1112  *
1113  * This function emits instruction code for the source fetch and the
1114  * argument selection. It does not emit instruction code for the
1115  * opcode or the destination selection.
1116  *
1117  * @return the index of the slot
1118  */
1119 static int find_and_prepare_slot(struct r300_pfs_compile_state *cs,
1120                                  GLboolean emit_vop,
1121                                  GLboolean emit_sop,
1122                                  int argc, GLuint * src, GLuint dest, int mask)
1123 {
1124         COMPILE_STATE;
1125         int hwsrc[3];
1126         int srcpos[3];
1127         unsigned int used;
1128         int tempused;
1129         int tempvsrc[3];
1130         int tempssrc[3];
1131         int pos;
1132         int regnr;
1133         int i, j;
1134
1135         // Determine instruction slots, whether sources are required on
1136         // vector or scalar side, and the smallest slot number where
1137         // all source registers are available
1138         used = 0;
1139         if (emit_vop)
1140                 used |= SLOT_OP_VECTOR;
1141         if (emit_sop)
1142                 used |= SLOT_OP_SCALAR;
1143
1144         pos = get_earliest_allowed_write(cs, dest, mask);
1145
1146         if (code->node[code->cur_node].alu_offset > pos)
1147                 pos = code->node[code->cur_node].alu_offset;
1148         for (i = 0; i < argc; ++i) {
1149                 if (!REG_GET_BUILTIN(src[i])) {
1150                         if (emit_vop)
1151                                 used |= v_swiz[REG_GET_VSWZ(src[i])].flags << i;
1152                         if (emit_sop)
1153                                 used |= s_swiz[REG_GET_SSWZ(src[i])].flags << i;
1154                 }
1155
1156                 hwsrc[i] = t_hw_src(cs, src[i], GL_FALSE);      /* Note: sideeffects wrt refcounting! */
1157                 regnr = hwsrc[i] & 31;
1158
1159                 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1160                         if (used & (SLOT_SRC_VECTOR << i)) {
1161                                 if (cs->hwtemps[regnr].vector_valid > pos)
1162                                         pos = cs->hwtemps[regnr].vector_valid;
1163                         }
1164                         if (used & (SLOT_SRC_SCALAR << i)) {
1165                                 if (cs->hwtemps[regnr].scalar_valid > pos)
1166                                         pos = cs->hwtemps[regnr].scalar_valid;
1167                         }
1168                 }
1169         }
1170
1171         // Find a slot that fits
1172         for (;; ++pos) {
1173                 if (cs->slot[pos].used & used & SLOT_OP_BOTH)
1174                         continue;
1175
1176                 if (pos >= cs->nrslots) {
1177                         if (cs->nrslots >= PFS_MAX_ALU_INST) {
1178                                 ERROR("Out of ALU instruction slots\n");
1179                                 return -1;
1180                         }
1181
1182                         code->alu.inst[pos].inst0 = NOP_INST0;
1183                         code->alu.inst[pos].inst1 = NOP_INST1;
1184                         code->alu.inst[pos].inst2 = NOP_INST2;
1185                         code->alu.inst[pos].inst3 = NOP_INST3;
1186
1187                         cs->nrslots++;
1188                 }
1189                 // Note: When we need both parts (vector and scalar) of a source,
1190                 // we always try to put them into the same position. This makes the
1191                 // code easier to read, and it is optimal (i.e. one doesn't gain
1192                 // anything by splitting the parts).
1193                 // It also avoids headaches with swizzles that access both parts (i.e WXY)
1194                 tempused = cs->slot[pos].used;
1195                 for (i = 0; i < 3; ++i) {
1196                         tempvsrc[i] = cs->slot[pos].vsrc[i];
1197                         tempssrc[i] = cs->slot[pos].ssrc[i];
1198                 }
1199
1200                 for (i = 0; i < argc; ++i) {
1201                         int flags = (used >> i) & SLOT_SRC_BOTH;
1202
1203                         if (!flags) {
1204                                 srcpos[i] = 0;
1205                                 continue;
1206                         }
1207
1208                         for (j = 0; j < 3; ++j) {
1209                                 if ((tempused >> j) & flags & SLOT_SRC_VECTOR) {
1210                                         if (tempvsrc[j] != hwsrc[i])
1211                                                 continue;
1212                                 }
1213
1214                                 if ((tempused >> j) & flags & SLOT_SRC_SCALAR) {
1215                                         if (tempssrc[j] != hwsrc[i])
1216                                                 continue;
1217                                 }
1218
1219                                 break;
1220                         }
1221
1222                         if (j == 3)
1223                                 break;
1224
1225                         srcpos[i] = j;
1226                         tempused |= flags << j;
1227                         if (flags & SLOT_SRC_VECTOR)
1228                                 tempvsrc[j] = hwsrc[i];
1229                         if (flags & SLOT_SRC_SCALAR)
1230                                 tempssrc[j] = hwsrc[i];
1231                 }
1232
1233                 if (i == argc)
1234                         break;
1235         }
1236
1237         // Found a slot, reserve it
1238         cs->slot[pos].used = tempused | (used & SLOT_OP_BOTH);
1239         for (i = 0; i < 3; ++i) {
1240                 cs->slot[pos].vsrc[i] = tempvsrc[i];
1241                 cs->slot[pos].ssrc[i] = tempssrc[i];
1242         }
1243
1244         for (i = 0; i < argc; ++i) {
1245                 if (REG_GET_TYPE(src[i]) == REG_TYPE_TEMP) {
1246                         int regnr = hwsrc[i] & 31;
1247
1248                         if (used & (SLOT_SRC_VECTOR << i)) {
1249                                 if (cs->hwtemps[regnr].vector_lastread < pos)
1250                                         cs->hwtemps[regnr].vector_lastread =
1251                                             pos;
1252                         }
1253                         if (used & (SLOT_SRC_SCALAR << i)) {
1254                                 if (cs->hwtemps[regnr].scalar_lastread < pos)
1255                                         cs->hwtemps[regnr].scalar_lastread =
1256                                             pos;
1257                         }
1258                 }
1259         }
1260
1261         // Emit the source fetch code
1262         code->alu.inst[pos].inst1 &= ~R300_ALU_SRC_MASK;
1263         code->alu.inst[pos].inst1 |=
1264             ((cs->slot[pos].vsrc[0] << R300_ALU_SRC0C_SHIFT) |
1265              (cs->slot[pos].vsrc[1] << R300_ALU_SRC1C_SHIFT) |
1266              (cs->slot[pos].vsrc[2] << R300_ALU_SRC2C_SHIFT));
1267
1268         code->alu.inst[pos].inst3 &= ~R300_ALU_SRC_MASK;
1269         code->alu.inst[pos].inst3 |=
1270             ((cs->slot[pos].ssrc[0] << R300_ALU_SRC0A_SHIFT) |
1271              (cs->slot[pos].ssrc[1] << R300_ALU_SRC1A_SHIFT) |
1272              (cs->slot[pos].ssrc[2] << R300_ALU_SRC2A_SHIFT));
1273
1274         // Emit the argument selection code
1275         if (emit_vop) {
1276                 int swz[3];
1277
1278                 for (i = 0; i < 3; ++i) {
1279                         if (i < argc) {
1280                                 swz[i] = (v_swiz[REG_GET_VSWZ(src[i])].base +
1281                                           (srcpos[i] *
1282                                            v_swiz[REG_GET_VSWZ(src[i])].
1283                                            stride)) | ((src[i] & REG_NEGV_MASK)
1284                                                        ? ARG_NEG : 0) | ((src[i]
1285                                                                           &
1286                                                                           REG_ABS_MASK)
1287                                                                          ?
1288                                                                          ARG_ABS
1289                                                                          : 0);
1290                         } else {
1291                                 swz[i] = R300_ALU_ARGC_ZERO;
1292                         }
1293                 }
1294
1295                 code->alu.inst[pos].inst0 &=
1296                     ~(R300_ALU_ARG0C_MASK | R300_ALU_ARG1C_MASK |
1297                       R300_ALU_ARG2C_MASK);
1298                 code->alu.inst[pos].inst0 |=
1299                     (swz[0] << R300_ALU_ARG0C_SHIFT) | (swz[1] <<
1300                                                          R300_ALU_ARG1C_SHIFT)
1301                     | (swz[2] << R300_ALU_ARG2C_SHIFT);
1302         }
1303
1304         if (emit_sop) {
1305                 int swz[3];
1306
1307                 for (i = 0; i < 3; ++i) {
1308                         if (i < argc) {
1309                                 swz[i] = (s_swiz[REG_GET_SSWZ(src[i])].base +
1310                                           (srcpos[i] *
1311                                            s_swiz[REG_GET_SSWZ(src[i])].
1312                                            stride)) | ((src[i] & REG_NEGV_MASK)
1313                                                        ? ARG_NEG : 0) | ((src[i]
1314                                                                           &
1315                                                                           REG_ABS_MASK)
1316                                                                          ?
1317                                                                          ARG_ABS
1318                                                                          : 0);
1319                         } else {
1320                                 swz[i] = R300_ALU_ARGA_ZERO;
1321                         }
1322                 }
1323
1324                 code->alu.inst[pos].inst2 &=
1325                     ~(R300_ALU_ARG0A_MASK | R300_ALU_ARG1A_MASK |
1326                       R300_ALU_ARG2A_MASK);
1327                 code->alu.inst[pos].inst2 |=
1328                     (swz[0] << R300_ALU_ARG0A_SHIFT) | (swz[1] <<
1329                                                          R300_ALU_ARG1A_SHIFT)
1330                     | (swz[2] << R300_ALU_ARG2A_SHIFT);
1331         }
1332
1333         return pos;
1334 }
1335
1336 /**
1337  * Append an ALU instruction to the instruction list.
1338  */
1339 static void emit_arith(struct r300_pfs_compile_state *cs,
1340                        int op,
1341                        GLuint dest,
1342                        int mask,
1343                        GLuint src0, GLuint src1, GLuint src2, int flags)
1344 {
1345         COMPILE_STATE;
1346         GLuint src[3] = { src0, src1, src2 };
1347         int hwdest;
1348         GLboolean emit_vop, emit_sop;
1349         int vop, sop, argc;
1350         int pos;
1351
1352         vop = r300_fpop[op].v_op;
1353         sop = r300_fpop[op].s_op;
1354         argc = r300_fpop[op].argc;
1355
1356         if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT &&
1357             REG_GET_INDEX(dest) == FRAG_RESULT_DEPR) {
1358                 if (mask & WRITEMASK_Z) {
1359                         mask = WRITEMASK_W;
1360                 } else {
1361                         return;
1362                 }
1363         }
1364
1365         emit_vop = GL_FALSE;
1366         emit_sop = GL_FALSE;
1367         if ((mask & WRITEMASK_XYZ) || vop == R300_ALU_OUTC_DP3)
1368                 emit_vop = GL_TRUE;
1369         if ((mask & WRITEMASK_W) || vop == R300_ALU_OUTC_REPL_ALPHA)
1370                 emit_sop = GL_TRUE;
1371
1372         pos =
1373             find_and_prepare_slot(cs, emit_vop, emit_sop, argc, src, dest,
1374                                   mask);
1375         if (pos < 0)
1376                 return;
1377
1378         hwdest = t_hw_dst(cs, dest, GL_FALSE, pos);     /* Note: Side effects wrt register allocation */
1379
1380         if (flags & PFS_FLAG_SAT) {
1381                 vop |= R300_ALU_OUTC_CLAMP;
1382                 sop |= R300_ALU_OUTA_CLAMP;
1383         }
1384
1385         /* Throw the pieces together and get ALU/1 */
1386         if (emit_vop) {
1387                 code->alu.inst[pos].inst0 |= vop;
1388
1389                 code->alu.inst[pos].inst1 |= hwdest << R300_ALU_DSTC_SHIFT;
1390
1391                 if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1392                         if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1393                                 code->alu.inst[pos].inst1 |=
1394                                     (mask & WRITEMASK_XYZ) <<
1395                                     R300_ALU_DSTC_OUTPUT_MASK_SHIFT;
1396                         } else
1397                                 assert(0);
1398                 } else {
1399                         code->alu.inst[pos].inst1 |=
1400                             (mask & WRITEMASK_XYZ) <<
1401                             R300_ALU_DSTC_REG_MASK_SHIFT;
1402
1403                         cs->hwtemps[hwdest].vector_valid = pos + 1;
1404                 }
1405         }
1406
1407         /* And now ALU/3 */
1408         if (emit_sop) {
1409                 code->alu.inst[pos].inst2 |= sop;
1410
1411                 if (mask & WRITEMASK_W) {
1412                         if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1413                                 if (REG_GET_INDEX(dest) == FRAG_RESULT_COLR) {
1414                                         code->alu.inst[pos].inst3 |=
1415                                             (hwdest << R300_ALU_DSTA_SHIFT) |
1416                                             R300_ALU_DSTA_OUTPUT;
1417                                 } else if (REG_GET_INDEX(dest) ==
1418                                            FRAG_RESULT_DEPR) {
1419                                         code->alu.inst[pos].inst3 |=
1420                                             R300_ALU_DSTA_DEPTH;
1421                                 } else
1422                                         assert(0);
1423                         } else {
1424                                 code->alu.inst[pos].inst3 |=
1425                                     (hwdest << R300_ALU_DSTA_SHIFT) |
1426                                     R300_ALU_DSTA_REG;
1427
1428                                 cs->hwtemps[hwdest].scalar_valid = pos + 1;
1429                         }
1430                 }
1431         }
1432
1433         return;
1434 }
1435
1436 static GLfloat SinCosConsts[2][4] = {
1437         {
1438          1.273239545,           // 4/PI
1439          -0.405284735,          // -4/(PI*PI)
1440          3.141592654,           // PI
1441          0.2225                 // weight
1442          },
1443         {
1444          0.75,
1445          0.0,
1446          0.159154943,           // 1/(2*PI)
1447          6.283185307            // 2*PI
1448          }
1449 };
1450
1451 /**
1452  * Emit a LIT instruction.
1453  * \p flags may be PFS_FLAG_SAT
1454  *
1455  * Definition of LIT (from ARB_fragment_program):
1456  * tmp = VectorLoad(op0);
1457  * if (tmp.x < 0) tmp.x = 0;
1458  * if (tmp.y < 0) tmp.y = 0;
1459  * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
1460  * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
1461  * result.x = 1.0;
1462  * result.y = tmp.x;
1463  * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
1464  * result.w = 1.0;
1465  *
1466  * The longest path of computation is the one leading to result.z,
1467  * consisting of 5 operations. This implementation of LIT takes
1468  * 5 slots. So unless there's some special undocumented opcode,
1469  * this implementation is potentially optimal. Unfortunately,
1470  * emit_arith is a bit too conservative because it doesn't understand
1471  * partial writes to the vector component.
1472  */
1473 static const GLfloat LitConst[4] =
1474     { 127.999999, 127.999999, 127.999999, -127.999999 };
1475
1476 static void emit_lit(struct r300_pfs_compile_state *cs,
1477                      GLuint dest, int mask, GLuint src, int flags)
1478 {
1479         COMPILE_STATE;
1480         GLuint cnst;
1481         int needTemporary;
1482         GLuint temp;
1483
1484         cnst = emit_const4fv(cs, LitConst);
1485
1486         needTemporary = 0;
1487         if ((mask & WRITEMASK_XYZW) != WRITEMASK_XYZW) {
1488                 needTemporary = 1;
1489         } else if (REG_GET_TYPE(dest) == REG_TYPE_OUTPUT) {
1490                 // LIT is typically followed by DP3/DP4, so there's no point
1491                 // in creating special code for this case
1492                 needTemporary = 1;
1493         }
1494
1495         if (needTemporary) {
1496                 temp = keep(get_temp_reg(cs));
1497         } else {
1498                 temp = keep(dest);
1499         }
1500
1501         // Note: The order of emit_arith inside the slots is relevant,
1502         // because emit_arith only looks at scalar vs. vector when resolving
1503         // dependencies, and it does not consider individual vector components,
1504         // so swizzling between the two parts can create fake dependencies.
1505
1506         // First slot
1507         emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_XY,
1508                    keep(src), pfs_zero, undef, 0);
1509         emit_arith(cs, PFS_OP_MAX, temp, WRITEMASK_W, src, cnst, undef, 0);
1510
1511         // Second slot
1512         emit_arith(cs, PFS_OP_MIN, temp, WRITEMASK_Z,
1513                    swizzle(temp, W, W, W, W), cnst, undef, 0);
1514         emit_arith(cs, PFS_OP_LG2, temp, WRITEMASK_W,
1515                    swizzle(temp, Y, Y, Y, Y), undef, undef, 0);
1516
1517         // Third slot
1518         // If desired, we saturate the y result here.
1519         // This does not affect the use as a condition variable in the CMP later
1520         emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W,
1521                    temp, swizzle(temp, Z, Z, Z, Z), pfs_zero, 0);
1522         emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_Y,
1523                    swizzle(temp, X, X, X, X), pfs_one, pfs_zero, flags);
1524
1525         // Fourth slot
1526         emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_X,
1527                    pfs_one, pfs_one, pfs_zero, 0);
1528         emit_arith(cs, PFS_OP_EX2, temp, WRITEMASK_W, temp, undef, undef, 0);
1529
1530         // Fifth slot
1531         emit_arith(cs, PFS_OP_CMP, temp, WRITEMASK_Z,
1532                    pfs_zero, swizzle(temp, W, W, W, W),
1533                    negate(swizzle(temp, Y, Y, Y, Y)), flags);
1534         emit_arith(cs, PFS_OP_MAD, temp, WRITEMASK_W, pfs_one, pfs_one,
1535                    pfs_zero, 0);
1536
1537         if (needTemporary) {
1538                 emit_arith(cs, PFS_OP_MAD, dest, mask,
1539                            temp, pfs_one, pfs_zero, flags);
1540                 free_temp(cs, temp);
1541         } else {
1542                 // Decrease refcount of the destination
1543                 t_hw_dst(cs, dest, GL_FALSE, cs->nrslots);
1544         }
1545 }
1546
1547 static void emit_instruction(struct r300_pfs_compile_state *cs, struct prog_instruction *fpi)
1548 {
1549         COMPILE_STATE;
1550         GLuint src[3], dest, temp[2];
1551         int flags, mask = 0;
1552         int const_sin[2];
1553
1554         if (fpi->SaturateMode == SATURATE_ZERO_ONE)
1555                 flags = PFS_FLAG_SAT;
1556         else
1557                 flags = 0;
1558
1559         if (fpi->Opcode != OPCODE_KIL) {
1560                 dest = t_dst(cs, fpi->DstReg);
1561                 mask = fpi->DstReg.WriteMask;
1562         }
1563
1564         switch (fpi->Opcode) {
1565         case OPCODE_ABS:
1566                 src[0] = t_src(cs, fpi->SrcReg[0]);
1567                 emit_arith(cs, PFS_OP_MAD, dest, mask,
1568                                 absolute(src[0]), pfs_one, pfs_zero, flags);
1569                 break;
1570         case OPCODE_ADD:
1571                 src[0] = t_src(cs, fpi->SrcReg[0]);
1572                 src[1] = t_src(cs, fpi->SrcReg[1]);
1573                 emit_arith(cs, PFS_OP_MAD, dest, mask,
1574                                 src[0], pfs_one, src[1], flags);
1575                 break;
1576         case OPCODE_CMP:
1577                 src[0] = t_src(cs, fpi->SrcReg[0]);
1578                 src[1] = t_src(cs, fpi->SrcReg[1]);
1579                 src[2] = t_src(cs, fpi->SrcReg[2]);
1580                 /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1581                         *    r300 - if src2.c < 0.0 ? src1.c : src0.c
1582                         */
1583                 emit_arith(cs, PFS_OP_CMP, dest, mask,
1584                                 src[2], src[1], src[0], flags);
1585                 break;
1586         case OPCODE_COS:
1587                 /*
1588                         * cos using a parabola (see SIN):
1589                         * cos(x):
1590                         *   x = (x/(2*PI))+0.75
1591                         *   x = frac(x)
1592                         *   x = (x*2*PI)-PI
1593                         *   result = sin(x)
1594                         */
1595                 temp[0] = get_temp_reg(cs);
1596                 const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1597                 const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1598                 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1599
1600                 /* add 0.5*PI and do range reduction */
1601
1602                 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1603                                 swizzle(src[0], X, X, X, X),
1604                                 swizzle(const_sin[1], Z, Z, Z, Z),
1605                                 swizzle(const_sin[1], X, X, X, X), 0);
1606
1607                 emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
1608                                 swizzle(temp[0], X, X, X, X),
1609                                 undef, undef, 0);
1610
1611                 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),       //2*PI
1612                                 negate(swizzle(const_sin[0], Z, Z, Z, Z)),      //-PI
1613                                 0);
1614
1615                 /* SIN */
1616
1617                 emit_arith(cs, PFS_OP_MAD, temp[0],
1618                                 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1619                                                                 Z, Z, Z,
1620                                                                 Z),
1621                                 const_sin[0], pfs_zero, 0);
1622
1623                 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1624                                 swizzle(temp[0], Y, Y, Y, Y),
1625                                 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1626                                 swizzle(temp[0], X, X, X, X), 0);
1627
1628                 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1629                                 swizzle(temp[0], X, X, X, X),
1630                                 absolute(swizzle(temp[0], X, X, X, X)),
1631                                 negate(swizzle(temp[0], X, X, X, X)), 0);
1632
1633                 emit_arith(cs, PFS_OP_MAD, dest, mask,
1634                                 swizzle(temp[0], Y, Y, Y, Y),
1635                                 swizzle(const_sin[0], W, W, W, W),
1636                                 swizzle(temp[0], X, X, X, X), flags);
1637
1638                 free_temp(cs, temp[0]);
1639                 break;
1640         case OPCODE_DP3:
1641                 src[0] = t_src(cs, fpi->SrcReg[0]);
1642                 src[1] = t_src(cs, fpi->SrcReg[1]);
1643                 emit_arith(cs, PFS_OP_DP3, dest, mask,
1644                                 src[0], src[1], undef, flags);
1645                 break;
1646         case OPCODE_DP4:
1647                 src[0] = t_src(cs, fpi->SrcReg[0]);
1648                 src[1] = t_src(cs, fpi->SrcReg[1]);
1649                 emit_arith(cs, PFS_OP_DP4, dest, mask,
1650                                 src[0], src[1], undef, flags);
1651                 break;
1652         case OPCODE_DPH:
1653                 src[0] = t_src(cs, fpi->SrcReg[0]);
1654                 src[1] = t_src(cs, fpi->SrcReg[1]);
1655                 /* src0.xyz1 -> temp
1656                         * DP4 dest, temp, src1
1657                         */
1658                 emit_arith(cs, PFS_OP_DP4, dest, mask,
1659                                 swizzle(src[0], X, Y, Z, ONE), src[1],
1660                                 undef, flags);
1661                 break;
1662         case OPCODE_DST:
1663                 src[0] = t_src(cs, fpi->SrcReg[0]);
1664                 src[1] = t_src(cs, fpi->SrcReg[1]);
1665                 /* dest.y = src0.y * src1.y */
1666                 if (mask & WRITEMASK_Y)
1667                         emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Y,
1668                                         keep(src[0]), keep(src[1]),
1669                                         pfs_zero, flags);
1670                 /* dest.z = src0.z */
1671                 if (mask & WRITEMASK_Z)
1672                         emit_arith(cs, PFS_OP_MAD, dest, WRITEMASK_Z,
1673                                         src[0], pfs_one, pfs_zero, flags);
1674                 /* result.x = 1.0
1675                         * result.w = src1.w */
1676                 if (mask & WRITEMASK_XW) {
1677                         REG_SET_VSWZ(src[1], SWIZZLE_111);      /*Cheat */
1678                         emit_arith(cs, PFS_OP_MAD, dest,
1679                                         mask & WRITEMASK_XW,
1680                                         src[1], pfs_one, pfs_zero, flags);
1681                 }
1682                 break;
1683         case OPCODE_EX2:
1684                 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1685                 emit_arith(cs, PFS_OP_EX2, dest, mask,
1686                                 src[0], undef, undef, flags);
1687                 break;
1688         case OPCODE_FLR:
1689                 src[0] = t_src(cs, fpi->SrcReg[0]);
1690                 temp[0] = get_temp_reg(cs);
1691                 /* FRC temp, src0
1692                         * MAD dest, src0, 1.0, -temp
1693                         */
1694                 emit_arith(cs, PFS_OP_FRC, temp[0], mask,
1695                                 keep(src[0]), undef, undef, 0);
1696                 emit_arith(cs, PFS_OP_MAD, dest, mask,
1697                                 src[0], pfs_one, negate(temp[0]), flags);
1698                 free_temp(cs, temp[0]);
1699                 break;
1700         case OPCODE_FRC:
1701                 src[0] = t_src(cs, fpi->SrcReg[0]);
1702                 emit_arith(cs, PFS_OP_FRC, dest, mask,
1703                                 src[0], undef, undef, flags);
1704                 break;
1705         case OPCODE_KIL:
1706                 emit_tex(cs, fpi, R300_TEX_OP_KIL);
1707                 break;
1708         case OPCODE_LG2:
1709                 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1710                 emit_arith(cs, PFS_OP_LG2, dest, mask,
1711                                 src[0], undef, undef, flags);
1712                 break;
1713         case OPCODE_LIT:
1714                 src[0] = t_src(cs, fpi->SrcReg[0]);
1715                 emit_lit(cs, dest, mask, src[0], flags);
1716                 break;
1717         case OPCODE_LRP:
1718                 src[0] = t_src(cs, fpi->SrcReg[0]);
1719                 src[1] = t_src(cs, fpi->SrcReg[1]);
1720                 src[2] = t_src(cs, fpi->SrcReg[2]);
1721                 /* result = tmp0tmp1 + (1 - tmp0)tmp2
1722                         *        = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1723                         *     MAD temp, -tmp0, tmp2, tmp2
1724                         *     MAD result, tmp0, tmp1, temp
1725                         */
1726                 temp[0] = get_temp_reg(cs);
1727                 emit_arith(cs, PFS_OP_MAD, temp[0], mask,
1728                                 negate(keep(src[0])), keep(src[2]), src[2],
1729                                 0);
1730                 emit_arith(cs, PFS_OP_MAD, dest, mask,
1731                                 src[0], src[1], temp[0], flags);
1732                 free_temp(cs, temp[0]);
1733                 break;
1734         case OPCODE_MAD:
1735                 src[0] = t_src(cs, fpi->SrcReg[0]);
1736                 src[1] = t_src(cs, fpi->SrcReg[1]);
1737                 src[2] = t_src(cs, fpi->SrcReg[2]);
1738                 emit_arith(cs, PFS_OP_MAD, dest, mask,
1739                                 src[0], src[1], src[2], flags);
1740                 break;
1741         case OPCODE_MAX:
1742                 src[0] = t_src(cs, fpi->SrcReg[0]);
1743                 src[1] = t_src(cs, fpi->SrcReg[1]);
1744                 emit_arith(cs, PFS_OP_MAX, dest, mask,
1745                                 src[0], src[1], undef, flags);
1746                 break;
1747         case OPCODE_MIN:
1748                 src[0] = t_src(cs, fpi->SrcReg[0]);
1749                 src[1] = t_src(cs, fpi->SrcReg[1]);
1750                 emit_arith(cs, PFS_OP_MIN, dest, mask,
1751                                 src[0], src[1], undef, flags);
1752                 break;
1753         case OPCODE_MOV:
1754         case OPCODE_SWZ:
1755                 src[0] = t_src(cs, fpi->SrcReg[0]);
1756                 emit_arith(cs, PFS_OP_MAD, dest, mask,
1757                                 src[0], pfs_one, pfs_zero, flags);
1758                 break;
1759         case OPCODE_MUL:
1760                 src[0] = t_src(cs, fpi->SrcReg[0]);
1761                 src[1] = t_src(cs, fpi->SrcReg[1]);
1762                 emit_arith(cs, PFS_OP_MAD, dest, mask,
1763                                 src[0], src[1], pfs_zero, flags);
1764                 break;
1765         case OPCODE_POW:
1766                 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1767                 src[1] = t_scalar_src(cs, fpi->SrcReg[1]);
1768                 temp[0] = get_temp_reg(cs);
1769                 emit_arith(cs, PFS_OP_LG2, temp[0], WRITEMASK_W,
1770                                 src[0], undef, undef, 0);
1771                 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_W,
1772                                 temp[0], src[1], pfs_zero, 0);
1773                 emit_arith(cs, PFS_OP_EX2, dest, fpi->DstReg.WriteMask,
1774                                 temp[0], undef, undef, 0);
1775                 free_temp(cs, temp[0]);
1776                 break;
1777         case OPCODE_RCP:
1778                 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1779                 emit_arith(cs, PFS_OP_RCP, dest, mask,
1780                                 src[0], undef, undef, flags);
1781                 break;
1782         case OPCODE_RSQ:
1783                 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1784                 emit_arith(cs, PFS_OP_RSQ, dest, mask,
1785                                 absolute(src[0]), pfs_zero, pfs_zero, flags);
1786                 break;
1787         case OPCODE_SCS:
1788                 /*
1789                         * scs using a parabola :
1790                         * scs(x):
1791                         *   result.x = sin(-abs(x)+0.5*PI)  (cos)
1792                         *   result.y = sin(x)               (sin)
1793                         *
1794                         */
1795                 temp[0] = get_temp_reg(cs);
1796                 temp[1] = get_temp_reg(cs);
1797                 const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1798                 const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1799                 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1800
1801                 /* x = -abs(x)+0.5*PI */
1802                 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(const_sin[0], Z, Z, Z, Z),     //PI
1803                                 pfs_half,
1804                                 negate(abs
1805                                         (swizzle(keep(src[0]), X, X, X, X))),
1806                                 0);
1807
1808                 /* C*x (sin) */
1809                 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_W,
1810                                 swizzle(const_sin[0], Y, Y, Y, Y),
1811                                 swizzle(keep(src[0]), X, X, X, X),
1812                                 pfs_zero, 0);
1813
1814                 /* B*x, C*x (cos) */
1815                 emit_arith(cs, PFS_OP_MAD, temp[0],
1816                                 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1817                                                                 Z, Z, Z,
1818                                                                 Z),
1819                                 const_sin[0], pfs_zero, 0);
1820
1821                 /* B*x (sin) */
1822                 emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
1823                                 swizzle(const_sin[0], X, X, X, X),
1824                                 keep(src[0]), pfs_zero, 0);
1825
1826                 /* y = B*x + C*x*abs(x) (sin) */
1827                 emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_Z,
1828                                 absolute(src[0]),
1829                                 swizzle(temp[0], W, W, W, W),
1830                                 swizzle(temp[1], W, W, W, W), 0);
1831
1832                 /* y = B*x + C*x*abs(x) (cos) */
1833                 emit_arith(cs, PFS_OP_MAD, temp[1], WRITEMASK_W,
1834                                 swizzle(temp[0], Y, Y, Y, Y),
1835                                 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1836                                 swizzle(temp[0], X, X, X, X), 0);
1837
1838                 /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1839                 emit_arith(cs, PFS_OP_MAD, temp[0],
1840                                 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[1],
1841                                                                 W, Z, Y,
1842                                                                 X),
1843                                 absolute(swizzle(temp[1], W, Z, Y, X)),
1844                                 negate(swizzle(temp[1], W, Z, Y, X)), 0);
1845
1846                 /* dest.xy = mad(temp.xy, P, temp2.wz) */
1847                 emit_arith(cs, PFS_OP_MAD, dest,
1848                                 mask & (WRITEMASK_X | WRITEMASK_Y), temp[0],
1849                                 swizzle(const_sin[0], W, W, W, W),
1850                                 swizzle(temp[1], W, Z, Y, X), flags);
1851
1852                 free_temp(cs, temp[0]);
1853                 free_temp(cs, temp[1]);
1854                 break;
1855         case OPCODE_SGE:
1856                 src[0] = t_src(cs, fpi->SrcReg[0]);
1857                 src[1] = t_src(cs, fpi->SrcReg[1]);
1858                 temp[0] = get_temp_reg(cs);
1859                 /* temp = src0 - src1
1860                         * dest.c = (temp.c < 0.0) ? 0 : 1
1861                         */
1862                 emit_arith(cs, PFS_OP_MAD, temp[0], mask,
1863                                 src[0], pfs_one, negate(src[1]), 0);
1864                 emit_arith(cs, PFS_OP_CMP, dest, mask,
1865                                 pfs_one, pfs_zero, temp[0], 0);
1866                 free_temp(cs, temp[0]);
1867                 break;
1868         case OPCODE_SIN:
1869                 /*
1870                         *  using a parabola:
1871                         * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1872                         * extra precision is obtained by weighting against
1873                         * itself squared.
1874                         */
1875
1876                 temp[0] = get_temp_reg(cs);
1877                 const_sin[0] = emit_const4fv(cs, SinCosConsts[0]);
1878                 const_sin[1] = emit_const4fv(cs, SinCosConsts[1]);
1879                 src[0] = t_scalar_src(cs, fpi->SrcReg[0]);
1880
1881                 /* do range reduction */
1882
1883                 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1884                                 swizzle(keep(src[0]), X, X, X, X),
1885                                 swizzle(const_sin[1], Z, Z, Z, Z),
1886                                 pfs_half, 0);
1887
1888                 emit_arith(cs, PFS_OP_FRC, temp[0], WRITEMASK_X,
1889                                 swizzle(temp[0], X, X, X, X),
1890                                 undef, undef, 0);
1891
1892                 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Z, swizzle(temp[0], X, X, X, X), swizzle(const_sin[1], W, W, W, W),       //2*PI
1893                                 negate(swizzle(const_sin[0], Z, Z, Z, Z)),      //PI
1894                                 0);
1895
1896                 /* SIN */
1897
1898                 emit_arith(cs, PFS_OP_MAD, temp[0],
1899                                 WRITEMASK_X | WRITEMASK_Y, swizzle(temp[0],
1900                                                                 Z, Z, Z,
1901                                                                 Z),
1902                                 const_sin[0], pfs_zero, 0);
1903
1904                 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_X,
1905                                 swizzle(temp[0], Y, Y, Y, Y),
1906                                 absolute(swizzle(temp[0], Z, Z, Z, Z)),
1907                                 swizzle(temp[0], X, X, X, X), 0);
1908
1909                 emit_arith(cs, PFS_OP_MAD, temp[0], WRITEMASK_Y,
1910                                 swizzle(temp[0], X, X, X, X),
1911                                 absolute(swizzle(temp[0], X, X, X, X)),
1912                                 negate(swizzle(temp[0], X, X, X, X)), 0);
1913
1914                 emit_arith(cs, PFS_OP_MAD, dest, mask,
1915                                 swizzle(temp[0], Y, Y, Y, Y),
1916                                 swizzle(const_sin[0], W, W, W, W),
1917                                 swizzle(temp[0], X, X, X, X), flags);
1918
1919                 free_temp(cs, temp[0]);
1920                 break;
1921         case OPCODE_SLT:
1922                 src[0] = t_src(cs, fpi->SrcReg[0]);
1923                 src[1] = t_src(cs, fpi->SrcReg[1]);
1924                 temp[0] = get_temp_reg(cs);
1925                 /* temp = src0 - src1
1926                         * dest.c = (temp.c < 0.0) ? 1 : 0
1927                         */
1928                 emit_arith(cs, PFS_OP_MAD, temp[0], mask,
1929                                 src[0], pfs_one, negate(src[1]), 0);
1930                 emit_arith(cs, PFS_OP_CMP, dest, mask,
1931                                 pfs_zero, pfs_one, temp[0], 0);
1932                 free_temp(cs, temp[0]);
1933                 break;
1934         case OPCODE_SUB:
1935                 src[0] = t_src(cs, fpi->SrcReg[0]);
1936                 src[1] = t_src(cs, fpi->SrcReg[1]);
1937                 emit_arith(cs, PFS_OP_MAD, dest, mask,
1938                                 src[0], pfs_one, negate(src[1]), flags);
1939                 break;
1940         case OPCODE_TEX:
1941                 emit_tex(cs, fpi, R300_TEX_OP_LD);
1942                 break;
1943         case OPCODE_TXB:
1944                 emit_tex(cs, fpi, R300_TEX_OP_TXB);
1945                 break;
1946         case OPCODE_TXP:
1947                 emit_tex(cs, fpi, R300_TEX_OP_TXP);
1948                 break;
1949         case OPCODE_XPD:{
1950                         src[0] = t_src(cs, fpi->SrcReg[0]);
1951                         src[1] = t_src(cs, fpi->SrcReg[1]);
1952                         temp[0] = get_temp_reg(cs);
1953                         /* temp = src0.zxy * src1.yzx */
1954                         emit_arith(cs, PFS_OP_MAD, temp[0],
1955                                         WRITEMASK_XYZ, swizzle(keep(src[0]),
1956                                                                 Z, X, Y, W),
1957                                         swizzle(keep(src[1]), Y, Z, X, W),
1958                                         pfs_zero, 0);
1959                         /* dest.xyz = src0.yzx * src1.zxy - temp
1960                                 * dest.w       = undefined
1961                                 * */
1962                         emit_arith(cs, PFS_OP_MAD, dest,
1963                                         mask & WRITEMASK_XYZ, swizzle(src[0],
1964                                                                         Y, Z,
1965                                                                         X, W),
1966                                         swizzle(src[1], Z, X, Y, W),
1967                                         negate(temp[0]), flags);
1968                         /* cleanup */
1969                         free_temp(cs, temp[0]);
1970                         break;
1971                 }
1972         default:
1973                 ERROR("unknown fpi->Opcode %d\n", fpi->Opcode);
1974                 break;
1975         }
1976 }
1977
1978 static GLboolean parse_program(struct r300_pfs_compile_state *cs)
1979 {
1980         COMPILE_STATE;
1981         int clauseidx;
1982
1983         for (clauseidx = 0; clauseidx < cs->compiler->compiler.NumClauses; ++clauseidx) {
1984                 struct radeon_clause* clause = &cs->compiler->compiler.Clauses[clauseidx];
1985                 int ip;
1986
1987                 for(ip = 0; ip < clause->NumInstructions; ++ip) {
1988                         emit_instruction(cs, clause->Instructions + ip);
1989
1990                         if (fp->error)
1991                                 return GL_FALSE;
1992                 }
1993         }
1994
1995         return GL_TRUE;
1996 }
1997
1998
1999 /* - Init structures
2000  * - Determine what hwregs each input corresponds to
2001  */
2002 static void init_program(struct r300_pfs_compile_state *cs)
2003 {
2004         COMPILE_STATE;
2005         struct gl_fragment_program *mp = &fp->mesa_program;
2006         GLuint InputsRead = mp->Base.InputsRead;
2007         GLuint temps_used = 0;  /* for fp->temps[] */
2008         int i, j;
2009
2010         /* New compile, reset tracking data */
2011         fp->optimization =
2012             driQueryOptioni(&cs->compiler->r300->radeon.optionCache, "fp_optimization");
2013         fp->translated = GL_FALSE;
2014         fp->error = GL_FALSE;
2015         fp->WritesDepth = GL_FALSE;
2016         code->tex.length = 0;
2017         code->cur_node = 0;
2018         code->first_node_has_tex = 0;
2019         code->const_nr = 0;
2020         code->max_temp_idx = 0;
2021         code->node[0].alu_end = -1;
2022         code->node[0].tex_end = -1;
2023
2024         for (i = 0; i < PFS_MAX_ALU_INST; i++) {
2025                 for (j = 0; j < 3; j++) {
2026                         cs->slot[i].vsrc[j] = SRC_CONST;
2027                         cs->slot[i].ssrc[j] = SRC_CONST;
2028                 }
2029         }
2030
2031         /* Work out what temps the Mesa inputs correspond to, this must match
2032          * what setup_rs_unit does, which shouldn't be a problem as rs_unit
2033          * configures itself based on the fragprog's InputsRead
2034          *
2035          * NOTE: this depends on get_hw_temp() allocating registers in order,
2036          * starting from register 0.
2037          */
2038
2039         /* Texcoords come first */
2040         for (i = 0; i < cs->compiler->r300->radeon.glCtx->Const.MaxTextureUnits; i++) {
2041                 if (InputsRead & (FRAG_BIT_TEX0 << i)) {
2042                         cs->inputs[FRAG_ATTRIB_TEX0 + i].refcount = 0;
2043                         cs->inputs[FRAG_ATTRIB_TEX0 + i].reg =
2044                             get_hw_temp(cs, 0);
2045                 }
2046         }
2047         InputsRead &= ~FRAG_BITS_TEX_ANY;
2048
2049         /* fragment position treated as a texcoord */
2050         if (InputsRead & FRAG_BIT_WPOS) {
2051                 cs->inputs[FRAG_ATTRIB_WPOS].refcount = 0;
2052                 cs->inputs[FRAG_ATTRIB_WPOS].reg = get_hw_temp(cs, 0);
2053         }
2054         InputsRead &= ~FRAG_BIT_WPOS;
2055
2056         /* Then primary colour */
2057         if (InputsRead & FRAG_BIT_COL0) {
2058                 cs->inputs[FRAG_ATTRIB_COL0].refcount = 0;
2059                 cs->inputs[FRAG_ATTRIB_COL0].reg = get_hw_temp(cs, 0);
2060         }
2061         InputsRead &= ~FRAG_BIT_COL0;
2062
2063         /* Secondary color */
2064         if (InputsRead & FRAG_BIT_COL1) {
2065                 cs->inputs[FRAG_ATTRIB_COL1].refcount = 0;
2066                 cs->inputs[FRAG_ATTRIB_COL1].reg = get_hw_temp(cs, 0);
2067         }
2068         InputsRead &= ~FRAG_BIT_COL1;
2069
2070         /* Anything else */
2071         if (InputsRead) {
2072                 WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead);
2073                 /* force read from hwreg 0 for now */
2074                 for (i = 0; i < 32; i++)
2075                         if (InputsRead & (1 << i))
2076                                 cs->inputs[i].reg = 0;
2077         }
2078
2079         /* Pre-parse the program, grabbing refcounts on input/temp regs.
2080          * That way, we can free up the reg when it's no longer needed
2081          */
2082         for (i = 0; i < cs->compiler->compiler.Clauses[0].NumInstructions; ++i) {
2083                 struct prog_instruction *fpi = cs->compiler->compiler.Clauses[0].Instructions + i;
2084                 int idx;
2085
2086                 for (j = 0; j < 3; j++) {
2087                         idx = fpi->SrcReg[j].Index;
2088                         switch (fpi->SrcReg[j].File) {
2089                         case PROGRAM_TEMPORARY:
2090                                 if (!(temps_used & (1 << idx))) {
2091                                         cs->temps[idx].reg = -1;
2092                                         cs->temps[idx].refcount = 1;
2093                                         temps_used |= (1 << idx);
2094                                 } else
2095                                         cs->temps[idx].refcount++;
2096                                 break;
2097                         case PROGRAM_INPUT:
2098                                 cs->inputs[idx].refcount++;
2099                                 break;
2100                         default:
2101                                 break;
2102                         }
2103                 }
2104
2105                 idx = fpi->DstReg.Index;
2106                 if (fpi->DstReg.File == PROGRAM_TEMPORARY) {
2107                         if (!(temps_used & (1 << idx))) {
2108                                 cs->temps[idx].reg = -1;
2109                                 cs->temps[idx].refcount = 1;
2110                                 temps_used |= (1 << idx);
2111                         } else
2112                                 cs->temps[idx].refcount++;
2113                 }
2114         }
2115         cs->temp_in_use = temps_used;
2116 }
2117
2118
2119 /**
2120  * Final compilation step: Turn the intermediate radeon_program into
2121  * machine-readable instructions.
2122  */
2123 GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler)
2124 {
2125         struct r300_pfs_compile_state cs;
2126         struct r300_fragment_program_code *code = compiler->code;
2127
2128         _mesa_memset(&cs, 0, sizeof(cs));
2129         cs.compiler = compiler;
2130         init_program(&cs);
2131
2132         if (!parse_program(&cs))
2133                 return GL_FALSE;
2134
2135         /* Finish off */
2136         code->node[code->cur_node].alu_end =
2137                 cs.nrslots - code->node[code->cur_node].alu_offset - 1;
2138         if (code->node[code->cur_node].tex_end < 0)
2139                 code->node[code->cur_node].tex_end = 0;
2140         code->alu_offset = 0;
2141         code->alu_end = cs.nrslots - 1;
2142         code->tex_offset = 0;
2143         code->tex_end = code->tex.length ? code->tex.length - 1 : 0;
2144         assert(code->node[code->cur_node].alu_end >= 0);
2145         assert(code->alu_end >= 0);
2146
2147         return GL_TRUE;
2148 }
2149