2 * Copyright (C) 2005 Ben Skeggs.
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 * Emit the r300_fragment_program_code that can be understood by the hardware.
32 * Input is a pre-transformed radeon_program.
34 * \author Ben Skeggs <darktama@iinet.net.au>
36 * \author Jerome Glisse <j.glisse@gmail.com>
40 * \todo Verify results of opcodes for accuracy, I've only checked them in
47 #include "shader/prog_instruction.h"
48 #include "shader/prog_parameter.h"
49 #include "shader/prog_print.h"
51 #include "r300_context.h"
52 #include "r300_fragprog.h"
54 #include "r300_state.h"
56 /* Mapping Mesa registers to R300 temporaries */
58 int reg
; /* Assigned hw temp */
59 unsigned int refcount
; /* Number of uses by mesa program */
63 * Describe the current lifetime information for an R300 temporary
66 /* Index of the first slot where this register is free in the sense
67 that it can be used as a new destination register.
68 This is -1 if the register has been assigned to a Mesa register
69 and the last access to the register has not yet been emitted */
72 /* Index of the first slot where this register is currently reserved.
73 This is used to stop e.g. a scalar operation from being moved
74 before the allocation time of a register that was first allocated
75 for a vector operation. */
78 /* Index of the first slot in which the register can be used as a
79 source without losing the value that is written by the last
80 emitted instruction that writes to the register */
84 /* Index to the slot where the register was last read.
85 This is also the first slot in which the register may be written again */
91 * Store usage information about an ALU instruction slot during the
92 * compilation of a fragment program.
/*
 * Usage bits recorded for one ALU instruction slot.
 *
 * The SLOT_SRC_* bits are per source: the flag for source number i is the
 * base bit shifted left by i, so with up to three sources the vector
 * flags occupy bits 0..2 and the scalar flags occupy bits 3..5.
 * The SLOT_OP_* bits record which half (vector and/or scalar) of the slot
 * already carries an opcode.
 */
#define SLOT_SRC_VECTOR	0x00001
#define SLOT_SRC_SCALAR	0x00008
#define SLOT_SRC_BOTH	(SLOT_SRC_SCALAR | SLOT_SRC_VECTOR)
#define SLOT_OP_VECTOR	0x10000
#define SLOT_OP_SCALAR	0x20000
#define SLOT_OP_BOTH	(SLOT_OP_SCALAR | SLOT_OP_VECTOR)
101 struct r300_pfs_compile_slot
{
102 /* Bitmask indicating which parts of the slot are used, using SLOT_ constants
106 /* Selected sources */
112 * Store information during compilation of fragment programs.
114 struct r300_pfs_compile_state
{
115 struct r300_fragment_program_compiler
*compiler
;
117 int nrslots
; /* number of ALU slots used so far */
119 /* Track which (parts of) slots are already filled with instructions */
120 struct r300_pfs_compile_slot slot
[PFS_MAX_ALU_INST
];
122 /* Track the validity of R300 temporaries */
123 struct reg_lifetime hwtemps
[PFS_NUM_TEMP_REGS
];
125 /* Used to map Mesa's inputs/temps onto hardware temps */
127 struct reg_acc temps
[PFS_NUM_TEMP_REGS
];
128 struct reg_acc inputs
[32]; /* don't actually need 32... */
130 /* Track usage of hardware temps, for register allocation,
131 * indirection detection, etc. */
138 * Useful macros and values
140 #define ERROR(fmt, args...) do { \
141 fprintf(stderr, "%s::%s(): " fmt "\n", \
142 __FILE__, __FUNCTION__, ##args); \
143 fp->error = GL_TRUE; \
146 #define PFS_INVAL 0xFFFFFFFF
147 #define COMPILE_STATE \
148 struct r300_fragment_program *fp = cs->compiler->fp; \
149 struct r300_fragment_program_code *code = cs->compiler->code; \
/*
 * Indices of the vector swizzles, as stored in the REG_VSWZ field of a
 * packed register.  The numbering follows the order of the entries in
 * the native vector swizzle table.
 */
#define SWIZZLE_XYZ	0	/* x, y, z (identity) */
#define SWIZZLE_XXX	1	/* x replicated */
#define SWIZZLE_YYY	2	/* y replicated */
#define SWIZZLE_ZZZ	3	/* z replicated */
#define SWIZZLE_WWW	4	/* w replicated */
#define SWIZZLE_YZX	5	/* y, z, x */
#define SWIZZLE_ZXY	6	/* z, x, y */
#define SWIZZLE_WZY	7	/* w, z, y */
#define SWIZZLE_111	8	/* constant one */
#define SWIZZLE_000	9	/* constant zero */
#define SWIZZLE_HHH	10	/* constant half */
164 #define swizzle(r, x, y, z, w) do_swizzle(cs, r, \
/*
 * Packed register descriptor.
 *
 * A register reference is carried around as a single GLuint whose bit
 * fields are described by the REG_*_SHIFT / REG_*_MASK pairs below.
 * REG() builds a descriptor, REG_GET_*() read one field, and the
 * REG_SET_*() / REG_ABS() / REG_NEGV() / REG_NEGS() macros modify the
 * lvalue passed as "reg" in place.
 *
 * Every macro parameter is parenthesised so that arguments containing
 * low-precedence operators (for example "a | b") are evaluated as a unit
 * before being shifted or masked.  Note that "reg" is evaluated more than
 * once by the modifying macros, so it must not have side effects.
 */

/* Register file selectors stored in the type field */
#define REG_TYPE_INPUT		0
#define REG_TYPE_OUTPUT		1
#define REG_TYPE_TEMP		2
#define REG_TYPE_CONST		3

/* Bit position of each field */
#define REG_TYPE_SHIFT		0
#define REG_INDEX_SHIFT		2
#define REG_VSWZ_SHIFT		8
#define REG_SSWZ_SHIFT		13
#define REG_NEGV_SHIFT		18
#define REG_NEGS_SHIFT		19
#define REG_ABS_SHIFT		20
#define REG_NO_USE_SHIFT	21	/* Hack for refcounting */
#define REG_VALID_SHIFT		22	/* Does the register contain a defined value? */
#define REG_BUILTIN_SHIFT	23	/* Is it a builtin (like all zero/all one)? */

/* In-place mask of each field */
#define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
#define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
#define REG_VSWZ_MASK		(0x1F << REG_VSWZ_SHIFT)
#define REG_SSWZ_MASK		(0x1F << REG_SSWZ_SHIFT)
#define REG_NEGV_MASK		(0x01 << REG_NEGV_SHIFT)
#define REG_NEGS_MASK		(0x01 << REG_NEGS_SHIFT)
#define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
#define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
#define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)
#define REG_BUILTIN_MASK	(0x01 << REG_BUILTIN_SHIFT)

/* Build a descriptor; negate and absolute bits start out cleared */
#define REG(type, index, vswz, sswz, nouse, valid, builtin)		\
	((((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK) |			\
	 (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK) |		\
	 (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
	 (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK) |		\
	 (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |	\
	 (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |			\
	 (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))

/* Field readers: yield the field value shifted down to bit 0 */
#define REG_GET_TYPE(reg)						\
	(((reg) & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
#define REG_GET_INDEX(reg)						\
	(((reg) & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
#define REG_GET_VSWZ(reg)						\
	(((reg) & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
#define REG_GET_SSWZ(reg)						\
	(((reg) & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
#define REG_GET_NO_USE(reg)						\
	(((reg) & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
#define REG_GET_VALID(reg)						\
	(((reg) & REG_VALID_MASK) >> REG_VALID_SHIFT)
#define REG_GET_BUILTIN(reg)						\
	(((reg) & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)

/* Field writers: replace one field of the lvalue "reg", keep the others */
#define REG_SET_TYPE(reg, type)						\
	((reg) = (((reg) & ~REG_TYPE_MASK) |				\
		  (((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK)))
#define REG_SET_INDEX(reg, index)					\
	((reg) = (((reg) & ~REG_INDEX_MASK) |				\
		  (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK)))
#define REG_SET_VSWZ(reg, vswz)						\
	((reg) = (((reg) & ~REG_VSWZ_MASK) |				\
		  (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK)))
#define REG_SET_SSWZ(reg, sswz)						\
	((reg) = (((reg) & ~REG_SSWZ_MASK) |				\
		  (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK)))
#define REG_SET_NO_USE(reg, nouse)					\
	((reg) = (((reg) & ~REG_NO_USE_MASK) |				\
		  (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK)))
#define REG_SET_VALID(reg, valid)					\
	((reg) = (((reg) & ~REG_VALID_MASK) |				\
		  (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK)))
#define REG_SET_BUILTIN(reg, builtin)					\
	((reg) = (((reg) & ~REG_BUILTIN_MASK) |				\
		  (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK)))

/* Modifier setters: these only ever set the bit, they never clear it */
#define REG_ABS(reg)							\
	((reg) = ((reg) | REG_ABS_MASK))
#define REG_NEGV(reg)							\
	((reg) = ((reg) | REG_NEGV_MASK))
#define REG_NEGS(reg)							\
	((reg) = ((reg) | REG_NEGS_MASK))
/*
 * Contents of an empty ALU slot, one macro per instruction word.
 *
 * Both halves are encoded as a MAD whose three arguments are forced to
 * zero and whose three source fetches read constant 0.  NOP_INST0 and
 * NOP_INST1 are the opcode/argument and source words of the vector half,
 * NOP_INST2 and NOP_INST3 the corresponding words of the scalar half.
 */
#define NOP_INST0 (						\
	(R300_ALU_ARGC_ZERO << R300_ALU_ARG0C_SHIFT) |		\
	(R300_ALU_ARGC_ZERO << R300_ALU_ARG1C_SHIFT) |		\
	(R300_ALU_ARGC_ZERO << R300_ALU_ARG2C_SHIFT) |		\
	(R300_ALU_OUTC_MAD))

#define NOP_INST1 (						\
	((0 | SRC_CONST) << R300_ALU_SRC2C_SHIFT) |		\
	((0 | SRC_CONST) << R300_ALU_SRC1C_SHIFT) |		\
	((0 | SRC_CONST) << R300_ALU_SRC0C_SHIFT))

#define NOP_INST2 (						\
	(R300_ALU_ARGA_ZERO << R300_ALU_ARG0A_SHIFT) |		\
	(R300_ALU_ARGA_ZERO << R300_ALU_ARG1A_SHIFT) |		\
	(R300_ALU_ARGA_ZERO << R300_ALU_ARG2A_SHIFT) |		\
	(R300_ALU_OUTA_MAD))

#define NOP_INST3 (						\
	((0 | SRC_CONST) << R300_ALU_SRC2A_SHIFT) |		\
	((0 | SRC_CONST) << R300_ALU_SRC1A_SHIFT) |		\
	((0 | SRC_CONST) << R300_ALU_SRC0A_SHIFT))
269 * Data structures for fragment program generation
272 /* description of r300 native hw instructions */
273 static const struct {
280 {"MAD", 3, R300_ALU_OUTC_MAD
, R300_ALU_OUTA_MAD
},
281 {"DP3", 2, R300_ALU_OUTC_DP3
, R300_ALU_OUTA_DP4
},
282 {"DP4", 2, R300_ALU_OUTC_DP4
, R300_ALU_OUTA_DP4
},
283 {"MIN", 2, R300_ALU_OUTC_MIN
, R300_ALU_OUTA_MIN
},
284 {"MAX", 2, R300_ALU_OUTC_MAX
, R300_ALU_OUTA_MAX
},
285 {"CMP", 3, R300_ALU_OUTC_CMP
, R300_ALU_OUTA_CMP
},
286 {"FRC", 1, R300_ALU_OUTC_FRC
, R300_ALU_OUTA_FRC
},
287 {"EX2", 1, R300_ALU_OUTC_REPL_ALPHA
, R300_ALU_OUTA_EX2
},
288 {"LG2", 1, R300_ALU_OUTC_REPL_ALPHA
, R300_ALU_OUTA_LG2
},
289 {"RCP", 1, R300_ALU_OUTC_REPL_ALPHA
, R300_ALU_OUTA_RCP
},
290 {"RSQ", 1, R300_ALU_OUTC_REPL_ALPHA
, R300_ALU_OUTA_RSQ
},
291 {"REPL_ALPHA", 1, R300_ALU_OUTC_REPL_ALPHA
, PFS_INVAL
},
292 {"CMPH", 3, R300_ALU_OUTC_CMPH
, PFS_INVAL
},
296 /* vector swizzles r300 can support natively, with a couple of
297 * cases we handle specially
299 * REG_VSWZ/REG_SSWZ is an index into this table
302 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
303 #define SWIZZLE_HALF 6
305 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
309 /* native swizzles */
310 static const struct r300_pfs_swizzle
{
311 GLuint hash
; /* swizzle value this matches */
312 GLuint base
; /* base value for hw swizzle */
313 GLuint stride
; /* difference in base between arg0/1/2 */
317 {MAKE_SWZ3(X
, Y
, Z
), R300_ALU_ARGC_SRC0C_XYZ
, 4, SLOT_SRC_VECTOR
},
318 {MAKE_SWZ3(X
, X
, X
), R300_ALU_ARGC_SRC0C_XXX
, 4, SLOT_SRC_VECTOR
},
319 {MAKE_SWZ3(Y
, Y
, Y
), R300_ALU_ARGC_SRC0C_YYY
, 4, SLOT_SRC_VECTOR
},
320 {MAKE_SWZ3(Z
, Z
, Z
), R300_ALU_ARGC_SRC0C_ZZZ
, 4, SLOT_SRC_VECTOR
},
321 {MAKE_SWZ3(W
, W
, W
), R300_ALU_ARGC_SRC0A
, 1, SLOT_SRC_SCALAR
},
322 {MAKE_SWZ3(Y
, Z
, X
), R300_ALU_ARGC_SRC0C_YZX
, 1, SLOT_SRC_VECTOR
},
323 {MAKE_SWZ3(Z
, X
, Y
), R300_ALU_ARGC_SRC0C_ZXY
, 1, SLOT_SRC_VECTOR
},
324 {MAKE_SWZ3(W
, Z
, Y
), R300_ALU_ARGC_SRC0CA_WZY
, 1, SLOT_SRC_BOTH
},
325 {MAKE_SWZ3(ONE
, ONE
, ONE
), R300_ALU_ARGC_ONE
, 0, 0},
326 {MAKE_SWZ3(ZERO
, ZERO
, ZERO
), R300_ALU_ARGC_ZERO
, 0, 0},
327 {MAKE_SWZ3(HALF
, HALF
, HALF
), R300_ALU_ARGC_HALF
, 0, 0},
328 {PFS_INVAL
, 0, 0, 0},
332 /* used during matching of non-native swizzles */
333 #define SWZ_X_MASK (7 << 0)
334 #define SWZ_Y_MASK (7 << 3)
335 #define SWZ_Z_MASK (7 << 6)
336 #define SWZ_W_MASK (7 << 9)
337 static const struct {
338 GLuint hash
; /* used to mask matching swizzle components */
339 int mask
; /* actual outmask */
340 int count
; /* count of components matched */
343 {SWZ_X_MASK
| SWZ_Y_MASK
| SWZ_Z_MASK
, 1 | 2 | 4, 3},
344 {SWZ_X_MASK
| SWZ_Y_MASK
, 1 | 2, 2},
345 {SWZ_X_MASK
| SWZ_Z_MASK
, 1 | 4, 2},
346 {SWZ_Y_MASK
| SWZ_Z_MASK
, 2 | 4, 2},
350 {PFS_INVAL
, PFS_INVAL
, PFS_INVAL
}
354 static const struct {
355 int base
; /* hw value of swizzle */
356 int stride
; /* difference between SRC0/1/2 */
360 {R300_ALU_ARGA_SRC0C_X
, 3, SLOT_SRC_VECTOR
},
361 {R300_ALU_ARGA_SRC0C_Y
, 3, SLOT_SRC_VECTOR
},
362 {R300_ALU_ARGA_SRC0C_Z
, 3, SLOT_SRC_VECTOR
},
363 {R300_ALU_ARGA_SRC0A
, 1, SLOT_SRC_SCALAR
},
364 {R300_ALU_ARGA_ZERO
, 0, 0},
365 {R300_ALU_ARGA_ONE
, 0, 0},
366 {R300_ALU_ARGA_HALF
, 0, 0}
370 /* boiler-plate reg, for convenience */
371 static const GLuint undef
= REG(REG_TYPE_TEMP
,
379 /* constant one source */
380 static const GLuint pfs_one
= REG(REG_TYPE_CONST
,
388 /* constant half source */
389 static const GLuint pfs_half
= REG(REG_TYPE_CONST
,
397 /* constant zero source */
398 static const GLuint pfs_zero
= REG(REG_TYPE_CONST
,
407 * Common functions prototypes
409 static void emit_arith(struct r300_pfs_compile_state
*cs
, int op
,
410 GLuint dest
, int mask
,
411 GLuint src0
, GLuint src1
, GLuint src2
, int flags
);
414 * Get an R300 temporary that can be written to in the given slot.
416 static int get_hw_temp(struct r300_pfs_compile_state
*cs
, int slot
)
421 for (r
= 0; r
< PFS_NUM_TEMP_REGS
; ++r
) {
422 if (cs
->hwtemps
[r
].free
>= 0 && cs
->hwtemps
[r
].free
<= slot
)
426 if (r
>= PFS_NUM_TEMP_REGS
) {
427 ERROR("Out of hardware temps\n");
430 // Reserved is used to avoid the following scenario:
431 // R300 temporary X is first assigned to Mesa temporary Y during vector ops
432 // R300 temporary X is then assigned to Mesa temporary Z for further vector ops
433 // Then scalar ops on Mesa temporary Z are emitted and move back in time
434 // to overwrite the value of temporary Y.
436 cs
->hwtemps
[r
].reserved
= cs
->hwtemps
[r
].free
;
437 cs
->hwtemps
[r
].free
= -1;
439 // Reset to some value that won't mess things up when the user
440 // tries to read from a temporary that hasn't been assigned a value yet.
441 // In the normal case, vector_valid and scalar_valid should be set to
442 // a sane value by the first emit that writes to this temporary.
443 cs
->hwtemps
[r
].vector_valid
= 0;
444 cs
->hwtemps
[r
].scalar_valid
= 0;
446 if (r
> code
->max_temp_idx
)
447 code
->max_temp_idx
= r
;
453 * Get an R300 temporary that will act as a TEX destination register.
455 static int get_hw_temp_tex(struct r300_pfs_compile_state
*cs
)
460 for (r
= 0; r
< PFS_NUM_TEMP_REGS
; ++r
) {
461 if (cs
->used_in_node
& (1 << r
))
464 // Note: Be very careful here
465 if (cs
->hwtemps
[r
].free
>= 0 && cs
->hwtemps
[r
].free
<= 0)
469 if (r
>= PFS_NUM_TEMP_REGS
)
470 return get_hw_temp(cs
, 0); /* Will cause an indirection */
472 cs
->hwtemps
[r
].reserved
= cs
->hwtemps
[r
].free
;
473 cs
->hwtemps
[r
].free
= -1;
475 // Reset to some value that won't mess things up when the user
476 // tries to read from a temporary that hasn't been assigned a value yet.
477 // In the normal case, vector_valid and scalar_valid should be set to
478 // a sane value by the first emit that writes to this temporary.
479 cs
->hwtemps
[r
].vector_valid
= cs
->nrslots
;
480 cs
->hwtemps
[r
].scalar_valid
= cs
->nrslots
;
482 if (r
> code
->max_temp_idx
)
483 code
->max_temp_idx
= r
;
489 * Mark the given hardware register as free.
491 static void free_hw_temp(struct r300_pfs_compile_state
*cs
, int idx
)
493 // Be very careful here. Consider sequences like
496 // The TEX instruction may be moved in front of the MAD instruction
497 // due to the way nodes work. We don't want to alias r1 and r4 in
499 // I'm certain the register allocation could be further sanitized,
500 // but it's tricky because of stuff that can happen inside emit_tex
502 cs
->hwtemps
[idx
].free
= cs
->nrslots
+ 1;
506 * Create a new Mesa temporary register.
508 static GLuint
get_temp_reg(struct r300_pfs_compile_state
*cs
)
514 index
= ffs(~cs
->temp_in_use
);
516 ERROR("Out of program temps\n");
520 cs
->temp_in_use
|= (1 << --index
);
521 cs
->temps
[index
].refcount
= 0xFFFFFFFF;
522 cs
->temps
[index
].reg
= -1;
524 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
525 REG_SET_INDEX(r
, index
);
526 REG_SET_VALID(r
, GL_TRUE
);
531 * Free a Mesa temporary and the associated R300 temporary.
533 static void free_temp(struct r300_pfs_compile_state
*cs
, GLuint r
)
535 GLuint index
= REG_GET_INDEX(r
);
537 if (!(cs
->temp_in_use
& (1 << index
)))
540 if (REG_GET_TYPE(r
) == REG_TYPE_TEMP
) {
541 free_hw_temp(cs
, cs
->temps
[index
].reg
);
542 cs
->temps
[index
].reg
= -1;
543 cs
->temp_in_use
&= ~(1 << index
);
544 } else if (REG_GET_TYPE(r
) == REG_TYPE_INPUT
) {
545 free_hw_temp(cs
, cs
->inputs
[index
].reg
);
546 cs
->inputs
[index
].reg
= -1;
551 * Emit a hardware constant/parameter.
553 static GLuint
emit_const4fv(struct r300_pfs_compile_state
*cs
,
554 struct prog_src_register srcreg
)
560 for (index
= 0; index
< code
->const_nr
; ++index
) {
561 if (code
->constant
[index
].File
== srcreg
.File
&&
562 code
->constant
[index
].Index
== srcreg
.Index
)
566 if (index
>= code
->const_nr
) {
567 if (index
>= PFS_NUM_CONST_REGS
) {
568 ERROR("Out of hw constants!\n");
573 code
->constant
[index
] = srcreg
;
576 REG_SET_TYPE(reg
, REG_TYPE_CONST
);
577 REG_SET_INDEX(reg
, index
);
578 REG_SET_VALID(reg
, GL_TRUE
);
582 static INLINE GLuint
negate(GLuint r
)
589 /* Hack, to prevent clobbering sources used multiple times when
590 * emulating non-native instructions
592 static INLINE GLuint
keep(GLuint r
)
594 REG_SET_NO_USE(r
, GL_TRUE
);
598 static INLINE GLuint
absolute(GLuint r
)
604 static int swz_native(struct r300_pfs_compile_state
*cs
,
605 GLuint src
, GLuint
* r
, GLuint arbneg
)
609 /* Native swizzle, handle negation */
610 src
= (src
& ~REG_NEGS_MASK
) | (((arbneg
>> 3) & 1) << REG_NEGS_SHIFT
);
612 if ((arbneg
& 0x7) == 0x0) {
613 src
= src
& ~REG_NEGV_MASK
;
615 } else if ((arbneg
& 0x7) == 0x7) {
616 src
|= REG_NEGV_MASK
;
619 if (!REG_GET_VALID(*r
))
620 *r
= get_temp_reg(cs
);
621 src
|= REG_NEGV_MASK
;
624 *r
, arbneg
& 0x7, keep(src
), pfs_one
, pfs_zero
, 0);
625 src
= src
& ~REG_NEGV_MASK
;
629 (arbneg
^ 0x7) | WRITEMASK_W
,
630 src
, pfs_one
, pfs_zero
, 0);
636 static int swz_emit_partial(struct r300_pfs_compile_state
*cs
,
638 GLuint
* r
, int mask
, int mc
, GLuint arbneg
)
644 if (!REG_GET_VALID(*r
))
645 *r
= get_temp_reg(cs
);
647 /* A partial match, VSWZ/mask define what parts of the
648 * desired swizzle we match
650 if (mc
+ s_mask
[mask
].count
== 3) {
652 src
|= ((arbneg
>> 3) & 1) << REG_NEGS_SHIFT
;
655 tmp
= arbneg
& s_mask
[mask
].mask
;
657 tmp
= tmp
^ s_mask
[mask
].mask
;
662 arbneg
& s_mask
[mask
].mask
,
663 keep(src
) | REG_NEGV_MASK
,
664 pfs_one
, pfs_zero
, 0);
666 REG_SET_NO_USE(src
, GL_TRUE
);
668 REG_SET_NO_USE(src
, GL_FALSE
);
672 *r
, tmp
| wmask
, src
, pfs_one
, pfs_zero
, 0);
675 REG_SET_NO_USE(src
, GL_TRUE
);
677 REG_SET_NO_USE(src
, GL_FALSE
);
682 (arbneg
& s_mask
[mask
].mask
) | wmask
,
683 src
| REG_NEGV_MASK
, pfs_one
, pfs_zero
, 0);
687 REG_SET_NO_USE(src
, GL_TRUE
);
689 REG_SET_NO_USE(src
, GL_FALSE
);
691 emit_arith(cs
, PFS_OP_MAD
,
693 s_mask
[mask
].mask
| wmask
,
694 src
, pfs_one
, pfs_zero
, 0);
697 return s_mask
[mask
].count
;
700 static GLuint
do_swizzle(struct r300_pfs_compile_state
*cs
,
701 GLuint src
, GLuint arbswz
, GLuint arbneg
)
709 /* If swizzling from something without an XYZW native swizzle,
710 * emit result to a temp, and do new swizzle from the temp.
713 if (REG_GET_VSWZ(src
) != SWIZZLE_XYZ
|| REG_GET_SSWZ(src
) != SWIZZLE_W
) {
714 GLuint temp
= get_temp_reg(fp
);
717 temp
, WRITEMASK_XYZW
, src
, pfs_one
, pfs_zero
, 0);
722 if (REG_GET_VSWZ(src
) != SWIZZLE_XYZ
|| REG_GET_SSWZ(src
) != SWIZZLE_W
) {
724 (v_swiz
[REG_GET_VSWZ(src
)].
725 hash
& (SWZ_X_MASK
| SWZ_Y_MASK
| SWZ_Z_MASK
)) |
726 REG_GET_SSWZ(src
) << 9;
731 for (i
= 0; i
< 4; ++i
) {
732 offset
= GET_SWZ(arbswz
, i
);
735 (offset
<= 3) ? GET_SWZ(vsrcswz
,
740 arbswz
= newswz
& (SWZ_X_MASK
| SWZ_Y_MASK
| SWZ_Z_MASK
);
741 REG_SET_SSWZ(src
, GET_SWZ(newswz
, 3));
743 /* set scalar swizzling */
744 REG_SET_SSWZ(src
, GET_SWZ(arbswz
, 3));
748 vswz
= REG_GET_VSWZ(src
);
752 REG_SET_VSWZ(src
, vswz
);
753 chash
= v_swiz
[REG_GET_VSWZ(src
)].hash
&
756 if (chash
== (arbswz
& s_mask
[c_mask
].hash
)) {
757 if (s_mask
[c_mask
].count
== 3) {
758 v_match
+= swz_native(cs
,
761 v_match
+= swz_emit_partial(cs
,
772 /* Fill with something invalid.. all 0's was
773 * wrong before, matched SWIZZLE_X. So all
774 * 1's will be okay for now
776 arbswz
|= (PFS_INVAL
& s_mask
[c_mask
].hash
);
778 } while (v_swiz
[++vswz
].hash
!= PFS_INVAL
);
779 REG_SET_VSWZ(src
, SWIZZLE_XYZ
);
780 } while (s_mask
[++c_mask
].hash
!= PFS_INVAL
);
782 ERROR("should NEVER get here\n");
786 static GLuint
t_src(struct r300_pfs_compile_state
*cs
,
787 struct prog_src_register fpsrc
)
792 switch (fpsrc
.File
) {
793 case PROGRAM_TEMPORARY
:
794 REG_SET_INDEX(r
, fpsrc
.Index
);
795 REG_SET_VALID(r
, GL_TRUE
);
796 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
799 REG_SET_INDEX(r
, fpsrc
.Index
);
800 REG_SET_VALID(r
, GL_TRUE
);
801 REG_SET_TYPE(r
, REG_TYPE_INPUT
);
803 case PROGRAM_LOCAL_PARAM
:
804 case PROGRAM_ENV_PARAM
:
805 case PROGRAM_STATE_VAR
:
806 case PROGRAM_NAMED_PARAM
:
807 case PROGRAM_CONSTANT
:
808 r
= emit_const4fv(cs
, fpsrc
);
810 case PROGRAM_BUILTIN
:
811 switch(fpsrc
.Swizzle
) {
812 case SWIZZLE_1111
: r
= pfs_one
; break;
813 case SWIZZLE_0000
: r
= pfs_zero
; break;
815 ERROR("bad PROGRAM_BUILTIN swizzle %u\n", fpsrc
.Swizzle
);
820 ERROR("unknown SrcReg->File %x\n", fpsrc
.File
);
824 /* no point swizzling ONE/ZERO/HALF constants... */
825 if (REG_GET_VSWZ(r
) < SWIZZLE_111
|| REG_GET_SSWZ(r
) < SWIZZLE_ZERO
)
826 r
= do_swizzle(cs
, r
, fpsrc
.Swizzle
, fpsrc
.NegateBase
);
834 static GLuint
t_scalar_src(struct r300_pfs_compile_state
*cs
,
835 struct prog_src_register fpsrc
)
837 struct prog_src_register src
= fpsrc
;
838 int sc
= GET_SWZ(fpsrc
.Swizzle
, 0); /* X */
840 src
.Swizzle
= ((sc
<< 0) | (sc
<< 3) | (sc
<< 6) | (sc
<< 9));
842 return t_src(cs
, src
);
845 static GLuint
t_dst(struct r300_pfs_compile_state
*cs
,
846 struct prog_dst_register dest
)
852 case PROGRAM_TEMPORARY
:
853 REG_SET_INDEX(r
, dest
.Index
);
854 REG_SET_VALID(r
, GL_TRUE
);
855 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
858 REG_SET_TYPE(r
, REG_TYPE_OUTPUT
);
859 switch (dest
.Index
) {
860 case FRAG_RESULT_COLR
:
861 case FRAG_RESULT_DEPR
:
862 REG_SET_INDEX(r
, dest
.Index
);
863 REG_SET_VALID(r
, GL_TRUE
);
866 ERROR("Bad DstReg->Index 0x%x\n", dest
.Index
);
870 ERROR("Bad DstReg->File 0x%x\n", dest
.File
);
875 static int t_hw_src(struct r300_pfs_compile_state
*cs
, GLuint src
, GLboolean tex
)
879 int index
= REG_GET_INDEX(src
);
881 switch (REG_GET_TYPE(src
)) {
883 /* NOTE: if reg==-1 here, a source is being read that
884 * hasn't been written to. Undefined results.
886 if (cs
->temps
[index
].reg
== -1)
887 cs
->temps
[index
].reg
= get_hw_temp(cs
, cs
->nrslots
);
889 idx
= cs
->temps
[index
].reg
;
891 if (!REG_GET_NO_USE(src
) && (--cs
->temps
[index
].refcount
== 0))
895 idx
= cs
->inputs
[index
].reg
;
897 if (!REG_GET_NO_USE(src
) && (--cs
->inputs
[index
].refcount
== 0))
898 free_hw_temp(cs
, cs
->inputs
[index
].reg
);
901 return (index
| SRC_CONST
);
903 ERROR("Invalid type for source reg\n");
904 return (0 | SRC_CONST
);
908 cs
->used_in_node
|= (1 << idx
);
913 static int t_hw_dst(struct r300_pfs_compile_state
*cs
,
914 GLuint dest
, GLboolean tex
, int slot
)
918 GLuint index
= REG_GET_INDEX(dest
);
919 assert(REG_GET_VALID(dest
));
921 switch (REG_GET_TYPE(dest
)) {
923 if (cs
->temps
[REG_GET_INDEX(dest
)].reg
== -1) {
925 cs
->temps
[index
].reg
= get_hw_temp(cs
, slot
);
927 cs
->temps
[index
].reg
= get_hw_temp_tex(cs
);
930 idx
= cs
->temps
[index
].reg
;
932 if (!REG_GET_NO_USE(dest
) && (--cs
->temps
[index
].refcount
== 0))
935 cs
->dest_in_node
|= (1 << idx
);
936 cs
->used_in_node
|= (1 << idx
);
938 case REG_TYPE_OUTPUT
:
940 case FRAG_RESULT_COLR
:
941 code
->node
[code
->cur_node
].flags
|= R300_RGBA_OUT
;
943 case FRAG_RESULT_DEPR
:
944 fp
->WritesDepth
= GL_TRUE
;
945 code
->node
[code
->cur_node
].flags
|= R300_W_OUT
;
951 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest
));
958 static void emit_nop(struct r300_pfs_compile_state
*cs
)
962 if (cs
->nrslots
>= PFS_MAX_ALU_INST
) {
963 ERROR("Out of ALU instruction slots\n");
967 code
->alu
.inst
[cs
->nrslots
].inst0
= NOP_INST0
;
968 code
->alu
.inst
[cs
->nrslots
].inst1
= NOP_INST1
;
969 code
->alu
.inst
[cs
->nrslots
].inst2
= NOP_INST2
;
970 code
->alu
.inst
[cs
->nrslots
].inst3
= NOP_INST3
;
974 static void emit_tex(struct r300_pfs_compile_state
*cs
,
975 struct prog_instruction
*fpi
, int opcode
)
978 GLuint coord
= t_src(cs
, fpi
->SrcReg
[0]);
981 int unit
= fpi
->TexSrcUnit
;
984 /* Ensure correct node indirection */
985 uin
= cs
->used_in_node
;
986 din
= cs
->dest_in_node
;
988 /* Resolve source/dest to hardware registers */
989 hwsrc
= t_hw_src(cs
, coord
, GL_TRUE
);
991 if (opcode
!= R300_TEX_OP_KIL
) {
992 dest
= t_dst(cs
, fpi
->DstReg
);
995 t_hw_dst(cs
, dest
, GL_TRUE
,
996 code
->node
[code
->cur_node
].alu_offset
);
998 /* Use a temp that hasn't been used in this node, rather
999 * than causing an indirection
1001 if (uin
& (1 << hwdest
)) {
1002 free_hw_temp(cs
, hwdest
);
1003 hwdest
= get_hw_temp_tex(cs
);
1004 cs
->temps
[REG_GET_INDEX(dest
)].reg
= hwdest
;
1011 /* Indirection if source has been written in this node, or if the
1012 * dest has been read/written in this node
1014 if ((REG_GET_TYPE(coord
) != REG_TYPE_CONST
&&
1015 (din
& (1 << hwsrc
))) || (uin
& (1 << hwdest
))) {
1017 /* Finish off current node */
1018 if (code
->node
[code
->cur_node
].alu_offset
== cs
->nrslots
)
1021 code
->node
[code
->cur_node
].alu_end
=
1022 cs
->nrslots
- code
->node
[code
->cur_node
].alu_offset
- 1;
1023 assert(code
->node
[code
->cur_node
].alu_end
>= 0);
1025 if (++code
->cur_node
>= PFS_MAX_TEX_INDIRECT
) {
1026 ERROR("too many levels of texture indirection\n");
1030 /* Start new node */
1031 code
->node
[code
->cur_node
].tex_offset
= code
->tex
.length
;
1032 code
->node
[code
->cur_node
].alu_offset
= cs
->nrslots
;
1033 code
->node
[code
->cur_node
].tex_end
= -1;
1034 code
->node
[code
->cur_node
].alu_end
= -1;
1035 code
->node
[code
->cur_node
].flags
= 0;
1036 cs
->used_in_node
= 0;
1037 cs
->dest_in_node
= 0;
1040 if (code
->cur_node
== 0)
1041 code
->first_node_has_tex
= 1;
1043 code
->tex
.inst
[code
->tex
.length
++] = 0 | (hwsrc
<< R300_SRC_ADDR_SHIFT
)
1044 | (hwdest
<< R300_DST_ADDR_SHIFT
)
1045 | (unit
<< R300_TEX_ID_SHIFT
)
1046 | (opcode
<< R300_TEX_INST_SHIFT
);
1048 cs
->dest_in_node
|= (1 << hwdest
);
1049 if (REG_GET_TYPE(coord
) != REG_TYPE_CONST
)
1050 cs
->used_in_node
|= (1 << hwsrc
);
1052 code
->node
[code
->cur_node
].tex_end
++;
1056 * Returns the first slot where we could possibly allow writing to dest,
1057 * according to register allocation.
1059 static int get_earliest_allowed_write(struct r300_pfs_compile_state
*cs
,
1060 GLuint dest
, int mask
)
1065 GLuint index
= REG_GET_INDEX(dest
);
1066 assert(REG_GET_VALID(dest
));
1068 switch (REG_GET_TYPE(dest
)) {
1070 if (cs
->temps
[index
].reg
== -1)
1073 idx
= cs
->temps
[index
].reg
;
1075 case REG_TYPE_OUTPUT
:
1078 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest
));
1082 pos
= cs
->hwtemps
[idx
].reserved
;
1083 if (mask
& WRITEMASK_XYZ
) {
1084 if (pos
< cs
->hwtemps
[idx
].vector_lastread
)
1085 pos
= cs
->hwtemps
[idx
].vector_lastread
;
1087 if (mask
& WRITEMASK_W
) {
1088 if (pos
< cs
->hwtemps
[idx
].scalar_lastread
)
1089 pos
= cs
->hwtemps
[idx
].scalar_lastread
;
1096 * Allocates a slot for an ALU instruction that can consist of
1097 * a vector part or a scalar part or both.
1099 * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1100 * appropriate position (vector and/or scalar), and their positions are
1101 * recorded in the srcpos array.
1103 * This function emits instruction code for the source fetch and the
1104 * argument selection. It does not emit instruction code for the
1105 * opcode or the destination selection.
1107 * @return the index of the slot
1109 static int find_and_prepare_slot(struct r300_pfs_compile_state
*cs
,
1112 int argc
, GLuint
* src
, GLuint dest
, int mask
)
1125 // Determine instruction slots, whether sources are required on
1126 // vector or scalar side, and the smallest slot number where
1127 // all source registers are available
1130 used
|= SLOT_OP_VECTOR
;
1132 used
|= SLOT_OP_SCALAR
;
1134 pos
= get_earliest_allowed_write(cs
, dest
, mask
);
1136 if (code
->node
[code
->cur_node
].alu_offset
> pos
)
1137 pos
= code
->node
[code
->cur_node
].alu_offset
;
1138 for (i
= 0; i
< argc
; ++i
) {
1139 if (!REG_GET_BUILTIN(src
[i
])) {
1141 used
|= v_swiz
[REG_GET_VSWZ(src
[i
])].flags
<< i
;
1143 used
|= s_swiz
[REG_GET_SSWZ(src
[i
])].flags
<< i
;
1146 hwsrc
[i
] = t_hw_src(cs
, src
[i
], GL_FALSE
); /* Note: sideeffects wrt refcounting! */
1147 regnr
= hwsrc
[i
] & 31;
1149 if (REG_GET_TYPE(src
[i
]) == REG_TYPE_TEMP
) {
1150 if (used
& (SLOT_SRC_VECTOR
<< i
)) {
1151 if (cs
->hwtemps
[regnr
].vector_valid
> pos
)
1152 pos
= cs
->hwtemps
[regnr
].vector_valid
;
1154 if (used
& (SLOT_SRC_SCALAR
<< i
)) {
1155 if (cs
->hwtemps
[regnr
].scalar_valid
> pos
)
1156 pos
= cs
->hwtemps
[regnr
].scalar_valid
;
1161 // Find a slot that fits
1163 if (cs
->slot
[pos
].used
& used
& SLOT_OP_BOTH
)
1166 if (pos
>= cs
->nrslots
) {
1167 if (cs
->nrslots
>= PFS_MAX_ALU_INST
) {
1168 ERROR("Out of ALU instruction slots\n");
1172 code
->alu
.inst
[pos
].inst0
= NOP_INST0
;
1173 code
->alu
.inst
[pos
].inst1
= NOP_INST1
;
1174 code
->alu
.inst
[pos
].inst2
= NOP_INST2
;
1175 code
->alu
.inst
[pos
].inst3
= NOP_INST3
;
1179 // Note: When we need both parts (vector and scalar) of a source,
1180 // we always try to put them into the same position. This makes the
1181 // code easier to read, and it is optimal (i.e. one doesn't gain
1182 // anything by splitting the parts).
1183 // It also avoids headaches with swizzles that access both parts (i.e WXY)
1184 tempused
= cs
->slot
[pos
].used
;
1185 for (i
= 0; i
< 3; ++i
) {
1186 tempvsrc
[i
] = cs
->slot
[pos
].vsrc
[i
];
1187 tempssrc
[i
] = cs
->slot
[pos
].ssrc
[i
];
1190 for (i
= 0; i
< argc
; ++i
) {
1191 int flags
= (used
>> i
) & SLOT_SRC_BOTH
;
1198 for (j
= 0; j
< 3; ++j
) {
1199 if ((tempused
>> j
) & flags
& SLOT_SRC_VECTOR
) {
1200 if (tempvsrc
[j
] != hwsrc
[i
])
1204 if ((tempused
>> j
) & flags
& SLOT_SRC_SCALAR
) {
1205 if (tempssrc
[j
] != hwsrc
[i
])
1216 tempused
|= flags
<< j
;
1217 if (flags
& SLOT_SRC_VECTOR
)
1218 tempvsrc
[j
] = hwsrc
[i
];
1219 if (flags
& SLOT_SRC_SCALAR
)
1220 tempssrc
[j
] = hwsrc
[i
];
1227 // Found a slot, reserve it
1228 cs
->slot
[pos
].used
= tempused
| (used
& SLOT_OP_BOTH
);
1229 for (i
= 0; i
< 3; ++i
) {
1230 cs
->slot
[pos
].vsrc
[i
] = tempvsrc
[i
];
1231 cs
->slot
[pos
].ssrc
[i
] = tempssrc
[i
];
1234 for (i
= 0; i
< argc
; ++i
) {
1235 if (REG_GET_TYPE(src
[i
]) == REG_TYPE_TEMP
) {
1236 int regnr
= hwsrc
[i
] & 31;
1238 if (used
& (SLOT_SRC_VECTOR
<< i
)) {
1239 if (cs
->hwtemps
[regnr
].vector_lastread
< pos
)
1240 cs
->hwtemps
[regnr
].vector_lastread
=
1243 if (used
& (SLOT_SRC_SCALAR
<< i
)) {
1244 if (cs
->hwtemps
[regnr
].scalar_lastread
< pos
)
1245 cs
->hwtemps
[regnr
].scalar_lastread
=
1251 // Emit the source fetch code
1252 code
->alu
.inst
[pos
].inst1
&= ~R300_ALU_SRC_MASK
;
1253 code
->alu
.inst
[pos
].inst1
|=
1254 ((cs
->slot
[pos
].vsrc
[0] << R300_ALU_SRC0C_SHIFT
) |
1255 (cs
->slot
[pos
].vsrc
[1] << R300_ALU_SRC1C_SHIFT
) |
1256 (cs
->slot
[pos
].vsrc
[2] << R300_ALU_SRC2C_SHIFT
));
1258 code
->alu
.inst
[pos
].inst3
&= ~R300_ALU_SRC_MASK
;
1259 code
->alu
.inst
[pos
].inst3
|=
1260 ((cs
->slot
[pos
].ssrc
[0] << R300_ALU_SRC0A_SHIFT
) |
1261 (cs
->slot
[pos
].ssrc
[1] << R300_ALU_SRC1A_SHIFT
) |
1262 (cs
->slot
[pos
].ssrc
[2] << R300_ALU_SRC2A_SHIFT
));
1264 // Emit the argument selection code
1268 for (i
= 0; i
< 3; ++i
) {
1270 swz
[i
] = (v_swiz
[REG_GET_VSWZ(src
[i
])].base
+
1272 v_swiz
[REG_GET_VSWZ(src
[i
])].
1273 stride
)) | ((src
[i
] & REG_NEGV_MASK
)
1274 ? ARG_NEG
: 0) | ((src
[i
]
1281 swz
[i
] = R300_ALU_ARGC_ZERO
;
1285 code
->alu
.inst
[pos
].inst0
&=
1286 ~(R300_ALU_ARG0C_MASK
| R300_ALU_ARG1C_MASK
|
1287 R300_ALU_ARG2C_MASK
);
1288 code
->alu
.inst
[pos
].inst0
|=
1289 (swz
[0] << R300_ALU_ARG0C_SHIFT
) | (swz
[1] <<
1290 R300_ALU_ARG1C_SHIFT
)
1291 | (swz
[2] << R300_ALU_ARG2C_SHIFT
);
1297 for (i
= 0; i
< 3; ++i
) {
1299 swz
[i
] = (s_swiz
[REG_GET_SSWZ(src
[i
])].base
+
1301 s_swiz
[REG_GET_SSWZ(src
[i
])].
1302 stride
)) | ((src
[i
] & REG_NEGS_MASK
)
1303 ? ARG_NEG
: 0) | ((src
[i
]
1310 swz
[i
] = R300_ALU_ARGA_ZERO
;
1314 code
->alu
.inst
[pos
].inst2
&=
1315 ~(R300_ALU_ARG0A_MASK
| R300_ALU_ARG1A_MASK
|
1316 R300_ALU_ARG2A_MASK
);
1317 code
->alu
.inst
[pos
].inst2
|=
1318 (swz
[0] << R300_ALU_ARG0A_SHIFT
) | (swz
[1] <<
1319 R300_ALU_ARG1A_SHIFT
)
1320 | (swz
[2] << R300_ALU_ARG2A_SHIFT
);
1327 * Append an ALU instruction to the instruction list.
1329 static void emit_arith(struct r300_pfs_compile_state
*cs
,
1333 GLuint src0
, GLuint src1
, GLuint src2
, int flags
)
1336 GLuint src
[3] = { src0
, src1
, src2
};
1338 GLboolean emit_vop
, emit_sop
;
1342 vop
= r300_fpop
[op
].v_op
;
1343 sop
= r300_fpop
[op
].s_op
;
1344 argc
= r300_fpop
[op
].argc
;
1346 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
&&
1347 REG_GET_INDEX(dest
) == FRAG_RESULT_DEPR
) {
1348 if (mask
& WRITEMASK_Z
) {
1355 emit_vop
= GL_FALSE
;
1356 emit_sop
= GL_FALSE
;
1357 if ((mask
& WRITEMASK_XYZ
) || vop
== R300_ALU_OUTC_DP3
)
1359 if ((mask
& WRITEMASK_W
) || vop
== R300_ALU_OUTC_REPL_ALPHA
)
1363 find_and_prepare_slot(cs
, emit_vop
, emit_sop
, argc
, src
, dest
,
1368 hwdest
= t_hw_dst(cs
, dest
, GL_FALSE
, pos
); /* Note: Side effects wrt register allocation */
1370 if (flags
& PFS_FLAG_SAT
) {
1371 vop
|= R300_ALU_OUTC_CLAMP
;
1372 sop
|= R300_ALU_OUTA_CLAMP
;
1375 /* Throw the pieces together and get ALU/1 */
1377 code
->alu
.inst
[pos
].inst0
|= vop
;
1379 code
->alu
.inst
[pos
].inst1
|= hwdest
<< R300_ALU_DSTC_SHIFT
;
1381 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1382 if (REG_GET_INDEX(dest
) == FRAG_RESULT_COLR
) {
1383 code
->alu
.inst
[pos
].inst1
|=
1384 (mask
& WRITEMASK_XYZ
) <<
1385 R300_ALU_DSTC_OUTPUT_MASK_SHIFT
;
1389 code
->alu
.inst
[pos
].inst1
|=
1390 (mask
& WRITEMASK_XYZ
) <<
1391 R300_ALU_DSTC_REG_MASK_SHIFT
;
1393 cs
->hwtemps
[hwdest
].vector_valid
= pos
+ 1;
1399 code
->alu
.inst
[pos
].inst2
|= sop
;
1401 if (mask
& WRITEMASK_W
) {
1402 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1403 if (REG_GET_INDEX(dest
) == FRAG_RESULT_COLR
) {
1404 code
->alu
.inst
[pos
].inst3
|=
1405 (hwdest
<< R300_ALU_DSTA_SHIFT
) |
1406 R300_ALU_DSTA_OUTPUT
;
1407 } else if (REG_GET_INDEX(dest
) ==
1409 code
->alu
.inst
[pos
].inst3
|=
1410 R300_ALU_DSTA_DEPTH
;
1414 code
->alu
.inst
[pos
].inst3
|=
1415 (hwdest
<< R300_ALU_DSTA_SHIFT
) |
1418 cs
->hwtemps
[hwdest
].scalar_valid
= pos
+ 1;
1426 static void emit_instruction(struct r300_pfs_compile_state
*cs
, struct prog_instruction
*fpi
)
1429 GLuint src
[3], dest
;
1430 int flags
, mask
= 0;
1432 if (fpi
->SaturateMode
== SATURATE_ZERO_ONE
)
1433 flags
= PFS_FLAG_SAT
;
1437 if (fpi
->Opcode
!= OPCODE_KIL
) {
1438 dest
= t_dst(cs
, fpi
->DstReg
);
1439 mask
= fpi
->DstReg
.WriteMask
;
1442 switch (fpi
->Opcode
) {
1444 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1445 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1446 emit_arith(cs
, PFS_OP_MAD
, dest
, mask
,
1447 src
[0], pfs_one
, src
[1], flags
);
1450 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1451 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1452 src
[2] = t_src(cs
, fpi
->SrcReg
[2]);
1453 /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1454 * r300 - if src2.c < 0.0 ? src1.c : src0.c
1456 emit_arith(cs
, PFS_OP_CMP
, dest
, mask
,
1457 src
[2], src
[1], src
[0], flags
);
1460 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1461 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1462 emit_arith(cs
, PFS_OP_DP3
, dest
, mask
,
1463 src
[0], src
[1], undef
, flags
);
1466 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1467 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1468 emit_arith(cs
, PFS_OP_DP4
, dest
, mask
,
1469 src
[0], src
[1], undef
, flags
);
1472 src
[0] = t_scalar_src(cs
, fpi
->SrcReg
[0]);
1473 emit_arith(cs
, PFS_OP_EX2
, dest
, mask
,
1474 src
[0], undef
, undef
, flags
);
1477 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1478 emit_arith(cs
, PFS_OP_FRC
, dest
, mask
,
1479 src
[0], undef
, undef
, flags
);
1482 emit_tex(cs
, fpi
, R300_TEX_OP_KIL
);
1485 src
[0] = t_scalar_src(cs
, fpi
->SrcReg
[0]);
1486 emit_arith(cs
, PFS_OP_LG2
, dest
, mask
,
1487 src
[0], undef
, undef
, flags
);
1490 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1491 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1492 src
[2] = t_src(cs
, fpi
->SrcReg
[2]);
1493 emit_arith(cs
, PFS_OP_MAD
, dest
, mask
,
1494 src
[0], src
[1], src
[2], flags
);
1497 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1498 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1499 emit_arith(cs
, PFS_OP_MAX
, dest
, mask
,
1500 src
[0], src
[1], undef
, flags
);
1503 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1504 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1505 emit_arith(cs
, PFS_OP_MIN
, dest
, mask
,
1506 src
[0], src
[1], undef
, flags
);
1509 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1510 emit_arith(cs
, PFS_OP_MAD
, dest
, mask
,
1511 src
[0], pfs_one
, pfs_zero
, flags
);
1514 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1515 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1516 emit_arith(cs
, PFS_OP_MAD
, dest
, mask
,
1517 src
[0], src
[1], pfs_zero
, flags
);
1520 src
[0] = t_scalar_src(cs
, fpi
->SrcReg
[0]);
1521 emit_arith(cs
, PFS_OP_RCP
, dest
, mask
,
1522 src
[0], undef
, undef
, flags
);
1525 src
[0] = t_scalar_src(cs
, fpi
->SrcReg
[0]);
1526 emit_arith(cs
, PFS_OP_RSQ
, dest
, mask
,
1527 absolute(src
[0]), pfs_zero
, pfs_zero
, flags
);
1530 emit_tex(cs
, fpi
, R300_TEX_OP_LD
);
1533 emit_tex(cs
, fpi
, R300_TEX_OP_TXB
);
1536 emit_tex(cs
, fpi
, R300_TEX_OP_TXP
);
1539 ERROR("unknown fpi->Opcode %d\n", fpi
->Opcode
);
1544 static GLboolean
parse_program(struct r300_pfs_compile_state
*cs
)
1547 struct prog_instruction
* fpi
;
1549 for(fpi
= cs
->compiler
->program
->Instructions
; fpi
->Opcode
!= OPCODE_END
; ++fpi
) {
1550 emit_instruction(cs
, fpi
);
1560 /* - Init structures
1561 * - Determine what hwregs each input corresponds to
1563 static void init_program(struct r300_pfs_compile_state
*cs
)
1566 struct gl_fragment_program
*mp
= &fp
->mesa_program
;
1567 GLuint InputsRead
= mp
->Base
.InputsRead
;
1568 GLuint temps_used
= 0; /* for fp->temps[] */
1571 /* New compile, reset tracking data */
1573 driQueryOptioni(&cs
->compiler
->r300
->radeon
.optionCache
, "fp_optimization");
1574 fp
->translated
= GL_FALSE
;
1575 fp
->error
= GL_FALSE
;
1576 fp
->WritesDepth
= GL_FALSE
;
1577 code
->tex
.length
= 0;
1579 code
->first_node_has_tex
= 0;
1581 code
->max_temp_idx
= 0;
1582 code
->node
[0].alu_end
= -1;
1583 code
->node
[0].tex_end
= -1;
1585 for (i
= 0; i
< PFS_MAX_ALU_INST
; i
++) {
1586 for (j
= 0; j
< 3; j
++) {
1587 cs
->slot
[i
].vsrc
[j
] = SRC_CONST
;
1588 cs
->slot
[i
].ssrc
[j
] = SRC_CONST
;
1592 /* Work out what temps the Mesa inputs correspond to, this must match
1593 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
1594 * configures itself based on the fragprog's InputsRead
1596 * NOTE: this depends on get_hw_temp() allocating registers in order,
1597 * starting from register 0.
1600 /* Texcoords come first */
1601 for (i
= 0; i
< cs
->compiler
->r300
->radeon
.glCtx
->Const
.MaxTextureUnits
; i
++) {
1602 if (InputsRead
& (FRAG_BIT_TEX0
<< i
)) {
1603 cs
->inputs
[FRAG_ATTRIB_TEX0
+ i
].refcount
= 0;
1604 cs
->inputs
[FRAG_ATTRIB_TEX0
+ i
].reg
=
1608 InputsRead
&= ~FRAG_BITS_TEX_ANY
;
1610 /* fragment position treated as a texcoord */
1611 if (InputsRead
& FRAG_BIT_WPOS
) {
1612 cs
->inputs
[FRAG_ATTRIB_WPOS
].refcount
= 0;
1613 cs
->inputs
[FRAG_ATTRIB_WPOS
].reg
= get_hw_temp(cs
, 0);
1615 InputsRead
&= ~FRAG_BIT_WPOS
;
1617 /* Then primary colour */
1618 if (InputsRead
& FRAG_BIT_COL0
) {
1619 cs
->inputs
[FRAG_ATTRIB_COL0
].refcount
= 0;
1620 cs
->inputs
[FRAG_ATTRIB_COL0
].reg
= get_hw_temp(cs
, 0);
1622 InputsRead
&= ~FRAG_BIT_COL0
;
1624 /* Secondary color */
1625 if (InputsRead
& FRAG_BIT_COL1
) {
1626 cs
->inputs
[FRAG_ATTRIB_COL1
].refcount
= 0;
1627 cs
->inputs
[FRAG_ATTRIB_COL1
].reg
= get_hw_temp(cs
, 0);
1629 InputsRead
&= ~FRAG_BIT_COL1
;
1633 WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead
);
1634 /* force read from hwreg 0 for now */
1635 for (i
= 0; i
< 32; i
++)
1636 if (InputsRead
& (1 << i
))
1637 cs
->inputs
[i
].reg
= 0;
1640 /* Pre-parse the program, grabbing refcounts on input/temp regs.
1641 * That way, we can free up the reg when it's no longer needed
1643 for (i
= 0; i
< cs
->compiler
->program
->NumInstructions
; ++i
) {
1644 struct prog_instruction
*fpi
= cs
->compiler
->program
->Instructions
+ i
;
1647 for (j
= 0; j
< 3; j
++) {
1648 idx
= fpi
->SrcReg
[j
].Index
;
1649 switch (fpi
->SrcReg
[j
].File
) {
1650 case PROGRAM_TEMPORARY
:
1651 if (!(temps_used
& (1 << idx
))) {
1652 cs
->temps
[idx
].reg
= -1;
1653 cs
->temps
[idx
].refcount
= 1;
1654 temps_used
|= (1 << idx
);
1656 cs
->temps
[idx
].refcount
++;
1659 cs
->inputs
[idx
].refcount
++;
1666 idx
= fpi
->DstReg
.Index
;
1667 if (fpi
->DstReg
.File
== PROGRAM_TEMPORARY
) {
1668 if (!(temps_used
& (1 << idx
))) {
1669 cs
->temps
[idx
].reg
= -1;
1670 cs
->temps
[idx
].refcount
= 1;
1671 temps_used
|= (1 << idx
);
1673 cs
->temps
[idx
].refcount
++;
1676 cs
->temp_in_use
= temps_used
;
1681 * Final compilation step: Turn the intermediate radeon_program into
1682 * machine-readable instructions.
1684 GLboolean
r300FragmentProgramEmit(struct r300_fragment_program_compiler
*compiler
)
1686 struct r300_pfs_compile_state cs
;
1687 struct r300_fragment_program_code
*code
= compiler
->code
;
1689 _mesa_memset(&cs
, 0, sizeof(cs
));
1690 cs
.compiler
= compiler
;
1693 if (!parse_program(&cs
))
1697 code
->node
[code
->cur_node
].alu_end
=
1698 cs
.nrslots
- code
->node
[code
->cur_node
].alu_offset
- 1;
1699 if (code
->node
[code
->cur_node
].tex_end
< 0)
1700 code
->node
[code
->cur_node
].tex_end
= 0;
1701 code
->alu_offset
= 0;
1702 code
->alu_end
= cs
.nrslots
- 1;
1703 code
->tex_offset
= 0;
1704 code
->tex_end
= code
->tex
.length
? code
->tex
.length
- 1 : 0;
1705 assert(code
->node
[code
->cur_node
].alu_end
>= 0);
1706 assert(code
->alu_end
>= 0);