2 * Copyright (C) 2005 Ben Skeggs.
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 * \author Ben Skeggs <darktama@iinet.net.au>
33 * \author Jerome Glisse <j.glisse@gmail.com>
35 * \todo Depth write, WPOS/FOGC inputs
39 * \todo Verify results of opcodes for accuracy, I've only checked them in
46 #include "shader/prog_instruction.h"
47 #include "shader/prog_parameter.h"
48 #include "shader/prog_print.h"
50 #include "r300_context.h"
51 #include "r300_fragprog.h"
53 #include "r300_state.h"
56 * Useful macros and values
58 #define ERROR(fmt, args...) do { \
59 fprintf(stderr, "%s::%s(): " fmt "\n", \
60 __FILE__, __FUNCTION__, ##args); \
61 fp->error = GL_TRUE; \
64 #define PFS_INVAL 0xFFFFFFFF
65 #define COMPILE_STATE struct r300_pfs_compile_state *cs = fp->cs
77 #define SWIZZLE_HHH 10
79 #define swizzle(r, x, y, z, w) do_swizzle(fp, r, \
/*
 * Packed register descriptor.
 *
 * A register reference is carried around as a single GLuint with the
 * following bit layout (derived from the shift/mask pairs below):
 *
 *   bits  0..1   register type (REG_TYPE_*)
 *   bits  2..7   register index
 *   bits  8..12  vector swizzle (used as an index into v_swiz[])
 *   bits 13..17  scalar swizzle (used as an index into s_swiz[])
 *   bit  18      negate vector part
 *   bit  19      negate scalar part
 *   bit  20      absolute value
 *   bit  21      "no use" flag (refcounting hack)
 *   bit  22      register holds a defined value
 *   bit  23      register is a builtin (like all zero/all one)
 *
 * Every macro parameter is parenthesized so that expression arguments
 * (e.g. "a | b" or "cond ? 1 : 0") are evaluated as a unit.  Note that
 * the REG_SET_* and flag macros still evaluate "reg" twice, so "reg"
 * must be a side-effect-free lvalue.
 */

/* Register types */
#define REG_TYPE_INPUT		0
#define REG_TYPE_OUTPUT		1
#define REG_TYPE_TEMP		2
#define REG_TYPE_CONST		3

/* Bit position of each field */
#define REG_TYPE_SHIFT		0
#define REG_INDEX_SHIFT		2
#define REG_VSWZ_SHIFT		8
#define REG_SSWZ_SHIFT		13
#define REG_NEGV_SHIFT		18
#define REG_NEGS_SHIFT		19
#define REG_ABS_SHIFT		20
#define REG_NO_USE_SHIFT	21 // Hack for refcounting
#define REG_VALID_SHIFT		22 // Does the register contain a defined value?
#define REG_BUILTIN_SHIFT	23 // Is it a builtin (like all zero/all one)?

/* In-place mask of each field */
#define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
#define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
#define REG_VSWZ_MASK		(0x1F << REG_VSWZ_SHIFT)
#define REG_SSWZ_MASK		(0x1F << REG_SSWZ_SHIFT)
#define REG_NEGV_MASK		(0x01 << REG_NEGV_SHIFT)
#define REG_NEGS_MASK		(0x01 << REG_NEGS_SHIFT)
#define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
#define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
#define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)
#define REG_BUILTIN_MASK	(0x01 << REG_BUILTIN_SHIFT)

/*
 * Build a packed register value from its fields.  Each field is
 * truncated to its mask.  The negate/abs bits are not parameters and
 * are therefore left cleared.  The expansion is a constant expression
 * when all arguments are constants.
 */
#define REG(type, index, vswz, sswz, nouse, valid, builtin)		\
	((((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK) |			\
	 (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK) |		\
	 (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
	 (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK) |		\
	 (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |	\
	 (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |			\
	 (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))

/* Field extraction: yields the field value shifted down to bit 0. */
#define REG_GET_TYPE(reg)						\
	(((reg) & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
#define REG_GET_INDEX(reg)						\
	(((reg) & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
#define REG_GET_VSWZ(reg)						\
	(((reg) & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
#define REG_GET_SSWZ(reg)						\
	(((reg) & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
#define REG_GET_NO_USE(reg)						\
	(((reg) & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
#define REG_GET_VALID(reg)						\
	(((reg) & REG_VALID_MASK) >> REG_VALID_SHIFT)
#define REG_GET_BUILTIN(reg)						\
	(((reg) & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)

/*
 * Field update: assigns to "reg" a copy of itself with one field
 * replaced (the new value is truncated to the field's mask); all other
 * bits are preserved.
 */
#define REG_SET_TYPE(reg, type)						\
	((reg) = (((reg) & ~REG_TYPE_MASK) |				\
		  (((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK)))
#define REG_SET_INDEX(reg, index)					\
	((reg) = (((reg) & ~REG_INDEX_MASK) |				\
		  (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK)))
#define REG_SET_VSWZ(reg, vswz)						\
	((reg) = (((reg) & ~REG_VSWZ_MASK) |				\
		  (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK)))
#define REG_SET_SSWZ(reg, sswz)						\
	((reg) = (((reg) & ~REG_SSWZ_MASK) |				\
		  (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK)))
#define REG_SET_NO_USE(reg, nouse)					\
	((reg) = (((reg) & ~REG_NO_USE_MASK) |				\
		  (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK)))
#define REG_SET_VALID(reg, valid)					\
	((reg) = (((reg) & ~REG_VALID_MASK) |				\
		  (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK)))
#define REG_SET_BUILTIN(reg, builtin)					\
	((reg) = (((reg) & ~REG_BUILTIN_MASK) |				\
		  (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK)))

/* Flag setters: OR the corresponding modifier bit into "reg". */
#define REG_ABS(reg)							\
	((reg) = ((reg) | REG_ABS_MASK))
#define REG_NEGV(reg)							\
	((reg) = ((reg) | REG_NEGV_MASK))
#define REG_NEGS(reg)							\
	((reg) = ((reg) | REG_NEGS_MASK))
164 * Data structures for fragment program generation
167 /* description of r300 native hw instructions */
168 static const struct {
175 {"MAD", 3, R300_FPI0_OUTC_MAD
, R300_FPI2_OUTA_MAD
},
176 {"DP3", 2, R300_FPI0_OUTC_DP3
, R300_FPI2_OUTA_DP4
},
177 {"DP4", 2, R300_FPI0_OUTC_DP4
, R300_FPI2_OUTA_DP4
},
178 {"MIN", 2, R300_FPI0_OUTC_MIN
, R300_FPI2_OUTA_MIN
},
179 {"MAX", 2, R300_FPI0_OUTC_MAX
, R300_FPI2_OUTA_MAX
},
180 {"CMP", 3, R300_FPI0_OUTC_CMP
, R300_FPI2_OUTA_CMP
},
181 {"FRC", 1, R300_FPI0_OUTC_FRC
, R300_FPI2_OUTA_FRC
},
182 {"EX2", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_EX2
},
183 {"LG2", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_LG2
},
184 {"RCP", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_RCP
},
185 {"RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_RSQ
},
186 {"REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA
, PFS_INVAL
},
187 {"CMPH", 3, R300_FPI0_OUTC_CMPH
, PFS_INVAL
},
191 /* vector swizzles r300 can support natively, with a couple of
192 * cases we handle specially
194 * REG_VSWZ/REG_SSWZ is an index into this table
197 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
198 #define SWIZZLE_HALF 6
200 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
204 /* native swizzles */
205 static const struct r300_pfs_swizzle
{
206 GLuint hash
; /* swizzle value this matches */
207 GLuint base
; /* base value for hw swizzle */
208 GLuint stride
; /* difference in base between arg0/1/2 */
212 {MAKE_SWZ3(X
, Y
, Z
), R300_FPI0_ARGC_SRC0C_XYZ
, 4, SLOT_SRC_VECTOR
},
213 {MAKE_SWZ3(X
, X
, X
), R300_FPI0_ARGC_SRC0C_XXX
, 4, SLOT_SRC_VECTOR
},
214 {MAKE_SWZ3(Y
, Y
, Y
), R300_FPI0_ARGC_SRC0C_YYY
, 4, SLOT_SRC_VECTOR
},
215 {MAKE_SWZ3(Z
, Z
, Z
), R300_FPI0_ARGC_SRC0C_ZZZ
, 4, SLOT_SRC_VECTOR
},
216 {MAKE_SWZ3(W
, W
, W
), R300_FPI0_ARGC_SRC0A
, 1, SLOT_SRC_SCALAR
},
217 {MAKE_SWZ3(Y
, Z
, X
), R300_FPI0_ARGC_SRC0C_YZX
, 1, SLOT_SRC_VECTOR
},
218 {MAKE_SWZ3(Z
, X
, Y
), R300_FPI0_ARGC_SRC0C_ZXY
, 1, SLOT_SRC_VECTOR
},
219 {MAKE_SWZ3(W
, Z
, Y
), R300_FPI0_ARGC_SRC0CA_WZY
, 1, SLOT_SRC_BOTH
},
220 {MAKE_SWZ3(ONE
, ONE
, ONE
), R300_FPI0_ARGC_ONE
, 0, 0},
221 {MAKE_SWZ3(ZERO
, ZERO
, ZERO
), R300_FPI0_ARGC_ZERO
, 0, 0},
222 {MAKE_SWZ3(HALF
, HALF
, HALF
), R300_FPI0_ARGC_HALF
, 0, 0},
223 {PFS_INVAL
, 0, 0, 0},
227 /* used during matching of non-native swizzles */
228 #define SWZ_X_MASK (7 << 0)
229 #define SWZ_Y_MASK (7 << 3)
230 #define SWZ_Z_MASK (7 << 6)
231 #define SWZ_W_MASK (7 << 9)
232 static const struct {
233 GLuint hash
; /* used to mask matching swizzle components */
234 int mask
; /* actual outmask */
235 int count
; /* count of components matched */
238 {SWZ_X_MASK
| SWZ_Y_MASK
| SWZ_Z_MASK
, 1 | 2 | 4, 3},
239 {SWZ_X_MASK
| SWZ_Y_MASK
, 1 | 2, 2},
240 {SWZ_X_MASK
| SWZ_Z_MASK
, 1 | 4, 2},
241 {SWZ_Y_MASK
| SWZ_Z_MASK
, 2 | 4, 2},
245 {PFS_INVAL
, PFS_INVAL
, PFS_INVAL
}
249 static const struct {
250 int base
; /* hw value of swizzle */
251 int stride
; /* difference between SRC0/1/2 */
255 {R300_FPI2_ARGA_SRC0C_X
, 3, SLOT_SRC_VECTOR
},
256 {R300_FPI2_ARGA_SRC0C_Y
, 3, SLOT_SRC_VECTOR
},
257 {R300_FPI2_ARGA_SRC0C_Z
, 3, SLOT_SRC_VECTOR
},
258 {R300_FPI2_ARGA_SRC0A
, 1, SLOT_SRC_SCALAR
},
259 {R300_FPI2_ARGA_ZERO
, 0, 0},
260 {R300_FPI2_ARGA_ONE
, 0, 0},
261 {R300_FPI2_ARGA_HALF
, 0, 0}
265 /* boiler-plate reg, for convenience */
266 static const GLuint undef
= REG(REG_TYPE_TEMP
,
274 /* constant one source */
275 static const GLuint pfs_one
= REG(REG_TYPE_CONST
,
283 /* constant half source */
284 static const GLuint pfs_half
= REG(REG_TYPE_CONST
,
292 /* constant zero source */
293 static const GLuint pfs_zero
= REG(REG_TYPE_CONST
,
302 * Common function prototypes
304 static void dump_program(struct r300_fragment_program
*fp
);
305 static void emit_arith(struct r300_fragment_program
*fp
, int op
,
306 GLuint dest
, int mask
,
307 GLuint src0
, GLuint src1
, GLuint src2
, int flags
);
310 * Get an R300 temporary that can be written to in the given slot.
312 static int get_hw_temp(struct r300_fragment_program
*fp
, int slot
)
317 for (r
= 0; r
< PFS_NUM_TEMP_REGS
; ++r
) {
318 if (cs
->hwtemps
[r
].free
>= 0 && cs
->hwtemps
[r
].free
<= slot
)
322 if (r
>= PFS_NUM_TEMP_REGS
) {
323 ERROR("Out of hardware temps\n");
326 // Reserved is used to avoid the following scenario:
327 // R300 temporary X is first assigned to Mesa temporary Y during vector ops
328 // R300 temporary X is then assigned to Mesa temporary Z for further vector ops
329 // Then scalar ops on Mesa temporary Z are emitted and move back in time
330 // to overwrite the value of temporary Y.
332 cs
->hwtemps
[r
].reserved
= cs
->hwtemps
[r
].free
;
333 cs
->hwtemps
[r
].free
= -1;
335 // Reset to some value that won't mess things up when the user
336 // tries to read from a temporary that hasn't been assigned a value yet.
337 // In the normal case, vector_valid and scalar_valid should be set to
338 // a sane value by the first emit that writes to this temporary.
339 cs
->hwtemps
[r
].vector_valid
= 0;
340 cs
->hwtemps
[r
].scalar_valid
= 0;
342 if (r
> fp
->max_temp_idx
)
343 fp
->max_temp_idx
= r
;
349 * Get an R300 temporary that will act as a TEX destination register.
351 static int get_hw_temp_tex(struct r300_fragment_program
*fp
)
356 for (r
= 0; r
< PFS_NUM_TEMP_REGS
; ++r
) {
357 if (cs
->used_in_node
& (1 << r
))
360 // Note: Be very careful here
361 if (cs
->hwtemps
[r
].free
>= 0 && cs
->hwtemps
[r
].free
<= 0)
365 if (r
>= PFS_NUM_TEMP_REGS
)
366 return get_hw_temp(fp
, 0); /* Will cause an indirection */
368 cs
->hwtemps
[r
].reserved
= cs
->hwtemps
[r
].free
;
369 cs
->hwtemps
[r
].free
= -1;
371 // Reset to some value that won't mess things up when the user
372 // tries to read from a temporary that hasn't been assigned a value yet.
373 // In the normal case, vector_valid and scalar_valid should be set to
374 // a sane value by the first emit that writes to this temporary.
375 cs
->hwtemps
[r
].vector_valid
= cs
->nrslots
;
376 cs
->hwtemps
[r
].scalar_valid
= cs
->nrslots
;
378 if (r
> fp
->max_temp_idx
)
379 fp
->max_temp_idx
= r
;
385 * Mark the given hardware register as free.
387 static void free_hw_temp(struct r300_fragment_program
*fp
, int idx
)
391 // Be very careful here. Consider sequences like
394 // The TEX instruction may be moved in front of the MAD instruction
395 // due to the way nodes work. We don't want to alias r1 and r4 in
397 // I'm certain the register allocation could be further sanitized,
398 // but it's tricky because of stuff that can happen inside emit_tex
400 cs
->hwtemps
[idx
].free
= cs
->nrslots
+ 1;
404 * Create a new Mesa temporary register.
406 static GLuint
get_temp_reg(struct r300_fragment_program
*fp
)
412 index
= ffs(~cs
->temp_in_use
);
414 ERROR("Out of program temps\n");
418 cs
->temp_in_use
|= (1 << --index
);
419 cs
->temps
[index
].refcount
= 0xFFFFFFFF;
420 cs
->temps
[index
].reg
= -1;
422 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
423 REG_SET_INDEX(r
, index
);
424 REG_SET_VALID(r
, GL_TRUE
);
429 * Create a new Mesa temporary register that will act as the destination
430 * register for a texture read.
432 static GLuint
get_temp_reg_tex(struct r300_fragment_program
*fp
)
438 index
= ffs(~cs
->temp_in_use
);
440 ERROR("Out of program temps\n");
444 cs
->temp_in_use
|= (1 << --index
);
445 cs
->temps
[index
].refcount
= 0xFFFFFFFF;
446 cs
->temps
[index
].reg
= get_hw_temp_tex(fp
);
448 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
449 REG_SET_INDEX(r
, index
);
450 REG_SET_VALID(r
, GL_TRUE
);
455 * Free a Mesa temporary and the associated R300 temporary.
457 static void free_temp(struct r300_fragment_program
*fp
, GLuint r
)
460 GLuint index
= REG_GET_INDEX(r
);
462 if (!(cs
->temp_in_use
& (1 << index
)))
465 if (REG_GET_TYPE(r
) == REG_TYPE_TEMP
) {
466 free_hw_temp(fp
, cs
->temps
[index
].reg
);
467 cs
->temps
[index
].reg
= -1;
468 cs
->temp_in_use
&= ~(1 << index
);
469 } else if (REG_GET_TYPE(r
) == REG_TYPE_INPUT
) {
470 free_hw_temp(fp
, cs
->inputs
[index
].reg
);
471 cs
->inputs
[index
].reg
= -1;
476 * Emit a hardware constant/parameter.
478 * \p cp Stable pointer to an array of 4 floats.
479 * The pointer must be stable in the sense that it remains valid
480 * and hold the contents of the constant/parameter throughout the lifetime
481 * of the fragment program (actually, up until the next time the fragment
482 * program is translated).
484 static GLuint
emit_const4fv(struct r300_fragment_program
*fp
,
490 for (index
= 0; index
< fp
->const_nr
; ++index
) {
491 if (fp
->constant
[index
] == cp
)
495 if (index
>= fp
->const_nr
) {
496 if (index
>= PFS_NUM_CONST_REGS
) {
497 ERROR("Out of hw constants!\n");
502 fp
->constant
[index
] = cp
;
505 REG_SET_TYPE(reg
, REG_TYPE_CONST
);
506 REG_SET_INDEX(reg
, index
);
507 REG_SET_VALID(reg
, GL_TRUE
);
511 static inline GLuint
negate(GLuint r
)
518 /* Hack, to prevent clobbering sources used multiple times when
519 * emulating non-native instructions
521 static inline GLuint
keep(GLuint r
)
523 REG_SET_NO_USE(r
, GL_TRUE
);
527 static inline GLuint
absolute(GLuint r
)
533 static int swz_native(struct r300_fragment_program
*fp
,
534 GLuint src
, GLuint
* r
, GLuint arbneg
)
536 /* Native swizzle, handle negation */
537 src
= (src
& ~REG_NEGS_MASK
) | (((arbneg
>> 3) & 1) << REG_NEGS_SHIFT
);
539 if ((arbneg
& 0x7) == 0x0) {
540 src
= src
& ~REG_NEGV_MASK
;
542 } else if ((arbneg
& 0x7) == 0x7) {
543 src
|= REG_NEGV_MASK
;
546 if (!REG_GET_VALID(*r
))
547 *r
= get_temp_reg(fp
);
548 src
|= REG_NEGV_MASK
;
551 *r
, arbneg
& 0x7, keep(src
), pfs_one
, pfs_zero
, 0);
552 src
= src
& ~REG_NEGV_MASK
;
556 (arbneg
^ 0x7) | WRITEMASK_W
,
557 src
, pfs_one
, pfs_zero
, 0);
563 static int swz_emit_partial(struct r300_fragment_program
*fp
,
565 GLuint
* r
, int mask
, int mc
, GLuint arbneg
)
570 if (!REG_GET_VALID(*r
))
571 *r
= get_temp_reg(fp
);
573 /* A partial match, VSWZ/mask define what parts of the
574 * desired swizzle we match
576 if (mc
+ s_mask
[mask
].count
== 3) {
578 src
|= ((arbneg
>> 3) & 1) << REG_NEGS_SHIFT
;
581 tmp
= arbneg
& s_mask
[mask
].mask
;
583 tmp
= tmp
^ s_mask
[mask
].mask
;
588 arbneg
& s_mask
[mask
].mask
,
589 keep(src
) | REG_NEGV_MASK
,
590 pfs_one
, pfs_zero
, 0);
592 REG_SET_NO_USE(src
, GL_TRUE
);
594 REG_SET_NO_USE(src
, GL_FALSE
);
598 *r
, tmp
| wmask
, src
, pfs_one
, pfs_zero
, 0);
601 REG_SET_NO_USE(src
, GL_TRUE
);
603 REG_SET_NO_USE(src
, GL_FALSE
);
608 (arbneg
& s_mask
[mask
].mask
) | wmask
,
609 src
| REG_NEGV_MASK
, pfs_one
, pfs_zero
, 0);
613 REG_SET_NO_USE(src
, GL_TRUE
);
615 REG_SET_NO_USE(src
, GL_FALSE
);
617 emit_arith(fp
, PFS_OP_MAD
,
619 s_mask
[mask
].mask
| wmask
,
620 src
, pfs_one
, pfs_zero
, 0);
623 return s_mask
[mask
].count
;
626 static GLuint
do_swizzle(struct r300_fragment_program
*fp
,
627 GLuint src
, GLuint arbswz
, GLuint arbneg
)
634 /* If swizzling from something without an XYZW native swizzle,
635 * emit result to a temp, and do new swizzle from the temp.
638 if (REG_GET_VSWZ(src
) != SWIZZLE_XYZ
|| REG_GET_SSWZ(src
) != SWIZZLE_W
) {
639 GLuint temp
= get_temp_reg(fp
);
642 temp
, WRITEMASK_XYZW
, src
, pfs_one
, pfs_zero
, 0);
647 if (REG_GET_VSWZ(src
) != SWIZZLE_XYZ
|| REG_GET_SSWZ(src
) != SWIZZLE_W
) {
649 (v_swiz
[REG_GET_VSWZ(src
)].
650 hash
& (SWZ_X_MASK
| SWZ_Y_MASK
| SWZ_Z_MASK
)) |
651 REG_GET_SSWZ(src
) << 9;
656 for (i
= 0; i
< 4; ++i
) {
657 offset
= GET_SWZ(arbswz
, i
);
660 (offset
<= 3) ? GET_SWZ(vsrcswz
,
665 arbswz
= newswz
& (SWZ_X_MASK
| SWZ_Y_MASK
| SWZ_Z_MASK
);
666 REG_SET_SSWZ(src
, GET_SWZ(newswz
, 3));
668 /* set scalar swizzling */
669 REG_SET_SSWZ(src
, GET_SWZ(arbswz
, 3));
673 vswz
= REG_GET_VSWZ(src
);
677 REG_SET_VSWZ(src
, vswz
);
678 chash
= v_swiz
[REG_GET_VSWZ(src
)].hash
&
681 if (chash
== (arbswz
& s_mask
[c_mask
].hash
)) {
682 if (s_mask
[c_mask
].count
== 3) {
683 v_match
+= swz_native(fp
,
686 v_match
+= swz_emit_partial(fp
,
697 /* Fill with something invalid.. all 0's was
698 * wrong before, matched SWIZZLE_X. So all
699 * 1's will be okay for now
701 arbswz
|= (PFS_INVAL
& s_mask
[c_mask
].hash
);
703 } while (v_swiz
[++vswz
].hash
!= PFS_INVAL
);
704 REG_SET_VSWZ(src
, SWIZZLE_XYZ
);
705 } while (s_mask
[++c_mask
].hash
!= PFS_INVAL
);
707 ERROR("should NEVER get here\n");
711 static GLuint
t_src(struct r300_fragment_program
*fp
,
712 struct prog_src_register fpsrc
)
716 switch (fpsrc
.File
) {
717 case PROGRAM_TEMPORARY
:
718 REG_SET_INDEX(r
, fpsrc
.Index
);
719 REG_SET_VALID(r
, GL_TRUE
);
720 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
723 REG_SET_INDEX(r
, fpsrc
.Index
);
724 REG_SET_VALID(r
, GL_TRUE
);
725 REG_SET_TYPE(r
, REG_TYPE_INPUT
);
727 case PROGRAM_LOCAL_PARAM
:
728 r
= emit_const4fv(fp
,
729 fp
->mesa_program
.Base
.LocalParams
[fpsrc
.
732 case PROGRAM_ENV_PARAM
:
733 r
= emit_const4fv(fp
,
734 fp
->ctx
->FragmentProgram
.Parameters
[fpsrc
.
737 case PROGRAM_STATE_VAR
:
738 case PROGRAM_NAMED_PARAM
:
739 r
= emit_const4fv(fp
,
740 fp
->mesa_program
.Base
.Parameters
->
741 ParameterValues
[fpsrc
.Index
]);
744 ERROR("unknown SrcReg->File %x\n", fpsrc
.File
);
748 /* no point swizzling ONE/ZERO/HALF constants... */
749 if (REG_GET_VSWZ(r
) < SWIZZLE_111
|| REG_GET_SSWZ(r
) < SWIZZLE_ZERO
)
750 r
= do_swizzle(fp
, r
, fpsrc
.Swizzle
, fpsrc
.NegateBase
);
754 static GLuint
t_scalar_src(struct r300_fragment_program
*fp
,
755 struct prog_src_register fpsrc
)
757 struct prog_src_register src
= fpsrc
;
758 int sc
= GET_SWZ(fpsrc
.Swizzle
, 0); /* X */
760 src
.Swizzle
= ((sc
<< 0) | (sc
<< 3) | (sc
<< 6) | (sc
<< 9));
762 return t_src(fp
, src
);
765 static GLuint
t_dst(struct r300_fragment_program
*fp
,
766 struct prog_dst_register dest
)
771 case PROGRAM_TEMPORARY
:
772 REG_SET_INDEX(r
, dest
.Index
);
773 REG_SET_VALID(r
, GL_TRUE
);
774 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
777 REG_SET_TYPE(r
, REG_TYPE_OUTPUT
);
778 switch (dest
.Index
) {
779 case FRAG_RESULT_COLR
:
780 case FRAG_RESULT_DEPR
:
781 REG_SET_INDEX(r
, dest
.Index
);
782 REG_SET_VALID(r
, GL_TRUE
);
785 ERROR("Bad DstReg->Index 0x%x\n", dest
.Index
);
789 ERROR("Bad DstReg->File 0x%x\n", dest
.File
);
794 static int t_hw_src(struct r300_fragment_program
*fp
, GLuint src
, GLboolean tex
)
798 int index
= REG_GET_INDEX(src
);
800 switch (REG_GET_TYPE(src
)) {
802 /* NOTE: if reg==-1 here, a source is being read that
803 * hasn't been written to. Undefined results.
805 if (cs
->temps
[index
].reg
== -1)
806 cs
->temps
[index
].reg
= get_hw_temp(fp
, cs
->nrslots
);
808 idx
= cs
->temps
[index
].reg
;
810 if (!REG_GET_NO_USE(src
) && (--cs
->temps
[index
].refcount
== 0))
814 idx
= cs
->inputs
[index
].reg
;
816 if (!REG_GET_NO_USE(src
) && (--cs
->inputs
[index
].refcount
== 0))
817 free_hw_temp(fp
, cs
->inputs
[index
].reg
);
820 return (index
| SRC_CONST
);
822 ERROR("Invalid type for source reg\n");
823 return (0 | SRC_CONST
);
827 cs
->used_in_node
|= (1 << idx
);
832 static int t_hw_dst(struct r300_fragment_program
*fp
,
833 GLuint dest
, GLboolean tex
, int slot
)
837 GLuint index
= REG_GET_INDEX(dest
);
838 assert(REG_GET_VALID(dest
));
840 switch (REG_GET_TYPE(dest
)) {
842 if (cs
->temps
[REG_GET_INDEX(dest
)].reg
== -1) {
844 cs
->temps
[index
].reg
= get_hw_temp(fp
, slot
);
846 cs
->temps
[index
].reg
= get_hw_temp_tex(fp
);
849 idx
= cs
->temps
[index
].reg
;
851 if (!REG_GET_NO_USE(dest
) && (--cs
->temps
[index
].refcount
== 0))
854 cs
->dest_in_node
|= (1 << idx
);
855 cs
->used_in_node
|= (1 << idx
);
857 case REG_TYPE_OUTPUT
:
859 case FRAG_RESULT_COLR
:
860 fp
->node
[fp
->cur_node
].flags
|=
861 R300_PFS_NODE_OUTPUT_COLOR
;
863 case FRAG_RESULT_DEPR
:
864 fp
->node
[fp
->cur_node
].flags
|=
865 R300_PFS_NODE_OUTPUT_DEPTH
;
871 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest
));
878 static void emit_nop(struct r300_fragment_program
*fp
)
882 if (cs
->nrslots
>= PFS_MAX_ALU_INST
) {
883 ERROR("Out of ALU instruction slots\n");
887 fp
->alu
.inst
[cs
->nrslots
].inst0
= NOP_INST0
;
888 fp
->alu
.inst
[cs
->nrslots
].inst1
= NOP_INST1
;
889 fp
->alu
.inst
[cs
->nrslots
].inst2
= NOP_INST2
;
890 fp
->alu
.inst
[cs
->nrslots
].inst3
= NOP_INST3
;
894 static void emit_tex(struct r300_fragment_program
*fp
,
895 struct prog_instruction
*fpi
, int opcode
)
898 GLuint coord
= t_src(fp
, fpi
->SrcReg
[0]);
899 GLuint dest
= undef
, rdest
= undef
;
901 int unit
= fpi
->TexSrcUnit
;
905 uin
= cs
->used_in_node
;
906 din
= cs
->dest_in_node
;
908 /* Resolve source/dest to hardware registers */
909 if (opcode
!= R300_FPITX_OP_KIL
) {
910 if (fpi
->TexSrcTarget
== TEXTURE_RECT_INDEX
) {
912 * Hardware uses [0..1]x[0..1] range for rectangle textures
913 * instead of [0..Width]x[0..Height].
914 * Add a scaling instruction.
916 * \todo Refactor this once we have proper rewriting/optimization
917 * support for programs.
919 gl_state_index tokens
[STATE_LENGTH
] = {
920 STATE_INTERNAL
, STATE_R300_TEXRECT_FACTOR
, 0, 0,
928 _mesa_add_state_reference(fp
->mesa_program
.Base
.
932 fp
->mesa_program
.Base
.Parameters
->
933 ParameterValues
[factor_index
]);
934 tempreg
= keep(get_temp_reg(fp
));
936 emit_arith(fp
, PFS_OP_MAD
, tempreg
, WRITEMASK_XYZW
,
937 coord
, factorreg
, pfs_zero
, 0);
939 /* Ensure correct node indirection */
940 uin
= cs
->used_in_node
;
941 din
= cs
->dest_in_node
;
943 hwsrc
= t_hw_src(fp
, tempreg
, GL_TRUE
);
945 hwsrc
= t_hw_src(fp
, coord
, GL_TRUE
);
948 dest
= t_dst(fp
, fpi
->DstReg
);
950 /* r300 doesn't seem to be able to do TEX->output reg */
951 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
953 dest
= get_temp_reg_tex(fp
);
954 } else if (fpi
->DstReg
.WriteMask
!= WRITEMASK_XYZW
) {
955 /* in case write mask isn't XYZW */
957 dest
= get_temp_reg_tex(fp
);
960 t_hw_dst(fp
, dest
, GL_TRUE
,
961 fp
->node
[fp
->cur_node
].alu_offset
);
963 /* Use a temp that hasn't been used in this node, rather
964 * than causing an indirection
966 if (uin
& (1 << hwdest
)) {
967 free_hw_temp(fp
, hwdest
);
968 hwdest
= get_hw_temp_tex(fp
);
969 cs
->temps
[REG_GET_INDEX(dest
)].reg
= hwdest
;
974 hwsrc
= t_hw_src(fp
, coord
, GL_TRUE
);
977 /* Indirection if source has been written in this node, or if the
978 * dest has been read/written in this node
980 if ((REG_GET_TYPE(coord
) != REG_TYPE_CONST
&&
981 (din
& (1 << hwsrc
))) || (uin
& (1 << hwdest
))) {
983 /* Finish off current node */
984 if (fp
->node
[fp
->cur_node
].alu_offset
== cs
->nrslots
)
987 fp
->node
[fp
->cur_node
].alu_end
=
988 cs
->nrslots
- fp
->node
[fp
->cur_node
].alu_offset
- 1;
989 assert(fp
->node
[fp
->cur_node
].alu_end
>= 0);
991 if (++fp
->cur_node
>= PFS_MAX_TEX_INDIRECT
) {
992 ERROR("too many levels of texture indirection\n");
997 fp
->node
[fp
->cur_node
].tex_offset
= fp
->tex
.length
;
998 fp
->node
[fp
->cur_node
].alu_offset
= cs
->nrslots
;
999 fp
->node
[fp
->cur_node
].tex_end
= -1;
1000 fp
->node
[fp
->cur_node
].alu_end
= -1;
1001 fp
->node
[fp
->cur_node
].flags
= 0;
1002 cs
->used_in_node
= 0;
1003 cs
->dest_in_node
= 0;
1006 if (fp
->cur_node
== 0)
1007 fp
->first_node_has_tex
= 1;
1009 fp
->tex
.inst
[fp
->tex
.length
++] = 0 | (hwsrc
<< R300_FPITX_SRC_SHIFT
)
1010 | (hwdest
<< R300_FPITX_DST_SHIFT
)
1011 | (unit
<< R300_FPITX_IMAGE_SHIFT
)
1012 /* not entirely sure about this */
1013 | (opcode
<< R300_FPITX_OPCODE_SHIFT
);
1015 cs
->dest_in_node
|= (1 << hwdest
);
1016 if (REG_GET_TYPE(coord
) != REG_TYPE_CONST
)
1017 cs
->used_in_node
|= (1 << hwsrc
);
1019 fp
->node
[fp
->cur_node
].tex_end
++;
1021 /* Copy from temp to output if needed */
1022 if (REG_GET_VALID(rdest
)) {
1023 emit_arith(fp
, PFS_OP_MAD
, rdest
, fpi
->DstReg
.WriteMask
, dest
,
1024 pfs_one
, pfs_zero
, 0);
1025 free_temp(fp
, dest
);
1028 /* Free temp register */
1030 free_temp(fp
, tempreg
);
1034 * Returns the first slot where we could possibly allow writing to dest,
1035 * according to register allocation.
1037 static int get_earliest_allowed_write(struct r300_fragment_program
*fp
,
1038 GLuint dest
, int mask
)
1043 GLuint index
= REG_GET_INDEX(dest
);
1044 assert(REG_GET_VALID(dest
));
1046 switch (REG_GET_TYPE(dest
)) {
1048 if (cs
->temps
[index
].reg
== -1)
1051 idx
= cs
->temps
[index
].reg
;
1053 case REG_TYPE_OUTPUT
:
1056 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest
));
1060 pos
= cs
->hwtemps
[idx
].reserved
;
1061 if (mask
& WRITEMASK_XYZ
) {
1062 if (pos
< cs
->hwtemps
[idx
].vector_lastread
)
1063 pos
= cs
->hwtemps
[idx
].vector_lastread
;
1065 if (mask
& WRITEMASK_W
) {
1066 if (pos
< cs
->hwtemps
[idx
].scalar_lastread
)
1067 pos
= cs
->hwtemps
[idx
].scalar_lastread
;
1074 * Allocates a slot for an ALU instruction that can consist of
1075 * a vector part or a scalar part or both.
1077 * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1078 * appropriate position (vector and/or scalar), and their positions are
1079 * recorded in the srcpos array.
1081 * This function emits instruction code for the source fetch and the
1082 * argument selection. It does not emit instruction code for the
1083 * opcode or the destination selection.
1085 * @return the index of the slot
1087 static int find_and_prepare_slot(struct r300_fragment_program
*fp
,
1090 int argc
, GLuint
* src
, GLuint dest
, int mask
)
1103 // Determine instruction slots, whether sources are required on
1104 // vector or scalar side, and the smallest slot number where
1105 // all source registers are available
1108 used
|= SLOT_OP_VECTOR
;
1110 used
|= SLOT_OP_SCALAR
;
1112 pos
= get_earliest_allowed_write(fp
, dest
, mask
);
1114 if (fp
->node
[fp
->cur_node
].alu_offset
> pos
)
1115 pos
= fp
->node
[fp
->cur_node
].alu_offset
;
1116 for (i
= 0; i
< argc
; ++i
) {
1117 if (!REG_GET_BUILTIN(src
[i
])) {
1119 used
|= v_swiz
[REG_GET_VSWZ(src
[i
])].flags
<< i
;
1121 used
|= s_swiz
[REG_GET_SSWZ(src
[i
])].flags
<< i
;
1124 hwsrc
[i
] = t_hw_src(fp
, src
[i
], GL_FALSE
); /* Note: sideeffects wrt refcounting! */
1125 regnr
= hwsrc
[i
] & 31;
1127 if (REG_GET_TYPE(src
[i
]) == REG_TYPE_TEMP
) {
1128 if (used
& (SLOT_SRC_VECTOR
<< i
)) {
1129 if (cs
->hwtemps
[regnr
].vector_valid
> pos
)
1130 pos
= cs
->hwtemps
[regnr
].vector_valid
;
1132 if (used
& (SLOT_SRC_SCALAR
<< i
)) {
1133 if (cs
->hwtemps
[regnr
].scalar_valid
> pos
)
1134 pos
= cs
->hwtemps
[regnr
].scalar_valid
;
1139 // Find a slot that fits
1141 if (cs
->slot
[pos
].used
& used
& SLOT_OP_BOTH
)
1144 if (pos
>= cs
->nrslots
) {
1145 if (cs
->nrslots
>= PFS_MAX_ALU_INST
) {
1146 ERROR("Out of ALU instruction slots\n");
1150 fp
->alu
.inst
[pos
].inst0
= NOP_INST0
;
1151 fp
->alu
.inst
[pos
].inst1
= NOP_INST1
;
1152 fp
->alu
.inst
[pos
].inst2
= NOP_INST2
;
1153 fp
->alu
.inst
[pos
].inst3
= NOP_INST3
;
1157 // Note: When we need both parts (vector and scalar) of a source,
1158 // we always try to put them into the same position. This makes the
1159 // code easier to read, and it is optimal (i.e. one doesn't gain
1160 // anything by splitting the parts).
1161 // It also avoids headaches with swizzles that access both parts (i.e WXY)
1162 tempused
= cs
->slot
[pos
].used
;
1163 for (i
= 0; i
< 3; ++i
) {
1164 tempvsrc
[i
] = cs
->slot
[pos
].vsrc
[i
];
1165 tempssrc
[i
] = cs
->slot
[pos
].ssrc
[i
];
1168 for (i
= 0; i
< argc
; ++i
) {
1169 int flags
= (used
>> i
) & SLOT_SRC_BOTH
;
1176 for (j
= 0; j
< 3; ++j
) {
1177 if ((tempused
>> j
) & flags
& SLOT_SRC_VECTOR
) {
1178 if (tempvsrc
[j
] != hwsrc
[i
])
1182 if ((tempused
>> j
) & flags
& SLOT_SRC_SCALAR
) {
1183 if (tempssrc
[j
] != hwsrc
[i
])
1194 tempused
|= flags
<< j
;
1195 if (flags
& SLOT_SRC_VECTOR
)
1196 tempvsrc
[j
] = hwsrc
[i
];
1197 if (flags
& SLOT_SRC_SCALAR
)
1198 tempssrc
[j
] = hwsrc
[i
];
1205 // Found a slot, reserve it
1206 cs
->slot
[pos
].used
= tempused
| (used
& SLOT_OP_BOTH
);
1207 for (i
= 0; i
< 3; ++i
) {
1208 cs
->slot
[pos
].vsrc
[i
] = tempvsrc
[i
];
1209 cs
->slot
[pos
].ssrc
[i
] = tempssrc
[i
];
1212 for (i
= 0; i
< argc
; ++i
) {
1213 if (REG_GET_TYPE(src
[i
]) == REG_TYPE_TEMP
) {
1214 int regnr
= hwsrc
[i
] & 31;
1216 if (used
& (SLOT_SRC_VECTOR
<< i
)) {
1217 if (cs
->hwtemps
[regnr
].vector_lastread
< pos
)
1218 cs
->hwtemps
[regnr
].vector_lastread
=
1221 if (used
& (SLOT_SRC_SCALAR
<< i
)) {
1222 if (cs
->hwtemps
[regnr
].scalar_lastread
< pos
)
1223 cs
->hwtemps
[regnr
].scalar_lastread
=
1229 // Emit the source fetch code
1230 fp
->alu
.inst
[pos
].inst1
&= ~R300_FPI1_SRC_MASK
;
1231 fp
->alu
.inst
[pos
].inst1
|=
1232 ((cs
->slot
[pos
].vsrc
[0] << R300_FPI1_SRC0C_SHIFT
) |
1233 (cs
->slot
[pos
].vsrc
[1] << R300_FPI1_SRC1C_SHIFT
) |
1234 (cs
->slot
[pos
].vsrc
[2] << R300_FPI1_SRC2C_SHIFT
));
1236 fp
->alu
.inst
[pos
].inst3
&= ~R300_FPI3_SRC_MASK
;
1237 fp
->alu
.inst
[pos
].inst3
|=
1238 ((cs
->slot
[pos
].ssrc
[0] << R300_FPI3_SRC0A_SHIFT
) |
1239 (cs
->slot
[pos
].ssrc
[1] << R300_FPI3_SRC1A_SHIFT
) |
1240 (cs
->slot
[pos
].ssrc
[2] << R300_FPI3_SRC2A_SHIFT
));
1242 // Emit the argument selection code
1246 for (i
= 0; i
< 3; ++i
) {
1248 swz
[i
] = (v_swiz
[REG_GET_VSWZ(src
[i
])].base
+
1250 v_swiz
[REG_GET_VSWZ(src
[i
])].
1251 stride
)) | ((src
[i
] & REG_NEGV_MASK
)
1252 ? ARG_NEG
: 0) | ((src
[i
]
1259 swz
[i
] = R300_FPI0_ARGC_ZERO
;
1263 fp
->alu
.inst
[pos
].inst0
&=
1264 ~(R300_FPI0_ARG0C_MASK
| R300_FPI0_ARG1C_MASK
|
1265 R300_FPI0_ARG2C_MASK
);
1266 fp
->alu
.inst
[pos
].inst0
|=
1267 (swz
[0] << R300_FPI0_ARG0C_SHIFT
) | (swz
[1] <<
1268 R300_FPI0_ARG1C_SHIFT
)
1269 | (swz
[2] << R300_FPI0_ARG2C_SHIFT
);
1275 for (i
= 0; i
< 3; ++i
) {
1277 swz
[i
] = (s_swiz
[REG_GET_SSWZ(src
[i
])].base
+
1279 s_swiz
[REG_GET_SSWZ(src
[i
])].
1280 stride
)) | ((src
[i
] & REG_NEGV_MASK
)
1281 ? ARG_NEG
: 0) | ((src
[i
]
1288 swz
[i
] = R300_FPI2_ARGA_ZERO
;
1292 fp
->alu
.inst
[pos
].inst2
&=
1293 ~(R300_FPI2_ARG0A_MASK
| R300_FPI2_ARG1A_MASK
|
1294 R300_FPI2_ARG2A_MASK
);
1295 fp
->alu
.inst
[pos
].inst2
|=
1296 (swz
[0] << R300_FPI2_ARG0A_SHIFT
) | (swz
[1] <<
1297 R300_FPI2_ARG1A_SHIFT
)
1298 | (swz
[2] << R300_FPI2_ARG2A_SHIFT
);
1305 * Append an ALU instruction to the instruction list.
1307 static void emit_arith(struct r300_fragment_program
*fp
,
1311 GLuint src0
, GLuint src1
, GLuint src2
, int flags
)
1314 GLuint src
[3] = { src0
, src1
, src2
};
1316 GLboolean emit_vop
, emit_sop
;
1320 vop
= r300_fpop
[op
].v_op
;
1321 sop
= r300_fpop
[op
].s_op
;
1322 argc
= r300_fpop
[op
].argc
;
1324 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
&&
1325 REG_GET_INDEX(dest
) == FRAG_RESULT_DEPR
) {
1326 if (mask
& WRITEMASK_Z
) {
1333 emit_vop
= GL_FALSE
;
1334 emit_sop
= GL_FALSE
;
1335 if ((mask
& WRITEMASK_XYZ
) || vop
== R300_FPI0_OUTC_DP3
)
1337 if ((mask
& WRITEMASK_W
) || vop
== R300_FPI0_OUTC_REPL_ALPHA
)
1341 find_and_prepare_slot(fp
, emit_vop
, emit_sop
, argc
, src
, dest
,
1346 hwdest
= t_hw_dst(fp
, dest
, GL_FALSE
, pos
); /* Note: Side effects wrt register allocation */
1348 if (flags
& PFS_FLAG_SAT
) {
1349 vop
|= R300_FPI0_OUTC_SAT
;
1350 sop
|= R300_FPI2_OUTA_SAT
;
1353 /* Throw the pieces together and get FPI0/1 */
1355 fp
->alu
.inst
[pos
].inst0
|= vop
;
1357 fp
->alu
.inst
[pos
].inst1
|= hwdest
<< R300_FPI1_DSTC_SHIFT
;
1359 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1360 if (REG_GET_INDEX(dest
) == FRAG_RESULT_COLR
) {
1361 fp
->alu
.inst
[pos
].inst1
|=
1362 (mask
& WRITEMASK_XYZ
) <<
1363 R300_FPI1_DSTC_OUTPUT_MASK_SHIFT
;
1367 fp
->alu
.inst
[pos
].inst1
|=
1368 (mask
& WRITEMASK_XYZ
) <<
1369 R300_FPI1_DSTC_REG_MASK_SHIFT
;
1371 cs
->hwtemps
[hwdest
].vector_valid
= pos
+ 1;
1375 /* And now FPI2/3 */
1377 fp
->alu
.inst
[pos
].inst2
|= sop
;
1379 if (mask
& WRITEMASK_W
) {
1380 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1381 if (REG_GET_INDEX(dest
) == FRAG_RESULT_COLR
) {
1382 fp
->alu
.inst
[pos
].inst3
|=
1383 (hwdest
<< R300_FPI3_DSTA_SHIFT
) |
1384 R300_FPI3_DSTA_OUTPUT
;
1385 } else if (REG_GET_INDEX(dest
) ==
1387 fp
->alu
.inst
[pos
].inst3
|=
1388 R300_FPI3_DSTA_DEPTH
;
1392 fp
->alu
.inst
[pos
].inst3
|=
1393 (hwdest
<< R300_FPI3_DSTA_SHIFT
) |
1396 cs
->hwtemps
[hwdest
].scalar_valid
= pos
+ 1;
1405 static GLuint
get_attrib(struct r300_fragment_program
*fp
, GLuint attr
)
1407 struct gl_fragment_program
*mp
= &fp
->mesa_program
;
1410 if (!(mp
->Base
.InputsRead
& (1 << attr
))) {
1411 ERROR("Attribute %d was not provided!\n", attr
);
1415 REG_SET_TYPE(r
, REG_TYPE_INPUT
);
1416 REG_SET_INDEX(r
, attr
);
1417 REG_SET_VALID(r
, GL_TRUE
);
1422 static GLfloat SinCosConsts
[2][4] = {
1424 1.273239545, // 4/PI
1425 -0.405284735, // -4/(PI*PI)
1432 0.159154943, // 1/(2*PI)
/**
 * Emit a LIT instruction.
 * \p flags may be PFS_FLAG_SAT
 *
 * Definition of LIT (from ARB_fragment_program):
 * tmp = VectorLoad(op0);
 * if (tmp.x < 0) tmp.x = 0;
 * if (tmp.y < 0) tmp.y = 0;
 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
 * result.x = 1.0;
 * result.y = tmp.x;
 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
 * result.w = 1.0;
 *
 * The longest path of computation is the one leading to result.z,
 * consisting of 5 operations. This implementation of LIT takes
 * 5 slots. So unless there's some special undocumented opcode,
 * this implementation is potentially optimal. Unfortunately,
 * emit_arith is a bit too conservative because it doesn't understand
 * partial writes to the vector component.
 */
1459 static const GLfloat LitConst
[4] =
1460 { 127.999999, 127.999999, 127.999999, -127.999999 };
1462 static void emit_lit(struct r300_fragment_program
*fp
,
1463 GLuint dest
, int mask
, GLuint src
, int flags
)
1470 cnst
= emit_const4fv(fp
, LitConst
);
1473 if ((mask
& WRITEMASK_XYZW
) != WRITEMASK_XYZW
) {
1475 } else if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1476 // LIT is typically followed by DP3/DP4, so there's no point
1477 // in creating special code for this case
1481 if (needTemporary
) {
1482 temp
= keep(get_temp_reg(fp
));
1487 // Note: The order of emit_arith inside the slots is relevant,
1488 // because emit_arith only looks at scalar vs. vector when resolving
1489 // dependencies, and it does not consider individual vector components,
1490 // so swizzling between the two parts can create fake dependencies.
1493 emit_arith(fp
, PFS_OP_MAX
, temp
, WRITEMASK_XY
,
1494 keep(src
), pfs_zero
, undef
, 0);
1495 emit_arith(fp
, PFS_OP_MAX
, temp
, WRITEMASK_W
, src
, cnst
, undef
, 0);
1498 emit_arith(fp
, PFS_OP_MIN
, temp
, WRITEMASK_Z
,
1499 swizzle(temp
, W
, W
, W
, W
), cnst
, undef
, 0);
1500 emit_arith(fp
, PFS_OP_LG2
, temp
, WRITEMASK_W
,
1501 swizzle(temp
, Y
, Y
, Y
, Y
), undef
, undef
, 0);
1504 // If desired, we saturate the y result here.
1505 // This does not affect the use as a condition variable in the CMP later
1506 emit_arith(fp
, PFS_OP_MAD
, temp
, WRITEMASK_W
,
1507 temp
, swizzle(temp
, Z
, Z
, Z
, Z
), pfs_zero
, 0);
1508 emit_arith(fp
, PFS_OP_MAD
, temp
, WRITEMASK_Y
,
1509 swizzle(temp
, X
, X
, X
, X
), pfs_one
, pfs_zero
, flags
);
1512 emit_arith(fp
, PFS_OP_MAD
, temp
, WRITEMASK_X
,
1513 pfs_one
, pfs_one
, pfs_zero
, 0);
1514 emit_arith(fp
, PFS_OP_EX2
, temp
, WRITEMASK_W
, temp
, undef
, undef
, 0);
1517 emit_arith(fp
, PFS_OP_CMP
, temp
, WRITEMASK_Z
,
1518 pfs_zero
, swizzle(temp
, W
, W
, W
, W
),
1519 negate(swizzle(temp
, Y
, Y
, Y
, Y
)), flags
);
1520 emit_arith(fp
, PFS_OP_MAD
, temp
, WRITEMASK_W
, pfs_one
, pfs_one
,
1523 if (needTemporary
) {
1524 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1525 temp
, pfs_one
, pfs_zero
, flags
);
1526 free_temp(fp
, temp
);
1528 // Decrease refcount of the destination
1529 t_hw_dst(fp
, dest
, GL_FALSE
, cs
->nrslots
);
1533 static GLboolean
parse_program(struct r300_fragment_program
*fp
)
1535 struct gl_fragment_program
*mp
= &fp
->mesa_program
;
1536 const struct prog_instruction
*inst
= mp
->Base
.Instructions
;
1537 struct prog_instruction
*fpi
;
1538 GLuint src
[3], dest
, temp
[2];
1539 int flags
, mask
= 0;
1542 if (!inst
|| inst
[0].Opcode
== OPCODE_END
) {
1543 ERROR("empty program?\n");
1547 for (fpi
= mp
->Base
.Instructions
; fpi
->Opcode
!= OPCODE_END
; fpi
++) {
1548 if (fpi
->SaturateMode
== SATURATE_ZERO_ONE
)
1549 flags
= PFS_FLAG_SAT
;
1553 if (fpi
->Opcode
!= OPCODE_KIL
) {
1554 dest
= t_dst(fp
, fpi
->DstReg
);
1555 mask
= fpi
->DstReg
.WriteMask
;
1558 switch (fpi
->Opcode
) {
1560 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1561 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1562 absolute(src
[0]), pfs_one
, pfs_zero
, flags
);
1565 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1566 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1567 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1568 src
[0], pfs_one
, src
[1], flags
);
1571 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1572 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1573 src
[2] = t_src(fp
, fpi
->SrcReg
[2]);
1574 /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1575 * r300 - if src2.c < 0.0 ? src1.c : src0.c
1577 emit_arith(fp
, PFS_OP_CMP
, dest
, mask
,
1578 src
[2], src
[1], src
[0], flags
);
1582 * cos using a parabola (see SIN):
1584 * x = (x/(2*PI))+0.75
1589 temp
[0] = get_temp_reg(fp
);
1590 const_sin
[0] = emit_const4fv(fp
, SinCosConsts
[0]);
1591 const_sin
[1] = emit_const4fv(fp
, SinCosConsts
[1]);
1592 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1594 /* add 0.5*PI and do range reduction */
1596 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1597 swizzle(src
[0], X
, X
, X
, X
),
1598 swizzle(const_sin
[1], Z
, Z
, Z
, Z
),
1599 swizzle(const_sin
[1], X
, X
, X
, X
), 0);
1601 emit_arith(fp
, PFS_OP_FRC
, temp
[0], WRITEMASK_X
,
1602 swizzle(temp
[0], X
, X
, X
, X
),
1605 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Z
, swizzle(temp
[0], X
, X
, X
, X
), swizzle(const_sin
[1], W
, W
, W
, W
), //2*PI
1606 negate(swizzle(const_sin
[0], Z
, Z
, Z
, Z
)), //-PI
1611 emit_arith(fp
, PFS_OP_MAD
, temp
[0],
1612 WRITEMASK_X
| WRITEMASK_Y
, swizzle(temp
[0],
1615 const_sin
[0], pfs_zero
, 0);
1617 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1618 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1619 absolute(swizzle(temp
[0], Z
, Z
, Z
, Z
)),
1620 swizzle(temp
[0], X
, X
, X
, X
), 0);
1622 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Y
,
1623 swizzle(temp
[0], X
, X
, X
, X
),
1624 absolute(swizzle(temp
[0], X
, X
, X
, X
)),
1625 negate(swizzle(temp
[0], X
, X
, X
, X
)), 0);
1627 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1628 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1629 swizzle(const_sin
[0], W
, W
, W
, W
),
1630 swizzle(temp
[0], X
, X
, X
, X
), flags
);
1632 free_temp(fp
, temp
[0]);
1635 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1636 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1637 emit_arith(fp
, PFS_OP_DP3
, dest
, mask
,
1638 src
[0], src
[1], undef
, flags
);
1641 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1642 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1643 emit_arith(fp
, PFS_OP_DP4
, dest
, mask
,
1644 src
[0], src
[1], undef
, flags
);
1647 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1648 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1649 /* src0.xyz1 -> temp
1650 * DP4 dest, temp, src1
1653 temp
[0] = get_temp_reg(fp
);
1654 src
[0].s_swz
= SWIZZLE_ONE
;
1655 emit_arith(fp
, PFS_OP_MAD
, temp
[0], mask
,
1656 src
[0], pfs_one
, pfs_zero
, 0);
1657 emit_arith(fp
, PFS_OP_DP4
, dest
, mask
,
1658 temp
[0], src
[1], undef
, flags
);
1659 free_temp(fp
, temp
[0]);
1661 emit_arith(fp
, PFS_OP_DP4
, dest
, mask
,
1662 swizzle(src
[0], X
, Y
, Z
, ONE
), src
[1],
1667 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1668 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1669 /* dest.y = src0.y * src1.y */
1670 if (mask
& WRITEMASK_Y
)
1671 emit_arith(fp
, PFS_OP_MAD
, dest
, WRITEMASK_Y
,
1672 keep(src
[0]), keep(src
[1]),
1674 /* dest.z = src0.z */
1675 if (mask
& WRITEMASK_Z
)
1676 emit_arith(fp
, PFS_OP_MAD
, dest
, WRITEMASK_Z
,
1677 src
[0], pfs_one
, pfs_zero
, flags
);
1679 * result.w = src1.w */
1680 if (mask
& WRITEMASK_XW
) {
1681 REG_SET_VSWZ(src
[1], SWIZZLE_111
); /*Cheat */
1682 emit_arith(fp
, PFS_OP_MAD
, dest
,
1683 mask
& WRITEMASK_XW
,
1684 src
[1], pfs_one
, pfs_zero
, flags
);
1688 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1689 emit_arith(fp
, PFS_OP_EX2
, dest
, mask
,
1690 src
[0], undef
, undef
, flags
);
1693 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1694 temp
[0] = get_temp_reg(fp
);
1696 * MAD dest, src0, 1.0, -temp
1698 emit_arith(fp
, PFS_OP_FRC
, temp
[0], mask
,
1699 keep(src
[0]), undef
, undef
, 0);
1700 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1701 src
[0], pfs_one
, negate(temp
[0]), flags
);
1702 free_temp(fp
, temp
[0]);
1705 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1706 emit_arith(fp
, PFS_OP_FRC
, dest
, mask
,
1707 src
[0], undef
, undef
, flags
);
1710 emit_tex(fp
, fpi
, R300_FPITX_OP_KIL
);
1713 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1714 emit_arith(fp
, PFS_OP_LG2
, dest
, mask
,
1715 src
[0], undef
, undef
, flags
);
1718 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1719 emit_lit(fp
, dest
, mask
, src
[0], flags
);
1722 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1723 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1724 src
[2] = t_src(fp
, fpi
->SrcReg
[2]);
1725 /* result = tmp0tmp1 + (1 - tmp0)tmp2
1726 * = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1727 * MAD temp, -tmp0, tmp2, tmp2
1728 * MAD result, tmp0, tmp1, temp
1730 temp
[0] = get_temp_reg(fp
);
1731 emit_arith(fp
, PFS_OP_MAD
, temp
[0], mask
,
1732 negate(keep(src
[0])), keep(src
[2]), src
[2],
1734 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1735 src
[0], src
[1], temp
[0], flags
);
1736 free_temp(fp
, temp
[0]);
1739 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1740 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1741 src
[2] = t_src(fp
, fpi
->SrcReg
[2]);
1742 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1743 src
[0], src
[1], src
[2], flags
);
1746 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1747 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1748 emit_arith(fp
, PFS_OP_MAX
, dest
, mask
,
1749 src
[0], src
[1], undef
, flags
);
1752 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1753 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1754 emit_arith(fp
, PFS_OP_MIN
, dest
, mask
,
1755 src
[0], src
[1], undef
, flags
);
1759 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1760 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1761 src
[0], pfs_one
, pfs_zero
, flags
);
1764 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1765 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1766 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1767 src
[0], src
[1], pfs_zero
, flags
);
1770 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1771 src
[1] = t_scalar_src(fp
, fpi
->SrcReg
[1]);
1772 temp
[0] = get_temp_reg(fp
);
1773 emit_arith(fp
, PFS_OP_LG2
, temp
[0], WRITEMASK_W
,
1774 src
[0], undef
, undef
, 0);
1775 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_W
,
1776 temp
[0], src
[1], pfs_zero
, 0);
1777 emit_arith(fp
, PFS_OP_EX2
, dest
, fpi
->DstReg
.WriteMask
,
1778 temp
[0], undef
, undef
, 0);
1779 free_temp(fp
, temp
[0]);
1782 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1783 emit_arith(fp
, PFS_OP_RCP
, dest
, mask
,
1784 src
[0], undef
, undef
, flags
);
1787 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1788 emit_arith(fp
, PFS_OP_RSQ
, dest
, mask
,
1789 absolute(src
[0]), pfs_zero
, pfs_zero
, flags
);
1793 * scs using a parabola :
1795 * result.x = sin(-abs(x)+0.5*PI) (cos)
1796 * result.y = sin(x) (sin)
1799 temp
[0] = get_temp_reg(fp
);
1800 temp
[1] = get_temp_reg(fp
);
1801 const_sin
[0] = emit_const4fv(fp
, SinCosConsts
[0]);
1802 const_sin
[1] = emit_const4fv(fp
, SinCosConsts
[1]);
1803 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1805 /* x = -abs(x)+0.5*PI */
1806 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Z
, swizzle(const_sin
[0], Z
, Z
, Z
, Z
), //PI
1809 (swizzle(keep(src
[0]), X
, X
, X
, X
))),
1813 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_W
,
1814 swizzle(const_sin
[0], Y
, Y
, Y
, Y
),
1815 swizzle(keep(src
[0]), X
, X
, X
, X
),
1818 /* B*x, C*x (cos) */
1819 emit_arith(fp
, PFS_OP_MAD
, temp
[0],
1820 WRITEMASK_X
| WRITEMASK_Y
, swizzle(temp
[0],
1823 const_sin
[0], pfs_zero
, 0);
1826 emit_arith(fp
, PFS_OP_MAD
, temp
[1], WRITEMASK_W
,
1827 swizzle(const_sin
[0], X
, X
, X
, X
),
1828 keep(src
[0]), pfs_zero
, 0);
1830 /* y = B*x + C*x*abs(x) (sin) */
1831 emit_arith(fp
, PFS_OP_MAD
, temp
[1], WRITEMASK_Z
,
1833 swizzle(temp
[0], W
, W
, W
, W
),
1834 swizzle(temp
[1], W
, W
, W
, W
), 0);
1836 /* y = B*x + C*x*abs(x) (cos) */
1837 emit_arith(fp
, PFS_OP_MAD
, temp
[1], WRITEMASK_W
,
1838 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1839 absolute(swizzle(temp
[0], Z
, Z
, Z
, Z
)),
1840 swizzle(temp
[0], X
, X
, X
, X
), 0);
1842 /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1843 emit_arith(fp
, PFS_OP_MAD
, temp
[0],
1844 WRITEMASK_X
| WRITEMASK_Y
, swizzle(temp
[1],
1847 absolute(swizzle(temp
[1], W
, Z
, Y
, X
)),
1848 negate(swizzle(temp
[1], W
, Z
, Y
, X
)), 0);
1850 /* dest.xy = mad(temp.xy, P, temp2.wz) */
1851 emit_arith(fp
, PFS_OP_MAD
, dest
,
1852 mask
& (WRITEMASK_X
| WRITEMASK_Y
), temp
[0],
1853 swizzle(const_sin
[0], W
, W
, W
, W
),
1854 swizzle(temp
[1], W
, Z
, Y
, X
), flags
);
1856 free_temp(fp
, temp
[0]);
1857 free_temp(fp
, temp
[1]);
1860 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1861 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1862 temp
[0] = get_temp_reg(fp
);
1863 /* temp = src0 - src1
1864 * dest.c = (temp.c < 0.0) ? 0 : 1
1866 emit_arith(fp
, PFS_OP_MAD
, temp
[0], mask
,
1867 src
[0], pfs_one
, negate(src
[1]), 0);
1868 emit_arith(fp
, PFS_OP_CMP
, dest
, mask
,
1869 pfs_one
, pfs_zero
, temp
[0], 0);
1870 free_temp(fp
, temp
[0]);
1875 * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1876 * extra precision is obtained by weighting against
1880 temp
[0] = get_temp_reg(fp
);
1881 const_sin
[0] = emit_const4fv(fp
, SinCosConsts
[0]);
1882 const_sin
[1] = emit_const4fv(fp
, SinCosConsts
[1]);
1883 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1885 /* do range reduction */
1887 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1888 swizzle(keep(src
[0]), X
, X
, X
, X
),
1889 swizzle(const_sin
[1], Z
, Z
, Z
, Z
),
1892 emit_arith(fp
, PFS_OP_FRC
, temp
[0], WRITEMASK_X
,
1893 swizzle(temp
[0], X
, X
, X
, X
),
1896 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Z
, swizzle(temp
[0], X
, X
, X
, X
), swizzle(const_sin
[1], W
, W
, W
, W
), //2*PI
1897 negate(swizzle(const_sin
[0], Z
, Z
, Z
, Z
)), //PI
1902 emit_arith(fp
, PFS_OP_MAD
, temp
[0],
1903 WRITEMASK_X
| WRITEMASK_Y
, swizzle(temp
[0],
1906 const_sin
[0], pfs_zero
, 0);
1908 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1909 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1910 absolute(swizzle(temp
[0], Z
, Z
, Z
, Z
)),
1911 swizzle(temp
[0], X
, X
, X
, X
), 0);
1913 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Y
,
1914 swizzle(temp
[0], X
, X
, X
, X
),
1915 absolute(swizzle(temp
[0], X
, X
, X
, X
)),
1916 negate(swizzle(temp
[0], X
, X
, X
, X
)), 0);
1918 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1919 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1920 swizzle(const_sin
[0], W
, W
, W
, W
),
1921 swizzle(temp
[0], X
, X
, X
, X
), flags
);
1923 free_temp(fp
, temp
[0]);
1926 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1927 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1928 temp
[0] = get_temp_reg(fp
);
1929 /* temp = src0 - src1
1930 * dest.c = (temp.c < 0.0) ? 1 : 0
1932 emit_arith(fp
, PFS_OP_MAD
, temp
[0], mask
,
1933 src
[0], pfs_one
, negate(src
[1]), 0);
1934 emit_arith(fp
, PFS_OP_CMP
, dest
, mask
,
1935 pfs_zero
, pfs_one
, temp
[0], 0);
1936 free_temp(fp
, temp
[0]);
1939 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1940 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1941 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1942 src
[0], pfs_one
, negate(src
[1]), flags
);
1945 emit_tex(fp
, fpi
, R300_FPITX_OP_TEX
);
1948 emit_tex(fp
, fpi
, R300_FPITX_OP_TXB
);
1951 emit_tex(fp
, fpi
, R300_FPITX_OP_TXP
);
1954 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1955 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1956 temp
[0] = get_temp_reg(fp
);
1957 /* temp = src0.zxy * src1.yzx */
1958 emit_arith(fp
, PFS_OP_MAD
, temp
[0],
1959 WRITEMASK_XYZ
, swizzle(keep(src
[0]),
1961 swizzle(keep(src
[1]), Y
, Z
, X
, W
),
1963 /* dest.xyz = src0.yzx * src1.zxy - temp
1964 * dest.w = undefined
1966 emit_arith(fp
, PFS_OP_MAD
, dest
,
1967 mask
& WRITEMASK_XYZ
, swizzle(src
[0],
1970 swizzle(src
[1], Z
, X
, Y
, W
),
1971 negate(temp
[0]), flags
);
1973 free_temp(fp
, temp
[0]);
1977 ERROR("unknown fpi->Opcode %d\n", fpi
->Opcode
);
1989 static void insert_wpos(struct gl_program
*prog
)
1991 static gl_state_index tokens
[STATE_LENGTH
] = {
1992 STATE_INTERNAL
, STATE_R300_WINDOW_DIMENSION
, 0, 0, 0
1994 struct prog_instruction
*fpi
;
1995 GLuint window_index
;
1997 GLuint tempregi
= prog
->NumTemporaries
;
1998 /* should do something else if no temps left... */
1999 prog
->NumTemporaries
++;
2001 fpi
= _mesa_alloc_instructions(prog
->NumInstructions
+ 3);
2002 _mesa_init_instructions(fpi
, prog
->NumInstructions
+ 3);
2004 /* perspective divide */
2005 fpi
[i
].Opcode
= OPCODE_RCP
;
2007 fpi
[i
].DstReg
.File
= PROGRAM_TEMPORARY
;
2008 fpi
[i
].DstReg
.Index
= tempregi
;
2009 fpi
[i
].DstReg
.WriteMask
= WRITEMASK_W
;
2010 fpi
[i
].DstReg
.CondMask
= COND_TR
;
2012 fpi
[i
].SrcReg
[0].File
= PROGRAM_INPUT
;
2013 fpi
[i
].SrcReg
[0].Index
= FRAG_ATTRIB_WPOS
;
2014 fpi
[i
].SrcReg
[0].Swizzle
= SWIZZLE_WWWW
;
2017 fpi
[i
].Opcode
= OPCODE_MUL
;
2019 fpi
[i
].DstReg
.File
= PROGRAM_TEMPORARY
;
2020 fpi
[i
].DstReg
.Index
= tempregi
;
2021 fpi
[i
].DstReg
.WriteMask
= WRITEMASK_XYZ
;
2022 fpi
[i
].DstReg
.CondMask
= COND_TR
;
2024 fpi
[i
].SrcReg
[0].File
= PROGRAM_INPUT
;
2025 fpi
[i
].SrcReg
[0].Index
= FRAG_ATTRIB_WPOS
;
2026 fpi
[i
].SrcReg
[0].Swizzle
= SWIZZLE_XYZW
;
2028 fpi
[i
].SrcReg
[1].File
= PROGRAM_TEMPORARY
;
2029 fpi
[i
].SrcReg
[1].Index
= tempregi
;
2030 fpi
[i
].SrcReg
[1].Swizzle
= SWIZZLE_WWWW
;
2033 /* viewport transformation */
2034 window_index
= _mesa_add_state_reference(prog
->Parameters
, tokens
);
2036 fpi
[i
].Opcode
= OPCODE_MAD
;
2038 fpi
[i
].DstReg
.File
= PROGRAM_TEMPORARY
;
2039 fpi
[i
].DstReg
.Index
= tempregi
;
2040 fpi
[i
].DstReg
.WriteMask
= WRITEMASK_XYZ
;
2041 fpi
[i
].DstReg
.CondMask
= COND_TR
;
2043 fpi
[i
].SrcReg
[0].File
= PROGRAM_TEMPORARY
;
2044 fpi
[i
].SrcReg
[0].Index
= tempregi
;
2045 fpi
[i
].SrcReg
[0].Swizzle
=
2046 MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ZERO
);
2048 fpi
[i
].SrcReg
[1].File
= PROGRAM_STATE_VAR
;
2049 fpi
[i
].SrcReg
[1].Index
= window_index
;
2050 fpi
[i
].SrcReg
[1].Swizzle
=
2051 MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ZERO
);
2053 fpi
[i
].SrcReg
[2].File
= PROGRAM_STATE_VAR
;
2054 fpi
[i
].SrcReg
[2].Index
= window_index
;
2055 fpi
[i
].SrcReg
[2].Swizzle
=
2056 MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ZERO
);
2059 _mesa_copy_instructions(&fpi
[i
], prog
->Instructions
,
2060 prog
->NumInstructions
);
2062 free(prog
->Instructions
);
2064 prog
->Instructions
= fpi
;
2066 prog
->NumInstructions
+= i
;
2067 fpi
= &prog
->Instructions
[prog
->NumInstructions
- 1];
2069 assert(fpi
->Opcode
== OPCODE_END
);
2071 for (fpi
= &prog
->Instructions
[3]; fpi
->Opcode
!= OPCODE_END
; fpi
++) {
2072 for (i
= 0; i
< 3; i
++)
2073 if (fpi
->SrcReg
[i
].File
== PROGRAM_INPUT
&&
2074 fpi
->SrcReg
[i
].Index
== FRAG_ATTRIB_WPOS
) {
2075 fpi
->SrcReg
[i
].File
= PROGRAM_TEMPORARY
;
2076 fpi
->SrcReg
[i
].Index
= tempregi
;
/* - Init structures
 * - Determine what hwregs each input corresponds to
 */
2084 static void init_program(r300ContextPtr r300
, struct r300_fragment_program
*fp
)
2086 struct r300_pfs_compile_state
*cs
= NULL
;
2087 struct gl_fragment_program
*mp
= &fp
->mesa_program
;
2088 struct prog_instruction
*fpi
;
2089 GLuint InputsRead
= mp
->Base
.InputsRead
;
2090 GLuint temps_used
= 0; /* for fp->temps[] */
2093 /* New compile, reset tracking data */
2095 driQueryOptioni(&r300
->radeon
.optionCache
, "fp_optimization");
2096 fp
->translated
= GL_FALSE
;
2097 fp
->error
= GL_FALSE
;
2098 fp
->cs
= cs
= &(R300_CONTEXT(fp
->ctx
)->state
.pfs_compile
);
2101 fp
->first_node_has_tex
= 0;
2103 fp
->max_temp_idx
= 0;
2104 fp
->node
[0].alu_end
= -1;
2105 fp
->node
[0].tex_end
= -1;
2107 _mesa_memset(cs
, 0, sizeof(*fp
->cs
));
2108 for (i
= 0; i
< PFS_MAX_ALU_INST
; i
++) {
2109 for (j
= 0; j
< 3; j
++) {
2110 cs
->slot
[i
].vsrc
[j
] = SRC_CONST
;
2111 cs
->slot
[i
].ssrc
[j
] = SRC_CONST
;
2115 /* Work out what temps the Mesa inputs correspond to, this must match
2116 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
2117 * configures itself based on the fragprog's InputsRead
2119 * NOTE: this depends on get_hw_temp() allocating registers in order,
2120 * starting from register 0.
2123 /* Texcoords come first */
2124 for (i
= 0; i
< fp
->ctx
->Const
.MaxTextureUnits
; i
++) {
2125 if (InputsRead
& (FRAG_BIT_TEX0
<< i
)) {
2126 cs
->inputs
[FRAG_ATTRIB_TEX0
+ i
].refcount
= 0;
2127 cs
->inputs
[FRAG_ATTRIB_TEX0
+ i
].reg
=
2131 InputsRead
&= ~FRAG_BITS_TEX_ANY
;
2133 /* fragment position treated as a texcoord */
2134 if (InputsRead
& FRAG_BIT_WPOS
) {
2135 cs
->inputs
[FRAG_ATTRIB_WPOS
].refcount
= 0;
2136 cs
->inputs
[FRAG_ATTRIB_WPOS
].reg
= get_hw_temp(fp
, 0);
2137 insert_wpos(&mp
->Base
);
2139 InputsRead
&= ~FRAG_BIT_WPOS
;
2141 /* Then primary colour */
2142 if (InputsRead
& FRAG_BIT_COL0
) {
2143 cs
->inputs
[FRAG_ATTRIB_COL0
].refcount
= 0;
2144 cs
->inputs
[FRAG_ATTRIB_COL0
].reg
= get_hw_temp(fp
, 0);
2146 InputsRead
&= ~FRAG_BIT_COL0
;
2148 /* Secondary color */
2149 if (InputsRead
& FRAG_BIT_COL1
) {
2150 cs
->inputs
[FRAG_ATTRIB_COL1
].refcount
= 0;
2151 cs
->inputs
[FRAG_ATTRIB_COL1
].reg
= get_hw_temp(fp
, 0);
2153 InputsRead
&= ~FRAG_BIT_COL1
;
2157 WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead
);
2158 /* force read from hwreg 0 for now */
2159 for (i
= 0; i
< 32; i
++)
2160 if (InputsRead
& (1 << i
))
2161 cs
->inputs
[i
].reg
= 0;
2164 /* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
2165 * That way, we can free up the reg when it's no longer needed
2167 if (!mp
->Base
.Instructions
) {
2168 ERROR("No instructions found in program\n");
2172 for (fpi
= mp
->Base
.Instructions
; fpi
->Opcode
!= OPCODE_END
; fpi
++) {
2175 for (i
= 0; i
< 3; i
++) {
2176 idx
= fpi
->SrcReg
[i
].Index
;
2177 switch (fpi
->SrcReg
[i
].File
) {
2178 case PROGRAM_TEMPORARY
:
2179 if (!(temps_used
& (1 << idx
))) {
2180 cs
->temps
[idx
].reg
= -1;
2181 cs
->temps
[idx
].refcount
= 1;
2182 temps_used
|= (1 << idx
);
2184 cs
->temps
[idx
].refcount
++;
2187 cs
->inputs
[idx
].refcount
++;
2194 idx
= fpi
->DstReg
.Index
;
2195 if (fpi
->DstReg
.File
== PROGRAM_TEMPORARY
) {
2196 if (!(temps_used
& (1 << idx
))) {
2197 cs
->temps
[idx
].reg
= -1;
2198 cs
->temps
[idx
].refcount
= 1;
2199 temps_used
|= (1 << idx
);
2201 cs
->temps
[idx
].refcount
++;
2204 cs
->temp_in_use
= temps_used
;
2207 static void update_params(struct r300_fragment_program
*fp
)
2209 struct gl_fragment_program
*mp
= &fp
->mesa_program
;
2211 /* Ask Mesa nicely to fill in ParameterValues for us */
2212 if (mp
->Base
.Parameters
)
2213 _mesa_load_state_parameters(fp
->ctx
, mp
->Base
.Parameters
);
2216 void r300TranslateFragmentShader(r300ContextPtr r300
,
2217 struct r300_fragment_program
*fp
)
2219 struct r300_pfs_compile_state
*cs
= NULL
;
2221 if (!fp
->translated
) {
2223 init_program(r300
, fp
);
2226 if (parse_program(fp
) == GL_FALSE
) {
2232 fp
->node
[fp
->cur_node
].alu_end
=
2233 cs
->nrslots
- fp
->node
[fp
->cur_node
].alu_offset
- 1;
2234 if (fp
->node
[fp
->cur_node
].tex_end
< 0)
2235 fp
->node
[fp
->cur_node
].tex_end
= 0;
2237 fp
->alu_end
= cs
->nrslots
- 1;
2239 fp
->tex_end
= fp
->tex
.length
? fp
->tex
.length
- 1 : 0;
2240 assert(fp
->node
[fp
->cur_node
].alu_end
>= 0);
2241 assert(fp
->alu_end
>= 0);
2243 fp
->translated
= GL_TRUE
;
2244 if (RADEON_DEBUG
& DEBUG_PIXEL
)
2246 r300UpdateStateParameters(fp
->ctx
, _NEW_PROGRAM
);
2252 /* just some random things... */
2253 static void dump_program(struct r300_fragment_program
*fp
)
2258 fprintf(stderr
, "pc=%d*************************************\n", pc
++);
2260 fprintf(stderr
, "Mesa program:\n");
2261 fprintf(stderr
, "-------------\n");
2262 _mesa_print_program(&fp
->mesa_program
.Base
);
2265 fprintf(stderr
, "Hardware program\n");
2266 fprintf(stderr
, "----------------\n");
2268 for (n
= 0; n
< (fp
->cur_node
+ 1); n
++) {
2269 fprintf(stderr
, "NODE %d: alu_offset: %d, tex_offset: %d, "
2270 "alu_end: %d, tex_end: %d\n", n
,
2271 fp
->node
[n
].alu_offset
,
2272 fp
->node
[n
].tex_offset
,
2273 fp
->node
[n
].alu_end
, fp
->node
[n
].tex_end
);
2275 if (fp
->tex
.length
) {
2276 fprintf(stderr
, " TEX:\n");
2277 for (i
= fp
->node
[n
].tex_offset
;
2278 i
<= fp
->node
[n
].tex_offset
+ fp
->node
[n
].tex_end
;
2283 inst
[i
] >> R300_FPITX_OPCODE_SHIFT
) &
2285 case R300_FPITX_OP_TEX
:
2288 case R300_FPITX_OP_KIL
:
2291 case R300_FPITX_OP_TXP
:
2294 case R300_FPITX_OP_TXB
:
2302 " %s t%i, %c%i, texture[%i] (%08x)\n",
2305 inst
[i
] >> R300_FPITX_DST_SHIFT
) & 31,
2307 inst
[i
] & R300_FPITX_SRC_CONST
) ? 'c' :
2310 inst
[i
] >> R300_FPITX_SRC_SHIFT
) & 31,
2312 inst
[i
] & R300_FPITX_IMAGE_MASK
) >>
2313 R300_FPITX_IMAGE_SHIFT
,
2318 for (i
= fp
->node
[n
].alu_offset
;
2319 i
<= fp
->node
[n
].alu_offset
+ fp
->node
[n
].alu_end
; ++i
) {
2320 char srcc
[3][10], dstc
[20];
2321 char srca
[3][10], dsta
[20];
2324 char flags
[5], tmp
[10];
2326 for (j
= 0; j
< 3; ++j
) {
2327 int regc
= fp
->alu
.inst
[i
].inst1
>> (j
* 6);
2328 int rega
= fp
->alu
.inst
[i
].inst3
>> (j
* 6);
2330 sprintf(srcc
[j
], "%c%i",
2331 (regc
& 32) ? 'c' : 't', regc
& 31);
2332 sprintf(srca
[j
], "%c%i",
2333 (rega
& 32) ? 'c' : 't', rega
& 31);
2337 sprintf(flags
, "%s%s%s",
2339 inst1
& R300_FPI1_DSTC_REG_X
) ? "x" : "",
2341 inst1
& R300_FPI1_DSTC_REG_Y
) ? "y" : "",
2343 inst1
& R300_FPI1_DSTC_REG_Z
) ? "z" : "");
2344 if (flags
[0] != 0) {
2345 sprintf(dstc
, "t%i.%s ",
2347 inst1
>> R300_FPI1_DSTC_SHIFT
) & 31,
2350 sprintf(flags
, "%s%s%s",
2352 inst1
& R300_FPI1_DSTC_OUTPUT_X
) ? "x" : "",
2354 inst1
& R300_FPI1_DSTC_OUTPUT_Y
) ? "y" : "",
2356 inst1
& R300_FPI1_DSTC_OUTPUT_Z
) ? "z" : "");
2357 if (flags
[0] != 0) {
2358 sprintf(tmp
, "o%i.%s",
2360 inst1
>> R300_FPI1_DSTC_SHIFT
) & 31,
2366 if (fp
->alu
.inst
[i
].inst3
& R300_FPI3_DSTA_REG
) {
2367 sprintf(dsta
, "t%i.w ",
2369 inst3
>> R300_FPI3_DSTA_SHIFT
) & 31);
2371 if (fp
->alu
.inst
[i
].inst3
& R300_FPI3_DSTA_OUTPUT
) {
2372 sprintf(tmp
, "o%i.w ",
2374 inst3
>> R300_FPI3_DSTA_SHIFT
) & 31);
2377 if (fp
->alu
.inst
[i
].inst3
& R300_FPI3_DSTA_DEPTH
) {
2382 "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
2383 " w: %3s %3s %3s -> %-20s (%08x)\n", i
,
2384 srcc
[0], srcc
[1], srcc
[2], dstc
,
2385 fp
->alu
.inst
[i
].inst1
, srca
[0], srca
[1],
2386 srca
[2], dsta
, fp
->alu
.inst
[i
].inst3
);
2388 for (j
= 0; j
< 3; ++j
) {
2389 int regc
= fp
->alu
.inst
[i
].inst0
>> (j
* 7);
2390 int rega
= fp
->alu
.inst
[i
].inst2
>> (j
* 7);
2397 case R300_FPI0_ARGC_SRC0C_XYZ
:
2398 sprintf(buf
, "%s.xyz",
2401 case R300_FPI0_ARGC_SRC0C_XXX
:
2402 sprintf(buf
, "%s.xxx",
2405 case R300_FPI0_ARGC_SRC0C_YYY
:
2406 sprintf(buf
, "%s.yyy",
2409 case R300_FPI0_ARGC_SRC0C_ZZZ
:
2410 sprintf(buf
, "%s.zzz",
2414 } else if (d
< 15) {
2415 sprintf(buf
, "%s.www", srca
[d
- 12]);
2416 } else if (d
== 20) {
2417 sprintf(buf
, "0.0");
2418 } else if (d
== 21) {
2419 sprintf(buf
, "1.0");
2420 } else if (d
== 22) {
2421 sprintf(buf
, "0.5");
2422 } else if (d
>= 23 && d
< 32) {
2426 sprintf(buf
, "%s.yzx",
2430 sprintf(buf
, "%s.zxy",
2434 sprintf(buf
, "%s.Wzy",
2439 sprintf(buf
, "%i", d
);
2442 sprintf(argc
[j
], "%s%s%s%s",
2443 (regc
& 32) ? "-" : "",
2444 (regc
& 64) ? "|" : "",
2445 buf
, (regc
& 64) ? "|" : "");
2449 sprintf(buf
, "%s.%c", srcc
[d
/ 3],
2450 'x' + (char)(d
% 3));
2451 } else if (d
< 12) {
2452 sprintf(buf
, "%s.w", srca
[d
- 9]);
2453 } else if (d
== 16) {
2454 sprintf(buf
, "0.0");
2455 } else if (d
== 17) {
2456 sprintf(buf
, "1.0");
2457 } else if (d
== 18) {
2458 sprintf(buf
, "0.5");
2460 sprintf(buf
, "%i", d
);
2463 sprintf(arga
[j
], "%s%s%s%s",
2464 (rega
& 32) ? "-" : "",
2465 (rega
& 64) ? "|" : "",
2466 buf
, (rega
& 64) ? "|" : "");
2469 fprintf(stderr
, " xyz: %8s %8s %8s op: %08x\n"
2470 " w: %8s %8s %8s op: %08x\n",
2471 argc
[0], argc
[1], argc
[2],
2472 fp
->alu
.inst
[i
].inst0
, arga
[0], arga
[1],
2473 arga
[2], fp
->alu
.inst
[i
].inst2
);