2 * Copyright (C) 2005 Ben Skeggs.
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 * \author Ben Skeggs <darktama@iinet.net.au>
33 * \author Jerome Glisse <j.glisse@gmail.com>
35 * \todo Depth write, WPOS/FOGC inputs
39 * \todo Verify results of opcodes for accuracy, I've only checked them in
46 #include "shader/prog_instruction.h"
47 #include "shader/prog_parameter.h"
48 #include "shader/prog_print.h"
50 #include "r300_context.h"
51 #include "r300_fragprog.h"
53 #include "r300_state.h"
56 * Useful macros and values
58 #define ERROR(fmt, args...) do { \
59 fprintf(stderr, "%s::%s(): " fmt "\n", \
60 __FILE__, __FUNCTION__, ##args); \
61 fp->error = GL_TRUE; \
64 #define PFS_INVAL 0xFFFFFFFF
65 #define COMPILE_STATE struct r300_pfs_compile_state *cs = fp->cs
77 #define SWIZZLE_HHH 10
79 #define swizzle(r, x, y, z, w) do_swizzle(fp, r, \
/*
 * Packed register descriptor.
 *
 * A register reference is stored in a single unsigned word with the
 * following layout (derived from the *_SHIFT / *_MASK values below):
 *
 *   bits  0- 1  register type (REG_TYPE_*)
 *   bits  2- 7  register index
 *   bits  8-12  vector swizzle selector
 *   bits 13-17  scalar swizzle selector
 *   bit  18     negate vector part
 *   bit  19     negate scalar part
 *   bit  20     absolute value
 *   bit  21     "no use" flag
 *   bit  22     valid flag
 *   bit  23     builtin flag
 *
 * Every macro parameter is parenthesized so that compound arguments
 * (e.g. "a | b") are not re-associated by the shift and mask operators
 * inside the expansion.
 */
#define REG_TYPE_INPUT		0
#define REG_TYPE_OUTPUT		1
#define REG_TYPE_TEMP		2
#define REG_TYPE_CONST		3

#define REG_TYPE_SHIFT		0
#define REG_INDEX_SHIFT		2
#define REG_VSWZ_SHIFT		8
#define REG_SSWZ_SHIFT		13
#define REG_NEGV_SHIFT		18
#define REG_NEGS_SHIFT		19
#define REG_ABS_SHIFT		20
#define REG_NO_USE_SHIFT	21	/* Hack for refcounting */
#define REG_VALID_SHIFT		22	/* Does the register contain a defined value? */
#define REG_BUILTIN_SHIFT	23	/* Is it a builtin (like all zero/all one)? */

#define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
#define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
#define REG_VSWZ_MASK		(0x1F << REG_VSWZ_SHIFT)
#define REG_SSWZ_MASK		(0x1F << REG_SSWZ_SHIFT)
#define REG_NEGV_MASK		(0x01 << REG_NEGV_SHIFT)
#define REG_NEGS_MASK		(0x01 << REG_NEGS_SHIFT)
#define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
#define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
#define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)
#define REG_BUILTIN_MASK	(0x01 << REG_BUILTIN_SHIFT)

/* Build a descriptor; the negate and abs bits are left clear. */
#define REG(type, index, vswz, sswz, nouse, valid, builtin)		\
	((((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK) |			\
	 (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK) |		\
	 (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
	 (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK) |		\
	 (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |	\
	 (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |			\
	 (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))

/* Field extractors: mask the field, then shift it down to bit 0. */
#define REG_GET_TYPE(reg)						\
	(((reg) & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
#define REG_GET_INDEX(reg)						\
	(((reg) & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
#define REG_GET_VSWZ(reg)						\
	(((reg) & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
#define REG_GET_SSWZ(reg)						\
	(((reg) & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
#define REG_GET_NO_USE(reg)						\
	(((reg) & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
#define REG_GET_VALID(reg)						\
	(((reg) & REG_VALID_MASK) >> REG_VALID_SHIFT)
#define REG_GET_BUILTIN(reg)						\
	(((reg) & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)

/*
 * Field setters: these assign to their first argument, which therefore
 * must be a modifiable lvalue.  The other fields of reg are preserved.
 */
#define REG_SET_TYPE(reg, type)						\
	((reg) = (((reg) & ~REG_TYPE_MASK) |				\
		  (((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK)))
#define REG_SET_INDEX(reg, index)					\
	((reg) = (((reg) & ~REG_INDEX_MASK) |				\
		  (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK)))
#define REG_SET_VSWZ(reg, vswz)						\
	((reg) = (((reg) & ~REG_VSWZ_MASK) |				\
		  (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK)))
#define REG_SET_SSWZ(reg, sswz)						\
	((reg) = (((reg) & ~REG_SSWZ_MASK) |				\
		  (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK)))
#define REG_SET_NO_USE(reg, nouse)					\
	((reg) = (((reg) & ~REG_NO_USE_MASK) |				\
		  (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK)))
#define REG_SET_VALID(reg, valid)					\
	((reg) = (((reg) & ~REG_VALID_MASK) |				\
		  (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK)))
#define REG_SET_BUILTIN(reg, builtin)					\
	((reg) = (((reg) & ~REG_BUILTIN_MASK) |				\
		  (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK)))

/* Modifier setters: OR the corresponding flag bit into reg (never clear it). */
#define REG_ABS(reg)							\
	((reg) = ((reg) | REG_ABS_MASK))
#define REG_NEGV(reg)							\
	((reg) = ((reg) | REG_NEGV_MASK))
#define REG_NEGS(reg)							\
	((reg) = ((reg) | REG_NEGS_MASK))
164 * Data structures for fragment program generation
167 /* description of r300 native hw instructions */
168 static const struct {
175 {"MAD", 3, R300_FPI0_OUTC_MAD
, R300_FPI2_OUTA_MAD
},
176 {"DP3", 2, R300_FPI0_OUTC_DP3
, R300_FPI2_OUTA_DP4
},
177 {"DP4", 2, R300_FPI0_OUTC_DP4
, R300_FPI2_OUTA_DP4
},
178 {"MIN", 2, R300_FPI0_OUTC_MIN
, R300_FPI2_OUTA_MIN
},
179 {"MAX", 2, R300_FPI0_OUTC_MAX
, R300_FPI2_OUTA_MAX
},
180 {"CMP", 3, R300_FPI0_OUTC_CMP
, R300_FPI2_OUTA_CMP
},
181 {"FRC", 1, R300_FPI0_OUTC_FRC
, R300_FPI2_OUTA_FRC
},
182 {"EX2", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_EX2
},
183 {"LG2", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_LG2
},
184 {"RCP", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_RCP
},
185 {"RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_RSQ
},
186 {"REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA
, PFS_INVAL
},
187 {"CMPH", 3, R300_FPI0_OUTC_CMPH
, PFS_INVAL
},
191 /* vector swizzles r300 can support natively, with a couple of
192 * cases we handle specially
194 * REG_VSWZ/REG_SSWZ is an index into this table
197 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
198 #define SWIZZLE_HALF 6
200 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
204 /* native swizzles */
205 static const struct r300_pfs_swizzle
{
206 GLuint hash
; /* swizzle value this matches */
207 GLuint base
; /* base value for hw swizzle */
208 GLuint stride
; /* difference in base between arg0/1/2 */
212 {MAKE_SWZ3(X
, Y
, Z
), R300_FPI0_ARGC_SRC0C_XYZ
, 4, SLOT_SRC_VECTOR
},
213 {MAKE_SWZ3(X
, X
, X
), R300_FPI0_ARGC_SRC0C_XXX
, 4, SLOT_SRC_VECTOR
},
214 {MAKE_SWZ3(Y
, Y
, Y
), R300_FPI0_ARGC_SRC0C_YYY
, 4, SLOT_SRC_VECTOR
},
215 {MAKE_SWZ3(Z
, Z
, Z
), R300_FPI0_ARGC_SRC0C_ZZZ
, 4, SLOT_SRC_VECTOR
},
216 {MAKE_SWZ3(W
, W
, W
), R300_FPI0_ARGC_SRC0A
, 1, SLOT_SRC_SCALAR
},
217 {MAKE_SWZ3(Y
, Z
, X
), R300_FPI0_ARGC_SRC0C_YZX
, 1, SLOT_SRC_VECTOR
},
218 {MAKE_SWZ3(Z
, X
, Y
), R300_FPI0_ARGC_SRC0C_ZXY
, 1, SLOT_SRC_VECTOR
},
219 {MAKE_SWZ3(W
, Z
, Y
), R300_FPI0_ARGC_SRC0CA_WZY
, 1, SLOT_SRC_BOTH
},
220 {MAKE_SWZ3(ONE
, ONE
, ONE
), R300_FPI0_ARGC_ONE
, 0, 0},
221 {MAKE_SWZ3(ZERO
, ZERO
, ZERO
), R300_FPI0_ARGC_ZERO
, 0, 0},
222 {MAKE_SWZ3(HALF
, HALF
, HALF
), R300_FPI0_ARGC_HALF
, 0, 0},
223 {PFS_INVAL
, 0, 0, 0},
227 /* used during matching of non-native swizzles */
228 #define SWZ_X_MASK (7 << 0)
229 #define SWZ_Y_MASK (7 << 3)
230 #define SWZ_Z_MASK (7 << 6)
231 #define SWZ_W_MASK (7 << 9)
232 static const struct {
233 GLuint hash
; /* used to mask matching swizzle components */
234 int mask
; /* actual outmask */
235 int count
; /* count of components matched */
238 {SWZ_X_MASK
| SWZ_Y_MASK
| SWZ_Z_MASK
, 1 | 2 | 4, 3},
239 {SWZ_X_MASK
| SWZ_Y_MASK
, 1 | 2, 2},
240 {SWZ_X_MASK
| SWZ_Z_MASK
, 1 | 4, 2},
241 {SWZ_Y_MASK
| SWZ_Z_MASK
, 2 | 4, 2},
245 {PFS_INVAL
, PFS_INVAL
, PFS_INVAL
}
249 static const struct {
250 int base
; /* hw value of swizzle */
251 int stride
; /* difference between SRC0/1/2 */
255 {R300_FPI2_ARGA_SRC0C_X
, 3, SLOT_SRC_VECTOR
},
256 {R300_FPI2_ARGA_SRC0C_Y
, 3, SLOT_SRC_VECTOR
},
257 {R300_FPI2_ARGA_SRC0C_Z
, 3, SLOT_SRC_VECTOR
},
258 {R300_FPI2_ARGA_SRC0A
, 1, SLOT_SRC_SCALAR
},
259 {R300_FPI2_ARGA_ZERO
, 0, 0},
260 {R300_FPI2_ARGA_ONE
, 0, 0},
261 {R300_FPI2_ARGA_HALF
, 0, 0}
265 /* boiler-plate reg, for convenience */
266 static const GLuint undef
= REG(REG_TYPE_TEMP
,
274 /* constant one source */
275 static const GLuint pfs_one
= REG(REG_TYPE_CONST
,
283 /* constant half source */
284 static const GLuint pfs_half
= REG(REG_TYPE_CONST
,
292 /* constant zero source */
293 static const GLuint pfs_zero
= REG(REG_TYPE_CONST
,
302 * Common function prototypes
304 static void dump_program(struct r300_fragment_program
*fp
);
305 static void emit_arith(struct r300_fragment_program
*fp
, int op
,
306 GLuint dest
, int mask
,
307 GLuint src0
, GLuint src1
, GLuint src2
, int flags
);
310 * Get an R300 temporary that can be written to in the given slot.
312 static int get_hw_temp(struct r300_fragment_program
*fp
, int slot
)
317 for (r
= 0; r
< PFS_NUM_TEMP_REGS
; ++r
) {
318 if (cs
->hwtemps
[r
].free
>= 0 && cs
->hwtemps
[r
].free
<= slot
)
322 if (r
>= PFS_NUM_TEMP_REGS
) {
323 ERROR("Out of hardware temps\n");
326 // Reserved is used to avoid the following scenario:
327 // R300 temporary X is first assigned to Mesa temporary Y during vector ops
328 // R300 temporary X is then assigned to Mesa temporary Z for further vector ops
329 // Then scalar ops on Mesa temporary Z are emitted and move back in time
330 // to overwrite the value of temporary Y.
332 cs
->hwtemps
[r
].reserved
= cs
->hwtemps
[r
].free
;
333 cs
->hwtemps
[r
].free
= -1;
335 // Reset to some value that won't mess things up when the user
336 // tries to read from a temporary that hasn't been assigned a value yet.
337 // In the normal case, vector_valid and scalar_valid should be set to
338 // a sane value by the first emit that writes to this temporary.
339 cs
->hwtemps
[r
].vector_valid
= 0;
340 cs
->hwtemps
[r
].scalar_valid
= 0;
342 if (r
> fp
->max_temp_idx
)
343 fp
->max_temp_idx
= r
;
349 * Get an R300 temporary that will act as a TEX destination register.
351 static int get_hw_temp_tex(struct r300_fragment_program
*fp
)
356 for (r
= 0; r
< PFS_NUM_TEMP_REGS
; ++r
) {
357 if (cs
->used_in_node
& (1 << r
))
360 // Note: Be very careful here
361 if (cs
->hwtemps
[r
].free
>= 0 && cs
->hwtemps
[r
].free
<= 0)
365 if (r
>= PFS_NUM_TEMP_REGS
)
366 return get_hw_temp(fp
, 0); /* Will cause an indirection */
368 cs
->hwtemps
[r
].reserved
= cs
->hwtemps
[r
].free
;
369 cs
->hwtemps
[r
].free
= -1;
371 // Reset to some value that won't mess things up when the user
372 // tries to read from a temporary that hasn't been assigned a value yet.
373 // In the normal case, vector_valid and scalar_valid should be set to
374 // a sane value by the first emit that writes to this temporary.
375 cs
->hwtemps
[r
].vector_valid
= cs
->nrslots
;
376 cs
->hwtemps
[r
].scalar_valid
= cs
->nrslots
;
378 if (r
> fp
->max_temp_idx
)
379 fp
->max_temp_idx
= r
;
385 * Mark the given hardware register as free.
387 static void free_hw_temp(struct r300_fragment_program
*fp
, int idx
)
391 // Be very careful here. Consider sequences like
394 // The TEX instruction may be moved in front of the MAD instruction
395 // due to the way nodes work. We don't want to alias r1 and r4 in
397 // I'm certain the register allocation could be further sanitized,
398 // but it's tricky because of stuff that can happen inside emit_tex
400 cs
->hwtemps
[idx
].free
= cs
->nrslots
+ 1;
404 * Create a new Mesa temporary register.
406 static GLuint
get_temp_reg(struct r300_fragment_program
*fp
)
412 index
= ffs(~cs
->temp_in_use
);
414 ERROR("Out of program temps\n");
418 cs
->temp_in_use
|= (1 << --index
);
419 cs
->temps
[index
].refcount
= 0xFFFFFFFF;
420 cs
->temps
[index
].reg
= -1;
422 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
423 REG_SET_INDEX(r
, index
);
424 REG_SET_VALID(r
, GL_TRUE
);
429 * Create a new Mesa temporary register that will act as the destination
430 * register for a texture read.
432 static GLuint
get_temp_reg_tex(struct r300_fragment_program
*fp
)
438 index
= ffs(~cs
->temp_in_use
);
440 ERROR("Out of program temps\n");
444 cs
->temp_in_use
|= (1 << --index
);
445 cs
->temps
[index
].refcount
= 0xFFFFFFFF;
446 cs
->temps
[index
].reg
= get_hw_temp_tex(fp
);
448 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
449 REG_SET_INDEX(r
, index
);
450 REG_SET_VALID(r
, GL_TRUE
);
455 * Free a Mesa temporary and the associated R300 temporary.
457 static void free_temp(struct r300_fragment_program
*fp
, GLuint r
)
460 GLuint index
= REG_GET_INDEX(r
);
462 if (!(cs
->temp_in_use
& (1 << index
)))
465 if (REG_GET_TYPE(r
) == REG_TYPE_TEMP
) {
466 free_hw_temp(fp
, cs
->temps
[index
].reg
);
467 cs
->temps
[index
].reg
= -1;
468 cs
->temp_in_use
&= ~(1 << index
);
469 } else if (REG_GET_TYPE(r
) == REG_TYPE_INPUT
) {
470 free_hw_temp(fp
, cs
->inputs
[index
].reg
);
471 cs
->inputs
[index
].reg
= -1;
476 * Emit a hardware constant/parameter.
478 * \p cp Stable pointer to an array of 4 floats.
479 * The pointer must be stable in the sense that it remains valid
480 * and hold the contents of the constant/parameter throughout the lifetime
481 * of the fragment program (actually, up until the next time the fragment
482 * program is translated).
484 static GLuint
emit_const4fv(struct r300_fragment_program
*fp
,
490 for (index
= 0; index
< fp
->const_nr
; ++index
) {
491 if (fp
->constant
[index
] == cp
)
495 if (index
>= fp
->const_nr
) {
496 if (index
>= PFS_NUM_CONST_REGS
) {
497 ERROR("Out of hw constants!\n");
502 fp
->constant
[index
] = cp
;
505 REG_SET_TYPE(reg
, REG_TYPE_CONST
);
506 REG_SET_INDEX(reg
, index
);
507 REG_SET_VALID(reg
, GL_TRUE
);
511 static inline GLuint
negate(GLuint r
)
518 /* Hack, to prevent clobbering sources used multiple times when
519 * emulating non-native instructions
521 static inline GLuint
keep(GLuint r
)
523 REG_SET_NO_USE(r
, GL_TRUE
);
527 static inline GLuint
absolute(GLuint r
)
533 static int swz_native(struct r300_fragment_program
*fp
,
534 GLuint src
, GLuint
* r
, GLuint arbneg
)
536 /* Native swizzle, handle negation */
537 src
= (src
& ~REG_NEGS_MASK
) | (((arbneg
>> 3) & 1) << REG_NEGS_SHIFT
);
539 if ((arbneg
& 0x7) == 0x0) {
540 src
= src
& ~REG_NEGV_MASK
;
542 } else if ((arbneg
& 0x7) == 0x7) {
543 src
|= REG_NEGV_MASK
;
546 if (!REG_GET_VALID(*r
))
547 *r
= get_temp_reg(fp
);
548 src
|= REG_NEGV_MASK
;
551 *r
, arbneg
& 0x7, keep(src
), pfs_one
, pfs_zero
, 0);
552 src
= src
& ~REG_NEGV_MASK
;
556 (arbneg
^ 0x7) | WRITEMASK_W
,
557 src
, pfs_one
, pfs_zero
, 0);
563 static int swz_emit_partial(struct r300_fragment_program
*fp
,
565 GLuint
* r
, int mask
, int mc
, GLuint arbneg
)
570 if (!REG_GET_VALID(*r
))
571 *r
= get_temp_reg(fp
);
573 /* A partial match, VSWZ/mask define what parts of the
574 * desired swizzle we match
576 if (mc
+ s_mask
[mask
].count
== 3) {
578 src
|= ((arbneg
>> 3) & 1) << REG_NEGS_SHIFT
;
581 tmp
= arbneg
& s_mask
[mask
].mask
;
583 tmp
= tmp
^ s_mask
[mask
].mask
;
588 arbneg
& s_mask
[mask
].mask
,
589 keep(src
) | REG_NEGV_MASK
,
590 pfs_one
, pfs_zero
, 0);
592 REG_SET_NO_USE(src
, GL_TRUE
);
594 REG_SET_NO_USE(src
, GL_FALSE
);
598 *r
, tmp
| wmask
, src
, pfs_one
, pfs_zero
, 0);
601 REG_SET_NO_USE(src
, GL_TRUE
);
603 REG_SET_NO_USE(src
, GL_FALSE
);
608 (arbneg
& s_mask
[mask
].mask
) | wmask
,
609 src
| REG_NEGV_MASK
, pfs_one
, pfs_zero
, 0);
613 REG_SET_NO_USE(src
, GL_TRUE
);
615 REG_SET_NO_USE(src
, GL_FALSE
);
617 emit_arith(fp
, PFS_OP_MAD
,
619 s_mask
[mask
].mask
| wmask
,
620 src
, pfs_one
, pfs_zero
, 0);
623 return s_mask
[mask
].count
;
626 static GLuint
do_swizzle(struct r300_fragment_program
*fp
,
627 GLuint src
, GLuint arbswz
, GLuint arbneg
)
634 /* If swizzling from something without an XYZW native swizzle,
635 * emit result to a temp, and do new swizzle from the temp.
638 if (REG_GET_VSWZ(src
) != SWIZZLE_XYZ
|| REG_GET_SSWZ(src
) != SWIZZLE_W
) {
639 GLuint temp
= get_temp_reg(fp
);
642 temp
, WRITEMASK_XYZW
, src
, pfs_one
, pfs_zero
, 0);
647 if (REG_GET_VSWZ(src
) != SWIZZLE_XYZ
|| REG_GET_SSWZ(src
) != SWIZZLE_W
) {
649 (v_swiz
[REG_GET_VSWZ(src
)].
650 hash
& (SWZ_X_MASK
| SWZ_Y_MASK
| SWZ_Z_MASK
)) |
651 REG_GET_SSWZ(src
) << 9;
656 for (i
= 0; i
< 4; ++i
) {
657 offset
= GET_SWZ(arbswz
, i
);
660 (offset
<= 3) ? GET_SWZ(vsrcswz
,
665 arbswz
= newswz
& (SWZ_X_MASK
| SWZ_Y_MASK
| SWZ_Z_MASK
);
666 REG_SET_SSWZ(src
, GET_SWZ(newswz
, 3));
668 /* set scalar swizzling */
669 REG_SET_SSWZ(src
, GET_SWZ(arbswz
, 3));
673 vswz
= REG_GET_VSWZ(src
);
677 REG_SET_VSWZ(src
, vswz
);
678 chash
= v_swiz
[REG_GET_VSWZ(src
)].hash
&
681 if (chash
== (arbswz
& s_mask
[c_mask
].hash
)) {
682 if (s_mask
[c_mask
].count
== 3) {
683 v_match
+= swz_native(fp
,
686 v_match
+= swz_emit_partial(fp
,
697 /* Fill with something invalid.. all 0's was
698 * wrong before, matched SWIZZLE_X. So all
699 * 1's will be okay for now
701 arbswz
|= (PFS_INVAL
& s_mask
[c_mask
].hash
);
703 } while (v_swiz
[++vswz
].hash
!= PFS_INVAL
);
704 REG_SET_VSWZ(src
, SWIZZLE_XYZ
);
705 } while (s_mask
[++c_mask
].hash
!= PFS_INVAL
);
707 ERROR("should NEVER get here\n");
711 static GLuint
t_src(struct r300_fragment_program
*fp
,
712 struct prog_src_register fpsrc
)
716 switch (fpsrc
.File
) {
717 case PROGRAM_TEMPORARY
:
718 REG_SET_INDEX(r
, fpsrc
.Index
);
719 REG_SET_VALID(r
, GL_TRUE
);
720 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
723 REG_SET_INDEX(r
, fpsrc
.Index
);
724 REG_SET_VALID(r
, GL_TRUE
);
725 REG_SET_TYPE(r
, REG_TYPE_INPUT
);
727 case PROGRAM_LOCAL_PARAM
:
728 r
= emit_const4fv(fp
,
729 fp
->mesa_program
.Base
.LocalParams
[fpsrc
.
732 case PROGRAM_ENV_PARAM
:
733 r
= emit_const4fv(fp
,
734 fp
->ctx
->FragmentProgram
.Parameters
[fpsrc
.
737 case PROGRAM_STATE_VAR
:
738 case PROGRAM_NAMED_PARAM
:
739 r
= emit_const4fv(fp
,
740 fp
->mesa_program
.Base
.Parameters
->
741 ParameterValues
[fpsrc
.Index
]);
744 ERROR("unknown SrcReg->File %x\n", fpsrc
.File
);
748 /* no point swizzling ONE/ZERO/HALF constants... */
749 if (REG_GET_VSWZ(r
) < SWIZZLE_111
|| REG_GET_SSWZ(r
) < SWIZZLE_ZERO
)
750 r
= do_swizzle(fp
, r
, fpsrc
.Swizzle
, fpsrc
.NegateBase
);
754 static GLuint
t_scalar_src(struct r300_fragment_program
*fp
,
755 struct prog_src_register fpsrc
)
757 struct prog_src_register src
= fpsrc
;
758 int sc
= GET_SWZ(fpsrc
.Swizzle
, 0); /* X */
760 src
.Swizzle
= ((sc
<< 0) | (sc
<< 3) | (sc
<< 6) | (sc
<< 9));
762 return t_src(fp
, src
);
765 static GLuint
t_dst(struct r300_fragment_program
*fp
,
766 struct prog_dst_register dest
)
771 case PROGRAM_TEMPORARY
:
772 REG_SET_INDEX(r
, dest
.Index
);
773 REG_SET_VALID(r
, GL_TRUE
);
774 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
777 REG_SET_TYPE(r
, REG_TYPE_OUTPUT
);
778 switch (dest
.Index
) {
779 case FRAG_RESULT_COLR
:
780 case FRAG_RESULT_DEPR
:
781 REG_SET_INDEX(r
, dest
.Index
);
782 REG_SET_VALID(r
, GL_TRUE
);
785 ERROR("Bad DstReg->Index 0x%x\n", dest
.Index
);
789 ERROR("Bad DstReg->File 0x%x\n", dest
.File
);
794 static int t_hw_src(struct r300_fragment_program
*fp
, GLuint src
, GLboolean tex
)
798 int index
= REG_GET_INDEX(src
);
800 switch (REG_GET_TYPE(src
)) {
802 /* NOTE: if reg==-1 here, a source is being read that
803 * hasn't been written to. Undefined results.
805 if (cs
->temps
[index
].reg
== -1)
806 cs
->temps
[index
].reg
= get_hw_temp(fp
, cs
->nrslots
);
808 idx
= cs
->temps
[index
].reg
;
810 if (!REG_GET_NO_USE(src
) && (--cs
->temps
[index
].refcount
== 0))
814 idx
= cs
->inputs
[index
].reg
;
816 if (!REG_GET_NO_USE(src
) && (--cs
->inputs
[index
].refcount
== 0))
817 free_hw_temp(fp
, cs
->inputs
[index
].reg
);
820 return (index
| SRC_CONST
);
822 ERROR("Invalid type for source reg\n");
823 return (0 | SRC_CONST
);
827 cs
->used_in_node
|= (1 << idx
);
832 static int t_hw_dst(struct r300_fragment_program
*fp
,
833 GLuint dest
, GLboolean tex
, int slot
)
837 GLuint index
= REG_GET_INDEX(dest
);
838 assert(REG_GET_VALID(dest
));
840 switch (REG_GET_TYPE(dest
)) {
842 if (cs
->temps
[REG_GET_INDEX(dest
)].reg
== -1) {
844 cs
->temps
[index
].reg
= get_hw_temp(fp
, slot
);
846 cs
->temps
[index
].reg
= get_hw_temp_tex(fp
);
849 idx
= cs
->temps
[index
].reg
;
851 if (!REG_GET_NO_USE(dest
) && (--cs
->temps
[index
].refcount
== 0))
854 cs
->dest_in_node
|= (1 << idx
);
855 cs
->used_in_node
|= (1 << idx
);
857 case REG_TYPE_OUTPUT
:
859 case FRAG_RESULT_COLR
:
860 fp
->node
[fp
->cur_node
].flags
|=
861 R300_PFS_NODE_OUTPUT_COLOR
;
863 case FRAG_RESULT_DEPR
:
864 fp
->node
[fp
->cur_node
].flags
|=
865 R300_PFS_NODE_OUTPUT_DEPTH
;
871 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest
));
878 static void emit_nop(struct r300_fragment_program
*fp
)
882 if (cs
->nrslots
>= PFS_MAX_ALU_INST
) {
883 ERROR("Out of ALU instruction slots\n");
887 fp
->alu
.inst
[cs
->nrslots
].inst0
= NOP_INST0
;
888 fp
->alu
.inst
[cs
->nrslots
].inst1
= NOP_INST1
;
889 fp
->alu
.inst
[cs
->nrslots
].inst2
= NOP_INST2
;
890 fp
->alu
.inst
[cs
->nrslots
].inst3
= NOP_INST3
;
894 static void emit_tex(struct r300_fragment_program
*fp
,
895 struct prog_instruction
*fpi
, int opcode
)
898 GLuint coord
= t_src(fp
, fpi
->SrcReg
[0]);
899 GLuint dest
= undef
, rdest
= undef
;
901 int unit
= fpi
->TexSrcUnit
;
905 uin
= cs
->used_in_node
;
906 din
= cs
->dest_in_node
;
908 /* Resolve source/dest to hardware registers */
909 if (opcode
!= R300_FPITX_OP_KIL
) {
910 if (fpi
->TexSrcTarget
== TEXTURE_RECT_INDEX
) {
912 * Hardware uses [0..1]x[0..1] range for rectangle textures
913 * instead of [0..Width]x[0..Height].
914 * Add a scaling instruction.
916 * \todo Refactor this once we have proper rewriting/optimization
917 * support for programs.
919 gl_state_index tokens
[STATE_LENGTH
] = {
920 STATE_INTERNAL
, STATE_R300_TEXRECT_FACTOR
, 0, 0,
928 _mesa_add_state_reference(fp
->mesa_program
.Base
.
932 fp
->mesa_program
.Base
.Parameters
->
933 ParameterValues
[factor_index
]);
934 tempreg
= keep(get_temp_reg(fp
));
936 emit_arith(fp
, PFS_OP_MAD
, tempreg
, WRITEMASK_XYZW
,
937 coord
, factorreg
, pfs_zero
, 0);
939 /* Ensure correct node indirection */
940 uin
= cs
->used_in_node
;
941 din
= cs
->dest_in_node
;
943 hwsrc
= t_hw_src(fp
, tempreg
, GL_TRUE
);
945 hwsrc
= t_hw_src(fp
, coord
, GL_TRUE
);
948 dest
= t_dst(fp
, fpi
->DstReg
);
950 /* r300 doesn't seem to be able to do TEX->output reg */
951 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
953 dest
= get_temp_reg_tex(fp
);
956 t_hw_dst(fp
, dest
, GL_TRUE
,
957 fp
->node
[fp
->cur_node
].alu_offset
);
959 /* Use a temp that hasn't been used in this node, rather
960 * than causing an indirection
962 if (uin
& (1 << hwdest
)) {
963 free_hw_temp(fp
, hwdest
);
964 hwdest
= get_hw_temp_tex(fp
);
965 cs
->temps
[REG_GET_INDEX(dest
)].reg
= hwdest
;
970 hwsrc
= t_hw_src(fp
, coord
, GL_TRUE
);
973 /* Indirection if source has been written in this node, or if the
974 * dest has been read/written in this node
976 if ((REG_GET_TYPE(coord
) != REG_TYPE_CONST
&&
977 (din
& (1 << hwsrc
))) || (uin
& (1 << hwdest
))) {
979 /* Finish off current node */
980 if (fp
->node
[fp
->cur_node
].alu_offset
== cs
->nrslots
)
983 fp
->node
[fp
->cur_node
].alu_end
=
984 cs
->nrslots
- fp
->node
[fp
->cur_node
].alu_offset
- 1;
985 assert(fp
->node
[fp
->cur_node
].alu_end
>= 0);
987 if (++fp
->cur_node
>= PFS_MAX_TEX_INDIRECT
) {
988 ERROR("too many levels of texture indirection\n");
993 fp
->node
[fp
->cur_node
].tex_offset
= fp
->tex
.length
;
994 fp
->node
[fp
->cur_node
].alu_offset
= cs
->nrslots
;
995 fp
->node
[fp
->cur_node
].tex_end
= -1;
996 fp
->node
[fp
->cur_node
].alu_end
= -1;
997 fp
->node
[fp
->cur_node
].flags
= 0;
998 cs
->used_in_node
= 0;
999 cs
->dest_in_node
= 0;
1002 if (fp
->cur_node
== 0)
1003 fp
->first_node_has_tex
= 1;
1005 fp
->tex
.inst
[fp
->tex
.length
++] = 0 | (hwsrc
<< R300_FPITX_SRC_SHIFT
)
1006 | (hwdest
<< R300_FPITX_DST_SHIFT
)
1007 | (unit
<< R300_FPITX_IMAGE_SHIFT
)
1008 /* not entirely sure about this */
1009 | (opcode
<< R300_FPITX_OPCODE_SHIFT
);
1011 cs
->dest_in_node
|= (1 << hwdest
);
1012 if (REG_GET_TYPE(coord
) != REG_TYPE_CONST
)
1013 cs
->used_in_node
|= (1 << hwsrc
);
1015 fp
->node
[fp
->cur_node
].tex_end
++;
1017 /* Copy from temp to output if needed */
1018 if (REG_GET_VALID(rdest
)) {
1019 emit_arith(fp
, PFS_OP_MAD
, rdest
, WRITEMASK_XYZW
, dest
,
1020 pfs_one
, pfs_zero
, 0);
1021 free_temp(fp
, dest
);
1024 /* Free temp register */
1026 free_temp(fp
, tempreg
);
1030 * Returns the first slot where we could possibly allow writing to dest,
1031 * according to register allocation.
1033 static int get_earliest_allowed_write(struct r300_fragment_program
*fp
,
1034 GLuint dest
, int mask
)
1039 GLuint index
= REG_GET_INDEX(dest
);
1040 assert(REG_GET_VALID(dest
));
1042 switch (REG_GET_TYPE(dest
)) {
1044 if (cs
->temps
[index
].reg
== -1)
1047 idx
= cs
->temps
[index
].reg
;
1049 case REG_TYPE_OUTPUT
:
1052 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest
));
1056 pos
= cs
->hwtemps
[idx
].reserved
;
1057 if (mask
& WRITEMASK_XYZ
) {
1058 if (pos
< cs
->hwtemps
[idx
].vector_lastread
)
1059 pos
= cs
->hwtemps
[idx
].vector_lastread
;
1061 if (mask
& WRITEMASK_W
) {
1062 if (pos
< cs
->hwtemps
[idx
].scalar_lastread
)
1063 pos
= cs
->hwtemps
[idx
].scalar_lastread
;
1070 * Allocates a slot for an ALU instruction that can consist of
1071 * a vector part or a scalar part or both.
1073 * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1074 * appropriate position (vector and/or scalar), and their positions are
1075 * recorded in the srcpos array.
1077 * This function emits instruction code for the source fetch and the
1078 * argument selection. It does not emit instruction code for the
1079 * opcode or the destination selection.
1081 * @return the index of the slot
1083 static int find_and_prepare_slot(struct r300_fragment_program
*fp
,
1086 int argc
, GLuint
* src
, GLuint dest
, int mask
)
1099 // Determine instruction slots, whether sources are required on
1100 // vector or scalar side, and the smallest slot number where
1101 // all source registers are available
1104 used
|= SLOT_OP_VECTOR
;
1106 used
|= SLOT_OP_SCALAR
;
1108 pos
= get_earliest_allowed_write(fp
, dest
, mask
);
1110 if (fp
->node
[fp
->cur_node
].alu_offset
> pos
)
1111 pos
= fp
->node
[fp
->cur_node
].alu_offset
;
1112 for (i
= 0; i
< argc
; ++i
) {
1113 if (!REG_GET_BUILTIN(src
[i
])) {
1115 used
|= v_swiz
[REG_GET_VSWZ(src
[i
])].flags
<< i
;
1117 used
|= s_swiz
[REG_GET_SSWZ(src
[i
])].flags
<< i
;
1120 hwsrc
[i
] = t_hw_src(fp
, src
[i
], GL_FALSE
); /* Note: sideeffects wrt refcounting! */
1121 regnr
= hwsrc
[i
] & 31;
1123 if (REG_GET_TYPE(src
[i
]) == REG_TYPE_TEMP
) {
1124 if (used
& (SLOT_SRC_VECTOR
<< i
)) {
1125 if (cs
->hwtemps
[regnr
].vector_valid
> pos
)
1126 pos
= cs
->hwtemps
[regnr
].vector_valid
;
1128 if (used
& (SLOT_SRC_SCALAR
<< i
)) {
1129 if (cs
->hwtemps
[regnr
].scalar_valid
> pos
)
1130 pos
= cs
->hwtemps
[regnr
].scalar_valid
;
1135 // Find a slot that fits
1137 if (cs
->slot
[pos
].used
& used
& SLOT_OP_BOTH
)
1140 if (pos
>= cs
->nrslots
) {
1141 if (cs
->nrslots
>= PFS_MAX_ALU_INST
) {
1142 ERROR("Out of ALU instruction slots\n");
1146 fp
->alu
.inst
[pos
].inst0
= NOP_INST0
;
1147 fp
->alu
.inst
[pos
].inst1
= NOP_INST1
;
1148 fp
->alu
.inst
[pos
].inst2
= NOP_INST2
;
1149 fp
->alu
.inst
[pos
].inst3
= NOP_INST3
;
1153 // Note: When we need both parts (vector and scalar) of a source,
1154 // we always try to put them into the same position. This makes the
1155 // code easier to read, and it is optimal (i.e. one doesn't gain
1156 // anything by splitting the parts).
1157 // It also avoids headaches with swizzles that access both parts (i.e WXY)
1158 tempused
= cs
->slot
[pos
].used
;
1159 for (i
= 0; i
< 3; ++i
) {
1160 tempvsrc
[i
] = cs
->slot
[pos
].vsrc
[i
];
1161 tempssrc
[i
] = cs
->slot
[pos
].ssrc
[i
];
1164 for (i
= 0; i
< argc
; ++i
) {
1165 int flags
= (used
>> i
) & SLOT_SRC_BOTH
;
1172 for (j
= 0; j
< 3; ++j
) {
1173 if ((tempused
>> j
) & flags
& SLOT_SRC_VECTOR
) {
1174 if (tempvsrc
[j
] != hwsrc
[i
])
1178 if ((tempused
>> j
) & flags
& SLOT_SRC_SCALAR
) {
1179 if (tempssrc
[j
] != hwsrc
[i
])
1190 tempused
|= flags
<< j
;
1191 if (flags
& SLOT_SRC_VECTOR
)
1192 tempvsrc
[j
] = hwsrc
[i
];
1193 if (flags
& SLOT_SRC_SCALAR
)
1194 tempssrc
[j
] = hwsrc
[i
];
1201 // Found a slot, reserve it
1202 cs
->slot
[pos
].used
= tempused
| (used
& SLOT_OP_BOTH
);
1203 for (i
= 0; i
< 3; ++i
) {
1204 cs
->slot
[pos
].vsrc
[i
] = tempvsrc
[i
];
1205 cs
->slot
[pos
].ssrc
[i
] = tempssrc
[i
];
1208 for (i
= 0; i
< argc
; ++i
) {
1209 if (REG_GET_TYPE(src
[i
]) == REG_TYPE_TEMP
) {
1210 int regnr
= hwsrc
[i
] & 31;
1212 if (used
& (SLOT_SRC_VECTOR
<< i
)) {
1213 if (cs
->hwtemps
[regnr
].vector_lastread
< pos
)
1214 cs
->hwtemps
[regnr
].vector_lastread
=
1217 if (used
& (SLOT_SRC_SCALAR
<< i
)) {
1218 if (cs
->hwtemps
[regnr
].scalar_lastread
< pos
)
1219 cs
->hwtemps
[regnr
].scalar_lastread
=
1225 // Emit the source fetch code
1226 fp
->alu
.inst
[pos
].inst1
&= ~R300_FPI1_SRC_MASK
;
1227 fp
->alu
.inst
[pos
].inst1
|=
1228 ((cs
->slot
[pos
].vsrc
[0] << R300_FPI1_SRC0C_SHIFT
) |
1229 (cs
->slot
[pos
].vsrc
[1] << R300_FPI1_SRC1C_SHIFT
) |
1230 (cs
->slot
[pos
].vsrc
[2] << R300_FPI1_SRC2C_SHIFT
));
1232 fp
->alu
.inst
[pos
].inst3
&= ~R300_FPI3_SRC_MASK
;
1233 fp
->alu
.inst
[pos
].inst3
|=
1234 ((cs
->slot
[pos
].ssrc
[0] << R300_FPI3_SRC0A_SHIFT
) |
1235 (cs
->slot
[pos
].ssrc
[1] << R300_FPI3_SRC1A_SHIFT
) |
1236 (cs
->slot
[pos
].ssrc
[2] << R300_FPI3_SRC2A_SHIFT
));
1238 // Emit the argument selection code
1242 for (i
= 0; i
< 3; ++i
) {
1244 swz
[i
] = (v_swiz
[REG_GET_VSWZ(src
[i
])].base
+
1246 v_swiz
[REG_GET_VSWZ(src
[i
])].
1247 stride
)) | ((src
[i
] & REG_NEGV_MASK
)
1248 ? ARG_NEG
: 0) | ((src
[i
]
1255 swz
[i
] = R300_FPI0_ARGC_ZERO
;
1259 fp
->alu
.inst
[pos
].inst0
&=
1260 ~(R300_FPI0_ARG0C_MASK
| R300_FPI0_ARG1C_MASK
|
1261 R300_FPI0_ARG2C_MASK
);
1262 fp
->alu
.inst
[pos
].inst0
|=
1263 (swz
[0] << R300_FPI0_ARG0C_SHIFT
) | (swz
[1] <<
1264 R300_FPI0_ARG1C_SHIFT
)
1265 | (swz
[2] << R300_FPI0_ARG2C_SHIFT
);
1271 for (i
= 0; i
< 3; ++i
) {
1273 swz
[i
] = (s_swiz
[REG_GET_SSWZ(src
[i
])].base
+
1275 s_swiz
[REG_GET_SSWZ(src
[i
])].
1276 stride
)) | ((src
[i
] & REG_NEGV_MASK
)
1277 ? ARG_NEG
: 0) | ((src
[i
]
1284 swz
[i
] = R300_FPI2_ARGA_ZERO
;
1288 fp
->alu
.inst
[pos
].inst2
&=
1289 ~(R300_FPI2_ARG0A_MASK
| R300_FPI2_ARG1A_MASK
|
1290 R300_FPI2_ARG2A_MASK
);
1291 fp
->alu
.inst
[pos
].inst2
|=
1292 (swz
[0] << R300_FPI2_ARG0A_SHIFT
) | (swz
[1] <<
1293 R300_FPI2_ARG1A_SHIFT
)
1294 | (swz
[2] << R300_FPI2_ARG2A_SHIFT
);
/**
 * emit_arith - append one ALU instruction implementing `op` to the program.
 *
 * NOTE(review): this listing has lost lines (the embedded original line
 * numbers skip), so only the statements that are actually visible below are
 * described.
 *
 * Visible flow:
 *  - the vector opcode, scalar opcode and argument count are looked up in
 *    r300_fpop[op];
 *  - a destination of type REG_TYPE_OUTPUT with index FRAG_RESULT_DEPR is
 *    tested separately, together with the WRITEMASK_Z bit of mask;
 *  - emit_vop and emit_sop start as GL_FALSE; the two tests that follow look
 *    at the XYZ bits of mask (or a DP3 vector op) and at the W bit of mask
 *    (or a REPL_ALPHA vector op) - the statements they guard are not visible,
 *    presumably they set the respective flag;
 *  - find_and_prepare_slot() is called with those flags, then t_hw_dst()
 *    resolves the hardware destination register;
 *  - PFS_FLAG_SAT in flags ORs the saturate bits into both opcodes;
 *  - inst0 and inst1 receive the vector opcode, the destination register and
 *    the XYZ write mask (shifted into the output-mask field for
 *    FRAG_RESULT_COLR, and into the register-mask field in another branch);
 *  - inst2 and inst3 receive the scalar opcode and, when WRITEMASK_W is set,
 *    the alpha destination (output, depth or register form);
 *  - hwtemps[hwdest].vector_valid and .scalar_valid are set to pos + 1.
 */
1301 * Append an ALU instruction to the instruction list.
1303 static void emit_arith(struct r300_fragment_program
*fp
,
1307 GLuint src0
, GLuint src1
, GLuint src2
, int flags
)
1310 GLuint src
[3] = { src0
, src1
, src2
};
1312 GLboolean emit_vop
, emit_sop
;
1316 vop
= r300_fpop
[op
].v_op
;
1317 sop
= r300_fpop
[op
].s_op
;
1318 argc
= r300_fpop
[op
].argc
;
1320 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
&&
1321 REG_GET_INDEX(dest
) == FRAG_RESULT_DEPR
) {
1322 if (mask
& WRITEMASK_Z
) {
1329 emit_vop
= GL_FALSE
;
1330 emit_sop
= GL_FALSE
;
1331 if ((mask
& WRITEMASK_XYZ
) || vop
== R300_FPI0_OUTC_DP3
)
1333 if ((mask
& WRITEMASK_W
) || vop
== R300_FPI0_OUTC_REPL_ALPHA
)
1337 find_and_prepare_slot(fp
, emit_vop
, emit_sop
, argc
, src
, dest
,
1342 hwdest
= t_hw_dst(fp
, dest
, GL_FALSE
, pos
); /* Note: Side effects wrt register allocation */
1344 if (flags
& PFS_FLAG_SAT
) {
1345 vop
|= R300_FPI0_OUTC_SAT
;
1346 sop
|= R300_FPI2_OUTA_SAT
;
1349 /* Throw the pieces together and get FPI0/1 */
1351 fp
->alu
.inst
[pos
].inst0
|= vop
;
1353 fp
->alu
.inst
[pos
].inst1
|= hwdest
<< R300_FPI1_DSTC_SHIFT
;
1355 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1356 if (REG_GET_INDEX(dest
) == FRAG_RESULT_COLR
) {
1357 fp
->alu
.inst
[pos
].inst1
|=
1358 (mask
& WRITEMASK_XYZ
) <<
1359 R300_FPI1_DSTC_OUTPUT_MASK_SHIFT
;
1363 fp
->alu
.inst
[pos
].inst1
|=
1364 (mask
& WRITEMASK_XYZ
) <<
1365 R300_FPI1_DSTC_REG_MASK_SHIFT
;
1367 cs
->hwtemps
[hwdest
].vector_valid
= pos
+ 1;
1371 /* And now FPI2/3 */
1373 fp
->alu
.inst
[pos
].inst2
|= sop
;
1375 if (mask
& WRITEMASK_W
) {
1376 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1377 if (REG_GET_INDEX(dest
) == FRAG_RESULT_COLR
) {
1378 fp
->alu
.inst
[pos
].inst3
|=
1379 (hwdest
<< R300_FPI3_DSTA_SHIFT
) |
1380 R300_FPI3_DSTA_OUTPUT
;
1381 } else if (REG_GET_INDEX(dest
) ==
1383 fp
->alu
.inst
[pos
].inst3
|=
1384 R300_FPI3_DSTA_DEPTH
;
1388 fp
->alu
.inst
[pos
].inst3
|=
1389 (hwdest
<< R300_FPI3_DSTA_SHIFT
) |
1392 cs
->hwtemps
[hwdest
].scalar_valid
= pos
+ 1;
/**
 * get_attrib - build a register reference for fragment input `attr`.
 *
 * Reports an error through ERROR() when bit `attr` is not set in the Mesa
 * program's InputsRead mask. The visible code then tags `r` as
 * REG_TYPE_INPUT, stores `attr` as its index and marks it valid.
 *
 * NOTE(review): the declaration of `r` and the return statements are not
 * visible in this listing; presumably `r` is the value returned on success -
 * confirm against the full file.
 * NOTE(review): `attr` is a GLuint but is printed with "%d" and used as the
 * shift count of a signed `1 << attr`; assumes attr stays well below 31 -
 * confirm.
 */
1401 static GLuint
get_attrib(struct r300_fragment_program
*fp
, GLuint attr
)
1403 struct gl_fragment_program
*mp
= &fp
->mesa_program
;
1406 if (!(mp
->Base
.InputsRead
& (1 << attr
))) {
1407 ERROR("Attribute %d was not provided!\n", attr
);
1411 REG_SET_TYPE(r
, REG_TYPE_INPUT
);
1412 REG_SET_INDEX(r
, attr
);
1413 REG_SET_VALID(r
, GL_TRUE
);
/**
 * Constant table for the parabola-based sine/cosine expansions in
 * parse_program(), where rows 0 and 1 are uploaded with emit_const4fv().
 *
 * NOTE(review): most initializer rows are missing from this listing; only
 * the three entries below, labelled by their trailing comments, are visible.
 */
1418 static GLfloat SinCosConsts
[2][4] = {
1420 1.273239545, // 4/PI
1421 -0.405284735, // -4/(PI*PI)
1428 0.159154943, // 1/(2*PI)
/**
 * LitConst / emit_lit - expand the LIT opcode into a fixed sequence of ALU
 * instructions.
 *
 * LitConst holds the +/-(128 - epsilon) bounds used to clamp the exponent and
 * is uploaded with emit_const4fv() before anything is emitted.
 *
 * The two visible tests (write mask different from XYZW, destination of type
 * REG_TYPE_OUTPUT) precede the needTemporary check; the statements they guard
 * are not visible, presumably they request the scratch register. When
 * needTemporary is set, the work is done in a register from get_temp_reg(),
 * the result is copied to dest with a final MAD that carries `flags`, and the
 * scratch register is released with free_temp().
 *
 * Emission order as visible below: MAX (xy), MAX (w), MIN (z), LG2 (w),
 * MAD (w), MAD (y, with flags), MAD (x), EX2 (w), CMP (z, with flags),
 * MAD (w). The existing note explains why this order must be kept.
 *
 * NOTE(review): lines are missing from this listing, including the local
 * declarations and the branch structure around the final t_hw_dst() call.
 */
1434 * Emit a LIT instruction.
1435 * \p flags may be PFS_FLAG_SAT
1437 * Definition of LIT (from ARB_fragment_program):
1438 * tmp = VectorLoad(op0);
1439 * if (tmp.x < 0) tmp.x = 0;
1440 * if (tmp.y < 0) tmp.y = 0;
1441 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
1442 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
1445 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
1448 * The longest path of computation is the one leading to result.z,
1449 * consisting of 5 operations. This implementation of LIT takes
1450 * 5 slots. So unless there's some special undocumented opcode,
1451 * this implementation is potentially optimal. Unfortunately,
1452 * emit_arith is a bit too conservative because it doesn't understand
1453 * partial writes to the vector component.
1455 static const GLfloat LitConst
[4] =
1456 { 127.999999, 127.999999, 127.999999, -127.999999 };
1458 static void emit_lit(struct r300_fragment_program
*fp
,
1459 GLuint dest
, int mask
, GLuint src
, int flags
)
1466 cnst
= emit_const4fv(fp
, LitConst
);
1469 if ((mask
& WRITEMASK_XYZW
) != WRITEMASK_XYZW
) {
1471 } else if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1472 // LIT is typically followed by DP3/DP4, so there's no point
1473 // in creating special code for this case
1477 if (needTemporary
) {
1478 temp
= keep(get_temp_reg(fp
));
1483 // Note: The order of emit_arith inside the slots is relevant,
1484 // because emit_arith only looks at scalar vs. vector when resolving
1485 // dependencies, and it does not consider individual vector components,
1486 // so swizzling between the two parts can create fake dependencies.
1489 emit_arith(fp
, PFS_OP_MAX
, temp
, WRITEMASK_XY
,
1490 keep(src
), pfs_zero
, undef
, 0);
1491 emit_arith(fp
, PFS_OP_MAX
, temp
, WRITEMASK_W
, src
, cnst
, undef
, 0);
1494 emit_arith(fp
, PFS_OP_MIN
, temp
, WRITEMASK_Z
,
1495 swizzle(temp
, W
, W
, W
, W
), cnst
, undef
, 0);
1496 emit_arith(fp
, PFS_OP_LG2
, temp
, WRITEMASK_W
,
1497 swizzle(temp
, Y
, Y
, Y
, Y
), undef
, undef
, 0);
1500 // If desired, we saturate the y result here.
1501 // This does not affect the use as a condition variable in the CMP later
1502 emit_arith(fp
, PFS_OP_MAD
, temp
, WRITEMASK_W
,
1503 temp
, swizzle(temp
, Z
, Z
, Z
, Z
), pfs_zero
, 0);
1504 emit_arith(fp
, PFS_OP_MAD
, temp
, WRITEMASK_Y
,
1505 swizzle(temp
, X
, X
, X
, X
), pfs_one
, pfs_zero
, flags
);
1508 emit_arith(fp
, PFS_OP_MAD
, temp
, WRITEMASK_X
,
1509 pfs_one
, pfs_one
, pfs_zero
, 0);
1510 emit_arith(fp
, PFS_OP_EX2
, temp
, WRITEMASK_W
, temp
, undef
, undef
, 0);
1513 emit_arith(fp
, PFS_OP_CMP
, temp
, WRITEMASK_Z
,
1514 pfs_zero
, swizzle(temp
, W
, W
, W
, W
),
1515 negate(swizzle(temp
, Y
, Y
, Y
, Y
)), flags
);
1516 emit_arith(fp
, PFS_OP_MAD
, temp
, WRITEMASK_W
, pfs_one
, pfs_one
,
1519 if (needTemporary
) {
1520 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1521 temp
, pfs_one
, pfs_zero
, flags
);
1522 free_temp(fp
, temp
);
1524 // Decrease refcount of the destination
1525 t_hw_dst(fp
, dest
, GL_FALSE
, cs
->nrslots
);
/**
 * parse_program - translate each Mesa instruction of fp->mesa_program into
 * r300 ALU and texture instructions.
 *
 * An error ("empty program?") is reported through ERROR() when the program
 * has no instruction array or its first instruction is OPCODE_END.
 *
 * For every instruction up to OPCODE_END:
 *  - flags is set to PFS_FLAG_SAT when SaturateMode is SATURATE_ZERO_ONE;
 *  - for every opcode except OPCODE_KIL, dest and mask are taken from DstReg
 *    (t_dst() and WriteMask);
 *  - a switch on the opcode translates the sources with t_src() or
 *    t_scalar_src() and emits the work through emit_arith(), emit_lit() or
 *    emit_tex(); multi-step expansions borrow scratch registers with
 *    get_temp_reg() and release them with free_temp();
 *  - an unrecognised opcode is reported through ERROR().
 *
 * The caller compares the result with GL_FALSE to detect failure; the return
 * statements themselves are not visible here.
 *
 * NOTE(review): this listing has lost lines - every case label and break,
 * several argument lines and the local declarations of const_sin are missing.
 * Which opcode each group of statements belongs to can only be inferred from
 * the PFS_OP_* values and the surviving comments.
 * NOTE(review): the EX2 that finishes the LG2 / MAD / EX2 sequence
 * (presumably the power opcode) passes 0 instead of `flags` and uses
 * fpi->DstReg.WriteMask instead of `mask`, so a saturate request looks like
 * it would be dropped there - confirm whether that is intended.
 */
1529 static GLboolean
parse_program(struct r300_fragment_program
*fp
)
1531 struct gl_fragment_program
*mp
= &fp
->mesa_program
;
1532 const struct prog_instruction
*inst
= mp
->Base
.Instructions
;
1533 struct prog_instruction
*fpi
;
1534 GLuint src
[3], dest
, temp
[2];
1535 int flags
, mask
= 0;
1538 if (!inst
|| inst
[0].Opcode
== OPCODE_END
) {
1539 ERROR("empty program?\n");
1543 for (fpi
= mp
->Base
.Instructions
; fpi
->Opcode
!= OPCODE_END
; fpi
++) {
1544 if (fpi
->SaturateMode
== SATURATE_ZERO_ONE
)
1545 flags
= PFS_FLAG_SAT
;
1549 if (fpi
->Opcode
!= OPCODE_KIL
) {
1550 dest
= t_dst(fp
, fpi
->DstReg
);
1551 mask
= fpi
->DstReg
.WriteMask
;
1554 switch (fpi
->Opcode
) {
1556 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1557 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1558 absolute(src
[0]), pfs_one
, pfs_zero
, flags
);
1561 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1562 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1563 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1564 src
[0], pfs_one
, src
[1], flags
);
1567 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1568 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1569 src
[2] = t_src(fp
, fpi
->SrcReg
[2]);
1570 /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1571 * r300 - if src2.c < 0.0 ? src1.c : src0.c
1573 emit_arith(fp
, PFS_OP_CMP
, dest
, mask
,
1574 src
[2], src
[1], src
[0], flags
);
1578 * cos using a parabola (see SIN):
1580 * x = (x/(2*PI))+0.75
1585 temp
[0] = get_temp_reg(fp
);
1586 const_sin
[0] = emit_const4fv(fp
, SinCosConsts
[0]);
1587 const_sin
[1] = emit_const4fv(fp
, SinCosConsts
[1]);
1588 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1590 /* add 0.5*PI and do range reduction */
1592 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1593 swizzle(src
[0], X
, X
, X
, X
),
1594 swizzle(const_sin
[1], Z
, Z
, Z
, Z
),
1595 swizzle(const_sin
[1], X
, X
, X
, X
), 0);
1597 emit_arith(fp
, PFS_OP_FRC
, temp
[0], WRITEMASK_X
,
1598 swizzle(temp
[0], X
, X
, X
, X
),
1601 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Z
, swizzle(temp
[0], X
, X
, X
, X
), swizzle(const_sin
[1], W
, W
, W
, W
), //2*PI
1602 negate(swizzle(const_sin
[0], Z
, Z
, Z
, Z
)), //-PI
1607 emit_arith(fp
, PFS_OP_MAD
, temp
[0],
1608 WRITEMASK_X
| WRITEMASK_Y
, swizzle(temp
[0],
1611 const_sin
[0], pfs_zero
, 0);
1613 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1614 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1615 absolute(swizzle(temp
[0], Z
, Z
, Z
, Z
)),
1616 swizzle(temp
[0], X
, X
, X
, X
), 0);
1618 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Y
,
1619 swizzle(temp
[0], X
, X
, X
, X
),
1620 absolute(swizzle(temp
[0], X
, X
, X
, X
)),
1621 negate(swizzle(temp
[0], X
, X
, X
, X
)), 0);
1623 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1624 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1625 swizzle(const_sin
[0], W
, W
, W
, W
),
1626 swizzle(temp
[0], X
, X
, X
, X
), flags
);
1628 free_temp(fp
, temp
[0]);
1631 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1632 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1633 emit_arith(fp
, PFS_OP_DP3
, dest
, mask
,
1634 src
[0], src
[1], undef
, flags
);
1637 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1638 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1639 emit_arith(fp
, PFS_OP_DP4
, dest
, mask
,
1640 src
[0], src
[1], undef
, flags
);
1643 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1644 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1645 /* src0.xyz1 -> temp
1646 * DP4 dest, temp, src1
1649 temp
[0] = get_temp_reg(fp
);
1650 src
[0].s_swz
= SWIZZLE_ONE
;
1651 emit_arith(fp
, PFS_OP_MAD
, temp
[0], mask
,
1652 src
[0], pfs_one
, pfs_zero
, 0);
1653 emit_arith(fp
, PFS_OP_DP4
, dest
, mask
,
1654 temp
[0], src
[1], undef
, flags
);
1655 free_temp(fp
, temp
[0]);
1657 emit_arith(fp
, PFS_OP_DP4
, dest
, mask
,
1658 swizzle(src
[0], X
, Y
, Z
, ONE
), src
[1],
1663 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1664 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1665 /* dest.y = src0.y * src1.y */
1666 if (mask
& WRITEMASK_Y
)
1667 emit_arith(fp
, PFS_OP_MAD
, dest
, WRITEMASK_Y
,
1668 keep(src
[0]), keep(src
[1]),
1670 /* dest.z = src0.z */
1671 if (mask
& WRITEMASK_Z
)
1672 emit_arith(fp
, PFS_OP_MAD
, dest
, WRITEMASK_Z
,
1673 src
[0], pfs_one
, pfs_zero
, flags
);
1675 * result.w = src1.w */
1676 if (mask
& WRITEMASK_XW
) {
1677 REG_SET_VSWZ(src
[1], SWIZZLE_111
); /*Cheat */
1678 emit_arith(fp
, PFS_OP_MAD
, dest
,
1679 mask
& WRITEMASK_XW
,
1680 src
[1], pfs_one
, pfs_zero
, flags
);
1684 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1685 emit_arith(fp
, PFS_OP_EX2
, dest
, mask
,
1686 src
[0], undef
, undef
, flags
);
1689 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1690 temp
[0] = get_temp_reg(fp
);
1692 * MAD dest, src0, 1.0, -temp
1694 emit_arith(fp
, PFS_OP_FRC
, temp
[0], mask
,
1695 keep(src
[0]), undef
, undef
, 0);
1696 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1697 src
[0], pfs_one
, negate(temp
[0]), flags
);
1698 free_temp(fp
, temp
[0]);
1701 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1702 emit_arith(fp
, PFS_OP_FRC
, dest
, mask
,
1703 src
[0], undef
, undef
, flags
);
1706 emit_tex(fp
, fpi
, R300_FPITX_OP_KIL
);
1709 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1710 emit_arith(fp
, PFS_OP_LG2
, dest
, mask
,
1711 src
[0], undef
, undef
, flags
);
1714 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1715 emit_lit(fp
, dest
, mask
, src
[0], flags
);
1718 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1719 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1720 src
[2] = t_src(fp
, fpi
->SrcReg
[2]);
1721 /* result = tmp0tmp1 + (1 - tmp0)tmp2
1722 * = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1723 * MAD temp, -tmp0, tmp2, tmp2
1724 * MAD result, tmp0, tmp1, temp
1726 temp
[0] = get_temp_reg(fp
);
1727 emit_arith(fp
, PFS_OP_MAD
, temp
[0], mask
,
1728 negate(keep(src
[0])), keep(src
[2]), src
[2],
1730 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1731 src
[0], src
[1], temp
[0], flags
);
1732 free_temp(fp
, temp
[0]);
1735 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1736 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1737 src
[2] = t_src(fp
, fpi
->SrcReg
[2]);
1738 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1739 src
[0], src
[1], src
[2], flags
);
1742 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1743 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1744 emit_arith(fp
, PFS_OP_MAX
, dest
, mask
,
1745 src
[0], src
[1], undef
, flags
);
1748 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1749 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1750 emit_arith(fp
, PFS_OP_MIN
, dest
, mask
,
1751 src
[0], src
[1], undef
, flags
);
1755 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1756 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1757 src
[0], pfs_one
, pfs_zero
, flags
);
1760 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1761 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1762 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1763 src
[0], src
[1], pfs_zero
, flags
);
1766 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1767 src
[1] = t_scalar_src(fp
, fpi
->SrcReg
[1]);
1768 temp
[0] = get_temp_reg(fp
);
1769 emit_arith(fp
, PFS_OP_LG2
, temp
[0], WRITEMASK_W
,
1770 src
[0], undef
, undef
, 0);
1771 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_W
,
1772 temp
[0], src
[1], pfs_zero
, 0);
1773 emit_arith(fp
, PFS_OP_EX2
, dest
, fpi
->DstReg
.WriteMask
,
1774 temp
[0], undef
, undef
, 0);
1775 free_temp(fp
, temp
[0]);
1778 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1779 emit_arith(fp
, PFS_OP_RCP
, dest
, mask
,
1780 src
[0], undef
, undef
, flags
);
1783 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1784 emit_arith(fp
, PFS_OP_RSQ
, dest
, mask
,
1785 absolute(src
[0]), pfs_zero
, pfs_zero
, flags
);
1789 * scs using a parabola :
1791 * result.x = sin(-abs(x)+0.5*PI) (cos)
1792 * result.y = sin(x) (sin)
1795 temp
[0] = get_temp_reg(fp
);
1796 temp
[1] = get_temp_reg(fp
);
1797 const_sin
[0] = emit_const4fv(fp
, SinCosConsts
[0]);
1798 const_sin
[1] = emit_const4fv(fp
, SinCosConsts
[1]);
1799 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1801 /* x = -abs(x)+0.5*PI */
1802 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Z
, swizzle(const_sin
[0], Z
, Z
, Z
, Z
), //PI
1805 (swizzle(keep(src
[0]), X
, X
, X
, X
))),
1809 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_W
,
1810 swizzle(const_sin
[0], Y
, Y
, Y
, Y
),
1811 swizzle(keep(src
[0]), X
, X
, X
, X
),
1814 /* B*x, C*x (cos) */
1815 emit_arith(fp
, PFS_OP_MAD
, temp
[0],
1816 WRITEMASK_X
| WRITEMASK_Y
, swizzle(temp
[0],
1819 const_sin
[0], pfs_zero
, 0);
1822 emit_arith(fp
, PFS_OP_MAD
, temp
[1], WRITEMASK_W
,
1823 swizzle(const_sin
[0], X
, X
, X
, X
),
1824 keep(src
[0]), pfs_zero
, 0);
1826 /* y = B*x + C*x*abs(x) (sin) */
1827 emit_arith(fp
, PFS_OP_MAD
, temp
[1], WRITEMASK_Z
,
1829 swizzle(temp
[0], W
, W
, W
, W
),
1830 swizzle(temp
[1], W
, W
, W
, W
), 0);
1832 /* y = B*x + C*x*abs(x) (cos) */
1833 emit_arith(fp
, PFS_OP_MAD
, temp
[1], WRITEMASK_W
,
1834 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1835 absolute(swizzle(temp
[0], Z
, Z
, Z
, Z
)),
1836 swizzle(temp
[0], X
, X
, X
, X
), 0);
1838 /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1839 emit_arith(fp
, PFS_OP_MAD
, temp
[0],
1840 WRITEMASK_X
| WRITEMASK_Y
, swizzle(temp
[1],
1843 absolute(swizzle(temp
[1], W
, Z
, Y
, X
)),
1844 negate(swizzle(temp
[1], W
, Z
, Y
, X
)), 0);
1846 /* dest.xy = mad(temp.xy, P, temp2.wz) */
1847 emit_arith(fp
, PFS_OP_MAD
, dest
,
1848 mask
& (WRITEMASK_X
| WRITEMASK_Y
), temp
[0],
1849 swizzle(const_sin
[0], W
, W
, W
, W
),
1850 swizzle(temp
[1], W
, Z
, Y
, X
), flags
);
1852 free_temp(fp
, temp
[0]);
1853 free_temp(fp
, temp
[1]);
1856 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1857 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1858 temp
[0] = get_temp_reg(fp
);
1859 /* temp = src0 - src1
1860 * dest.c = (temp.c < 0.0) ? 0 : 1
1862 emit_arith(fp
, PFS_OP_MAD
, temp
[0], mask
,
1863 src
[0], pfs_one
, negate(src
[1]), 0);
1864 emit_arith(fp
, PFS_OP_CMP
, dest
, mask
,
1865 pfs_one
, pfs_zero
, temp
[0], 0);
1866 free_temp(fp
, temp
[0]);
1871 * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1872 * extra precision is obtained by weighting against
1876 temp
[0] = get_temp_reg(fp
);
1877 const_sin
[0] = emit_const4fv(fp
, SinCosConsts
[0]);
1878 const_sin
[1] = emit_const4fv(fp
, SinCosConsts
[1]);
1879 src
[0] = t_scalar_src(fp
, fpi
->SrcReg
[0]);
1881 /* do range reduction */
1883 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1884 swizzle(keep(src
[0]), X
, X
, X
, X
),
1885 swizzle(const_sin
[1], Z
, Z
, Z
, Z
),
1888 emit_arith(fp
, PFS_OP_FRC
, temp
[0], WRITEMASK_X
,
1889 swizzle(temp
[0], X
, X
, X
, X
),
1892 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Z
, swizzle(temp
[0], X
, X
, X
, X
), swizzle(const_sin
[1], W
, W
, W
, W
), //2*PI
1893 negate(swizzle(const_sin
[0], Z
, Z
, Z
, Z
)), //PI
1898 emit_arith(fp
, PFS_OP_MAD
, temp
[0],
1899 WRITEMASK_X
| WRITEMASK_Y
, swizzle(temp
[0],
1902 const_sin
[0], pfs_zero
, 0);
1904 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1905 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1906 absolute(swizzle(temp
[0], Z
, Z
, Z
, Z
)),
1907 swizzle(temp
[0], X
, X
, X
, X
), 0);
1909 emit_arith(fp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Y
,
1910 swizzle(temp
[0], X
, X
, X
, X
),
1911 absolute(swizzle(temp
[0], X
, X
, X
, X
)),
1912 negate(swizzle(temp
[0], X
, X
, X
, X
)), 0);
1914 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1915 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1916 swizzle(const_sin
[0], W
, W
, W
, W
),
1917 swizzle(temp
[0], X
, X
, X
, X
), flags
);
1919 free_temp(fp
, temp
[0]);
1922 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1923 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1924 temp
[0] = get_temp_reg(fp
);
1925 /* temp = src0 - src1
1926 * dest.c = (temp.c < 0.0) ? 1 : 0
1928 emit_arith(fp
, PFS_OP_MAD
, temp
[0], mask
,
1929 src
[0], pfs_one
, negate(src
[1]), 0);
1930 emit_arith(fp
, PFS_OP_CMP
, dest
, mask
,
1931 pfs_zero
, pfs_one
, temp
[0], 0);
1932 free_temp(fp
, temp
[0]);
1935 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1936 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1937 emit_arith(fp
, PFS_OP_MAD
, dest
, mask
,
1938 src
[0], pfs_one
, negate(src
[1]), flags
);
1941 emit_tex(fp
, fpi
, R300_FPITX_OP_TEX
);
1944 emit_tex(fp
, fpi
, R300_FPITX_OP_TXB
);
1947 emit_tex(fp
, fpi
, R300_FPITX_OP_TXP
);
1950 src
[0] = t_src(fp
, fpi
->SrcReg
[0]);
1951 src
[1] = t_src(fp
, fpi
->SrcReg
[1]);
1952 temp
[0] = get_temp_reg(fp
);
1953 /* temp = src0.zxy * src1.yzx */
1954 emit_arith(fp
, PFS_OP_MAD
, temp
[0],
1955 WRITEMASK_XYZ
, swizzle(keep(src
[0]),
1957 swizzle(keep(src
[1]), Y
, Z
, X
, W
),
1959 /* dest.xyz = src0.yzx * src1.zxy - temp
1960 * dest.w = undefined
1962 emit_arith(fp
, PFS_OP_MAD
, dest
,
1963 mask
& WRITEMASK_XYZ
, swizzle(src
[0],
1966 swizzle(src
[1], Z
, X
, Y
, W
),
1967 negate(temp
[0]), flags
);
1969 free_temp(fp
, temp
[0]);
1973 ERROR("unknown fpi->Opcode %d\n", fpi
->Opcode
);
/**
 * insert_wpos - prepend instructions that compute a window-space position
 * into a fresh temporary, and make the program read that temporary instead
 * of the FRAG_ATTRIB_WPOS input.
 *
 * A new temporary index is taken from prog->NumTemporaries (which is then
 * incremented). An instruction array with room for NumInstructions + 3
 * entries is allocated and its leading entries are filled in as:
 *
 *   RCP  temp.w   = 1 / WPOS.w                   (perspective divide)
 *   MUL  temp.xyz = WPOS.xyzw * temp.wwww
 *   MAD  temp.xyz = temp.xyz0 * win.xyz0 + win.xyz0   (viewport transform)
 *
 * where `win` is the state parameter registered for
 * { STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION }. The original instructions
 * are copied in after these, the old array is freed, and every source operand
 * from instruction index 3 onward that reads PROGRAM_INPUT / FRAG_ATTRIB_WPOS
 * is redirected to the new temporary.
 *
 * NOTE(review): the statements that declare and advance `i` are not visible
 * in this listing.
 * NOTE(review): the temporary is claimed without checking a limit, as the
 * existing remark below already admits.
 * NOTE(review): the old array is released with plain free() while the new one
 * comes from _mesa_alloc_instructions(); confirm that the allocators pair up.
 */
1985 static void insert_wpos(struct gl_program
*prog
)
1987 static gl_state_index tokens
[STATE_LENGTH
] = {
1988 STATE_INTERNAL
, STATE_R300_WINDOW_DIMENSION
, 0, 0, 0
1990 struct prog_instruction
*fpi
;
1991 GLuint window_index
;
1993 GLuint tempregi
= prog
->NumTemporaries
;
1994 /* should do something else if no temps left... */
1995 prog
->NumTemporaries
++;
1997 fpi
= _mesa_alloc_instructions(prog
->NumInstructions
+ 3);
1998 _mesa_init_instructions(fpi
, prog
->NumInstructions
+ 3);
2000 /* perspective divide */
2001 fpi
[i
].Opcode
= OPCODE_RCP
;
2003 fpi
[i
].DstReg
.File
= PROGRAM_TEMPORARY
;
2004 fpi
[i
].DstReg
.Index
= tempregi
;
2005 fpi
[i
].DstReg
.WriteMask
= WRITEMASK_W
;
2006 fpi
[i
].DstReg
.CondMask
= COND_TR
;
2008 fpi
[i
].SrcReg
[0].File
= PROGRAM_INPUT
;
2009 fpi
[i
].SrcReg
[0].Index
= FRAG_ATTRIB_WPOS
;
2010 fpi
[i
].SrcReg
[0].Swizzle
= SWIZZLE_WWWW
;
2013 fpi
[i
].Opcode
= OPCODE_MUL
;
2015 fpi
[i
].DstReg
.File
= PROGRAM_TEMPORARY
;
2016 fpi
[i
].DstReg
.Index
= tempregi
;
2017 fpi
[i
].DstReg
.WriteMask
= WRITEMASK_XYZ
;
2018 fpi
[i
].DstReg
.CondMask
= COND_TR
;
2020 fpi
[i
].SrcReg
[0].File
= PROGRAM_INPUT
;
2021 fpi
[i
].SrcReg
[0].Index
= FRAG_ATTRIB_WPOS
;
2022 fpi
[i
].SrcReg
[0].Swizzle
= SWIZZLE_XYZW
;
2024 fpi
[i
].SrcReg
[1].File
= PROGRAM_TEMPORARY
;
2025 fpi
[i
].SrcReg
[1].Index
= tempregi
;
2026 fpi
[i
].SrcReg
[1].Swizzle
= SWIZZLE_WWWW
;
2029 /* viewport transformation */
2030 window_index
= _mesa_add_state_reference(prog
->Parameters
, tokens
);
2032 fpi
[i
].Opcode
= OPCODE_MAD
;
2034 fpi
[i
].DstReg
.File
= PROGRAM_TEMPORARY
;
2035 fpi
[i
].DstReg
.Index
= tempregi
;
2036 fpi
[i
].DstReg
.WriteMask
= WRITEMASK_XYZ
;
2037 fpi
[i
].DstReg
.CondMask
= COND_TR
;
2039 fpi
[i
].SrcReg
[0].File
= PROGRAM_TEMPORARY
;
2040 fpi
[i
].SrcReg
[0].Index
= tempregi
;
2041 fpi
[i
].SrcReg
[0].Swizzle
=
2042 MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ZERO
);
2044 fpi
[i
].SrcReg
[1].File
= PROGRAM_STATE_VAR
;
2045 fpi
[i
].SrcReg
[1].Index
= window_index
;
2046 fpi
[i
].SrcReg
[1].Swizzle
=
2047 MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ZERO
);
2049 fpi
[i
].SrcReg
[2].File
= PROGRAM_STATE_VAR
;
2050 fpi
[i
].SrcReg
[2].Index
= window_index
;
2051 fpi
[i
].SrcReg
[2].Swizzle
=
2052 MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ZERO
);
2055 _mesa_copy_instructions(&fpi
[i
], prog
->Instructions
,
2056 prog
->NumInstructions
);
2058 free(prog
->Instructions
);
2060 prog
->Instructions
= fpi
;
2062 prog
->NumInstructions
+= i
;
2063 fpi
= &prog
->Instructions
[prog
->NumInstructions
- 1];
2065 assert(fpi
->Opcode
== OPCODE_END
);
2067 for (fpi
= &prog
->Instructions
[3]; fpi
->Opcode
!= OPCODE_END
; fpi
++) {
2068 for (i
= 0; i
< 3; i
++)
2069 if (fpi
->SrcReg
[i
].File
== PROGRAM_INPUT
&&
2070 fpi
->SrcReg
[i
].Index
== FRAG_ATTRIB_WPOS
) {
2071 fpi
->SrcReg
[i
].File
= PROGRAM_TEMPORARY
;
2072 fpi
->SrcReg
[i
].Index
= tempregi
;
/**
 * init_program - reset the per-compile state of `fp` and assign hardware
 * registers to the fragment inputs the program reads.
 *
 * Visible steps:
 *  - the "fp_optimization" driver option is queried;
 *  - fp->translated and fp->error are cleared, fp->cs is pointed at the
 *    context's pfs_compile state, which is zeroed, and every slot's vsrc and
 *    ssrc entries are set to SRC_CONST;
 *  - inputs are handled in a fixed order - texcoords, then WPOS (which also
 *    calls insert_wpos() on the Mesa program), then COL0, then COL1 - with
 *    the refcount reset to 0 and, for WPOS/COL0/COL1, the register taken from
 *    get_hw_temp(fp, 0); the right-hand side of the texcoord register
 *    assignment is not visible here;
 *  - remaining input bits trigger a WARN_ONCE and are mapped to hwreg 0;
 *  - a pre-pass over all instructions up to OPCODE_END counts references to
 *    temporaries (first sighting sets reg = -1, refcount = 1) and to inputs;
 *  - cs->temp_in_use is set to the mask of temporaries seen.
 *
 * NOTE(review): declarations of i, j and idx and several else/break lines are
 * missing from this listing.
 * NOTE(review): temps_used is a GLuint mask tested with `1 << idx`; this
 * assumes temporary indices stay below 32 - confirm.
 */
2077 /* - Init structures
2078 * - Determine what hwregs each input corresponds to
2080 static void init_program(r300ContextPtr r300
, struct r300_fragment_program
*fp
)
2082 struct r300_pfs_compile_state
*cs
= NULL
;
2083 struct gl_fragment_program
*mp
= &fp
->mesa_program
;
2084 struct prog_instruction
*fpi
;
2085 GLuint InputsRead
= mp
->Base
.InputsRead
;
2086 GLuint temps_used
= 0; /* for fp->temps[] */
2089 /* New compile, reset tracking data */
2091 driQueryOptioni(&r300
->radeon
.optionCache
, "fp_optimization");
2092 fp
->translated
= GL_FALSE
;
2093 fp
->error
= GL_FALSE
;
2094 fp
->cs
= cs
= &(R300_CONTEXT(fp
->ctx
)->state
.pfs_compile
);
2097 fp
->first_node_has_tex
= 0;
2099 fp
->max_temp_idx
= 0;
2100 fp
->node
[0].alu_end
= -1;
2101 fp
->node
[0].tex_end
= -1;
2103 _mesa_memset(cs
, 0, sizeof(*fp
->cs
));
2104 for (i
= 0; i
< PFS_MAX_ALU_INST
; i
++) {
2105 for (j
= 0; j
< 3; j
++) {
2106 cs
->slot
[i
].vsrc
[j
] = SRC_CONST
;
2107 cs
->slot
[i
].ssrc
[j
] = SRC_CONST
;
2111 /* Work out what temps the Mesa inputs correspond to, this must match
2112 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
2113 * configures itself based on the fragprog's InputsRead
2115 * NOTE: this depends on get_hw_temp() allocating registers in order,
2116 * starting from register 0.
2119 /* Texcoords come first */
2120 for (i
= 0; i
< fp
->ctx
->Const
.MaxTextureUnits
; i
++) {
2121 if (InputsRead
& (FRAG_BIT_TEX0
<< i
)) {
2122 cs
->inputs
[FRAG_ATTRIB_TEX0
+ i
].refcount
= 0;
2123 cs
->inputs
[FRAG_ATTRIB_TEX0
+ i
].reg
=
2127 InputsRead
&= ~FRAG_BITS_TEX_ANY
;
2129 /* fragment position treated as a texcoord */
2130 if (InputsRead
& FRAG_BIT_WPOS
) {
2131 cs
->inputs
[FRAG_ATTRIB_WPOS
].refcount
= 0;
2132 cs
->inputs
[FRAG_ATTRIB_WPOS
].reg
= get_hw_temp(fp
, 0);
2133 insert_wpos(&mp
->Base
);
2135 InputsRead
&= ~FRAG_BIT_WPOS
;
2137 /* Then primary colour */
2138 if (InputsRead
& FRAG_BIT_COL0
) {
2139 cs
->inputs
[FRAG_ATTRIB_COL0
].refcount
= 0;
2140 cs
->inputs
[FRAG_ATTRIB_COL0
].reg
= get_hw_temp(fp
, 0);
2142 InputsRead
&= ~FRAG_BIT_COL0
;
2144 /* Secondary color */
2145 if (InputsRead
& FRAG_BIT_COL1
) {
2146 cs
->inputs
[FRAG_ATTRIB_COL1
].refcount
= 0;
2147 cs
->inputs
[FRAG_ATTRIB_COL1
].reg
= get_hw_temp(fp
, 0);
2149 InputsRead
&= ~FRAG_BIT_COL1
;
2153 WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead
);
2154 /* force read from hwreg 0 for now */
2155 for (i
= 0; i
< 32; i
++)
2156 if (InputsRead
& (1 << i
))
2157 cs
->inputs
[i
].reg
= 0;
2160 /* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
2161 * That way, we can free up the reg when it's no longer needed
2163 if (!mp
->Base
.Instructions
) {
2164 ERROR("No instructions found in program\n");
2168 for (fpi
= mp
->Base
.Instructions
; fpi
->Opcode
!= OPCODE_END
; fpi
++) {
2171 for (i
= 0; i
< 3; i
++) {
2172 idx
= fpi
->SrcReg
[i
].Index
;
2173 switch (fpi
->SrcReg
[i
].File
) {
2174 case PROGRAM_TEMPORARY
:
2175 if (!(temps_used
& (1 << idx
))) {
2176 cs
->temps
[idx
].reg
= -1;
2177 cs
->temps
[idx
].refcount
= 1;
2178 temps_used
|= (1 << idx
);
2180 cs
->temps
[idx
].refcount
++;
2183 cs
->inputs
[idx
].refcount
++;
2190 idx
= fpi
->DstReg
.Index
;
2191 if (fpi
->DstReg
.File
== PROGRAM_TEMPORARY
) {
2192 if (!(temps_used
& (1 << idx
))) {
2193 cs
->temps
[idx
].reg
= -1;
2194 cs
->temps
[idx
].refcount
= 1;
2195 temps_used
|= (1 << idx
);
2197 cs
->temps
[idx
].refcount
++;
2200 cs
->temp_in_use
= temps_used
;
2203 static void update_params(struct r300_fragment_program
*fp
)
2205 struct gl_fragment_program
*mp
= &fp
->mesa_program
;
2207 /* Ask Mesa nicely to fill in ParameterValues for us */
2208 if (mp
->Base
.Parameters
)
2209 _mesa_load_state_parameters(fp
->ctx
, mp
->Base
.Parameters
);
/**
 * r300TranslateFragmentShader - compile `fp` into its hardware form unless
 * it has already been translated.
 *
 * When fp->translated is false the visible code:
 *  - calls init_program() and then parse_program(), comparing the latter's
 *    result with GL_FALSE (the failure path itself is not visible);
 *  - closes the current node: alu_end becomes
 *    cs->nrslots - alu_offset - 1, and a negative tex_end is raised to 0;
 *  - sets fp->alu_end to cs->nrslots - 1 and fp->tex_end to
 *    tex.length - 1, or 0 when there are no texture instructions;
 *  - asserts that both ALU end markers are non-negative;
 *  - marks the program as translated, tests RADEON_DEBUG & DEBUG_PIXEL (the
 *    guarded statement is not visible) and calls
 *    r300UpdateStateParameters(fp->ctx, _NEW_PROGRAM).
 *
 * NOTE(review): `cs` is initialised to NULL and no assignment to it is
 * visible in this listing, yet cs->nrslots is read - presumably it is set
 * from fp->cs after init_program(); confirm against the full file.
 */
2212 void r300TranslateFragmentShader(r300ContextPtr r300
,
2213 struct r300_fragment_program
*fp
)
2215 struct r300_pfs_compile_state
*cs
= NULL
;
2217 if (!fp
->translated
) {
2219 init_program(r300
, fp
);
2222 if (parse_program(fp
) == GL_FALSE
) {
2228 fp
->node
[fp
->cur_node
].alu_end
=
2229 cs
->nrslots
- fp
->node
[fp
->cur_node
].alu_offset
- 1;
2230 if (fp
->node
[fp
->cur_node
].tex_end
< 0)
2231 fp
->node
[fp
->cur_node
].tex_end
= 0;
2233 fp
->alu_end
= cs
->nrslots
- 1;
2235 fp
->tex_end
= fp
->tex
.length
? fp
->tex
.length
- 1 : 0;
2236 assert(fp
->node
[fp
->cur_node
].alu_end
>= 0);
2237 assert(fp
->alu_end
>= 0);
2239 fp
->translated
= GL_TRUE
;
2240 if (RADEON_DEBUG
& DEBUG_PIXEL
)
2242 r300UpdateStateParameters(fp
->ctx
, _NEW_PROGRAM
);
2248 /* just some random things... */
2249 static void dump_program(struct r300_fragment_program
*fp
)
2254 fprintf(stderr
, "pc=%d*************************************\n", pc
++);
2256 fprintf(stderr
, "Mesa program:\n");
2257 fprintf(stderr
, "-------------\n");
2258 _mesa_print_program(&fp
->mesa_program
.Base
);
2261 fprintf(stderr
, "Hardware program\n");
2262 fprintf(stderr
, "----------------\n");
2264 for (n
= 0; n
< (fp
->cur_node
+ 1); n
++) {
2265 fprintf(stderr
, "NODE %d: alu_offset: %d, tex_offset: %d, "
2266 "alu_end: %d, tex_end: %d\n", n
,
2267 fp
->node
[n
].alu_offset
,
2268 fp
->node
[n
].tex_offset
,
2269 fp
->node
[n
].alu_end
, fp
->node
[n
].tex_end
);
2271 if (fp
->tex
.length
) {
2272 fprintf(stderr
, " TEX:\n");
2273 for (i
= fp
->node
[n
].tex_offset
;
2274 i
<= fp
->node
[n
].tex_offset
+ fp
->node
[n
].tex_end
;
2279 inst
[i
] >> R300_FPITX_OPCODE_SHIFT
) &
2281 case R300_FPITX_OP_TEX
:
2284 case R300_FPITX_OP_KIL
:
2287 case R300_FPITX_OP_TXP
:
2290 case R300_FPITX_OP_TXB
:
2298 " %s t%i, %c%i, texture[%i] (%08x)\n",
2301 inst
[i
] >> R300_FPITX_DST_SHIFT
) & 31,
2303 inst
[i
] & R300_FPITX_SRC_CONST
) ? 'c' :
2306 inst
[i
] >> R300_FPITX_SRC_SHIFT
) & 31,
2308 inst
[i
] & R300_FPITX_IMAGE_MASK
) >>
2309 R300_FPITX_IMAGE_SHIFT
,
2314 for (i
= fp
->node
[n
].alu_offset
;
2315 i
<= fp
->node
[n
].alu_offset
+ fp
->node
[n
].alu_end
; ++i
) {
2316 char srcc
[3][10], dstc
[20];
2317 char srca
[3][10], dsta
[20];
2320 char flags
[5], tmp
[10];
2322 for (j
= 0; j
< 3; ++j
) {
2323 int regc
= fp
->alu
.inst
[i
].inst1
>> (j
* 6);
2324 int rega
= fp
->alu
.inst
[i
].inst3
>> (j
* 6);
2326 sprintf(srcc
[j
], "%c%i",
2327 (regc
& 32) ? 'c' : 't', regc
& 31);
2328 sprintf(srca
[j
], "%c%i",
2329 (rega
& 32) ? 'c' : 't', rega
& 31);
2333 sprintf(flags
, "%s%s%s",
2335 inst1
& R300_FPI1_DSTC_REG_X
) ? "x" : "",
2337 inst1
& R300_FPI1_DSTC_REG_Y
) ? "y" : "",
2339 inst1
& R300_FPI1_DSTC_REG_Z
) ? "z" : "");
2340 if (flags
[0] != 0) {
2341 sprintf(dstc
, "t%i.%s ",
2343 inst1
>> R300_FPI1_DSTC_SHIFT
) & 31,
2346 sprintf(flags
, "%s%s%s",
2348 inst1
& R300_FPI1_DSTC_OUTPUT_X
) ? "x" : "",
2350 inst1
& R300_FPI1_DSTC_OUTPUT_Y
) ? "y" : "",
2352 inst1
& R300_FPI1_DSTC_OUTPUT_Z
) ? "z" : "");
2353 if (flags
[0] != 0) {
2354 sprintf(tmp
, "o%i.%s",
2356 inst1
>> R300_FPI1_DSTC_SHIFT
) & 31,
2362 if (fp
->alu
.inst
[i
].inst3
& R300_FPI3_DSTA_REG
) {
2363 sprintf(dsta
, "t%i.w ",
2365 inst3
>> R300_FPI3_DSTA_SHIFT
) & 31);
2367 if (fp
->alu
.inst
[i
].inst3
& R300_FPI3_DSTA_OUTPUT
) {
2368 sprintf(tmp
, "o%i.w ",
2370 inst3
>> R300_FPI3_DSTA_SHIFT
) & 31);
2373 if (fp
->alu
.inst
[i
].inst3
& R300_FPI3_DSTA_DEPTH
) {
2378 "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
2379 " w: %3s %3s %3s -> %-20s (%08x)\n", i
,
2380 srcc
[0], srcc
[1], srcc
[2], dstc
,
2381 fp
->alu
.inst
[i
].inst1
, srca
[0], srca
[1],
2382 srca
[2], dsta
, fp
->alu
.inst
[i
].inst3
);
2384 for (j
= 0; j
< 3; ++j
) {
2385 int regc
= fp
->alu
.inst
[i
].inst0
>> (j
* 7);
2386 int rega
= fp
->alu
.inst
[i
].inst2
>> (j
* 7);
2393 case R300_FPI0_ARGC_SRC0C_XYZ
:
2394 sprintf(buf
, "%s.xyz",
2397 case R300_FPI0_ARGC_SRC0C_XXX
:
2398 sprintf(buf
, "%s.xxx",
2401 case R300_FPI0_ARGC_SRC0C_YYY
:
2402 sprintf(buf
, "%s.yyy",
2405 case R300_FPI0_ARGC_SRC0C_ZZZ
:
2406 sprintf(buf
, "%s.zzz",
2410 } else if (d
< 15) {
2411 sprintf(buf
, "%s.www", srca
[d
- 12]);
2412 } else if (d
== 20) {
2413 sprintf(buf
, "0.0");
2414 } else if (d
== 21) {
2415 sprintf(buf
, "1.0");
2416 } else if (d
== 22) {
2417 sprintf(buf
, "0.5");
2418 } else if (d
>= 23 && d
< 32) {
2422 sprintf(buf
, "%s.yzx",
2426 sprintf(buf
, "%s.zxy",
2430 sprintf(buf
, "%s.Wzy",
2435 sprintf(buf
, "%i", d
);
2438 sprintf(argc
[j
], "%s%s%s%s",
2439 (regc
& 32) ? "-" : "",
2440 (regc
& 64) ? "|" : "",
2441 buf
, (regc
& 64) ? "|" : "");
2445 sprintf(buf
, "%s.%c", srcc
[d
/ 3],
2446 'x' + (char)(d
% 3));
2447 } else if (d
< 12) {
2448 sprintf(buf
, "%s.w", srca
[d
- 9]);
2449 } else if (d
== 16) {
2450 sprintf(buf
, "0.0");
2451 } else if (d
== 17) {
2452 sprintf(buf
, "1.0");
2453 } else if (d
== 18) {
2454 sprintf(buf
, "0.5");
2456 sprintf(buf
, "%i", d
);
2459 sprintf(arga
[j
], "%s%s%s%s",
2460 (rega
& 32) ? "-" : "",
2461 (rega
& 64) ? "|" : "",
2462 buf
, (rega
& 64) ? "|" : "");
2465 fprintf(stderr
, " xyz: %8s %8s %8s op: %08x\n"
2466 " w: %8s %8s %8s op: %08x\n",
2467 argc
[0], argc
[1], argc
[2],
2468 fp
->alu
.inst
[i
].inst0
, arga
[0], arga
[1],
2469 arga
[2], fp
->alu
.inst
[i
].inst2
);