2 * Copyright (C) 2005 Ben Skeggs.
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 * Ben Skeggs <darktama@iinet.net.au>
31 * Jerome Glisse <j.glisse@gmail.com>
36 * - Depth write, WPOS/FOGC inputs
38 * - Verify results of opcodes for accuracy, I've only checked them
46 #include "shader/prog_instruction.h"
47 #include "shader/prog_parameter.h"
48 #include "shader/prog_print.h"
50 #include "r300_context.h"
51 #include "r300_fragprog.h"
53 #include "r300_state.h"
56 * Useful macros and values
58 #define ERROR(fmt, args...) do { \
59 fprintf(stderr, "%s::%s(): " fmt "\n", \
60 __FILE__, __func__, ##args); \
61 rp->error = GL_TRUE; \
64 #define PFS_INVAL 0xFFFFFFFF
65 #define COMPILE_STATE struct r300_pfs_compile_state *cs = rp->cs
77 #define SWIZZLE_HHH 10
79 #define swizzle(r, x, y, z, w) do_swizzle(rp, r, \
/*
 * Packed register descriptor used while translating a fragment program.
 *
 * A descriptor is a single unsigned word laid out as follows:
 *
 *   bits  0-1   register type (REG_TYPE_*)
 *   bits  2-7   register index
 *   bits  8-12  vector swizzle selector
 *   bits 13-17  scalar swizzle selector
 *   bit  18     negate vector part
 *   bit  19     negate scalar part
 *   bit  20     absolute value
 *   bit  21     "no use" flag (refcounting hack)
 *   bit  22     register contains a defined value
 *   bit  23     register is a builtin (like all zero/all one)
 *
 * Every macro parameter is parenthesised so that compound arguments such as
 * `a | b` are packed/unpacked as a unit instead of being split up by the
 * precedence of `<<` and `&`.  The REG_SET_* and REG_ABS/REG_NEG* macros
 * assign to their first argument, which must therefore be a modifiable
 * lvalue; it is evaluated more than once.
 */
#define REG_TYPE_INPUT		0
#define REG_TYPE_OUTPUT		1
#define REG_TYPE_TEMP		2
#define REG_TYPE_CONST		3

#define REG_TYPE_SHIFT		0
#define REG_INDEX_SHIFT		2
#define REG_VSWZ_SHIFT		8
#define REG_SSWZ_SHIFT		13
#define REG_NEGV_SHIFT		18
#define REG_NEGS_SHIFT		19
#define REG_ABS_SHIFT		20
#define REG_NO_USE_SHIFT	21 /* Hack for refcounting */
#define REG_VALID_SHIFT		22 /* Does the register contain a defined value? */
#define REG_BUILTIN_SHIFT	23 /* Is it a builtin (like all zero/all one)? */

#define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
#define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
#define REG_VSWZ_MASK		(0x1F << REG_VSWZ_SHIFT)
#define REG_SSWZ_MASK		(0x1F << REG_SSWZ_SHIFT)
#define REG_NEGV_MASK		(0x01 << REG_NEGV_SHIFT)
#define REG_NEGS_MASK		(0x01 << REG_NEGS_SHIFT)
#define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
#define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
#define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)
#define REG_BUILTIN_MASK	(0x01 << REG_BUILTIN_SHIFT)

/* Build a descriptor from its individual fields; out-of-range bits are masked off. */
#define REG(type, index, vswz, sswz, nouse, valid, builtin)		\
	((((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK) |			\
	 (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK) |		\
	 (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
	 (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK) |		\
	 (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |	\
	 (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |			\
	 (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))

/* Field extractors: yield the field value shifted down to bit 0. */
#define REG_GET_TYPE(reg)						\
	(((reg) & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
#define REG_GET_INDEX(reg)						\
	(((reg) & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
#define REG_GET_VSWZ(reg)						\
	(((reg) & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
#define REG_GET_SSWZ(reg)						\
	(((reg) & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
#define REG_GET_NO_USE(reg)						\
	(((reg) & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
#define REG_GET_VALID(reg)						\
	(((reg) & REG_VALID_MASK) >> REG_VALID_SHIFT)
#define REG_GET_BUILTIN(reg)						\
	(((reg) & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)

/* Field setters: replace one field of `reg` in place, leaving the others untouched. */
#define REG_SET_TYPE(reg, type)						\
	((reg) = (((reg) & ~REG_TYPE_MASK) |				\
		  (((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK)))
#define REG_SET_INDEX(reg, index)					\
	((reg) = (((reg) & ~REG_INDEX_MASK) |				\
		  (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK)))
#define REG_SET_VSWZ(reg, vswz)						\
	((reg) = (((reg) & ~REG_VSWZ_MASK) |				\
		  (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK)))
#define REG_SET_SSWZ(reg, sswz)						\
	((reg) = (((reg) & ~REG_SSWZ_MASK) |				\
		  (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK)))
#define REG_SET_NO_USE(reg, nouse)					\
	((reg) = (((reg) & ~REG_NO_USE_MASK) |				\
		  (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK)))
#define REG_SET_VALID(reg, valid)					\
	((reg) = (((reg) & ~REG_VALID_MASK) |				\
		  (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK)))
#define REG_SET_BUILTIN(reg, builtin)					\
	((reg) = (((reg) & ~REG_BUILTIN_MASK) |				\
		  (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK)))

/* Modifier flags: these only ever set the bit, they never clear it. */
#define REG_ABS(reg)							\
	((reg) = ((reg) | REG_ABS_MASK))
#define REG_NEGV(reg)							\
	((reg) = ((reg) | REG_NEGV_MASK))
#define REG_NEGS(reg)							\
	((reg) = ((reg) | REG_NEGS_MASK))
165 * Data structures for fragment program generation
168 /* description of r300 native hw instructions */
169 static const struct {
175 { "MAD", 3, R300_FPI0_OUTC_MAD
, R300_FPI2_OUTA_MAD
},
176 { "DP3", 2, R300_FPI0_OUTC_DP3
, R300_FPI2_OUTA_DP4
},
177 { "DP4", 2, R300_FPI0_OUTC_DP4
, R300_FPI2_OUTA_DP4
},
178 { "MIN", 2, R300_FPI0_OUTC_MIN
, R300_FPI2_OUTA_MIN
},
179 { "MAX", 2, R300_FPI0_OUTC_MAX
, R300_FPI2_OUTA_MAX
},
180 { "CMP", 3, R300_FPI0_OUTC_CMP
, R300_FPI2_OUTA_CMP
},
181 { "FRC", 1, R300_FPI0_OUTC_FRC
, R300_FPI2_OUTA_FRC
},
182 { "EX2", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_EX2
},
183 { "LG2", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_LG2
},
184 { "RCP", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_RCP
},
185 { "RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_RSQ
},
186 { "REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA
, PFS_INVAL
},
187 { "CMPH", 3, R300_FPI0_OUTC_CMPH
, PFS_INVAL
},
191 /* vector swizzles r300 can support natively, with a couple of
192 * cases we handle specially
194 * REG_VSWZ/REG_SSWZ is an index into this table
197 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
198 #define SWIZZLE_HALF 6
200 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
204 static const struct r300_pfs_swizzle
{
205 GLuint hash
; /* swizzle value this matches */
206 GLuint base
; /* base value for hw swizzle */
207 GLuint stride
; /* difference in base between arg0/1/2 */
210 /* native swizzles */
211 { MAKE_SWZ3(X
, Y
, Z
), R300_FPI0_ARGC_SRC0C_XYZ
, 4, SLOT_SRC_VECTOR
},
212 { MAKE_SWZ3(X
, X
, X
), R300_FPI0_ARGC_SRC0C_XXX
, 4, SLOT_SRC_VECTOR
},
213 { MAKE_SWZ3(Y
, Y
, Y
), R300_FPI0_ARGC_SRC0C_YYY
, 4, SLOT_SRC_VECTOR
},
214 { MAKE_SWZ3(Z
, Z
, Z
), R300_FPI0_ARGC_SRC0C_ZZZ
, 4, SLOT_SRC_VECTOR
},
215 { MAKE_SWZ3(W
, W
, W
), R300_FPI0_ARGC_SRC0A
, 1, SLOT_SRC_SCALAR
},
216 { MAKE_SWZ3(Y
, Z
, X
), R300_FPI0_ARGC_SRC0C_YZX
, 1, SLOT_SRC_VECTOR
},
217 { MAKE_SWZ3(Z
, X
, Y
), R300_FPI0_ARGC_SRC0C_ZXY
, 1, SLOT_SRC_VECTOR
},
218 { MAKE_SWZ3(W
, Z
, Y
), R300_FPI0_ARGC_SRC0CA_WZY
, 1, SLOT_SRC_BOTH
},
219 { MAKE_SWZ3(ONE
, ONE
, ONE
), R300_FPI0_ARGC_ONE
, 0, 0},
220 { MAKE_SWZ3(ZERO
, ZERO
, ZERO
), R300_FPI0_ARGC_ZERO
, 0, 0},
221 { MAKE_SWZ3(HALF
, HALF
, HALF
), R300_FPI0_ARGC_HALF
, 0, 0},
222 { PFS_INVAL
, 0, 0, 0},
225 /* used during matching of non-native swizzles */
226 #define SWZ_X_MASK (7 << 0)
227 #define SWZ_Y_MASK (7 << 3)
228 #define SWZ_Z_MASK (7 << 6)
229 #define SWZ_W_MASK (7 << 9)
230 static const struct {
231 GLuint hash
; /* used to mask matching swizzle components */
232 int mask
; /* actual outmask */
233 int count
; /* count of components matched */
235 { SWZ_X_MASK
|SWZ_Y_MASK
|SWZ_Z_MASK
, 1|2|4, 3},
236 { SWZ_X_MASK
|SWZ_Y_MASK
, 1|2, 2},
237 { SWZ_X_MASK
|SWZ_Z_MASK
, 1|4, 2},
238 { SWZ_Y_MASK
|SWZ_Z_MASK
, 2|4, 2},
242 { PFS_INVAL
, PFS_INVAL
, PFS_INVAL
}
245 static const struct {
246 int base
; /* hw value of swizzle */
247 int stride
; /* difference between SRC0/1/2 */
250 { R300_FPI2_ARGA_SRC0C_X
, 3, SLOT_SRC_VECTOR
},
251 { R300_FPI2_ARGA_SRC0C_Y
, 3, SLOT_SRC_VECTOR
},
252 { R300_FPI2_ARGA_SRC0C_Z
, 3, SLOT_SRC_VECTOR
},
253 { R300_FPI2_ARGA_SRC0A
, 1, SLOT_SRC_SCALAR
},
254 { R300_FPI2_ARGA_ZERO
, 0, 0 },
255 { R300_FPI2_ARGA_ONE
, 0, 0 },
256 { R300_FPI2_ARGA_HALF
, 0, 0 }
259 /* boiler-plate reg, for convenience */
260 static const GLuint undef
= REG(REG_TYPE_TEMP
,
268 /* constant one source */
269 static const GLuint pfs_one
= REG(REG_TYPE_CONST
,
277 /* constant half source */
278 static const GLuint pfs_half
= REG(REG_TYPE_CONST
,
286 /* constant zero source */
287 static const GLuint pfs_zero
= REG(REG_TYPE_CONST
,
296 * Common function prototypes
298 static void dump_program(struct r300_fragment_program
*rp
);
299 static void emit_arith(struct r300_fragment_program
*rp
, int op
,
300 GLuint dest
, int mask
,
301 GLuint src0
, GLuint src1
, GLuint src2
,
305 * Get an R300 temporary that can be written to in the given slot.
307 static int get_hw_temp(struct r300_fragment_program
*rp
, int slot
)
312 for(r
= 0; r
< PFS_NUM_TEMP_REGS
; ++r
) {
313 if (cs
->hwtemps
[r
].free
>= 0 && cs
->hwtemps
[r
].free
<= slot
)
317 if (r
>= PFS_NUM_TEMP_REGS
) {
318 ERROR("Out of hardware temps\n");
322 // Reserved is used to avoid the following scenario:
323 // R300 temporary X is first assigned to Mesa temporary Y during vector ops
324 // R300 temporary X is then assigned to Mesa temporary Z for further vector ops
325 // Then scalar ops on Mesa temporary Z are emitted and move back in time
326 // to overwrite the value of temporary Y.
328 cs
->hwtemps
[r
].reserved
= cs
->hwtemps
[r
].free
;
329 cs
->hwtemps
[r
].free
= -1;
331 // Reset to some value that won't mess things up when the user
332 // tries to read from a temporary that hasn't been assigned a value yet.
333 // In the normal case, vector_valid and scalar_valid should be set to
334 // a sane value by the first emit that writes to this temporary.
335 cs
->hwtemps
[r
].vector_valid
= 0;
336 cs
->hwtemps
[r
].scalar_valid
= 0;
338 if (r
> rp
->max_temp_idx
)
339 rp
->max_temp_idx
= r
;
345 * Get an R300 temporary that will act as a TEX destination register.
347 static int get_hw_temp_tex(struct r300_fragment_program
*rp
)
352 for(r
= 0; r
< PFS_NUM_TEMP_REGS
; ++r
) {
353 if (cs
->used_in_node
& (1 << r
))
356 // Note: Be very careful here
357 if (cs
->hwtemps
[r
].free
>= 0 && cs
->hwtemps
[r
].free
<= 0)
361 if (r
>= PFS_NUM_TEMP_REGS
)
362 return get_hw_temp(rp
, 0); /* Will cause an indirection */
364 cs
->hwtemps
[r
].reserved
= cs
->hwtemps
[r
].free
;
365 cs
->hwtemps
[r
].free
= -1;
367 // Reset to some value that won't mess things up when the user
368 // tries to read from a temporary that hasn't been assigned a value yet.
369 // In the normal case, vector_valid and scalar_valid should be set to
370 // a sane value by the first emit that writes to this temporary.
371 cs
->hwtemps
[r
].vector_valid
= cs
->nrslots
;
372 cs
->hwtemps
[r
].scalar_valid
= cs
->nrslots
;
374 if (r
> rp
->max_temp_idx
)
375 rp
->max_temp_idx
= r
;
381 * Mark the given hardware register as free.
/*
 * Release hardware temporary `idx` of the program `rp`.
 *
 * The only visible statement records, in cs->hwtemps[idx].free, the slot
 * number cs->nrslots + 1.  get_hw_temp() treats a temporary as available
 * for a requested slot when its .free value is >= 0 and <= that slot, so
 * this value is the first slot from which the register may be handed out
 * again.
 *
 * NOTE(review): `cs` is presumably introduced by COMPILE_STATE on a line
 * not visible in this extract -- confirm against the full file.
 */
383 static void free_hw_temp(struct r300_fragment_program
*rp
, int idx
)
387 // Be very careful here. Consider sequences like
390 // The TEX instruction may be moved in front of the MAD instruction
391 // due to the way nodes work. We don't want to alias r1 and r4 in
393 // I'm certain the register allocation could be further sanitized,
394 // but it's tricky because of stuff that can happen inside emit_tex
/* Reusable only from the slot after the current end of the ALU slot list. */
396 cs
->hwtemps
[idx
].free
= cs
->nrslots
+1;
401 * Create a new Mesa temporary register.
403 static GLuint
get_temp_reg(struct r300_fragment_program
*rp
)
409 index
= ffs(~cs
->temp_in_use
);
411 ERROR("Out of program temps\n");
415 cs
->temp_in_use
|= (1 << --index
);
416 cs
->temps
[index
].refcount
= 0xFFFFFFFF;
417 cs
->temps
[index
].reg
= -1;
419 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
420 REG_SET_INDEX(r
, index
);
421 REG_SET_VALID(r
, GL_TRUE
);
426 * Create a new Mesa temporary register that will act as the destination
427 * register for a texture read.
429 static GLuint
get_temp_reg_tex(struct r300_fragment_program
*rp
)
435 index
= ffs(~cs
->temp_in_use
);
437 ERROR("Out of program temps\n");
441 cs
->temp_in_use
|= (1 << --index
);
442 cs
->temps
[index
].refcount
= 0xFFFFFFFF;
443 cs
->temps
[index
].reg
= get_hw_temp_tex(rp
);
445 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
446 REG_SET_INDEX(r
, index
);
447 REG_SET_VALID(r
, GL_TRUE
);
452 * Free a Mesa temporary and the associated R300 temporary.
454 static void free_temp(struct r300_fragment_program
*rp
, GLuint r
)
457 GLuint index
= REG_GET_INDEX(r
);
459 if (!(cs
->temp_in_use
& (1 << index
)))
462 if (REG_GET_TYPE(r
) == REG_TYPE_TEMP
) {
463 free_hw_temp(rp
, cs
->temps
[index
].reg
);
464 cs
->temps
[index
].reg
= -1;
465 cs
->temp_in_use
&= ~(1 << index
);
466 } else if (REG_GET_TYPE(r
) == REG_TYPE_INPUT
) {
467 free_hw_temp(rp
, cs
->inputs
[index
].reg
);
468 cs
->inputs
[index
].reg
= -1;
473 * Emit a hardware constant/parameter.
475 * \p cp Stable pointer to an array of 4 floats.
476 * The pointer must be stable in the sense that it remains valid
477 * and holds the contents of the constant/parameter throughout the lifetime
478 * of the fragment program (actually, up until the next time the fragment
479 * program is translated).
481 static GLuint
emit_const4fv(struct r300_fragment_program
*rp
, const GLfloat
* cp
)
486 for(index
= 0; index
< rp
->const_nr
; ++index
) {
487 if (rp
->constant
[index
] == cp
)
491 if (index
>= rp
->const_nr
) {
492 if (index
>= PFS_NUM_CONST_REGS
) {
493 ERROR("Out of hw constants!\n");
498 rp
->constant
[index
] = cp
;
501 REG_SET_TYPE(reg
, REG_TYPE_CONST
);
502 REG_SET_INDEX(reg
, index
);
503 REG_SET_VALID(reg
, GL_TRUE
);
507 static inline GLuint
negate(GLuint r
)
514 /* Hack, to prevent clobbering sources used multiple times when
515 * emulating non-native instructions
517 static inline GLuint
keep(GLuint r
)
519 REG_SET_NO_USE(r
, GL_TRUE
);
523 static inline GLuint
absolute(GLuint r
)
529 static int swz_native(struct r300_fragment_program
*rp
,
534 /* Native swizzle, handle negation */
535 src
= (src
& ~REG_NEGS_MASK
) |
536 (((arbneg
>> 3) & 1) << REG_NEGS_SHIFT
);
538 if ((arbneg
& 0x7) == 0x0) {
539 src
= src
& ~REG_NEGV_MASK
;
541 } else if ((arbneg
& 0x7) == 0x7) {
542 src
|= REG_NEGV_MASK
;
545 if (!REG_GET_VALID(*r
))
546 *r
= get_temp_reg(rp
);
547 src
|= REG_NEGV_MASK
;
556 src
= src
& ~REG_NEGV_MASK
;
560 (arbneg
^ 0x7) | WRITEMASK_W
,
570 static int swz_emit_partial(struct r300_fragment_program
*rp
,
580 if (!REG_GET_VALID(*r
))
581 *r
= get_temp_reg(rp
);
583 /* A partial match, VSWZ/mask define what parts of the
584 * desired swizzle we match
586 if (mc
+ s_mask
[mask
].count
== 3) {
588 src
|= ((arbneg
>> 3) & 1) << REG_NEGS_SHIFT
;
591 tmp
= arbneg
& s_mask
[mask
].mask
;
593 tmp
= tmp
^ s_mask
[mask
].mask
;
598 arbneg
& s_mask
[mask
].mask
,
599 keep(src
) | REG_NEGV_MASK
,
604 REG_SET_NO_USE(src
, GL_TRUE
);
606 REG_SET_NO_USE(src
, GL_FALSE
);
618 REG_SET_NO_USE(src
, GL_TRUE
);
620 REG_SET_NO_USE(src
, GL_FALSE
);
625 (arbneg
& s_mask
[mask
].mask
) | wmask
,
633 REG_SET_NO_USE(src
, GL_TRUE
);
635 REG_SET_NO_USE(src
, GL_FALSE
);
637 emit_arith(rp
, PFS_OP_MAD
,
639 s_mask
[mask
].mask
| wmask
,
646 return s_mask
[mask
].count
;
649 static GLuint
do_swizzle(struct r300_fragment_program
*rp
,
659 /* If swizzling from something without an XYZW native swizzle,
660 * emit result to a temp, and do new swizzle from the temp.
663 if (REG_GET_VSWZ(src
) != SWIZZLE_XYZ
||
664 REG_GET_SSWZ(src
) != SWIZZLE_W
) {
665 GLuint temp
= get_temp_reg(rp
);
678 if (REG_GET_VSWZ(src
) != SWIZZLE_XYZ
||
679 REG_GET_SSWZ(src
) != SWIZZLE_W
) {
680 GLuint vsrcswz
= (v_swiz
[REG_GET_VSWZ(src
)].hash
& (SWZ_X_MASK
|SWZ_Y_MASK
|SWZ_Z_MASK
)) | REG_GET_SSWZ(src
) << 9;
685 for(i
=0; i
< 4; ++i
){
686 offset
= GET_SWZ(arbswz
, i
);
688 newswz
|= (offset
<= 3)?GET_SWZ(vsrcswz
, offset
) << i
*3:offset
<< i
*3;
691 arbswz
= newswz
& (SWZ_X_MASK
|SWZ_Y_MASK
|SWZ_Z_MASK
);
692 REG_SET_SSWZ(src
, GET_SWZ(newswz
, 3));
696 /* set scalar swizzling */
697 REG_SET_SSWZ(src
, GET_SWZ(arbswz
, 3));
701 vswz
= REG_GET_VSWZ(src
);
705 REG_SET_VSWZ(src
, vswz
);
706 chash
= v_swiz
[REG_GET_VSWZ(src
)].hash
&
709 if (chash
== (arbswz
& s_mask
[c_mask
].hash
)) {
710 if (s_mask
[c_mask
].count
== 3) {
711 v_match
+= swz_native(rp
,
716 v_match
+= swz_emit_partial(rp
,
727 /* Fill with something invalid.. all 0's was
728 * wrong before, matched SWIZZLE_X. So all
729 * 1's will be okay for now
731 arbswz
|= (PFS_INVAL
& s_mask
[c_mask
].hash
);
733 } while(v_swiz
[++vswz
].hash
!= PFS_INVAL
);
734 REG_SET_VSWZ(src
, SWIZZLE_XYZ
);
735 } while (s_mask
[++c_mask
].hash
!= PFS_INVAL
);
737 ERROR("should NEVER get here\n");
741 static GLuint
t_src(struct r300_fragment_program
*rp
,
742 struct prog_src_register fpsrc
)
746 switch (fpsrc
.File
) {
747 case PROGRAM_TEMPORARY
:
748 REG_SET_INDEX(r
, fpsrc
.Index
);
749 REG_SET_VALID(r
, GL_TRUE
);
750 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
753 REG_SET_INDEX(r
, fpsrc
.Index
);
754 REG_SET_VALID(r
, GL_TRUE
);
755 REG_SET_TYPE(r
, REG_TYPE_INPUT
);
757 case PROGRAM_LOCAL_PARAM
:
758 r
= emit_const4fv(rp
,
759 rp
->mesa_program
.Base
.LocalParams
[fpsrc
.Index
]);
761 case PROGRAM_ENV_PARAM
:
762 r
= emit_const4fv(rp
,
763 rp
->ctx
->FragmentProgram
.Parameters
[fpsrc
.Index
]);
765 case PROGRAM_STATE_VAR
:
766 case PROGRAM_NAMED_PARAM
:
767 r
= emit_const4fv(rp
,
768 rp
->mesa_program
.Base
.Parameters
->ParameterValues
[fpsrc
.Index
]);
771 ERROR("unknown SrcReg->File %x\n", fpsrc
.File
);
775 /* no point swizzling ONE/ZERO/HALF constants... */
776 if (REG_GET_VSWZ(r
) < SWIZZLE_111
|| REG_GET_SSWZ(r
) < SWIZZLE_ZERO
)
777 r
= do_swizzle(rp
, r
, fpsrc
.Swizzle
, fpsrc
.NegateBase
);
781 static GLuint
t_scalar_src(struct r300_fragment_program
*rp
,
782 struct prog_src_register fpsrc
)
784 struct prog_src_register src
= fpsrc
;
785 int sc
= GET_SWZ(fpsrc
.Swizzle
, 0); /* X */
787 src
.Swizzle
= ((sc
<<0)|(sc
<<3)|(sc
<<6)|(sc
<<9));
789 return t_src(rp
, src
);
792 static GLuint
t_dst(struct r300_fragment_program
*rp
,
793 struct prog_dst_register dest
)
798 case PROGRAM_TEMPORARY
:
799 REG_SET_INDEX(r
, dest
.Index
);
800 REG_SET_VALID(r
, GL_TRUE
);
801 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
804 REG_SET_TYPE(r
, REG_TYPE_OUTPUT
);
805 switch (dest
.Index
) {
806 case FRAG_RESULT_COLR
:
807 case FRAG_RESULT_DEPR
:
808 REG_SET_INDEX(r
, dest
.Index
);
809 REG_SET_VALID(r
, GL_TRUE
);
812 ERROR("Bad DstReg->Index 0x%x\n", dest
.Index
);
816 ERROR("Bad DstReg->File 0x%x\n", dest
.File
);
821 static int t_hw_src(struct r300_fragment_program
*rp
,
827 int index
= REG_GET_INDEX(src
);
829 switch(REG_GET_TYPE(src
)) {
831 /* NOTE: if reg==-1 here, a source is being read that
832 * hasn't been written to. Undefined results.
834 if (cs
->temps
[index
].reg
== -1)
835 cs
->temps
[index
].reg
= get_hw_temp(rp
, cs
->nrslots
);
837 idx
= cs
->temps
[index
].reg
;
839 if (!REG_GET_NO_USE(src
) &&
840 (--cs
->temps
[index
].refcount
== 0))
844 idx
= cs
->inputs
[index
].reg
;
846 if (!REG_GET_NO_USE(src
) &&
847 (--cs
->inputs
[index
].refcount
== 0))
848 free_hw_temp(rp
, cs
->inputs
[index
].reg
);
851 return (index
| SRC_CONST
);
853 ERROR("Invalid type for source reg\n");
854 return (0 | SRC_CONST
);
858 cs
->used_in_node
|= (1 << idx
);
863 static int t_hw_dst(struct r300_fragment_program
*rp
,
870 GLuint index
= REG_GET_INDEX(dest
);
871 assert(REG_GET_VALID(dest
));
873 switch(REG_GET_TYPE(dest
)) {
875 if (cs
->temps
[REG_GET_INDEX(dest
)].reg
== -1) {
877 cs
->temps
[index
].reg
= get_hw_temp(rp
, slot
);
879 cs
->temps
[index
].reg
= get_hw_temp_tex(rp
);
882 idx
= cs
->temps
[index
].reg
;
884 if (!REG_GET_NO_USE(dest
) &&
885 (--cs
->temps
[index
].refcount
== 0))
888 cs
->dest_in_node
|= (1 << idx
);
889 cs
->used_in_node
|= (1 << idx
);
891 case REG_TYPE_OUTPUT
:
893 case FRAG_RESULT_COLR
:
894 rp
->node
[rp
->cur_node
].flags
|= R300_PFS_NODE_OUTPUT_COLOR
;
896 case FRAG_RESULT_DEPR
:
897 rp
->node
[rp
->cur_node
].flags
|= R300_PFS_NODE_OUTPUT_DEPTH
;
903 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest
));
/*
 * Write a no-op ALU instruction at slot index cs->nrslots of `rp`.
 *
 * If cs->nrslots has already reached PFS_MAX_ALU_INST the condition is
 * reported through ERROR().  Otherwise all four instruction words
 * (inst0..inst3) of rp->alu.inst[cs->nrslots] are set to the NOP_INST0..3
 * patterns.
 *
 * NOTE(review): the lines following the error report and following the
 * four stores are not visible in this extract; presumably the error path
 * returns early and the slot counter is advanced afterwards -- confirm
 * against the full file.
 */
910 static void emit_nop(struct r300_fragment_program
*rp
)
914 if (cs
->nrslots
>= PFS_MAX_ALU_INST
) {
915 ERROR("Out of ALU instruction slots\n");
/* Fill every word of the slot so no stale encoding is left behind. */
919 rp
->alu
.inst
[cs
->nrslots
].inst0
= NOP_INST0
;
920 rp
->alu
.inst
[cs
->nrslots
].inst1
= NOP_INST1
;
921 rp
->alu
.inst
[cs
->nrslots
].inst2
= NOP_INST2
;
922 rp
->alu
.inst
[cs
->nrslots
].inst3
= NOP_INST3
;
926 static void emit_tex(struct r300_fragment_program
*rp
,
927 struct prog_instruction
*fpi
,
931 GLuint coord
= t_src(rp
, fpi
->SrcReg
[0]);
932 GLuint dest
= undef
, rdest
= undef
;
933 GLuint din
= cs
->dest_in_node
, uin
= cs
->used_in_node
;
934 int unit
= fpi
->TexSrcUnit
;
937 /* Resolve source/dest to hardware registers */
938 hwsrc
= t_hw_src(rp
, coord
, GL_TRUE
);
939 if (opcode
!= R300_FPITX_OP_KIL
) {
940 dest
= t_dst(rp
, fpi
->DstReg
);
942 /* r300 doesn't seem to be able to do TEX->output reg */
943 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
945 dest
= get_temp_reg_tex(rp
);
947 hwdest
= t_hw_dst(rp
, dest
, GL_TRUE
, rp
->node
[rp
->cur_node
].alu_offset
);
949 /* Use a temp that hasn't been used in this node, rather
950 * than causing an indirection
952 if (uin
& (1 << hwdest
)) {
953 free_hw_temp(rp
, hwdest
);
954 hwdest
= get_hw_temp_tex(rp
);
955 cs
->temps
[REG_GET_INDEX(dest
)].reg
= hwdest
;
962 /* Indirection if source has been written in this node, or if the
963 * dest has been read/written in this node
965 if ((REG_GET_TYPE(coord
) != REG_TYPE_CONST
&&
966 (din
& (1<<hwsrc
))) || (uin
& (1<<hwdest
))) {
968 /* Finish off current node */
969 if (rp
->node
[rp
->cur_node
].alu_offset
== cs
->nrslots
)
972 rp
->node
[rp
->cur_node
].alu_end
=
973 cs
->nrslots
- rp
->node
[rp
->cur_node
].alu_offset
- 1;
974 assert(rp
->node
[rp
->cur_node
].alu_end
>= 0);
976 if (++rp
->cur_node
>= PFS_MAX_TEX_INDIRECT
) {
977 ERROR("too many levels of texture indirection\n");
982 rp
->node
[rp
->cur_node
].tex_offset
= rp
->tex
.length
;
983 rp
->node
[rp
->cur_node
].alu_offset
= cs
->nrslots
;
984 rp
->node
[rp
->cur_node
].tex_end
= -1;
985 rp
->node
[rp
->cur_node
].alu_end
= -1;
986 rp
->node
[rp
->cur_node
].flags
= 0;
987 cs
->used_in_node
= 0;
988 cs
->dest_in_node
= 0;
991 if (rp
->cur_node
== 0)
992 rp
->first_node_has_tex
= 1;
994 rp
->tex
.inst
[rp
->tex
.length
++] = 0
995 | (hwsrc
<< R300_FPITX_SRC_SHIFT
)
996 | (hwdest
<< R300_FPITX_DST_SHIFT
)
997 | (unit
<< R300_FPITX_IMAGE_SHIFT
)
998 /* not entirely sure about this */
999 | (opcode
<< R300_FPITX_OPCODE_SHIFT
);
1001 cs
->dest_in_node
|= (1 << hwdest
);
1002 if (REG_GET_TYPE(coord
) != REG_TYPE_CONST
)
1003 cs
->used_in_node
|= (1 << hwsrc
);
1005 rp
->node
[rp
->cur_node
].tex_end
++;
1007 /* Copy from temp to output if needed */
1008 if (REG_GET_VALID(rdest
)) {
1009 emit_arith(rp
, PFS_OP_MAD
, rdest
, WRITEMASK_XYZW
, dest
,
1010 pfs_one
, pfs_zero
, 0);
1011 free_temp(rp
, dest
);
1017 * Returns the first slot where we could possibly allow writing to dest,
1018 * according to register allocation.
1020 static int get_earliest_allowed_write(
1021 struct r300_fragment_program
* rp
,
1022 GLuint dest
, int mask
)
1027 GLuint index
= REG_GET_INDEX(dest
);
1028 assert(REG_GET_VALID(dest
));
1030 switch(REG_GET_TYPE(dest
)) {
1032 if (cs
->temps
[index
].reg
== -1)
1035 idx
= cs
->temps
[index
].reg
;
1037 case REG_TYPE_OUTPUT
:
1040 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest
));
1044 pos
= cs
->hwtemps
[idx
].reserved
;
1045 if (mask
& WRITEMASK_XYZ
) {
1046 if (pos
< cs
->hwtemps
[idx
].vector_lastread
)
1047 pos
= cs
->hwtemps
[idx
].vector_lastread
;
1049 if (mask
& WRITEMASK_W
) {
1050 if (pos
< cs
->hwtemps
[idx
].scalar_lastread
)
1051 pos
= cs
->hwtemps
[idx
].scalar_lastread
;
1059 * Allocates a slot for an ALU instruction that can consist of
1060 * a vector part or a scalar part or both.
1062 * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1063 * appropriate position (vector and/or scalar), and their positions are
1064 * recorded in the srcpos array.
1066 * This function emits instruction code for the source fetch and the
1067 * argument selection. It does not emit instruction code for the
1068 * opcode or the destination selection.
1070 * @return the index of the slot
1072 static int find_and_prepare_slot(struct r300_fragment_program
* rp
,
1091 // Determine instruction slots, whether sources are required on
1092 // vector or scalar side, and the smallest slot number where
1093 // all source registers are available
1096 used
|= SLOT_OP_VECTOR
;
1098 used
|= SLOT_OP_SCALAR
;
1100 pos
= get_earliest_allowed_write(rp
, dest
, mask
);
1102 if (rp
->node
[rp
->cur_node
].alu_offset
> pos
)
1103 pos
= rp
->node
[rp
->cur_node
].alu_offset
;
1104 for(i
= 0; i
< argc
; ++i
) {
1105 if (!REG_GET_BUILTIN(src
[i
])) {
1107 used
|= v_swiz
[REG_GET_VSWZ(src
[i
])].flags
<< i
;
1109 used
|= s_swiz
[REG_GET_SSWZ(src
[i
])].flags
<< i
;
1112 hwsrc
[i
] = t_hw_src(rp
, src
[i
], GL_FALSE
); /* Note: sideeffects wrt refcounting! */
1113 regnr
= hwsrc
[i
] & 31;
1115 if (REG_GET_TYPE(src
[i
]) == REG_TYPE_TEMP
) {
1116 if (used
& (SLOT_SRC_VECTOR
<< i
)) {
1117 if (cs
->hwtemps
[regnr
].vector_valid
> pos
)
1118 pos
= cs
->hwtemps
[regnr
].vector_valid
;
1120 if (used
& (SLOT_SRC_SCALAR
<< i
)) {
1121 if (cs
->hwtemps
[regnr
].scalar_valid
> pos
)
1122 pos
= cs
->hwtemps
[regnr
].scalar_valid
;
1127 // Find a slot that fits
1129 if (cs
->slot
[pos
].used
& used
& SLOT_OP_BOTH
)
1132 if (pos
>= cs
->nrslots
) {
1133 if (cs
->nrslots
>= PFS_MAX_ALU_INST
) {
1134 ERROR("Out of ALU instruction slots\n");
1138 rp
->alu
.inst
[pos
].inst0
= NOP_INST0
;
1139 rp
->alu
.inst
[pos
].inst1
= NOP_INST1
;
1140 rp
->alu
.inst
[pos
].inst2
= NOP_INST2
;
1141 rp
->alu
.inst
[pos
].inst3
= NOP_INST3
;
1146 // Note: When we need both parts (vector and scalar) of a source,
1147 // we always try to put them into the same position. This makes the
1148 // code easier to read, and it is optimal (i.e. one doesn't gain
1149 // anything by splitting the parts).
1150 // It also avoids headaches with swizzles that access both parts (i.e WXY)
1151 tempused
= cs
->slot
[pos
].used
;
1152 for(i
= 0; i
< 3; ++i
) {
1153 tempvsrc
[i
] = cs
->slot
[pos
].vsrc
[i
];
1154 tempssrc
[i
] = cs
->slot
[pos
].ssrc
[i
];
1157 for(i
= 0; i
< argc
; ++i
) {
1158 int flags
= (used
>> i
) & SLOT_SRC_BOTH
;
1165 for(j
= 0; j
< 3; ++j
) {
1166 if ((tempused
>> j
) & flags
& SLOT_SRC_VECTOR
) {
1167 if (tempvsrc
[j
] != hwsrc
[i
])
1171 if ((tempused
>> j
) & flags
& SLOT_SRC_SCALAR
) {
1172 if (tempssrc
[j
] != hwsrc
[i
])
1183 tempused
|= flags
<< j
;
1184 if (flags
& SLOT_SRC_VECTOR
)
1185 tempvsrc
[j
] = hwsrc
[i
];
1186 if (flags
& SLOT_SRC_SCALAR
)
1187 tempssrc
[j
] = hwsrc
[i
];
1194 // Found a slot, reserve it
1195 cs
->slot
[pos
].used
= tempused
| (used
& SLOT_OP_BOTH
);
1196 for(i
= 0; i
< 3; ++i
) {
1197 cs
->slot
[pos
].vsrc
[i
] = tempvsrc
[i
];
1198 cs
->slot
[pos
].ssrc
[i
] = tempssrc
[i
];
1201 for(i
= 0; i
< argc
; ++i
) {
1202 if (REG_GET_TYPE(src
[i
]) == REG_TYPE_TEMP
) {
1203 int regnr
= hwsrc
[i
] & 31;
1205 if (used
& (SLOT_SRC_VECTOR
<< i
)) {
1206 if (cs
->hwtemps
[regnr
].vector_lastread
< pos
)
1207 cs
->hwtemps
[regnr
].vector_lastread
= pos
;
1209 if (used
& (SLOT_SRC_SCALAR
<< i
)) {
1210 if (cs
->hwtemps
[regnr
].scalar_lastread
< pos
)
1211 cs
->hwtemps
[regnr
].scalar_lastread
= pos
;
1216 // Emit the source fetch code
1217 rp
->alu
.inst
[pos
].inst1
&= ~R300_FPI1_SRC_MASK
;
1218 rp
->alu
.inst
[pos
].inst1
|=
1219 ((cs
->slot
[pos
].vsrc
[0] << R300_FPI1_SRC0C_SHIFT
) |
1220 (cs
->slot
[pos
].vsrc
[1] << R300_FPI1_SRC1C_SHIFT
) |
1221 (cs
->slot
[pos
].vsrc
[2] << R300_FPI1_SRC2C_SHIFT
));
1223 rp
->alu
.inst
[pos
].inst3
&= ~R300_FPI3_SRC_MASK
;
1224 rp
->alu
.inst
[pos
].inst3
|=
1225 ((cs
->slot
[pos
].ssrc
[0] << R300_FPI3_SRC0A_SHIFT
) |
1226 (cs
->slot
[pos
].ssrc
[1] << R300_FPI3_SRC1A_SHIFT
) |
1227 (cs
->slot
[pos
].ssrc
[2] << R300_FPI3_SRC2A_SHIFT
));
1229 // Emit the argument selection code
1233 for(i
= 0; i
< 3; ++i
) {
1235 swz
[i
] = (v_swiz
[REG_GET_VSWZ(src
[i
])].base
+
1236 (srcpos
[i
] * v_swiz
[REG_GET_VSWZ(src
[i
])].stride
)) |
1237 ((src
[i
] & REG_NEGV_MASK
) ? ARG_NEG
: 0) |
1238 ((src
[i
] & REG_ABS_MASK
) ? ARG_ABS
: 0);
1240 swz
[i
] = R300_FPI0_ARGC_ZERO
;
1244 rp
->alu
.inst
[pos
].inst0
&=
1245 ~(R300_FPI0_ARG0C_MASK
|R300_FPI0_ARG1C_MASK
|R300_FPI0_ARG2C_MASK
);
1246 rp
->alu
.inst
[pos
].inst0
|=
1247 (swz
[0] << R300_FPI0_ARG0C_SHIFT
) |
1248 (swz
[1] << R300_FPI0_ARG1C_SHIFT
) |
1249 (swz
[2] << R300_FPI0_ARG2C_SHIFT
);
1255 for(i
= 0; i
< 3; ++i
) {
1257 swz
[i
] = (s_swiz
[REG_GET_SSWZ(src
[i
])].base
+
1258 (srcpos
[i
] * s_swiz
[REG_GET_SSWZ(src
[i
])].stride
)) |
1259 ((src
[i
] & REG_NEGV_MASK
) ? ARG_NEG
: 0) |
1260 ((src
[i
] & REG_ABS_MASK
) ? ARG_ABS
: 0);
1262 swz
[i
] = R300_FPI2_ARGA_ZERO
;
1266 rp
->alu
.inst
[pos
].inst2
&=
1267 ~(R300_FPI2_ARG0A_MASK
|R300_FPI2_ARG1A_MASK
|R300_FPI2_ARG2A_MASK
);
1268 rp
->alu
.inst
[pos
].inst2
|=
1269 (swz
[0] << R300_FPI2_ARG0A_SHIFT
) |
1270 (swz
[1] << R300_FPI2_ARG1A_SHIFT
) |
1271 (swz
[2] << R300_FPI2_ARG2A_SHIFT
);
1279 * Append an ALU instruction to the instruction list.
1281 static void emit_arith(struct r300_fragment_program
*rp
,
1291 GLuint src
[3] = { src0
, src1
, src2
};
1293 GLboolean emit_vop
, emit_sop
;
1297 vop
= r300_fpop
[op
].v_op
;
1298 sop
= r300_fpop
[op
].s_op
;
1299 argc
= r300_fpop
[op
].argc
;
1301 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
&&
1302 REG_GET_INDEX(dest
) == FRAG_RESULT_DEPR
) {
1303 if (mask
& WRITEMASK_Z
) {
1310 emit_vop
= GL_FALSE
;
1311 emit_sop
= GL_FALSE
;
1312 if ((mask
& WRITEMASK_XYZ
) || vop
== R300_FPI0_OUTC_DP3
)
1314 if ((mask
& WRITEMASK_W
) || vop
== R300_FPI0_OUTC_REPL_ALPHA
)
1317 pos
= find_and_prepare_slot(rp
, emit_vop
, emit_sop
, argc
, src
, dest
, mask
);
1321 hwdest
= t_hw_dst(rp
, dest
, GL_FALSE
, pos
); /* Note: Side effects wrt register allocation */
1323 if (flags
& PFS_FLAG_SAT
) {
1324 vop
|= R300_FPI0_OUTC_SAT
;
1325 sop
|= R300_FPI2_OUTA_SAT
;
1328 /* Throw the pieces together and get FPI0/1 */
1330 rp
->alu
.inst
[pos
].inst0
|= vop
;
1332 rp
->alu
.inst
[pos
].inst1
|= hwdest
<< R300_FPI1_DSTC_SHIFT
;
1334 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1335 if (REG_GET_INDEX(dest
) == FRAG_RESULT_COLR
) {
1336 rp
->alu
.inst
[pos
].inst1
|=
1337 (mask
& WRITEMASK_XYZ
) << R300_FPI1_DSTC_OUTPUT_MASK_SHIFT
;
1340 rp
->alu
.inst
[pos
].inst1
|=
1341 (mask
& WRITEMASK_XYZ
) << R300_FPI1_DSTC_REG_MASK_SHIFT
;
1343 cs
->hwtemps
[hwdest
].vector_valid
= pos
+1;
1347 /* And now FPI2/3 */
1349 rp
->alu
.inst
[pos
].inst2
|= sop
;
1351 if (mask
& WRITEMASK_W
) {
1352 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1353 if (REG_GET_INDEX(dest
) == FRAG_RESULT_COLR
) {
1354 rp
->alu
.inst
[pos
].inst3
|=
1355 (hwdest
<< R300_FPI3_DSTA_SHIFT
) | R300_FPI3_DSTA_OUTPUT
;
1356 } else if (REG_GET_INDEX(dest
) == FRAG_RESULT_DEPR
) {
1357 rp
->alu
.inst
[pos
].inst3
|= R300_FPI3_DSTA_DEPTH
;
1360 rp
->alu
.inst
[pos
].inst3
|=
1361 (hwdest
<< R300_FPI3_DSTA_SHIFT
) | R300_FPI3_DSTA_REG
;
1363 cs
->hwtemps
[hwdest
].scalar_valid
= pos
+1;
1372 static GLuint
get_attrib(struct r300_fragment_program
*rp
, GLuint attr
)
1374 struct gl_fragment_program
*mp
= &rp
->mesa_program
;
1377 if (!(mp
->Base
.InputsRead
& (1<<attr
))) {
1378 ERROR("Attribute %d was not provided!\n", attr
);
1382 REG_SET_TYPE(r
, REG_TYPE_INPUT
);
1383 REG_SET_INDEX(r
, attr
);
1384 REG_SET_VALID(r
, GL_TRUE
);
1389 static GLfloat SinCosConsts
[2][4] = {
1391 1.273239545, // 4/PI
1392 -0.405284735, // -4/(PI*PI)
1399 0.159154943, // 1/(2*PI)
1406 * Emit a LIT instruction.
1407 * \p flags may be PFS_FLAG_SAT
1409 * Definition of LIT (from ARB_fragment_program):
1410 * tmp = VectorLoad(op0);
1411 * if (tmp.x < 0) tmp.x = 0;
1412 * if (tmp.y < 0) tmp.y = 0;
1413 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
1414 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
1417 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
1420 * The longest path of computation is the one leading to result.z,
1421 * consisting of 5 operations. This implementation of LIT takes
1422 * 5 slots. So unless there's some special undocumented opcode,
1423 * this implementation is potentially optimal. Unfortunately,
1424 * emit_arith is a bit too conservative because it doesn't understand
1425 * partial writes to the vector component.
1427 static const GLfloat LitConst
[4] = { 127.999999, 127.999999, 127.999999, -127.999999 };
// Expand a LIT instruction into a sequence of emit_arith() calls.
// A constant register holding LitConst is set up first. The work is done
// in temp; when needTemporary is set, temp is a kept scratch register and
// the result is copied to dest with a final MAD that applies mask and
// flags, after which the scratch register is released.
// NOTE(review): the remaining parameters, the local declarations and the
// branch bodies that decide needTemporary are not visible in this extract.
1429 static void emit_lit(struct r300_fragment_program
*rp
,
1440 cnst
= emit_const4fv(rp
, LitConst
);
// Decide whether the expansion can work directly in dest.
1443 if ((mask
& WRITEMASK_XYZW
) != WRITEMASK_XYZW
) {
1445 } else if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1446 // LIT is typically followed by DP3/DP4, so there's no point
1447 // in creating special code for this case
1451 if (needTemporary
) {
1452 temp
= keep(get_temp_reg(rp
));
1457 // Note: The order of emit_arith inside the slots is relevant,
1458 // because emit_arith only looks at scalar vs. vector when resolving
1459 // dependencies, and it does not consider individual vector components,
1460 // so swizzling between the two parts can create fake dependencies.
// Clamp x and y at zero from below, and bound the exponent from below
// with the constant.
1463 emit_arith(rp
, PFS_OP_MAX
, temp
, WRITEMASK_XY
,
1464 keep(src
), pfs_zero
, undef
, 0);
1465 emit_arith(rp
, PFS_OP_MAX
, temp
, WRITEMASK_W
,
1466 src
, cnst
, undef
, 0);
// Bound the exponent from above (kept in z) and take log2 of y.
1469 emit_arith(rp
, PFS_OP_MIN
, temp
, WRITEMASK_Z
,
1470 swizzle(temp
, W
, W
, W
, W
), cnst
, undef
, 0);
1471 emit_arith(rp
, PFS_OP_LG2
, temp
, WRITEMASK_W
,
1472 swizzle(temp
, Y
, Y
, Y
, Y
), undef
, undef
, 0);
1475 // If desired, we saturate the y result here.
1476 // This does not affect the use as a condition variable in the CMP later
1477 emit_arith(rp
, PFS_OP_MAD
, temp
, WRITEMASK_W
,
1478 temp
, swizzle(temp
, Z
, Z
, Z
, Z
), pfs_zero
, 0);
1479 emit_arith(rp
, PFS_OP_MAD
, temp
, WRITEMASK_Y
,
1480 swizzle(temp
, X
, X
, X
, X
), pfs_one
, pfs_zero
, flags
);
// x becomes the constant one; the scaled logarithm is exponentiated.
1483 emit_arith(rp
, PFS_OP_MAD
, temp
, WRITEMASK_X
,
1484 pfs_one
, pfs_one
, pfs_zero
, 0);
1485 emit_arith(rp
, PFS_OP_EX2
, temp
, WRITEMASK_W
,
1486 temp
, undef
, undef
, 0);
// z is chosen between zero and the power, with negated y as the
// condition; w becomes the constant one.
1489 emit_arith(rp
, PFS_OP_CMP
, temp
, WRITEMASK_Z
,
1490 pfs_zero
, swizzle(temp
, W
, W
, W
, W
), negate(swizzle(temp
, Y
, Y
, Y
, Y
)), flags
);
1491 emit_arith(rp
, PFS_OP_MAD
, temp
, WRITEMASK_W
,
1492 pfs_one
, pfs_one
, pfs_zero
, 0);
// Copy the scratch result to dest under the requested mask and flags.
1494 if (needTemporary
) {
1495 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1496 temp
, pfs_one
, pfs_zero
, flags
);
1497 free_temp(rp
, temp
);
1499 // Decrease refcount of the destination
1500 t_hw_dst(rp
, dest
, GL_FALSE
, cs
->nrslots
);
// Walk the Mesa fragment program one instruction at a time, up to
// OPCODE_END, and lower each instruction to hardware operations through
// emit_arith(), emit_lit() or emit_tex(). An empty program and an
// unrecognised opcode are both reported through ERROR().
// The caller, r300_translate_fragment_shader(), compares the result
// against GL_FALSE.
// NOTE(review): the case labels, several trailing emit_arith() arguments
// and the return statements are not visible in this extract - confirm the
// per-opcode notes below against the full file.
1505 static GLboolean
parse_program(struct r300_fragment_program
*rp
)
1507 struct gl_fragment_program
*mp
= &rp
->mesa_program
;
1508 const struct prog_instruction
*inst
= mp
->Base
.Instructions
;
1509 struct prog_instruction
*fpi
;
1510 GLuint src
[3], dest
, temp
[2];
1511 int flags
, mask
= 0;
// Nothing to translate: report an empty program.
1514 if (!inst
|| inst
[0].Opcode
== OPCODE_END
) {
1515 ERROR("empty program?\n");
// Main translation loop, one Mesa instruction per iteration.
1519 for (fpi
=mp
->Base
.Instructions
; fpi
->Opcode
!= OPCODE_END
; fpi
++) {
// A zero-to-one saturate request becomes PFS_FLAG_SAT.
1520 if (fpi
->SaturateMode
== SATURATE_ZERO_ONE
)
1521 flags
= PFS_FLAG_SAT
;
// dest and mask are translated for every opcode except OPCODE_KIL.
1525 if (fpi
->Opcode
!= OPCODE_KIL
) {
1526 dest
= t_dst(rp
, fpi
->DstReg
);
1527 mask
= fpi
->DstReg
.WriteMask
;
1530 switch (fpi
->Opcode
) {
// dest = |src0| * 1 + 0
1532 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1533 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1534 absolute(src
[0]), pfs_one
, pfs_zero
,
// dest = src0 * 1 + src1
1538 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1539 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1540 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1541 src
[0], pfs_one
, src
[1],
1545 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1546 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1547 src
[2] = t_src(rp
, fpi
->SrcReg
[2]);
1548 /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1549 * r300 - if src2.c < 0.0 ? src1.c : src0.c
1551 emit_arith(rp
, PFS_OP_CMP
, dest
, mask
,
1552 src
[2], src
[1], src
[0],
1557 * cos using a parabola (see SIN):
1559 * x = (x/(2*PI))+0.75
1564 temp
[0] = get_temp_reg(rp
);
1565 const_sin
[0] = emit_const4fv(rp
, SinCosConsts
[0]);
1566 const_sin
[1] = emit_const4fv(rp
, SinCosConsts
[1]);
1567 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1569 /* add 0.5*PI and do range reduction */
1571 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1572 swizzle(src
[0], X
, X
, X
, X
),
1573 swizzle(const_sin
[1], Z
, Z
, Z
, Z
),
1574 swizzle(const_sin
[1], X
, X
, X
, X
),
1577 emit_arith(rp
, PFS_OP_FRC
, temp
[0], WRITEMASK_X
,
1578 swizzle(temp
[0], X
, X
, X
, X
),
1583 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Z
,
1584 swizzle(temp
[0], X
, X
, X
, X
),
1585 swizzle(const_sin
[1], W
, W
, W
, W
), //2*PI
1586 negate(swizzle(const_sin
[0], Z
, Z
, Z
, Z
)), //-PI
1591 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
| WRITEMASK_Y
,
1592 swizzle(temp
[0], Z
, Z
, Z
, Z
),
1597 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1598 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1599 absolute(swizzle(temp
[0], Z
, Z
, Z
, Z
)),
1600 swizzle(temp
[0], X
, X
, X
, X
),
1603 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Y
,
1604 swizzle(temp
[0], X
, X
, X
, X
),
1605 absolute(swizzle(temp
[0], X
, X
, X
, X
)),
1606 negate(swizzle(temp
[0], X
, X
, X
, X
)),
1610 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1611 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1612 swizzle(const_sin
[0], W
, W
, W
, W
),
1613 swizzle(temp
[0], X
, X
, X
, X
),
1616 free_temp(rp
, temp
[0]);
1619 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1620 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1621 emit_arith(rp
, PFS_OP_DP3
, dest
, mask
,
1622 src
[0], src
[1], undef
,
1626 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1627 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1628 emit_arith(rp
, PFS_OP_DP4
, dest
, mask
,
1629 src
[0], src
[1], undef
,
1633 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1634 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1635 /* src0.xyz1 -> temp
1636 * DP4 dest, temp, src1
1639 temp
[0] = get_temp_reg(rp
);
1640 src
[0].s_swz
= SWIZZLE_ONE
;
1641 emit_arith(rp
, PFS_OP_MAD
, temp
[0], mask
,
1642 src
[0], pfs_one
, pfs_zero
,
1644 emit_arith(rp
, PFS_OP_DP4
, dest
, mask
,
1645 temp
[0], src
[1], undef
,
1647 free_temp(rp
, temp
[0]);
1649 emit_arith(rp
, PFS_OP_DP4
, dest
, mask
,
1650 swizzle(src
[0], X
, Y
, Z
, ONE
), src
[1],
1655 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1656 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1657 /* dest.y = src0.y * src1.y */
1658 if (mask
& WRITEMASK_Y
)
1659 emit_arith(rp
, PFS_OP_MAD
, dest
, WRITEMASK_Y
,
1660 keep(src
[0]), keep(src
[1]),
1662 /* dest.z = src0.z */
1663 if (mask
& WRITEMASK_Z
)
1664 emit_arith(rp
, PFS_OP_MAD
, dest
, WRITEMASK_Z
,
1665 src
[0], pfs_one
, pfs_zero
, flags
);
1667 * result.w = src1.w */
1668 if (mask
& WRITEMASK_XW
) {
1669 REG_SET_VSWZ(src
[1], SWIZZLE_111
); /*Cheat*/
1670 emit_arith(rp
, PFS_OP_MAD
, dest
,
1671 mask
& WRITEMASK_XW
,
1672 src
[1], pfs_one
, pfs_zero
,
1677 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1678 emit_arith(rp
, PFS_OP_EX2
, dest
, mask
,
1679 src
[0], undef
, undef
,
// dest = src0 - frac(src0), with the fraction held in a scratch temp.
1683 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1684 temp
[0] = get_temp_reg(rp
);
1686 * MAD dest, src0, 1.0, -temp
1688 emit_arith(rp
, PFS_OP_FRC
, temp
[0], mask
,
1689 keep(src
[0]), undef
, undef
,
1691 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1692 src
[0], pfs_one
, negate(temp
[0]),
1694 free_temp(rp
, temp
[0]);
1697 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1698 emit_arith(rp
, PFS_OP_FRC
, dest
, mask
,
1699 src
[0], undef
, undef
,
// The kill operation is emitted through the texture path.
1703 emit_tex(rp
, fpi
, R300_FPITX_OP_KIL
);
1706 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1707 emit_arith(rp
, PFS_OP_LG2
, dest
, mask
,
1708 src
[0], undef
, undef
,
// LIT has its own multi-slot expansion in emit_lit().
1712 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1713 emit_lit(rp
, dest
, mask
, src
[0], flags
);
1716 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1717 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1718 src
[2] = t_src(rp
, fpi
->SrcReg
[2]);
1719 /* result = tmp0tmp1 + (1 - tmp0)tmp2
1720 * = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1721 * MAD temp, -tmp0, tmp2, tmp2
1722 * MAD result, tmp0, tmp1, temp
1724 temp
[0] = get_temp_reg(rp
);
1725 emit_arith(rp
, PFS_OP_MAD
, temp
[0], mask
,
1726 negate(keep(src
[0])), keep(src
[2]), src
[2],
1728 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1729 src
[0], src
[1], temp
[0],
1731 free_temp(rp
, temp
[0]);
1734 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1735 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1736 src
[2] = t_src(rp
, fpi
->SrcReg
[2]);
1737 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1738 src
[0], src
[1], src
[2],
1742 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1743 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1744 emit_arith(rp
, PFS_OP_MAX
, dest
, mask
,
1745 src
[0], src
[1], undef
,
1749 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1750 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1751 emit_arith(rp
, PFS_OP_MIN
, dest
, mask
,
1752 src
[0], src
[1], undef
,
1757 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1758 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1759 src
[0], pfs_one
, pfs_zero
,
1763 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1764 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1765 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1766 src
[0], src
[1], pfs_zero
,
// dest = exp2(src1 * log2(src0)), evaluated in the W channel of a
// scratch temp.
1770 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1771 src
[1] = t_scalar_src(rp
, fpi
->SrcReg
[1]);
1772 temp
[0] = get_temp_reg(rp
);
1773 emit_arith(rp
, PFS_OP_LG2
, temp
[0], WRITEMASK_W
,
1774 src
[0], undef
, undef
,
1776 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_W
,
1777 temp
[0], src
[1], pfs_zero
,
1779 emit_arith(rp
, PFS_OP_EX2
, dest
, fpi
->DstReg
.WriteMask
,
1780 temp
[0], undef
, undef
,
1782 free_temp(rp
, temp
[0]);
1785 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1786 emit_arith(rp
, PFS_OP_RCP
, dest
, mask
,
1787 src
[0], undef
, undef
,
// The reciprocal square root is taken of |src0|.
1791 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1792 emit_arith(rp
, PFS_OP_RSQ
, dest
, mask
,
1793 absolute(src
[0]), pfs_zero
, pfs_zero
,
1798 * scs using a parabola :
1800 * result.x = sin(-abs(x)+0.5*PI) (cos)
1801 * result.y = sin(x) (sin)
1804 temp
[0] = get_temp_reg(rp
);
1805 temp
[1] = get_temp_reg(rp
);
1806 const_sin
[0] = emit_const4fv(rp
, SinCosConsts
[0]);
1807 const_sin
[1] = emit_const4fv(rp
, SinCosConsts
[1]);
1808 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1810 /* x = -abs(x)+0.5*PI */
1811 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Z
,
1812 swizzle(const_sin
[0], Z
, Z
, Z
, Z
), //PI
1814 negate(abs(swizzle(keep(src
[0]), X
, X
, X
, X
))),
1818 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_W
,
1819 swizzle(const_sin
[0], Y
, Y
, Y
, Y
),
1820 swizzle(keep(src
[0]), X
, X
, X
, X
),
1824 /* B*x, C*x (cos) */
1825 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
| WRITEMASK_Y
,
1826 swizzle(temp
[0], Z
, Z
, Z
, Z
),
1832 emit_arith(rp
, PFS_OP_MAD
, temp
[1], WRITEMASK_W
,
1833 swizzle(const_sin
[0], X
, X
, X
, X
),
1838 /* y = B*x + C*x*abs(x) (sin)*/
1839 emit_arith(rp
, PFS_OP_MAD
, temp
[1], WRITEMASK_Z
,
1841 swizzle(temp
[0], W
, W
, W
, W
),
1842 swizzle(temp
[1], W
, W
, W
, W
),
1845 /* y = B*x + C*x*abs(x) (cos)*/
1846 emit_arith(rp
, PFS_OP_MAD
, temp
[1], WRITEMASK_W
,
1847 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1848 absolute(swizzle(temp
[0], Z
, Z
, Z
, Z
)),
1849 swizzle(temp
[0], X
, X
, X
, X
),
1852 /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1853 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
| WRITEMASK_Y
,
1854 swizzle(temp
[1], W
, Z
, Y
, X
),
1855 absolute(swizzle(temp
[1], W
, Z
, Y
, X
)),
1856 negate(swizzle(temp
[1], W
, Z
, Y
, X
)),
1860 /* dest.xy = mad(temp.xy, P, temp2.wz) */
1861 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
& (WRITEMASK_X
| WRITEMASK_Y
),
1863 swizzle(const_sin
[0], W
, W
, W
, W
),
1864 swizzle(temp
[1], W
, Z
, Y
, X
),
1867 free_temp(rp
, temp
[0]);
1868 free_temp(rp
, temp
[1]);
1871 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1872 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1873 temp
[0] = get_temp_reg(rp
);
1874 /* temp = src0 - src1
1875 * dest.c = (temp.c < 0.0) ? 0 : 1
1877 emit_arith(rp
, PFS_OP_MAD
, temp
[0], mask
,
1878 src
[0], pfs_one
, negate(src
[1]),
1880 emit_arith(rp
, PFS_OP_CMP
, dest
, mask
,
1881 pfs_one
, pfs_zero
, temp
[0],
1883 free_temp(rp
, temp
[0]);
1888 * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1889 * extra precision is obtained by weighting against
1893 temp
[0] = get_temp_reg(rp
);
1894 const_sin
[0] = emit_const4fv(rp
, SinCosConsts
[0]);
1895 const_sin
[1] = emit_const4fv(rp
, SinCosConsts
[1]);
1896 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1899 /* do range reduction */
1901 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1902 swizzle(keep(src
[0]), X
, X
, X
, X
),
1903 swizzle(const_sin
[1], Z
, Z
, Z
, Z
),
1907 emit_arith(rp
, PFS_OP_FRC
, temp
[0], WRITEMASK_X
,
1908 swizzle(temp
[0], X
, X
, X
, X
),
1913 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Z
,
1914 swizzle(temp
[0], X
, X
, X
, X
),
1915 swizzle(const_sin
[1], W
, W
, W
, W
), //2*PI
1916 negate(swizzle(const_sin
[0], Z
, Z
, Z
, Z
)), //PI
1921 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
| WRITEMASK_Y
,
1922 swizzle(temp
[0], Z
, Z
, Z
, Z
),
1927 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1928 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1929 absolute(swizzle(temp
[0], Z
, Z
, Z
, Z
)),
1930 swizzle(temp
[0], X
, X
, X
, X
),
1933 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Y
,
1934 swizzle(temp
[0], X
, X
, X
, X
),
1935 absolute(swizzle(temp
[0], X
, X
, X
, X
)),
1936 negate(swizzle(temp
[0], X
, X
, X
, X
)),
1940 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1941 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1942 swizzle(const_sin
[0], W
, W
, W
, W
),
1943 swizzle(temp
[0], X
, X
, X
, X
),
1946 free_temp(rp
, temp
[0]);
1949 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1950 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1951 temp
[0] = get_temp_reg(rp
);
1952 /* temp = src0 - src1
1953 * dest.c = (temp.c < 0.0) ? 1 : 0
1955 emit_arith(rp
, PFS_OP_MAD
, temp
[0], mask
,
1956 src
[0], pfs_one
, negate(src
[1]),
1958 emit_arith(rp
, PFS_OP_CMP
, dest
, mask
,
1959 pfs_zero
, pfs_one
, temp
[0],
1961 free_temp(rp
, temp
[0]);
// dest = src0 * 1 + (-src1)
1964 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1965 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1966 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1967 src
[0], pfs_one
, negate(src
[1]),
// Texture instructions are forwarded to emit_tex() with the matching
// hardware opcode.
1971 emit_tex(rp
, fpi
, R300_FPITX_OP_TEX
);
1974 emit_tex(rp
, fpi
, R300_FPITX_OP_TXB
);
1977 emit_tex(rp
, fpi
, R300_FPITX_OP_TXP
);
1980 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1981 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1982 temp
[0] = get_temp_reg(rp
);
1983 /* temp = src0.zxy * src1.yzx */
1984 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_XYZ
,
1985 swizzle(keep(src
[0]), Z
, X
, Y
, W
),
1986 swizzle(keep(src
[1]), Y
, Z
, X
, W
),
1989 /* dest.xyz = src0.yzx * src1.zxy - temp
1990 * dest.w = undefined
1992 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
& WRITEMASK_XYZ
,
1993 swizzle(src
[0], Y
, Z
, X
, W
),
1994 swizzle(src
[1], Z
, X
, Y
, W
),
1998 free_temp(rp
, temp
[0]);
// Any opcode that reaches this point is reported as an error.
2002 ERROR("unknown fpi->Opcode %d\n", fpi
->Opcode
);
// Put three new instructions in front of prog that rebuild the window
// position in a freshly reserved temporary (tempregi), then redirect every
// later source operand that reads FRAG_ATTRIB_WPOS to that temporary.
//   RCP  temp.w   <- WPOS.wwww                      (perspective divide)
//   MUL  temp.xyz <- WPOS.xyzw times temp.wwww
//   MAD  temp.xyz <- temp times window plus window  (viewport transformation)
// where window is the STATE_R300_WINDOW_DIMENSION state parameter and all
// three MAD operands use the swizzle X, Y, Z, ZERO.
// The old instruction array is copied behind the new instructions and then
// released with free().
// NOTE(review): the declaration and the increments of the index i are not
// visible in this extract.
2014 static void insert_wpos(struct gl_program
*prog
)
2016 GLint tokens
[6] = { STATE_INTERNAL
, STATE_R300_WINDOW_DIMENSION
, 0, 0, 0, 0 };
2017 struct prog_instruction
*fpi
;
2018 GLuint window_index
;
// The new temporary takes the next free index of the program.
2020 GLuint tempregi
= prog
->NumTemporaries
;
2021 /* should do something else if no temps left... */
2022 prog
->NumTemporaries
++;
// Room for the existing instructions plus the three new ones.
2024 fpi
= _mesa_alloc_instructions (prog
->NumInstructions
+ 3);
2025 _mesa_init_instructions (fpi
, prog
->NumInstructions
+ 3);
2027 /* perspective divide */
2028 fpi
[i
].Opcode
= OPCODE_RCP
;
2030 fpi
[i
].DstReg
.File
= PROGRAM_TEMPORARY
;
2031 fpi
[i
].DstReg
.Index
= tempregi
;
2032 fpi
[i
].DstReg
.WriteMask
= WRITEMASK_W
;
2033 fpi
[i
].DstReg
.CondMask
= COND_TR
;
2035 fpi
[i
].SrcReg
[0].File
= PROGRAM_INPUT
;
2036 fpi
[i
].SrcReg
[0].Index
= FRAG_ATTRIB_WPOS
;
2037 fpi
[i
].SrcReg
[0].Swizzle
= SWIZZLE_WWWW
;
// Scale the xyz components by the reciprocal stored in temp.w.
2040 fpi
[i
].Opcode
= OPCODE_MUL
;
2042 fpi
[i
].DstReg
.File
= PROGRAM_TEMPORARY
;
2043 fpi
[i
].DstReg
.Index
= tempregi
;
2044 fpi
[i
].DstReg
.WriteMask
= WRITEMASK_XYZ
;
2045 fpi
[i
].DstReg
.CondMask
= COND_TR
;
2047 fpi
[i
].SrcReg
[0].File
= PROGRAM_INPUT
;
2048 fpi
[i
].SrcReg
[0].Index
= FRAG_ATTRIB_WPOS
;
2049 fpi
[i
].SrcReg
[0].Swizzle
= SWIZZLE_XYZW
;
2051 fpi
[i
].SrcReg
[1].File
= PROGRAM_TEMPORARY
;
2052 fpi
[i
].SrcReg
[1].Index
= tempregi
;
2053 fpi
[i
].SrcReg
[1].Swizzle
= SWIZZLE_WWWW
;
2056 /* viewport transformation */
2057 window_index
= _mesa_add_state_reference(prog
->Parameters
, tokens
);
2059 fpi
[i
].Opcode
= OPCODE_MAD
;
2061 fpi
[i
].DstReg
.File
= PROGRAM_TEMPORARY
;
2062 fpi
[i
].DstReg
.Index
= tempregi
;
2063 fpi
[i
].DstReg
.WriteMask
= WRITEMASK_XYZ
;
2064 fpi
[i
].DstReg
.CondMask
= COND_TR
;
2066 fpi
[i
].SrcReg
[0].File
= PROGRAM_TEMPORARY
;
2067 fpi
[i
].SrcReg
[0].Index
= tempregi
;
2068 fpi
[i
].SrcReg
[0].Swizzle
= MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ZERO
);
2070 fpi
[i
].SrcReg
[1].File
= PROGRAM_STATE_VAR
;
2071 fpi
[i
].SrcReg
[1].Index
= window_index
;
2072 fpi
[i
].SrcReg
[1].Swizzle
= MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ZERO
);
2074 fpi
[i
].SrcReg
[2].File
= PROGRAM_STATE_VAR
;
2075 fpi
[i
].SrcReg
[2].Index
= window_index
;
2076 fpi
[i
].SrcReg
[2].Swizzle
= MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ZERO
);
// Append the original instructions and swap the arrays.
2079 _mesa_copy_instructions (&fpi
[i
], prog
->Instructions
, prog
->NumInstructions
);
2081 free(prog
->Instructions
);
2083 prog
->Instructions
= fpi
;
2085 prog
->NumInstructions
+= i
;
2086 fpi
= &prog
->Instructions
[prog
->NumInstructions
-1];
// The copied program must still end in OPCODE_END.
2088 assert(fpi
->Opcode
== OPCODE_END
);
// Patch WPOS reads in the original instructions, which now start at
// index 3.
2090 for(fpi
= &prog
->Instructions
[3]; fpi
->Opcode
!= OPCODE_END
; fpi
++){
2092 if( fpi
->SrcReg
[i
].File
== PROGRAM_INPUT
&&
2093 fpi
->SrcReg
[i
].Index
== FRAG_ATTRIB_WPOS
){
2094 fpi
->SrcReg
[i
].File
= PROGRAM_TEMPORARY
;
2095 fpi
->SrcReg
[i
].Index
= tempregi
;
2100 /* - Init structures
2101 * - Determine what hwregs each input corresponds to
// Reset the compile state of rp and bind hardware registers to the
// fragment inputs the program reads, in this order: texture coordinates,
// WPOS (which also calls insert_wpos()), primary colour, secondary colour.
// Input bits left over after that are reported with WARN_ONCE and mapped
// to hardware register 0. Finally the instruction stream is scanned once to
// seed reference counts for temporaries and inputs, and the set of
// temporaries seen is stored in cs->temp_in_use.
// NOTE(review): the declarations of i, j and idx and some loop headers are
// not visible in this extract.
2103 static void init_program(r300ContextPtr r300
, struct r300_fragment_program
*rp
)
2105 struct r300_pfs_compile_state
*cs
= NULL
;
2106 struct gl_fragment_program
*mp
= &rp
->mesa_program
;
2107 struct prog_instruction
*fpi
;
2108 GLuint InputsRead
= mp
->Base
.InputsRead
;
2109 GLuint temps_used
= 0; /* for rp->temps[] */
2112 /* New compile, reset tracking data */
2113 rp
->optimization
= driQueryOptioni(&r300
->radeon
.optionCache
, "fp_optimization");
2114 rp
->translated
= GL_FALSE
;
2115 rp
->error
= GL_FALSE
;
2116 rp
->cs
= cs
= &(R300_CONTEXT(rp
->ctx
)->state
.pfs_compile
);
2119 rp
->first_node_has_tex
= 0;
2121 rp
->max_temp_idx
= 0;
2122 rp
->node
[0].alu_end
= -1;
2123 rp
->node
[0].tex_end
= -1;
// Clear the shared compile state, then mark every slot source as a
// constant.
2125 _mesa_memset(cs
, 0, sizeof(*rp
->cs
));
2126 for (i
=0;i
<PFS_MAX_ALU_INST
;i
++) {
2128 cs
->slot
[i
].vsrc
[j
] = SRC_CONST
;
2129 cs
->slot
[i
].ssrc
[j
] = SRC_CONST
;
2133 /* Work out what temps the Mesa inputs correspond to, this must match
2134 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
2135 * configures itself based on the fragprog's InputsRead
2137 * NOTE: this depends on get_hw_temp() allocating registers in order,
2138 * starting from register 0.
2141 /* Texcoords come first */
2142 for (i
=0;i
<rp
->ctx
->Const
.MaxTextureUnits
;i
++) {
2143 if (InputsRead
& (FRAG_BIT_TEX0
<< i
)) {
2144 cs
->inputs
[FRAG_ATTRIB_TEX0
+i
].refcount
= 0;
2145 cs
->inputs
[FRAG_ATTRIB_TEX0
+i
].reg
= get_hw_temp(rp
, 0);
2148 InputsRead
&= ~FRAG_BITS_TEX_ANY
;
2150 /* fragment position treated as a texcoord */
2151 if (InputsRead
& FRAG_BIT_WPOS
) {
2152 cs
->inputs
[FRAG_ATTRIB_WPOS
].refcount
= 0;
2153 cs
->inputs
[FRAG_ATTRIB_WPOS
].reg
= get_hw_temp(rp
, 0);
2154 insert_wpos(&mp
->Base
);
2156 InputsRead
&= ~FRAG_BIT_WPOS
;
2158 /* Then primary colour */
2159 if (InputsRead
& FRAG_BIT_COL0
) {
2160 cs
->inputs
[FRAG_ATTRIB_COL0
].refcount
= 0;
2161 cs
->inputs
[FRAG_ATTRIB_COL0
].reg
= get_hw_temp(rp
, 0);
2163 InputsRead
&= ~FRAG_BIT_COL0
;
2165 /* Secondary color */
2166 if (InputsRead
& FRAG_BIT_COL1
) {
2167 cs
->inputs
[FRAG_ATTRIB_COL1
].refcount
= 0;
2168 cs
->inputs
[FRAG_ATTRIB_COL1
].reg
= get_hw_temp(rp
, 0);
2170 InputsRead
&= ~FRAG_BIT_COL1
;
// Whatever is still set in InputsRead has no dedicated handling.
2174 WARN_ONCE("Don't know how to handle inputs 0x%x\n",
2176 /* force read from hwreg 0 for now */
2178 if (InputsRead
& (1<<i
)) cs
->inputs
[i
].reg
= 0;
2181 /* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
2182 * That way, we can free up the reg when it's no longer needed
2184 if (!mp
->Base
.Instructions
) {
2185 ERROR("No instructions found in program\n");
2189 for (fpi
=mp
->Base
.Instructions
;fpi
->Opcode
!= OPCODE_END
; fpi
++) {
// Source operands: the first sighting of a temporary initialises its
// entry, later ones bump the refcount.
2193 idx
= fpi
->SrcReg
[i
].Index
;
2194 switch (fpi
->SrcReg
[i
].File
) {
2195 case PROGRAM_TEMPORARY
:
2196 if (!(temps_used
& (1<<idx
))) {
2197 cs
->temps
[idx
].reg
= -1;
2198 cs
->temps
[idx
].refcount
= 1;
2199 temps_used
|= (1 << idx
);
2201 cs
->temps
[idx
].refcount
++;
2204 cs
->inputs
[idx
].refcount
++;
// Destination operand: the same bookkeeping for temporaries.
2210 idx
= fpi
->DstReg
.Index
;
2211 if (fpi
->DstReg
.File
== PROGRAM_TEMPORARY
) {
2212 if (!(temps_used
& (1<<idx
))) {
2213 cs
->temps
[idx
].reg
= -1;
2214 cs
->temps
[idx
].refcount
= 1;
2215 temps_used
|= (1 << idx
);
2217 cs
->temps
[idx
].refcount
++;
2220 cs
->temp_in_use
= temps_used
;
2223 static void update_params(struct r300_fragment_program
*rp
)
2225 struct gl_fragment_program
*mp
= &rp
->mesa_program
;
2227 /* Ask Mesa nicely to fill in ParameterValues for us */
2228 if (mp
->Base
.Parameters
)
2229 _mesa_load_state_parameters(rp
->ctx
, mp
->Base
.Parameters
);
// Translate rp into hardware form unless it has been translated already.
// Runs init_program() and parse_program(), then closes the current node
// (alu_end and tex_end), records the overall ALU and TEX end indices,
// marks the program as translated, dumps it when DEBUG_PIXEL is set in
// RADEON_DEBUG and calls r300UpdateStateParameters() with _NEW_PROGRAM.
// NOTE(review): the assignment of cs, the body of the parse failure branch
// and any statements after the translated block are not visible in this
// extract.
2232 void r300_translate_fragment_shader(r300ContextPtr r300
, struct r300_fragment_program
*rp
)
2234 struct r300_pfs_compile_state
*cs
= NULL
;
2236 if (!rp
->translated
) {
2238 init_program(r300
, rp
);
2241 if (parse_program(rp
) == GL_FALSE
) {
// alu_end of the current node is relative to the node's alu_offset.
2247 rp
->node
[rp
->cur_node
].alu_end
=
2248 cs
->nrslots
- rp
->node
[rp
->cur_node
].alu_offset
- 1;
// A node without texture instructions still gets tex_end 0.
2249 if (rp
->node
[rp
->cur_node
].tex_end
< 0)
2250 rp
->node
[rp
->cur_node
].tex_end
= 0;
2252 rp
->alu_end
= cs
->nrslots
- 1;
2254 rp
->tex_end
= rp
->tex
.length
? rp
->tex
.length
- 1 : 0;
// At least one ALU slot must have been emitted.
2255 assert(rp
->node
[rp
->cur_node
].alu_end
>= 0);
2256 assert(rp
->alu_end
>= 0);
2258 rp
->translated
= GL_TRUE
;
2259 if (RADEON_DEBUG
& DEBUG_PIXEL
) dump_program(rp
);
2260 r300UpdateStateParameters(rp
->ctx
, _NEW_PROGRAM
);
2266 /* just some random things... */
2267 static void dump_program(struct r300_fragment_program
*rp
)
2272 fprintf(stderr
, "pc=%d*************************************\n", pc
++);
2274 fprintf(stderr
, "Mesa program:\n");
2275 fprintf(stderr
, "-------------\n");
2276 _mesa_print_program(&rp
->mesa_program
.Base
);
2279 fprintf(stderr
, "Hardware program\n");
2280 fprintf(stderr
, "----------------\n");
2282 for (n
= 0; n
< (rp
->cur_node
+1); n
++) {
2283 fprintf(stderr
, "NODE %d: alu_offset: %d, tex_offset: %d, "\
2284 "alu_end: %d, tex_end: %d\n", n
,
2285 rp
->node
[n
].alu_offset
,
2286 rp
->node
[n
].tex_offset
,
2287 rp
->node
[n
].alu_end
,
2288 rp
->node
[n
].tex_end
);
2290 if (rp
->tex
.length
) {
2291 fprintf(stderr
, " TEX:\n");
2292 for(i
= rp
->node
[n
].tex_offset
; i
<= rp
->node
[n
].tex_offset
+rp
->node
[n
].tex_end
; ++i
) {
2295 switch((rp
->tex
.inst
[i
] >> R300_FPITX_OPCODE_SHIFT
) & 15) {
2296 case R300_FPITX_OP_TEX
:
2299 case R300_FPITX_OP_KIL
:
2302 case R300_FPITX_OP_TXP
:
2305 case R300_FPITX_OP_TXB
:
2312 fprintf(stderr
, " %s t%i, %c%i, texture[%i] (%08x)\n",
2314 (rp
->tex
.inst
[i
] >> R300_FPITX_DST_SHIFT
) & 31,
2315 (rp
->tex
.inst
[i
] & R300_FPITX_SRC_CONST
) ? 'c': 't',
2316 (rp
->tex
.inst
[i
] >> R300_FPITX_SRC_SHIFT
) & 31,
2317 (rp
->tex
.inst
[i
] & R300_FPITX_IMAGE_MASK
) >> R300_FPITX_IMAGE_SHIFT
,
2322 for(i
= rp
->node
[n
].alu_offset
; i
<= rp
->node
[n
].alu_offset
+rp
->node
[n
].alu_end
; ++i
) {
2323 char srcc
[3][10], dstc
[20];
2324 char srca
[3][10], dsta
[20];
2327 char flags
[5], tmp
[10];
2329 for(j
= 0; j
< 3; ++j
) {
2330 int regc
= rp
->alu
.inst
[i
].inst1
>> (j
*6);
2331 int rega
= rp
->alu
.inst
[i
].inst3
>> (j
*6);
2333 sprintf(srcc
[j
], "%c%i", (regc
& 32) ? 'c' : 't', regc
& 31);
2334 sprintf(srca
[j
], "%c%i", (rega
& 32) ? 'c' : 't', rega
& 31);
2338 sprintf(flags
, "%s%s%s",
2339 (rp
->alu
.inst
[i
].inst1
& R300_FPI1_DSTC_REG_X
) ? "x" : "",
2340 (rp
->alu
.inst
[i
].inst1
& R300_FPI1_DSTC_REG_Y
) ? "y" : "",
2341 (rp
->alu
.inst
[i
].inst1
& R300_FPI1_DSTC_REG_Z
) ? "z" : "");
2342 if (flags
[0] != 0) {
2343 sprintf(dstc
, "t%i.%s ",
2344 (rp
->alu
.inst
[i
].inst1
>> R300_FPI1_DSTC_SHIFT
) & 31,
2347 sprintf(flags
, "%s%s%s",
2348 (rp
->alu
.inst
[i
].inst1
& R300_FPI1_DSTC_OUTPUT_X
) ? "x" : "",
2349 (rp
->alu
.inst
[i
].inst1
& R300_FPI1_DSTC_OUTPUT_Y
) ? "y" : "",
2350 (rp
->alu
.inst
[i
].inst1
& R300_FPI1_DSTC_OUTPUT_Z
) ? "z" : "");
2351 if (flags
[0] != 0) {
2352 sprintf(tmp
, "o%i.%s",
2353 (rp
->alu
.inst
[i
].inst1
>> R300_FPI1_DSTC_SHIFT
) & 31,
2359 if (rp
->alu
.inst
[i
].inst3
& R300_FPI3_DSTA_REG
) {
2360 sprintf(dsta
, "t%i.w ", (rp
->alu
.inst
[i
].inst3
>> R300_FPI3_DSTA_SHIFT
) & 31);
2362 if (rp
->alu
.inst
[i
].inst3
& R300_FPI3_DSTA_OUTPUT
) {
2363 sprintf(tmp
, "o%i.w ", (rp
->alu
.inst
[i
].inst3
>> R300_FPI3_DSTA_SHIFT
) & 31);
2366 if (rp
->alu
.inst
[i
].inst3
& R300_FPI3_DSTA_DEPTH
) {
2370 fprintf(stderr
, "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
2371 " w: %3s %3s %3s -> %-20s (%08x)\n",
2373 srcc
[0], srcc
[1], srcc
[2], dstc
, rp
->alu
.inst
[i
].inst1
,
2374 srca
[0], srca
[1], srca
[2], dsta
, rp
->alu
.inst
[i
].inst3
);
2376 for(j
= 0; j
< 3; ++j
) {
2377 int regc
= rp
->alu
.inst
[i
].inst0
>> (j
*7);
2378 int rega
= rp
->alu
.inst
[i
].inst2
>> (j
*7);
2385 case R300_FPI0_ARGC_SRC0C_XYZ
:
2386 sprintf(buf
, "%s.xyz", srcc
[d
/ 4]);
2388 case R300_FPI0_ARGC_SRC0C_XXX
:
2389 sprintf(buf
, "%s.xxx", srcc
[d
/ 4]);
2391 case R300_FPI0_ARGC_SRC0C_YYY
:
2392 sprintf(buf
, "%s.yyy", srcc
[d
/ 4]);
2394 case R300_FPI0_ARGC_SRC0C_ZZZ
:
2395 sprintf(buf
, "%s.zzz", srcc
[d
/ 4]);
2398 } else if (d
< 15) {
2399 sprintf(buf
, "%s.www", srca
[d
-12]);
2400 } else if (d
== 20) {
2401 sprintf(buf
, "0.0");
2402 } else if (d
== 21) {
2403 sprintf(buf
, "1.0");
2404 } else if (d
== 22) {
2405 sprintf(buf
, "0.5");
2406 } else if (d
>= 23 && d
< 32) {
2410 sprintf(buf
, "%s.yzx", srcc
[d
% 3]);
2413 sprintf(buf
, "%s.zxy", srcc
[d
% 3]);
2416 sprintf(buf
, "%s.Wzy", srcc
[d
% 3]);
2420 sprintf(buf
, "%i", d
);
2423 sprintf(argc
[j
], "%s%s%s%s",
2424 (regc
& 32) ? "-" : "",
2425 (regc
& 64) ? "|" : "",
2427 (regc
& 64) ? "|" : "");
2431 sprintf(buf
, "%s.%c", srcc
[d
/ 3], 'x' + (char)(d
%3));
2432 } else if (d
< 12) {
2433 sprintf(buf
, "%s.w", srca
[d
-9]);
2434 } else if (d
== 16) {
2435 sprintf(buf
, "0.0");
2436 } else if (d
== 17) {
2437 sprintf(buf
, "1.0");
2438 } else if (d
== 18) {
2439 sprintf(buf
, "0.5");
2441 sprintf(buf
, "%i", d
);
2444 sprintf(arga
[j
], "%s%s%s%s",
2445 (rega
& 32) ? "-" : "",
2446 (rega
& 64) ? "|" : "",
2448 (rega
& 64) ? "|" : "");
2451 fprintf(stderr
, " xyz: %8s %8s %8s op: %08x\n"
2452 " w: %8s %8s %8s op: %08x\n",
2453 argc
[0], argc
[1], argc
[2], rp
->alu
.inst
[i
].inst0
,
2454 arga
[0], arga
[1], arga
[2], rp
->alu
.inst
[i
].inst2
);