2 * Copyright (C) 2005 Ben Skeggs.
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 * Ben Skeggs <darktama@iinet.net.au>
31 * Jerome Glisse <j.glisse@gmail.com>
36 * - Depth write, WPOS/FOGC inputs
38 * - Verify results of opcodes for accuracy, I've only checked them
46 #include "shader/prog_instruction.h"
47 #include "shader/prog_parameter.h"
48 #include "shader/prog_print.h"
50 #include "r300_context.h"
51 #include "r300_fragprog.h"
53 #include "r300_state.h"
56 * Useful macros and values
58 #define ERROR(fmt, args...) do { \
59 fprintf(stderr, "%s::%s(): " fmt "\n", \
60 __FILE__, __func__, ##args); \
61 rp->error = GL_TRUE; \
64 #define PFS_INVAL 0xFFFFFFFF
65 #define COMPILE_STATE struct r300_pfs_compile_state *cs = rp->cs
77 #define SWIZZLE_HHH 10
79 #define swizzle(r, x, y, z, w) do_swizzle(rp, r, \
/* Register categories stored in the REG_TYPE field of a packed register */
#define REG_TYPE_INPUT		0
#define REG_TYPE_OUTPUT		1
#define REG_TYPE_TEMP		2
#define REG_TYPE_CONST		3

/* Bit positions of the fields inside a packed register descriptor */
#define REG_TYPE_SHIFT		0
#define REG_INDEX_SHIFT		2
#define REG_VSWZ_SHIFT		8
#define REG_SSWZ_SHIFT		13
#define REG_NEGV_SHIFT		18
#define REG_NEGS_SHIFT		19
#define REG_ABS_SHIFT		20
#define REG_NO_USE_SHIFT	21 // Hack for refcounting
#define REG_VALID_SHIFT		22 // Does the register contain a defined value?
#define REG_BUILTIN_SHIFT	23 // Is it a builtin (like all zero/all one)?

#define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
#define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
#define REG_VSWZ_MASK		(0x1F << REG_VSWZ_SHIFT)
#define REG_SSWZ_MASK		(0x1F << REG_SSWZ_SHIFT)
#define REG_NEGV_MASK		(0x01 << REG_NEGV_SHIFT)
#define REG_NEGS_MASK		(0x01 << REG_NEGS_SHIFT)
#define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
#define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
#define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)
#define REG_BUILTIN_MASK	(0x01 << REG_BUILTIN_SHIFT)

/*
 * Build a packed register from its fields.  Every argument is
 * parenthesised so that compound expressions (e.g. "a | b") are shifted
 * and masked as a whole.  The negate/abs bits are left clear.
 */
#define REG(type, index, vswz, sswz, nouse, valid, builtin)		\
	((((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK) |			\
	 (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK) |		\
	 (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
	 (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK) |		\
	 (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) |	\
	 (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |			\
	 (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))

/* Field extractors: yield the field value shifted down to bit 0 */
#define REG_GET_TYPE(reg)						\
	(((reg) & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
#define REG_GET_INDEX(reg)						\
	(((reg) & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
#define REG_GET_VSWZ(reg)						\
	(((reg) & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
#define REG_GET_SSWZ(reg)						\
	(((reg) & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
#define REG_GET_NO_USE(reg)						\
	(((reg) & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
#define REG_GET_VALID(reg)						\
	(((reg) & REG_VALID_MASK) >> REG_VALID_SHIFT)
#define REG_GET_BUILTIN(reg)						\
	(((reg) & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)

/*
 * Field setters: "reg" must be a modifiable lvalue and is assigned in
 * place.  Values wider than the field are truncated by the mask.
 */
#define REG_SET_TYPE(reg, type)						\
	((reg) = (((reg) & ~REG_TYPE_MASK) |				\
		  (((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK)))
#define REG_SET_INDEX(reg, index)					\
	((reg) = (((reg) & ~REG_INDEX_MASK) |				\
		  (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK)))
#define REG_SET_VSWZ(reg, vswz)						\
	((reg) = (((reg) & ~REG_VSWZ_MASK) |				\
		  (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK)))
#define REG_SET_SSWZ(reg, sswz)						\
	((reg) = (((reg) & ~REG_SSWZ_MASK) |				\
		  (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK)))
#define REG_SET_NO_USE(reg, nouse)					\
	((reg) = (((reg) & ~REG_NO_USE_MASK) |				\
		  (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK)))
#define REG_SET_VALID(reg, valid)					\
	((reg) = (((reg) & ~REG_VALID_MASK) |				\
		  (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK)))
#define REG_SET_BUILTIN(reg, builtin)					\
	((reg) = (((reg) & ~REG_BUILTIN_MASK) |				\
		  (((builtin) << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK)))

/* Flag setters: OR the absolute-value / negate bits into "reg" */
#define REG_ABS(reg)							\
	((reg) = ((reg) | REG_ABS_MASK))
#define REG_NEGV(reg)							\
	((reg) = ((reg) | REG_NEGV_MASK))
#define REG_NEGS(reg)							\
	((reg) = ((reg) | REG_NEGS_MASK))
165 * Data structures for fragment program generation
168 /* description of r300 native hw instructions */
169 static const struct {
175 { "MAD", 3, R300_FPI0_OUTC_MAD
, R300_FPI2_OUTA_MAD
},
176 { "DP3", 2, R300_FPI0_OUTC_DP3
, R300_FPI2_OUTA_DP4
},
177 { "DP4", 2, R300_FPI0_OUTC_DP4
, R300_FPI2_OUTA_DP4
},
178 { "MIN", 2, R300_FPI0_OUTC_MIN
, R300_FPI2_OUTA_MIN
},
179 { "MAX", 2, R300_FPI0_OUTC_MAX
, R300_FPI2_OUTA_MAX
},
180 { "CMP", 3, R300_FPI0_OUTC_CMP
, R300_FPI2_OUTA_CMP
},
181 { "FRC", 1, R300_FPI0_OUTC_FRC
, R300_FPI2_OUTA_FRC
},
182 { "EX2", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_EX2
},
183 { "LG2", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_LG2
},
184 { "RCP", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_RCP
},
185 { "RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_RSQ
},
186 { "REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA
, PFS_INVAL
},
187 { "CMPH", 3, R300_FPI0_OUTC_CMPH
, PFS_INVAL
},
191 /* vector swizzles r300 can support natively, with a couple of
192 * cases we handle specially
194 * REG_VSWZ/REG_SSWZ is an index into this table
197 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
198 #define SWIZZLE_HALF 6
200 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
204 static const struct r300_pfs_swizzle
{
205 GLuint hash
; /* swizzle value this matches */
206 GLuint base
; /* base value for hw swizzle */
207 GLuint stride
; /* difference in base between arg0/1/2 */
210 /* native swizzles */
211 { MAKE_SWZ3(X
, Y
, Z
), R300_FPI0_ARGC_SRC0C_XYZ
, 4, SLOT_SRC_VECTOR
},
212 { MAKE_SWZ3(X
, X
, X
), R300_FPI0_ARGC_SRC0C_XXX
, 4, SLOT_SRC_VECTOR
},
213 { MAKE_SWZ3(Y
, Y
, Y
), R300_FPI0_ARGC_SRC0C_YYY
, 4, SLOT_SRC_VECTOR
},
214 { MAKE_SWZ3(Z
, Z
, Z
), R300_FPI0_ARGC_SRC0C_ZZZ
, 4, SLOT_SRC_VECTOR
},
215 { MAKE_SWZ3(W
, W
, W
), R300_FPI0_ARGC_SRC0A
, 1, SLOT_SRC_SCALAR
},
216 { MAKE_SWZ3(Y
, Z
, X
), R300_FPI0_ARGC_SRC0C_YZX
, 1, SLOT_SRC_VECTOR
},
217 { MAKE_SWZ3(Z
, X
, Y
), R300_FPI0_ARGC_SRC0C_ZXY
, 1, SLOT_SRC_VECTOR
},
218 { MAKE_SWZ3(W
, Z
, Y
), R300_FPI0_ARGC_SRC0CA_WZY
, 1, SLOT_SRC_BOTH
},
219 { MAKE_SWZ3(ONE
, ONE
, ONE
), R300_FPI0_ARGC_ONE
, 0, 0},
220 { MAKE_SWZ3(ZERO
, ZERO
, ZERO
), R300_FPI0_ARGC_ZERO
, 0, 0},
221 { MAKE_SWZ3(HALF
, HALF
, HALF
), R300_FPI0_ARGC_HALF
, 0, 0},
222 { PFS_INVAL
, 0, 0, 0},
225 /* used during matching of non-native swizzles */
226 #define SWZ_X_MASK (7 << 0)
227 #define SWZ_Y_MASK (7 << 3)
228 #define SWZ_Z_MASK (7 << 6)
229 #define SWZ_W_MASK (7 << 9)
230 static const struct {
231 GLuint hash
; /* used to mask matching swizzle components */
232 int mask
; /* actual outmask */
233 int count
; /* count of components matched */
235 { SWZ_X_MASK
|SWZ_Y_MASK
|SWZ_Z_MASK
, 1|2|4, 3},
236 { SWZ_X_MASK
|SWZ_Y_MASK
, 1|2, 2},
237 { SWZ_X_MASK
|SWZ_Z_MASK
, 1|4, 2},
238 { SWZ_Y_MASK
|SWZ_Z_MASK
, 2|4, 2},
242 { PFS_INVAL
, PFS_INVAL
, PFS_INVAL
}
245 static const struct {
246 int base
; /* hw value of swizzle */
247 int stride
; /* difference between SRC0/1/2 */
250 { R300_FPI2_ARGA_SRC0C_X
, 3, SLOT_SRC_VECTOR
},
251 { R300_FPI2_ARGA_SRC0C_Y
, 3, SLOT_SRC_VECTOR
},
252 { R300_FPI2_ARGA_SRC0C_Z
, 3, SLOT_SRC_VECTOR
},
253 { R300_FPI2_ARGA_SRC0A
, 1, SLOT_SRC_SCALAR
},
254 { R300_FPI2_ARGA_ZERO
, 0, 0 },
255 { R300_FPI2_ARGA_ONE
, 0, 0 },
256 { R300_FPI2_ARGA_HALF
, 0, 0 }
259 /* boiler-plate reg, for convenience */
260 static const GLuint undef
= REG(REG_TYPE_TEMP
,
268 /* constant one source */
269 static const GLuint pfs_one
= REG(REG_TYPE_CONST
,
277 /* constant half source */
278 static const GLuint pfs_half
= REG(REG_TYPE_CONST
,
286 /* constant zero source */
287 static const GLuint pfs_zero
= REG(REG_TYPE_CONST
,
296 * Common functions prototypes
298 static void dump_program(struct r300_fragment_program
*rp
);
299 static void emit_arith(struct r300_fragment_program
*rp
, int op
,
300 GLuint dest
, int mask
,
301 GLuint src0
, GLuint src1
, GLuint src2
,
305 * Get an R300 temporary that can be written to in the given slot.
307 static int get_hw_temp(struct r300_fragment_program
*rp
, int slot
)
312 for(r
= 0; r
< PFS_NUM_TEMP_REGS
; ++r
) {
313 if (cs
->hwtemps
[r
].free
>= 0 && cs
->hwtemps
[r
].free
<= slot
)
317 if (r
>= PFS_NUM_TEMP_REGS
) {
318 ERROR("Out of hardware temps\n");
322 // Reserved is used to avoid the following scenario:
323 // R300 temporary X is first assigned to Mesa temporary Y during vector ops
324 // R300 temporary X is then assigned to Mesa temporary Z for further vector ops
325 // Then scalar ops on Mesa temporary Z are emitted and move back in time
326 // to overwrite the value of temporary Y.
328 cs
->hwtemps
[r
].reserved
= cs
->hwtemps
[r
].free
;
329 cs
->hwtemps
[r
].free
= -1;
331 // Reset to some value that won't mess things up when the user
332 // tries to read from a temporary that hasn't been assigned a value yet.
333 // In the normal case, vector_valid and scalar_valid should be set to
334 // a sane value by the first emit that writes to this temporary.
335 cs
->hwtemps
[r
].vector_valid
= 0;
336 cs
->hwtemps
[r
].scalar_valid
= 0;
338 if (r
> rp
->max_temp_idx
)
339 rp
->max_temp_idx
= r
;
345 * Get an R300 temporary that will act as a TEX destination register.
347 static int get_hw_temp_tex(struct r300_fragment_program
*rp
)
352 for(r
= 0; r
< PFS_NUM_TEMP_REGS
; ++r
) {
353 if (cs
->used_in_node
& (1 << r
))
356 // Note: Be very careful here
357 if (cs
->hwtemps
[r
].free
>= 0 && cs
->hwtemps
[r
].free
<= 0)
361 if (r
>= PFS_NUM_TEMP_REGS
)
362 return get_hw_temp(rp
, 0); /* Will cause an indirection */
364 cs
->hwtemps
[r
].reserved
= cs
->hwtemps
[r
].free
;
365 cs
->hwtemps
[r
].free
= -1;
367 // Reset to some value that won't mess things up when the user
368 // tries to read from a temporary that hasn't been assigned a value yet.
369 // In the normal case, vector_valid and scalar_valid should be set to
370 // a sane value by the first emit that writes to this temporary.
371 cs
->hwtemps
[r
].vector_valid
= cs
->nrslots
;
372 cs
->hwtemps
[r
].scalar_valid
= cs
->nrslots
;
374 if (r
> rp
->max_temp_idx
)
375 rp
->max_temp_idx
= r
;
381 * Mark the given hardware register as free.
383 static void free_hw_temp(struct r300_fragment_program
*rp
, int idx
)
387 // Be very careful here. Consider sequences like
390 // The TEX instruction may be moved in front of the MAD instruction
391 // due to the way nodes work. We don't want to alias r1 and r4 in
393 // I'm certain the register allocation could be further sanitized,
394 // but it's tricky because of stuff that can happen inside emit_tex
396 cs
->hwtemps
[idx
].free
= cs
->nrslots
+1;
401 * Create a new Mesa temporary register.
403 static GLuint
get_temp_reg(struct r300_fragment_program
*rp
)
409 index
= ffs(~cs
->temp_in_use
);
411 ERROR("Out of program temps\n");
415 cs
->temp_in_use
|= (1 << --index
);
416 cs
->temps
[index
].refcount
= 0xFFFFFFFF;
417 cs
->temps
[index
].reg
= -1;
419 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
420 REG_SET_INDEX(r
, index
);
421 REG_SET_VALID(r
, GL_TRUE
);
426 * Create a new Mesa temporary register that will act as the destination
427 * register for a texture read.
429 static GLuint
get_temp_reg_tex(struct r300_fragment_program
*rp
)
435 index
= ffs(~cs
->temp_in_use
);
437 ERROR("Out of program temps\n");
441 cs
->temp_in_use
|= (1 << --index
);
442 cs
->temps
[index
].refcount
= 0xFFFFFFFF;
443 cs
->temps
[index
].reg
= get_hw_temp_tex(rp
);
445 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
446 REG_SET_INDEX(r
, index
);
447 REG_SET_VALID(r
, GL_TRUE
);
452 * Free a Mesa temporary and the associated R300 temporary.
454 static void free_temp(struct r300_fragment_program
*rp
, GLuint r
)
457 GLuint index
= REG_GET_INDEX(r
);
459 if (!(cs
->temp_in_use
& (1 << index
)))
462 if (REG_GET_TYPE(r
) == REG_TYPE_TEMP
) {
463 free_hw_temp(rp
, cs
->temps
[index
].reg
);
464 cs
->temps
[index
].reg
= -1;
465 cs
->temp_in_use
&= ~(1 << index
);
466 } else if (REG_GET_TYPE(r
) == REG_TYPE_INPUT
) {
467 free_hw_temp(rp
, cs
->inputs
[index
].reg
);
468 cs
->inputs
[index
].reg
= -1;
473 * Emit a hardware constant/parameter.
475 * \p cp Stable pointer to an array of 4 floats.
476 * The pointer must be stable in the sense that it remains to be valid
477 * and hold the contents of the constant/parameter throughout the lifetime
478 * of the fragment program (actually, up until the next time the fragment
479 * program is translated).
481 static GLuint
emit_const4fv(struct r300_fragment_program
*rp
, const GLfloat
* cp
)
486 for(index
= 0; index
< rp
->const_nr
; ++index
) {
487 if (rp
->constant
[index
] == cp
)
491 if (index
>= rp
->const_nr
) {
492 if (index
>= PFS_NUM_CONST_REGS
) {
493 ERROR("Out of hw constants!\n");
498 rp
->constant
[index
] = cp
;
501 REG_SET_TYPE(reg
, REG_TYPE_CONST
);
502 REG_SET_INDEX(reg
, index
);
503 REG_SET_VALID(reg
, GL_TRUE
);
507 static inline GLuint
negate(GLuint r
)
514 /* Hack, to prevent clobbering sources used multiple times when
515 * emulating non-native instructions
517 static inline GLuint
keep(GLuint r
)
519 REG_SET_NO_USE(r
, GL_TRUE
);
523 static inline GLuint
absolute(GLuint r
)
529 static int swz_native(struct r300_fragment_program
*rp
,
534 /* Native swizzle, handle negation */
535 src
= (src
& ~REG_NEGS_MASK
) |
536 (((arbneg
>> 3) & 1) << REG_NEGS_SHIFT
);
538 if ((arbneg
& 0x7) == 0x0) {
539 src
= src
& ~REG_NEGV_MASK
;
541 } else if ((arbneg
& 0x7) == 0x7) {
542 src
|= REG_NEGV_MASK
;
545 if (!REG_GET_VALID(*r
))
546 *r
= get_temp_reg(rp
);
547 src
|= REG_NEGV_MASK
;
556 src
= src
& ~REG_NEGV_MASK
;
560 (arbneg
^ 0x7) | WRITEMASK_W
,
570 static int swz_emit_partial(struct r300_fragment_program
*rp
,
580 if (!REG_GET_VALID(*r
))
581 *r
= get_temp_reg(rp
);
583 /* A partial match, VSWZ/mask define what parts of the
584 * desired swizzle we match
586 if (mc
+ s_mask
[mask
].count
== 3) {
588 src
|= ((arbneg
>> 3) & 1) << REG_NEGS_SHIFT
;
591 tmp
= arbneg
& s_mask
[mask
].mask
;
593 tmp
= tmp
^ s_mask
[mask
].mask
;
598 arbneg
& s_mask
[mask
].mask
,
599 keep(src
) | REG_NEGV_MASK
,
604 REG_SET_NO_USE(src
, GL_TRUE
);
606 REG_SET_NO_USE(src
, GL_FALSE
);
618 REG_SET_NO_USE(src
, GL_TRUE
);
620 REG_SET_NO_USE(src
, GL_FALSE
);
625 (arbneg
& s_mask
[mask
].mask
) | wmask
,
633 REG_SET_NO_USE(src
, GL_TRUE
);
635 REG_SET_NO_USE(src
, GL_FALSE
);
637 emit_arith(rp
, PFS_OP_MAD
,
639 s_mask
[mask
].mask
| wmask
,
646 return s_mask
[mask
].count
;
649 static GLuint
do_swizzle(struct r300_fragment_program
*rp
,
659 /* If swizzling from something without an XYZW native swizzle,
660 * emit result to a temp, and do new swizzle from the temp.
663 if (REG_GET_VSWZ(src
) != SWIZZLE_XYZ
||
664 REG_GET_SSWZ(src
) != SWIZZLE_W
) {
665 GLuint temp
= get_temp_reg(rp
);
678 if (REG_GET_VSWZ(src
) != SWIZZLE_XYZ
||
679 REG_GET_SSWZ(src
) != SWIZZLE_W
) {
680 GLuint vsrcswz
= (v_swiz
[REG_GET_VSWZ(src
)].hash
& (SWZ_X_MASK
|SWZ_Y_MASK
|SWZ_Z_MASK
)) | REG_GET_SSWZ(src
) << 9;
685 for(i
=0; i
< 4; ++i
){
686 offset
= GET_SWZ(arbswz
, i
);
688 newswz
|= (offset
<= 3)?GET_SWZ(vsrcswz
, offset
) << i
*3:offset
<< i
*3;
691 arbswz
= newswz
& (SWZ_X_MASK
|SWZ_Y_MASK
|SWZ_Z_MASK
);
692 REG_SET_SSWZ(src
, GET_SWZ(newswz
, 3));
696 /* set scalar swizzling */
697 REG_SET_SSWZ(src
, GET_SWZ(arbswz
, 3));
701 vswz
= REG_GET_VSWZ(src
);
705 REG_SET_VSWZ(src
, vswz
);
706 chash
= v_swiz
[REG_GET_VSWZ(src
)].hash
&
709 if (chash
== (arbswz
& s_mask
[c_mask
].hash
)) {
710 if (s_mask
[c_mask
].count
== 3) {
711 v_match
+= swz_native(rp
,
716 v_match
+= swz_emit_partial(rp
,
727 /* Fill with something invalid.. all 0's was
728 * wrong before, matched SWIZZLE_X. So all
729 * 1's will be okay for now
731 arbswz
|= (PFS_INVAL
& s_mask
[c_mask
].hash
);
733 } while(v_swiz
[++vswz
].hash
!= PFS_INVAL
);
734 REG_SET_VSWZ(src
, SWIZZLE_XYZ
);
735 } while (s_mask
[++c_mask
].hash
!= PFS_INVAL
);
737 ERROR("should NEVER get here\n");
741 static GLuint
t_src(struct r300_fragment_program
*rp
,
742 struct prog_src_register fpsrc
)
746 switch (fpsrc
.File
) {
747 case PROGRAM_TEMPORARY
:
748 REG_SET_INDEX(r
, fpsrc
.Index
);
749 REG_SET_VALID(r
, GL_TRUE
);
750 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
753 REG_SET_INDEX(r
, fpsrc
.Index
);
754 REG_SET_VALID(r
, GL_TRUE
);
755 REG_SET_TYPE(r
, REG_TYPE_INPUT
);
757 case PROGRAM_LOCAL_PARAM
:
758 r
= emit_const4fv(rp
,
759 rp
->mesa_program
.Base
.LocalParams
[fpsrc
.Index
]);
761 case PROGRAM_ENV_PARAM
:
762 r
= emit_const4fv(rp
,
763 rp
->ctx
->FragmentProgram
.Parameters
[fpsrc
.Index
]);
765 case PROGRAM_STATE_VAR
:
766 case PROGRAM_NAMED_PARAM
:
767 r
= emit_const4fv(rp
,
768 rp
->mesa_program
.Base
.Parameters
->ParameterValues
[fpsrc
.Index
]);
771 ERROR("unknown SrcReg->File %x\n", fpsrc
.File
);
775 /* no point swizzling ONE/ZERO/HALF constants... */
776 if (REG_GET_VSWZ(r
) < SWIZZLE_111
|| REG_GET_SSWZ(r
) < SWIZZLE_ZERO
)
777 r
= do_swizzle(rp
, r
, fpsrc
.Swizzle
, fpsrc
.NegateBase
);
781 static GLuint
t_scalar_src(struct r300_fragment_program
*rp
,
782 struct prog_src_register fpsrc
)
784 struct prog_src_register src
= fpsrc
;
785 int sc
= GET_SWZ(fpsrc
.Swizzle
, 0); /* X */
787 src
.Swizzle
= ((sc
<<0)|(sc
<<3)|(sc
<<6)|(sc
<<9));
789 return t_src(rp
, src
);
792 static GLuint
t_dst(struct r300_fragment_program
*rp
,
793 struct prog_dst_register dest
)
798 case PROGRAM_TEMPORARY
:
799 REG_SET_INDEX(r
, dest
.Index
);
800 REG_SET_VALID(r
, GL_TRUE
);
801 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
804 REG_SET_TYPE(r
, REG_TYPE_OUTPUT
);
805 switch (dest
.Index
) {
806 case FRAG_RESULT_COLR
:
807 case FRAG_RESULT_DEPR
:
808 REG_SET_INDEX(r
, dest
.Index
);
809 REG_SET_VALID(r
, GL_TRUE
);
812 ERROR("Bad DstReg->Index 0x%x\n", dest
.Index
);
816 ERROR("Bad DstReg->File 0x%x\n", dest
.File
);
821 static int t_hw_src(struct r300_fragment_program
*rp
,
827 int index
= REG_GET_INDEX(src
);
829 switch(REG_GET_TYPE(src
)) {
831 /* NOTE: if reg==-1 here, a source is being read that
832 * hasn't been written to. Undefined results.
834 if (cs
->temps
[index
].reg
== -1)
835 cs
->temps
[index
].reg
= get_hw_temp(rp
, cs
->nrslots
);
837 idx
= cs
->temps
[index
].reg
;
839 if (!REG_GET_NO_USE(src
) &&
840 (--cs
->temps
[index
].refcount
== 0))
844 idx
= cs
->inputs
[index
].reg
;
846 if (!REG_GET_NO_USE(src
) &&
847 (--cs
->inputs
[index
].refcount
== 0))
848 free_hw_temp(rp
, cs
->inputs
[index
].reg
);
851 return (index
| SRC_CONST
);
853 ERROR("Invalid type for source reg\n");
854 return (0 | SRC_CONST
);
858 cs
->used_in_node
|= (1 << idx
);
863 static int t_hw_dst(struct r300_fragment_program
*rp
,
870 GLuint index
= REG_GET_INDEX(dest
);
871 assert(REG_GET_VALID(dest
));
873 switch(REG_GET_TYPE(dest
)) {
875 if (cs
->temps
[REG_GET_INDEX(dest
)].reg
== -1) {
877 cs
->temps
[index
].reg
= get_hw_temp(rp
, slot
);
879 cs
->temps
[index
].reg
= get_hw_temp_tex(rp
);
882 idx
= cs
->temps
[index
].reg
;
884 if (!REG_GET_NO_USE(dest
) &&
885 (--cs
->temps
[index
].refcount
== 0))
888 cs
->dest_in_node
|= (1 << idx
);
889 cs
->used_in_node
|= (1 << idx
);
891 case REG_TYPE_OUTPUT
:
893 case FRAG_RESULT_COLR
:
894 rp
->node
[rp
->cur_node
].flags
|= R300_PFS_NODE_OUTPUT_COLOR
;
896 case FRAG_RESULT_DEPR
:
897 rp
->node
[rp
->cur_node
].flags
|= R300_PFS_NODE_OUTPUT_DEPTH
;
903 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest
));
910 static void emit_nop(struct r300_fragment_program
*rp
)
914 if (cs
->nrslots
>= PFS_MAX_ALU_INST
) {
915 ERROR("Out of ALU instruction slots\n");
919 rp
->alu
.inst
[cs
->nrslots
].inst0
= NOP_INST0
;
920 rp
->alu
.inst
[cs
->nrslots
].inst1
= NOP_INST1
;
921 rp
->alu
.inst
[cs
->nrslots
].inst2
= NOP_INST2
;
922 rp
->alu
.inst
[cs
->nrslots
].inst3
= NOP_INST3
;
926 static void emit_tex(struct r300_fragment_program
*rp
,
927 struct prog_instruction
*fpi
,
931 GLuint coord
= t_src(rp
, fpi
->SrcReg
[0]);
932 GLuint dest
= undef
, rdest
= undef
;
934 int unit
= fpi
->TexSrcUnit
;
938 uin
= cs
->used_in_node
;
939 din
= cs
->dest_in_node
;
941 /* Resolve source/dest to hardware registers */
942 if (opcode
!= R300_FPITX_OP_KIL
) {
943 if (fpi
->TexSrcTarget
== TEXTURE_RECT_INDEX
) {
945 * Hardware uses [0..1]x[0..1] range for rectangle textures
946 * instead of [0..Width]x[0..Height].
947 * Add a scaling instruction.
949 * \todo Refactor this once we have proper rewriting/optimization
950 * support for programs.
952 GLint tokens
[6] = { STATE_INTERNAL
, STATE_R300_TEXRECT_FACTOR
, 0, 0, 0, 0 };
957 factor_index
= _mesa_add_state_reference(rp
->mesa_program
.Base
.Parameters
, tokens
);
958 factorreg
= emit_const4fv(rp
,
959 rp
->mesa_program
.Base
.Parameters
->ParameterValues
[factor_index
]);
960 tempreg
= keep(get_temp_reg(rp
));
962 emit_arith(rp
, PFS_OP_MAD
, tempreg
, WRITEMASK_XYZW
,
963 coord
, factorreg
, pfs_zero
, 0);
965 /* Ensure correct node indirection */
966 uin
= cs
->used_in_node
;
967 din
= cs
->dest_in_node
;
969 hwsrc
= t_hw_src(rp
, tempreg
, GL_TRUE
);
971 hwsrc
= t_hw_src(rp
, coord
, GL_TRUE
);
974 dest
= t_dst(rp
, fpi
->DstReg
);
976 /* r300 doesn't seem to be able to do TEX->output reg */
977 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
979 dest
= get_temp_reg_tex(rp
);
981 hwdest
= t_hw_dst(rp
, dest
, GL_TRUE
, rp
->node
[rp
->cur_node
].alu_offset
);
983 /* Use a temp that hasn't been used in this node, rather
984 * than causing an indirection
986 if (uin
& (1 << hwdest
)) {
987 free_hw_temp(rp
, hwdest
);
988 hwdest
= get_hw_temp_tex(rp
);
989 cs
->temps
[REG_GET_INDEX(dest
)].reg
= hwdest
;
994 hwsrc
= t_hw_src(rp
, coord
, GL_TRUE
);
998 /* Indirection if source has been written in this node, or if the
999 * dest has been read/written in this node
1001 if ((REG_GET_TYPE(coord
) != REG_TYPE_CONST
&&
1002 (din
& (1<<hwsrc
))) || (uin
& (1<<hwdest
))) {
1004 /* Finish off current node */
1005 if (rp
->node
[rp
->cur_node
].alu_offset
== cs
->nrslots
)
1008 rp
->node
[rp
->cur_node
].alu_end
=
1009 cs
->nrslots
- rp
->node
[rp
->cur_node
].alu_offset
- 1;
1010 assert(rp
->node
[rp
->cur_node
].alu_end
>= 0);
1012 if (++rp
->cur_node
>= PFS_MAX_TEX_INDIRECT
) {
1013 ERROR("too many levels of texture indirection\n");
1017 /* Start new node */
1018 rp
->node
[rp
->cur_node
].tex_offset
= rp
->tex
.length
;
1019 rp
->node
[rp
->cur_node
].alu_offset
= cs
->nrslots
;
1020 rp
->node
[rp
->cur_node
].tex_end
= -1;
1021 rp
->node
[rp
->cur_node
].alu_end
= -1;
1022 rp
->node
[rp
->cur_node
].flags
= 0;
1023 cs
->used_in_node
= 0;
1024 cs
->dest_in_node
= 0;
1027 if (rp
->cur_node
== 0)
1028 rp
->first_node_has_tex
= 1;
1030 rp
->tex
.inst
[rp
->tex
.length
++] = 0
1031 | (hwsrc
<< R300_FPITX_SRC_SHIFT
)
1032 | (hwdest
<< R300_FPITX_DST_SHIFT
)
1033 | (unit
<< R300_FPITX_IMAGE_SHIFT
)
1034 /* not entirely sure about this */
1035 | (opcode
<< R300_FPITX_OPCODE_SHIFT
);
1037 cs
->dest_in_node
|= (1 << hwdest
);
1038 if (REG_GET_TYPE(coord
) != REG_TYPE_CONST
)
1039 cs
->used_in_node
|= (1 << hwsrc
);
1041 rp
->node
[rp
->cur_node
].tex_end
++;
1043 /* Copy from temp to output if needed */
1044 if (REG_GET_VALID(rdest
)) {
1045 emit_arith(rp
, PFS_OP_MAD
, rdest
, WRITEMASK_XYZW
, dest
,
1046 pfs_one
, pfs_zero
, 0);
1047 free_temp(rp
, dest
);
1050 /* Free temp register */
1052 free_temp(rp
, tempreg
);
1057 * Returns the first slot where we could possibly allow writing to dest,
1058 * according to register allocation.
1060 static int get_earliest_allowed_write(
1061 struct r300_fragment_program
* rp
,
1062 GLuint dest
, int mask
)
1067 GLuint index
= REG_GET_INDEX(dest
);
1068 assert(REG_GET_VALID(dest
));
1070 switch(REG_GET_TYPE(dest
)) {
1072 if (cs
->temps
[index
].reg
== -1)
1075 idx
= cs
->temps
[index
].reg
;
1077 case REG_TYPE_OUTPUT
:
1080 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest
));
1084 pos
= cs
->hwtemps
[idx
].reserved
;
1085 if (mask
& WRITEMASK_XYZ
) {
1086 if (pos
< cs
->hwtemps
[idx
].vector_lastread
)
1087 pos
= cs
->hwtemps
[idx
].vector_lastread
;
1089 if (mask
& WRITEMASK_W
) {
1090 if (pos
< cs
->hwtemps
[idx
].scalar_lastread
)
1091 pos
= cs
->hwtemps
[idx
].scalar_lastread
;
1099 * Allocates a slot for an ALU instruction that can consist of
1100 * a vertex part or a scalar part or both.
1102 * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1103 * appropriate position (vector and/or scalar), and their positions are
1104 * recorded in the srcpos array.
1106 * This function emits instruction code for the source fetch and the
1107 * argument selection. It does not emit instruction code for the
1108 * opcode or the destination selection.
1110 * @return the index of the slot
1112 static int find_and_prepare_slot(struct r300_fragment_program
* rp
,
1131 // Determine instruction slots, whether sources are required on
1132 // vector or scalar side, and the smallest slot number where
1133 // all source registers are available
1136 used
|= SLOT_OP_VECTOR
;
1138 used
|= SLOT_OP_SCALAR
;
1140 pos
= get_earliest_allowed_write(rp
, dest
, mask
);
1142 if (rp
->node
[rp
->cur_node
].alu_offset
> pos
)
1143 pos
= rp
->node
[rp
->cur_node
].alu_offset
;
1144 for(i
= 0; i
< argc
; ++i
) {
1145 if (!REG_GET_BUILTIN(src
[i
])) {
1147 used
|= v_swiz
[REG_GET_VSWZ(src
[i
])].flags
<< i
;
1149 used
|= s_swiz
[REG_GET_SSWZ(src
[i
])].flags
<< i
;
1152 hwsrc
[i
] = t_hw_src(rp
, src
[i
], GL_FALSE
); /* Note: sideeffects wrt refcounting! */
1153 regnr
= hwsrc
[i
] & 31;
1155 if (REG_GET_TYPE(src
[i
]) == REG_TYPE_TEMP
) {
1156 if (used
& (SLOT_SRC_VECTOR
<< i
)) {
1157 if (cs
->hwtemps
[regnr
].vector_valid
> pos
)
1158 pos
= cs
->hwtemps
[regnr
].vector_valid
;
1160 if (used
& (SLOT_SRC_SCALAR
<< i
)) {
1161 if (cs
->hwtemps
[regnr
].scalar_valid
> pos
)
1162 pos
= cs
->hwtemps
[regnr
].scalar_valid
;
1167 // Find a slot that fits
1169 if (cs
->slot
[pos
].used
& used
& SLOT_OP_BOTH
)
1172 if (pos
>= cs
->nrslots
) {
1173 if (cs
->nrslots
>= PFS_MAX_ALU_INST
) {
1174 ERROR("Out of ALU instruction slots\n");
1178 rp
->alu
.inst
[pos
].inst0
= NOP_INST0
;
1179 rp
->alu
.inst
[pos
].inst1
= NOP_INST1
;
1180 rp
->alu
.inst
[pos
].inst2
= NOP_INST2
;
1181 rp
->alu
.inst
[pos
].inst3
= NOP_INST3
;
1186 // Note: When we need both parts (vector and scalar) of a source,
1187 // we always try to put them into the same position. This makes the
1188 // code easier to read, and it is optimal (i.e. one doesn't gain
1189 // anything by splitting the parts).
1190 // It also avoids headaches with swizzles that access both parts (i.e WXY)
1191 tempused
= cs
->slot
[pos
].used
;
1192 for(i
= 0; i
< 3; ++i
) {
1193 tempvsrc
[i
] = cs
->slot
[pos
].vsrc
[i
];
1194 tempssrc
[i
] = cs
->slot
[pos
].ssrc
[i
];
1197 for(i
= 0; i
< argc
; ++i
) {
1198 int flags
= (used
>> i
) & SLOT_SRC_BOTH
;
1205 for(j
= 0; j
< 3; ++j
) {
1206 if ((tempused
>> j
) & flags
& SLOT_SRC_VECTOR
) {
1207 if (tempvsrc
[j
] != hwsrc
[i
])
1211 if ((tempused
>> j
) & flags
& SLOT_SRC_SCALAR
) {
1212 if (tempssrc
[j
] != hwsrc
[i
])
1223 tempused
|= flags
<< j
;
1224 if (flags
& SLOT_SRC_VECTOR
)
1225 tempvsrc
[j
] = hwsrc
[i
];
1226 if (flags
& SLOT_SRC_SCALAR
)
1227 tempssrc
[j
] = hwsrc
[i
];
1234 // Found a slot, reserve it
1235 cs
->slot
[pos
].used
= tempused
| (used
& SLOT_OP_BOTH
);
1236 for(i
= 0; i
< 3; ++i
) {
1237 cs
->slot
[pos
].vsrc
[i
] = tempvsrc
[i
];
1238 cs
->slot
[pos
].ssrc
[i
] = tempssrc
[i
];
1241 for(i
= 0; i
< argc
; ++i
) {
1242 if (REG_GET_TYPE(src
[i
]) == REG_TYPE_TEMP
) {
1243 int regnr
= hwsrc
[i
] & 31;
1245 if (used
& (SLOT_SRC_VECTOR
<< i
)) {
1246 if (cs
->hwtemps
[regnr
].vector_lastread
< pos
)
1247 cs
->hwtemps
[regnr
].vector_lastread
= pos
;
1249 if (used
& (SLOT_SRC_SCALAR
<< i
)) {
1250 if (cs
->hwtemps
[regnr
].scalar_lastread
< pos
)
1251 cs
->hwtemps
[regnr
].scalar_lastread
= pos
;
1256 // Emit the source fetch code
1257 rp
->alu
.inst
[pos
].inst1
&= ~R300_FPI1_SRC_MASK
;
1258 rp
->alu
.inst
[pos
].inst1
|=
1259 ((cs
->slot
[pos
].vsrc
[0] << R300_FPI1_SRC0C_SHIFT
) |
1260 (cs
->slot
[pos
].vsrc
[1] << R300_FPI1_SRC1C_SHIFT
) |
1261 (cs
->slot
[pos
].vsrc
[2] << R300_FPI1_SRC2C_SHIFT
));
1263 rp
->alu
.inst
[pos
].inst3
&= ~R300_FPI3_SRC_MASK
;
1264 rp
->alu
.inst
[pos
].inst3
|=
1265 ((cs
->slot
[pos
].ssrc
[0] << R300_FPI3_SRC0A_SHIFT
) |
1266 (cs
->slot
[pos
].ssrc
[1] << R300_FPI3_SRC1A_SHIFT
) |
1267 (cs
->slot
[pos
].ssrc
[2] << R300_FPI3_SRC2A_SHIFT
));
1269 // Emit the argument selection code
1273 for(i
= 0; i
< 3; ++i
) {
1275 swz
[i
] = (v_swiz
[REG_GET_VSWZ(src
[i
])].base
+
1276 (srcpos
[i
] * v_swiz
[REG_GET_VSWZ(src
[i
])].stride
)) |
1277 ((src
[i
] & REG_NEGV_MASK
) ? ARG_NEG
: 0) |
1278 ((src
[i
] & REG_ABS_MASK
) ? ARG_ABS
: 0);
1280 swz
[i
] = R300_FPI0_ARGC_ZERO
;
1284 rp
->alu
.inst
[pos
].inst0
&=
1285 ~(R300_FPI0_ARG0C_MASK
|R300_FPI0_ARG1C_MASK
|R300_FPI0_ARG2C_MASK
);
1286 rp
->alu
.inst
[pos
].inst0
|=
1287 (swz
[0] << R300_FPI0_ARG0C_SHIFT
) |
1288 (swz
[1] << R300_FPI0_ARG1C_SHIFT
) |
1289 (swz
[2] << R300_FPI0_ARG2C_SHIFT
);
1295 for(i
= 0; i
< 3; ++i
) {
1297 swz
[i
] = (s_swiz
[REG_GET_SSWZ(src
[i
])].base
+
1298 (srcpos
[i
] * s_swiz
[REG_GET_SSWZ(src
[i
])].stride
)) |
1299 ((src
[i
] & REG_NEGV_MASK
) ? ARG_NEG
: 0) |
1300 ((src
[i
] & REG_ABS_MASK
) ? ARG_ABS
: 0);
1302 swz
[i
] = R300_FPI2_ARGA_ZERO
;
1306 rp
->alu
.inst
[pos
].inst2
&=
1307 ~(R300_FPI2_ARG0A_MASK
|R300_FPI2_ARG1A_MASK
|R300_FPI2_ARG2A_MASK
);
1308 rp
->alu
.inst
[pos
].inst2
|=
1309 (swz
[0] << R300_FPI2_ARG0A_SHIFT
) |
1310 (swz
[1] << R300_FPI2_ARG1A_SHIFT
) |
1311 (swz
[2] << R300_FPI2_ARG2A_SHIFT
);
1319 * Append an ALU instruction to the instruction list.
1321 static void emit_arith(struct r300_fragment_program
*rp
,
1331 GLuint src
[3] = { src0
, src1
, src2
};
1333 GLboolean emit_vop
, emit_sop
;
1337 vop
= r300_fpop
[op
].v_op
;
1338 sop
= r300_fpop
[op
].s_op
;
1339 argc
= r300_fpop
[op
].argc
;
1341 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
&&
1342 REG_GET_INDEX(dest
) == FRAG_RESULT_DEPR
) {
1343 if (mask
& WRITEMASK_Z
) {
1350 emit_vop
= GL_FALSE
;
1351 emit_sop
= GL_FALSE
;
1352 if ((mask
& WRITEMASK_XYZ
) || vop
== R300_FPI0_OUTC_DP3
)
1354 if ((mask
& WRITEMASK_W
) || vop
== R300_FPI0_OUTC_REPL_ALPHA
)
1357 pos
= find_and_prepare_slot(rp
, emit_vop
, emit_sop
, argc
, src
, dest
, mask
);
1361 hwdest
= t_hw_dst(rp
, dest
, GL_FALSE
, pos
); /* Note: Side effects wrt register allocation */
1363 if (flags
& PFS_FLAG_SAT
) {
1364 vop
|= R300_FPI0_OUTC_SAT
;
1365 sop
|= R300_FPI2_OUTA_SAT
;
1368 /* Throw the pieces together and get FPI0/1 */
1370 rp
->alu
.inst
[pos
].inst0
|= vop
;
1372 rp
->alu
.inst
[pos
].inst1
|= hwdest
<< R300_FPI1_DSTC_SHIFT
;
1374 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1375 if (REG_GET_INDEX(dest
) == FRAG_RESULT_COLR
) {
1376 rp
->alu
.inst
[pos
].inst1
|=
1377 (mask
& WRITEMASK_XYZ
) << R300_FPI1_DSTC_OUTPUT_MASK_SHIFT
;
1380 rp
->alu
.inst
[pos
].inst1
|=
1381 (mask
& WRITEMASK_XYZ
) << R300_FPI1_DSTC_REG_MASK_SHIFT
;
1383 cs
->hwtemps
[hwdest
].vector_valid
= pos
+1;
1387 /* And now FPI2/3 */
1389 rp
->alu
.inst
[pos
].inst2
|= sop
;
1391 if (mask
& WRITEMASK_W
) {
1392 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1393 if (REG_GET_INDEX(dest
) == FRAG_RESULT_COLR
) {
1394 rp
->alu
.inst
[pos
].inst3
|=
1395 (hwdest
<< R300_FPI3_DSTA_SHIFT
) | R300_FPI3_DSTA_OUTPUT
;
1396 } else if (REG_GET_INDEX(dest
) == FRAG_RESULT_DEPR
) {
1397 rp
->alu
.inst
[pos
].inst3
|= R300_FPI3_DSTA_DEPTH
;
1400 rp
->alu
.inst
[pos
].inst3
|=
1401 (hwdest
<< R300_FPI3_DSTA_SHIFT
) | R300_FPI3_DSTA_REG
;
1403 cs
->hwtemps
[hwdest
].scalar_valid
= pos
+1;
1412 static GLuint
get_attrib(struct r300_fragment_program
*rp
, GLuint attr
)
1414 struct gl_fragment_program
*mp
= &rp
->mesa_program
;
1417 if (!(mp
->Base
.InputsRead
& (1<<attr
))) {
1418 ERROR("Attribute %d was not provided!\n", attr
);
1422 REG_SET_TYPE(r
, REG_TYPE_INPUT
);
1423 REG_SET_INDEX(r
, attr
);
1424 REG_SET_VALID(r
, GL_TRUE
);
/*
 * Coefficient table for the parabola-based SIN/COS/SCS expansion in
 * parse_program().  Each row is uploaded as one 4-component constant with
 * emit_const4fv(rp, SinCosConsts[k]).
 *
 * How the call sites use the components:
 *   row 0: .x = 4/PI, .y = -4/(PI*PI), .z is annotated "PI" where it is
 *          read, .w is the weight "P" of the final MAD
 *   row 1: .x is the offset in "x = (x/(2*PI))+0.75", .z = 1/(2*PI),
 *          .w is annotated "2*PI" where it is read
 *
 * NOTE(review): only three of the eight initializers (and none of the
 * inner braces) survive in this listing; the remaining values must be
 * taken from the upstream file, not guessed.
 */
1429 static GLfloat SinCosConsts
[2][4] = {
1431 1.273239545, // 4/PI
1432 -0.405284735, // -4/(PI*PI)
1439 0.159154943, // 1/(2*PI)
1446 * Emit a LIT instruction.
1447 * \p flags may be PFS_FLAG_SAT
1449 * Definition of LIT (from ARB_fragment_program):
1450 * tmp = VectorLoad(op0);
1451 * if (tmp.x < 0) tmp.x = 0;
1452 * if (tmp.y < 0) tmp.y = 0;
1453 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
1454 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
1457 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
1460 * The longest path of computation is the one leading to result.z,
1461 * consisting of 5 operations. This implementation of LIT takes
1462 * 5 slots. So unless there's some special undocumented opcode,
1463 * this implementation is potentially optimal. Unfortunately,
1464 * emit_arith is a bit too conservative because it doesn't understand
1465 * partial writes to the vector component.
1467 static const GLfloat LitConst
[4] = { 127.999999, 127.999999, 127.999999, -127.999999 };
1469 static void emit_lit(struct r300_fragment_program
*rp
,
1480 cnst
= emit_const4fv(rp
, LitConst
);
1483 if ((mask
& WRITEMASK_XYZW
) != WRITEMASK_XYZW
) {
1485 } else if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1486 // LIT is typically followed by DP3/DP4, so there's no point
1487 // in creating special code for this case
1491 if (needTemporary
) {
1492 temp
= keep(get_temp_reg(rp
));
1497 // Note: The order of emit_arith inside the slots is relevant,
1498 // because emit_arith only looks at scalar vs. vector when resolving
1499 // dependencies, and it does not consider individual vector components,
1500 // so swizzling between the two parts can create fake dependencies.
1503 emit_arith(rp
, PFS_OP_MAX
, temp
, WRITEMASK_XY
,
1504 keep(src
), pfs_zero
, undef
, 0);
1505 emit_arith(rp
, PFS_OP_MAX
, temp
, WRITEMASK_W
,
1506 src
, cnst
, undef
, 0);
1509 emit_arith(rp
, PFS_OP_MIN
, temp
, WRITEMASK_Z
,
1510 swizzle(temp
, W
, W
, W
, W
), cnst
, undef
, 0);
1511 emit_arith(rp
, PFS_OP_LG2
, temp
, WRITEMASK_W
,
1512 swizzle(temp
, Y
, Y
, Y
, Y
), undef
, undef
, 0);
1515 // If desired, we saturate the y result here.
1516 // This does not affect the use as a condition variable in the CMP later
1517 emit_arith(rp
, PFS_OP_MAD
, temp
, WRITEMASK_W
,
1518 temp
, swizzle(temp
, Z
, Z
, Z
, Z
), pfs_zero
, 0);
1519 emit_arith(rp
, PFS_OP_MAD
, temp
, WRITEMASK_Y
,
1520 swizzle(temp
, X
, X
, X
, X
), pfs_one
, pfs_zero
, flags
);
1523 emit_arith(rp
, PFS_OP_MAD
, temp
, WRITEMASK_X
,
1524 pfs_one
, pfs_one
, pfs_zero
, 0);
1525 emit_arith(rp
, PFS_OP_EX2
, temp
, WRITEMASK_W
,
1526 temp
, undef
, undef
, 0);
1529 emit_arith(rp
, PFS_OP_CMP
, temp
, WRITEMASK_Z
,
1530 pfs_zero
, swizzle(temp
, W
, W
, W
, W
), negate(swizzle(temp
, Y
, Y
, Y
, Y
)), flags
);
1531 emit_arith(rp
, PFS_OP_MAD
, temp
, WRITEMASK_W
,
1532 pfs_one
, pfs_one
, pfs_zero
, 0);
1534 if (needTemporary
) {
1535 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1536 temp
, pfs_one
, pfs_zero
, flags
);
1537 free_temp(rp
, temp
);
1539 // Decrease refcount of the destination
1540 t_hw_dst(rp
, dest
, GL_FALSE
, cs
->nrslots
);
1545 static GLboolean
parse_program(struct r300_fragment_program
*rp
)
1547 struct gl_fragment_program
*mp
= &rp
->mesa_program
;
1548 const struct prog_instruction
*inst
= mp
->Base
.Instructions
;
1549 struct prog_instruction
*fpi
;
1550 GLuint src
[3], dest
, temp
[2];
1551 int flags
, mask
= 0;
1554 if (!inst
|| inst
[0].Opcode
== OPCODE_END
) {
1555 ERROR("empty program?\n");
1559 for (fpi
=mp
->Base
.Instructions
; fpi
->Opcode
!= OPCODE_END
; fpi
++) {
1560 if (fpi
->SaturateMode
== SATURATE_ZERO_ONE
)
1561 flags
= PFS_FLAG_SAT
;
1565 if (fpi
->Opcode
!= OPCODE_KIL
) {
1566 dest
= t_dst(rp
, fpi
->DstReg
);
1567 mask
= fpi
->DstReg
.WriteMask
;
1570 switch (fpi
->Opcode
) {
1572 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1573 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1574 absolute(src
[0]), pfs_one
, pfs_zero
,
1578 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1579 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1580 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1581 src
[0], pfs_one
, src
[1],
1585 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1586 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1587 src
[2] = t_src(rp
, fpi
->SrcReg
[2]);
1588 /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1589 * r300 - if src2.c < 0.0 ? src1.c : src0.c
1591 emit_arith(rp
, PFS_OP_CMP
, dest
, mask
,
1592 src
[2], src
[1], src
[0],
1597 * cos using a parabola (see SIN):
1599 * x = (x/(2*PI))+0.75
1604 temp
[0] = get_temp_reg(rp
);
1605 const_sin
[0] = emit_const4fv(rp
, SinCosConsts
[0]);
1606 const_sin
[1] = emit_const4fv(rp
, SinCosConsts
[1]);
1607 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1609 /* add 0.5*PI and do range reduction */
1611 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1612 swizzle(src
[0], X
, X
, X
, X
),
1613 swizzle(const_sin
[1], Z
, Z
, Z
, Z
),
1614 swizzle(const_sin
[1], X
, X
, X
, X
),
1617 emit_arith(rp
, PFS_OP_FRC
, temp
[0], WRITEMASK_X
,
1618 swizzle(temp
[0], X
, X
, X
, X
),
1623 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Z
,
1624 swizzle(temp
[0], X
, X
, X
, X
),
1625 swizzle(const_sin
[1], W
, W
, W
, W
), //2*PI
1626 negate(swizzle(const_sin
[0], Z
, Z
, Z
, Z
)), //-PI
1631 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
| WRITEMASK_Y
,
1632 swizzle(temp
[0], Z
, Z
, Z
, Z
),
1637 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1638 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1639 absolute(swizzle(temp
[0], Z
, Z
, Z
, Z
)),
1640 swizzle(temp
[0], X
, X
, X
, X
),
1643 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Y
,
1644 swizzle(temp
[0], X
, X
, X
, X
),
1645 absolute(swizzle(temp
[0], X
, X
, X
, X
)),
1646 negate(swizzle(temp
[0], X
, X
, X
, X
)),
1650 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1651 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1652 swizzle(const_sin
[0], W
, W
, W
, W
),
1653 swizzle(temp
[0], X
, X
, X
, X
),
1656 free_temp(rp
, temp
[0]);
1659 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1660 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1661 emit_arith(rp
, PFS_OP_DP3
, dest
, mask
,
1662 src
[0], src
[1], undef
,
1666 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1667 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1668 emit_arith(rp
, PFS_OP_DP4
, dest
, mask
,
1669 src
[0], src
[1], undef
,
1673 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1674 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1675 /* src0.xyz1 -> temp
1676 * DP4 dest, temp, src1
1679 temp
[0] = get_temp_reg(rp
);
1680 src
[0].s_swz
= SWIZZLE_ONE
;
1681 emit_arith(rp
, PFS_OP_MAD
, temp
[0], mask
,
1682 src
[0], pfs_one
, pfs_zero
,
1684 emit_arith(rp
, PFS_OP_DP4
, dest
, mask
,
1685 temp
[0], src
[1], undef
,
1687 free_temp(rp
, temp
[0]);
1689 emit_arith(rp
, PFS_OP_DP4
, dest
, mask
,
1690 swizzle(src
[0], X
, Y
, Z
, ONE
), src
[1],
1695 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1696 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1697 /* dest.y = src0.y * src1.y */
1698 if (mask
& WRITEMASK_Y
)
1699 emit_arith(rp
, PFS_OP_MAD
, dest
, WRITEMASK_Y
,
1700 keep(src
[0]), keep(src
[1]),
1702 /* dest.z = src0.z */
1703 if (mask
& WRITEMASK_Z
)
1704 emit_arith(rp
, PFS_OP_MAD
, dest
, WRITEMASK_Z
,
1705 src
[0], pfs_one
, pfs_zero
, flags
);
1707 * result.w = src1.w */
1708 if (mask
& WRITEMASK_XW
) {
1709 REG_SET_VSWZ(src
[1], SWIZZLE_111
); /*Cheat*/
1710 emit_arith(rp
, PFS_OP_MAD
, dest
,
1711 mask
& WRITEMASK_XW
,
1712 src
[1], pfs_one
, pfs_zero
,
1717 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1718 emit_arith(rp
, PFS_OP_EX2
, dest
, mask
,
1719 src
[0], undef
, undef
,
1723 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1724 temp
[0] = get_temp_reg(rp
);
1726 * MAD dest, src0, 1.0, -temp
1728 emit_arith(rp
, PFS_OP_FRC
, temp
[0], mask
,
1729 keep(src
[0]), undef
, undef
,
1731 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1732 src
[0], pfs_one
, negate(temp
[0]),
1734 free_temp(rp
, temp
[0]);
1737 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1738 emit_arith(rp
, PFS_OP_FRC
, dest
, mask
,
1739 src
[0], undef
, undef
,
1743 emit_tex(rp
, fpi
, R300_FPITX_OP_KIL
);
1746 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1747 emit_arith(rp
, PFS_OP_LG2
, dest
, mask
,
1748 src
[0], undef
, undef
,
1752 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1753 emit_lit(rp
, dest
, mask
, src
[0], flags
);
1756 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1757 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1758 src
[2] = t_src(rp
, fpi
->SrcReg
[2]);
1759 /* result = tmp0tmp1 + (1 - tmp0)tmp2
1760 * = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1761 * MAD temp, -tmp0, tmp2, tmp2
1762 * MAD result, tmp0, tmp1, temp
1764 temp
[0] = get_temp_reg(rp
);
1765 emit_arith(rp
, PFS_OP_MAD
, temp
[0], mask
,
1766 negate(keep(src
[0])), keep(src
[2]), src
[2],
1768 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1769 src
[0], src
[1], temp
[0],
1771 free_temp(rp
, temp
[0]);
1774 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1775 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1776 src
[2] = t_src(rp
, fpi
->SrcReg
[2]);
1777 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1778 src
[0], src
[1], src
[2],
1782 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1783 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1784 emit_arith(rp
, PFS_OP_MAX
, dest
, mask
,
1785 src
[0], src
[1], undef
,
1789 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1790 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1791 emit_arith(rp
, PFS_OP_MIN
, dest
, mask
,
1792 src
[0], src
[1], undef
,
1797 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1798 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1799 src
[0], pfs_one
, pfs_zero
,
1803 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1804 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1805 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1806 src
[0], src
[1], pfs_zero
,
1810 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1811 src
[1] = t_scalar_src(rp
, fpi
->SrcReg
[1]);
1812 temp
[0] = get_temp_reg(rp
);
1813 emit_arith(rp
, PFS_OP_LG2
, temp
[0], WRITEMASK_W
,
1814 src
[0], undef
, undef
,
1816 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_W
,
1817 temp
[0], src
[1], pfs_zero
,
1819 emit_arith(rp
, PFS_OP_EX2
, dest
, fpi
->DstReg
.WriteMask
,
1820 temp
[0], undef
, undef
,
1822 free_temp(rp
, temp
[0]);
1825 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1826 emit_arith(rp
, PFS_OP_RCP
, dest
, mask
,
1827 src
[0], undef
, undef
,
1831 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1832 emit_arith(rp
, PFS_OP_RSQ
, dest
, mask
,
1833 absolute(src
[0]), pfs_zero
, pfs_zero
,
1838 * scs using a parabola :
1840 * result.x = sin(-abs(x)+0.5*PI) (cos)
1841 * result.y = sin(x) (sin)
1844 temp
[0] = get_temp_reg(rp
);
1845 temp
[1] = get_temp_reg(rp
);
1846 const_sin
[0] = emit_const4fv(rp
, SinCosConsts
[0]);
1847 const_sin
[1] = emit_const4fv(rp
, SinCosConsts
[1]);
1848 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1850 /* x = -abs(x)+0.5*PI */
1851 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Z
,
1852 swizzle(const_sin
[0], Z
, Z
, Z
, Z
), //PI
1854 negate(abs(swizzle(keep(src
[0]), X
, X
, X
, X
))),
1858 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_W
,
1859 swizzle(const_sin
[0], Y
, Y
, Y
, Y
),
1860 swizzle(keep(src
[0]), X
, X
, X
, X
),
1864 /* B*x, C*x (cos) */
1865 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
| WRITEMASK_Y
,
1866 swizzle(temp
[0], Z
, Z
, Z
, Z
),
1872 emit_arith(rp
, PFS_OP_MAD
, temp
[1], WRITEMASK_W
,
1873 swizzle(const_sin
[0], X
, X
, X
, X
),
1878 /* y = B*x + C*x*abs(x) (sin)*/
1879 emit_arith(rp
, PFS_OP_MAD
, temp
[1], WRITEMASK_Z
,
1881 swizzle(temp
[0], W
, W
, W
, W
),
1882 swizzle(temp
[1], W
, W
, W
, W
),
1885 /* y = B*x + C*x*abs(x) (cos)*/
1886 emit_arith(rp
, PFS_OP_MAD
, temp
[1], WRITEMASK_W
,
1887 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1888 absolute(swizzle(temp
[0], Z
, Z
, Z
, Z
)),
1889 swizzle(temp
[0], X
, X
, X
, X
),
1892 /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1893 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
| WRITEMASK_Y
,
1894 swizzle(temp
[1], W
, Z
, Y
, X
),
1895 absolute(swizzle(temp
[1], W
, Z
, Y
, X
)),
1896 negate(swizzle(temp
[1], W
, Z
, Y
, X
)),
1900 /* dest.xy = mad(temp.xy, P, temp2.wz) */
1901 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
& (WRITEMASK_X
| WRITEMASK_Y
),
1903 swizzle(const_sin
[0], W
, W
, W
, W
),
1904 swizzle(temp
[1], W
, Z
, Y
, X
),
1907 free_temp(rp
, temp
[0]);
1908 free_temp(rp
, temp
[1]);
1911 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1912 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1913 temp
[0] = get_temp_reg(rp
);
1914 /* temp = src0 - src1
1915 * dest.c = (temp.c < 0.0) ? 0 : 1
1917 emit_arith(rp
, PFS_OP_MAD
, temp
[0], mask
,
1918 src
[0], pfs_one
, negate(src
[1]),
1920 emit_arith(rp
, PFS_OP_CMP
, dest
, mask
,
1921 pfs_one
, pfs_zero
, temp
[0],
1923 free_temp(rp
, temp
[0]);
1928 * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1929 * extra precision is obtained by weighting against
1933 temp
[0] = get_temp_reg(rp
);
1934 const_sin
[0] = emit_const4fv(rp
, SinCosConsts
[0]);
1935 const_sin
[1] = emit_const4fv(rp
, SinCosConsts
[1]);
1936 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1939 /* do range reduction */
1941 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1942 swizzle(keep(src
[0]), X
, X
, X
, X
),
1943 swizzle(const_sin
[1], Z
, Z
, Z
, Z
),
1947 emit_arith(rp
, PFS_OP_FRC
, temp
[0], WRITEMASK_X
,
1948 swizzle(temp
[0], X
, X
, X
, X
),
1953 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Z
,
1954 swizzle(temp
[0], X
, X
, X
, X
),
1955 swizzle(const_sin
[1], W
, W
, W
, W
), //2*PI
1956 negate(swizzle(const_sin
[0], Z
, Z
, Z
, Z
)), //PI
1961 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
| WRITEMASK_Y
,
1962 swizzle(temp
[0], Z
, Z
, Z
, Z
),
1967 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1968 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1969 absolute(swizzle(temp
[0], Z
, Z
, Z
, Z
)),
1970 swizzle(temp
[0], X
, X
, X
, X
),
1973 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_Y
,
1974 swizzle(temp
[0], X
, X
, X
, X
),
1975 absolute(swizzle(temp
[0], X
, X
, X
, X
)),
1976 negate(swizzle(temp
[0], X
, X
, X
, X
)),
1980 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1981 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1982 swizzle(const_sin
[0], W
, W
, W
, W
),
1983 swizzle(temp
[0], X
, X
, X
, X
),
1986 free_temp(rp
, temp
[0]);
1989 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1990 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1991 temp
[0] = get_temp_reg(rp
);
1992 /* temp = src0 - src1
1993 * dest.c = (temp.c < 0.0) ? 1 : 0
1995 emit_arith(rp
, PFS_OP_MAD
, temp
[0], mask
,
1996 src
[0], pfs_one
, negate(src
[1]),
1998 emit_arith(rp
, PFS_OP_CMP
, dest
, mask
,
1999 pfs_zero
, pfs_one
, temp
[0],
2001 free_temp(rp
, temp
[0]);
2004 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
2005 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
2006 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
2007 src
[0], pfs_one
, negate(src
[1]),
2011 emit_tex(rp
, fpi
, R300_FPITX_OP_TEX
);
2014 emit_tex(rp
, fpi
, R300_FPITX_OP_TXB
);
2017 emit_tex(rp
, fpi
, R300_FPITX_OP_TXP
);
2020 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
2021 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
2022 temp
[0] = get_temp_reg(rp
);
2023 /* temp = src0.zxy * src1.yzx */
2024 emit_arith(rp
, PFS_OP_MAD
, temp
[0], WRITEMASK_XYZ
,
2025 swizzle(keep(src
[0]), Z
, X
, Y
, W
),
2026 swizzle(keep(src
[1]), Y
, Z
, X
, W
),
2029 /* dest.xyz = src0.yzx * src1.zxy - temp
2030 * dest.w = undefined
2032 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
& WRITEMASK_XYZ
,
2033 swizzle(src
[0], Y
, Z
, X
, W
),
2034 swizzle(src
[1], Z
, X
, Y
, W
),
2038 free_temp(rp
, temp
[0]);
2042 ERROR("unknown fpi->Opcode %d\n", fpi
->Opcode
);
2054 static void insert_wpos(struct gl_program
*prog
)
2056 GLint tokens
[6] = { STATE_INTERNAL
, STATE_R300_WINDOW_DIMENSION
, 0, 0, 0, 0 };
2057 struct prog_instruction
*fpi
;
2058 GLuint window_index
;
2060 GLuint tempregi
= prog
->NumTemporaries
;
2061 /* should do something else if no temps left... */
2062 prog
->NumTemporaries
++;
2064 fpi
= _mesa_alloc_instructions (prog
->NumInstructions
+ 3);
2065 _mesa_init_instructions (fpi
, prog
->NumInstructions
+ 3);
2067 /* perspective divide */
2068 fpi
[i
].Opcode
= OPCODE_RCP
;
2070 fpi
[i
].DstReg
.File
= PROGRAM_TEMPORARY
;
2071 fpi
[i
].DstReg
.Index
= tempregi
;
2072 fpi
[i
].DstReg
.WriteMask
= WRITEMASK_W
;
2073 fpi
[i
].DstReg
.CondMask
= COND_TR
;
2075 fpi
[i
].SrcReg
[0].File
= PROGRAM_INPUT
;
2076 fpi
[i
].SrcReg
[0].Index
= FRAG_ATTRIB_WPOS
;
2077 fpi
[i
].SrcReg
[0].Swizzle
= SWIZZLE_WWWW
;
2080 fpi
[i
].Opcode
= OPCODE_MUL
;
2082 fpi
[i
].DstReg
.File
= PROGRAM_TEMPORARY
;
2083 fpi
[i
].DstReg
.Index
= tempregi
;
2084 fpi
[i
].DstReg
.WriteMask
= WRITEMASK_XYZ
;
2085 fpi
[i
].DstReg
.CondMask
= COND_TR
;
2087 fpi
[i
].SrcReg
[0].File
= PROGRAM_INPUT
;
2088 fpi
[i
].SrcReg
[0].Index
= FRAG_ATTRIB_WPOS
;
2089 fpi
[i
].SrcReg
[0].Swizzle
= SWIZZLE_XYZW
;
2091 fpi
[i
].SrcReg
[1].File
= PROGRAM_TEMPORARY
;
2092 fpi
[i
].SrcReg
[1].Index
= tempregi
;
2093 fpi
[i
].SrcReg
[1].Swizzle
= SWIZZLE_WWWW
;
2096 /* viewport transformation */
2097 window_index
= _mesa_add_state_reference(prog
->Parameters
, tokens
);
2099 fpi
[i
].Opcode
= OPCODE_MAD
;
2101 fpi
[i
].DstReg
.File
= PROGRAM_TEMPORARY
;
2102 fpi
[i
].DstReg
.Index
= tempregi
;
2103 fpi
[i
].DstReg
.WriteMask
= WRITEMASK_XYZ
;
2104 fpi
[i
].DstReg
.CondMask
= COND_TR
;
2106 fpi
[i
].SrcReg
[0].File
= PROGRAM_TEMPORARY
;
2107 fpi
[i
].SrcReg
[0].Index
= tempregi
;
2108 fpi
[i
].SrcReg
[0].Swizzle
= MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ZERO
);
2110 fpi
[i
].SrcReg
[1].File
= PROGRAM_STATE_VAR
;
2111 fpi
[i
].SrcReg
[1].Index
= window_index
;
2112 fpi
[i
].SrcReg
[1].Swizzle
= MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ZERO
);
2114 fpi
[i
].SrcReg
[2].File
= PROGRAM_STATE_VAR
;
2115 fpi
[i
].SrcReg
[2].Index
= window_index
;
2116 fpi
[i
].SrcReg
[2].Swizzle
= MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ZERO
);
2119 _mesa_copy_instructions (&fpi
[i
], prog
->Instructions
, prog
->NumInstructions
);
2121 free(prog
->Instructions
);
2123 prog
->Instructions
= fpi
;
2125 prog
->NumInstructions
+= i
;
2126 fpi
= &prog
->Instructions
[prog
->NumInstructions
-1];
2128 assert(fpi
->Opcode
== OPCODE_END
);
2130 for(fpi
= &prog
->Instructions
[3]; fpi
->Opcode
!= OPCODE_END
; fpi
++){
2132 if( fpi
->SrcReg
[i
].File
== PROGRAM_INPUT
&&
2133 fpi
->SrcReg
[i
].Index
== FRAG_ATTRIB_WPOS
){
2134 fpi
->SrcReg
[i
].File
= PROGRAM_TEMPORARY
;
2135 fpi
->SrcReg
[i
].Index
= tempregi
;
2140 /* - Init structures
2141 * - Determine what hwregs each input corresponds to
2143 static void init_program(r300ContextPtr r300
, struct r300_fragment_program
*rp
)
2145 struct r300_pfs_compile_state
*cs
= NULL
;
2146 struct gl_fragment_program
*mp
= &rp
->mesa_program
;
2147 struct prog_instruction
*fpi
;
2148 GLuint InputsRead
= mp
->Base
.InputsRead
;
2149 GLuint temps_used
= 0; /* for rp->temps[] */
2152 /* New compile, reset tracking data */
2153 rp
->optimization
= driQueryOptioni(&r300
->radeon
.optionCache
, "fp_optimization");
2154 rp
->translated
= GL_FALSE
;
2155 rp
->error
= GL_FALSE
;
2156 rp
->cs
= cs
= &(R300_CONTEXT(rp
->ctx
)->state
.pfs_compile
);
2159 rp
->first_node_has_tex
= 0;
2161 rp
->max_temp_idx
= 0;
2162 rp
->node
[0].alu_end
= -1;
2163 rp
->node
[0].tex_end
= -1;
2165 _mesa_memset(cs
, 0, sizeof(*rp
->cs
));
2166 for (i
=0;i
<PFS_MAX_ALU_INST
;i
++) {
2168 cs
->slot
[i
].vsrc
[j
] = SRC_CONST
;
2169 cs
->slot
[i
].ssrc
[j
] = SRC_CONST
;
2173 /* Work out what temps the Mesa inputs correspond to, this must match
2174 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
2175 * configures itself based on the fragprog's InputsRead
2177 * NOTE: this depends on get_hw_temp() allocating registers in order,
2178 * starting from register 0.
2181 /* Texcoords come first */
2182 for (i
=0;i
<rp
->ctx
->Const
.MaxTextureUnits
;i
++) {
2183 if (InputsRead
& (FRAG_BIT_TEX0
<< i
)) {
2184 cs
->inputs
[FRAG_ATTRIB_TEX0
+i
].refcount
= 0;
2185 cs
->inputs
[FRAG_ATTRIB_TEX0
+i
].reg
= get_hw_temp(rp
, 0);
2188 InputsRead
&= ~FRAG_BITS_TEX_ANY
;
2190 /* fragment position treated as a texcoord */
2191 if (InputsRead
& FRAG_BIT_WPOS
) {
2192 cs
->inputs
[FRAG_ATTRIB_WPOS
].refcount
= 0;
2193 cs
->inputs
[FRAG_ATTRIB_WPOS
].reg
= get_hw_temp(rp
, 0);
2194 insert_wpos(&mp
->Base
);
2196 InputsRead
&= ~FRAG_BIT_WPOS
;
2198 /* Then primary colour */
2199 if (InputsRead
& FRAG_BIT_COL0
) {
2200 cs
->inputs
[FRAG_ATTRIB_COL0
].refcount
= 0;
2201 cs
->inputs
[FRAG_ATTRIB_COL0
].reg
= get_hw_temp(rp
, 0);
2203 InputsRead
&= ~FRAG_BIT_COL0
;
2205 /* Secondary color */
2206 if (InputsRead
& FRAG_BIT_COL1
) {
2207 cs
->inputs
[FRAG_ATTRIB_COL1
].refcount
= 0;
2208 cs
->inputs
[FRAG_ATTRIB_COL1
].reg
= get_hw_temp(rp
, 0);
2210 InputsRead
&= ~FRAG_BIT_COL1
;
2214 WARN_ONCE("Don't know how to handle inputs 0x%x\n",
2216 /* force read from hwreg 0 for now */
2218 if (InputsRead
& (1<<i
)) cs
->inputs
[i
].reg
= 0;
2221 /* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
2222 * That way, we can free up the reg when it's no longer needed
2224 if (!mp
->Base
.Instructions
) {
2225 ERROR("No instructions found in program\n");
2229 for (fpi
=mp
->Base
.Instructions
;fpi
->Opcode
!= OPCODE_END
; fpi
++) {
2233 idx
= fpi
->SrcReg
[i
].Index
;
2234 switch (fpi
->SrcReg
[i
].File
) {
2235 case PROGRAM_TEMPORARY
:
2236 if (!(temps_used
& (1<<idx
))) {
2237 cs
->temps
[idx
].reg
= -1;
2238 cs
->temps
[idx
].refcount
= 1;
2239 temps_used
|= (1 << idx
);
2241 cs
->temps
[idx
].refcount
++;
2244 cs
->inputs
[idx
].refcount
++;
2250 idx
= fpi
->DstReg
.Index
;
2251 if (fpi
->DstReg
.File
== PROGRAM_TEMPORARY
) {
2252 if (!(temps_used
& (1<<idx
))) {
2253 cs
->temps
[idx
].reg
= -1;
2254 cs
->temps
[idx
].refcount
= 1;
2255 temps_used
|= (1 << idx
);
2257 cs
->temps
[idx
].refcount
++;
2260 cs
->temp_in_use
= temps_used
;
2263 static void update_params(struct r300_fragment_program
*rp
)
2265 struct gl_fragment_program
*mp
= &rp
->mesa_program
;
2267 /* Ask Mesa nicely to fill in ParameterValues for us */
2268 if (mp
->Base
.Parameters
)
2269 _mesa_load_state_parameters(rp
->ctx
, mp
->Base
.Parameters
);
2272 void r300_translate_fragment_shader(r300ContextPtr r300
, struct r300_fragment_program
*rp
)
2274 struct r300_pfs_compile_state
*cs
= NULL
;
2276 if (!rp
->translated
) {
2278 init_program(r300
, rp
);
2281 if (parse_program(rp
) == GL_FALSE
) {
2287 rp
->node
[rp
->cur_node
].alu_end
=
2288 cs
->nrslots
- rp
->node
[rp
->cur_node
].alu_offset
- 1;
2289 if (rp
->node
[rp
->cur_node
].tex_end
< 0)
2290 rp
->node
[rp
->cur_node
].tex_end
= 0;
2292 rp
->alu_end
= cs
->nrslots
- 1;
2294 rp
->tex_end
= rp
->tex
.length
? rp
->tex
.length
- 1 : 0;
2295 assert(rp
->node
[rp
->cur_node
].alu_end
>= 0);
2296 assert(rp
->alu_end
>= 0);
2298 rp
->translated
= GL_TRUE
;
2299 if (RADEON_DEBUG
& DEBUG_PIXEL
) dump_program(rp
);
2300 r300UpdateStateParameters(rp
->ctx
, _NEW_PROGRAM
);
2306 /* just some random things... */
2307 static void dump_program(struct r300_fragment_program
*rp
)
2312 fprintf(stderr
, "pc=%d*************************************\n", pc
++);
2314 fprintf(stderr
, "Mesa program:\n");
2315 fprintf(stderr
, "-------------\n");
2316 _mesa_print_program(&rp
->mesa_program
.Base
);
2319 fprintf(stderr
, "Hardware program\n");
2320 fprintf(stderr
, "----------------\n");
2322 for (n
= 0; n
< (rp
->cur_node
+1); n
++) {
2323 fprintf(stderr
, "NODE %d: alu_offset: %d, tex_offset: %d, "\
2324 "alu_end: %d, tex_end: %d\n", n
,
2325 rp
->node
[n
].alu_offset
,
2326 rp
->node
[n
].tex_offset
,
2327 rp
->node
[n
].alu_end
,
2328 rp
->node
[n
].tex_end
);
2330 if (rp
->tex
.length
) {
2331 fprintf(stderr
, " TEX:\n");
2332 for(i
= rp
->node
[n
].tex_offset
; i
<= rp
->node
[n
].tex_offset
+rp
->node
[n
].tex_end
; ++i
) {
2335 switch((rp
->tex
.inst
[i
] >> R300_FPITX_OPCODE_SHIFT
) & 15) {
2336 case R300_FPITX_OP_TEX
:
2339 case R300_FPITX_OP_KIL
:
2342 case R300_FPITX_OP_TXP
:
2345 case R300_FPITX_OP_TXB
:
2352 fprintf(stderr
, " %s t%i, %c%i, texture[%i] (%08x)\n",
2354 (rp
->tex
.inst
[i
] >> R300_FPITX_DST_SHIFT
) & 31,
2355 (rp
->tex
.inst
[i
] & R300_FPITX_SRC_CONST
) ? 'c': 't',
2356 (rp
->tex
.inst
[i
] >> R300_FPITX_SRC_SHIFT
) & 31,
2357 (rp
->tex
.inst
[i
] & R300_FPITX_IMAGE_MASK
) >> R300_FPITX_IMAGE_SHIFT
,
2362 for(i
= rp
->node
[n
].alu_offset
; i
<= rp
->node
[n
].alu_offset
+rp
->node
[n
].alu_end
; ++i
) {
2363 char srcc
[3][10], dstc
[20];
2364 char srca
[3][10], dsta
[20];
2367 char flags
[5], tmp
[10];
2369 for(j
= 0; j
< 3; ++j
) {
2370 int regc
= rp
->alu
.inst
[i
].inst1
>> (j
*6);
2371 int rega
= rp
->alu
.inst
[i
].inst3
>> (j
*6);
2373 sprintf(srcc
[j
], "%c%i", (regc
& 32) ? 'c' : 't', regc
& 31);
2374 sprintf(srca
[j
], "%c%i", (rega
& 32) ? 'c' : 't', rega
& 31);
2378 sprintf(flags
, "%s%s%s",
2379 (rp
->alu
.inst
[i
].inst1
& R300_FPI1_DSTC_REG_X
) ? "x" : "",
2380 (rp
->alu
.inst
[i
].inst1
& R300_FPI1_DSTC_REG_Y
) ? "y" : "",
2381 (rp
->alu
.inst
[i
].inst1
& R300_FPI1_DSTC_REG_Z
) ? "z" : "");
2382 if (flags
[0] != 0) {
2383 sprintf(dstc
, "t%i.%s ",
2384 (rp
->alu
.inst
[i
].inst1
>> R300_FPI1_DSTC_SHIFT
) & 31,
2387 sprintf(flags
, "%s%s%s",
2388 (rp
->alu
.inst
[i
].inst1
& R300_FPI1_DSTC_OUTPUT_X
) ? "x" : "",
2389 (rp
->alu
.inst
[i
].inst1
& R300_FPI1_DSTC_OUTPUT_Y
) ? "y" : "",
2390 (rp
->alu
.inst
[i
].inst1
& R300_FPI1_DSTC_OUTPUT_Z
) ? "z" : "");
2391 if (flags
[0] != 0) {
2392 sprintf(tmp
, "o%i.%s",
2393 (rp
->alu
.inst
[i
].inst1
>> R300_FPI1_DSTC_SHIFT
) & 31,
2399 if (rp
->alu
.inst
[i
].inst3
& R300_FPI3_DSTA_REG
) {
2400 sprintf(dsta
, "t%i.w ", (rp
->alu
.inst
[i
].inst3
>> R300_FPI3_DSTA_SHIFT
) & 31);
2402 if (rp
->alu
.inst
[i
].inst3
& R300_FPI3_DSTA_OUTPUT
) {
2403 sprintf(tmp
, "o%i.w ", (rp
->alu
.inst
[i
].inst3
>> R300_FPI3_DSTA_SHIFT
) & 31);
2406 if (rp
->alu
.inst
[i
].inst3
& R300_FPI3_DSTA_DEPTH
) {
2410 fprintf(stderr
, "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
2411 " w: %3s %3s %3s -> %-20s (%08x)\n",
2413 srcc
[0], srcc
[1], srcc
[2], dstc
, rp
->alu
.inst
[i
].inst1
,
2414 srca
[0], srca
[1], srca
[2], dsta
, rp
->alu
.inst
[i
].inst3
);
2416 for(j
= 0; j
< 3; ++j
) {
2417 int regc
= rp
->alu
.inst
[i
].inst0
>> (j
*7);
2418 int rega
= rp
->alu
.inst
[i
].inst2
>> (j
*7);
2425 case R300_FPI0_ARGC_SRC0C_XYZ
:
2426 sprintf(buf
, "%s.xyz", srcc
[d
/ 4]);
2428 case R300_FPI0_ARGC_SRC0C_XXX
:
2429 sprintf(buf
, "%s.xxx", srcc
[d
/ 4]);
2431 case R300_FPI0_ARGC_SRC0C_YYY
:
2432 sprintf(buf
, "%s.yyy", srcc
[d
/ 4]);
2434 case R300_FPI0_ARGC_SRC0C_ZZZ
:
2435 sprintf(buf
, "%s.zzz", srcc
[d
/ 4]);
2438 } else if (d
< 15) {
2439 sprintf(buf
, "%s.www", srca
[d
-12]);
2440 } else if (d
== 20) {
2441 sprintf(buf
, "0.0");
2442 } else if (d
== 21) {
2443 sprintf(buf
, "1.0");
2444 } else if (d
== 22) {
2445 sprintf(buf
, "0.5");
2446 } else if (d
>= 23 && d
< 32) {
2450 sprintf(buf
, "%s.yzx", srcc
[d
% 3]);
2453 sprintf(buf
, "%s.zxy", srcc
[d
% 3]);
2456 sprintf(buf
, "%s.Wzy", srcc
[d
% 3]);
2460 sprintf(buf
, "%i", d
);
2463 sprintf(argc
[j
], "%s%s%s%s",
2464 (regc
& 32) ? "-" : "",
2465 (regc
& 64) ? "|" : "",
2467 (regc
& 64) ? "|" : "");
2471 sprintf(buf
, "%s.%c", srcc
[d
/ 3], 'x' + (char)(d
%3));
2472 } else if (d
< 12) {
2473 sprintf(buf
, "%s.w", srca
[d
-9]);
2474 } else if (d
== 16) {
2475 sprintf(buf
, "0.0");
2476 } else if (d
== 17) {
2477 sprintf(buf
, "1.0");
2478 } else if (d
== 18) {
2479 sprintf(buf
, "0.5");
2481 sprintf(buf
, "%i", d
);
2484 sprintf(arga
[j
], "%s%s%s%s",
2485 (rega
& 32) ? "-" : "",
2486 (rega
& 64) ? "|" : "",
2488 (rega
& 64) ? "|" : "");
2491 fprintf(stderr
, " xyz: %8s %8s %8s op: %08x\n"
2492 " w: %8s %8s %8s op: %08x\n",
2493 argc
[0], argc
[1], argc
[2], rp
->alu
.inst
[i
].inst0
,
2494 arga
[0], arga
[1], arga
[2], rp
->alu
.inst
[i
].inst2
);