2 * Copyright (C) 2005 Ben Skeggs.
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 * Ben Skeggs <darktama@iinet.net.au>
31 * Jerome Glisse <j.glisse@gmail.com>
36 * - COS/SIN/SCS instructions
37 * - Depth write, WPOS/FOGC inputs
39 * - Verify results of opcodes for accuracy, I've only checked them
49 #include "program_instruction.h"
50 #include "r300_context.h"
51 #include "r300_fragprog.h"
55 * Useful macros and values
57 #define ERROR(fmt, args...) do { \
58 fprintf(stderr, "%s::%s(): " fmt "\n", \
59 __FILE__, __func__, ##args); \
60 rp->error = GL_TRUE; \
63 #define PFS_INVAL 0xFFFFFFFF
64 #define COMPILE_STATE struct r300_pfs_compile_state *cs = rp->cs
76 #define SWIZZLE_HHH 10
78 #define swizzle(r, x, y, z, w) do_swizzle(rp, r, \
/*
 * Packed register descriptor helpers.
 *
 * A register reference is a single integer made of the bit fields below
 * (field positions are given by the *_SHIFT values, widths by the *_MASK
 * values):
 *   TYPE   - one of REG_TYPE_*
 *   INDEX  - register index within its type
 *   VSWZ   - vector (xyz) swizzle selector
 *   SSWZ   - scalar (w) swizzle selector
 *   NEGV   - negate vector part
 *   NEGS   - negate scalar part
 *   ABS    - take absolute value
 *   NO_USE - "do not count this as a use" flag
 *   VALID  - descriptor holds a real register
 *
 * Every macro argument is parenthesized so that expressions such as
 * REG_GET_TYPE(a | b) or REG_SET_NO_USE(r, cond ? 1 : 0) expand correctly.
 * The REG_SET_* / REG_ABS / REG_NEG* macros assign to their first argument,
 * which therefore must be an lvalue.
 */
#define REG_TYPE_INPUT		0
#define REG_TYPE_OUTPUT		1
#define REG_TYPE_TEMP		2
#define REG_TYPE_CONST		3

#define REG_TYPE_SHIFT		0
#define REG_INDEX_SHIFT		2
#define REG_VSWZ_SHIFT		8
#define REG_SSWZ_SHIFT		13
#define REG_NEGV_SHIFT		18
#define REG_NEGS_SHIFT		19
#define REG_ABS_SHIFT		20
#define REG_NO_USE_SHIFT	21
#define REG_VALID_SHIFT		22

#define REG_TYPE_MASK		(0x03 << REG_TYPE_SHIFT)
#define REG_INDEX_MASK		(0x3F << REG_INDEX_SHIFT)
#define REG_VSWZ_MASK		(0x1F << REG_VSWZ_SHIFT)
#define REG_SSWZ_MASK		(0x1F << REG_SSWZ_SHIFT)
#define REG_NEGV_MASK		(0x01 << REG_NEGV_SHIFT)
#define REG_NEGS_MASK		(0x01 << REG_NEGS_SHIFT)
#define REG_ABS_MASK		(0x01 << REG_ABS_SHIFT)
#define REG_NO_USE_MASK		(0x01 << REG_NO_USE_SHIFT)
#define REG_VALID_MASK		(0x01 << REG_VALID_SHIFT)

/* Build a descriptor from its fields; NEGV/NEGS/ABS start cleared. */
#define REG(type, index, vswz, sswz, nouse, valid)			\
	((((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK) |			\
	 (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK) |		\
	 (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) |		\
	 (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK) |		\
	 (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) |			\
	 (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))

/* Field extractors. */
#define REG_GET_TYPE(reg)						\
	(((reg) & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
#define REG_GET_INDEX(reg)						\
	(((reg) & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
#define REG_GET_VSWZ(reg)						\
	(((reg) & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
#define REG_GET_SSWZ(reg)						\
	(((reg) & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
#define REG_GET_NO_USE(reg)						\
	(((reg) & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
#define REG_GET_VALID(reg)						\
	(((reg) & REG_VALID_MASK) >> REG_VALID_SHIFT)

/* Field setters: replace one field of `reg`, leaving the others alone. */
#define REG_SET_TYPE(reg, type)						\
	(reg) = (((reg) & ~REG_TYPE_MASK) |				\
		 (((type) << REG_TYPE_SHIFT) & REG_TYPE_MASK))
#define REG_SET_INDEX(reg, index)					\
	(reg) = (((reg) & ~REG_INDEX_MASK) |				\
		 (((index) << REG_INDEX_SHIFT) & REG_INDEX_MASK))
#define REG_SET_VSWZ(reg, vswz)						\
	(reg) = (((reg) & ~REG_VSWZ_MASK) |				\
		 (((vswz) << REG_VSWZ_SHIFT) & REG_VSWZ_MASK))
#define REG_SET_SSWZ(reg, sswz)						\
	(reg) = (((reg) & ~REG_SSWZ_MASK) |				\
		 (((sswz) << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
#define REG_SET_NO_USE(reg, nouse)					\
	(reg) = (((reg) & ~REG_NO_USE_MASK) |				\
		 (((nouse) << REG_NO_USE_SHIFT) & REG_NO_USE_MASK))
#define REG_SET_VALID(reg, valid)					\
	(reg) = (((reg) & ~REG_VALID_MASK) |				\
		 (((valid) << REG_VALID_SHIFT) & REG_VALID_MASK))

/* Modifier flags: these only ever set the bit, they never clear it. */
#define REG_ABS(reg)							\
	(reg) = ((reg) | REG_ABS_MASK)
#define REG_NEGV(reg)							\
	(reg) = ((reg) | REG_NEGV_MASK)
#define REG_NEGS(reg)							\
	(reg) = ((reg) | REG_NEGS_MASK)
156 * Data structures for fragment program generation
159 /* description of r300 native hw instructions */
160 static const struct {
166 { "MAD", 3, R300_FPI0_OUTC_MAD
, R300_FPI2_OUTA_MAD
},
167 { "DP3", 2, R300_FPI0_OUTC_DP3
, R300_FPI2_OUTA_DP4
},
168 { "DP4", 2, R300_FPI0_OUTC_DP4
, R300_FPI2_OUTA_DP4
},
169 { "MIN", 2, R300_FPI0_OUTC_MIN
, R300_FPI2_OUTA_MIN
},
170 { "MAX", 2, R300_FPI0_OUTC_MAX
, R300_FPI2_OUTA_MAX
},
171 { "CMP", 3, R300_FPI0_OUTC_CMP
, R300_FPI2_OUTA_CMP
},
172 { "FRC", 1, R300_FPI0_OUTC_FRC
, R300_FPI2_OUTA_FRC
},
173 { "EX2", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_EX2
},
174 { "LG2", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_LG2
},
175 { "RCP", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_RCP
},
176 { "RSQ", 1, R300_FPI0_OUTC_REPL_ALPHA
, R300_FPI2_OUTA_RSQ
},
177 { "REPL_ALPHA", 1, R300_FPI0_OUTC_REPL_ALPHA
, PFS_INVAL
},
178 { "CMPH", 3, R300_FPI0_OUTC_CMPH
, PFS_INVAL
},
182 /* vector swizzles r300 can support natively, with a couple of
183 * cases we handle specially
185 * REG_VSWZ/REG_SSWZ is an index into this table
187 #define SLOT_VECTOR (1<<0)
188 #define SLOT_SCALAR (1<<3)
189 #define SLOT_BOTH (SLOT_VECTOR | SLOT_SCALAR)
190 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
194 static const struct r300_pfs_swizzle
{
195 GLuint hash
; /* swizzle value this matches */
196 GLuint base
; /* base value for hw swizzle */
197 GLuint stride
; /* difference in base between arg0/1/2 */
200 /* native swizzles */
201 { MAKE_SWZ3(X
, Y
, Z
), R300_FPI0_ARGC_SRC0C_XYZ
, 4, SLOT_VECTOR
},
202 { MAKE_SWZ3(X
, X
, X
), R300_FPI0_ARGC_SRC0C_XXX
, 4, SLOT_VECTOR
},
203 { MAKE_SWZ3(Y
, Y
, Y
), R300_FPI0_ARGC_SRC0C_YYY
, 4, SLOT_VECTOR
},
204 { MAKE_SWZ3(Z
, Z
, Z
), R300_FPI0_ARGC_SRC0C_ZZZ
, 4, SLOT_VECTOR
},
205 { MAKE_SWZ3(W
, W
, W
), R300_FPI0_ARGC_SRC0A
, 1, SLOT_SCALAR
},
206 { MAKE_SWZ3(Y
, Z
, X
), R300_FPI0_ARGC_SRC0C_YZX
, 1, SLOT_VECTOR
},
207 { MAKE_SWZ3(Z
, X
, Y
), R300_FPI0_ARGC_SRC0C_ZXY
, 1, SLOT_VECTOR
},
208 { MAKE_SWZ3(W
, Z
, Y
), R300_FPI0_ARGC_SRC0CA_WZY
, 1, SLOT_BOTH
},
209 { MAKE_SWZ3(ONE
, ONE
, ONE
), R300_FPI0_ARGC_ONE
, 0, 0},
210 { MAKE_SWZ3(ZERO
, ZERO
, ZERO
), R300_FPI0_ARGC_ZERO
, 0, 0},
211 { PFS_INVAL
, R300_FPI0_ARGC_HALF
, 0, 0},
212 { PFS_INVAL
, 0, 0, 0},
215 /* used during matching of non-native swizzles */
216 #define SWZ_X_MASK (7 << 0)
217 #define SWZ_Y_MASK (7 << 3)
218 #define SWZ_Z_MASK (7 << 6)
219 #define SWZ_W_MASK (7 << 9)
220 static const struct {
221 GLuint hash
; /* used to mask matching swizzle components */
222 int mask
; /* actual outmask */
223 int count
; /* count of components matched */
225 { SWZ_X_MASK
|SWZ_Y_MASK
|SWZ_Z_MASK
, 1|2|4, 3},
226 { SWZ_X_MASK
|SWZ_Y_MASK
, 1|2, 2},
227 { SWZ_X_MASK
|SWZ_Z_MASK
, 1|4, 2},
228 { SWZ_Y_MASK
|SWZ_Z_MASK
, 2|4, 2},
232 { PFS_INVAL
, PFS_INVAL
, PFS_INVAL
}
235 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
236 #define SWIZZLE_HALF 6
237 static const struct {
238 int base
; /* hw value of swizzle */
239 int stride
; /* difference between SRC0/1/2 */
242 { R300_FPI2_ARGA_SRC0C_X
, 3, SLOT_VECTOR
},
243 { R300_FPI2_ARGA_SRC0C_Y
, 3, SLOT_VECTOR
},
244 { R300_FPI2_ARGA_SRC0C_Z
, 3, SLOT_VECTOR
},
245 { R300_FPI2_ARGA_SRC0A
, 1, SLOT_SCALAR
},
246 { R300_FPI2_ARGA_ZERO
, 0, 0 },
247 { R300_FPI2_ARGA_ONE
, 0, 0 },
248 { R300_FPI2_ARGA_HALF
, 0, 0 }
251 /* boiler-plate reg, for convenience */
252 static const GLuint undef
= REG(REG_TYPE_TEMP
,
259 /* constant one source */
260 static const GLuint pfs_one
= REG(REG_TYPE_TEMP
,
267 /* constant half source */
268 static const GLuint pfs_half
= REG(REG_TYPE_TEMP
,
275 /* constant zero source */
276 static const GLuint pfs_zero
= REG(REG_TYPE_TEMP
,
284 * Common functions prototypes
286 static void dump_program(struct r300_fragment_program
*rp
);
287 static void emit_arith(struct r300_fragment_program
*rp
, int op
,
288 GLuint dest
, int mask
,
289 GLuint src0
, GLuint src1
, GLuint src2
,
293 * Helper functions prototypes
295 static int get_hw_temp(struct r300_fragment_program
*rp
)
298 int r
= ffs(~cs
->hwreg_in_use
);
300 ERROR("Out of hardware temps\n");
304 cs
->hwreg_in_use
|= (1 << --r
);
305 if (r
> rp
->max_temp_idx
)
306 rp
->max_temp_idx
= r
;
311 static int get_hw_temp_tex(struct r300_fragment_program
*rp
)
316 r
= ffs(~(cs
->hwreg_in_use
| cs
->used_in_node
));
318 return get_hw_temp(rp
); /* Will cause an indirection */
320 cs
->hwreg_in_use
|= (1 << --r
);
321 if (r
> rp
->max_temp_idx
)
322 rp
->max_temp_idx
= r
;
327 static void free_hw_temp(struct r300_fragment_program
*rp
, int idx
)
330 cs
->hwreg_in_use
&= ~(1<<idx
);
333 static GLuint
get_temp_reg(struct r300_fragment_program
*rp
)
339 index
= ffs(~cs
->temp_in_use
);
341 ERROR("Out of program temps\n");
345 cs
->temp_in_use
|= (1 << --index
);
346 cs
->temps
[index
].refcount
= 0xFFFFFFFF;
347 cs
->temps
[index
].reg
= -1;
349 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
350 REG_SET_INDEX(r
, index
);
351 REG_SET_VALID(r
, GL_TRUE
);
355 static GLuint
get_temp_reg_tex(struct r300_fragment_program
*rp
)
361 index
= ffs(~cs
->temp_in_use
);
363 ERROR("Out of program temps\n");
367 cs
->temp_in_use
|= (1 << --index
);
368 cs
->temps
[index
].refcount
= 0xFFFFFFFF;
369 cs
->temps
[index
].reg
= get_hw_temp_tex(rp
);
371 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
372 REG_SET_INDEX(r
, index
);
373 REG_SET_VALID(r
, GL_TRUE
);
377 static void free_temp(struct r300_fragment_program
*rp
, GLuint r
)
380 GLuint index
= REG_GET_INDEX(r
);
382 if (!(cs
->temp_in_use
& (1 << index
)))
385 if (REG_GET_TYPE(r
) == REG_TYPE_TEMP
) {
386 free_hw_temp(rp
, cs
->temps
[index
].reg
);
387 cs
->temps
[index
].reg
= -1;
388 cs
->temp_in_use
&= ~(1 << index
);
389 } else if (REG_GET_TYPE(r
) == REG_TYPE_INPUT
) {
390 free_hw_temp(rp
, cs
->inputs
[index
].reg
);
391 cs
->inputs
[index
].reg
= -1;
395 static GLuint
emit_param4fv(struct r300_fragment_program
*rp
,
402 pidx
= rp
->param_nr
++;
403 index
= rp
->const_nr
++;
404 if (pidx
>= PFS_NUM_CONST_REGS
|| index
>= PFS_NUM_CONST_REGS
) {
405 ERROR("Out of const/param slots!\n");
409 rp
->param
[pidx
].idx
= index
;
410 rp
->param
[pidx
].values
= values
;
411 rp
->params_uptodate
= GL_FALSE
;
413 REG_SET_TYPE(r
, REG_TYPE_CONST
);
414 REG_SET_INDEX(r
, index
);
415 REG_SET_VALID(r
, GL_TRUE
);
419 static GLuint
emit_const4fv(struct r300_fragment_program
*rp
, GLfloat
*cp
)
424 index
= rp
->const_nr
++;
425 if (index
>= PFS_NUM_CONST_REGS
) {
426 ERROR("Out of hw constants!\n");
430 COPY_4V(rp
->constant
[index
], cp
);
432 REG_SET_TYPE(r
, REG_TYPE_CONST
);
433 REG_SET_INDEX(r
, index
);
434 REG_SET_VALID(r
, GL_TRUE
);
438 static inline GLuint
negate(GLuint r
)
445 /* Hack, to prevent clobbering sources used multiple times when
446 * emulating non-native instructions
448 static inline GLuint
keep(GLuint r
)
450 REG_SET_NO_USE(r
, GL_TRUE
);
454 static inline GLuint
absolute(GLuint r
)
460 static int swz_native(struct r300_fragment_program
*rp
,
465 /* Native swizzle, handle negation */
466 src
|= ((arbneg
>> 3) & 1) << REG_NEGS_SHIFT
;
468 if ((arbneg
& 0x7) == 0x0) {
469 src
= src
& ~REG_NEGV_MASK
;
471 } else if ((arbneg
& 0x7) == 0x7) {
472 src
|= REG_NEGV_MASK
;
475 if (!REG_GET_VALID(*r
))
476 *r
= get_temp_reg(rp
);
477 src
|= REG_NEGV_MASK
;
486 src
= src
& ~REG_NEGV_MASK
;
490 (arbneg
^ 0x7) | WRITEMASK_W
,
500 static int swz_emit_partial(struct r300_fragment_program
*rp
,
510 if (!REG_GET_VALID(*r
))
511 *r
= get_temp_reg(rp
);
513 /* A partial match, VSWZ/mask define what parts of the
514 * desired swizzle we match
516 if (mc
+ s_mask
[mask
].count
== 3) {
518 src
|= ((arbneg
>> 3) & 1) << REG_NEGS_SHIFT
;
521 tmp
= arbneg
& s_mask
[mask
].mask
;
523 tmp
= tmp
^ s_mask
[mask
].mask
;
528 arbneg
& s_mask
[mask
].mask
,
529 keep(src
) | REG_NEGV_MASK
,
534 REG_SET_NO_USE(src
, GL_TRUE
);
536 REG_SET_NO_USE(src
, GL_FALSE
);
548 REG_SET_NO_USE(src
, GL_TRUE
);
550 REG_SET_NO_USE(src
, GL_FALSE
);
555 (arbneg
& s_mask
[mask
].mask
) | wmask
,
563 REG_SET_NO_USE(src
, GL_TRUE
);
565 REG_SET_NO_USE(src
, GL_FALSE
);
567 emit_arith(rp
, PFS_OP_MAD
,
569 s_mask
[mask
].mask
| wmask
,
576 return s_mask
[mask
].count
;
579 static GLuint
do_swizzle(struct r300_fragment_program
*rp
,
589 /* If swizzling from something without an XYZW native swizzle,
590 * emit result to a temp, and do new swizzle from the temp.
592 if (REG_GET_VSWZ(src
) != SWIZZLE_XYZ
||
593 REG_GET_SSWZ(src
) != SWIZZLE_W
) {
594 GLuint temp
= get_temp_reg(rp
);
606 /* set scalar swizzling */
607 REG_SET_SSWZ(src
, GET_SWZ(arbswz
, 3));
610 vswz
= REG_GET_VSWZ(src
);
614 REG_SET_VSWZ(src
, vswz
);
615 chash
= v_swiz
[REG_GET_VSWZ(src
)].hash
&
618 if (chash
== (arbswz
& s_mask
[c_mask
].hash
)) {
619 if (s_mask
[c_mask
].count
== 3) {
620 v_match
+= swz_native(rp
,
625 v_match
+= swz_emit_partial(rp
,
636 /* Fill with something invalid.. all 0's was
637 * wrong before, matched SWIZZLE_X. So all
638 * 1's will be okay for now
640 arbswz
|= (PFS_INVAL
& s_mask
[c_mask
].hash
);
642 } while(v_swiz
[++vswz
].hash
!= PFS_INVAL
);
643 REG_SET_VSWZ(src
, SWIZZLE_XYZ
);
644 } while (s_mask
[++c_mask
].hash
!= PFS_INVAL
);
646 ERROR("should NEVER get here\n");
650 static GLuint
t_src(struct r300_fragment_program
*rp
,
651 struct prog_src_register fpsrc
)
655 switch (fpsrc
.File
) {
656 case PROGRAM_TEMPORARY
:
657 REG_SET_INDEX(r
, fpsrc
.Index
);
658 REG_SET_VALID(r
, GL_TRUE
);
659 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
662 REG_SET_INDEX(r
, fpsrc
.Index
);
663 REG_SET_VALID(r
, GL_TRUE
);
664 REG_SET_TYPE(r
, REG_TYPE_INPUT
);
666 case PROGRAM_LOCAL_PARAM
:
667 r
= emit_param4fv(rp
,
668 rp
->mesa_program
.Base
.LocalParams
[fpsrc
.Index
]);
670 case PROGRAM_ENV_PARAM
:
671 r
= emit_param4fv(rp
,
672 rp
->ctx
->FragmentProgram
.Parameters
[fpsrc
.Index
]);
674 case PROGRAM_STATE_VAR
:
675 case PROGRAM_NAMED_PARAM
:
676 r
= emit_param4fv(rp
,
677 rp
->mesa_program
.Base
.Parameters
->ParameterValues
[fpsrc
.Index
]);
680 ERROR("unknown SrcReg->File %x\n", fpsrc
.File
);
684 /* no point swizzling ONE/ZERO/HALF constants... */
685 if (REG_GET_VSWZ(r
) < SWIZZLE_111
|| REG_GET_SSWZ(r
) < SWIZZLE_ZERO
)
686 r
= do_swizzle(rp
, r
, fpsrc
.Swizzle
, fpsrc
.NegateBase
);
690 static GLuint
t_scalar_src(struct r300_fragment_program
*rp
,
691 struct prog_src_register fpsrc
)
693 struct prog_src_register src
= fpsrc
;
694 int sc
= GET_SWZ(fpsrc
.Swizzle
, 0); /* X */
696 src
.Swizzle
= ((sc
<<0)|(sc
<<3)|(sc
<<6)|(sc
<<9));
698 return t_src(rp
, src
);
701 static GLuint
t_dst(struct r300_fragment_program
*rp
,
702 struct prog_dst_register dest
)
707 case PROGRAM_TEMPORARY
:
708 REG_SET_INDEX(r
, dest
.Index
);
709 REG_SET_VALID(r
, GL_TRUE
);
710 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
713 REG_SET_TYPE(r
, REG_TYPE_OUTPUT
);
714 switch (dest
.Index
) {
715 case FRAG_RESULT_COLR
:
716 case FRAG_RESULT_DEPR
:
717 REG_SET_INDEX(r
, dest
.Index
);
718 REG_SET_VALID(r
, GL_TRUE
);
721 ERROR("Bad DstReg->Index 0x%x\n", dest
.Index
);
725 ERROR("Bad DstReg->File 0x%x\n", dest
.File
);
730 static int t_hw_src(struct r300_fragment_program
*rp
,
736 int index
= REG_GET_INDEX(src
);
738 switch(REG_GET_TYPE(src
)) {
740 /* NOTE: if reg==-1 here, a source is being read that
741 * hasn't been written to. Undefined results
743 if (cs
->temps
[index
].reg
== -1)
744 cs
->temps
[index
].reg
= get_hw_temp(rp
);
746 idx
= cs
->temps
[index
].reg
;
748 if (!REG_GET_NO_USE(src
) &&
749 (--cs
->temps
[index
].refcount
== 0))
753 idx
= cs
->inputs
[index
].reg
;
755 if (!REG_GET_NO_USE(src
) &&
756 (--cs
->inputs
[index
].refcount
== 0))
757 free_hw_temp(rp
, cs
->inputs
[index
].reg
);
760 return (index
| SRC_CONST
);
762 ERROR("Invalid type for source reg\n");
763 return (0 | SRC_CONST
);
767 cs
->used_in_node
|= (1 << idx
);
772 static int t_hw_dst(struct r300_fragment_program
*rp
,
778 GLuint index
= REG_GET_INDEX(dest
);
779 assert(REG_GET_VALID(dest
));
781 switch(REG_GET_TYPE(dest
)) {
783 if (cs
->temps
[REG_GET_INDEX(dest
)].reg
== -1) {
785 cs
->temps
[index
].reg
= get_hw_temp(rp
);
787 cs
->temps
[index
].reg
= get_hw_temp_tex(rp
);
790 idx
= cs
->temps
[index
].reg
;
792 if (!REG_GET_NO_USE(dest
) &&
793 (--cs
->temps
[index
].refcount
== 0))
796 cs
->dest_in_node
|= (1 << idx
);
797 cs
->used_in_node
|= (1 << idx
);
799 case REG_TYPE_OUTPUT
:
801 case FRAG_RESULT_COLR
:
802 rp
->node
[rp
->cur_node
].flags
|= R300_PFS_NODE_OUTPUT_COLOR
;
804 case FRAG_RESULT_DEPR
:
805 rp
->node
[rp
->cur_node
].flags
|= R300_PFS_NODE_OUTPUT_DEPTH
;
811 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest
));
818 static void emit_nop(struct r300_fragment_program
*rp
,
825 cs
->v_pos
= cs
->s_pos
= MAX2(cs
->v_pos
, cs
->s_pos
);
827 if (mask
& WRITEMASK_XYZ
) {
828 rp
->alu
.inst
[cs
->v_pos
].inst0
= NOP_INST0
;
829 rp
->alu
.inst
[cs
->v_pos
].inst1
= NOP_INST1
;
833 if (mask
& WRITEMASK_W
) {
834 rp
->alu
.inst
[cs
->s_pos
].inst2
= NOP_INST2
;
835 rp
->alu
.inst
[cs
->s_pos
].inst3
= NOP_INST3
;
840 static void emit_tex(struct r300_fragment_program
*rp
,
841 struct prog_instruction
*fpi
,
845 GLuint coord
= t_src(rp
, fpi
->SrcReg
[0]);
846 GLuint dest
= undef
, rdest
= undef
;
847 GLuint din
= cs
->dest_in_node
, uin
= cs
->used_in_node
;
848 int unit
= fpi
->TexSrcUnit
;
851 /* Resolve source/dest to hardware registers */
852 hwsrc
= t_hw_src(rp
, coord
, GL_TRUE
);
853 if (opcode
!= R300_FPITX_OP_KIL
) {
854 dest
= t_dst(rp
, fpi
->DstReg
);
856 /* r300 doesn't seem to be able to do TEX->output reg */
857 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
859 dest
= get_temp_reg_tex(rp
);
861 hwdest
= t_hw_dst(rp
, dest
, GL_TRUE
);
863 /* Use a temp that hasn't been used in this node, rather
864 * than causing an indirection
866 if (uin
& (1 << hwdest
)) {
867 free_hw_temp(rp
, hwdest
);
868 hwdest
= get_hw_temp_tex(rp
);
869 cs
->temps
[REG_GET_INDEX(dest
)].reg
= hwdest
;
876 /* Indirection if source has been written in this node, or if the
877 * dest has been read/written in this node
879 if ((REG_GET_TYPE(coord
) != REG_TYPE_CONST
&&
880 (din
& (1<<hwsrc
))) || (uin
& (1<<hwdest
))) {
882 /* Finish off current node */
883 cs
->v_pos
= cs
->s_pos
= MAX2(cs
->v_pos
, cs
->s_pos
);
884 if (rp
->node
[rp
->cur_node
].alu_offset
== cs
->v_pos
) {
885 /* No alu instructions in the node? Emit a NOP. */
886 emit_nop(rp
, WRITEMASK_XYZW
, GL_TRUE
);
887 cs
->v_pos
= cs
->s_pos
= MAX2(cs
->v_pos
, cs
->s_pos
);
890 rp
->node
[rp
->cur_node
].alu_end
=
891 cs
->v_pos
- rp
->node
[rp
->cur_node
].alu_offset
- 1;
892 assert(rp
->node
[rp
->cur_node
].alu_end
>= 0);
894 if (++rp
->cur_node
>= PFS_MAX_TEX_INDIRECT
) {
895 ERROR("too many levels of texture indirection\n");
900 rp
->node
[rp
->cur_node
].tex_offset
= rp
->tex
.length
;
901 rp
->node
[rp
->cur_node
].alu_offset
= cs
->v_pos
;
902 rp
->node
[rp
->cur_node
].tex_end
= -1;
903 rp
->node
[rp
->cur_node
].alu_end
= -1;
904 rp
->node
[rp
->cur_node
].flags
= 0;
905 cs
->used_in_node
= 0;
906 cs
->dest_in_node
= 0;
909 if (rp
->cur_node
== 0)
910 rp
->first_node_has_tex
= 1;
912 rp
->tex
.inst
[rp
->tex
.length
++] = 0
913 | (hwsrc
<< R300_FPITX_SRC_SHIFT
)
914 | (hwdest
<< R300_FPITX_DST_SHIFT
)
915 | (unit
<< R300_FPITX_IMAGE_SHIFT
)
916 /* not entirely sure about this */
917 | (opcode
<< R300_FPITX_OPCODE_SHIFT
);
919 cs
->dest_in_node
|= (1 << hwdest
);
920 if (REG_GET_TYPE(coord
) != REG_TYPE_CONST
)
921 cs
->used_in_node
|= (1 << hwsrc
);
923 rp
->node
[rp
->cur_node
].tex_end
++;
925 /* Copy from temp to output if needed */
926 if (REG_GET_VALID(rdest
)) {
927 emit_arith(rp
, PFS_OP_MAD
, rdest
, WRITEMASK_XYZW
, dest
,
928 pfs_one
, pfs_zero
, 0);
933 /* Add sources to FPI1/FPI3 lists. If source is already on list,
934 * reuse the index instead of wasting a source.
936 static int add_src(struct r300_fragment_program
*rp
,
944 /* Look for matches */
945 for (i
=0,csm
=srcmask
; i
<3; i
++,csm
=csm
<<1) {
946 /* If sources have been allocated in this position(s)... */
947 if ((cs
->slot
[pos
].umask
& csm
) == csm
) {
948 /* ... and the register number(s) match, re-use the
950 if (srcmask
== SLOT_VECTOR
&&
951 cs
->slot
[pos
].vsrc
[i
] == reg
)
953 if (srcmask
== SLOT_SCALAR
&&
954 cs
->slot
[pos
].ssrc
[i
] == reg
)
956 if (srcmask
== SLOT_BOTH
&&
957 cs
->slot
[pos
].vsrc
[i
] == reg
&&
958 cs
->slot
[pos
].ssrc
[i
] == reg
)
963 /* Look for free spaces */
964 for (i
=0,csm
=srcmask
; i
<3; i
++,csm
=csm
<<1) {
965 /* If the position(s) haven't been allocated */
966 if ((cs
->slot
[pos
].umask
& csm
) == 0) {
967 cs
->slot
[pos
].umask
|= csm
;
969 if (srcmask
& SLOT_VECTOR
)
970 cs
->slot
[pos
].vsrc
[i
] = reg
;
971 if (srcmask
& SLOT_SCALAR
)
972 cs
->slot
[pos
].ssrc
[i
] = reg
;
977 //ERROR("Failed to allocate sources in FPI1/FPI3!\n");
981 /* Determine whether or not to position opcode in the same ALU slot for both
982 * vector and scalar portions of an instruction.
984 * It's not necessary to force the first case, but it makes disassembled
985 * shaders easier to read.
987 static GLboolean
force_same_slot(int vop
,
996 if (emit_vop
&& emit_sop
)
999 if (emit_vop
&& vop
== R300_FPI0_OUTC_REPL_ALPHA
)
1003 for (i
=0;i
<argc
;i
++)
1004 if (REG_GET_VSWZ(src
[i
]) == SWIZZLE_WZY
)
1011 static void emit_arith(struct r300_fragment_program
*rp
,
1021 GLuint src
[3] = { src0
, src1
, src2
};
1022 int hwsrc
[3], sswz
[3], vswz
[3];
1024 GLboolean emit_vop
= GL_FALSE
, emit_sop
= GL_FALSE
;
1029 vop
= r300_fpop
[op
].v_op
;
1030 sop
= r300_fpop
[op
].s_op
;
1031 argc
= r300_fpop
[op
].argc
;
1033 if ((mask
& WRITEMASK_XYZ
) || vop
== R300_FPI0_OUTC_DP3
)
1035 if ((mask
& WRITEMASK_W
) || vop
== R300_FPI0_OUTC_REPL_ALPHA
)
1038 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
&&
1039 REG_GET_INDEX(dest
) == FRAG_RESULT_DEPR
)
1040 emit_vop
= GL_FALSE
;
1042 if (force_same_slot(vop
, sop
, emit_vop
, emit_sop
, argc
, src
)) {
1043 vpos
= spos
= MAX2(cs
->v_pos
, cs
->s_pos
);
1047 /* Here is where we'd decide on where a safe place is to
1048 * combine this instruction with a previous one.
1050 * This is extremely simple for now.. if a source depends
1051 * on the opposite stream, force the same instruction.
1055 (v_swiz
[REG_GET_VSWZ(src
[i
])].flags
& SLOT_SCALAR
)) {
1056 vpos
= spos
= MAX2(vpos
, spos
);
1060 (s_swiz
[REG_GET_VSWZ(src
[i
])].flags
& SLOT_VECTOR
)) {
1061 vpos
= spos
= MAX2(vpos
, spos
);
1067 /* - Convert src->hwsrc, record for FPI1/FPI3
1068 * - Determine ARG parts of FPI0/FPI2, unused args are filled
1075 vswz
[i
] = R300_FPI0_ARGC_ZERO
;
1076 sswz
[i
] = R300_FPI2_ARGA_ZERO
;
1080 hwsrc
[i
] = t_hw_src(rp
, src
[i
], GL_FALSE
);
1082 if (emit_vop
&& vop
!= R300_FPI0_OUTC_REPL_ALPHA
) {
1083 srcpos
= add_src(rp
, hwsrc
[i
], vpos
,
1084 v_swiz
[REG_GET_VSWZ(src
[i
])].flags
);
1085 vswz
[i
] = (v_swiz
[REG_GET_VSWZ(src
[i
])].base
+
1087 v_swiz
[REG_GET_VSWZ(src
[i
])].stride
)) |
1088 ((src
[i
] & REG_NEGV_MASK
) ? ARG_NEG
: 0) |
1089 ((src
[i
] & REG_ABS_MASK
) ? ARG_ABS
: 0);
1090 } else vswz
[i
] = R300_FPI0_ARGC_ZERO
;
1093 srcpos
= add_src(rp
, hwsrc
[i
], spos
,
1094 s_swiz
[REG_GET_SSWZ(src
[i
])].flags
);
1095 sswz
[i
] = (s_swiz
[REG_GET_SSWZ(src
[i
])].base
+
1097 s_swiz
[REG_GET_SSWZ(src
[i
])].stride
)) |
1098 ((src
[i
] & REG_NEGS_MASK
) ? ARG_NEG
: 0) |
1099 ((src
[i
] & REG_ABS_MASK
) ? ARG_ABS
: 0);
1100 } else sswz
[i
] = R300_FPI2_ARGA_ZERO
;
1102 hwdest
= t_hw_dst(rp
, dest
, GL_FALSE
);
1104 if (flags
& PFS_FLAG_SAT
) {
1105 vop
|= R300_FPI0_OUTC_SAT
;
1106 sop
|= R300_FPI2_OUTA_SAT
;
1109 /* Throw the pieces together and get FPI0/1 */
1110 rp
->alu
.inst
[vpos
].inst1
=
1111 ((cs
->slot
[vpos
].vsrc
[0] << R300_FPI1_SRC0C_SHIFT
) |
1112 (cs
->slot
[vpos
].vsrc
[1] << R300_FPI1_SRC1C_SHIFT
) |
1113 (cs
->slot
[vpos
].vsrc
[2] << R300_FPI1_SRC2C_SHIFT
));
1115 rp
->alu
.inst
[vpos
].inst0
= vop
|
1116 (vswz
[0] << R300_FPI0_ARG0C_SHIFT
) |
1117 (vswz
[1] << R300_FPI0_ARG1C_SHIFT
) |
1118 (vswz
[2] << R300_FPI0_ARG2C_SHIFT
);
1120 rp
->alu
.inst
[vpos
].inst1
|= hwdest
<< R300_FPI1_DSTC_SHIFT
;
1121 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1122 if (REG_GET_INDEX(dest
) == FRAG_RESULT_COLR
) {
1123 rp
->alu
.inst
[vpos
].inst1
|=
1124 (mask
& WRITEMASK_XYZ
) << R300_FPI1_DSTC_OUTPUT_MASK_SHIFT
;
1127 rp
->alu
.inst
[vpos
].inst1
|=
1128 (mask
& WRITEMASK_XYZ
) << R300_FPI1_DSTC_REG_MASK_SHIFT
;
1131 } else if (spos
>= vpos
)
1132 rp
->alu
.inst
[spos
].inst0
= NOP_INST0
;
1134 /* And now FPI2/3 */
1135 rp
->alu
.inst
[spos
].inst3
=
1136 ((cs
->slot
[spos
].ssrc
[0] << R300_FPI3_SRC0A_SHIFT
) |
1137 (cs
->slot
[spos
].ssrc
[1] << R300_FPI3_SRC1A_SHIFT
) |
1138 (cs
->slot
[spos
].ssrc
[2] << R300_FPI3_SRC2A_SHIFT
));
1140 rp
->alu
.inst
[spos
].inst2
= sop
|
1141 sswz
[0] << R300_FPI2_ARG0A_SHIFT
|
1142 sswz
[1] << R300_FPI2_ARG1A_SHIFT
|
1143 sswz
[2] << R300_FPI2_ARG2A_SHIFT
;
1145 if (mask
& WRITEMASK_W
) {
1146 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1147 if (REG_GET_INDEX(dest
) == FRAG_RESULT_COLR
) {
1148 rp
->alu
.inst
[spos
].inst3
|=
1149 (hwdest
<< R300_FPI3_DSTA_SHIFT
) | R300_FPI3_DSTA_OUTPUT
;
1150 } else if (REG_GET_INDEX(dest
) == FRAG_RESULT_DEPR
) {
1151 rp
->alu
.inst
[spos
].inst3
|= R300_FPI3_DSTA_DEPTH
;
1154 rp
->alu
.inst
[spos
].inst3
|=
1155 (hwdest
<< R300_FPI3_DSTA_SHIFT
) | R300_FPI3_DSTA_REG
;
1159 } else if (vpos
>= spos
)
1160 rp
->alu
.inst
[vpos
].inst2
= NOP_INST2
;
1166 static GLuint
get_attrib(struct r300_fragment_program
*rp
, GLuint attr
)
1168 struct gl_fragment_program
*mp
= &rp
->mesa_program
;
1171 if (!(mp
->Base
.InputsRead
& (1<<attr
))) {
1172 ERROR("Attribute %d was not provided!\n", attr
);
1176 REG_SET_TYPE(r
, REG_TYPE_INPUT
);
1177 REG_SET_INDEX(r
, attr
);
1178 REG_SET_VALID(r
, GL_TRUE
);
1183 static GLboolean
parse_program(struct r300_fragment_program
*rp
)
1185 struct gl_fragment_program
*mp
= &rp
->mesa_program
;
1186 const struct prog_instruction
*inst
= mp
->Base
.Instructions
;
1187 struct prog_instruction
*fpi
;
1188 GLuint src
[3], dest
, temp
;
1190 int flags
, mask
= 0;
1191 GLfloat cnstv
[4] = {0.0, 0.0, 0.0, 0.0};
1193 if (!inst
|| inst
[0].Opcode
== OPCODE_END
) {
1194 ERROR("empty program?\n");
1198 for (fpi
=mp
->Base
.Instructions
; fpi
->Opcode
!= OPCODE_END
; fpi
++) {
1199 if (fpi
->SaturateMode
== SATURATE_ZERO_ONE
)
1200 flags
= PFS_FLAG_SAT
;
1204 if (fpi
->Opcode
!= OPCODE_KIL
) {
1205 dest
= t_dst(rp
, fpi
->DstReg
);
1206 mask
= fpi
->DstReg
.WriteMask
;
1209 switch (fpi
->Opcode
) {
1211 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1212 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1213 absolute(src
[0]), pfs_one
, pfs_zero
,
1217 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1218 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1219 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1220 src
[0], pfs_one
, src
[1],
1224 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1225 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1226 src
[2] = t_src(rp
, fpi
->SrcReg
[2]);
1227 /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1228 * r300 - if src2.c < 0.0 ? src1.c : src0.c
1230 emit_arith(rp
, PFS_OP_CMP
, dest
, mask
,
1231 src
[2], src
[1], src
[0],
1236 * cos using Taylor series:
1237 * cos(x) = 1 - x^2/2! + x^4/4! - x^6/6!
1239 temp
= get_temp_reg(rp
);
1241 cnstv
[1] = 0.041666667;
1242 cnstv
[2] = 0.001388889;
1244 cnst
= emit_const4fv(rp
, cnstv
);
1245 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1247 emit_arith(rp
, PFS_OP_MAD
, temp
,
1253 emit_arith(rp
, PFS_OP_MAD
, temp
,
1254 WRITEMASK_Y
| WRITEMASK_Z
,
1258 emit_arith(rp
, PFS_OP_MAD
, temp
,
1261 swizzle(temp
, X
, X
, X
, W
),
1264 emit_arith(rp
, PFS_OP_MAD
, temp
,
1269 emit_arith(rp
, PFS_OP_MAD
, temp
,
1275 emit_arith(rp
, PFS_OP_MAD
, temp
,
1279 swizzle(temp
, Y
, Y
, Y
, W
),
1281 emit_arith(rp
, PFS_OP_MAD
, temp
,
1285 negate(swizzle(temp
, Z
, Z
, Z
, W
)),
1287 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1288 swizzle(temp
, X
, X
, X
, X
),
1292 free_temp(rp
, temp
);
1295 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1296 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1297 emit_arith(rp
, PFS_OP_DP3
, dest
, mask
,
1298 src
[0], src
[1], undef
,
1302 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1303 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1304 emit_arith(rp
, PFS_OP_DP4
, dest
, mask
,
1305 src
[0], src
[1], undef
,
1309 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1310 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1311 /* src0.xyz1 -> temp
1312 * DP4 dest, temp, src1
1315 temp
= get_temp_reg(rp
);
1316 src
[0].s_swz
= SWIZZLE_ONE
;
1317 emit_arith(rp
, PFS_OP_MAD
, temp
, mask
,
1318 src
[0], pfs_one
, pfs_zero
,
1320 emit_arith(rp
, PFS_OP_DP4
, dest
, mask
,
1321 temp
, src
[1], undef
,
1323 free_temp(rp
, temp
);
1325 emit_arith(rp
, PFS_OP_DP4
, dest
, mask
,
1326 swizzle(src
[0], X
, Y
, Z
, ONE
), src
[1],
1331 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1332 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1333 /* dest.y = src0.y * src1.y */
1334 if (mask
& WRITEMASK_Y
)
1335 emit_arith(rp
, PFS_OP_MAD
, dest
, WRITEMASK_Y
,
1336 keep(src
[0]), keep(src
[1]),
1338 /* dest.z = src0.z */
1339 if (mask
& WRITEMASK_Z
)
1340 emit_arith(rp
, PFS_OP_MAD
, dest
, WRITEMASK_Z
,
1341 src
[0], pfs_one
, pfs_zero
, flags
);
1343 * result.w = src1.w */
1344 if (mask
& WRITEMASK_XW
) {
1345 REG_SET_VSWZ(src
[1], SWIZZLE_111
); /*Cheat*/
1346 emit_arith(rp
, PFS_OP_MAD
, dest
,
1347 mask
& WRITEMASK_XW
,
1348 src
[1], pfs_one
, pfs_zero
,
1353 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1354 emit_arith(rp
, PFS_OP_EX2
, dest
, mask
,
1355 src
[0], undef
, undef
,
1359 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1360 temp
= get_temp_reg(rp
);
1362 * MAD dest, src0, 1.0, -temp
1364 emit_arith(rp
, PFS_OP_FRC
, temp
, mask
,
1365 keep(src
[0]), undef
, undef
,
1367 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1368 src
[0], pfs_one
, negate(temp
),
1370 free_temp(rp
, temp
);
1373 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1374 emit_arith(rp
, PFS_OP_FRC
, dest
, mask
,
1375 src
[0], undef
, undef
,
1379 emit_tex(rp
, fpi
, R300_FPITX_OP_KIL
);
1382 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1383 emit_arith(rp
, PFS_OP_LG2
, dest
, mask
,
1384 src
[0], undef
, undef
,
1389 * if (s.x < 0) t.x = 0; else t.x = s.x;
1390 * if (s.y < 0) t.y = 0; else t.y = s.y;
1391 * if (s.w > 128.0) t.w = 128.0; else t.w = s.w;
1392 * if (s.w < -128.0) t.w = -128.0; else t.w = s.w;
1394 * if (t.x > 0) r.y = pow(t.y, t.w); else r.y = 0;
1395 * Also r.y = 0 if t.y < 0
1396 * For the t.x > 0 FGLRX use the CMPH opcode which
1397 * change the compare to (t.x + 0.5) > 0.5 we may
1398 * save one instruction by doing CMP -t.x
1400 cnstv
[0] = cnstv
[1] = cnstv
[2] = cnstv
[4] = 0.50001;
1401 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1402 temp
= get_temp_reg(rp
);
1403 cnst
= emit_const4fv(rp
, cnstv
);
1404 emit_arith(rp
, PFS_OP_CMP
, temp
,
1405 WRITEMASK_X
| WRITEMASK_Y
,
1406 src
[0], pfs_zero
, src
[0], flags
);
1407 emit_arith(rp
, PFS_OP_MIN
, temp
, WRITEMASK_Z
,
1408 swizzle(keep(src
[0]), W
, W
, W
, W
),
1409 cnst
, undef
, flags
);
1410 emit_arith(rp
, PFS_OP_LG2
, temp
, WRITEMASK_W
,
1411 swizzle(temp
, Y
, Y
, Y
, Y
),
1412 undef
, undef
, flags
);
1413 emit_arith(rp
, PFS_OP_MAX
, temp
, WRITEMASK_Z
,
1414 temp
, negate(cnst
), undef
, flags
);
1415 emit_arith(rp
, PFS_OP_MAD
, temp
, WRITEMASK_W
,
1416 temp
, swizzle(temp
, Z
, Z
, Z
, Z
),
1418 emit_arith(rp
, PFS_OP_EX2
, temp
, WRITEMASK_W
,
1419 temp
, undef
, undef
, flags
);
1420 emit_arith(rp
, PFS_OP_MAD
, dest
, WRITEMASK_Y
,
1421 swizzle(keep(temp
), X
, X
, X
, X
),
1422 pfs_one
, pfs_zero
, flags
);
1424 emit_arith(rp
, PFS_OP_MAD
, temp
, WRITEMASK_X
,
1425 temp
, pfs_one
, pfs_half
, flags
);
1426 emit_arith(rp
, PFS_OP_CMPH
, temp
, WRITEMASK_Z
,
1427 swizzle(keep(temp
), W
, W
, W
, W
),
1428 pfs_zero
, swizzle(keep(temp
), X
, X
, X
, X
),
1431 emit_arith(rp
, PFS_OP_CMP
, temp
, WRITEMASK_Z
,
1433 swizzle(keep(temp
), W
, W
, W
, W
),
1434 negate(swizzle(keep(temp
), X
, X
, X
, X
)),
1437 emit_arith(rp
, PFS_OP_CMP
, dest
, WRITEMASK_Z
,
1439 negate(swizzle(keep(temp
), Y
, Y
, Y
, Y
)),
1441 emit_arith(rp
, PFS_OP_MAD
, dest
,
1442 WRITEMASK_X
| WRITEMASK_W
,
1447 free_temp(rp
, temp
);
1450 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1451 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1452 src
[2] = t_src(rp
, fpi
->SrcReg
[2]);
1453 /* result = tmp0tmp1 + (1 - tmp0)tmp2
1454 * = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1455 * MAD temp, -tmp0, tmp2, tmp2
1456 * MAD result, tmp0, tmp1, temp
1458 temp
= get_temp_reg(rp
);
1459 emit_arith(rp
, PFS_OP_MAD
, temp
, mask
,
1460 negate(keep(src
[0])), keep(src
[2]), src
[2],
1462 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1463 src
[0], src
[1], temp
,
1465 free_temp(rp
, temp
);
1468 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1469 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1470 src
[2] = t_src(rp
, fpi
->SrcReg
[2]);
1471 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1472 src
[0], src
[1], src
[2],
1476 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1477 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1478 emit_arith(rp
, PFS_OP_MAX
, dest
, mask
,
1479 src
[0], src
[1], undef
,
1483 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1484 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1485 emit_arith(rp
, PFS_OP_MIN
, dest
, mask
,
1486 src
[0], src
[1], undef
,
1491 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1492 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1493 src
[0], pfs_one
, pfs_zero
,
1497 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1498 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1499 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1500 src
[0], src
[1], pfs_zero
,
1504 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1505 src
[1] = t_scalar_src(rp
, fpi
->SrcReg
[1]);
1506 temp
= get_temp_reg(rp
);
1507 emit_arith(rp
, PFS_OP_LG2
, temp
, WRITEMASK_W
,
1508 src
[0], undef
, undef
,
1510 emit_arith(rp
, PFS_OP_MAD
, temp
, WRITEMASK_W
,
1511 temp
, src
[1], pfs_zero
,
1513 emit_arith(rp
, PFS_OP_EX2
, dest
, fpi
->DstReg
.WriteMask
,
1516 free_temp(rp
, temp
);
1519 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1520 emit_arith(rp
, PFS_OP_RCP
, dest
, mask
,
1521 src
[0], undef
, undef
,
1525 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1526 emit_arith(rp
, PFS_OP_RSQ
, dest
, mask
,
1527 absolute(src
[0]), pfs_zero
, pfs_zero
,
1531 ERROR("SCS not implemented\n");
1534 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1535 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1536 temp
= get_temp_reg(rp
);
1537 /* temp = src0 - src1
1538 * dest.c = (temp.c < 0.0) ? 0 : 1
1540 emit_arith(rp
, PFS_OP_MAD
, temp
, mask
,
1541 src
[0], pfs_one
, negate(src
[1]),
1543 emit_arith(rp
, PFS_OP_CMP
, dest
, mask
,
1544 pfs_one
, pfs_zero
, temp
,
1546 free_temp(rp
, temp
);
1550 * sin using a Taylor series:
1551 * sin(x) = x - x^3/3! + x^5/5! - x^7/7!
1553 temp
= get_temp_reg(rp
);
1554 cnstv
[0] = 0.333333333;
1555 cnstv
[1] = 0.008333333;
1556 cnstv
[2] = 0.000198413;
1558 cnst
= emit_const4fv(rp
, cnstv
);
1559 src
[0] = t_scalar_src(rp
, fpi
->SrcReg
[0]);
1561 emit_arith(rp
, PFS_OP_MAD
, temp
,
1567 emit_arith(rp
, PFS_OP_MAD
, temp
,
1568 WRITEMASK_Y
| WRITEMASK_Z
,
1572 emit_arith(rp
, PFS_OP_MAD
, temp
,
1575 swizzle(temp
, X
, X
, X
, W
),
1578 emit_arith(rp
, PFS_OP_MAD
, temp
,
1584 emit_arith(rp
, PFS_OP_MAD
, temp
,
1589 emit_arith(rp
, PFS_OP_MAD
, temp
,
1595 emit_arith(rp
, PFS_OP_MAD
, temp
,
1599 swizzle(temp
, Y
, Y
, Y
, W
),
1601 emit_arith(rp
, PFS_OP_MAD
, temp
,
1605 negate(swizzle(temp
, Z
, Z
, Z
, W
)),
1607 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1608 swizzle(temp
, X
, X
, X
, X
),
1612 free_temp(rp
, temp
);
1615 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1616 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1617 temp
= get_temp_reg(rp
);
1618 /* temp = src0 - src1
1619 * dest.c = (temp.c < 0.0) ? 1 : 0
1621 emit_arith(rp
, PFS_OP_MAD
, temp
, mask
,
1622 src
[0], pfs_one
, negate(src
[1]),
1624 emit_arith(rp
, PFS_OP_CMP
, dest
, mask
,
1625 pfs_zero
, pfs_one
, temp
,
1627 free_temp(rp
, temp
);
1630 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1631 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1632 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
,
1633 src
[0], pfs_one
, negate(src
[1]),
1637 emit_tex(rp
, fpi
, R300_FPITX_OP_TEX
);
1640 emit_tex(rp
, fpi
, R300_FPITX_OP_TXB
);
1643 emit_tex(rp
, fpi
, R300_FPITX_OP_TXP
);
1646 src
[0] = t_src(rp
, fpi
->SrcReg
[0]);
1647 src
[1] = t_src(rp
, fpi
->SrcReg
[1]);
1648 temp
= get_temp_reg(rp
);
1649 /* temp = src0.zxy * src1.yzx */
1650 emit_arith(rp
, PFS_OP_MAD
, temp
, WRITEMASK_XYZ
,
1651 swizzle(keep(src
[0]), Z
, X
, Y
, W
),
1652 swizzle(keep(src
[1]), Y
, Z
, X
, W
),
1655 /* dest.xyz = src0.yzx * src1.zxy - temp
1656 * dest.w = undefined
1658 emit_arith(rp
, PFS_OP_MAD
, dest
, mask
& WRITEMASK_XYZ
,
1659 swizzle(src
[0], Y
, Z
, X
, W
),
1660 swizzle(src
[1], Z
, X
, Y
, W
),
1664 free_temp(rp
, temp
);
1668 ERROR("unknown fpi->Opcode %d\n", fpi
->Opcode
);
1680 /* - Init structures
1681 * - Determine what hwregs each input corresponds to
/*
 * init_program: prepare rp for a fresh compile.
 *
 * As far as the lines shown here establish, it:
 *  - clears rp->translated and rp->error, resets the other per-program
 *    tracking fields assigned below, and sets node[0].alu_end and
 *    node[0].tex_end to -1;
 *  - points rp->cs at the context's state.pfs_compile and zeroes it, then
 *    sets the vsrc/ssrc entries of the ALU slots to SRC_CONST;
 *  - for each input bit set in InputsRead, zeroes that input's refcount
 *    and gives it a register from get_hw_temp(), in the order texcoords,
 *    WPOS, COL0, COL1; handled bits are cleared from InputsRead, and any
 *    bits left over are warned about and mapped to register 0;
 *  - raises ERROR() when the program has no instruction list; otherwise
 *    walks the instructions up to OPCODE_END.  The first time a
 *    PROGRAM_TEMPORARY index is seen (as source or destination) its
 *    cs->temps[] entry gets reg = -1 and refcount = 1; later sightings
 *    increment the refcount.  cs->inputs[idx].refcount is incremented for
 *    what is presumably the input-file case (its case label is not
 *    visible here);
 *  - stores the bitmask of temporaries seen in cs->temp_in_use.
 *
 * NOTE(review): this listing appears to have lost lines (the declarations
 * of i, j and idx, some loop headers, closing braces); the description
 * above covers only what is visible.
 * NOTE(review): temps_used is a GLuint bitmask tested with (1<<idx), so
 * temporary indices are presumably expected to stay below 32 - confirm.
 */
1683 static void init_program(struct r300_fragment_program
*rp
)
1685 struct r300_pfs_compile_state
*cs
= NULL
;
1686 struct gl_fragment_program
*mp
= &rp
->mesa_program
;
1687 struct prog_instruction
*fpi
;
1688 GLuint InputsRead
= mp
->Base
.InputsRead
;
1689 GLuint temps_used
= 0; /* bitmask of temporaries seen so far; bit idx tracks cs->temps[idx] */
1692 /* New compile, reset tracking data */
1693 rp
->translated
= GL_FALSE
;
1694 rp
->error
= GL_FALSE
;
1695 rp
->cs
= cs
= &(R300_CONTEXT(rp
->ctx
)->state
.pfs_compile
);
1698 rp
->first_node_has_tex
= 0;
1701 rp
->params_uptodate
= GL_FALSE
;
1702 rp
->max_temp_idx
= 0;
1703 rp
->node
[0].alu_end
= -1;
1704 rp
->node
[0].tex_end
= -1;
/* Zero the compile state, then mark the ALU slots' vector (vsrc) and
 * scalar (ssrc) source entries as SRC_CONST. */
1706 _mesa_memset(cs
, 0, sizeof(*rp
->cs
));
1707 for (i
=0;i
<PFS_MAX_ALU_INST
;i
++) {
1709 cs
->slot
[i
].vsrc
[j
] = SRC_CONST
;
1710 cs
->slot
[i
].ssrc
[j
] = SRC_CONST
;
1714 /* Work out what temps the Mesa inputs correspond to, this must match
1715 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
1716 * configures itself based on the fragprog's InputsRead
1718 * NOTE: this depends on get_hw_temp() allocating registers in order,
1719 * starting from register 0.
1722 /* Texcoords come first */
1723 for (i
=0;i
<rp
->ctx
->Const
.MaxTextureUnits
;i
++) {
1724 if (InputsRead
& (FRAG_BIT_TEX0
<< i
)) {
1725 cs
->inputs
[FRAG_ATTRIB_TEX0
+i
].refcount
= 0;
1726 cs
->inputs
[FRAG_ATTRIB_TEX0
+i
].reg
= get_hw_temp(rp
);
1729 InputsRead
&= ~FRAG_BITS_TEX_ANY
;
1731 /* fragment position treated as a texcoord */
1732 if (InputsRead
& FRAG_BIT_WPOS
) {
1733 cs
->inputs
[FRAG_ATTRIB_WPOS
].refcount
= 0;
1734 cs
->inputs
[FRAG_ATTRIB_WPOS
].reg
= get_hw_temp(rp
);
1736 InputsRead
&= ~FRAG_BIT_WPOS
;
1738 /* Then primary colour */
1739 if (InputsRead
& FRAG_BIT_COL0
) {
1740 cs
->inputs
[FRAG_ATTRIB_COL0
].refcount
= 0;
1741 cs
->inputs
[FRAG_ATTRIB_COL0
].reg
= get_hw_temp(rp
);
1743 InputsRead
&= ~FRAG_BIT_COL0
;
1745 /* Secondary color */
1746 if (InputsRead
& FRAG_BIT_COL1
) {
1747 cs
->inputs
[FRAG_ATTRIB_COL1
].refcount
= 0;
1748 cs
->inputs
[FRAG_ATTRIB_COL1
].reg
= get_hw_temp(rp
);
1750 InputsRead
&= ~FRAG_BIT_COL1
;
/* Every handled input bit was cleared above, so any bit still set in
 * InputsRead here is an input this function has no mapping for. */
1754 WARN_ONCE("Don't know how to handle inputs 0x%x\n",
1756 /* force read from hwreg 0 for now */
1758 if (InputsRead
& (1<<i
)) cs
->inputs
[i
].reg
= 0;
1761 /* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
1762 * That way, we can free up the reg when it's no longer needed
1764 if (!mp
->Base
.Instructions
) {
1765 ERROR("No instructions found in program\n");
1769 for (fpi
=mp
->Base
.Instructions
;fpi
->Opcode
!= OPCODE_END
; fpi
++) {
1773 idx
= fpi
->SrcReg
[i
].Index
;
1774 switch (fpi
->SrcReg
[i
].File
) {
1775 case PROGRAM_TEMPORARY
:
1776 if (!(temps_used
& (1<<idx
))) {
1777 cs
->temps
[idx
].reg
= -1;
1778 cs
->temps
[idx
].refcount
= 1;
1779 temps_used
|= (1 << idx
);
1781 cs
->temps
[idx
].refcount
++;
1784 cs
->inputs
[idx
].refcount
++;
1790 idx
= fpi
->DstReg
.Index
;
1791 if (fpi
->DstReg
.File
== PROGRAM_TEMPORARY
) {
1792 if (!(temps_used
& (1<<idx
))) {
1793 cs
->temps
[idx
].reg
= -1;
1794 cs
->temps
[idx
].refcount
= 1;
1795 temps_used
|= (1 << idx
);
1797 cs
->temps
[idx
].refcount
++;
1800 cs
->temp_in_use
= temps_used
;
/*
 * update_params: refresh the program's constants from its parameters.
 *
 * Calls _mesa_load_state_parameters() on the Mesa program's parameter
 * list, then, for each of the rp->param_nr tracked entries, copies
 * rp->param[i].values (via COPY_4V) into the rp->constant[] slot named by
 * rp->param[i].idx.  Finally marks rp->params_uptodate as GL_TRUE.
 *
 * NOTE(review): the declaration of i is not visible in this listing, and
 * other lines may be missing as well.
 */
1803 static void update_params(struct r300_fragment_program
*rp
)
1805 struct gl_fragment_program
*mp
= &rp
->mesa_program
;
1808 /* Ask Mesa nicely to fill in ParameterValues for us */
1810 _mesa_load_state_parameters(rp
->ctx
, mp
->Base
.Parameters
);
/* Copy each tracked parameter's values into its constant slot. */
1812 for (i
=0;i
<rp
->param_nr
;i
++)
1813 COPY_4V(rp
->constant
[rp
->param
[i
].idx
], rp
->param
[i
].values
);
1815 rp
->params_uptodate
= GL_TRUE
;
/*
 * r300_translate_fragment_shader: translate rp unless already translated.
 *
 * All work is guarded by !rp->translated.  parse_program() is called and
 * its result compared against GL_FALSE (the failure handling is not
 * visible here).  The visible tail then finalises the bookkeeping:
 *  - v_pos and s_pos are both set to the larger of the two;
 *  - the current node's alu_end becomes v_pos - alu_offset - 1;
 *  - a negative tex_end on the current node is replaced by 0;
 *  - rp->alu_end is v_pos - 1, and rp->tex_end is tex.length - 1, or 0
 *    when tex.length is zero;
 *  - both alu_end values are asserted to be non-negative;
 *  - rp->translated is set to GL_TRUE.
 *
 * NOTE(review): cs is initialised to NULL and dereferenced below; the
 * assignment that makes it valid (presumably from rp->cs) is not visible
 * in this listing - confirm it is present in the real file.
 */
1818 void r300_translate_fragment_shader(struct r300_fragment_program
*rp
)
1820 struct r300_pfs_compile_state
*cs
= NULL
;
1822 if (!rp
->translated
) {
1827 if (parse_program(rp
) == GL_FALSE
) {
/* Bring the vector and scalar positions to the same (larger) value. */
1833 cs
->v_pos
= cs
->s_pos
= MAX2(cs
->v_pos
, cs
->s_pos
);
/* Last ALU index of the current node, relative to its alu_offset. */
1834 rp
->node
[rp
->cur_node
].alu_end
=
1835 cs
->v_pos
- rp
->node
[rp
->cur_node
].alu_offset
- 1;
/* A negative tex_end on the current node is replaced by 0. */
1836 if (rp
->node
[rp
->cur_node
].tex_end
< 0)
1837 rp
->node
[rp
->cur_node
].tex_end
= 0;
1839 rp
->alu_end
= cs
->v_pos
- 1;
/* tex_end is length - 1, or 0 when tex.length is zero. */
1841 rp
->tex_end
= rp
->tex
.length
? rp
->tex
.length
- 1 : 0;
1842 assert(rp
->node
[rp
->cur_node
].alu_end
>= 0);
1843 assert(rp
->alu_end
>= 0);
1845 rp
->translated
= GL_TRUE
;
/* Debug dump, disabled by the constant-false condition. */
1846 if (0) dump_program(rp
);
1852 /* just some random things... */
1853 static void dump_program(struct r300_fragment_program
*rp
)
1858 fprintf(stderr
, "pc=%d*************************************\n", pc
++);
1860 fprintf(stderr
, "Mesa program:\n");
1861 fprintf(stderr
, "-------------\n");
1862 _mesa_print_program(&rp
->mesa_program
.Base
);
1865 fprintf(stderr
, "Hardware program\n");
1866 fprintf(stderr
, "----------------\n");
1868 fprintf(stderr
, "tex:\n");
1870 for(i
=0;i
<rp
->tex
.length
;i
++) {
1871 fprintf(stderr
, "%08x\n", rp
->tex
.inst
[i
]);
1874 for (i
=0;i
<(rp
->cur_node
+1);i
++) {
1875 fprintf(stderr
, "NODE %d: alu_offset: %d, tex_offset: %d, "\
1876 "alu_end: %d, tex_end: %d\n", i
,
1877 rp
->node
[i
].alu_offset
,
1878 rp
->node
[i
].tex_offset
,
1879 rp
->node
[i
].alu_end
,
1880 rp
->node
[i
].tex_end
);
1883 fprintf(stderr
, "%08x\n",
1884 ((rp
->tex_end
<< 16) | (R300_PFS_TEXI_0
>> 2)));
1885 for (i
=0;i
<=rp
->tex_end
;i
++)
1886 fprintf(stderr
, "%08x\n", rp
->tex
.inst
[i
]);
1888 /* dump program in pretty_print_command_stream.tcl-readable format */
1889 fprintf(stderr
, "%08x\n",
1890 ((rp
->alu_end
<< 16) | (R300_PFS_INSTR0_0
>> 2)));
1891 for (i
=0;i
<=rp
->alu_end
;i
++)
1892 fprintf(stderr
, "%08x\n", rp
->alu
.inst
[i
].inst0
);
1894 fprintf(stderr
, "%08x\n",
1895 ((rp
->alu_end
<< 16) | (R300_PFS_INSTR1_0
>> 2)));
1896 for (i
=0;i
<=rp
->alu_end
;i
++)
1897 fprintf(stderr
, "%08x\n", rp
->alu
.inst
[i
].inst1
);
1899 fprintf(stderr
, "%08x\n",
1900 ((rp
->alu_end
<< 16) | (R300_PFS_INSTR2_0
>> 2)));
1901 for (i
=0;i
<=rp
->alu_end
;i
++)
1902 fprintf(stderr
, "%08x\n", rp
->alu
.inst
[i
].inst2
);
1904 fprintf(stderr
, "%08x\n",
1905 ((rp
->alu_end
<< 16) | (R300_PFS_INSTR3_0
>> 2)));
1906 for (i
=0;i
<=rp
->alu_end
;i
++)
1907 fprintf(stderr
, "%08x\n", rp
->alu
.inst
[i
].inst3
);
1909 fprintf(stderr
, "00000000\n");