2 * Copyright (C) 2005 Ben Skeggs.
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 * \author Ben Skeggs <darktama@iinet.net.au>
33 * \author Jerome Glisse <j.glisse@gmail.com>
35 * \todo Depth write, WPOS/FOGC inputs
39 * \todo Verify results of opcodes for accuracy, I've only checked them in
46 #include "shader/prog_instruction.h"
47 #include "shader/prog_parameter.h"
48 #include "shader/prog_print.h"
50 #include "r300_context.h"
51 #include "r300_fragprog.h"
53 #include "r300_state.h"
55 /* Mapping Mesa registers to R300 temporaries */
57 int reg
; /* Assigned hw temp */
58 unsigned int refcount
; /* Number of uses by mesa program */
62 * Describe the current lifetime information for an R300 temporary
65 /* Index of the first slot where this register is free in the sense
66 that it can be used as a new destination register.
67 This is -1 if the register has been assigned to a Mesa register
68 and the last access to the register has not yet been emitted */
71 /* Index of the first slot where this register is currently reserved.
72 This is used to stop e.g. a scalar operation from being moved
73 before the allocation time of a register that was first allocated
74 for a vector operation. */
77 /* Index of the first slot in which the register can be used as a
78 source without losing the value that is written by the last
79 emitted instruction that writes to the register */
83 /* Index to the slot where the register was last read.
84 This is also the first slot in which the register may be written again */
90 * Store usage information about an ALU instruction slot during the
91 * compilation of a fragment program.
93 #define SLOT_SRC_VECTOR (1<<0)
94 #define SLOT_SRC_SCALAR (1<<3)
95 #define SLOT_SRC_BOTH (SLOT_SRC_VECTOR | SLOT_SRC_SCALAR)
96 #define SLOT_OP_VECTOR (1<<16)
97 #define SLOT_OP_SCALAR (1<<17)
98 #define SLOT_OP_BOTH (SLOT_OP_VECTOR | SLOT_OP_SCALAR)
100 struct r300_pfs_compile_slot
{
101 /* Bitmask indicating which parts of the slot are used, using SLOT_ constants
105 /* Selected sources */
111 * Store information during compilation of fragment programs.
113 struct r300_pfs_compile_state
{
115 struct r300_fragment_program
*fp
;
117 int nrslots
; /* number of ALU slots used so far */
119 /* Track which (parts of) slots are already filled with instructions */
120 struct r300_pfs_compile_slot slot
[PFS_MAX_ALU_INST
];
122 /* Track the validity of R300 temporaries */
123 struct reg_lifetime hwtemps
[PFS_NUM_TEMP_REGS
];
125 /* Used to map Mesa's inputs/temps onto hardware temps */
127 struct reg_acc temps
[PFS_NUM_TEMP_REGS
];
128 struct reg_acc inputs
[32]; /* don't actually need 32... */
130 /* Track usage of hardware temps, for register allocation,
131 * indirection detection, etc. */
138 * Useful macros and values
140 #define ERROR(fmt, args...) do { \
141 fprintf(stderr, "%s::%s(): " fmt "\n", \
142 __FILE__, __FUNCTION__, ##args); \
143 fp->error = GL_TRUE; \
146 #define PFS_INVAL 0xFFFFFFFF
147 #define COMPILE_STATE \
148 struct r300_fragment_program *fp = cs->fp; \
149 struct r300_fragment_program_code *code = &fp->code; \
152 #define SWIZZLE_XYZ 0
153 #define SWIZZLE_XXX 1
154 #define SWIZZLE_YYY 2
155 #define SWIZZLE_ZZZ 3
156 #define SWIZZLE_WWW 4
157 #define SWIZZLE_YZX 5
158 #define SWIZZLE_ZXY 6
159 #define SWIZZLE_WZY 7
160 #define SWIZZLE_111 8
161 #define SWIZZLE_000 9
162 #define SWIZZLE_HHH 10
164 #define swizzle(r, x, y, z, w) do_swizzle(cs, r, \
171 #define REG_TYPE_INPUT 0
172 #define REG_TYPE_OUTPUT 1
173 #define REG_TYPE_TEMP 2
174 #define REG_TYPE_CONST 3
176 #define REG_TYPE_SHIFT 0
177 #define REG_INDEX_SHIFT 2
178 #define REG_VSWZ_SHIFT 8
179 #define REG_SSWZ_SHIFT 13
180 #define REG_NEGV_SHIFT 18
181 #define REG_NEGS_SHIFT 19
182 #define REG_ABS_SHIFT 20
183 #define REG_NO_USE_SHIFT 21 // Hack for refcounting
184 #define REG_VALID_SHIFT 22 // Does the register contain a defined value?
185 #define REG_BUILTIN_SHIFT 23 // Is it a builtin (like all zero/all one)?
187 #define REG_TYPE_MASK (0x03 << REG_TYPE_SHIFT)
188 #define REG_INDEX_MASK (0x3F << REG_INDEX_SHIFT)
189 #define REG_VSWZ_MASK (0x1F << REG_VSWZ_SHIFT)
190 #define REG_SSWZ_MASK (0x1F << REG_SSWZ_SHIFT)
191 #define REG_NEGV_MASK (0x01 << REG_NEGV_SHIFT)
192 #define REG_NEGS_MASK (0x01 << REG_NEGS_SHIFT)
193 #define REG_ABS_MASK (0x01 << REG_ABS_SHIFT)
194 #define REG_NO_USE_MASK (0x01 << REG_NO_USE_SHIFT)
195 #define REG_VALID_MASK (0x01 << REG_VALID_SHIFT)
196 #define REG_BUILTIN_MASK (0x01 << REG_BUILTIN_SHIFT)
198 #define REG(type, index, vswz, sswz, nouse, valid, builtin) \
199 (((type << REG_TYPE_SHIFT) & REG_TYPE_MASK) | \
200 ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK) | \
201 ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK) | \
202 ((valid << REG_VALID_SHIFT) & REG_VALID_MASK) | \
203 ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK) | \
204 ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK) | \
205 ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
206 #define REG_GET_TYPE(reg) \
207 ((reg & REG_TYPE_MASK) >> REG_TYPE_SHIFT)
208 #define REG_GET_INDEX(reg) \
209 ((reg & REG_INDEX_MASK) >> REG_INDEX_SHIFT)
210 #define REG_GET_VSWZ(reg) \
211 ((reg & REG_VSWZ_MASK) >> REG_VSWZ_SHIFT)
212 #define REG_GET_SSWZ(reg) \
213 ((reg & REG_SSWZ_MASK) >> REG_SSWZ_SHIFT)
214 #define REG_GET_NO_USE(reg) \
215 ((reg & REG_NO_USE_MASK) >> REG_NO_USE_SHIFT)
216 #define REG_GET_VALID(reg) \
217 ((reg & REG_VALID_MASK) >> REG_VALID_SHIFT)
218 #define REG_GET_BUILTIN(reg) \
219 ((reg & REG_BUILTIN_MASK) >> REG_BUILTIN_SHIFT)
220 #define REG_SET_TYPE(reg, type) \
221 reg = ((reg & ~REG_TYPE_MASK) | \
222 ((type << REG_TYPE_SHIFT) & REG_TYPE_MASK))
223 #define REG_SET_INDEX(reg, index) \
224 reg = ((reg & ~REG_INDEX_MASK) | \
225 ((index << REG_INDEX_SHIFT) & REG_INDEX_MASK))
226 #define REG_SET_VSWZ(reg, vswz) \
227 reg = ((reg & ~REG_VSWZ_MASK) | \
228 ((vswz << REG_VSWZ_SHIFT) & REG_VSWZ_MASK))
229 #define REG_SET_SSWZ(reg, sswz) \
230 reg = ((reg & ~REG_SSWZ_MASK) | \
231 ((sswz << REG_SSWZ_SHIFT) & REG_SSWZ_MASK))
232 #define REG_SET_NO_USE(reg, nouse) \
233 reg = ((reg & ~REG_NO_USE_MASK) | \
234 ((nouse << REG_NO_USE_SHIFT) & REG_NO_USE_MASK))
235 #define REG_SET_VALID(reg, valid) \
236 reg = ((reg & ~REG_VALID_MASK) | \
237 ((valid << REG_VALID_SHIFT) & REG_VALID_MASK))
238 #define REG_SET_BUILTIN(reg, builtin) \
239 reg = ((reg & ~REG_BUILTIN_MASK) | \
240 ((builtin << REG_BUILTIN_SHIFT) & REG_BUILTIN_MASK))
241 #define REG_ABS(reg) \
242 reg = (reg | REG_ABS_MASK)
243 #define REG_NEGV(reg) \
244 reg = (reg | REG_NEGV_MASK)
245 #define REG_NEGS(reg) \
246 reg = (reg | REG_NEGS_MASK)
248 #define NOP_INST0 ( \
249 (R300_ALU_OUTC_MAD) | \
250 (R300_ALU_ARGC_ZERO << R300_ALU_ARG0C_SHIFT) | \
251 (R300_ALU_ARGC_ZERO << R300_ALU_ARG1C_SHIFT) | \
252 (R300_ALU_ARGC_ZERO << R300_ALU_ARG2C_SHIFT))
253 #define NOP_INST1 ( \
254 ((0 | SRC_CONST) << R300_ALU_SRC0C_SHIFT) | \
255 ((0 | SRC_CONST) << R300_ALU_SRC1C_SHIFT) | \
256 ((0 | SRC_CONST) << R300_ALU_SRC2C_SHIFT))
257 #define NOP_INST2 ( \
258 (R300_ALU_OUTA_MAD) | \
259 (R300_ALU_ARGA_ZERO << R300_ALU_ARG0A_SHIFT) | \
260 (R300_ALU_ARGA_ZERO << R300_ALU_ARG1A_SHIFT) | \
261 (R300_ALU_ARGA_ZERO << R300_ALU_ARG2A_SHIFT))
262 #define NOP_INST3 ( \
263 ((0 | SRC_CONST) << R300_ALU_SRC0A_SHIFT) | \
264 ((0 | SRC_CONST) << R300_ALU_SRC1A_SHIFT) | \
265 ((0 | SRC_CONST) << R300_ALU_SRC2A_SHIFT))
269 * Data structures for fragment program generation
272 /* description of r300 native hw instructions */
273 static const struct {
280 {"MAD", 3, R300_ALU_OUTC_MAD
, R300_ALU_OUTA_MAD
},
281 {"DP3", 2, R300_ALU_OUTC_DP3
, R300_ALU_OUTA_DP4
},
282 {"DP4", 2, R300_ALU_OUTC_DP4
, R300_ALU_OUTA_DP4
},
283 {"MIN", 2, R300_ALU_OUTC_MIN
, R300_ALU_OUTA_MIN
},
284 {"MAX", 2, R300_ALU_OUTC_MAX
, R300_ALU_OUTA_MAX
},
285 {"CMP", 3, R300_ALU_OUTC_CMP
, R300_ALU_OUTA_CMP
},
286 {"FRC", 1, R300_ALU_OUTC_FRC
, R300_ALU_OUTA_FRC
},
287 {"EX2", 1, R300_ALU_OUTC_REPL_ALPHA
, R300_ALU_OUTA_EX2
},
288 {"LG2", 1, R300_ALU_OUTC_REPL_ALPHA
, R300_ALU_OUTA_LG2
},
289 {"RCP", 1, R300_ALU_OUTC_REPL_ALPHA
, R300_ALU_OUTA_RCP
},
290 {"RSQ", 1, R300_ALU_OUTC_REPL_ALPHA
, R300_ALU_OUTA_RSQ
},
291 {"REPL_ALPHA", 1, R300_ALU_OUTC_REPL_ALPHA
, PFS_INVAL
},
292 {"CMPH", 3, R300_ALU_OUTC_CMPH
, PFS_INVAL
},
296 /* vector swizzles r300 can support natively, with a couple of
297 * cases we handle specially
299 * REG_VSWZ/REG_SSWZ is an index into this table
302 /* mapping from SWIZZLE_* to r300 native values for scalar insns */
303 #define SWIZZLE_HALF 6
305 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, \
309 /* native swizzles */
310 static const struct r300_pfs_swizzle
{
311 GLuint hash
; /* swizzle value this matches */
312 GLuint base
; /* base value for hw swizzle */
313 GLuint stride
; /* difference in base between arg0/1/2 */
317 {MAKE_SWZ3(X
, Y
, Z
), R300_ALU_ARGC_SRC0C_XYZ
, 4, SLOT_SRC_VECTOR
},
318 {MAKE_SWZ3(X
, X
, X
), R300_ALU_ARGC_SRC0C_XXX
, 4, SLOT_SRC_VECTOR
},
319 {MAKE_SWZ3(Y
, Y
, Y
), R300_ALU_ARGC_SRC0C_YYY
, 4, SLOT_SRC_VECTOR
},
320 {MAKE_SWZ3(Z
, Z
, Z
), R300_ALU_ARGC_SRC0C_ZZZ
, 4, SLOT_SRC_VECTOR
},
321 {MAKE_SWZ3(W
, W
, W
), R300_ALU_ARGC_SRC0A
, 1, SLOT_SRC_SCALAR
},
322 {MAKE_SWZ3(Y
, Z
, X
), R300_ALU_ARGC_SRC0C_YZX
, 1, SLOT_SRC_VECTOR
},
323 {MAKE_SWZ3(Z
, X
, Y
), R300_ALU_ARGC_SRC0C_ZXY
, 1, SLOT_SRC_VECTOR
},
324 {MAKE_SWZ3(W
, Z
, Y
), R300_ALU_ARGC_SRC0CA_WZY
, 1, SLOT_SRC_BOTH
},
325 {MAKE_SWZ3(ONE
, ONE
, ONE
), R300_ALU_ARGC_ONE
, 0, 0},
326 {MAKE_SWZ3(ZERO
, ZERO
, ZERO
), R300_ALU_ARGC_ZERO
, 0, 0},
327 {MAKE_SWZ3(HALF
, HALF
, HALF
), R300_ALU_ARGC_HALF
, 0, 0},
328 {PFS_INVAL
, 0, 0, 0},
332 /* used during matching of non-native swizzles */
333 #define SWZ_X_MASK (7 << 0)
334 #define SWZ_Y_MASK (7 << 3)
335 #define SWZ_Z_MASK (7 << 6)
336 #define SWZ_W_MASK (7 << 9)
337 static const struct {
338 GLuint hash
; /* used to mask matching swizzle components */
339 int mask
; /* actual outmask */
340 int count
; /* count of components matched */
343 {SWZ_X_MASK
| SWZ_Y_MASK
| SWZ_Z_MASK
, 1 | 2 | 4, 3},
344 {SWZ_X_MASK
| SWZ_Y_MASK
, 1 | 2, 2},
345 {SWZ_X_MASK
| SWZ_Z_MASK
, 1 | 4, 2},
346 {SWZ_Y_MASK
| SWZ_Z_MASK
, 2 | 4, 2},
350 {PFS_INVAL
, PFS_INVAL
, PFS_INVAL
}
354 static const struct {
355 int base
; /* hw value of swizzle */
356 int stride
; /* difference between SRC0/1/2 */
360 {R300_ALU_ARGA_SRC0C_X
, 3, SLOT_SRC_VECTOR
},
361 {R300_ALU_ARGA_SRC0C_Y
, 3, SLOT_SRC_VECTOR
},
362 {R300_ALU_ARGA_SRC0C_Z
, 3, SLOT_SRC_VECTOR
},
363 {R300_ALU_ARGA_SRC0A
, 1, SLOT_SRC_SCALAR
},
364 {R300_ALU_ARGA_ZERO
, 0, 0},
365 {R300_ALU_ARGA_ONE
, 0, 0},
366 {R300_ALU_ARGA_HALF
, 0, 0}
370 /* boiler-plate reg, for convenience */
371 static const GLuint undef
= REG(REG_TYPE_TEMP
,
379 /* constant one source */
380 static const GLuint pfs_one
= REG(REG_TYPE_CONST
,
388 /* constant half source */
389 static const GLuint pfs_half
= REG(REG_TYPE_CONST
,
397 /* constant zero source */
398 static const GLuint pfs_zero
= REG(REG_TYPE_CONST
,
407 * Common functions prototypes
409 static void dump_program(struct r300_fragment_program
*fp
,
410 struct r300_fragment_program_code
*code
);
411 static void emit_arith(struct r300_pfs_compile_state
*cs
, int op
,
412 GLuint dest
, int mask
,
413 GLuint src0
, GLuint src1
, GLuint src2
, int flags
);
416 * Get an R300 temporary that can be written to in the given slot.
// get_hw_temp: choose a hardware temporary that may be written in `slot`.
// Walks cs->hwtemps[0 .. PFS_NUM_TEMP_REGS) and tests each entry for a
// `free` value that is non-negative and not later than `slot`.
// NOTE(review): the local declarations, closing braces, the statement that
// runs on a match and the return statements are missing from this listing,
// so the return value is intentionally not described here.
418 static int get_hw_temp(struct r300_pfs_compile_state
*cs
, int slot
)
423 for (r
= 0; r
< PFS_NUM_TEMP_REGS
; ++r
) {
424 if (cs
->hwtemps
[r
].free
>= 0 && cs
->hwtemps
[r
].free
<= slot
)
// r reached PFS_NUM_TEMP_REGS: no usable register index was selected.
// NOTE(review): the ERROR macro already appends a newline to its format
// string, so the newline inside this message produces an extra blank line.
428 if (r
>= PFS_NUM_TEMP_REGS
) {
429 ERROR("Out of hardware temps\n");
432 // Reserved is used to avoid the following scenario:
433 // R300 temporary X is first assigned to Mesa temporary Y during vector ops
434 // R300 temporary X is then assigned to Mesa temporary Z for further vector ops
435 // Then scalar ops on Mesa temporary Z are emitted and move back in time
436 // to overwrite the value of temporary Y.
// Remember the slot from which the register was free before claiming it.
438 cs
->hwtemps
[r
].reserved
= cs
->hwtemps
[r
].free
;
// free = -1 flags the register as assigned with its last access not yet
// emitted (see the lifetime field comments earlier in this file).
439 cs
->hwtemps
[r
].free
= -1;
441 // Reset to some value that won't mess things up when the user
442 // tries to read from a temporary that hasn't been assigned a value yet.
443 // In the normal case, vector_valid and scalar_valid should be set to
444 // a sane value by the first emit that writes to this temporary.
445 cs
->hwtemps
[r
].vector_valid
= 0;
446 cs
->hwtemps
[r
].scalar_valid
= 0;
// Keep max_temp_idx at the highest hardware temp index handed out so far.
448 if (r
> fp
->code
.max_temp_idx
)
449 fp
->code
.max_temp_idx
= r
;
455 * Get an R300 temporary that will act as a TEX destination register.
// get_hw_temp_tex: choose a hardware temporary to receive a TEX result.
// Each candidate is tested against the used_in_node bitmask and against a
// `free` value of exactly 0 (the two comparisons below reduce to free == 0).
// NOTE(review): the statements executed when either test succeeds, the
// local declarations and the final return are missing from this listing.
457 static int get_hw_temp_tex(struct r300_pfs_compile_state
*cs
)
462 for (r
= 0; r
< PFS_NUM_TEMP_REGS
; ++r
) {
// Bit r of used_in_node is set: register r was already touched in the
// current node (presumably such registers are skipped - TODO confirm).
463 if (cs
->used_in_node
& (1 << r
))
466 // Note: Be very careful here
467 if (cs
->hwtemps
[r
].free
>= 0 && cs
->hwtemps
[r
].free
<= 0)
// No candidate selected: fall back to the generic allocator for slot 0.
471 if (r
>= PFS_NUM_TEMP_REGS
)
472 return get_hw_temp(cs
, 0); /* Will cause an indirection */
// Same claim sequence as get_hw_temp: save the old free slot, then mark
// the register as assigned by storing -1.
474 cs
->hwtemps
[r
].reserved
= cs
->hwtemps
[r
].free
;
475 cs
->hwtemps
[r
].free
= -1;
477 // Reset to some value that won't mess things up when the user
478 // tries to read from a temporary that hasn't been assigned a value yet.
479 // In the normal case, vector_valid and scalar_valid should be set to
480 // a sane value by the first emit that writes to this temporary.
// Unlike get_hw_temp (which stores 0), the TEX variant stores the current
// ALU slot count, so the value is not readable before slot nrslots.
481 cs
->hwtemps
[r
].vector_valid
= cs
->nrslots
;
482 cs
->hwtemps
[r
].scalar_valid
= cs
->nrslots
;
// Keep max_temp_idx at the highest hardware temp index handed out so far.
484 if (r
> code
->max_temp_idx
)
485 code
->max_temp_idx
= r
;
491 * Mark the given hardware register as free.
// free_hw_temp: release hardware temporary `idx` for later reuse.
// The register becomes free starting at slot nrslots + 1, where nrslots is
// the number of ALU slots used so far.
// NOTE(review): idx is used unchecked as an array index. free_temp passes
// temps[index].reg, which holds -1 while no hardware temp is assigned -
// confirm that callers never reach this function in that state.
493 static void free_hw_temp(struct r300_pfs_compile_state
*cs
, int idx
)
495 // Be very careful here. Consider sequences like
498 // The TEX instruction may be moved in front of the MAD instruction
499 // due to the way nodes work. We don't want to alias r1 and r4 in
501 // I'm certain the register allocation could be further sanitized,
502 // but it's tricky because of stuff that can happen inside emit_tex
504 cs
->hwtemps
[idx
].free
= cs
->nrslots
+ 1;
508 * Create a new Mesa temporary register.
// get_temp_reg: allocate a new Mesa-level temporary and return it in the
// packed REG encoding (type TEMP, the chosen index, valid bit set).
// No hardware temporary is bound yet: temps[index].reg is left at -1.
// NOTE(review): the declarations of r and index, the test guarding the
// error path and the return statements are missing from this listing.
510 static GLuint
get_temp_reg(struct r300_pfs_compile_state
*cs
)
// ffs yields the 1-based position of the lowest set bit of the inverted
// mask, i.e. the lowest clear bit of temp_in_use, or 0 when none is clear.
516 index
= ffs(~cs
->temp_in_use
);
518 ERROR("Out of program temps\n");
// The pre-decrement converts the 1-based ffs result to a bit number.
// NOTE(review): 1 << index is a signed int shift; if bit 31 can be chosen
// this is undefined in C - consider an unsigned constant. TODO confirm.
522 cs
->temp_in_use
|= (1 << --index
);
// refcount is unsigned, so this stores its maximum value; presumably this
// keeps refcount-driven frees from triggering for these temps - TODO confirm.
523 cs
->temps
[index
].refcount
= 0xFFFFFFFF;
524 cs
->temps
[index
].reg
= -1;
526 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
527 REG_SET_INDEX(r
, index
);
528 REG_SET_VALID(r
, GL_TRUE
);
533 * Create a new Mesa temporary register that will act as the destination
534 * register for a texture read.
// get_temp_reg_tex: like get_temp_reg, but the new Mesa temporary is bound
// to a hardware register immediately via get_hw_temp_tex instead of being
// left unassigned (-1).
// NOTE(review): the declarations of r and index, the test guarding the
// error path and the return statements are missing from this listing.
536 static GLuint
get_temp_reg_tex(struct r300_pfs_compile_state
*cs
)
// Lowest clear bit of temp_in_use, 1-based; 0 means every bit is set.
542 index
= ffs(~cs
->temp_in_use
);
544 ERROR("Out of program temps\n");
// The pre-decrement converts the 1-based ffs result to a bit number.
548 cs
->temp_in_use
|= (1 << --index
);
549 cs
->temps
[index
].refcount
= 0xFFFFFFFF;
// Bind a TEX-suitable hardware temporary right away.
550 cs
->temps
[index
].reg
= get_hw_temp_tex(cs
);
552 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
553 REG_SET_INDEX(r
, index
);
554 REG_SET_VALID(r
, GL_TRUE
);
559 * Free a Mesa temporary and the associated R300 temporary.
// free_temp: release the hardware temporary behind packed register `r`.
// TEMP registers also drop their temp_in_use bit; INPUT registers only
// release the hardware register recorded in cs->inputs.
561 static void free_temp(struct r300_pfs_compile_state
*cs
, GLuint r
)
563 GLuint index
= REG_GET_INDEX(r
);
// NOTE(review): this test consults temp_in_use before the register type is
// examined, so it also gates the INPUT branch below - confirm that this is
// intended. The statement executed when the bit is clear is not visible.
565 if (!(cs
->temp_in_use
& (1 << index
)))
568 if (REG_GET_TYPE(r
) == REG_TYPE_TEMP
) {
569 free_hw_temp(cs
, cs
->temps
[index
].reg
);
// -1 marks the Mesa temporary as having no hardware register.
570 cs
->temps
[index
].reg
= -1;
571 cs
->temp_in_use
&= ~(1 << index
);
572 } else if (REG_GET_TYPE(r
) == REG_TYPE_INPUT
) {
573 free_hw_temp(cs
, cs
->inputs
[index
].reg
);
574 cs
->inputs
[index
].reg
= -1;
579 * Emit a hardware constant/parameter.
581 * \p cp Stable pointer to an array of 4 floats.
582 * The pointer must be stable in the sense that it remains to be valid
583 * and hold the contents of the constant/parameter throughout the lifetime
584 * of the fragment program (actually, up until the next time the fragment
585 * program is translated).
// emit_const4fv: map the constant pointer cp to a hardware constant index
// and return it as a packed REG of type CONST with the valid bit set.
// Constants are matched by pointer identity, not by value.
// NOTE(review): the rest of the parameter list, the local declarations, the
// loop exit and the return statements are missing from this listing.
587 static GLuint
emit_const4fv(struct r300_pfs_compile_state
*cs
,
// Search the const_nr constants registered so far for the same pointer.
594 for (index
= 0; index
< code
->const_nr
; ++index
) {
595 if (code
->constant
[index
] == cp
)
// index reached const_nr: cp is not registered yet.
599 if (index
>= code
->const_nr
) {
// NOTE(review): the ERROR macro already appends a newline, so this message
// ends with a blank line.
600 if (index
>= PFS_NUM_CONST_REGS
) {
601 ERROR("Out of hw constants!\n");
// Register cp in the first unused entry.
606 code
->constant
[index
] = cp
;
609 REG_SET_TYPE(reg
, REG_TYPE_CONST
);
610 REG_SET_INDEX(reg
, index
);
611 REG_SET_VALID(reg
, GL_TRUE
);
615 static inline GLuint
negate(GLuint r
)
622 /* Hack, to prevent clobbering sources used multiple times when
623 * emulating non-native instructions
625 static inline GLuint
keep(GLuint r
)
627 REG_SET_NO_USE(r
, GL_TRUE
);
631 static inline GLuint
absolute(GLuint r
)
637 static int swz_native(struct r300_pfs_compile_state
*cs
,
638 GLuint src
, GLuint
* r
, GLuint arbneg
)
642 /* Native swizzle, handle negation */
643 src
= (src
& ~REG_NEGS_MASK
) | (((arbneg
>> 3) & 1) << REG_NEGS_SHIFT
);
645 if ((arbneg
& 0x7) == 0x0) {
646 src
= src
& ~REG_NEGV_MASK
;
648 } else if ((arbneg
& 0x7) == 0x7) {
649 src
|= REG_NEGV_MASK
;
652 if (!REG_GET_VALID(*r
))
653 *r
= get_temp_reg(cs
);
654 src
|= REG_NEGV_MASK
;
657 *r
, arbneg
& 0x7, keep(src
), pfs_one
, pfs_zero
, 0);
658 src
= src
& ~REG_NEGV_MASK
;
662 (arbneg
^ 0x7) | WRITEMASK_W
,
663 src
, pfs_one
, pfs_zero
, 0);
669 static int swz_emit_partial(struct r300_pfs_compile_state
*cs
,
671 GLuint
* r
, int mask
, int mc
, GLuint arbneg
)
677 if (!REG_GET_VALID(*r
))
678 *r
= get_temp_reg(cs
);
680 /* A partial match, VSWZ/mask define what parts of the
681 * desired swizzle we match
683 if (mc
+ s_mask
[mask
].count
== 3) {
685 src
|= ((arbneg
>> 3) & 1) << REG_NEGS_SHIFT
;
688 tmp
= arbneg
& s_mask
[mask
].mask
;
690 tmp
= tmp
^ s_mask
[mask
].mask
;
695 arbneg
& s_mask
[mask
].mask
,
696 keep(src
) | REG_NEGV_MASK
,
697 pfs_one
, pfs_zero
, 0);
699 REG_SET_NO_USE(src
, GL_TRUE
);
701 REG_SET_NO_USE(src
, GL_FALSE
);
705 *r
, tmp
| wmask
, src
, pfs_one
, pfs_zero
, 0);
708 REG_SET_NO_USE(src
, GL_TRUE
);
710 REG_SET_NO_USE(src
, GL_FALSE
);
715 (arbneg
& s_mask
[mask
].mask
) | wmask
,
716 src
| REG_NEGV_MASK
, pfs_one
, pfs_zero
, 0);
720 REG_SET_NO_USE(src
, GL_TRUE
);
722 REG_SET_NO_USE(src
, GL_FALSE
);
724 emit_arith(cs
, PFS_OP_MAD
,
726 s_mask
[mask
].mask
| wmask
,
727 src
, pfs_one
, pfs_zero
, 0);
730 return s_mask
[mask
].count
;
// do_swizzle: apply the Mesa swizzle arbswz and negate mask arbneg to the
// packed source register src, using the v_swiz and s_mask tables and the
// swz_native / swz_emit_partial helpers.
// NOTE(review): many lines of this function are missing from this listing
// (declarations, several call heads, loop openers, returns), so only the
// visible statements are annotated.
733 static GLuint
do_swizzle(struct r300_pfs_compile_state
*cs
,
734 GLuint src
, GLuint arbswz
, GLuint arbneg
)
742 /* If swizzling from something without an XYZW native swizzle,
743 * emit result to a temp, and do new swizzle from the temp.
746 if (REG_GET_VSWZ(src
) != SWIZZLE_XYZ
|| REG_GET_SSWZ(src
) != SWIZZLE_W
) {
// NOTE(review): get_temp_reg is declared with the compile state (cs) as its
// only parameter, but fp is passed here. Lines around this statement are
// missing from the listing, so this span may be compiled out (for example
// by a preprocessor guard) - confirm before relying on it.
747 GLuint temp
= get_temp_reg(fp
);
750 temp
, WRITEMASK_XYZW
, src
, pfs_one
, pfs_zero
, 0);
// Same condition as above: src already carries a non-identity swizzle.
755 if (REG_GET_VSWZ(src
) != SWIZZLE_XYZ
|| REG_GET_SSWZ(src
) != SWIZZLE_W
) {
// Combine the XYZ fields of the current vector swizzle hash with the
// scalar swizzle placed in the W field (bit 9 upward).
757 (v_swiz
[REG_GET_VSWZ(src
)].
758 hash
& (SWZ_X_MASK
| SWZ_Y_MASK
| SWZ_Z_MASK
)) |
759 REG_GET_SSWZ(src
) << 9;
// Visit each of the four components of the requested swizzle.
764 for (i
= 0; i
< 4; ++i
) {
765 offset
= GET_SWZ(arbswz
, i
);
768 (offset
<= 3) ? GET_SWZ(vsrcswz
,
// XYZ fields become the new vector request; component 3 becomes the scalar
// swizzle.
773 arbswz
= newswz
& (SWZ_X_MASK
| SWZ_Y_MASK
| SWZ_Z_MASK
);
774 REG_SET_SSWZ(src
, GET_SWZ(newswz
, 3));
776 /* set scalar swizzling */
777 REG_SET_SSWZ(src
, GET_SWZ(arbswz
, 3));
781 vswz
= REG_GET_VSWZ(src
);
785 REG_SET_VSWZ(src
, vswz
);
786 chash
= v_swiz
[REG_GET_VSWZ(src
)].hash
&
// The candidate native swizzle matches the request on the components
// selected by the current s_mask entry.
789 if (chash
== (arbswz
& s_mask
[c_mask
].hash
)) {
// A count of 3 means all of X, Y and Z matched at once.
790 if (s_mask
[c_mask
].count
== 3) {
791 v_match
+= swz_native(cs
,
794 v_match
+= swz_emit_partial(cs
,
805 /* Fill with something invalid.. all 0's was
806 * wrong before, matched SWIZZLE_X. So all
807 * 1's will be okay for now
809 arbswz
|= (PFS_INVAL
& s_mask
[c_mask
].hash
);
// Both tables end with an entry whose hash is PFS_INVAL.
811 } while (v_swiz
[++vswz
].hash
!= PFS_INVAL
);
812 REG_SET_VSWZ(src
, SWIZZLE_XYZ
);
813 } while (s_mask
[++c_mask
].hash
!= PFS_INVAL
);
815 ERROR("should NEVER get here\n");
819 static GLuint
t_src(struct r300_pfs_compile_state
*cs
,
820 struct prog_src_register fpsrc
)
825 switch (fpsrc
.File
) {
826 case PROGRAM_TEMPORARY
:
827 REG_SET_INDEX(r
, fpsrc
.Index
);
828 REG_SET_VALID(r
, GL_TRUE
);
829 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
832 REG_SET_INDEX(r
, fpsrc
.Index
);
833 REG_SET_VALID(r
, GL_TRUE
);
834 REG_SET_TYPE(r
, REG_TYPE_INPUT
);
836 case PROGRAM_LOCAL_PARAM
:
837 r
= emit_const4fv(cs
,
838 fp
->mesa_program
.Base
.LocalParams
[fpsrc
.
841 case PROGRAM_ENV_PARAM
:
842 r
= emit_const4fv(cs
,
843 cs
->r300
->radeon
.glCtx
->FragmentProgram
.Parameters
[fpsrc
.Index
]);
845 case PROGRAM_STATE_VAR
:
846 case PROGRAM_NAMED_PARAM
:
847 case PROGRAM_CONSTANT
:
848 r
= emit_const4fv(cs
,
849 fp
->mesa_program
.Base
.Parameters
->
850 ParameterValues
[fpsrc
.Index
]);
853 ERROR("unknown SrcReg->File %x\n", fpsrc
.File
);
857 /* no point swizzling ONE/ZERO/HALF constants... */
858 if (REG_GET_VSWZ(r
) < SWIZZLE_111
|| REG_GET_SSWZ(r
) < SWIZZLE_ZERO
)
859 r
= do_swizzle(cs
, r
, fpsrc
.Swizzle
, fpsrc
.NegateBase
);
// t_scalar_src: translate a Mesa source register for a scalar operation.
// The swizzle selector of component 0 (X) is copied into all four 3-bit
// swizzle fields, and the modified register is passed to t_src.
863 static GLuint
t_scalar_src(struct r300_pfs_compile_state
*cs
,
864 struct prog_src_register fpsrc
)
// Work on a copy so the by-value argument is left as received.
866 struct prog_src_register src
= fpsrc
;
867 int sc
= GET_SWZ(fpsrc
.Swizzle
, 0); /* X */
// Fields sit at bit offsets 0, 3, 6 and 9 (compare the SWZ_*_MASK macros).
869 src
.Swizzle
= ((sc
<< 0) | (sc
<< 3) | (sc
<< 6) | (sc
<< 9));
871 return t_src(cs
, src
);
// t_dst: translate a Mesa destination register into the packed REG form.
// Temporaries keep their index and become type TEMP; outputs become type
// OUTPUT and are only marked valid for the colour and depth results.
// NOTE(review): the declaration of r, the outer switch header, one case
// label, the break statements and the return are missing from this listing.
874 static GLuint
t_dst(struct r300_pfs_compile_state
*cs
,
875 struct prog_dst_register dest
)
881 case PROGRAM_TEMPORARY
:
882 REG_SET_INDEX(r
, dest
.Index
);
883 REG_SET_VALID(r
, GL_TRUE
);
884 REG_SET_TYPE(r
, REG_TYPE_TEMP
);
// Output branch (its case label is not visible in this listing).
887 REG_SET_TYPE(r
, REG_TYPE_OUTPUT
);
888 switch (dest
.Index
) {
889 case FRAG_RESULT_COLR
:
890 case FRAG_RESULT_DEPR
:
891 REG_SET_INDEX(r
, dest
.Index
);
892 REG_SET_VALID(r
, GL_TRUE
);
// Any other output index is reported as an error.
895 ERROR("Bad DstReg->Index 0x%x\n", dest
.Index
);
// Any other register file is reported as an error.
899 ERROR("Bad DstReg->File 0x%x\n", dest
.File
);
// t_hw_src: resolve packed source register src to a hardware source index.
// Temporaries and inputs map to their hardware temp (with refcounting);
// constants map to index | SRC_CONST.
// NOTE(review): the declaration of idx, the case labels, the statement run
// when a temporary refcount reaches zero, any use of the tex argument and
// the final return are missing from this listing.
904 static int t_hw_src(struct r300_pfs_compile_state
*cs
, GLuint src
, GLboolean tex
)
908 int index
= REG_GET_INDEX(src
);
910 switch (REG_GET_TYPE(src
)) {
912 /* NOTE: if reg==-1 here, a source is being read that
913 * hasn't been written to. Undefined results.
// Bind a hardware temp on demand so the read has a register to name.
915 if (cs
->temps
[index
].reg
== -1)
916 cs
->temps
[index
].reg
= get_hw_temp(cs
, cs
->nrslots
);
918 idx
= cs
->temps
[index
].reg
;
// Sources flagged NO_USE (see keep) do not consume a reference.
920 if (!REG_GET_NO_USE(src
) && (--cs
->temps
[index
].refcount
== 0))
// Input branch: same scheme using the inputs table.
924 idx
= cs
->inputs
[index
].reg
;
926 if (!REG_GET_NO_USE(src
) && (--cs
->inputs
[index
].refcount
== 0))
927 free_hw_temp(cs
, cs
->inputs
[index
].reg
);
// Constants are addressed directly by index with the SRC_CONST flag.
930 return (index
| SRC_CONST
);
932 ERROR("Invalid type for source reg\n");
933 return (0 | SRC_CONST
);
// Record that hardware temp idx has been used in the current node.
937 cs
->used_in_node
|= (1 << idx
);
// t_hw_dst: resolve packed destination register dest to a hardware index
// and update per-node bookkeeping (dest_in_node, used_in_node, node flags).
// NOTE(review): the declaration of idx, several case labels, the condition
// choosing between the two allocators, the statement run when the refcount
// reaches zero and the return are missing from this listing.
942 static int t_hw_dst(struct r300_pfs_compile_state
*cs
,
943 GLuint dest
, GLboolean tex
, int slot
)
947 GLuint index
= REG_GET_INDEX(dest
);
948 assert(REG_GET_VALID(dest
));
950 switch (REG_GET_TYPE(dest
)) {
// No hardware temp bound yet: allocate one now. Two allocators appear
// below; presumably the tex argument selects between them - TODO confirm.
952 if (cs
->temps
[REG_GET_INDEX(dest
)].reg
== -1) {
954 cs
->temps
[index
].reg
= get_hw_temp(cs
, slot
);
956 cs
->temps
[index
].reg
= get_hw_temp_tex(cs
);
959 idx
= cs
->temps
[index
].reg
;
// Destinations flagged NO_USE do not consume a reference.
961 if (!REG_GET_NO_USE(dest
) && (--cs
->temps
[index
].refcount
== 0))
// The register counts as both written and used in the current node.
964 cs
->dest_in_node
|= (1 << idx
);
965 cs
->used_in_node
|= (1 << idx
);
967 case REG_TYPE_OUTPUT
:
// Colour output: flag the current node as writing RGBA.
969 case FRAG_RESULT_COLR
:
970 code
->node
[code
->cur_node
].flags
|= R300_RGBA_OUT
;
// Depth output: note it on the program and flag the node as writing W.
972 case FRAG_RESULT_DEPR
:
973 fp
->WritesDepth
= GL_TRUE
;
974 code
->node
[code
->cur_node
].flags
|= R300_W_OUT
;
980 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest
));
// emit_nop: fill ALU instruction slot cs->nrslots with the four NOP words.
// Reports an error when nrslots has already reached PFS_MAX_ALU_INST.
// NOTE(review): the statements after the error report and after the four
// stores (for example any update of nrslots) are missing from this listing.
987 static void emit_nop(struct r300_pfs_compile_state
*cs
)
991 if (cs
->nrslots
>= PFS_MAX_ALU_INST
) {
992 ERROR("Out of ALU instruction slots\n");
// One store per instruction word, inst0 through inst3.
996 code
->alu
.inst
[cs
->nrslots
].inst0
= NOP_INST0
;
997 code
->alu
.inst
[cs
->nrslots
].inst1
= NOP_INST1
;
998 code
->alu
.inst
[cs
->nrslots
].inst2
= NOP_INST2
;
999 code
->alu
.inst
[cs
->nrslots
].inst3
= NOP_INST3
;
1003 static void emit_tex(struct r300_pfs_compile_state
*cs
,
1004 struct prog_instruction
*fpi
, int opcode
)
1007 GLuint coord
= t_src(cs
, fpi
->SrcReg
[0]);
1008 GLuint dest
= undef
, rdest
= undef
;
1010 int unit
= fpi
->TexSrcUnit
;
1015 * Hardware uses [0..1]x[0..1] range for rectangle textures
1016 * instead of [0..Width]x[0..Height].
1017 * Add a scaling instruction.
1019 * \todo Refactor this once we have proper rewriting/optimization
1020 * support for programs.
1022 if (opcode
!= R300_TEX_OP_KIL
&& fpi
->TexSrcTarget
== TEXTURE_RECT_INDEX
) {
1023 gl_state_index tokens
[STATE_LENGTH
] = {
1024 STATE_INTERNAL
, STATE_R300_TEXRECT_FACTOR
, 0, 0,
1032 _mesa_add_state_reference(cs
->fp
->mesa_program
.Base
.
1033 Parameters
, tokens
);
1036 cs
->fp
->mesa_program
.Base
.Parameters
->
1037 ParameterValues
[factor_index
]);
1038 tempreg
= keep(get_temp_reg(cs
));
1040 emit_arith(cs
, PFS_OP_MAD
, tempreg
, WRITEMASK_XYZW
,
1041 coord
, factorreg
, pfs_zero
, 0);
1046 /* Texture operations do not support swizzles etc. in hardware,
1047 * so emit an additional arithmetic operation if necessary.
1049 if (REG_GET_VSWZ(coord
) != SWIZZLE_XYZ
||
1050 REG_GET_SSWZ(coord
) != SWIZZLE_W
||
1051 coord
& (REG_NEGV_MASK
| REG_NEGS_MASK
| REG_ABS_MASK
)) {
1052 assert(tempreg
== 0);
1053 tempreg
= keep(get_temp_reg(cs
));
1054 emit_arith(cs
, PFS_OP_MAD
, tempreg
, WRITEMASK_XYZW
,
1055 coord
, pfs_one
, pfs_zero
, 0);
1059 /* Ensure correct node indirection */
1060 uin
= cs
->used_in_node
;
1061 din
= cs
->dest_in_node
;
1063 /* Resolve source/dest to hardware registers */
1064 hwsrc
= t_hw_src(cs
, coord
, GL_TRUE
);
1066 if (opcode
!= R300_TEX_OP_KIL
) {
1067 dest
= t_dst(cs
, fpi
->DstReg
);
1069 /* r300 doesn't seem to be able to do TEX->output reg */
1070 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1072 dest
= get_temp_reg_tex(cs
);
1073 } else if (fpi
->DstReg
.WriteMask
!= WRITEMASK_XYZW
) {
1074 /* in case write mask isn't XYZW */
1076 dest
= get_temp_reg_tex(cs
);
1079 t_hw_dst(cs
, dest
, GL_TRUE
,
1080 code
->node
[code
->cur_node
].alu_offset
);
1082 /* Use a temp that hasn't been used in this node, rather
1083 * than causing an indirection
1085 if (uin
& (1 << hwdest
)) {
1086 free_hw_temp(cs
, hwdest
);
1087 hwdest
= get_hw_temp_tex(cs
);
1088 cs
->temps
[REG_GET_INDEX(dest
)].reg
= hwdest
;
1095 /* Indirection if source has been written in this node, or if the
1096 * dest has been read/written in this node
1098 if ((REG_GET_TYPE(coord
) != REG_TYPE_CONST
&&
1099 (din
& (1 << hwsrc
))) || (uin
& (1 << hwdest
))) {
1101 /* Finish off current node */
1102 if (code
->node
[code
->cur_node
].alu_offset
== cs
->nrslots
)
1105 code
->node
[code
->cur_node
].alu_end
=
1106 cs
->nrslots
- code
->node
[code
->cur_node
].alu_offset
- 1;
1107 assert(code
->node
[code
->cur_node
].alu_end
>= 0);
1109 if (++code
->cur_node
>= PFS_MAX_TEX_INDIRECT
) {
1110 ERROR("too many levels of texture indirection\n");
1114 /* Start new node */
1115 code
->node
[code
->cur_node
].tex_offset
= code
->tex
.length
;
1116 code
->node
[code
->cur_node
].alu_offset
= cs
->nrslots
;
1117 code
->node
[code
->cur_node
].tex_end
= -1;
1118 code
->node
[code
->cur_node
].alu_end
= -1;
1119 code
->node
[code
->cur_node
].flags
= 0;
1120 cs
->used_in_node
= 0;
1121 cs
->dest_in_node
= 0;
1124 if (code
->cur_node
== 0)
1125 code
->first_node_has_tex
= 1;
1127 code
->tex
.inst
[code
->tex
.length
++] = 0 | (hwsrc
<< R300_SRC_ADDR_SHIFT
)
1128 | (hwdest
<< R300_DST_ADDR_SHIFT
)
1129 | (unit
<< R300_TEX_ID_SHIFT
)
1130 | (opcode
<< R300_TEX_INST_SHIFT
);
1132 cs
->dest_in_node
|= (1 << hwdest
);
1133 if (REG_GET_TYPE(coord
) != REG_TYPE_CONST
)
1134 cs
->used_in_node
|= (1 << hwsrc
);
1136 code
->node
[code
->cur_node
].tex_end
++;
1138 /* Copy from temp to output if needed */
1139 if (REG_GET_VALID(rdest
)) {
1140 emit_arith(cs
, PFS_OP_MAD
, rdest
, fpi
->DstReg
.WriteMask
, dest
,
1141 pfs_one
, pfs_zero
, 0);
1142 free_temp(cs
, dest
);
1145 /* Free temp register */
1147 free_temp(cs
, tempreg
);
1151 * Returns the first slot where we could possibly allow writing to dest,
1152 * according to register allocation.
// get_earliest_allowed_write: compute the first slot in which dest may be
// written for the given write mask. For a temporary with a hardware
// register the result starts at that register's reserved slot and is pushed
// later to the last-read slot of each part (vector, scalar) that mask touches.
// NOTE(review): the declarations of idx and pos, the TEMP case label, the
// statements for an unbound temporary and for outputs, and the return are
// missing from this listing.
1154 static int get_earliest_allowed_write(struct r300_pfs_compile_state
*cs
,
1155 GLuint dest
, int mask
)
1160 GLuint index
= REG_GET_INDEX(dest
);
1161 assert(REG_GET_VALID(dest
));
1163 switch (REG_GET_TYPE(dest
)) {
// Temporary without a hardware register yet (action not visible here).
1165 if (cs
->temps
[index
].reg
== -1)
1168 idx
= cs
->temps
[index
].reg
;
1170 case REG_TYPE_OUTPUT
:
1173 ERROR("invalid dest reg type %d\n", REG_GET_TYPE(dest
));
// Never write earlier than the slot from which the register is reserved.
1177 pos
= cs
->hwtemps
[idx
].reserved
;
// Writing any of X, Y, Z must not precede the last vector read.
1178 if (mask
& WRITEMASK_XYZ
) {
1179 if (pos
< cs
->hwtemps
[idx
].vector_lastread
)
1180 pos
= cs
->hwtemps
[idx
].vector_lastread
;
// Writing W must not precede the last scalar read.
1182 if (mask
& WRITEMASK_W
) {
1183 if (pos
< cs
->hwtemps
[idx
].scalar_lastread
)
1184 pos
= cs
->hwtemps
[idx
].scalar_lastread
;
1191 * Allocates a slot for an ALU instruction that can consist of
1192 * a vector part or a scalar part or both.
1194 * Sources from src (src[0] to src[argc-1]) are added to the slot in the
1195 * appropriate position (vector and/or scalar), and their positions are
1196 * recorded in the srcpos array.
1198 * This function emits instruction code for the source fetch and the
1199 * argument selection. It does not emit instruction code for the
1200 * opcode or the destination selection.
1202 * @return the index of the slot
1204 static int find_and_prepare_slot(struct r300_pfs_compile_state
*cs
,
1207 int argc
, GLuint
* src
, GLuint dest
, int mask
)
1220 // Determine instruction slots, whether sources are required on
1221 // vector or scalar side, and the smallest slot number where
1222 // all source registers are available
1225 used
|= SLOT_OP_VECTOR
;
1227 used
|= SLOT_OP_SCALAR
;
1229 pos
= get_earliest_allowed_write(cs
, dest
, mask
);
1231 if (code
->node
[code
->cur_node
].alu_offset
> pos
)
1232 pos
= code
->node
[code
->cur_node
].alu_offset
;
1233 for (i
= 0; i
< argc
; ++i
) {
1234 if (!REG_GET_BUILTIN(src
[i
])) {
1236 used
|= v_swiz
[REG_GET_VSWZ(src
[i
])].flags
<< i
;
1238 used
|= s_swiz
[REG_GET_SSWZ(src
[i
])].flags
<< i
;
1241 hwsrc
[i
] = t_hw_src(cs
, src
[i
], GL_FALSE
); /* Note: sideeffects wrt refcounting! */
1242 regnr
= hwsrc
[i
] & 31;
1244 if (REG_GET_TYPE(src
[i
]) == REG_TYPE_TEMP
) {
1245 if (used
& (SLOT_SRC_VECTOR
<< i
)) {
1246 if (cs
->hwtemps
[regnr
].vector_valid
> pos
)
1247 pos
= cs
->hwtemps
[regnr
].vector_valid
;
1249 if (used
& (SLOT_SRC_SCALAR
<< i
)) {
1250 if (cs
->hwtemps
[regnr
].scalar_valid
> pos
)
1251 pos
= cs
->hwtemps
[regnr
].scalar_valid
;
1256 // Find a slot that fits
1258 if (cs
->slot
[pos
].used
& used
& SLOT_OP_BOTH
)
1261 if (pos
>= cs
->nrslots
) {
1262 if (cs
->nrslots
>= PFS_MAX_ALU_INST
) {
1263 ERROR("Out of ALU instruction slots\n");
1267 fp
->code
.alu
.inst
[pos
].inst0
= NOP_INST0
;
1268 fp
->code
.alu
.inst
[pos
].inst1
= NOP_INST1
;
1269 fp
->code
.alu
.inst
[pos
].inst2
= NOP_INST2
;
1270 fp
->code
.alu
.inst
[pos
].inst3
= NOP_INST3
;
1274 // Note: When we need both parts (vector and scalar) of a source,
1275 // we always try to put them into the same position. This makes the
1276 // code easier to read, and it is optimal (i.e. one doesn't gain
1277 // anything by splitting the parts).
1278 // It also avoids headaches with swizzles that access both parts (e.g. WXY)
1279 tempused
= cs
->slot
[pos
].used
;
1280 for (i
= 0; i
< 3; ++i
) {
1281 tempvsrc
[i
] = cs
->slot
[pos
].vsrc
[i
];
1282 tempssrc
[i
] = cs
->slot
[pos
].ssrc
[i
];
1285 for (i
= 0; i
< argc
; ++i
) {
1286 int flags
= (used
>> i
) & SLOT_SRC_BOTH
;
1293 for (j
= 0; j
< 3; ++j
) {
1294 if ((tempused
>> j
) & flags
& SLOT_SRC_VECTOR
) {
1295 if (tempvsrc
[j
] != hwsrc
[i
])
1299 if ((tempused
>> j
) & flags
& SLOT_SRC_SCALAR
) {
1300 if (tempssrc
[j
] != hwsrc
[i
])
1311 tempused
|= flags
<< j
;
1312 if (flags
& SLOT_SRC_VECTOR
)
1313 tempvsrc
[j
] = hwsrc
[i
];
1314 if (flags
& SLOT_SRC_SCALAR
)
1315 tempssrc
[j
] = hwsrc
[i
];
1322 // Found a slot, reserve it
1323 cs
->slot
[pos
].used
= tempused
| (used
& SLOT_OP_BOTH
);
1324 for (i
= 0; i
< 3; ++i
) {
1325 cs
->slot
[pos
].vsrc
[i
] = tempvsrc
[i
];
1326 cs
->slot
[pos
].ssrc
[i
] = tempssrc
[i
];
1329 for (i
= 0; i
< argc
; ++i
) {
1330 if (REG_GET_TYPE(src
[i
]) == REG_TYPE_TEMP
) {
1331 int regnr
= hwsrc
[i
] & 31;
1333 if (used
& (SLOT_SRC_VECTOR
<< i
)) {
1334 if (cs
->hwtemps
[regnr
].vector_lastread
< pos
)
1335 cs
->hwtemps
[regnr
].vector_lastread
=
1338 if (used
& (SLOT_SRC_SCALAR
<< i
)) {
1339 if (cs
->hwtemps
[regnr
].scalar_lastread
< pos
)
1340 cs
->hwtemps
[regnr
].scalar_lastread
=
1346 // Emit the source fetch code
1347 code
->alu
.inst
[pos
].inst1
&= ~R300_ALU_SRC_MASK
;
1348 code
->alu
.inst
[pos
].inst1
|=
1349 ((cs
->slot
[pos
].vsrc
[0] << R300_ALU_SRC0C_SHIFT
) |
1350 (cs
->slot
[pos
].vsrc
[1] << R300_ALU_SRC1C_SHIFT
) |
1351 (cs
->slot
[pos
].vsrc
[2] << R300_ALU_SRC2C_SHIFT
));
1353 code
->alu
.inst
[pos
].inst3
&= ~R300_ALU_SRC_MASK
;
1354 code
->alu
.inst
[pos
].inst3
|=
1355 ((cs
->slot
[pos
].ssrc
[0] << R300_ALU_SRC0A_SHIFT
) |
1356 (cs
->slot
[pos
].ssrc
[1] << R300_ALU_SRC1A_SHIFT
) |
1357 (cs
->slot
[pos
].ssrc
[2] << R300_ALU_SRC2A_SHIFT
));
1359 // Emit the argument selection code
1363 for (i
= 0; i
< 3; ++i
) {
1365 swz
[i
] = (v_swiz
[REG_GET_VSWZ(src
[i
])].base
+
1367 v_swiz
[REG_GET_VSWZ(src
[i
])].
1368 stride
)) | ((src
[i
] & REG_NEGV_MASK
)
1369 ? ARG_NEG
: 0) | ((src
[i
]
1376 swz
[i
] = R300_ALU_ARGC_ZERO
;
1380 code
->alu
.inst
[pos
].inst0
&=
1381 ~(R300_ALU_ARG0C_MASK
| R300_ALU_ARG1C_MASK
|
1382 R300_ALU_ARG2C_MASK
);
1383 code
->alu
.inst
[pos
].inst0
|=
1384 (swz
[0] << R300_ALU_ARG0C_SHIFT
) | (swz
[1] <<
1385 R300_ALU_ARG1C_SHIFT
)
1386 | (swz
[2] << R300_ALU_ARG2C_SHIFT
);
1392 for (i
= 0; i
< 3; ++i
) {
1394 swz
[i
] = (s_swiz
[REG_GET_SSWZ(src
[i
])].base
+
1396 s_swiz
[REG_GET_SSWZ(src
[i
])].
1397 stride
)) | ((src
[i
] & REG_NEGV_MASK
)
1398 ? ARG_NEG
: 0) | ((src
[i
]
1405 swz
[i
] = R300_ALU_ARGA_ZERO
;
1409 code
->alu
.inst
[pos
].inst2
&=
1410 ~(R300_ALU_ARG0A_MASK
| R300_ALU_ARG1A_MASK
|
1411 R300_ALU_ARG2A_MASK
);
1412 code
->alu
.inst
[pos
].inst2
|=
1413 (swz
[0] << R300_ALU_ARG0A_SHIFT
) | (swz
[1] <<
1414 R300_ALU_ARG1A_SHIFT
)
1415 | (swz
[2] << R300_ALU_ARG2A_SHIFT
);
1422 * Append an ALU instruction to the instruction list.
1424 static void emit_arith(struct r300_pfs_compile_state
*cs
,
1428 GLuint src0
, GLuint src1
, GLuint src2
, int flags
)
1431 GLuint src
[3] = { src0
, src1
, src2
};
1433 GLboolean emit_vop
, emit_sop
;
1437 vop
= r300_fpop
[op
].v_op
;
1438 sop
= r300_fpop
[op
].s_op
;
1439 argc
= r300_fpop
[op
].argc
;
1441 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
&&
1442 REG_GET_INDEX(dest
) == FRAG_RESULT_DEPR
) {
1443 if (mask
& WRITEMASK_Z
) {
1450 emit_vop
= GL_FALSE
;
1451 emit_sop
= GL_FALSE
;
1452 if ((mask
& WRITEMASK_XYZ
) || vop
== R300_ALU_OUTC_DP3
)
1454 if ((mask
& WRITEMASK_W
) || vop
== R300_ALU_OUTC_REPL_ALPHA
)
1458 find_and_prepare_slot(cs
, emit_vop
, emit_sop
, argc
, src
, dest
,
1463 hwdest
= t_hw_dst(cs
, dest
, GL_FALSE
, pos
); /* Note: Side effects wrt register allocation */
1465 if (flags
& PFS_FLAG_SAT
) {
1466 vop
|= R300_ALU_OUTC_CLAMP
;
1467 sop
|= R300_ALU_OUTA_CLAMP
;
1470 /* Throw the pieces together and get ALU/1 */
1472 code
->alu
.inst
[pos
].inst0
|= vop
;
1474 code
->alu
.inst
[pos
].inst1
|= hwdest
<< R300_ALU_DSTC_SHIFT
;
1476 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1477 if (REG_GET_INDEX(dest
) == FRAG_RESULT_COLR
) {
1478 code
->alu
.inst
[pos
].inst1
|=
1479 (mask
& WRITEMASK_XYZ
) <<
1480 R300_ALU_DSTC_OUTPUT_MASK_SHIFT
;
1484 code
->alu
.inst
[pos
].inst1
|=
1485 (mask
& WRITEMASK_XYZ
) <<
1486 R300_ALU_DSTC_REG_MASK_SHIFT
;
1488 cs
->hwtemps
[hwdest
].vector_valid
= pos
+ 1;
1494 code
->alu
.inst
[pos
].inst2
|= sop
;
1496 if (mask
& WRITEMASK_W
) {
1497 if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1498 if (REG_GET_INDEX(dest
) == FRAG_RESULT_COLR
) {
1499 code
->alu
.inst
[pos
].inst3
|=
1500 (hwdest
<< R300_ALU_DSTA_SHIFT
) |
1501 R300_ALU_DSTA_OUTPUT
;
1502 } else if (REG_GET_INDEX(dest
) ==
1504 code
->alu
.inst
[pos
].inst3
|=
1505 R300_ALU_DSTA_DEPTH
;
1509 code
->alu
.inst
[pos
].inst3
|=
1510 (hwdest
<< R300_ALU_DSTA_SHIFT
) |
1513 cs
->hwtemps
[hwdest
].scalar_valid
= pos
+ 1;
1522 static GLuint
get_attrib(struct r300_fragment_program
*fp
, GLuint attr
)
1524 struct gl_fragment_program
*mp
= &fp
->mesa_program
;
1527 if (!(mp
->Base
.InputsRead
& (1 << attr
))) {
1528 ERROR("Attribute %d was not provided!\n", attr
);
1532 REG_SET_TYPE(r
, REG_TYPE_INPUT
);
1533 REG_SET_INDEX(r
, attr
);
1534 REG_SET_VALID(r
, GL_TRUE
);
1539 static GLfloat SinCosConsts
[2][4] = {
1541 1.273239545, // 4/PI
1542 -0.405284735, // -4/(PI*PI)
1549 0.159154943, // 1/(2*PI)
1555 * Emit a LIT instruction.
1556 * \p flags may be PFS_FLAG_SAT
1558 * Definition of LIT (from ARB_fragment_program):
1559 * tmp = VectorLoad(op0);
1560 * if (tmp.x < 0) tmp.x = 0;
1561 * if (tmp.y < 0) tmp.y = 0;
1562 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
1563 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
1566 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
1569 * The longest path of computation is the one leading to result.z,
1570 * consisting of 5 operations. This implementation of LIT takes
1571 * 5 slots. So unless there's some special undocumented opcode,
1572 * this implementation is potentially optimal. Unfortunately,
1573 * emit_arith is a bit too conservative because it doesn't understand
1574 * partial writes to the vector component.
1576 static const GLfloat LitConst
[4] =
1577 { 127.999999, 127.999999, 127.999999, -127.999999 };
1579 static void emit_lit(struct r300_pfs_compile_state
*cs
,
1580 GLuint dest
, int mask
, GLuint src
, int flags
)
1587 cnst
= emit_const4fv(cs
, LitConst
);
1590 if ((mask
& WRITEMASK_XYZW
) != WRITEMASK_XYZW
) {
1592 } else if (REG_GET_TYPE(dest
) == REG_TYPE_OUTPUT
) {
1593 // LIT is typically followed by DP3/DP4, so there's no point
1594 // in creating special code for this case
1598 if (needTemporary
) {
1599 temp
= keep(get_temp_reg(cs
));
1604 // Note: The order of emit_arith inside the slots is relevant,
1605 // because emit_arith only looks at scalar vs. vector when resolving
1606 // dependencies, and it does not consider individual vector components,
1607 // so swizzling between the two parts can create fake dependencies.
1610 emit_arith(cs
, PFS_OP_MAX
, temp
, WRITEMASK_XY
,
1611 keep(src
), pfs_zero
, undef
, 0);
1612 emit_arith(cs
, PFS_OP_MAX
, temp
, WRITEMASK_W
, src
, cnst
, undef
, 0);
1615 emit_arith(cs
, PFS_OP_MIN
, temp
, WRITEMASK_Z
,
1616 swizzle(temp
, W
, W
, W
, W
), cnst
, undef
, 0);
1617 emit_arith(cs
, PFS_OP_LG2
, temp
, WRITEMASK_W
,
1618 swizzle(temp
, Y
, Y
, Y
, Y
), undef
, undef
, 0);
1621 // If desired, we saturate the y result here.
1622 // This does not affect the use as a condition variable in the CMP later
1623 emit_arith(cs
, PFS_OP_MAD
, temp
, WRITEMASK_W
,
1624 temp
, swizzle(temp
, Z
, Z
, Z
, Z
), pfs_zero
, 0);
1625 emit_arith(cs
, PFS_OP_MAD
, temp
, WRITEMASK_Y
,
1626 swizzle(temp
, X
, X
, X
, X
), pfs_one
, pfs_zero
, flags
);
1629 emit_arith(cs
, PFS_OP_MAD
, temp
, WRITEMASK_X
,
1630 pfs_one
, pfs_one
, pfs_zero
, 0);
1631 emit_arith(cs
, PFS_OP_EX2
, temp
, WRITEMASK_W
, temp
, undef
, undef
, 0);
1634 emit_arith(cs
, PFS_OP_CMP
, temp
, WRITEMASK_Z
,
1635 pfs_zero
, swizzle(temp
, W
, W
, W
, W
),
1636 negate(swizzle(temp
, Y
, Y
, Y
, Y
)), flags
);
1637 emit_arith(cs
, PFS_OP_MAD
, temp
, WRITEMASK_W
, pfs_one
, pfs_one
,
1640 if (needTemporary
) {
1641 emit_arith(cs
, PFS_OP_MAD
, dest
, mask
,
1642 temp
, pfs_one
, pfs_zero
, flags
);
1643 free_temp(cs
, temp
);
1645 // Decrease refcount of the destination
1646 t_hw_dst(cs
, dest
, GL_FALSE
, cs
->nrslots
);
1650 static GLboolean
parse_program(struct r300_pfs_compile_state
*cs
)
1653 struct gl_fragment_program
*mp
= &fp
->mesa_program
;
1654 const struct prog_instruction
*inst
= mp
->Base
.Instructions
;
1655 struct prog_instruction
*fpi
;
1656 GLuint src
[3], dest
, temp
[2];
1657 int flags
, mask
= 0;
1660 if (!inst
|| inst
[0].Opcode
== OPCODE_END
) {
1661 ERROR("empty program?\n");
1665 for (fpi
= mp
->Base
.Instructions
; fpi
->Opcode
!= OPCODE_END
; fpi
++) {
1666 if (fpi
->SaturateMode
== SATURATE_ZERO_ONE
)
1667 flags
= PFS_FLAG_SAT
;
1671 if (fpi
->Opcode
!= OPCODE_KIL
) {
1672 dest
= t_dst(cs
, fpi
->DstReg
);
1673 mask
= fpi
->DstReg
.WriteMask
;
1676 switch (fpi
->Opcode
) {
1678 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1679 emit_arith(cs
, PFS_OP_MAD
, dest
, mask
,
1680 absolute(src
[0]), pfs_one
, pfs_zero
, flags
);
1683 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1684 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1685 emit_arith(cs
, PFS_OP_MAD
, dest
, mask
,
1686 src
[0], pfs_one
, src
[1], flags
);
1689 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1690 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1691 src
[2] = t_src(cs
, fpi
->SrcReg
[2]);
1692 /* ARB_f_p - if src0.c < 0.0 ? src1.c : src2.c
1693 * r300 - if src2.c < 0.0 ? src1.c : src0.c
1695 emit_arith(cs
, PFS_OP_CMP
, dest
, mask
,
1696 src
[2], src
[1], src
[0], flags
);
1700 * cos using a parabola (see SIN):
1702 * x = (x/(2*PI))+0.75
1707 temp
[0] = get_temp_reg(cs
);
1708 const_sin
[0] = emit_const4fv(cs
, SinCosConsts
[0]);
1709 const_sin
[1] = emit_const4fv(cs
, SinCosConsts
[1]);
1710 src
[0] = t_scalar_src(cs
, fpi
->SrcReg
[0]);
1712 /* add 0.5*PI and do range reduction */
1714 emit_arith(cs
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1715 swizzle(src
[0], X
, X
, X
, X
),
1716 swizzle(const_sin
[1], Z
, Z
, Z
, Z
),
1717 swizzle(const_sin
[1], X
, X
, X
, X
), 0);
1719 emit_arith(cs
, PFS_OP_FRC
, temp
[0], WRITEMASK_X
,
1720 swizzle(temp
[0], X
, X
, X
, X
),
1723 emit_arith(cs
, PFS_OP_MAD
, temp
[0], WRITEMASK_Z
, swizzle(temp
[0], X
, X
, X
, X
), swizzle(const_sin
[1], W
, W
, W
, W
), //2*PI
1724 negate(swizzle(const_sin
[0], Z
, Z
, Z
, Z
)), //-PI
1729 emit_arith(cs
, PFS_OP_MAD
, temp
[0],
1730 WRITEMASK_X
| WRITEMASK_Y
, swizzle(temp
[0],
1733 const_sin
[0], pfs_zero
, 0);
1735 emit_arith(cs
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1736 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1737 absolute(swizzle(temp
[0], Z
, Z
, Z
, Z
)),
1738 swizzle(temp
[0], X
, X
, X
, X
), 0);
1740 emit_arith(cs
, PFS_OP_MAD
, temp
[0], WRITEMASK_Y
,
1741 swizzle(temp
[0], X
, X
, X
, X
),
1742 absolute(swizzle(temp
[0], X
, X
, X
, X
)),
1743 negate(swizzle(temp
[0], X
, X
, X
, X
)), 0);
1745 emit_arith(cs
, PFS_OP_MAD
, dest
, mask
,
1746 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1747 swizzle(const_sin
[0], W
, W
, W
, W
),
1748 swizzle(temp
[0], X
, X
, X
, X
), flags
);
1750 free_temp(cs
, temp
[0]);
1753 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1754 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1755 emit_arith(cs
, PFS_OP_DP3
, dest
, mask
,
1756 src
[0], src
[1], undef
, flags
);
1759 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1760 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1761 emit_arith(cs
, PFS_OP_DP4
, dest
, mask
,
1762 src
[0], src
[1], undef
, flags
);
1765 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1766 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1767 /* src0.xyz1 -> temp
1768 * DP4 dest, temp, src1
1770 emit_arith(cs
, PFS_OP_DP4
, dest
, mask
,
1771 swizzle(src
[0], X
, Y
, Z
, ONE
), src
[1],
1775 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1776 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1777 /* dest.y = src0.y * src1.y */
1778 if (mask
& WRITEMASK_Y
)
1779 emit_arith(cs
, PFS_OP_MAD
, dest
, WRITEMASK_Y
,
1780 keep(src
[0]), keep(src
[1]),
1782 /* dest.z = src0.z */
1783 if (mask
& WRITEMASK_Z
)
1784 emit_arith(cs
, PFS_OP_MAD
, dest
, WRITEMASK_Z
,
1785 src
[0], pfs_one
, pfs_zero
, flags
);
1787 * result.w = src1.w */
1788 if (mask
& WRITEMASK_XW
) {
1789 REG_SET_VSWZ(src
[1], SWIZZLE_111
); /*Cheat */
1790 emit_arith(cs
, PFS_OP_MAD
, dest
,
1791 mask
& WRITEMASK_XW
,
1792 src
[1], pfs_one
, pfs_zero
, flags
);
1796 src
[0] = t_scalar_src(cs
, fpi
->SrcReg
[0]);
1797 emit_arith(cs
, PFS_OP_EX2
, dest
, mask
,
1798 src
[0], undef
, undef
, flags
);
1801 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1802 temp
[0] = get_temp_reg(cs
);
1804 * MAD dest, src0, 1.0, -temp
1806 emit_arith(cs
, PFS_OP_FRC
, temp
[0], mask
,
1807 keep(src
[0]), undef
, undef
, 0);
1808 emit_arith(cs
, PFS_OP_MAD
, dest
, mask
,
1809 src
[0], pfs_one
, negate(temp
[0]), flags
);
1810 free_temp(cs
, temp
[0]);
1813 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1814 emit_arith(cs
, PFS_OP_FRC
, dest
, mask
,
1815 src
[0], undef
, undef
, flags
);
1818 emit_tex(cs
, fpi
, R300_TEX_OP_KIL
);
1821 src
[0] = t_scalar_src(cs
, fpi
->SrcReg
[0]);
1822 emit_arith(cs
, PFS_OP_LG2
, dest
, mask
,
1823 src
[0], undef
, undef
, flags
);
1826 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1827 emit_lit(cs
, dest
, mask
, src
[0], flags
);
1830 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1831 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1832 src
[2] = t_src(cs
, fpi
->SrcReg
[2]);
1833 /* result = tmp0tmp1 + (1 - tmp0)tmp2
1834 * = tmp0tmp1 + tmp2 + (-tmp0)tmp2
1835 * MAD temp, -tmp0, tmp2, tmp2
1836 * MAD result, tmp0, tmp1, temp
1838 temp
[0] = get_temp_reg(cs
);
1839 emit_arith(cs
, PFS_OP_MAD
, temp
[0], mask
,
1840 negate(keep(src
[0])), keep(src
[2]), src
[2],
1842 emit_arith(cs
, PFS_OP_MAD
, dest
, mask
,
1843 src
[0], src
[1], temp
[0], flags
);
1844 free_temp(cs
, temp
[0]);
1847 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1848 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1849 src
[2] = t_src(cs
, fpi
->SrcReg
[2]);
1850 emit_arith(cs
, PFS_OP_MAD
, dest
, mask
,
1851 src
[0], src
[1], src
[2], flags
);
1854 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1855 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1856 emit_arith(cs
, PFS_OP_MAX
, dest
, mask
,
1857 src
[0], src
[1], undef
, flags
);
1860 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1861 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1862 emit_arith(cs
, PFS_OP_MIN
, dest
, mask
,
1863 src
[0], src
[1], undef
, flags
);
1867 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1868 emit_arith(cs
, PFS_OP_MAD
, dest
, mask
,
1869 src
[0], pfs_one
, pfs_zero
, flags
);
1872 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1873 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1874 emit_arith(cs
, PFS_OP_MAD
, dest
, mask
,
1875 src
[0], src
[1], pfs_zero
, flags
);
1878 src
[0] = t_scalar_src(cs
, fpi
->SrcReg
[0]);
1879 src
[1] = t_scalar_src(cs
, fpi
->SrcReg
[1]);
1880 temp
[0] = get_temp_reg(cs
);
1881 emit_arith(cs
, PFS_OP_LG2
, temp
[0], WRITEMASK_W
,
1882 src
[0], undef
, undef
, 0);
1883 emit_arith(cs
, PFS_OP_MAD
, temp
[0], WRITEMASK_W
,
1884 temp
[0], src
[1], pfs_zero
, 0);
1885 emit_arith(cs
, PFS_OP_EX2
, dest
, fpi
->DstReg
.WriteMask
,
1886 temp
[0], undef
, undef
, 0);
1887 free_temp(cs
, temp
[0]);
1890 src
[0] = t_scalar_src(cs
, fpi
->SrcReg
[0]);
1891 emit_arith(cs
, PFS_OP_RCP
, dest
, mask
,
1892 src
[0], undef
, undef
, flags
);
1895 src
[0] = t_scalar_src(cs
, fpi
->SrcReg
[0]);
1896 emit_arith(cs
, PFS_OP_RSQ
, dest
, mask
,
1897 absolute(src
[0]), pfs_zero
, pfs_zero
, flags
);
1901 * scs using a parabola :
1903 * result.x = sin(-abs(x)+0.5*PI) (cos)
1904 * result.y = sin(x) (sin)
1907 temp
[0] = get_temp_reg(cs
);
1908 temp
[1] = get_temp_reg(cs
);
1909 const_sin
[0] = emit_const4fv(cs
, SinCosConsts
[0]);
1910 const_sin
[1] = emit_const4fv(cs
, SinCosConsts
[1]);
1911 src
[0] = t_scalar_src(cs
, fpi
->SrcReg
[0]);
1913 /* x = -abs(x)+0.5*PI */
1914 emit_arith(cs
, PFS_OP_MAD
, temp
[0], WRITEMASK_Z
, swizzle(const_sin
[0], Z
, Z
, Z
, Z
), //PI
1917 (swizzle(keep(src
[0]), X
, X
, X
, X
))),
1921 emit_arith(cs
, PFS_OP_MAD
, temp
[0], WRITEMASK_W
,
1922 swizzle(const_sin
[0], Y
, Y
, Y
, Y
),
1923 swizzle(keep(src
[0]), X
, X
, X
, X
),
1926 /* B*x, C*x (cos) */
1927 emit_arith(cs
, PFS_OP_MAD
, temp
[0],
1928 WRITEMASK_X
| WRITEMASK_Y
, swizzle(temp
[0],
1931 const_sin
[0], pfs_zero
, 0);
1934 emit_arith(cs
, PFS_OP_MAD
, temp
[1], WRITEMASK_W
,
1935 swizzle(const_sin
[0], X
, X
, X
, X
),
1936 keep(src
[0]), pfs_zero
, 0);
1938 /* y = B*x + C*x*abs(x) (sin) */
1939 emit_arith(cs
, PFS_OP_MAD
, temp
[1], WRITEMASK_Z
,
1941 swizzle(temp
[0], W
, W
, W
, W
),
1942 swizzle(temp
[1], W
, W
, W
, W
), 0);
1944 /* y = B*x + C*x*abs(x) (cos) */
1945 emit_arith(cs
, PFS_OP_MAD
, temp
[1], WRITEMASK_W
,
1946 swizzle(temp
[0], Y
, Y
, Y
, Y
),
1947 absolute(swizzle(temp
[0], Z
, Z
, Z
, Z
)),
1948 swizzle(temp
[0], X
, X
, X
, X
), 0);
1950 /* y*abs(y) - y (cos), y*abs(y) - y (sin) */
1951 emit_arith(cs
, PFS_OP_MAD
, temp
[0],
1952 WRITEMASK_X
| WRITEMASK_Y
, swizzle(temp
[1],
1955 absolute(swizzle(temp
[1], W
, Z
, Y
, X
)),
1956 negate(swizzle(temp
[1], W
, Z
, Y
, X
)), 0);
1958 /* dest.xy = mad(temp.xy, P, temp2.wz) */
1959 emit_arith(cs
, PFS_OP_MAD
, dest
,
1960 mask
& (WRITEMASK_X
| WRITEMASK_Y
), temp
[0],
1961 swizzle(const_sin
[0], W
, W
, W
, W
),
1962 swizzle(temp
[1], W
, Z
, Y
, X
), flags
);
1964 free_temp(cs
, temp
[0]);
1965 free_temp(cs
, temp
[1]);
1968 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
1969 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
1970 temp
[0] = get_temp_reg(cs
);
1971 /* temp = src0 - src1
1972 * dest.c = (temp.c < 0.0) ? 0 : 1
1974 emit_arith(cs
, PFS_OP_MAD
, temp
[0], mask
,
1975 src
[0], pfs_one
, negate(src
[1]), 0);
1976 emit_arith(cs
, PFS_OP_CMP
, dest
, mask
,
1977 pfs_one
, pfs_zero
, temp
[0], 0);
1978 free_temp(cs
, temp
[0]);
1983 * sin(x) = 4/pi * x + -4/(pi*pi) * x * abs(x)
1984 * extra precision is obtained by weighting against
1988 temp
[0] = get_temp_reg(cs
);
1989 const_sin
[0] = emit_const4fv(cs
, SinCosConsts
[0]);
1990 const_sin
[1] = emit_const4fv(cs
, SinCosConsts
[1]);
1991 src
[0] = t_scalar_src(cs
, fpi
->SrcReg
[0]);
1993 /* do range reduction */
1995 emit_arith(cs
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
1996 swizzle(keep(src
[0]), X
, X
, X
, X
),
1997 swizzle(const_sin
[1], Z
, Z
, Z
, Z
),
2000 emit_arith(cs
, PFS_OP_FRC
, temp
[0], WRITEMASK_X
,
2001 swizzle(temp
[0], X
, X
, X
, X
),
2004 emit_arith(cs
, PFS_OP_MAD
, temp
[0], WRITEMASK_Z
, swizzle(temp
[0], X
, X
, X
, X
), swizzle(const_sin
[1], W
, W
, W
, W
), //2*PI
2005 negate(swizzle(const_sin
[0], Z
, Z
, Z
, Z
)), //PI
2010 emit_arith(cs
, PFS_OP_MAD
, temp
[0],
2011 WRITEMASK_X
| WRITEMASK_Y
, swizzle(temp
[0],
2014 const_sin
[0], pfs_zero
, 0);
2016 emit_arith(cs
, PFS_OP_MAD
, temp
[0], WRITEMASK_X
,
2017 swizzle(temp
[0], Y
, Y
, Y
, Y
),
2018 absolute(swizzle(temp
[0], Z
, Z
, Z
, Z
)),
2019 swizzle(temp
[0], X
, X
, X
, X
), 0);
2021 emit_arith(cs
, PFS_OP_MAD
, temp
[0], WRITEMASK_Y
,
2022 swizzle(temp
[0], X
, X
, X
, X
),
2023 absolute(swizzle(temp
[0], X
, X
, X
, X
)),
2024 negate(swizzle(temp
[0], X
, X
, X
, X
)), 0);
2026 emit_arith(cs
, PFS_OP_MAD
, dest
, mask
,
2027 swizzle(temp
[0], Y
, Y
, Y
, Y
),
2028 swizzle(const_sin
[0], W
, W
, W
, W
),
2029 swizzle(temp
[0], X
, X
, X
, X
), flags
);
2031 free_temp(cs
, temp
[0]);
2034 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
2035 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
2036 temp
[0] = get_temp_reg(cs
);
2037 /* temp = src0 - src1
2038 * dest.c = (temp.c < 0.0) ? 1 : 0
2040 emit_arith(cs
, PFS_OP_MAD
, temp
[0], mask
,
2041 src
[0], pfs_one
, negate(src
[1]), 0);
2042 emit_arith(cs
, PFS_OP_CMP
, dest
, mask
,
2043 pfs_zero
, pfs_one
, temp
[0], 0);
2044 free_temp(cs
, temp
[0]);
2047 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
2048 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
2049 emit_arith(cs
, PFS_OP_MAD
, dest
, mask
,
2050 src
[0], pfs_one
, negate(src
[1]), flags
);
2053 emit_tex(cs
, fpi
, R300_TEX_OP_LD
);
2056 emit_tex(cs
, fpi
, R300_TEX_OP_TXB
);
2059 emit_tex(cs
, fpi
, R300_TEX_OP_TXP
);
2062 src
[0] = t_src(cs
, fpi
->SrcReg
[0]);
2063 src
[1] = t_src(cs
, fpi
->SrcReg
[1]);
2064 temp
[0] = get_temp_reg(cs
);
2065 /* temp = src0.zxy * src1.yzx */
2066 emit_arith(cs
, PFS_OP_MAD
, temp
[0],
2067 WRITEMASK_XYZ
, swizzle(keep(src
[0]),
2069 swizzle(keep(src
[1]), Y
, Z
, X
, W
),
2071 /* dest.xyz = src0.yzx * src1.zxy - temp
2072 * dest.w = undefined
2074 emit_arith(cs
, PFS_OP_MAD
, dest
,
2075 mask
& WRITEMASK_XYZ
, swizzle(src
[0],
2078 swizzle(src
[1], Z
, X
, Y
, W
),
2079 negate(temp
[0]), flags
);
2081 free_temp(cs
, temp
[0]);
2085 ERROR("unknown fpi->Opcode %d\n", fpi
->Opcode
);
2097 static void insert_wpos(struct gl_program
*prog
)
2099 static gl_state_index tokens
[STATE_LENGTH
] = {
2100 STATE_INTERNAL
, STATE_R300_WINDOW_DIMENSION
, 0, 0, 0
2102 struct prog_instruction
*fpi
;
2103 GLuint window_index
;
2105 GLuint tempregi
= prog
->NumTemporaries
;
2106 /* should do something else if no temps left... */
2107 prog
->NumTemporaries
++;
2109 fpi
= _mesa_alloc_instructions(prog
->NumInstructions
+ 3);
2110 _mesa_init_instructions(fpi
, prog
->NumInstructions
+ 3);
2112 /* perspective divide */
2113 fpi
[i
].Opcode
= OPCODE_RCP
;
2115 fpi
[i
].DstReg
.File
= PROGRAM_TEMPORARY
;
2116 fpi
[i
].DstReg
.Index
= tempregi
;
2117 fpi
[i
].DstReg
.WriteMask
= WRITEMASK_W
;
2118 fpi
[i
].DstReg
.CondMask
= COND_TR
;
2120 fpi
[i
].SrcReg
[0].File
= PROGRAM_INPUT
;
2121 fpi
[i
].SrcReg
[0].Index
= FRAG_ATTRIB_WPOS
;
2122 fpi
[i
].SrcReg
[0].Swizzle
= SWIZZLE_WWWW
;
2125 fpi
[i
].Opcode
= OPCODE_MUL
;
2127 fpi
[i
].DstReg
.File
= PROGRAM_TEMPORARY
;
2128 fpi
[i
].DstReg
.Index
= tempregi
;
2129 fpi
[i
].DstReg
.WriteMask
= WRITEMASK_XYZ
;
2130 fpi
[i
].DstReg
.CondMask
= COND_TR
;
2132 fpi
[i
].SrcReg
[0].File
= PROGRAM_INPUT
;
2133 fpi
[i
].SrcReg
[0].Index
= FRAG_ATTRIB_WPOS
;
2134 fpi
[i
].SrcReg
[0].Swizzle
= SWIZZLE_XYZW
;
2136 fpi
[i
].SrcReg
[1].File
= PROGRAM_TEMPORARY
;
2137 fpi
[i
].SrcReg
[1].Index
= tempregi
;
2138 fpi
[i
].SrcReg
[1].Swizzle
= SWIZZLE_WWWW
;
2141 /* viewport transformation */
2142 window_index
= _mesa_add_state_reference(prog
->Parameters
, tokens
);
2144 fpi
[i
].Opcode
= OPCODE_MAD
;
2146 fpi
[i
].DstReg
.File
= PROGRAM_TEMPORARY
;
2147 fpi
[i
].DstReg
.Index
= tempregi
;
2148 fpi
[i
].DstReg
.WriteMask
= WRITEMASK_XYZ
;
2149 fpi
[i
].DstReg
.CondMask
= COND_TR
;
2151 fpi
[i
].SrcReg
[0].File
= PROGRAM_TEMPORARY
;
2152 fpi
[i
].SrcReg
[0].Index
= tempregi
;
2153 fpi
[i
].SrcReg
[0].Swizzle
=
2154 MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ZERO
);
2156 fpi
[i
].SrcReg
[1].File
= PROGRAM_STATE_VAR
;
2157 fpi
[i
].SrcReg
[1].Index
= window_index
;
2158 fpi
[i
].SrcReg
[1].Swizzle
=
2159 MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ZERO
);
2161 fpi
[i
].SrcReg
[2].File
= PROGRAM_STATE_VAR
;
2162 fpi
[i
].SrcReg
[2].Index
= window_index
;
2163 fpi
[i
].SrcReg
[2].Swizzle
=
2164 MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ZERO
);
2167 _mesa_copy_instructions(&fpi
[i
], prog
->Instructions
,
2168 prog
->NumInstructions
);
2170 free(prog
->Instructions
);
2172 prog
->Instructions
= fpi
;
2174 prog
->NumInstructions
+= i
;
2175 fpi
= &prog
->Instructions
[prog
->NumInstructions
- 1];
2177 assert(fpi
->Opcode
== OPCODE_END
);
2179 for (fpi
= &prog
->Instructions
[3]; fpi
->Opcode
!= OPCODE_END
; fpi
++) {
2180 for (i
= 0; i
< 3; i
++)
2181 if (fpi
->SrcReg
[i
].File
== PROGRAM_INPUT
&&
2182 fpi
->SrcReg
[i
].Index
== FRAG_ATTRIB_WPOS
) {
2183 fpi
->SrcReg
[i
].File
= PROGRAM_TEMPORARY
;
2184 fpi
->SrcReg
[i
].Index
= tempregi
;
2189 /* - Init structures
2190 * - Determine what hwregs each input corresponds to
2192 static void init_program(struct r300_pfs_compile_state
*cs
)
2195 struct gl_fragment_program
*mp
= &fp
->mesa_program
;
2196 struct prog_instruction
*fpi
;
2197 GLuint InputsRead
= mp
->Base
.InputsRead
;
2198 GLuint temps_used
= 0; /* for fp->temps[] */
2201 /* New compile, reset tracking data */
2203 driQueryOptioni(&cs
->r300
->radeon
.optionCache
, "fp_optimization");
2204 fp
->translated
= GL_FALSE
;
2205 fp
->error
= GL_FALSE
;
2206 fp
->WritesDepth
= GL_FALSE
;
2207 code
->tex
.length
= 0;
2209 code
->first_node_has_tex
= 0;
2211 code
->max_temp_idx
= 0;
2212 code
->node
[0].alu_end
= -1;
2213 code
->node
[0].tex_end
= -1;
2215 for (i
= 0; i
< PFS_MAX_ALU_INST
; i
++) {
2216 for (j
= 0; j
< 3; j
++) {
2217 cs
->slot
[i
].vsrc
[j
] = SRC_CONST
;
2218 cs
->slot
[i
].ssrc
[j
] = SRC_CONST
;
2222 /* Work out what temps the Mesa inputs correspond to, this must match
2223 * what setup_rs_unit does, which shouldn't be a problem as rs_unit
2224 * configures itself based on the fragprog's InputsRead
2226 * NOTE: this depends on get_hw_temp() allocating registers in order,
2227 * starting from register 0.
2230 /* Texcoords come first */
2231 for (i
= 0; i
< cs
->r300
->radeon
.glCtx
->Const
.MaxTextureUnits
; i
++) {
2232 if (InputsRead
& (FRAG_BIT_TEX0
<< i
)) {
2233 cs
->inputs
[FRAG_ATTRIB_TEX0
+ i
].refcount
= 0;
2234 cs
->inputs
[FRAG_ATTRIB_TEX0
+ i
].reg
=
2238 InputsRead
&= ~FRAG_BITS_TEX_ANY
;
2240 /* fragment position treated as a texcoord */
2241 if (InputsRead
& FRAG_BIT_WPOS
) {
2242 cs
->inputs
[FRAG_ATTRIB_WPOS
].refcount
= 0;
2243 cs
->inputs
[FRAG_ATTRIB_WPOS
].reg
= get_hw_temp(cs
, 0);
2244 insert_wpos(&mp
->Base
);
2246 InputsRead
&= ~FRAG_BIT_WPOS
;
2248 /* Then primary colour */
2249 if (InputsRead
& FRAG_BIT_COL0
) {
2250 cs
->inputs
[FRAG_ATTRIB_COL0
].refcount
= 0;
2251 cs
->inputs
[FRAG_ATTRIB_COL0
].reg
= get_hw_temp(cs
, 0);
2253 InputsRead
&= ~FRAG_BIT_COL0
;
2255 /* Secondary color */
2256 if (InputsRead
& FRAG_BIT_COL1
) {
2257 cs
->inputs
[FRAG_ATTRIB_COL1
].refcount
= 0;
2258 cs
->inputs
[FRAG_ATTRIB_COL1
].reg
= get_hw_temp(cs
, 0);
2260 InputsRead
&= ~FRAG_BIT_COL1
;
2264 WARN_ONCE("Don't know how to handle inputs 0x%x\n", InputsRead
);
2265 /* force read from hwreg 0 for now */
2266 for (i
= 0; i
< 32; i
++)
2267 if (InputsRead
& (1 << i
))
2268 cs
->inputs
[i
].reg
= 0;
2271 /* Pre-parse the mesa program, grabbing refcounts on input/temp regs.
2272 * That way, we can free up the reg when it's no longer needed
2274 if (!mp
->Base
.Instructions
) {
2275 ERROR("No instructions found in program\n");
2279 for (fpi
= mp
->Base
.Instructions
; fpi
->Opcode
!= OPCODE_END
; fpi
++) {
2282 for (i
= 0; i
< 3; i
++) {
2283 idx
= fpi
->SrcReg
[i
].Index
;
2284 switch (fpi
->SrcReg
[i
].File
) {
2285 case PROGRAM_TEMPORARY
:
2286 if (!(temps_used
& (1 << idx
))) {
2287 cs
->temps
[idx
].reg
= -1;
2288 cs
->temps
[idx
].refcount
= 1;
2289 temps_used
|= (1 << idx
);
2291 cs
->temps
[idx
].refcount
++;
2294 cs
->inputs
[idx
].refcount
++;
2301 idx
= fpi
->DstReg
.Index
;
2302 if (fpi
->DstReg
.File
== PROGRAM_TEMPORARY
) {
2303 if (!(temps_used
& (1 << idx
))) {
2304 cs
->temps
[idx
].reg
= -1;
2305 cs
->temps
[idx
].refcount
= 1;
2306 temps_used
|= (1 << idx
);
2308 cs
->temps
[idx
].refcount
++;
2311 cs
->temp_in_use
= temps_used
;
2314 static void update_params(r300ContextPtr r300
, struct r300_fragment_program
*fp
)
2316 struct gl_fragment_program
*mp
= &fp
->mesa_program
;
2318 /* Ask Mesa nicely to fill in ParameterValues for us */
2319 if (mp
->Base
.Parameters
)
2320 _mesa_load_state_parameters(r300
->radeon
.glCtx
, mp
->Base
.Parameters
);
2323 void r300TranslateFragmentShader(r300ContextPtr r300
,
2324 struct r300_fragment_program
*fp
)
2326 if (!fp
->translated
) {
2327 struct r300_pfs_compile_state cs
;
2329 _mesa_memset(&cs
, 0, sizeof(cs
));
2334 if (parse_program(&cs
) == GL_FALSE
) {
2335 dump_program(fp
, &fp
->code
);
2340 fp
->code
.node
[fp
->code
.cur_node
].alu_end
=
2341 cs
.nrslots
- fp
->code
.node
[fp
->code
.cur_node
].alu_offset
- 1;
2342 if (fp
->code
.node
[fp
->code
.cur_node
].tex_end
< 0)
2343 fp
->code
.node
[fp
->code
.cur_node
].tex_end
= 0;
2344 fp
->code
.alu_offset
= 0;
2345 fp
->code
.alu_end
= cs
.nrslots
- 1;
2346 fp
->code
.tex_offset
= 0;
2347 fp
->code
.tex_end
= fp
->code
.tex
.length
? fp
->code
.tex
.length
- 1 : 0;
2348 assert(fp
->code
.node
[fp
->code
.cur_node
].alu_end
>= 0);
2349 assert(fp
->code
.alu_end
>= 0);
2351 fp
->translated
= GL_TRUE
;
2352 if (RADEON_DEBUG
& DEBUG_PIXEL
)
2353 dump_program(fp
, &fp
->code
);
2354 r300UpdateStateParameters(r300
->radeon
.glCtx
, _NEW_PROGRAM
);
2357 update_params(r300
, fp
);
2360 /* just some random things... */
2361 static void dump_program(struct r300_fragment_program
*fp
,
2362 struct r300_fragment_program_code
*code
)
2367 fprintf(stderr
, "pc=%d*************************************\n", pc
++);
2369 fprintf(stderr
, "Mesa program:\n");
2370 fprintf(stderr
, "-------------\n");
2371 _mesa_print_program(&fp
->mesa_program
.Base
);
2374 fprintf(stderr
, "Hardware program\n");
2375 fprintf(stderr
, "----------------\n");
2377 for (n
= 0; n
< (code
->cur_node
+ 1); n
++) {
2378 fprintf(stderr
, "NODE %d: alu_offset: %d, tex_offset: %d, "
2379 "alu_end: %d, tex_end: %d\n", n
,
2380 code
->node
[n
].alu_offset
,
2381 code
->node
[n
].tex_offset
,
2382 code
->node
[n
].alu_end
, code
->node
[n
].tex_end
);
2384 if (code
->tex
.length
) {
2385 fprintf(stderr
, " TEX:\n");
2386 for (i
= code
->node
[n
].tex_offset
;
2387 i
<= code
->node
[n
].tex_offset
+ code
->node
[n
].tex_end
;
2392 inst
[i
] >> R300_TEX_INST_SHIFT
) &
2394 case R300_TEX_OP_LD
:
2397 case R300_TEX_OP_KIL
:
2400 case R300_TEX_OP_TXP
:
2403 case R300_TEX_OP_TXB
:
2411 " %s t%i, %c%i, texture[%i] (%08x)\n",
2414 inst
[i
] >> R300_DST_ADDR_SHIFT
) & 31,
2417 inst
[i
] >> R300_SRC_ADDR_SHIFT
) & 31,
2419 inst
[i
] & R300_TEX_ID_MASK
) >>
2425 for (i
= code
->node
[n
].alu_offset
;
2426 i
<= code
->node
[n
].alu_offset
+ code
->node
[n
].alu_end
; ++i
) {
2427 char srcc
[3][10], dstc
[20];
2428 char srca
[3][10], dsta
[20];
2431 char flags
[5], tmp
[10];
2433 for (j
= 0; j
< 3; ++j
) {
2434 int regc
= code
->alu
.inst
[i
].inst1
>> (j
* 6);
2435 int rega
= code
->alu
.inst
[i
].inst3
>> (j
* 6);
2437 sprintf(srcc
[j
], "%c%i",
2438 (regc
& 32) ? 'c' : 't', regc
& 31);
2439 sprintf(srca
[j
], "%c%i",
2440 (rega
& 32) ? 'c' : 't', rega
& 31);
2444 sprintf(flags
, "%s%s%s",
2446 inst1
& R300_ALU_DSTC_REG_X
) ? "x" : "",
2448 inst1
& R300_ALU_DSTC_REG_Y
) ? "y" : "",
2450 inst1
& R300_ALU_DSTC_REG_Z
) ? "z" : "");
2451 if (flags
[0] != 0) {
2452 sprintf(dstc
, "t%i.%s ",
2454 inst1
>> R300_ALU_DSTC_SHIFT
) & 31,
2457 sprintf(flags
, "%s%s%s",
2459 inst1
& R300_ALU_DSTC_OUTPUT_X
) ? "x" : "",
2461 inst1
& R300_ALU_DSTC_OUTPUT_Y
) ? "y" : "",
2463 inst1
& R300_ALU_DSTC_OUTPUT_Z
) ? "z" : "");
2464 if (flags
[0] != 0) {
2465 sprintf(tmp
, "o%i.%s",
2467 inst1
>> R300_ALU_DSTC_SHIFT
) & 31,
2473 if (code
->alu
.inst
[i
].inst3
& R300_ALU_DSTA_REG
) {
2474 sprintf(dsta
, "t%i.w ",
2476 inst3
>> R300_ALU_DSTA_SHIFT
) & 31);
2478 if (code
->alu
.inst
[i
].inst3
& R300_ALU_DSTA_OUTPUT
) {
2479 sprintf(tmp
, "o%i.w ",
2481 inst3
>> R300_ALU_DSTA_SHIFT
) & 31);
2484 if (code
->alu
.inst
[i
].inst3
& R300_ALU_DSTA_DEPTH
) {
2489 "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
2490 " w: %3s %3s %3s -> %-20s (%08x)\n", i
,
2491 srcc
[0], srcc
[1], srcc
[2], dstc
,
2492 code
->alu
.inst
[i
].inst1
, srca
[0], srca
[1],
2493 srca
[2], dsta
, code
->alu
.inst
[i
].inst3
);
2495 for (j
= 0; j
< 3; ++j
) {
2496 int regc
= code
->alu
.inst
[i
].inst0
>> (j
* 7);
2497 int rega
= code
->alu
.inst
[i
].inst2
>> (j
* 7);
2504 case R300_ALU_ARGC_SRC0C_XYZ
:
2505 sprintf(buf
, "%s.xyz",
2508 case R300_ALU_ARGC_SRC0C_XXX
:
2509 sprintf(buf
, "%s.xxx",
2512 case R300_ALU_ARGC_SRC0C_YYY
:
2513 sprintf(buf
, "%s.yyy",
2516 case R300_ALU_ARGC_SRC0C_ZZZ
:
2517 sprintf(buf
, "%s.zzz",
2521 } else if (d
< 15) {
2522 sprintf(buf
, "%s.www", srca
[d
- 12]);
2523 } else if (d
== 20) {
2524 sprintf(buf
, "0.0");
2525 } else if (d
== 21) {
2526 sprintf(buf
, "1.0");
2527 } else if (d
== 22) {
2528 sprintf(buf
, "0.5");
2529 } else if (d
>= 23 && d
< 32) {
2533 sprintf(buf
, "%s.yzx",
2537 sprintf(buf
, "%s.zxy",
2541 sprintf(buf
, "%s.Wzy",
2546 sprintf(buf
, "%i", d
);
2549 sprintf(argc
[j
], "%s%s%s%s",
2550 (regc
& 32) ? "-" : "",
2551 (regc
& 64) ? "|" : "",
2552 buf
, (regc
& 64) ? "|" : "");
2556 sprintf(buf
, "%s.%c", srcc
[d
/ 3],
2557 'x' + (char)(d
% 3));
2558 } else if (d
< 12) {
2559 sprintf(buf
, "%s.w", srca
[d
- 9]);
2560 } else if (d
== 16) {
2561 sprintf(buf
, "0.0");
2562 } else if (d
== 17) {
2563 sprintf(buf
, "1.0");
2564 } else if (d
== 18) {
2565 sprintf(buf
, "0.5");
2567 sprintf(buf
, "%i", d
);
2570 sprintf(arga
[j
], "%s%s%s%s",
2571 (rega
& 32) ? "-" : "",
2572 (rega
& 64) ? "|" : "",
2573 buf
, (rega
& 64) ? "|" : "");
2576 fprintf(stderr
, " xyz: %8s %8s %8s op: %08x\n"
2577 " w: %8s %8s %8s op: %08x\n",
2578 argc
[0], argc
[1], argc
[2],
2579 code
->alu
.inst
[i
].inst0
, arga
[0], arga
[1],
2580 arga
[2], code
->alu
.inst
[i
].inst2
);