2 * Copyright (C) 2008 Nicolai Haehnle.
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
/**
 * @file
 *
 * Shareable transformations that transform "special" ALU instructions
 * into ALU instructions that are supported by hardware.
 */
36 #include "radeon_program_alu.h"
38 #include "radeon_compiler.h"
39 #include "radeon_compiler_util.h"
42 static struct rc_instruction
*emit1(
43 struct radeon_compiler
* c
, struct rc_instruction
* after
,
44 rc_opcode Opcode
, rc_saturate_mode Saturate
, struct rc_dst_register DstReg
,
45 struct rc_src_register SrcReg
)
47 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
49 fpi
->U
.I
.Opcode
= Opcode
;
50 fpi
->U
.I
.SaturateMode
= Saturate
;
51 fpi
->U
.I
.DstReg
= DstReg
;
52 fpi
->U
.I
.SrcReg
[0] = SrcReg
;
56 static struct rc_instruction
*emit2(
57 struct radeon_compiler
* c
, struct rc_instruction
* after
,
58 rc_opcode Opcode
, rc_saturate_mode Saturate
, struct rc_dst_register DstReg
,
59 struct rc_src_register SrcReg0
, struct rc_src_register SrcReg1
)
61 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
63 fpi
->U
.I
.Opcode
= Opcode
;
64 fpi
->U
.I
.SaturateMode
= Saturate
;
65 fpi
->U
.I
.DstReg
= DstReg
;
66 fpi
->U
.I
.SrcReg
[0] = SrcReg0
;
67 fpi
->U
.I
.SrcReg
[1] = SrcReg1
;
71 static struct rc_instruction
*emit3(
72 struct radeon_compiler
* c
, struct rc_instruction
* after
,
73 rc_opcode Opcode
, rc_saturate_mode Saturate
, struct rc_dst_register DstReg
,
74 struct rc_src_register SrcReg0
, struct rc_src_register SrcReg1
,
75 struct rc_src_register SrcReg2
)
77 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
79 fpi
->U
.I
.Opcode
= Opcode
;
80 fpi
->U
.I
.SaturateMode
= Saturate
;
81 fpi
->U
.I
.DstReg
= DstReg
;
82 fpi
->U
.I
.SrcReg
[0] = SrcReg0
;
83 fpi
->U
.I
.SrcReg
[1] = SrcReg1
;
84 fpi
->U
.I
.SrcReg
[2] = SrcReg2
;
88 static struct rc_dst_register
dstregtmpmask(int index
, int mask
)
90 struct rc_dst_register dst
= {0, 0, 0};
91 dst
.File
= RC_FILE_TEMPORARY
;
97 static const struct rc_src_register builtin_zero
= {
100 .Swizzle
= RC_SWIZZLE_0000
102 static const struct rc_src_register builtin_one
= {
103 .File
= RC_FILE_NONE
,
105 .Swizzle
= RC_SWIZZLE_1111
108 static const struct rc_src_register builtin_half
= {
109 .File
= RC_FILE_NONE
,
111 .Swizzle
= RC_SWIZZLE_HHHH
114 static const struct rc_src_register srcreg_undefined
= {
115 .File
= RC_FILE_NONE
,
117 .Swizzle
= RC_SWIZZLE_XYZW
120 static struct rc_src_register
srcreg(int file
, int index
)
122 struct rc_src_register src
= srcreg_undefined
;
128 static struct rc_src_register
srcregswz(int file
, int index
, int swz
)
130 struct rc_src_register src
= srcreg_undefined
;
137 static struct rc_src_register
absolute(struct rc_src_register reg
)
139 struct rc_src_register newreg
= reg
;
141 newreg
.Negate
= RC_MASK_NONE
;
145 static struct rc_src_register
negate(struct rc_src_register reg
)
147 struct rc_src_register newreg
= reg
;
148 newreg
.Negate
= newreg
.Negate
^ RC_MASK_XYZW
;
152 static struct rc_src_register
swizzle(struct rc_src_register reg
,
153 rc_swizzle x
, rc_swizzle y
, rc_swizzle z
, rc_swizzle w
)
155 struct rc_src_register swizzled
= reg
;
156 swizzled
.Swizzle
= combine_swizzles4(reg
.Swizzle
, x
, y
, z
, w
);
160 static struct rc_src_register
swizzle_smear(struct rc_src_register reg
,
163 return swizzle(reg
, x
, x
, x
, x
);
166 static struct rc_src_register
swizzle_xxxx(struct rc_src_register reg
)
168 return swizzle_smear(reg
, RC_SWIZZLE_X
);
171 static struct rc_src_register
swizzle_yyyy(struct rc_src_register reg
)
173 return swizzle_smear(reg
, RC_SWIZZLE_Y
);
176 static struct rc_src_register
swizzle_zzzz(struct rc_src_register reg
)
178 return swizzle_smear(reg
, RC_SWIZZLE_Z
);
181 static struct rc_src_register
swizzle_wwww(struct rc_src_register reg
)
183 return swizzle_smear(reg
, RC_SWIZZLE_W
);
186 static int is_dst_safe_to_reuse(struct rc_instruction
*inst
)
188 const struct rc_opcode_info
*info
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
191 assert(info
->HasDstReg
);
193 if (inst
->U
.I
.DstReg
.File
!= RC_FILE_TEMPORARY
)
196 for (i
= 0; i
< info
->NumSrcRegs
; i
++) {
197 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
&&
198 inst
->U
.I
.SrcReg
[i
].Index
== inst
->U
.I
.DstReg
.Index
)
205 static struct rc_dst_register
try_to_reuse_dst(struct radeon_compiler
*c
,
206 struct rc_instruction
*inst
)
210 if (is_dst_safe_to_reuse(inst
))
211 tmp
= inst
->U
.I
.DstReg
.Index
;
213 tmp
= rc_find_free_temporary(c
);
215 return dstregtmpmask(tmp
, inst
->U
.I
.DstReg
.WriteMask
);
218 static void transform_ABS(struct radeon_compiler
* c
,
219 struct rc_instruction
* inst
)
221 struct rc_src_register src
= inst
->U
.I
.SrcReg
[0];
223 src
.Negate
= RC_MASK_NONE
;
224 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, src
);
225 rc_remove_instruction(inst
);
228 static void transform_CEIL(struct radeon_compiler
* c
,
229 struct rc_instruction
* inst
)
232 * ceil(x) = -floor(-x)
234 * After inlining floor:
235 * ceil(x) = -(-x-frac(-x))
237 * After simplification:
238 * ceil(x) = x+frac(-x)
241 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
242 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dst
, negate(inst
->U
.I
.SrcReg
[0]));
243 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
244 inst
->U
.I
.SrcReg
[0], srcreg(RC_FILE_TEMPORARY
, dst
.Index
));
245 rc_remove_instruction(inst
);
248 static void transform_CLAMP(struct radeon_compiler
*c
,
249 struct rc_instruction
*inst
)
251 /* CLAMP dst, src, min, max
256 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
257 emit2(c
, inst
->Prev
, RC_OPCODE_MIN
, 0, dst
,
258 inst
->U
.I
.SrcReg
[0], inst
->U
.I
.SrcReg
[2]);
259 emit2(c
, inst
->Prev
, RC_OPCODE_MAX
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
260 srcreg(RC_FILE_TEMPORARY
, dst
.Index
), inst
->U
.I
.SrcReg
[1]);
261 rc_remove_instruction(inst
);
264 static void transform_DP2(struct radeon_compiler
* c
,
265 struct rc_instruction
* inst
)
267 struct rc_src_register src0
= inst
->U
.I
.SrcReg
[0];
268 struct rc_src_register src1
= inst
->U
.I
.SrcReg
[1];
269 src0
.Negate
&= ~(RC_MASK_Z
| RC_MASK_W
);
270 src0
.Swizzle
&= ~(63 << (3 * 2));
271 src0
.Swizzle
|= (RC_SWIZZLE_ZERO
<< (3 * 2)) | (RC_SWIZZLE_ZERO
<< (3 * 3));
272 src1
.Negate
&= ~(RC_MASK_Z
| RC_MASK_W
);
273 src1
.Swizzle
&= ~(63 << (3 * 2));
274 src1
.Swizzle
|= (RC_SWIZZLE_ZERO
<< (3 * 2)) | (RC_SWIZZLE_ZERO
<< (3 * 3));
275 emit2(c
, inst
->Prev
, RC_OPCODE_DP3
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, src0
, src1
);
276 rc_remove_instruction(inst
);
279 static void transform_DPH(struct radeon_compiler
* c
,
280 struct rc_instruction
* inst
)
282 struct rc_src_register src0
= inst
->U
.I
.SrcReg
[0];
283 src0
.Negate
&= ~RC_MASK_W
;
284 src0
.Swizzle
&= ~(7 << (3 * 3));
285 src0
.Swizzle
|= RC_SWIZZLE_ONE
<< (3 * 3);
286 emit2(c
, inst
->Prev
, RC_OPCODE_DP4
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, src0
, inst
->U
.I
.SrcReg
[1]);
287 rc_remove_instruction(inst
);
/**
 * DST computes
 *     [1, src0.y*src1.y, src0.z, src1.w]
 * so it is basically a MUL with a lot of swizzling.
 */
294 static void transform_DST(struct radeon_compiler
* c
,
295 struct rc_instruction
* inst
)
297 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
298 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_ONE
, RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_ONE
),
299 swizzle(inst
->U
.I
.SrcReg
[1], RC_SWIZZLE_ONE
, RC_SWIZZLE_Y
, RC_SWIZZLE_ONE
, RC_SWIZZLE_W
));
300 rc_remove_instruction(inst
);
303 static void transform_FLR(struct radeon_compiler
* c
,
304 struct rc_instruction
* inst
)
306 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
307 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dst
, inst
->U
.I
.SrcReg
[0]);
308 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
309 inst
->U
.I
.SrcReg
[0], negate(srcreg(RC_FILE_TEMPORARY
, dst
.Index
)));
310 rc_remove_instruction(inst
);
314 * Definition of LIT (from ARB_fragment_program):
316 * tmp = VectorLoad(op0);
317 * if (tmp.x < 0) tmp.x = 0;
318 * if (tmp.y < 0) tmp.y = 0;
319 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
320 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
323 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
326 * The longest path of computation is the one leading to result.z,
327 * consisting of 5 operations. This implementation of LIT takes
328 * 5 slots, if the subsequent optimization passes are clever enough
329 * to pair instructions correctly.
331 static void transform_LIT(struct radeon_compiler
* c
,
332 struct rc_instruction
* inst
)
334 unsigned int constant
;
335 unsigned int constant_swizzle
;
337 struct rc_src_register srctemp
;
339 constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
, -127.999999, &constant_swizzle
);
341 if (inst
->U
.I
.DstReg
.WriteMask
!= RC_MASK_XYZW
|| inst
->U
.I
.DstReg
.File
!= RC_FILE_TEMPORARY
) {
342 struct rc_instruction
* inst_mov
;
344 inst_mov
= emit1(c
, inst
,
345 RC_OPCODE_MOV
, 0, inst
->U
.I
.DstReg
,
346 srcreg(RC_FILE_TEMPORARY
, rc_find_free_temporary(c
)));
348 inst
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
349 inst
->U
.I
.DstReg
.Index
= inst_mov
->U
.I
.SrcReg
[0].Index
;
350 inst
->U
.I
.DstReg
.WriteMask
= RC_MASK_XYZW
;
353 temp
= inst
->U
.I
.DstReg
.Index
;
354 srctemp
= srcreg(RC_FILE_TEMPORARY
, temp
);
356 /* tmp.x = max(0.0, Src.x); */
357 /* tmp.y = max(0.0, Src.y); */
358 /* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
359 emit2(c
, inst
->Prev
, RC_OPCODE_MAX
, 0,
360 dstregtmpmask(temp
, RC_MASK_XYW
),
362 swizzle(srcreg(RC_FILE_CONSTANT
, constant
),
363 RC_SWIZZLE_ZERO
, RC_SWIZZLE_ZERO
, RC_SWIZZLE_ZERO
, constant_swizzle
&3));
364 emit2(c
, inst
->Prev
, RC_OPCODE_MIN
, 0,
365 dstregtmpmask(temp
, RC_MASK_Z
),
366 swizzle_wwww(srctemp
),
367 negate(srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
)));
369 /* tmp.w = Pow(tmp.y, tmp.w) */
370 emit1(c
, inst
->Prev
, RC_OPCODE_LG2
, 0,
371 dstregtmpmask(temp
, RC_MASK_W
),
372 swizzle_yyyy(srctemp
));
373 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0,
374 dstregtmpmask(temp
, RC_MASK_W
),
375 swizzle_wwww(srctemp
),
376 swizzle_zzzz(srctemp
));
377 emit1(c
, inst
->Prev
, RC_OPCODE_EX2
, 0,
378 dstregtmpmask(temp
, RC_MASK_W
),
379 swizzle_wwww(srctemp
));
381 /* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
382 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
,
383 dstregtmpmask(temp
, RC_MASK_Z
),
384 negate(swizzle_xxxx(srctemp
)),
385 swizzle_wwww(srctemp
),
388 /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
389 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, inst
->U
.I
.SaturateMode
,
390 dstregtmpmask(temp
, RC_MASK_XYW
),
391 swizzle(srctemp
, RC_SWIZZLE_ONE
, RC_SWIZZLE_X
, RC_SWIZZLE_ONE
, RC_SWIZZLE_ONE
));
393 rc_remove_instruction(inst
);
396 static void transform_LRP(struct radeon_compiler
* c
,
397 struct rc_instruction
* inst
)
399 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
401 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0,
403 inst
->U
.I
.SrcReg
[1], negate(inst
->U
.I
.SrcReg
[2]));
404 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, inst
->U
.I
.SaturateMode
,
406 inst
->U
.I
.SrcReg
[0], srcreg(RC_FILE_TEMPORARY
, dst
.Index
), inst
->U
.I
.SrcReg
[2]);
408 rc_remove_instruction(inst
);
411 static void transform_POW(struct radeon_compiler
* c
,
412 struct rc_instruction
* inst
)
414 struct rc_dst_register tempdst
= try_to_reuse_dst(c
, inst
);
415 struct rc_src_register tempsrc
= srcreg(RC_FILE_TEMPORARY
, tempdst
.Index
);
416 tempdst
.WriteMask
= RC_MASK_W
;
417 tempsrc
.Swizzle
= RC_SWIZZLE_WWWW
;
419 emit1(c
, inst
->Prev
, RC_OPCODE_LG2
, 0, tempdst
, swizzle_xxxx(inst
->U
.I
.SrcReg
[0]));
420 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, tempdst
, tempsrc
, swizzle_xxxx(inst
->U
.I
.SrcReg
[1]));
421 emit1(c
, inst
->Prev
, RC_OPCODE_EX2
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, tempsrc
);
423 rc_remove_instruction(inst
);
426 /* dst = ROUND(src) :
431 * According to the GLSL spec, the implementor can decide which way to round
432 * when the fraction is .5. We round down for .5.
435 static void transform_ROUND(struct radeon_compiler
* c
,
436 struct rc_instruction
* inst
)
438 unsigned int mask
= inst
->U
.I
.DstReg
.WriteMask
;
439 unsigned int frac_index
, add_index
;
440 struct rc_dst_register frac_dst
, add_dst
;
441 struct rc_src_register frac_src
, add_src
;
444 add_index
= rc_find_free_temporary(c
);
445 add_dst
= dstregtmpmask(add_index
, mask
);
446 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, add_dst
, inst
->U
.I
.SrcReg
[0],
448 add_src
= srcreg(RC_FILE_TEMPORARY
, add_dst
.Index
);
451 /* frac = FRC(add) */
452 frac_index
= rc_find_free_temporary(c
);
453 frac_dst
= dstregtmpmask(frac_index
, mask
);
454 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, frac_dst
, add_src
);
455 frac_src
= srcreg(RC_FILE_TEMPORARY
, frac_dst
.Index
);
457 /* dst = add - frac */
458 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, inst
->U
.I
.DstReg
,
459 add_src
, negate(frac_src
));
460 rc_remove_instruction(inst
);
463 static void transform_RSQ(struct radeon_compiler
* c
,
464 struct rc_instruction
* inst
)
466 inst
->U
.I
.SrcReg
[0] = absolute(inst
->U
.I
.SrcReg
[0]);
469 static void transform_SEQ(struct radeon_compiler
* c
,
470 struct rc_instruction
* inst
)
472 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
474 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dst
, inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
475 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
476 negate(absolute(srcreg(RC_FILE_TEMPORARY
, dst
.Index
))), builtin_zero
, builtin_one
);
478 rc_remove_instruction(inst
);
481 static void transform_SFL(struct radeon_compiler
* c
,
482 struct rc_instruction
* inst
)
484 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, builtin_zero
);
485 rc_remove_instruction(inst
);
488 static void transform_SGE(struct radeon_compiler
* c
,
489 struct rc_instruction
* inst
)
491 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
493 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dst
, inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
494 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
495 srcreg(RC_FILE_TEMPORARY
, dst
.Index
), builtin_zero
, builtin_one
);
497 rc_remove_instruction(inst
);
500 static void transform_SGT(struct radeon_compiler
* c
,
501 struct rc_instruction
* inst
)
503 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
505 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dst
, negate(inst
->U
.I
.SrcReg
[0]), inst
->U
.I
.SrcReg
[1]);
506 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
507 srcreg(RC_FILE_TEMPORARY
, dst
.Index
), builtin_one
, builtin_zero
);
509 rc_remove_instruction(inst
);
512 static void transform_SLE(struct radeon_compiler
* c
,
513 struct rc_instruction
* inst
)
515 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
517 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dst
, negate(inst
->U
.I
.SrcReg
[0]), inst
->U
.I
.SrcReg
[1]);
518 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
519 srcreg(RC_FILE_TEMPORARY
, dst
.Index
), builtin_zero
, builtin_one
);
521 rc_remove_instruction(inst
);
524 static void transform_SLT(struct radeon_compiler
* c
,
525 struct rc_instruction
* inst
)
527 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
529 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dst
, inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
530 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
531 srcreg(RC_FILE_TEMPORARY
, dst
.Index
), builtin_one
, builtin_zero
);
533 rc_remove_instruction(inst
);
536 static void transform_SNE(struct radeon_compiler
* c
,
537 struct rc_instruction
* inst
)
539 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
541 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dst
, inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
542 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
543 negate(absolute(srcreg(RC_FILE_TEMPORARY
, dst
.Index
))), builtin_one
, builtin_zero
);
545 rc_remove_instruction(inst
);
548 static void transform_SSG(struct radeon_compiler
* c
,
549 struct rc_instruction
* inst
)
555 * ADD result, tmp0, -tmp1;
557 struct rc_dst_register dst0
;
561 dst0
= try_to_reuse_dst(c
, inst
);
562 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, 0,
564 negate(inst
->U
.I
.SrcReg
[0]),
569 tmp1
= rc_find_free_temporary(c
);
570 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, 0,
571 dstregtmpmask(tmp1
, inst
->U
.I
.DstReg
.WriteMask
),
576 /* Either both are zero, or one of them is one and the other is zero. */
577 /* result = tmp0 - tmp1 */
578 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0,
580 srcreg(RC_FILE_TEMPORARY
, dst0
.Index
),
581 negate(srcreg(RC_FILE_TEMPORARY
, tmp1
)));
583 rc_remove_instruction(inst
);
586 static void transform_SUB(struct radeon_compiler
* c
,
587 struct rc_instruction
* inst
)
589 inst
->U
.I
.Opcode
= RC_OPCODE_ADD
;
590 inst
->U
.I
.SrcReg
[1] = negate(inst
->U
.I
.SrcReg
[1]);
593 static void transform_SWZ(struct radeon_compiler
* c
,
594 struct rc_instruction
* inst
)
596 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
599 static void transform_XPD(struct radeon_compiler
* c
,
600 struct rc_instruction
* inst
)
602 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
604 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dst
,
605 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_W
),
606 swizzle(inst
->U
.I
.SrcReg
[1], RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_W
));
607 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
608 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_W
),
609 swizzle(inst
->U
.I
.SrcReg
[1], RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_W
),
610 negate(srcreg(RC_FILE_TEMPORARY
, dst
.Index
)));
612 rc_remove_instruction(inst
);
617 * Can be used as a transformation for @ref radeonClauseLocalTransform,
618 * no userData necessary.
620 * Eliminates the following ALU instructions:
621 * ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
623 * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
625 * Transforms RSQ to Radeon's native RSQ by explicitly setting
628 * @note should be applicable to R300 and R500 fragment programs.
630 int radeonTransformALU(
631 struct radeon_compiler
* c
,
632 struct rc_instruction
* inst
,
635 switch(inst
->U
.I
.Opcode
) {
636 case RC_OPCODE_ABS
: transform_ABS(c
, inst
); return 1;
637 case RC_OPCODE_CEIL
: transform_CEIL(c
, inst
); return 1;
638 case RC_OPCODE_CLAMP
: transform_CLAMP(c
, inst
); return 1;
639 case RC_OPCODE_DP2
: transform_DP2(c
, inst
); return 1;
640 case RC_OPCODE_DPH
: transform_DPH(c
, inst
); return 1;
641 case RC_OPCODE_DST
: transform_DST(c
, inst
); return 1;
642 case RC_OPCODE_FLR
: transform_FLR(c
, inst
); return 1;
643 case RC_OPCODE_LIT
: transform_LIT(c
, inst
); return 1;
644 case RC_OPCODE_LRP
: transform_LRP(c
, inst
); return 1;
645 case RC_OPCODE_POW
: transform_POW(c
, inst
); return 1;
646 case RC_OPCODE_ROUND
: transform_ROUND(c
, inst
); return 1;
647 case RC_OPCODE_RSQ
: transform_RSQ(c
, inst
); return 1;
648 case RC_OPCODE_SEQ
: transform_SEQ(c
, inst
); return 1;
649 case RC_OPCODE_SFL
: transform_SFL(c
, inst
); return 1;
650 case RC_OPCODE_SGE
: transform_SGE(c
, inst
); return 1;
651 case RC_OPCODE_SGT
: transform_SGT(c
, inst
); return 1;
652 case RC_OPCODE_SLE
: transform_SLE(c
, inst
); return 1;
653 case RC_OPCODE_SLT
: transform_SLT(c
, inst
); return 1;
654 case RC_OPCODE_SNE
: transform_SNE(c
, inst
); return 1;
655 case RC_OPCODE_SSG
: transform_SSG(c
, inst
); return 1;
656 case RC_OPCODE_SUB
: transform_SUB(c
, inst
); return 1;
657 case RC_OPCODE_SWZ
: transform_SWZ(c
, inst
); return 1;
658 case RC_OPCODE_XPD
: transform_XPD(c
, inst
); return 1;
665 static void transform_r300_vertex_ABS(struct radeon_compiler
* c
,
666 struct rc_instruction
* inst
)
668 /* Note: r500 can take absolute values, but r300 cannot. */
669 inst
->U
.I
.Opcode
= RC_OPCODE_MAX
;
670 inst
->U
.I
.SrcReg
[1] = inst
->U
.I
.SrcReg
[0];
671 inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
674 static void transform_r300_vertex_CMP(struct radeon_compiler
* c
,
675 struct rc_instruction
* inst
)
677 /* There is no decent CMP available, so let's rig one up.
678 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
679 * The following sequence consumes zero to two temps and two extra slots
680 * (the second temp and the second slot is consumed by transform_LRP),
681 * but should be equivalent:
683 * SLT tmp0, src0, 0.0
684 * LRP dst, tmp0, src1, src2
686 * Yes, I know, I'm a mad scientist. ~ C. & M. */
687 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
689 /* SLT tmp0, src0, 0.0 */
690 emit2(c
, inst
->Prev
, RC_OPCODE_SLT
, 0,
692 inst
->U
.I
.SrcReg
[0], builtin_zero
);
694 /* LRP dst, tmp0, src1, src2 */
696 emit3(c
, inst
->Prev
, RC_OPCODE_LRP
, 0,
698 srcreg(RC_FILE_TEMPORARY
, dst
.Index
), inst
->U
.I
.SrcReg
[1], inst
->U
.I
.SrcReg
[2]));
700 rc_remove_instruction(inst
);
703 static void transform_r300_vertex_DP2(struct radeon_compiler
* c
,
704 struct rc_instruction
* inst
)
706 struct rc_instruction
*next_inst
= inst
->Next
;
707 transform_DP2(c
, inst
);
708 next_inst
->Prev
->U
.I
.Opcode
= RC_OPCODE_DP4
;
711 static void transform_r300_vertex_DP3(struct radeon_compiler
* c
,
712 struct rc_instruction
* inst
)
714 struct rc_src_register src0
= inst
->U
.I
.SrcReg
[0];
715 struct rc_src_register src1
= inst
->U
.I
.SrcReg
[1];
716 src0
.Negate
&= ~RC_MASK_W
;
717 src0
.Swizzle
&= ~(7 << (3 * 3));
718 src0
.Swizzle
|= RC_SWIZZLE_ZERO
<< (3 * 3);
719 src1
.Negate
&= ~RC_MASK_W
;
720 src1
.Swizzle
&= ~(7 << (3 * 3));
721 src1
.Swizzle
|= RC_SWIZZLE_ZERO
<< (3 * 3);
722 emit2(c
, inst
->Prev
, RC_OPCODE_DP4
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, src0
, src1
);
723 rc_remove_instruction(inst
);
726 static void transform_r300_vertex_fix_LIT(struct radeon_compiler
* c
,
727 struct rc_instruction
* inst
)
729 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
730 unsigned constant_swizzle
;
731 int constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
,
732 0.0000000000000000001,
736 dst
.WriteMask
= RC_MASK_XYZW
;
737 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, 0,
739 inst
->U
.I
.SrcReg
[0]);
741 /* MAX dst.y, src, 0.00...001 */
742 emit2(c
, inst
->Prev
, RC_OPCODE_MAX
, 0,
743 dstregtmpmask(dst
.Index
, RC_MASK_Y
),
744 srcreg(RC_FILE_TEMPORARY
, dst
.Index
),
745 srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
));
747 inst
->U
.I
.SrcReg
[0] = srcreg(RC_FILE_TEMPORARY
, dst
.Index
);
750 static void transform_r300_vertex_SEQ(struct radeon_compiler
*c
,
751 struct rc_instruction
*inst
)
753 /* x = y <==> x >= y && y >= x */
754 int tmp
= rc_find_free_temporary(c
);
757 emit2(c
, inst
->Prev
, RC_OPCODE_SGE
, 0,
758 dstregtmpmask(tmp
, inst
->U
.I
.DstReg
.WriteMask
),
760 inst
->U
.I
.SrcReg
[1]);
763 emit2(c
, inst
->Prev
, RC_OPCODE_SGE
, 0,
766 inst
->U
.I
.SrcReg
[0]);
769 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0,
771 srcreg(RC_FILE_TEMPORARY
, tmp
),
772 srcreg(inst
->U
.I
.DstReg
.File
, inst
->U
.I
.DstReg
.Index
));
774 rc_remove_instruction(inst
);
777 static void transform_r300_vertex_SNE(struct radeon_compiler
*c
,
778 struct rc_instruction
*inst
)
780 /* x != y <==> x < y || y < x */
781 int tmp
= rc_find_free_temporary(c
);
784 emit2(c
, inst
->Prev
, RC_OPCODE_SLT
, 0,
785 dstregtmpmask(tmp
, inst
->U
.I
.DstReg
.WriteMask
),
787 inst
->U
.I
.SrcReg
[1]);
790 emit2(c
, inst
->Prev
, RC_OPCODE_SLT
, 0,
793 inst
->U
.I
.SrcReg
[0]);
795 /* x || y = max(x, y) */
796 emit2(c
, inst
->Prev
, RC_OPCODE_MAX
, 0,
798 srcreg(RC_FILE_TEMPORARY
, tmp
),
799 srcreg(inst
->U
.I
.DstReg
.File
, inst
->U
.I
.DstReg
.Index
));
801 rc_remove_instruction(inst
);
804 static void transform_r300_vertex_SGT(struct radeon_compiler
* c
,
805 struct rc_instruction
* inst
)
807 /* x > y <==> -x < -y */
808 inst
->U
.I
.Opcode
= RC_OPCODE_SLT
;
809 inst
->U
.I
.SrcReg
[0].Negate
^= RC_MASK_XYZW
;
810 inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
813 static void transform_r300_vertex_SLE(struct radeon_compiler
* c
,
814 struct rc_instruction
* inst
)
816 /* x <= y <==> -x >= -y */
817 inst
->U
.I
.Opcode
= RC_OPCODE_SGE
;
818 inst
->U
.I
.SrcReg
[0].Negate
^= RC_MASK_XYZW
;
819 inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
822 static void transform_r300_vertex_SSG(struct radeon_compiler
* c
,
823 struct rc_instruction
* inst
)
829 * ADD result, tmp0, -tmp1;
831 struct rc_dst_register dst0
= try_to_reuse_dst(c
, inst
);
835 dst0
= try_to_reuse_dst(c
, inst
);
836 emit2(c
, inst
->Prev
, RC_OPCODE_SLT
, 0,
839 inst
->U
.I
.SrcReg
[0]);
842 tmp1
= rc_find_free_temporary(c
);
843 emit2(c
, inst
->Prev
, RC_OPCODE_SLT
, 0,
844 dstregtmpmask(tmp1
, inst
->U
.I
.DstReg
.WriteMask
),
848 /* Either both are zero, or one of them is one and the other is zero. */
849 /* result = tmp0 - tmp1 */
850 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0,
852 srcreg(RC_FILE_TEMPORARY
, dst0
.Index
),
853 negate(srcreg(RC_FILE_TEMPORARY
, tmp1
)));
855 rc_remove_instruction(inst
);
859 * For use with rc_local_transform, this transforms non-native ALU
860 * instructions of the r300 up to r500 vertex engine.
862 int r300_transform_vertex_alu(
863 struct radeon_compiler
* c
,
864 struct rc_instruction
* inst
,
867 switch(inst
->U
.I
.Opcode
) {
868 case RC_OPCODE_ABS
: transform_r300_vertex_ABS(c
, inst
); return 1;
869 case RC_OPCODE_CEIL
: transform_CEIL(c
, inst
); return 1;
870 case RC_OPCODE_CLAMP
: transform_CLAMP(c
, inst
); return 1;
871 case RC_OPCODE_CMP
: transform_r300_vertex_CMP(c
, inst
); return 1;
872 case RC_OPCODE_DP2
: transform_r300_vertex_DP2(c
, inst
); return 1;
873 case RC_OPCODE_DP3
: transform_r300_vertex_DP3(c
, inst
); return 1;
874 case RC_OPCODE_DPH
: transform_DPH(c
, inst
); return 1;
875 case RC_OPCODE_FLR
: transform_FLR(c
, inst
); return 1;
876 case RC_OPCODE_LIT
: transform_r300_vertex_fix_LIT(c
, inst
); return 1;
877 case RC_OPCODE_LRP
: transform_LRP(c
, inst
); return 1;
880 transform_r300_vertex_SEQ(c
, inst
);
884 case RC_OPCODE_SFL
: transform_SFL(c
, inst
); return 1;
885 case RC_OPCODE_SGT
: transform_r300_vertex_SGT(c
, inst
); return 1;
886 case RC_OPCODE_SLE
: transform_r300_vertex_SLE(c
, inst
); return 1;
889 transform_r300_vertex_SNE(c
, inst
);
893 case RC_OPCODE_SSG
: transform_r300_vertex_SSG(c
, inst
); return 1;
894 case RC_OPCODE_SUB
: transform_SUB(c
, inst
); return 1;
895 case RC_OPCODE_SWZ
: transform_SWZ(c
, inst
); return 1;
896 case RC_OPCODE_XPD
: transform_XPD(c
, inst
); return 1;
902 static void sincos_constants(struct radeon_compiler
* c
, unsigned int *constants
)
904 static const float SinCosConsts
[2][4] = {
906 1.273239545, /* 4/PI */
907 -0.405284735, /* -4/(PI*PI) */
908 3.141592654, /* PI */
914 0.159154943, /* 1/(2*PI) */
915 6.283185307 /* 2*PI */
920 for(i
= 0; i
< 2; ++i
)
921 constants
[i
] = rc_constants_add_immediate_vec4(&c
->Program
.Constants
, SinCosConsts
[i
]);
925 * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
927 * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
928 * MAD tmp.x, tmp.y, |src|, tmp.x
929 * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
930 * MAD dest, tmp.y, weight, tmp.x
932 static void sin_approx(
933 struct radeon_compiler
* c
, struct rc_instruction
* inst
,
934 struct rc_dst_register dst
, struct rc_src_register src
, const unsigned int* constants
)
936 unsigned int tempreg
= rc_find_free_temporary(c
);
938 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
940 srcreg(RC_FILE_CONSTANT
, constants
[0]));
941 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_X
),
942 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
943 absolute(swizzle_xxxx(src
)),
944 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
945 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_Y
),
946 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
947 absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
))),
948 negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
))));
949 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dst
,
950 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
951 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[0])),
952 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
956 * Translate the trigonometric functions COS, SIN, and SCS
957 * using only the basic instructions
958 * MOV, ADD, MUL, MAD, FRC
960 int r300_transform_trig_simple(struct radeon_compiler
* c
,
961 struct rc_instruction
* inst
,
964 unsigned int constants
[2];
965 unsigned int tempreg
;
967 if (inst
->U
.I
.Opcode
!= RC_OPCODE_COS
&&
968 inst
->U
.I
.Opcode
!= RC_OPCODE_SIN
&&
969 inst
->U
.I
.Opcode
!= RC_OPCODE_SCS
)
972 tempreg
= rc_find_free_temporary(c
);
974 sincos_constants(c
, constants
);
976 if (inst
->U
.I
.Opcode
== RC_OPCODE_COS
) {
977 /* MAD tmp.x, src, 1/(2*PI), 0.75 */
978 /* FRC tmp.x, tmp.x */
979 /* MAD tmp.z, tmp.x, 2*PI, -PI */
980 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
981 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
982 swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[1])),
983 swizzle_xxxx(srcreg(RC_FILE_CONSTANT
, constants
[1])));
984 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
985 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
986 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
987 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
988 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[1])),
989 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[0]))));
991 sin_approx(c
, inst
, inst
->U
.I
.DstReg
,
992 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
994 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_SIN
) {
995 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
996 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
997 swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[1])),
998 swizzle_yyyy(srcreg(RC_FILE_CONSTANT
, constants
[1])));
999 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
1000 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
1001 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
1002 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
1003 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[1])),
1004 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[0]))));
1006 sin_approx(c
, inst
, inst
->U
.I
.DstReg
,
1007 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
1010 struct rc_dst_register dst
;
1012 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
1013 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
1014 swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[1])),
1015 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_W
));
1016 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
1017 srcreg(RC_FILE_TEMPORARY
, tempreg
));
1018 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
1019 srcreg(RC_FILE_TEMPORARY
, tempreg
),
1020 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[1])),
1021 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[0]))));
1023 dst
= inst
->U
.I
.DstReg
;
1025 dst
.WriteMask
= inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_X
;
1026 sin_approx(c
, inst
, dst
,
1027 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
1030 dst
.WriteMask
= inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_Y
;
1031 sin_approx(c
, inst
, dst
,
1032 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
1036 rc_remove_instruction(inst
);
1041 static void r300_transform_SIN_COS_SCS(struct radeon_compiler
*c
,
1042 struct rc_instruction
*inst
,
1045 if (inst
->U
.I
.Opcode
== RC_OPCODE_COS
) {
1046 emit1(c
, inst
->Prev
, RC_OPCODE_COS
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
1047 srcregswz(RC_FILE_TEMPORARY
, srctmp
, RC_SWIZZLE_WWWW
));
1048 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_SIN
) {
1049 emit1(c
, inst
->Prev
, RC_OPCODE_SIN
, inst
->U
.I
.SaturateMode
,
1050 inst
->U
.I
.DstReg
, srcregswz(RC_FILE_TEMPORARY
, srctmp
, RC_SWIZZLE_WWWW
));
1051 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_SCS
) {
1052 struct rc_dst_register moddst
= inst
->U
.I
.DstReg
;
1054 if (inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_X
) {
1055 moddst
.WriteMask
= RC_MASK_X
;
1056 emit1(c
, inst
->Prev
, RC_OPCODE_COS
, inst
->U
.I
.SaturateMode
, moddst
,
1057 srcregswz(RC_FILE_TEMPORARY
, srctmp
, RC_SWIZZLE_WWWW
));
1059 if (inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_Y
) {
1060 moddst
.WriteMask
= RC_MASK_Y
;
1061 emit1(c
, inst
->Prev
, RC_OPCODE_SIN
, inst
->U
.I
.SaturateMode
, moddst
,
1062 srcregswz(RC_FILE_TEMPORARY
, srctmp
, RC_SWIZZLE_WWWW
));
1066 rc_remove_instruction(inst
);
1071 * Transform the trigonometric functions COS, SIN, and SCS
1072 * to include pre-scaling by 1/(2*PI) and taking the fractional
1073 * part, so that the input to COS and SIN is always in the range [0,1).
1074 * SCS is replaced by one COS and one SIN instruction.
1076 * @warning This transformation implicitly changes the semantics of SIN and COS!
1078 int radeonTransformTrigScale(struct radeon_compiler
* c
,
1079 struct rc_instruction
* inst
,
1082 static const float RCP_2PI
= 0.15915494309189535;
1084 unsigned int constant
;
1085 unsigned int constant_swizzle
;
1087 if (inst
->U
.I
.Opcode
!= RC_OPCODE_COS
&&
1088 inst
->U
.I
.Opcode
!= RC_OPCODE_SIN
&&
1089 inst
->U
.I
.Opcode
!= RC_OPCODE_SCS
)
1092 temp
= rc_find_free_temporary(c
);
1093 constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
, RCP_2PI
, &constant_swizzle
);
1095 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dstregtmpmask(temp
, RC_MASK_W
),
1096 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
1097 srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
));
1098 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(temp
, RC_MASK_W
),
1099 srcreg(RC_FILE_TEMPORARY
, temp
));
1101 r300_transform_SIN_COS_SCS(c
, inst
, temp
);
1106 * Transform the trigonometric functions COS, SIN, and SCS
1107 * so that the input to COS and SIN is always in the range [-PI, PI].
1108 * SCS is replaced by one COS and one SIN instruction.
1110 int r300_transform_trig_scale_vertex(struct radeon_compiler
*c
,
1111 struct rc_instruction
*inst
,
1114 static const float cons
[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};
1116 unsigned int constant
;
1118 if (inst
->U
.I
.Opcode
!= RC_OPCODE_COS
&&
1119 inst
->U
.I
.Opcode
!= RC_OPCODE_SIN
&&
1120 inst
->U
.I
.Opcode
!= RC_OPCODE_SCS
)
1123 /* Repeat x in the range [-PI, PI]:
1125 * repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI
1128 temp
= rc_find_free_temporary(c
);
1129 constant
= rc_constants_add_immediate_vec4(&c
->Program
.Constants
, cons
);
1131 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(temp
, RC_MASK_W
),
1132 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
1133 srcregswz(RC_FILE_CONSTANT
, constant
, RC_SWIZZLE_XXXX
),
1134 srcregswz(RC_FILE_CONSTANT
, constant
, RC_SWIZZLE_YYYY
));
1135 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(temp
, RC_MASK_W
),
1136 srcreg(RC_FILE_TEMPORARY
, temp
));
1137 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(temp
, RC_MASK_W
),
1138 srcreg(RC_FILE_TEMPORARY
, temp
),
1139 srcregswz(RC_FILE_CONSTANT
, constant
, RC_SWIZZLE_ZZZZ
),
1140 srcregswz(RC_FILE_CONSTANT
, constant
, RC_SWIZZLE_WWWW
));
1142 r300_transform_SIN_COS_SCS(c
, inst
, temp
);
1147 * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
1148 * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
1149 * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
1151 * @warning This explicitly changes the form of DDX and DDY!
1154 int radeonTransformDeriv(struct radeon_compiler
* c
,
1155 struct rc_instruction
* inst
,
1158 if (inst
->U
.I
.Opcode
!= RC_OPCODE_DDX
&& inst
->U
.I
.Opcode
!= RC_OPCODE_DDY
)
1161 inst
->U
.I
.SrcReg
[1].Swizzle
= RC_SWIZZLE_1111
;
1162 inst
->U
.I
.SrcReg
[1].Negate
= RC_MASK_XYZW
;
1168 * IF Temp[0].x -> IF Temp[0].x
1170 * KILP -> KIL -abs(Temp[0].x)
1177 * KILP - > KIL -abs(Temp[0].x)
1182 * IF Temp[0].x -> IF Temp[0].x
1186 * KILP -> KIL -abs(Temp[0].x)
1192 * KILP -> KIL -none.1111
1194 * This needs to be done in its own pass, because it might modify the
1195 * instructions before and after KILP.
1197 void rc_transform_KILP(struct radeon_compiler
* c
, void *user
)
1199 struct rc_instruction
* inst
;
1200 for (inst
= c
->Program
.Instructions
.Next
;
1201 inst
!= &c
->Program
.Instructions
; inst
= inst
->Next
) {
1202 struct rc_instruction
* if_inst
;
1205 if (inst
->U
.I
.Opcode
!= RC_OPCODE_KILP
)
1208 for (if_inst
= inst
->Prev
; if_inst
!= &c
->Program
.Instructions
;
1209 if_inst
= if_inst
->Prev
) {
1211 if (if_inst
->U
.I
.Opcode
== RC_OPCODE_IF
) {
1217 inst
->U
.I
.Opcode
= RC_OPCODE_KIL
;
1220 inst
->U
.I
.SrcReg
[0] = negate(builtin_one
);
1222 /* This should work even if the KILP is inside the ELSE
1223 * block, because -0.0 is considered negative. */
1224 inst
->U
.I
.SrcReg
[0] =
1225 negate(absolute(if_inst
->U
.I
.SrcReg
[0]));
1227 if (inst
->Prev
->U
.I
.Opcode
!= RC_OPCODE_IF
1228 && inst
->Next
->U
.I
.Opcode
!= RC_OPCODE_ENDIF
) {
1230 /* Optimize the special case:
1237 rc_remove_instruction(inst
->Prev
);
1239 rc_remove_instruction(inst
->Next
);