2 * Copyright (C) 2008 Nicolai Haehnle.
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 * Shareable transformations that transform "special" ALU instructions
32 * into ALU instructions that are supported by hardware.
36 #include "radeon_program_alu.h"
38 #include "radeon_compiler.h"
39 #include "radeon_compiler_util.h"
42 static struct rc_instruction
*emit1(
43 struct radeon_compiler
* c
, struct rc_instruction
* after
,
44 rc_opcode Opcode
, struct rc_sub_instruction
* base
,
45 struct rc_dst_register DstReg
, struct rc_src_register SrcReg
)
47 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
50 memcpy(&fpi
->U
.I
, base
, sizeof(struct rc_sub_instruction
));
53 fpi
->U
.I
.Opcode
= Opcode
;
54 fpi
->U
.I
.DstReg
= DstReg
;
55 fpi
->U
.I
.SrcReg
[0] = SrcReg
;
59 static struct rc_instruction
*emit2(
60 struct radeon_compiler
* c
, struct rc_instruction
* after
,
61 rc_opcode Opcode
, struct rc_sub_instruction
* base
,
62 struct rc_dst_register DstReg
,
63 struct rc_src_register SrcReg0
, struct rc_src_register SrcReg1
)
65 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
68 memcpy(&fpi
->U
.I
, base
, sizeof(struct rc_sub_instruction
));
71 fpi
->U
.I
.Opcode
= Opcode
;
72 fpi
->U
.I
.DstReg
= DstReg
;
73 fpi
->U
.I
.SrcReg
[0] = SrcReg0
;
74 fpi
->U
.I
.SrcReg
[1] = SrcReg1
;
78 static struct rc_instruction
*emit3(
79 struct radeon_compiler
* c
, struct rc_instruction
* after
,
80 rc_opcode Opcode
, struct rc_sub_instruction
* base
,
81 struct rc_dst_register DstReg
,
82 struct rc_src_register SrcReg0
, struct rc_src_register SrcReg1
,
83 struct rc_src_register SrcReg2
)
85 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
88 memcpy(&fpi
->U
.I
, base
, sizeof(struct rc_sub_instruction
));
91 fpi
->U
.I
.Opcode
= Opcode
;
92 fpi
->U
.I
.DstReg
= DstReg
;
93 fpi
->U
.I
.SrcReg
[0] = SrcReg0
;
94 fpi
->U
.I
.SrcReg
[1] = SrcReg1
;
95 fpi
->U
.I
.SrcReg
[2] = SrcReg2
;
99 static struct rc_dst_register
dstregtmpmask(int index
, int mask
)
101 struct rc_dst_register dst
= {0, 0, 0};
102 dst
.File
= RC_FILE_TEMPORARY
;
104 dst
.WriteMask
= mask
;
108 static const struct rc_src_register builtin_zero
= {
109 .File
= RC_FILE_NONE
,
111 .Swizzle
= RC_SWIZZLE_0000
113 static const struct rc_src_register builtin_one
= {
114 .File
= RC_FILE_NONE
,
116 .Swizzle
= RC_SWIZZLE_1111
119 static const struct rc_src_register builtin_half
= {
120 .File
= RC_FILE_NONE
,
122 .Swizzle
= RC_SWIZZLE_HHHH
125 static const struct rc_src_register srcreg_undefined
= {
126 .File
= RC_FILE_NONE
,
128 .Swizzle
= RC_SWIZZLE_XYZW
131 static struct rc_src_register
srcreg(int file
, int index
)
133 struct rc_src_register src
= srcreg_undefined
;
139 static struct rc_src_register
srcregswz(int file
, int index
, int swz
)
141 struct rc_src_register src
= srcreg_undefined
;
148 static struct rc_src_register
absolute(struct rc_src_register reg
)
150 struct rc_src_register newreg
= reg
;
152 newreg
.Negate
= RC_MASK_NONE
;
156 static struct rc_src_register
negate(struct rc_src_register reg
)
158 struct rc_src_register newreg
= reg
;
159 newreg
.Negate
= newreg
.Negate
^ RC_MASK_XYZW
;
163 static struct rc_src_register
swizzle(struct rc_src_register reg
,
164 rc_swizzle x
, rc_swizzle y
, rc_swizzle z
, rc_swizzle w
)
166 struct rc_src_register swizzled
= reg
;
167 swizzled
.Swizzle
= combine_swizzles4(reg
.Swizzle
, x
, y
, z
, w
);
171 static struct rc_src_register
swizzle_smear(struct rc_src_register reg
,
174 return swizzle(reg
, x
, x
, x
, x
);
177 static struct rc_src_register
swizzle_xxxx(struct rc_src_register reg
)
179 return swizzle_smear(reg
, RC_SWIZZLE_X
);
182 static struct rc_src_register
swizzle_yyyy(struct rc_src_register reg
)
184 return swizzle_smear(reg
, RC_SWIZZLE_Y
);
187 static struct rc_src_register
swizzle_zzzz(struct rc_src_register reg
)
189 return swizzle_smear(reg
, RC_SWIZZLE_Z
);
192 static struct rc_src_register
swizzle_wwww(struct rc_src_register reg
)
194 return swizzle_smear(reg
, RC_SWIZZLE_W
);
197 static int is_dst_safe_to_reuse(struct rc_instruction
*inst
)
199 const struct rc_opcode_info
*info
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
202 assert(info
->HasDstReg
);
204 if (inst
->U
.I
.DstReg
.File
!= RC_FILE_TEMPORARY
)
207 for (i
= 0; i
< info
->NumSrcRegs
; i
++) {
208 if (inst
->U
.I
.SrcReg
[i
].File
== RC_FILE_TEMPORARY
&&
209 inst
->U
.I
.SrcReg
[i
].Index
== inst
->U
.I
.DstReg
.Index
)
216 static struct rc_dst_register
try_to_reuse_dst(struct radeon_compiler
*c
,
217 struct rc_instruction
*inst
)
221 if (is_dst_safe_to_reuse(inst
))
222 tmp
= inst
->U
.I
.DstReg
.Index
;
224 tmp
= rc_find_free_temporary(c
);
226 return dstregtmpmask(tmp
, inst
->U
.I
.DstReg
.WriteMask
);
229 static void transform_ABS(struct radeon_compiler
* c
,
230 struct rc_instruction
* inst
)
232 struct rc_src_register src
= inst
->U
.I
.SrcReg
[0];
234 src
.Negate
= RC_MASK_NONE
;
235 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, &inst
->U
.I
, inst
->U
.I
.DstReg
, src
);
236 rc_remove_instruction(inst
);
239 static void transform_CEIL(struct radeon_compiler
* c
,
240 struct rc_instruction
* inst
)
243 * ceil(x) = -floor(-x)
245 * After inlining floor:
246 * ceil(x) = -(-x-frac(-x))
248 * After simplification:
249 * ceil(x) = x+frac(-x)
252 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
253 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dst
, negate(inst
->U
.I
.SrcReg
[0]));
254 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, &inst
->U
.I
, inst
->U
.I
.DstReg
,
255 inst
->U
.I
.SrcReg
[0], srcreg(RC_FILE_TEMPORARY
, dst
.Index
));
256 rc_remove_instruction(inst
);
259 static void transform_CLAMP(struct radeon_compiler
*c
,
260 struct rc_instruction
*inst
)
262 /* CLAMP dst, src, min, max
267 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
268 emit2(c
, inst
->Prev
, RC_OPCODE_MIN
, 0, dst
,
269 inst
->U
.I
.SrcReg
[0], inst
->U
.I
.SrcReg
[2]);
270 emit2(c
, inst
->Prev
, RC_OPCODE_MAX
, &inst
->U
.I
, inst
->U
.I
.DstReg
,
271 srcreg(RC_FILE_TEMPORARY
, dst
.Index
), inst
->U
.I
.SrcReg
[1]);
272 rc_remove_instruction(inst
);
275 static void transform_DP2(struct radeon_compiler
* c
,
276 struct rc_instruction
* inst
)
278 struct rc_src_register src0
= inst
->U
.I
.SrcReg
[0];
279 struct rc_src_register src1
= inst
->U
.I
.SrcReg
[1];
280 src0
.Negate
&= ~(RC_MASK_Z
| RC_MASK_W
);
281 src0
.Swizzle
&= ~(63 << (3 * 2));
282 src0
.Swizzle
|= (RC_SWIZZLE_ZERO
<< (3 * 2)) | (RC_SWIZZLE_ZERO
<< (3 * 3));
283 src1
.Negate
&= ~(RC_MASK_Z
| RC_MASK_W
);
284 src1
.Swizzle
&= ~(63 << (3 * 2));
285 src1
.Swizzle
|= (RC_SWIZZLE_ZERO
<< (3 * 2)) | (RC_SWIZZLE_ZERO
<< (3 * 3));
286 emit2(c
, inst
->Prev
, RC_OPCODE_DP3
, &inst
->U
.I
, inst
->U
.I
.DstReg
, src0
, src1
);
287 rc_remove_instruction(inst
);
290 static void transform_DPH(struct radeon_compiler
* c
,
291 struct rc_instruction
* inst
)
293 struct rc_src_register src0
= inst
->U
.I
.SrcReg
[0];
294 src0
.Negate
&= ~RC_MASK_W
;
295 src0
.Swizzle
&= ~(7 << (3 * 3));
296 src0
.Swizzle
|= RC_SWIZZLE_ONE
<< (3 * 3);
297 emit2(c
, inst
->Prev
, RC_OPCODE_DP4
, &inst
->U
.I
, inst
->U
.I
.DstReg
, src0
, inst
->U
.I
.SrcReg
[1]);
298 rc_remove_instruction(inst
);
302 * [1, src0.y*src1.y, src0.z, src1.w]
303 * So basically MUL with lotsa swizzling.
305 static void transform_DST(struct radeon_compiler
* c
,
306 struct rc_instruction
* inst
)
308 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, &inst
->U
.I
, inst
->U
.I
.DstReg
,
309 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_ONE
, RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_ONE
),
310 swizzle(inst
->U
.I
.SrcReg
[1], RC_SWIZZLE_ONE
, RC_SWIZZLE_Y
, RC_SWIZZLE_ONE
, RC_SWIZZLE_W
));
311 rc_remove_instruction(inst
);
314 static void transform_FLR(struct radeon_compiler
* c
,
315 struct rc_instruction
* inst
)
317 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
318 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dst
, inst
->U
.I
.SrcReg
[0]);
319 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, &inst
->U
.I
, inst
->U
.I
.DstReg
,
320 inst
->U
.I
.SrcReg
[0], negate(srcreg(RC_FILE_TEMPORARY
, dst
.Index
)));
321 rc_remove_instruction(inst
);
324 static void transform_TRUNC(struct radeon_compiler
* c
,
325 struct rc_instruction
* inst
)
327 /* Definition of trunc:
328 * trunc(x) = (abs(x) - fract(abs(x))) * sgn(x)
330 * The multiplication by sgn(x) can be simplified using CMP:
331 * y * sgn(x) = (x < 0 ? -y : y)
333 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
334 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dst
, absolute(inst
->U
.I
.SrcReg
[0]));
335 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dst
, absolute(inst
->U
.I
.SrcReg
[0]),
336 negate(srcreg(RC_FILE_TEMPORARY
, dst
.Index
)));
337 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, &inst
->U
.I
, inst
->U
.I
.DstReg
, inst
->U
.I
.SrcReg
[0],
338 negate(srcreg(RC_FILE_TEMPORARY
, dst
.Index
)), srcreg(RC_FILE_TEMPORARY
, dst
.Index
));
339 rc_remove_instruction(inst
);
343 * Definition of LIT (from ARB_fragment_program):
345 * tmp = VectorLoad(op0);
346 * if (tmp.x < 0) tmp.x = 0;
347 * if (tmp.y < 0) tmp.y = 0;
348 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
349 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
352 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
355 * The longest path of computation is the one leading to result.z,
356 * consisting of 5 operations. This implementation of LIT takes
357 * 5 slots, if the subsequent optimization passes are clever enough
358 * to pair instructions correctly.
360 static void transform_LIT(struct radeon_compiler
* c
,
361 struct rc_instruction
* inst
)
363 unsigned int constant
;
364 unsigned int constant_swizzle
;
366 struct rc_src_register srctemp
;
368 constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
, -127.999999, &constant_swizzle
);
370 if (inst
->U
.I
.DstReg
.WriteMask
!= RC_MASK_XYZW
|| inst
->U
.I
.DstReg
.File
!= RC_FILE_TEMPORARY
) {
371 struct rc_instruction
* inst_mov
;
373 inst_mov
= emit1(c
, inst
,
374 RC_OPCODE_MOV
, 0, inst
->U
.I
.DstReg
,
375 srcreg(RC_FILE_TEMPORARY
, rc_find_free_temporary(c
)));
377 inst
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
378 inst
->U
.I
.DstReg
.Index
= inst_mov
->U
.I
.SrcReg
[0].Index
;
379 inst
->U
.I
.DstReg
.WriteMask
= RC_MASK_XYZW
;
382 temp
= inst
->U
.I
.DstReg
.Index
;
383 srctemp
= srcreg(RC_FILE_TEMPORARY
, temp
);
385 /* tmp.x = max(0.0, Src.x); */
386 /* tmp.y = max(0.0, Src.y); */
387 /* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
388 emit2(c
, inst
->Prev
, RC_OPCODE_MAX
, 0,
389 dstregtmpmask(temp
, RC_MASK_XYW
),
391 swizzle(srcreg(RC_FILE_CONSTANT
, constant
),
392 RC_SWIZZLE_ZERO
, RC_SWIZZLE_ZERO
, RC_SWIZZLE_ZERO
, constant_swizzle
&3));
393 emit2(c
, inst
->Prev
, RC_OPCODE_MIN
, 0,
394 dstregtmpmask(temp
, RC_MASK_Z
),
395 swizzle_wwww(srctemp
),
396 negate(srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
)));
398 /* tmp.w = Pow(tmp.y, tmp.w) */
399 emit1(c
, inst
->Prev
, RC_OPCODE_LG2
, 0,
400 dstregtmpmask(temp
, RC_MASK_W
),
401 swizzle_yyyy(srctemp
));
402 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0,
403 dstregtmpmask(temp
, RC_MASK_W
),
404 swizzle_wwww(srctemp
),
405 swizzle_zzzz(srctemp
));
406 emit1(c
, inst
->Prev
, RC_OPCODE_EX2
, 0,
407 dstregtmpmask(temp
, RC_MASK_W
),
408 swizzle_wwww(srctemp
));
410 /* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
411 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, &inst
->U
.I
,
412 dstregtmpmask(temp
, RC_MASK_Z
),
413 negate(swizzle_xxxx(srctemp
)),
414 swizzle_wwww(srctemp
),
417 /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
418 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, &inst
->U
.I
,
419 dstregtmpmask(temp
, RC_MASK_XYW
),
420 swizzle(srctemp
, RC_SWIZZLE_ONE
, RC_SWIZZLE_X
, RC_SWIZZLE_ONE
, RC_SWIZZLE_ONE
));
422 rc_remove_instruction(inst
);
425 static void transform_LRP(struct radeon_compiler
* c
,
426 struct rc_instruction
* inst
)
428 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
430 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0,
432 inst
->U
.I
.SrcReg
[1], negate(inst
->U
.I
.SrcReg
[2]));
433 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, &inst
->U
.I
,
435 inst
->U
.I
.SrcReg
[0], srcreg(RC_FILE_TEMPORARY
, dst
.Index
), inst
->U
.I
.SrcReg
[2]);
437 rc_remove_instruction(inst
);
440 static void transform_POW(struct radeon_compiler
* c
,
441 struct rc_instruction
* inst
)
443 struct rc_dst_register tempdst
= try_to_reuse_dst(c
, inst
);
444 struct rc_src_register tempsrc
= srcreg(RC_FILE_TEMPORARY
, tempdst
.Index
);
445 tempdst
.WriteMask
= RC_MASK_W
;
446 tempsrc
.Swizzle
= RC_SWIZZLE_WWWW
;
448 emit1(c
, inst
->Prev
, RC_OPCODE_LG2
, 0, tempdst
, swizzle_xxxx(inst
->U
.I
.SrcReg
[0]));
449 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, tempdst
, tempsrc
, swizzle_xxxx(inst
->U
.I
.SrcReg
[1]));
450 emit1(c
, inst
->Prev
, RC_OPCODE_EX2
, &inst
->U
.I
, inst
->U
.I
.DstReg
, tempsrc
);
452 rc_remove_instruction(inst
);
455 /* dst = ROUND(src) :
460 * According to the GLSL spec, the implementor can decide which way to round
461 * when the fraction is .5. We round down for .5.
464 static void transform_ROUND(struct radeon_compiler
* c
,
465 struct rc_instruction
* inst
)
467 unsigned int mask
= inst
->U
.I
.DstReg
.WriteMask
;
468 unsigned int frac_index
, add_index
;
469 struct rc_dst_register frac_dst
, add_dst
;
470 struct rc_src_register frac_src
, add_src
;
473 add_index
= rc_find_free_temporary(c
);
474 add_dst
= dstregtmpmask(add_index
, mask
);
475 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, add_dst
, inst
->U
.I
.SrcReg
[0],
477 add_src
= srcreg(RC_FILE_TEMPORARY
, add_dst
.Index
);
480 /* frac = FRC(add) */
481 frac_index
= rc_find_free_temporary(c
);
482 frac_dst
= dstregtmpmask(frac_index
, mask
);
483 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, frac_dst
, add_src
);
484 frac_src
= srcreg(RC_FILE_TEMPORARY
, frac_dst
.Index
);
486 /* dst = add - frac */
487 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, inst
->U
.I
.DstReg
,
488 add_src
, negate(frac_src
));
489 rc_remove_instruction(inst
);
492 static void transform_RSQ(struct radeon_compiler
* c
,
493 struct rc_instruction
* inst
)
495 inst
->U
.I
.SrcReg
[0] = absolute(inst
->U
.I
.SrcReg
[0]);
498 static void transform_SEQ(struct radeon_compiler
* c
,
499 struct rc_instruction
* inst
)
501 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
503 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dst
, inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
504 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, &inst
->U
.I
, inst
->U
.I
.DstReg
,
505 negate(absolute(srcreg(RC_FILE_TEMPORARY
, dst
.Index
))), builtin_zero
, builtin_one
);
507 rc_remove_instruction(inst
);
510 static void transform_SFL(struct radeon_compiler
* c
,
511 struct rc_instruction
* inst
)
513 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, &inst
->U
.I
, inst
->U
.I
.DstReg
, builtin_zero
);
514 rc_remove_instruction(inst
);
517 static void transform_SGE(struct radeon_compiler
* c
,
518 struct rc_instruction
* inst
)
520 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
522 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dst
, inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
523 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, &inst
->U
.I
, inst
->U
.I
.DstReg
,
524 srcreg(RC_FILE_TEMPORARY
, dst
.Index
), builtin_zero
, builtin_one
);
526 rc_remove_instruction(inst
);
529 static void transform_SGT(struct radeon_compiler
* c
,
530 struct rc_instruction
* inst
)
532 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
534 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dst
, negate(inst
->U
.I
.SrcReg
[0]), inst
->U
.I
.SrcReg
[1]);
535 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, &inst
->U
.I
, inst
->U
.I
.DstReg
,
536 srcreg(RC_FILE_TEMPORARY
, dst
.Index
), builtin_one
, builtin_zero
);
538 rc_remove_instruction(inst
);
541 static void transform_SLE(struct radeon_compiler
* c
,
542 struct rc_instruction
* inst
)
544 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
546 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dst
, negate(inst
->U
.I
.SrcReg
[0]), inst
->U
.I
.SrcReg
[1]);
547 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, &inst
->U
.I
, inst
->U
.I
.DstReg
,
548 srcreg(RC_FILE_TEMPORARY
, dst
.Index
), builtin_zero
, builtin_one
);
550 rc_remove_instruction(inst
);
553 static void transform_SLT(struct radeon_compiler
* c
,
554 struct rc_instruction
* inst
)
556 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
558 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dst
, inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
559 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, &inst
->U
.I
, inst
->U
.I
.DstReg
,
560 srcreg(RC_FILE_TEMPORARY
, dst
.Index
), builtin_one
, builtin_zero
);
562 rc_remove_instruction(inst
);
565 static void transform_SNE(struct radeon_compiler
* c
,
566 struct rc_instruction
* inst
)
568 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
570 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dst
, inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
571 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, &inst
->U
.I
, inst
->U
.I
.DstReg
,
572 negate(absolute(srcreg(RC_FILE_TEMPORARY
, dst
.Index
))), builtin_one
, builtin_zero
);
574 rc_remove_instruction(inst
);
577 static void transform_SSG(struct radeon_compiler
* c
,
578 struct rc_instruction
* inst
)
584 * ADD result, tmp0, -tmp1;
586 struct rc_dst_register dst0
;
590 dst0
= try_to_reuse_dst(c
, inst
);
591 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, 0,
593 negate(inst
->U
.I
.SrcReg
[0]),
598 tmp1
= rc_find_free_temporary(c
);
599 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, 0,
600 dstregtmpmask(tmp1
, inst
->U
.I
.DstReg
.WriteMask
),
605 /* Either both are zero, or one of them is one and the other is zero. */
606 /* result = tmp0 - tmp1 */
607 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0,
609 srcreg(RC_FILE_TEMPORARY
, dst0
.Index
),
610 negate(srcreg(RC_FILE_TEMPORARY
, tmp1
)));
612 rc_remove_instruction(inst
);
615 static void transform_SUB(struct radeon_compiler
* c
,
616 struct rc_instruction
* inst
)
618 inst
->U
.I
.Opcode
= RC_OPCODE_ADD
;
619 inst
->U
.I
.SrcReg
[1] = negate(inst
->U
.I
.SrcReg
[1]);
622 static void transform_SWZ(struct radeon_compiler
* c
,
623 struct rc_instruction
* inst
)
625 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
628 static void transform_XPD(struct radeon_compiler
* c
,
629 struct rc_instruction
* inst
)
631 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
633 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dst
,
634 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_W
),
635 swizzle(inst
->U
.I
.SrcReg
[1], RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_W
));
636 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, &inst
->U
.I
, inst
->U
.I
.DstReg
,
637 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_W
),
638 swizzle(inst
->U
.I
.SrcReg
[1], RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_W
),
639 negate(srcreg(RC_FILE_TEMPORARY
, dst
.Index
)));
641 rc_remove_instruction(inst
);
646 * Can be used as a transformation for @ref radeonClauseLocalTransform,
647 * no userData necessary.
649 * Eliminates the following ALU instructions:
650 * ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
652 * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
654 * Transforms RSQ to Radeon's native RSQ by explicitly setting
657 * @note should be applicable to R300 and R500 fragment programs.
659 int radeonTransformALU(
660 struct radeon_compiler
* c
,
661 struct rc_instruction
* inst
,
664 switch(inst
->U
.I
.Opcode
) {
665 case RC_OPCODE_ABS
: transform_ABS(c
, inst
); return 1;
666 case RC_OPCODE_CEIL
: transform_CEIL(c
, inst
); return 1;
667 case RC_OPCODE_CLAMP
: transform_CLAMP(c
, inst
); return 1;
668 case RC_OPCODE_DP2
: transform_DP2(c
, inst
); return 1;
669 case RC_OPCODE_DPH
: transform_DPH(c
, inst
); return 1;
670 case RC_OPCODE_DST
: transform_DST(c
, inst
); return 1;
671 case RC_OPCODE_FLR
: transform_FLR(c
, inst
); return 1;
672 case RC_OPCODE_LIT
: transform_LIT(c
, inst
); return 1;
673 case RC_OPCODE_LRP
: transform_LRP(c
, inst
); return 1;
674 case RC_OPCODE_POW
: transform_POW(c
, inst
); return 1;
675 case RC_OPCODE_ROUND
: transform_ROUND(c
, inst
); return 1;
676 case RC_OPCODE_RSQ
: transform_RSQ(c
, inst
); return 1;
677 case RC_OPCODE_SEQ
: transform_SEQ(c
, inst
); return 1;
678 case RC_OPCODE_SFL
: transform_SFL(c
, inst
); return 1;
679 case RC_OPCODE_SGE
: transform_SGE(c
, inst
); return 1;
680 case RC_OPCODE_SGT
: transform_SGT(c
, inst
); return 1;
681 case RC_OPCODE_SLE
: transform_SLE(c
, inst
); return 1;
682 case RC_OPCODE_SLT
: transform_SLT(c
, inst
); return 1;
683 case RC_OPCODE_SNE
: transform_SNE(c
, inst
); return 1;
684 case RC_OPCODE_SSG
: transform_SSG(c
, inst
); return 1;
685 case RC_OPCODE_SUB
: transform_SUB(c
, inst
); return 1;
686 case RC_OPCODE_SWZ
: transform_SWZ(c
, inst
); return 1;
687 case RC_OPCODE_TRUNC
: transform_TRUNC(c
, inst
); return 1;
688 case RC_OPCODE_XPD
: transform_XPD(c
, inst
); return 1;
695 static void transform_r300_vertex_ABS(struct radeon_compiler
* c
,
696 struct rc_instruction
* inst
)
698 /* Note: r500 can take absolute values, but r300 cannot. */
699 inst
->U
.I
.Opcode
= RC_OPCODE_MAX
;
700 inst
->U
.I
.SrcReg
[1] = inst
->U
.I
.SrcReg
[0];
701 inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
704 static void transform_r300_vertex_CMP(struct radeon_compiler
* c
,
705 struct rc_instruction
* inst
)
707 /* There is no decent CMP available, so let's rig one up.
708 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
709 * The following sequence consumes zero to two temps and two extra slots
710 * (the second temp and the second slot is consumed by transform_LRP),
711 * but should be equivalent:
713 * SLT tmp0, src0, 0.0
714 * LRP dst, tmp0, src1, src2
716 * Yes, I know, I'm a mad scientist. ~ C. & M. */
717 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
719 /* SLT tmp0, src0, 0.0 */
720 emit2(c
, inst
->Prev
, RC_OPCODE_SLT
, 0,
722 inst
->U
.I
.SrcReg
[0], builtin_zero
);
724 /* LRP dst, tmp0, src1, src2 */
726 emit3(c
, inst
->Prev
, RC_OPCODE_LRP
, 0,
728 srcreg(RC_FILE_TEMPORARY
, dst
.Index
), inst
->U
.I
.SrcReg
[1], inst
->U
.I
.SrcReg
[2]));
730 rc_remove_instruction(inst
);
733 static void transform_r300_vertex_DP2(struct radeon_compiler
* c
,
734 struct rc_instruction
* inst
)
736 struct rc_instruction
*next_inst
= inst
->Next
;
737 transform_DP2(c
, inst
);
738 next_inst
->Prev
->U
.I
.Opcode
= RC_OPCODE_DP4
;
741 static void transform_r300_vertex_DP3(struct radeon_compiler
* c
,
742 struct rc_instruction
* inst
)
744 struct rc_src_register src0
= inst
->U
.I
.SrcReg
[0];
745 struct rc_src_register src1
= inst
->U
.I
.SrcReg
[1];
746 src0
.Negate
&= ~RC_MASK_W
;
747 src0
.Swizzle
&= ~(7 << (3 * 3));
748 src0
.Swizzle
|= RC_SWIZZLE_ZERO
<< (3 * 3);
749 src1
.Negate
&= ~RC_MASK_W
;
750 src1
.Swizzle
&= ~(7 << (3 * 3));
751 src1
.Swizzle
|= RC_SWIZZLE_ZERO
<< (3 * 3);
752 emit2(c
, inst
->Prev
, RC_OPCODE_DP4
, &inst
->U
.I
, inst
->U
.I
.DstReg
, src0
, src1
);
753 rc_remove_instruction(inst
);
756 static void transform_r300_vertex_fix_LIT(struct radeon_compiler
* c
,
757 struct rc_instruction
* inst
)
759 struct rc_dst_register dst
= try_to_reuse_dst(c
, inst
);
760 unsigned constant_swizzle
;
761 int constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
,
762 0.0000000000000000001,
766 dst
.WriteMask
= RC_MASK_XYZW
;
767 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, 0,
769 inst
->U
.I
.SrcReg
[0]);
771 /* MAX dst.y, src, 0.00...001 */
772 emit2(c
, inst
->Prev
, RC_OPCODE_MAX
, 0,
773 dstregtmpmask(dst
.Index
, RC_MASK_Y
),
774 srcreg(RC_FILE_TEMPORARY
, dst
.Index
),
775 srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
));
777 inst
->U
.I
.SrcReg
[0] = srcreg(RC_FILE_TEMPORARY
, dst
.Index
);
780 static void transform_r300_vertex_SEQ(struct radeon_compiler
*c
,
781 struct rc_instruction
*inst
)
783 /* x = y <==> x >= y && y >= x */
784 int tmp
= rc_find_free_temporary(c
);
787 emit2(c
, inst
->Prev
, RC_OPCODE_SGE
, 0,
788 dstregtmpmask(tmp
, inst
->U
.I
.DstReg
.WriteMask
),
790 inst
->U
.I
.SrcReg
[1]);
793 emit2(c
, inst
->Prev
, RC_OPCODE_SGE
, 0,
796 inst
->U
.I
.SrcReg
[0]);
799 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0,
801 srcreg(RC_FILE_TEMPORARY
, tmp
),
802 srcreg(inst
->U
.I
.DstReg
.File
, inst
->U
.I
.DstReg
.Index
));
804 rc_remove_instruction(inst
);
807 static void transform_r300_vertex_SNE(struct radeon_compiler
*c
,
808 struct rc_instruction
*inst
)
810 /* x != y <==> x < y || y < x */
811 int tmp
= rc_find_free_temporary(c
);
814 emit2(c
, inst
->Prev
, RC_OPCODE_SLT
, 0,
815 dstregtmpmask(tmp
, inst
->U
.I
.DstReg
.WriteMask
),
817 inst
->U
.I
.SrcReg
[1]);
820 emit2(c
, inst
->Prev
, RC_OPCODE_SLT
, 0,
823 inst
->U
.I
.SrcReg
[0]);
825 /* x || y = max(x, y) */
826 emit2(c
, inst
->Prev
, RC_OPCODE_MAX
, 0,
828 srcreg(RC_FILE_TEMPORARY
, tmp
),
829 srcreg(inst
->U
.I
.DstReg
.File
, inst
->U
.I
.DstReg
.Index
));
831 rc_remove_instruction(inst
);
834 static void transform_r300_vertex_SGT(struct radeon_compiler
* c
,
835 struct rc_instruction
* inst
)
837 /* x > y <==> -x < -y */
838 inst
->U
.I
.Opcode
= RC_OPCODE_SLT
;
839 inst
->U
.I
.SrcReg
[0].Negate
^= RC_MASK_XYZW
;
840 inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
843 static void transform_r300_vertex_SLE(struct radeon_compiler
* c
,
844 struct rc_instruction
* inst
)
846 /* x <= y <==> -x >= -y */
847 inst
->U
.I
.Opcode
= RC_OPCODE_SGE
;
848 inst
->U
.I
.SrcReg
[0].Negate
^= RC_MASK_XYZW
;
849 inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
852 static void transform_r300_vertex_SSG(struct radeon_compiler
* c
,
853 struct rc_instruction
* inst
)
859 * ADD result, tmp0, -tmp1;
861 struct rc_dst_register dst0
= try_to_reuse_dst(c
, inst
);
865 dst0
= try_to_reuse_dst(c
, inst
);
866 emit2(c
, inst
->Prev
, RC_OPCODE_SLT
, 0,
869 inst
->U
.I
.SrcReg
[0]);
872 tmp1
= rc_find_free_temporary(c
);
873 emit2(c
, inst
->Prev
, RC_OPCODE_SLT
, 0,
874 dstregtmpmask(tmp1
, inst
->U
.I
.DstReg
.WriteMask
),
878 /* Either both are zero, or one of them is one and the other is zero. */
879 /* result = tmp0 - tmp1 */
880 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0,
882 srcreg(RC_FILE_TEMPORARY
, dst0
.Index
),
883 negate(srcreg(RC_FILE_TEMPORARY
, tmp1
)));
885 rc_remove_instruction(inst
);
888 static void transform_vertex_TRUNC(struct radeon_compiler
* c
,
889 struct rc_instruction
* inst
)
891 struct rc_instruction
*next
= inst
->Next
;
893 /* next->Prev is removed after each transformation and replaced
894 * by a new instruction. */
895 transform_TRUNC(c
, next
->Prev
);
896 transform_r300_vertex_CMP(c
, next
->Prev
);
900 * For use with rc_local_transform, this transforms non-native ALU
901 * instructions of the r300 up to r500 vertex engine.
/*
 * Dispatch on the opcode of inst and invoke the matching vertex-engine
 * lowering helper. Every case visible below returns 1 after calling its
 * helper.
 *
 * NOTE(review): this listing is an incomplete extraction. The third
 * parameter, the lines around the transform_r300_vertex_SEQ and
 * transform_r300_vertex_SNE calls (their case labels and whatever
 * condition guards them), and the end of the switch are not visible
 * here - confirm against the full file before editing.
 */
903 int r300_transform_vertex_alu(
904 struct radeon_compiler
* c
,
905 struct rc_instruction
* inst
,
908 switch(inst
->U
.I
.Opcode
) {
909 case RC_OPCODE_ABS
: transform_r300_vertex_ABS(c
, inst
); return 1;
910 case RC_OPCODE_CEIL
: transform_CEIL(c
, inst
); return 1;
911 case RC_OPCODE_CLAMP
: transform_CLAMP(c
, inst
); return 1;
912 case RC_OPCODE_CMP
: transform_r300_vertex_CMP(c
, inst
); return 1;
913 case RC_OPCODE_DP2
: transform_r300_vertex_DP2(c
, inst
); return 1;
914 case RC_OPCODE_DP3
: transform_r300_vertex_DP3(c
, inst
); return 1;
915 case RC_OPCODE_DPH
: transform_DPH(c
, inst
); return 1;
916 case RC_OPCODE_FLR
: transform_FLR(c
, inst
); return 1;
917 case RC_OPCODE_LIT
: transform_r300_vertex_fix_LIT(c
, inst
); return 1;
918 case RC_OPCODE_LRP
: transform_LRP(c
, inst
); return 1;
/* NOTE(review): case label and guard for SEQ are missing from this listing. */
921 transform_r300_vertex_SEQ(c
, inst
);
925 case RC_OPCODE_SFL
: transform_SFL(c
, inst
); return 1;
926 case RC_OPCODE_SGT
: transform_r300_vertex_SGT(c
, inst
); return 1;
927 case RC_OPCODE_SLE
: transform_r300_vertex_SLE(c
, inst
); return 1;
/* NOTE(review): case label and guard for SNE are missing from this listing. */
930 transform_r300_vertex_SNE(c
, inst
);
934 case RC_OPCODE_SSG
: transform_r300_vertex_SSG(c
, inst
); return 1;
935 case RC_OPCODE_SUB
: transform_SUB(c
, inst
); return 1;
936 case RC_OPCODE_SWZ
: transform_SWZ(c
, inst
); return 1;
937 case RC_OPCODE_TRUNC
: transform_vertex_TRUNC(c
, inst
); return 1;
938 case RC_OPCODE_XPD
: transform_XPD(c
, inst
); return 1;
/*
 * Register the two rows of SinCosConsts as immediate vec4 constants of
 * the program and store the value returned for each row in
 * constants[0] and constants[1]. sin_approx() later uses those values
 * as RC_FILE_CONSTANT register indices.
 *
 * NOTE(review): the table is declared [2][4] but only five of its eight
 * initializers survive in this listing; the remaining values and the
 * declaration of i are not visible here - confirm against the full file.
 */
944 static void sincos_constants(struct radeon_compiler
* c
, unsigned int *constants
)
946 static const float SinCosConsts
[2][4] = {
948 1.273239545, /* 4/PI */
949 -0.405284735, /* -4/(PI*PI) */
950 3.141592654, /* PI */
956 0.159154943, /* 1/(2*PI) */
957 6.283185307 /* 2*PI */
/* One immediate vec4 per table row. */
962 for(i
= 0; i
< 2; ++i
)
963 constants
[i
] = rc_constants_add_immediate_vec4(&c
->Program
.Constants
, SinCosConsts
[i
]);
967 * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
969 * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
970 * MAD tmp.x, tmp.y, |src|, tmp.x
971 * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
972 * MAD dest, tmp.y, weight, tmp.x
974 static void sin_approx(
975 struct radeon_compiler
* c
, struct rc_instruction
* inst
,
976 struct rc_dst_register dst
, struct rc_src_register src
, const unsigned int* constants
)
978 unsigned int tempreg
= rc_find_free_temporary(c
);
980 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
982 srcreg(RC_FILE_CONSTANT
, constants
[0]));
983 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_X
),
984 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
985 absolute(swizzle_xxxx(src
)),
986 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
987 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_Y
),
988 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
989 absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
))),
990 negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
))));
991 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dst
,
992 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
993 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[0])),
994 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
998 * Translate the trigonometric functions COS, SIN, and SCS
999 * using only the basic instructions
1000 * MOV, ADD, MUL, MAD, FRC
1002 int r300_transform_trig_simple(struct radeon_compiler
* c
,
1003 struct rc_instruction
* inst
,
1006 unsigned int constants
[2];
1007 unsigned int tempreg
;
1009 if (inst
->U
.I
.Opcode
!= RC_OPCODE_COS
&&
1010 inst
->U
.I
.Opcode
!= RC_OPCODE_SIN
&&
1011 inst
->U
.I
.Opcode
!= RC_OPCODE_SCS
)
1014 tempreg
= rc_find_free_temporary(c
);
1016 sincos_constants(c
, constants
);
1018 if (inst
->U
.I
.Opcode
== RC_OPCODE_COS
) {
1019 /* MAD tmp.x, src, 1/(2*PI), 0.75 */
1020 /* FRC tmp.x, tmp.x */
1021 /* MAD tmp.z, tmp.x, 2*PI, -PI */
1022 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
1023 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
1024 swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[1])),
1025 swizzle_xxxx(srcreg(RC_FILE_CONSTANT
, constants
[1])));
1026 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
1027 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
1028 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
1029 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
1030 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[1])),
1031 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[0]))));
1033 sin_approx(c
, inst
, inst
->U
.I
.DstReg
,
1034 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
1036 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_SIN
) {
1037 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
1038 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
1039 swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[1])),
1040 swizzle_yyyy(srcreg(RC_FILE_CONSTANT
, constants
[1])));
1041 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
1042 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
1043 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
1044 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
1045 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[1])),
1046 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[0]))));
1048 sin_approx(c
, inst
, inst
->U
.I
.DstReg
,
1049 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
1052 struct rc_dst_register dst
;
1054 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
1055 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
1056 swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[1])),
1057 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_W
));
1058 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
1059 srcreg(RC_FILE_TEMPORARY
, tempreg
));
1060 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
1061 srcreg(RC_FILE_TEMPORARY
, tempreg
),
1062 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[1])),
1063 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[0]))));
1065 dst
= inst
->U
.I
.DstReg
;
1067 dst
.WriteMask
= inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_X
;
1068 sin_approx(c
, inst
, dst
,
1069 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
1072 dst
.WriteMask
= inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_Y
;
1073 sin_approx(c
, inst
, dst
,
1074 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
1078 rc_remove_instruction(inst
);
1083 static void r300_transform_SIN_COS_SCS(struct radeon_compiler
*c
,
1084 struct rc_instruction
*inst
,
1087 if (inst
->U
.I
.Opcode
== RC_OPCODE_COS
) {
1088 emit1(c
, inst
->Prev
, RC_OPCODE_COS
, &inst
->U
.I
, inst
->U
.I
.DstReg
,
1089 srcregswz(RC_FILE_TEMPORARY
, srctmp
, RC_SWIZZLE_WWWW
));
1090 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_SIN
) {
1091 emit1(c
, inst
->Prev
, RC_OPCODE_SIN
, &inst
->U
.I
,
1092 inst
->U
.I
.DstReg
, srcregswz(RC_FILE_TEMPORARY
, srctmp
, RC_SWIZZLE_WWWW
));
1093 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_SCS
) {
1094 struct rc_dst_register moddst
= inst
->U
.I
.DstReg
;
1096 if (inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_X
) {
1097 moddst
.WriteMask
= RC_MASK_X
;
1098 emit1(c
, inst
->Prev
, RC_OPCODE_COS
, &inst
->U
.I
, moddst
,
1099 srcregswz(RC_FILE_TEMPORARY
, srctmp
, RC_SWIZZLE_WWWW
));
1101 if (inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_Y
) {
1102 moddst
.WriteMask
= RC_MASK_Y
;
1103 emit1(c
, inst
->Prev
, RC_OPCODE_SIN
, &inst
->U
.I
, moddst
,
1104 srcregswz(RC_FILE_TEMPORARY
, srctmp
, RC_SWIZZLE_WWWW
));
1108 rc_remove_instruction(inst
);
1113 * Transform the trigonometric functions COS, SIN, and SCS
1114 * to include pre-scaling by 1/(2*PI) and taking the fractional
1115 * part, so that the input to COS and SIN is always in the range [0,1).
1116 * SCS is replaced by one COS and one SIN instruction.
1118 * @warning This transformation implicitly changes the semantics of SIN and COS!
1120 int radeonTransformTrigScale(struct radeon_compiler
* c
,
1121 struct rc_instruction
* inst
,
1124 static const float RCP_2PI
= 0.15915494309189535;
1126 unsigned int constant
;
1127 unsigned int constant_swizzle
;
1129 if (inst
->U
.I
.Opcode
!= RC_OPCODE_COS
&&
1130 inst
->U
.I
.Opcode
!= RC_OPCODE_SIN
&&
1131 inst
->U
.I
.Opcode
!= RC_OPCODE_SCS
)
1134 temp
= rc_find_free_temporary(c
);
1135 constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
, RCP_2PI
, &constant_swizzle
);
1137 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dstregtmpmask(temp
, RC_MASK_W
),
1138 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
1139 srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
));
1140 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(temp
, RC_MASK_W
),
1141 srcreg(RC_FILE_TEMPORARY
, temp
));
1143 r300_transform_SIN_COS_SCS(c
, inst
, temp
);
1148 * Transform the trigonometric functions COS, SIN, and SCS
1149 * so that the input to COS and SIN is always in the range [-PI, PI].
1150 * SCS is replaced by one COS and one SIN instruction.
1152 int r300_transform_trig_scale_vertex(struct radeon_compiler
*c
,
1153 struct rc_instruction
*inst
,
1156 static const float cons
[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};
1158 unsigned int constant
;
1160 if (inst
->U
.I
.Opcode
!= RC_OPCODE_COS
&&
1161 inst
->U
.I
.Opcode
!= RC_OPCODE_SIN
&&
1162 inst
->U
.I
.Opcode
!= RC_OPCODE_SCS
)
1165 /* Repeat x in the range [-PI, PI]:
1167 * repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI
1170 temp
= rc_find_free_temporary(c
);
1171 constant
= rc_constants_add_immediate_vec4(&c
->Program
.Constants
, cons
);
1173 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(temp
, RC_MASK_W
),
1174 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
1175 srcregswz(RC_FILE_CONSTANT
, constant
, RC_SWIZZLE_XXXX
),
1176 srcregswz(RC_FILE_CONSTANT
, constant
, RC_SWIZZLE_YYYY
));
1177 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(temp
, RC_MASK_W
),
1178 srcreg(RC_FILE_TEMPORARY
, temp
));
1179 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(temp
, RC_MASK_W
),
1180 srcreg(RC_FILE_TEMPORARY
, temp
),
1181 srcregswz(RC_FILE_CONSTANT
, constant
, RC_SWIZZLE_ZZZZ
),
1182 srcregswz(RC_FILE_CONSTANT
, constant
, RC_SWIZZLE_WWWW
));
1184 r300_transform_SIN_COS_SCS(c
, inst
, temp
);
1189 * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
1190 * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
1191 * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
1193 * @warning This explicitly changes the form of DDX and DDY!
1196 int radeonTransformDeriv(struct radeon_compiler
* c
,
1197 struct rc_instruction
* inst
,
1200 if (inst
->U
.I
.Opcode
!= RC_OPCODE_DDX
&& inst
->U
.I
.Opcode
!= RC_OPCODE_DDY
)
1203 inst
->U
.I
.SrcReg
[1].Swizzle
= RC_SWIZZLE_1111
;
1204 inst
->U
.I
.SrcReg
[1].Negate
= RC_MASK_XYZW
;
1210 * IF Temp[0].x -> IF Temp[0].x
1212 * KILL -> KIL -abs(Temp[0].x)
1219 * KILL - > KIL -abs(Temp[0].x)
1224 * IF Temp[0].x -> IF Temp[0].x
1228 * KILL -> KIL -abs(Temp[0].x)
1234 * KILL -> KIL -none.1111
1236 * This needs to be done in its own pass, because it might modify the
1237 * instructions before and after KILL.
1239 void rc_transform_KILL(struct radeon_compiler
* c
, void *user
)
1241 struct rc_instruction
* inst
;
1242 for (inst
= c
->Program
.Instructions
.Next
;
1243 inst
!= &c
->Program
.Instructions
; inst
= inst
->Next
) {
1244 struct rc_instruction
* if_inst
;
1247 if (inst
->U
.I
.Opcode
!= RC_OPCODE_KILP
)
1250 for (if_inst
= inst
->Prev
; if_inst
!= &c
->Program
.Instructions
;
1251 if_inst
= if_inst
->Prev
) {
1253 if (if_inst
->U
.I
.Opcode
== RC_OPCODE_IF
) {
1259 inst
->U
.I
.Opcode
= RC_OPCODE_KIL
;
1262 inst
->U
.I
.SrcReg
[0] = negate(builtin_one
);
1264 /* This should work even if the KILP is inside the ELSE
1265 * block, because -0.0 is considered negative. */
1266 inst
->U
.I
.SrcReg
[0] =
1267 negate(absolute(if_inst
->U
.I
.SrcReg
[0]));
1269 if (inst
->Prev
->U
.I
.Opcode
!= RC_OPCODE_IF
1270 && inst
->Next
->U
.I
.Opcode
!= RC_OPCODE_ENDIF
) {
1272 /* Optimize the special case:
1279 rc_remove_instruction(inst
->Prev
);
1281 rc_remove_instruction(inst
->Next
);
1287 int rc_force_output_alpha_to_one(struct radeon_compiler
*c
,
1288 struct rc_instruction
*inst
, void *data
)
1290 struct r300_fragment_program_compiler
*fragc
= (struct r300_fragment_program_compiler
*)c
;
1291 const struct rc_opcode_info
*info
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
1294 if (!info
->HasDstReg
|| inst
->U
.I
.DstReg
.File
!= RC_FILE_OUTPUT
||
1295 inst
->U
.I
.DstReg
.Index
== fragc
->OutputDepth
)
1298 tmp
= rc_find_free_temporary(c
);
1300 /* Insert MOV after inst, set alpha to 1. */
1301 emit1(c
, inst
, RC_OPCODE_MOV
, 0, inst
->U
.I
.DstReg
,
1302 srcregswz(RC_FILE_TEMPORARY
, tmp
, RC_SWIZZLE_XYZ1
));
1304 /* Re-route the destination of inst to the source of mov. */
1305 inst
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
1306 inst
->U
.I
.DstReg
.Index
= tmp
;
1308 /* Move the saturate output modifier to the MOV instruction
1309 * (for better copy propagation). */
1310 inst
->Next
->U
.I
.SaturateMode
= inst
->U
.I
.SaturateMode
;
1311 inst
->U
.I
.SaturateMode
= RC_SATURATE_NONE
;