/*
 * Copyright (C) 2008 Nicolai Haehnle.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial
 * portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
/**
 * @file
 *
 * Shareable transformations that transform "special" ALU instructions
 * into ALU instructions that are supported by hardware.
 */
36 #include "radeon_program_alu.h"
38 #include "radeon_compiler.h"
41 static struct rc_instruction
*emit1(
42 struct radeon_compiler
* c
, struct rc_instruction
* after
,
43 rc_opcode Opcode
, rc_saturate_mode Saturate
, struct rc_dst_register DstReg
,
44 struct rc_src_register SrcReg
)
46 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
48 fpi
->I
.Opcode
= Opcode
;
49 fpi
->I
.SaturateMode
= Saturate
;
50 fpi
->I
.DstReg
= DstReg
;
51 fpi
->I
.SrcReg
[0] = SrcReg
;
55 static struct rc_instruction
*emit2(
56 struct radeon_compiler
* c
, struct rc_instruction
* after
,
57 rc_opcode Opcode
, rc_saturate_mode Saturate
, struct rc_dst_register DstReg
,
58 struct rc_src_register SrcReg0
, struct rc_src_register SrcReg1
)
60 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
62 fpi
->I
.Opcode
= Opcode
;
63 fpi
->I
.SaturateMode
= Saturate
;
64 fpi
->I
.DstReg
= DstReg
;
65 fpi
->I
.SrcReg
[0] = SrcReg0
;
66 fpi
->I
.SrcReg
[1] = SrcReg1
;
70 static struct rc_instruction
*emit3(
71 struct radeon_compiler
* c
, struct rc_instruction
* after
,
72 rc_opcode Opcode
, rc_saturate_mode Saturate
, struct rc_dst_register DstReg
,
73 struct rc_src_register SrcReg0
, struct rc_src_register SrcReg1
,
74 struct rc_src_register SrcReg2
)
76 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
78 fpi
->I
.Opcode
= Opcode
;
79 fpi
->I
.SaturateMode
= Saturate
;
80 fpi
->I
.DstReg
= DstReg
;
81 fpi
->I
.SrcReg
[0] = SrcReg0
;
82 fpi
->I
.SrcReg
[1] = SrcReg1
;
83 fpi
->I
.SrcReg
[2] = SrcReg2
;
87 static struct rc_dst_register
dstreg(int file
, int index
)
89 struct rc_dst_register dst
;
92 dst
.WriteMask
= RC_MASK_XYZW
;
97 static struct rc_dst_register
dstregtmpmask(int index
, int mask
)
99 struct rc_dst_register dst
= {0};
100 dst
.File
= RC_FILE_TEMPORARY
;
102 dst
.WriteMask
= mask
;
107 static const struct rc_src_register builtin_zero
= {
108 .File
= RC_FILE_NONE
,
110 .Swizzle
= RC_SWIZZLE_0000
112 static const struct rc_src_register builtin_one
= {
113 .File
= RC_FILE_NONE
,
115 .Swizzle
= RC_SWIZZLE_1111
117 static const struct rc_src_register srcreg_undefined
= {
118 .File
= RC_FILE_NONE
,
120 .Swizzle
= RC_SWIZZLE_XYZW
123 static struct rc_src_register
srcreg(int file
, int index
)
125 struct rc_src_register src
= srcreg_undefined
;
131 static struct rc_src_register
srcregswz(int file
, int index
, int swz
)
133 struct rc_src_register src
= srcreg_undefined
;
140 static struct rc_src_register
absolute(struct rc_src_register reg
)
142 struct rc_src_register newreg
= reg
;
144 newreg
.Negate
= RC_MASK_NONE
;
148 static struct rc_src_register
negate(struct rc_src_register reg
)
150 struct rc_src_register newreg
= reg
;
151 newreg
.Negate
= newreg
.Negate
^ RC_MASK_XYZW
;
155 static struct rc_src_register
swizzle(struct rc_src_register reg
,
156 rc_swizzle x
, rc_swizzle y
, rc_swizzle z
, rc_swizzle w
)
158 struct rc_src_register swizzled
= reg
;
159 swizzled
.Swizzle
= combine_swizzles4(reg
.Swizzle
, x
, y
, z
, w
);
163 static struct rc_src_register
scalar(struct rc_src_register reg
)
165 return swizzle(reg
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
);
168 static void transform_ABS(struct radeon_compiler
* c
,
169 struct rc_instruction
* inst
)
171 struct rc_src_register src
= inst
->I
.SrcReg
[0];
173 src
.Negate
= RC_MASK_NONE
;
174 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, inst
->I
.SaturateMode
, inst
->I
.DstReg
, src
);
175 rc_remove_instruction(inst
);
178 static void transform_DP3(struct radeon_compiler
* c
,
179 struct rc_instruction
* inst
)
181 struct rc_src_register src0
= inst
->I
.SrcReg
[0];
182 struct rc_src_register src1
= inst
->I
.SrcReg
[1];
183 src0
.Negate
&= ~RC_MASK_W
;
184 src0
.Swizzle
&= ~(7 << (3 * 3));
185 src0
.Swizzle
|= RC_SWIZZLE_ZERO
<< (3 * 3);
186 src1
.Negate
&= ~RC_MASK_W
;
187 src1
.Swizzle
&= ~(7 << (3 * 3));
188 src1
.Swizzle
|= RC_SWIZZLE_ZERO
<< (3 * 3);
189 emit2(c
, inst
->Prev
, RC_OPCODE_DP4
, inst
->I
.SaturateMode
, inst
->I
.DstReg
, src0
, src1
);
190 rc_remove_instruction(inst
);
193 static void transform_DPH(struct radeon_compiler
* c
,
194 struct rc_instruction
* inst
)
196 struct rc_src_register src0
= inst
->I
.SrcReg
[0];
197 src0
.Negate
&= ~RC_MASK_W
;
198 src0
.Swizzle
&= ~(7 << (3 * 3));
199 src0
.Swizzle
|= RC_SWIZZLE_ONE
<< (3 * 3);
200 emit2(c
, inst
->Prev
, RC_OPCODE_DP4
, inst
->I
.SaturateMode
, inst
->I
.DstReg
, src0
, inst
->I
.SrcReg
[1]);
201 rc_remove_instruction(inst
);
205 * [1, src0.y*src1.y, src0.z, src1.w]
206 * So basically MUL with lotsa swizzling.
208 static void transform_DST(struct radeon_compiler
* c
,
209 struct rc_instruction
* inst
)
211 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, inst
->I
.SaturateMode
, inst
->I
.DstReg
,
212 swizzle(inst
->I
.SrcReg
[0], RC_SWIZZLE_ONE
, RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_ONE
),
213 swizzle(inst
->I
.SrcReg
[1], RC_SWIZZLE_ONE
, RC_SWIZZLE_Y
, RC_SWIZZLE_ONE
, RC_SWIZZLE_W
));
214 rc_remove_instruction(inst
);
217 static void transform_FLR(struct radeon_compiler
* c
,
218 struct rc_instruction
* inst
)
220 int tempreg
= rc_find_free_temporary(c
);
221 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->I
.SrcReg
[0]);
222 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, inst
->I
.SaturateMode
, inst
->I
.DstReg
,
223 inst
->I
.SrcReg
[0], negate(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
224 rc_remove_instruction(inst
);
228 * Definition of LIT (from ARB_fragment_program):
230 * tmp = VectorLoad(op0);
231 * if (tmp.x < 0) tmp.x = 0;
232 * if (tmp.y < 0) tmp.y = 0;
233 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
234 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
237 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
240 * The longest path of computation is the one leading to result.z,
241 * consisting of 5 operations. This implementation of LIT takes
242 * 5 slots, if the subsequent optimization passes are clever enough
243 * to pair instructions correctly.
245 static void transform_LIT(struct radeon_compiler
* c
,
246 struct rc_instruction
* inst
)
248 unsigned int constant
;
249 unsigned int constant_swizzle
;
251 struct rc_src_register srctemp
;
253 constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
, -127.999999, &constant_swizzle
);
255 if (inst
->I
.DstReg
.WriteMask
!= RC_MASK_XYZW
|| inst
->I
.DstReg
.File
!= RC_FILE_TEMPORARY
) {
256 struct rc_instruction
* inst_mov
;
258 inst_mov
= emit1(c
, inst
,
259 RC_OPCODE_MOV
, 0, inst
->I
.DstReg
,
260 srcreg(RC_FILE_TEMPORARY
, rc_find_free_temporary(c
)));
262 inst
->I
.DstReg
.File
= RC_FILE_TEMPORARY
;
263 inst
->I
.DstReg
.Index
= inst_mov
->I
.SrcReg
[0].Index
;
264 inst
->I
.DstReg
.WriteMask
= RC_MASK_XYZW
;
267 temp
= inst
->I
.DstReg
.Index
;
268 srctemp
= srcreg(RC_FILE_TEMPORARY
, temp
);
270 // tmp.x = max(0.0, Src.x);
271 // tmp.y = max(0.0, Src.y);
272 // tmp.w = clamp(Src.z, -128+eps, 128-eps);
273 emit2(c
, inst
->Prev
, RC_OPCODE_MAX
, 0,
274 dstregtmpmask(temp
, RC_MASK_XYW
),
276 swizzle(srcreg(RC_FILE_CONSTANT
, constant
),
277 RC_SWIZZLE_ZERO
, RC_SWIZZLE_ZERO
, RC_SWIZZLE_ZERO
, constant_swizzle
&3));
278 emit2(c
, inst
->Prev
, RC_OPCODE_MIN
, 0,
279 dstregtmpmask(temp
, RC_MASK_Z
),
280 swizzle(srctemp
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
281 negate(srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
)));
283 // tmp.w = Pow(tmp.y, tmp.w)
284 emit1(c
, inst
->Prev
, RC_OPCODE_LG2
, 0,
285 dstregtmpmask(temp
, RC_MASK_W
),
286 swizzle(srctemp
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
));
287 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0,
288 dstregtmpmask(temp
, RC_MASK_W
),
289 swizzle(srctemp
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
290 swizzle(srctemp
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
));
291 emit1(c
, inst
->Prev
, RC_OPCODE_EX2
, 0,
292 dstregtmpmask(temp
, RC_MASK_W
),
293 swizzle(srctemp
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
));
295 // tmp.z = (tmp.x > 0) ? tmp.w : 0.0
296 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->I
.SaturateMode
,
297 dstregtmpmask(temp
, RC_MASK_Z
),
298 negate(swizzle(srctemp
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
)),
299 swizzle(srctemp
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
302 // tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0
303 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, inst
->I
.SaturateMode
,
304 dstregtmpmask(temp
, RC_MASK_XYW
),
305 swizzle(srctemp
, RC_SWIZZLE_ONE
, RC_SWIZZLE_X
, RC_SWIZZLE_ONE
, RC_SWIZZLE_ONE
));
307 rc_remove_instruction(inst
);
310 static void transform_LRP(struct radeon_compiler
* c
,
311 struct rc_instruction
* inst
)
313 int tempreg
= rc_find_free_temporary(c
);
315 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0,
316 dstreg(RC_FILE_TEMPORARY
, tempreg
),
317 inst
->I
.SrcReg
[1], negate(inst
->I
.SrcReg
[2]));
318 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, inst
->I
.SaturateMode
,
320 inst
->I
.SrcReg
[0], srcreg(RC_FILE_TEMPORARY
, tempreg
), inst
->I
.SrcReg
[2]);
322 rc_remove_instruction(inst
);
325 static void transform_POW(struct radeon_compiler
* c
,
326 struct rc_instruction
* inst
)
328 int tempreg
= rc_find_free_temporary(c
);
329 struct rc_dst_register tempdst
= dstreg(RC_FILE_TEMPORARY
, tempreg
);
330 struct rc_src_register tempsrc
= srcreg(RC_FILE_TEMPORARY
, tempreg
);
331 tempdst
.WriteMask
= RC_MASK_W
;
332 tempsrc
.Swizzle
= RC_SWIZZLE_WWWW
;
334 emit1(c
, inst
->Prev
, RC_OPCODE_LG2
, 0, tempdst
, scalar(inst
->I
.SrcReg
[0]));
335 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, tempdst
, tempsrc
, scalar(inst
->I
.SrcReg
[1]));
336 emit1(c
, inst
->Prev
, RC_OPCODE_EX2
, inst
->I
.SaturateMode
, inst
->I
.DstReg
, tempsrc
);
338 rc_remove_instruction(inst
);
341 static void transform_RSQ(struct radeon_compiler
* c
,
342 struct rc_instruction
* inst
)
344 inst
->I
.SrcReg
[0] = absolute(inst
->I
.SrcReg
[0]);
347 static void transform_SGE(struct radeon_compiler
* c
,
348 struct rc_instruction
* inst
)
350 int tempreg
= rc_find_free_temporary(c
);
352 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->I
.SrcReg
[0], negate(inst
->I
.SrcReg
[1]));
353 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->I
.SaturateMode
, inst
->I
.DstReg
,
354 srcreg(RC_FILE_TEMPORARY
, tempreg
), builtin_zero
, builtin_one
);
356 rc_remove_instruction(inst
);
359 static void transform_SLT(struct radeon_compiler
* c
,
360 struct rc_instruction
* inst
)
362 int tempreg
= rc_find_free_temporary(c
);
364 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->I
.SrcReg
[0], negate(inst
->I
.SrcReg
[1]));
365 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->I
.SaturateMode
, inst
->I
.DstReg
,
366 srcreg(RC_FILE_TEMPORARY
, tempreg
), builtin_one
, builtin_zero
);
368 rc_remove_instruction(inst
);
371 static void transform_SUB(struct radeon_compiler
* c
,
372 struct rc_instruction
* inst
)
374 inst
->I
.Opcode
= RC_OPCODE_ADD
;
375 inst
->I
.SrcReg
[1] = negate(inst
->I
.SrcReg
[1]);
378 static void transform_SWZ(struct radeon_compiler
* c
,
379 struct rc_instruction
* inst
)
381 inst
->I
.Opcode
= RC_OPCODE_MOV
;
384 static void transform_XPD(struct radeon_compiler
* c
,
385 struct rc_instruction
* inst
)
387 int tempreg
= rc_find_free_temporary(c
);
389 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
),
390 swizzle(inst
->I
.SrcReg
[0], RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_W
),
391 swizzle(inst
->I
.SrcReg
[1], RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_W
));
392 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, inst
->I
.SaturateMode
, inst
->I
.DstReg
,
393 swizzle(inst
->I
.SrcReg
[0], RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_W
),
394 swizzle(inst
->I
.SrcReg
[1], RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_W
),
395 negate(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
397 rc_remove_instruction(inst
);
402 * Can be used as a transformation for @ref radeonClauseLocalTransform,
403 * no userData necessary.
405 * Eliminates the following ALU instructions:
406 * ABS, DPH, DST, FLR, LIT, LRP, POW, SGE, SLT, SUB, SWZ, XPD
408 * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
410 * Transforms RSQ to Radeon's native RSQ by explicitly setting
413 * @note should be applicable to R300 and R500 fragment programs.
415 int radeonTransformALU(
416 struct radeon_compiler
* c
,
417 struct rc_instruction
* inst
,
420 switch(inst
->I
.Opcode
) {
421 case RC_OPCODE_ABS
: transform_ABS(c
, inst
); return 1;
422 case RC_OPCODE_DPH
: transform_DPH(c
, inst
); return 1;
423 case RC_OPCODE_DST
: transform_DST(c
, inst
); return 1;
424 case RC_OPCODE_FLR
: transform_FLR(c
, inst
); return 1;
425 case RC_OPCODE_LIT
: transform_LIT(c
, inst
); return 1;
426 case RC_OPCODE_LRP
: transform_LRP(c
, inst
); return 1;
427 case RC_OPCODE_POW
: transform_POW(c
, inst
); return 1;
428 case RC_OPCODE_RSQ
: transform_RSQ(c
, inst
); return 1;
429 case RC_OPCODE_SGE
: transform_SGE(c
, inst
); return 1;
430 case RC_OPCODE_SLT
: transform_SLT(c
, inst
); return 1;
431 case RC_OPCODE_SUB
: transform_SUB(c
, inst
); return 1;
432 case RC_OPCODE_SWZ
: transform_SWZ(c
, inst
); return 1;
433 case RC_OPCODE_XPD
: transform_XPD(c
, inst
); return 1;
440 static void transform_r300_vertex_ABS(struct radeon_compiler
* c
,
441 struct rc_instruction
* inst
)
443 /* Note: r500 can take absolute values, but r300 cannot. */
444 inst
->I
.Opcode
= RC_OPCODE_MAX
;
445 inst
->I
.SrcReg
[1] = inst
->I
.SrcReg
[0];
446 inst
->I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
450 * For use with radeonLocalTransform, this transforms non-native ALU
451 * instructions of the r300 up to r500 vertex engine.
453 int r300_transform_vertex_alu(
454 struct radeon_compiler
* c
,
455 struct rc_instruction
* inst
,
458 switch(inst
->I
.Opcode
) {
459 case RC_OPCODE_ABS
: transform_r300_vertex_ABS(c
, inst
); return 1;
460 case RC_OPCODE_DP3
: transform_DP3(c
, inst
); return 1;
461 case RC_OPCODE_DPH
: transform_DPH(c
, inst
); return 1;
462 case RC_OPCODE_FLR
: transform_FLR(c
, inst
); return 1;
463 case RC_OPCODE_LRP
: transform_LRP(c
, inst
); return 1;
464 case RC_OPCODE_SUB
: transform_SUB(c
, inst
); return 1;
465 case RC_OPCODE_SWZ
: transform_SWZ(c
, inst
); return 1;
466 case RC_OPCODE_XPD
: transform_XPD(c
, inst
); return 1;
472 static void sincos_constants(struct radeon_compiler
* c
, unsigned int *constants
)
474 static const float SinCosConsts
[2][4] = {
477 -0.405284735, // -4/(PI*PI)
484 0.159154943, // 1/(2*PI)
490 for(i
= 0; i
< 2; ++i
)
491 constants
[i
] = rc_constants_add_immediate_vec4(&c
->Program
.Constants
, SinCosConsts
[i
]);
495 * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
497 * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
498 * MAD tmp.x, tmp.y, |src|, tmp.x
499 * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
500 * MAD dest, tmp.y, weight, tmp.x
502 static void sin_approx(
503 struct radeon_compiler
* c
, struct rc_instruction
* before
,
504 struct rc_dst_register dst
, struct rc_src_register src
, const unsigned int* constants
)
506 unsigned int tempreg
= rc_find_free_temporary(c
);
508 emit2(c
, before
, RC_OPCODE_MUL
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
509 swizzle(src
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
),
510 srcreg(RC_FILE_CONSTANT
, constants
[0]));
511 emit3(c
, before
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_X
),
512 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
),
513 absolute(swizzle(src
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
)),
514 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
));
515 emit3(c
, before
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_Y
),
516 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
),
517 absolute(swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
)),
518 negate(swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
)));
519 emit3(c
, before
, RC_OPCODE_MAD
, 0, dst
,
520 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
),
521 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[0]), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
522 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
));
526 * Translate the trigonometric functions COS, SIN, and SCS
527 * using only the basic instructions
528 * MOV, ADD, MUL, MAD, FRC
530 int radeonTransformTrigSimple(struct radeon_compiler
* c
,
531 struct rc_instruction
* inst
,
534 if (inst
->I
.Opcode
!= RC_OPCODE_COS
&&
535 inst
->I
.Opcode
!= RC_OPCODE_SIN
&&
536 inst
->I
.Opcode
!= RC_OPCODE_SCS
)
539 unsigned int constants
[2];
540 unsigned int tempreg
= rc_find_free_temporary(c
);
542 sincos_constants(c
, constants
);
544 if (inst
->I
.Opcode
== RC_OPCODE_COS
) {
545 // MAD tmp.x, src, 1/(2*PI), 0.75
547 // MAD tmp.z, tmp.x, 2*PI, -PI
548 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
549 swizzle(inst
->I
.SrcReg
[0], RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
),
550 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
),
551 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
));
552 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
553 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
));
554 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
555 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
556 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
557 negate(swizzle(srcreg(RC_FILE_CONSTANT
, constants
[0]), RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
)));
559 sin_approx(c
, inst
, inst
->I
.DstReg
,
560 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
562 } else if (inst
->I
.Opcode
== RC_OPCODE_SIN
) {
563 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
564 swizzle(inst
->I
.SrcReg
[0], RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
),
565 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
),
566 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
));
567 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
568 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
));
569 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
570 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
571 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
572 negate(swizzle(srcreg(RC_FILE_CONSTANT
, constants
[0]), RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
)));
574 sin_approx(c
, inst
, inst
->I
.DstReg
,
575 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
578 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
579 swizzle(inst
->I
.SrcReg
[0], RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
),
580 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
),
581 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_W
));
582 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
583 srcreg(RC_FILE_TEMPORARY
, tempreg
));
584 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
585 srcreg(RC_FILE_TEMPORARY
, tempreg
),
586 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
587 negate(swizzle(srcreg(RC_FILE_CONSTANT
, constants
[0]), RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
)));
589 struct rc_dst_register dst
= inst
->I
.DstReg
;
591 dst
.WriteMask
= inst
->I
.DstReg
.WriteMask
& RC_MASK_X
;
592 sin_approx(c
, inst
, dst
,
593 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
),
596 dst
.WriteMask
= inst
->I
.DstReg
.WriteMask
& RC_MASK_Y
;
597 sin_approx(c
, inst
, dst
,
598 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
),
602 rc_remove_instruction(inst
);
609 * Transform the trigonometric functions COS, SIN, and SCS
610 * to include pre-scaling by 1/(2*PI) and taking the fractional
611 * part, so that the input to COS and SIN is always in the range [0,1).
612 * SCS is replaced by one COS and one SIN instruction.
614 * @warning This transformation implicitly changes the semantics of SIN and COS!
616 int radeonTransformTrigScale(struct radeon_compiler
* c
,
617 struct rc_instruction
* inst
,
620 if (inst
->I
.Opcode
!= RC_OPCODE_COS
&&
621 inst
->I
.Opcode
!= RC_OPCODE_SIN
&&
622 inst
->I
.Opcode
!= RC_OPCODE_SCS
)
625 static const float RCP_2PI
= 0.15915494309189535;
627 unsigned int constant
;
628 unsigned int constant_swizzle
;
630 temp
= rc_find_free_temporary(c
);
631 constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
, RCP_2PI
, &constant_swizzle
);
633 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dstregtmpmask(temp
, RC_MASK_W
),
634 swizzle(inst
->I
.SrcReg
[0], RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
),
635 srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
));
636 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(temp
, RC_MASK_W
),
637 srcreg(RC_FILE_TEMPORARY
, temp
));
639 if (inst
->I
.Opcode
== RC_OPCODE_COS
) {
640 emit1(c
, inst
->Prev
, RC_OPCODE_COS
, inst
->I
.SaturateMode
, inst
->I
.DstReg
,
641 srcregswz(RC_FILE_TEMPORARY
, temp
, RC_SWIZZLE_WWWW
));
642 } else if (inst
->I
.Opcode
== RC_OPCODE_SIN
) {
643 emit1(c
, inst
->Prev
, RC_OPCODE_SIN
, inst
->I
.SaturateMode
,
644 inst
->I
.DstReg
, srcregswz(RC_FILE_TEMPORARY
, temp
, RC_SWIZZLE_WWWW
));
645 } else if (inst
->I
.Opcode
== RC_OPCODE_SCS
) {
646 struct rc_dst_register moddst
= inst
->I
.DstReg
;
648 if (inst
->I
.DstReg
.WriteMask
& RC_MASK_X
) {
649 moddst
.WriteMask
= RC_MASK_X
;
650 emit1(c
, inst
->Prev
, RC_OPCODE_COS
, inst
->I
.SaturateMode
, moddst
,
651 srcregswz(RC_FILE_TEMPORARY
, temp
, RC_SWIZZLE_WWWW
));
653 if (inst
->I
.DstReg
.WriteMask
& RC_MASK_Y
) {
654 moddst
.WriteMask
= RC_MASK_Y
;
655 emit1(c
, inst
->Prev
, RC_OPCODE_SIN
, inst
->I
.SaturateMode
, moddst
,
656 srcregswz(RC_FILE_TEMPORARY
, temp
, RC_SWIZZLE_WWWW
));
660 rc_remove_instruction(inst
);
666 * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
667 * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
668 * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
670 * @warning This explicitly changes the form of DDX and DDY!
673 int radeonTransformDeriv(struct radeon_compiler
* c
,
674 struct rc_instruction
* inst
,
677 if (inst
->I
.Opcode
!= RC_OPCODE_DDX
&& inst
->I
.Opcode
!= RC_OPCODE_DDY
)
680 inst
->I
.SrcReg
[1].Swizzle
= RC_MAKE_SWIZZLE(RC_SWIZZLE_ONE
, RC_SWIZZLE_ONE
, RC_SWIZZLE_ONE
, RC_SWIZZLE_ONE
);
681 inst
->I
.SrcReg
[1].Negate
= RC_MASK_XYZW
;