2 * Copyright (C) 2008 Nicolai Haehnle.
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 * Shareable transformations that transform "special" ALU instructions
32 * into ALU instructions that are supported by hardware.
36 #include "radeon_program_alu.h"
38 #include "radeon_compiler.h"
41 static struct rc_instruction
*emit1(
42 struct radeon_compiler
* c
, struct rc_instruction
* after
,
43 rc_opcode Opcode
, rc_saturate_mode Saturate
, struct rc_dst_register DstReg
,
44 struct rc_src_register SrcReg
)
46 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
48 fpi
->U
.I
.Opcode
= Opcode
;
49 fpi
->U
.I
.SaturateMode
= Saturate
;
50 fpi
->U
.I
.DstReg
= DstReg
;
51 fpi
->U
.I
.SrcReg
[0] = SrcReg
;
55 static struct rc_instruction
*emit2(
56 struct radeon_compiler
* c
, struct rc_instruction
* after
,
57 rc_opcode Opcode
, rc_saturate_mode Saturate
, struct rc_dst_register DstReg
,
58 struct rc_src_register SrcReg0
, struct rc_src_register SrcReg1
)
60 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
62 fpi
->U
.I
.Opcode
= Opcode
;
63 fpi
->U
.I
.SaturateMode
= Saturate
;
64 fpi
->U
.I
.DstReg
= DstReg
;
65 fpi
->U
.I
.SrcReg
[0] = SrcReg0
;
66 fpi
->U
.I
.SrcReg
[1] = SrcReg1
;
70 static struct rc_instruction
*emit3(
71 struct radeon_compiler
* c
, struct rc_instruction
* after
,
72 rc_opcode Opcode
, rc_saturate_mode Saturate
, struct rc_dst_register DstReg
,
73 struct rc_src_register SrcReg0
, struct rc_src_register SrcReg1
,
74 struct rc_src_register SrcReg2
)
76 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
78 fpi
->U
.I
.Opcode
= Opcode
;
79 fpi
->U
.I
.SaturateMode
= Saturate
;
80 fpi
->U
.I
.DstReg
= DstReg
;
81 fpi
->U
.I
.SrcReg
[0] = SrcReg0
;
82 fpi
->U
.I
.SrcReg
[1] = SrcReg1
;
83 fpi
->U
.I
.SrcReg
[2] = SrcReg2
;
87 static struct rc_dst_register
dstreg(int file
, int index
)
89 struct rc_dst_register dst
;
92 dst
.WriteMask
= RC_MASK_XYZW
;
97 static struct rc_dst_register
dstregtmpmask(int index
, int mask
)
99 struct rc_dst_register dst
= {0};
100 dst
.File
= RC_FILE_TEMPORARY
;
102 dst
.WriteMask
= mask
;
107 static const struct rc_src_register builtin_zero
= {
108 .File
= RC_FILE_NONE
,
110 .Swizzle
= RC_SWIZZLE_0000
112 static const struct rc_src_register builtin_one
= {
113 .File
= RC_FILE_NONE
,
115 .Swizzle
= RC_SWIZZLE_1111
117 static const struct rc_src_register srcreg_undefined
= {
118 .File
= RC_FILE_NONE
,
120 .Swizzle
= RC_SWIZZLE_XYZW
123 static struct rc_src_register
srcreg(int file
, int index
)
125 struct rc_src_register src
= srcreg_undefined
;
131 static struct rc_src_register
srcregswz(int file
, int index
, int swz
)
133 struct rc_src_register src
= srcreg_undefined
;
140 static struct rc_src_register
absolute(struct rc_src_register reg
)
142 struct rc_src_register newreg
= reg
;
144 newreg
.Negate
= RC_MASK_NONE
;
148 static struct rc_src_register
negate(struct rc_src_register reg
)
150 struct rc_src_register newreg
= reg
;
151 newreg
.Negate
= newreg
.Negate
^ RC_MASK_XYZW
;
155 static struct rc_src_register
swizzle(struct rc_src_register reg
,
156 rc_swizzle x
, rc_swizzle y
, rc_swizzle z
, rc_swizzle w
)
158 struct rc_src_register swizzled
= reg
;
159 swizzled
.Swizzle
= combine_swizzles4(reg
.Swizzle
, x
, y
, z
, w
);
163 static struct rc_src_register
scalar(struct rc_src_register reg
)
165 return swizzle(reg
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
);
168 static void transform_ABS(struct radeon_compiler
* c
,
169 struct rc_instruction
* inst
)
171 struct rc_src_register src
= inst
->U
.I
.SrcReg
[0];
173 src
.Negate
= RC_MASK_NONE
;
174 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, src
);
175 rc_remove_instruction(inst
);
178 static void transform_DP3(struct radeon_compiler
* c
,
179 struct rc_instruction
* inst
)
181 struct rc_src_register src0
= inst
->U
.I
.SrcReg
[0];
182 struct rc_src_register src1
= inst
->U
.I
.SrcReg
[1];
183 src0
.Negate
&= ~RC_MASK_W
;
184 src0
.Swizzle
&= ~(7 << (3 * 3));
185 src0
.Swizzle
|= RC_SWIZZLE_ZERO
<< (3 * 3);
186 src1
.Negate
&= ~RC_MASK_W
;
187 src1
.Swizzle
&= ~(7 << (3 * 3));
188 src1
.Swizzle
|= RC_SWIZZLE_ZERO
<< (3 * 3);
189 emit2(c
, inst
->Prev
, RC_OPCODE_DP4
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, src0
, src1
);
190 rc_remove_instruction(inst
);
193 static void transform_DPH(struct radeon_compiler
* c
,
194 struct rc_instruction
* inst
)
196 struct rc_src_register src0
= inst
->U
.I
.SrcReg
[0];
197 src0
.Negate
&= ~RC_MASK_W
;
198 src0
.Swizzle
&= ~(7 << (3 * 3));
199 src0
.Swizzle
|= RC_SWIZZLE_ONE
<< (3 * 3);
200 emit2(c
, inst
->Prev
, RC_OPCODE_DP4
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, src0
, inst
->U
.I
.SrcReg
[1]);
201 rc_remove_instruction(inst
);
205 * [1, src0.y*src1.y, src0.z, src1.w]
206 * So basically MUL with lotsa swizzling.
208 static void transform_DST(struct radeon_compiler
* c
,
209 struct rc_instruction
* inst
)
211 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
212 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_ONE
, RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_ONE
),
213 swizzle(inst
->U
.I
.SrcReg
[1], RC_SWIZZLE_ONE
, RC_SWIZZLE_Y
, RC_SWIZZLE_ONE
, RC_SWIZZLE_W
));
214 rc_remove_instruction(inst
);
217 static void transform_FLR(struct radeon_compiler
* c
,
218 struct rc_instruction
* inst
)
220 int tempreg
= rc_find_free_temporary(c
);
221 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[0]);
222 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
223 inst
->U
.I
.SrcReg
[0], negate(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
224 rc_remove_instruction(inst
);
228 * Definition of LIT (from ARB_fragment_program):
230 * tmp = VectorLoad(op0);
231 * if (tmp.x < 0) tmp.x = 0;
232 * if (tmp.y < 0) tmp.y = 0;
233 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
234 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
237 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
240 * The longest path of computation is the one leading to result.z,
241 * consisting of 5 operations. This implementation of LIT takes
242 * 5 slots, if the subsequent optimization passes are clever enough
243 * to pair instructions correctly.
245 static void transform_LIT(struct radeon_compiler
* c
,
246 struct rc_instruction
* inst
)
248 unsigned int constant
;
249 unsigned int constant_swizzle
;
251 struct rc_src_register srctemp
;
253 constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
, -127.999999, &constant_swizzle
);
255 if (inst
->U
.I
.DstReg
.WriteMask
!= RC_MASK_XYZW
|| inst
->U
.I
.DstReg
.File
!= RC_FILE_TEMPORARY
) {
256 struct rc_instruction
* inst_mov
;
258 inst_mov
= emit1(c
, inst
,
259 RC_OPCODE_MOV
, 0, inst
->U
.I
.DstReg
,
260 srcreg(RC_FILE_TEMPORARY
, rc_find_free_temporary(c
)));
262 inst
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
263 inst
->U
.I
.DstReg
.Index
= inst_mov
->U
.I
.SrcReg
[0].Index
;
264 inst
->U
.I
.DstReg
.WriteMask
= RC_MASK_XYZW
;
267 temp
= inst
->U
.I
.DstReg
.Index
;
268 srctemp
= srcreg(RC_FILE_TEMPORARY
, temp
);
270 // tmp.x = max(0.0, Src.x);
271 // tmp.y = max(0.0, Src.y);
272 // tmp.w = clamp(Src.z, -128+eps, 128-eps);
273 emit2(c
, inst
->Prev
, RC_OPCODE_MAX
, 0,
274 dstregtmpmask(temp
, RC_MASK_XYW
),
276 swizzle(srcreg(RC_FILE_CONSTANT
, constant
),
277 RC_SWIZZLE_ZERO
, RC_SWIZZLE_ZERO
, RC_SWIZZLE_ZERO
, constant_swizzle
&3));
278 emit2(c
, inst
->Prev
, RC_OPCODE_MIN
, 0,
279 dstregtmpmask(temp
, RC_MASK_Z
),
280 swizzle(srctemp
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
281 negate(srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
)));
283 // tmp.w = Pow(tmp.y, tmp.w)
284 emit1(c
, inst
->Prev
, RC_OPCODE_LG2
, 0,
285 dstregtmpmask(temp
, RC_MASK_W
),
286 swizzle(srctemp
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
));
287 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0,
288 dstregtmpmask(temp
, RC_MASK_W
),
289 swizzle(srctemp
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
290 swizzle(srctemp
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
));
291 emit1(c
, inst
->Prev
, RC_OPCODE_EX2
, 0,
292 dstregtmpmask(temp
, RC_MASK_W
),
293 swizzle(srctemp
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
));
295 // tmp.z = (tmp.x > 0) ? tmp.w : 0.0
296 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
,
297 dstregtmpmask(temp
, RC_MASK_Z
),
298 negate(swizzle(srctemp
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
)),
299 swizzle(srctemp
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
302 // tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0
303 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, inst
->U
.I
.SaturateMode
,
304 dstregtmpmask(temp
, RC_MASK_XYW
),
305 swizzle(srctemp
, RC_SWIZZLE_ONE
, RC_SWIZZLE_X
, RC_SWIZZLE_ONE
, RC_SWIZZLE_ONE
));
307 rc_remove_instruction(inst
);
310 static void transform_LRP(struct radeon_compiler
* c
,
311 struct rc_instruction
* inst
)
313 int tempreg
= rc_find_free_temporary(c
);
315 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0,
316 dstreg(RC_FILE_TEMPORARY
, tempreg
),
317 inst
->U
.I
.SrcReg
[1], negate(inst
->U
.I
.SrcReg
[2]));
318 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, inst
->U
.I
.SaturateMode
,
320 inst
->U
.I
.SrcReg
[0], srcreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[2]);
322 rc_remove_instruction(inst
);
325 static void transform_POW(struct radeon_compiler
* c
,
326 struct rc_instruction
* inst
)
328 int tempreg
= rc_find_free_temporary(c
);
329 struct rc_dst_register tempdst
= dstreg(RC_FILE_TEMPORARY
, tempreg
);
330 struct rc_src_register tempsrc
= srcreg(RC_FILE_TEMPORARY
, tempreg
);
331 tempdst
.WriteMask
= RC_MASK_W
;
332 tempsrc
.Swizzle
= RC_SWIZZLE_WWWW
;
334 emit1(c
, inst
->Prev
, RC_OPCODE_LG2
, 0, tempdst
, scalar(inst
->U
.I
.SrcReg
[0]));
335 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, tempdst
, tempsrc
, scalar(inst
->U
.I
.SrcReg
[1]));
336 emit1(c
, inst
->Prev
, RC_OPCODE_EX2
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, tempsrc
);
338 rc_remove_instruction(inst
);
341 static void transform_RSQ(struct radeon_compiler
* c
,
342 struct rc_instruction
* inst
)
344 inst
->U
.I
.SrcReg
[0] = absolute(inst
->U
.I
.SrcReg
[0]);
347 static void transform_SEQ(struct radeon_compiler
* c
,
348 struct rc_instruction
* inst
)
350 int tempreg
= rc_find_free_temporary(c
);
352 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
353 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
354 negate(absolute(srcreg(RC_FILE_TEMPORARY
, tempreg
))), builtin_zero
, builtin_one
);
356 rc_remove_instruction(inst
);
359 static void transform_SFL(struct radeon_compiler
* c
,
360 struct rc_instruction
* inst
)
362 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, builtin_zero
);
363 rc_remove_instruction(inst
);
366 static void transform_SGE(struct radeon_compiler
* c
,
367 struct rc_instruction
* inst
)
369 int tempreg
= rc_find_free_temporary(c
);
371 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
372 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
373 srcreg(RC_FILE_TEMPORARY
, tempreg
), builtin_zero
, builtin_one
);
375 rc_remove_instruction(inst
);
378 static void transform_SGT(struct radeon_compiler
* c
,
379 struct rc_instruction
* inst
)
381 int tempreg
= rc_find_free_temporary(c
);
383 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), negate(inst
->U
.I
.SrcReg
[0]), inst
->U
.I
.SrcReg
[1]);
384 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
385 srcreg(RC_FILE_TEMPORARY
, tempreg
), builtin_one
, builtin_zero
);
387 rc_remove_instruction(inst
);
390 static void transform_SLE(struct radeon_compiler
* c
,
391 struct rc_instruction
* inst
)
393 int tempreg
= rc_find_free_temporary(c
);
395 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), negate(inst
->U
.I
.SrcReg
[0]), inst
->U
.I
.SrcReg
[1]);
396 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
397 srcreg(RC_FILE_TEMPORARY
, tempreg
), builtin_zero
, builtin_one
);
399 rc_remove_instruction(inst
);
402 static void transform_SLT(struct radeon_compiler
* c
,
403 struct rc_instruction
* inst
)
405 int tempreg
= rc_find_free_temporary(c
);
407 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
408 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
409 srcreg(RC_FILE_TEMPORARY
, tempreg
), builtin_one
, builtin_zero
);
411 rc_remove_instruction(inst
);
414 static void transform_SNE(struct radeon_compiler
* c
,
415 struct rc_instruction
* inst
)
417 int tempreg
= rc_find_free_temporary(c
);
419 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
420 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
421 negate(absolute(srcreg(RC_FILE_TEMPORARY
, tempreg
))), builtin_one
, builtin_zero
);
423 rc_remove_instruction(inst
);
426 static void transform_SUB(struct radeon_compiler
* c
,
427 struct rc_instruction
* inst
)
429 inst
->U
.I
.Opcode
= RC_OPCODE_ADD
;
430 inst
->U
.I
.SrcReg
[1] = negate(inst
->U
.I
.SrcReg
[1]);
433 static void transform_SWZ(struct radeon_compiler
* c
,
434 struct rc_instruction
* inst
)
436 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
439 static void transform_XPD(struct radeon_compiler
* c
,
440 struct rc_instruction
* inst
)
442 int tempreg
= rc_find_free_temporary(c
);
444 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
),
445 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_W
),
446 swizzle(inst
->U
.I
.SrcReg
[1], RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_W
));
447 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
448 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_W
),
449 swizzle(inst
->U
.I
.SrcReg
[1], RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_W
),
450 negate(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
452 rc_remove_instruction(inst
);
457 * Can be used as a transformation for @ref radeonClauseLocalTransform,
458 * no userData necessary.
460 * Eliminates the following ALU instructions:
461 * ABS, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
463 * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
465 * Transforms RSQ to Radeon's native RSQ by explicitly setting
468 * @note should be applicable to R300 and R500 fragment programs.
470 int radeonTransformALU(
471 struct radeon_compiler
* c
,
472 struct rc_instruction
* inst
,
475 switch(inst
->U
.I
.Opcode
) {
476 case RC_OPCODE_ABS
: transform_ABS(c
, inst
); return 1;
477 case RC_OPCODE_DPH
: transform_DPH(c
, inst
); return 1;
478 case RC_OPCODE_DST
: transform_DST(c
, inst
); return 1;
479 case RC_OPCODE_FLR
: transform_FLR(c
, inst
); return 1;
480 case RC_OPCODE_LIT
: transform_LIT(c
, inst
); return 1;
481 case RC_OPCODE_LRP
: transform_LRP(c
, inst
); return 1;
482 case RC_OPCODE_POW
: transform_POW(c
, inst
); return 1;
483 case RC_OPCODE_RSQ
: transform_RSQ(c
, inst
); return 1;
484 case RC_OPCODE_SEQ
: transform_SEQ(c
, inst
); return 1;
485 case RC_OPCODE_SFL
: transform_SFL(c
, inst
); return 1;
486 case RC_OPCODE_SGE
: transform_SGE(c
, inst
); return 1;
487 case RC_OPCODE_SGT
: transform_SGT(c
, inst
); return 1;
488 case RC_OPCODE_SLE
: transform_SLE(c
, inst
); return 1;
489 case RC_OPCODE_SLT
: transform_SLT(c
, inst
); return 1;
490 case RC_OPCODE_SNE
: transform_SNE(c
, inst
); return 1;
491 case RC_OPCODE_SUB
: transform_SUB(c
, inst
); return 1;
492 case RC_OPCODE_SWZ
: transform_SWZ(c
, inst
); return 1;
493 case RC_OPCODE_XPD
: transform_XPD(c
, inst
); return 1;
500 static void transform_r300_vertex_ABS(struct radeon_compiler
* c
,
501 struct rc_instruction
* inst
)
503 /* Note: r500 can take absolute values, but r300 cannot. */
504 inst
->U
.I
.Opcode
= RC_OPCODE_MAX
;
505 inst
->U
.I
.SrcReg
[1] = inst
->U
.I
.SrcReg
[0];
506 inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
510 * For use with radeonLocalTransform, this transforms non-native ALU
511 * instructions of the r300 up to r500 vertex engine.
513 int r300_transform_vertex_alu(
514 struct radeon_compiler
* c
,
515 struct rc_instruction
* inst
,
518 switch(inst
->U
.I
.Opcode
) {
519 case RC_OPCODE_ABS
: transform_r300_vertex_ABS(c
, inst
); return 1;
520 case RC_OPCODE_DP3
: transform_DP3(c
, inst
); return 1;
521 case RC_OPCODE_DPH
: transform_DPH(c
, inst
); return 1;
522 case RC_OPCODE_FLR
: transform_FLR(c
, inst
); return 1;
523 case RC_OPCODE_LRP
: transform_LRP(c
, inst
); return 1;
524 case RC_OPCODE_SUB
: transform_SUB(c
, inst
); return 1;
525 case RC_OPCODE_SWZ
: transform_SWZ(c
, inst
); return 1;
526 case RC_OPCODE_XPD
: transform_XPD(c
, inst
); return 1;
532 static void sincos_constants(struct radeon_compiler
* c
, unsigned int *constants
)
534 static const float SinCosConsts
[2][4] = {
537 -0.405284735, // -4/(PI*PI)
544 0.159154943, // 1/(2*PI)
550 for(i
= 0; i
< 2; ++i
)
551 constants
[i
] = rc_constants_add_immediate_vec4(&c
->Program
.Constants
, SinCosConsts
[i
]);
555 * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
557 * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
558 * MAD tmp.x, tmp.y, |src|, tmp.x
559 * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
560 * MAD dest, tmp.y, weight, tmp.x
562 static void sin_approx(
563 struct radeon_compiler
* c
, struct rc_instruction
* before
,
564 struct rc_dst_register dst
, struct rc_src_register src
, const unsigned int* constants
)
566 unsigned int tempreg
= rc_find_free_temporary(c
);
568 emit2(c
, before
, RC_OPCODE_MUL
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
569 swizzle(src
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
),
570 srcreg(RC_FILE_CONSTANT
, constants
[0]));
571 emit3(c
, before
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_X
),
572 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
),
573 absolute(swizzle(src
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
)),
574 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
));
575 emit3(c
, before
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_Y
),
576 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
),
577 absolute(swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
)),
578 negate(swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
)));
579 emit3(c
, before
, RC_OPCODE_MAD
, 0, dst
,
580 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
),
581 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[0]), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
582 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
));
586 * Translate the trigonometric functions COS, SIN, and SCS
587 * using only the basic instructions
588 * MOV, ADD, MUL, MAD, FRC
590 int radeonTransformTrigSimple(struct radeon_compiler
* c
,
591 struct rc_instruction
* inst
,
594 if (inst
->U
.I
.Opcode
!= RC_OPCODE_COS
&&
595 inst
->U
.I
.Opcode
!= RC_OPCODE_SIN
&&
596 inst
->U
.I
.Opcode
!= RC_OPCODE_SCS
)
599 unsigned int constants
[2];
600 unsigned int tempreg
= rc_find_free_temporary(c
);
602 sincos_constants(c
, constants
);
604 if (inst
->U
.I
.Opcode
== RC_OPCODE_COS
) {
605 // MAD tmp.x, src, 1/(2*PI), 0.75
607 // MAD tmp.z, tmp.x, 2*PI, -PI
608 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
609 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
),
610 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
),
611 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
));
612 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
613 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
));
614 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
615 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
616 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
617 negate(swizzle(srcreg(RC_FILE_CONSTANT
, constants
[0]), RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
)));
619 sin_approx(c
, inst
, inst
->U
.I
.DstReg
,
620 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
622 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_SIN
) {
623 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
624 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
),
625 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
),
626 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
));
627 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
628 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
));
629 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
630 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
631 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
632 negate(swizzle(srcreg(RC_FILE_CONSTANT
, constants
[0]), RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
)));
634 sin_approx(c
, inst
, inst
->U
.I
.DstReg
,
635 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
638 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
639 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
),
640 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
),
641 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_W
));
642 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
643 srcreg(RC_FILE_TEMPORARY
, tempreg
));
644 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
645 srcreg(RC_FILE_TEMPORARY
, tempreg
),
646 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
, RC_SWIZZLE_W
),
647 negate(swizzle(srcreg(RC_FILE_CONSTANT
, constants
[0]), RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
, RC_SWIZZLE_Z
)));
649 struct rc_dst_register dst
= inst
->U
.I
.DstReg
;
651 dst
.WriteMask
= inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_X
;
652 sin_approx(c
, inst
, dst
,
653 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
),
656 dst
.WriteMask
= inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_Y
;
657 sin_approx(c
, inst
, dst
,
658 swizzle(srcreg(RC_FILE_TEMPORARY
, tempreg
), RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
, RC_SWIZZLE_Y
),
662 rc_remove_instruction(inst
);
669 * Transform the trigonometric functions COS, SIN, and SCS
670 * to include pre-scaling by 1/(2*PI) and taking the fractional
671 * part, so that the input to COS and SIN is always in the range [0,1).
672 * SCS is replaced by one COS and one SIN instruction.
674 * @warning This transformation implicitly changes the semantics of SIN and COS!
676 int radeonTransformTrigScale(struct radeon_compiler
* c
,
677 struct rc_instruction
* inst
,
680 if (inst
->U
.I
.Opcode
!= RC_OPCODE_COS
&&
681 inst
->U
.I
.Opcode
!= RC_OPCODE_SIN
&&
682 inst
->U
.I
.Opcode
!= RC_OPCODE_SCS
)
685 static const float RCP_2PI
= 0.15915494309189535;
687 unsigned int constant
;
688 unsigned int constant_swizzle
;
690 temp
= rc_find_free_temporary(c
);
691 constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
, RCP_2PI
, &constant_swizzle
);
693 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dstregtmpmask(temp
, RC_MASK_W
),
694 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
, RC_SWIZZLE_X
),
695 srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
));
696 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(temp
, RC_MASK_W
),
697 srcreg(RC_FILE_TEMPORARY
, temp
));
699 if (inst
->U
.I
.Opcode
== RC_OPCODE_COS
) {
700 emit1(c
, inst
->Prev
, RC_OPCODE_COS
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
701 srcregswz(RC_FILE_TEMPORARY
, temp
, RC_SWIZZLE_WWWW
));
702 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_SIN
) {
703 emit1(c
, inst
->Prev
, RC_OPCODE_SIN
, inst
->U
.I
.SaturateMode
,
704 inst
->U
.I
.DstReg
, srcregswz(RC_FILE_TEMPORARY
, temp
, RC_SWIZZLE_WWWW
));
705 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_SCS
) {
706 struct rc_dst_register moddst
= inst
->U
.I
.DstReg
;
708 if (inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_X
) {
709 moddst
.WriteMask
= RC_MASK_X
;
710 emit1(c
, inst
->Prev
, RC_OPCODE_COS
, inst
->U
.I
.SaturateMode
, moddst
,
711 srcregswz(RC_FILE_TEMPORARY
, temp
, RC_SWIZZLE_WWWW
));
713 if (inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_Y
) {
714 moddst
.WriteMask
= RC_MASK_Y
;
715 emit1(c
, inst
->Prev
, RC_OPCODE_SIN
, inst
->U
.I
.SaturateMode
, moddst
,
716 srcregswz(RC_FILE_TEMPORARY
, temp
, RC_SWIZZLE_WWWW
));
720 rc_remove_instruction(inst
);
726 * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
727 * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
728 * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
730 * @warning This explicitly changes the form of DDX and DDY!
733 int radeonTransformDeriv(struct radeon_compiler
* c
,
734 struct rc_instruction
* inst
,
737 if (inst
->U
.I
.Opcode
!= RC_OPCODE_DDX
&& inst
->U
.I
.Opcode
!= RC_OPCODE_DDY
)
740 inst
->U
.I
.SrcReg
[1].Swizzle
= RC_MAKE_SWIZZLE(RC_SWIZZLE_ONE
, RC_SWIZZLE_ONE
, RC_SWIZZLE_ONE
, RC_SWIZZLE_ONE
);
741 inst
->U
.I
.SrcReg
[1].Negate
= RC_MASK_XYZW
;