2 * Copyright (C) 2008 Nicolai Haehnle.
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 * Shareable transformations that transform "special" ALU instructions
32 * into ALU instructions that are supported by hardware.
36 #include "radeon_program_alu.h"
38 #include "radeon_compiler.h"
41 static struct rc_instruction
*emit1(
42 struct radeon_compiler
* c
, struct rc_instruction
* after
,
43 rc_opcode Opcode
, rc_saturate_mode Saturate
, struct rc_dst_register DstReg
,
44 struct rc_src_register SrcReg
)
46 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
48 fpi
->U
.I
.Opcode
= Opcode
;
49 fpi
->U
.I
.SaturateMode
= Saturate
;
50 fpi
->U
.I
.DstReg
= DstReg
;
51 fpi
->U
.I
.SrcReg
[0] = SrcReg
;
55 static struct rc_instruction
*emit2(
56 struct radeon_compiler
* c
, struct rc_instruction
* after
,
57 rc_opcode Opcode
, rc_saturate_mode Saturate
, struct rc_dst_register DstReg
,
58 struct rc_src_register SrcReg0
, struct rc_src_register SrcReg1
)
60 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
62 fpi
->U
.I
.Opcode
= Opcode
;
63 fpi
->U
.I
.SaturateMode
= Saturate
;
64 fpi
->U
.I
.DstReg
= DstReg
;
65 fpi
->U
.I
.SrcReg
[0] = SrcReg0
;
66 fpi
->U
.I
.SrcReg
[1] = SrcReg1
;
70 static struct rc_instruction
*emit3(
71 struct radeon_compiler
* c
, struct rc_instruction
* after
,
72 rc_opcode Opcode
, rc_saturate_mode Saturate
, struct rc_dst_register DstReg
,
73 struct rc_src_register SrcReg0
, struct rc_src_register SrcReg1
,
74 struct rc_src_register SrcReg2
)
76 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
78 fpi
->U
.I
.Opcode
= Opcode
;
79 fpi
->U
.I
.SaturateMode
= Saturate
;
80 fpi
->U
.I
.DstReg
= DstReg
;
81 fpi
->U
.I
.SrcReg
[0] = SrcReg0
;
82 fpi
->U
.I
.SrcReg
[1] = SrcReg1
;
83 fpi
->U
.I
.SrcReg
[2] = SrcReg2
;
87 static struct rc_dst_register
dstreg(int file
, int index
)
89 struct rc_dst_register dst
;
92 dst
.WriteMask
= RC_MASK_XYZW
;
97 static struct rc_dst_register
dstregtmpmask(int index
, int mask
)
99 struct rc_dst_register dst
= {0};
100 dst
.File
= RC_FILE_TEMPORARY
;
102 dst
.WriteMask
= mask
;
107 static const struct rc_src_register builtin_zero
= {
108 .File
= RC_FILE_NONE
,
110 .Swizzle
= RC_SWIZZLE_0000
112 static const struct rc_src_register builtin_one
= {
113 .File
= RC_FILE_NONE
,
115 .Swizzle
= RC_SWIZZLE_1111
117 static const struct rc_src_register srcreg_undefined
= {
118 .File
= RC_FILE_NONE
,
120 .Swizzle
= RC_SWIZZLE_XYZW
123 static struct rc_src_register
srcreg(int file
, int index
)
125 struct rc_src_register src
= srcreg_undefined
;
131 static struct rc_src_register
srcregswz(int file
, int index
, int swz
)
133 struct rc_src_register src
= srcreg_undefined
;
140 static struct rc_src_register
absolute(struct rc_src_register reg
)
142 struct rc_src_register newreg
= reg
;
144 newreg
.Negate
= RC_MASK_NONE
;
148 static struct rc_src_register
negate(struct rc_src_register reg
)
150 struct rc_src_register newreg
= reg
;
151 newreg
.Negate
= newreg
.Negate
^ RC_MASK_XYZW
;
155 static struct rc_src_register
swizzle(struct rc_src_register reg
,
156 rc_swizzle x
, rc_swizzle y
, rc_swizzle z
, rc_swizzle w
)
158 struct rc_src_register swizzled
= reg
;
159 swizzled
.Swizzle
= combine_swizzles4(reg
.Swizzle
, x
, y
, z
, w
);
163 static struct rc_src_register
swizzle_smear(struct rc_src_register reg
,
166 return swizzle(reg
, x
, x
, x
, x
);
169 static struct rc_src_register
swizzle_xxxx(struct rc_src_register reg
)
171 return swizzle_smear(reg
, RC_SWIZZLE_X
);
174 static struct rc_src_register
swizzle_yyyy(struct rc_src_register reg
)
176 return swizzle_smear(reg
, RC_SWIZZLE_Y
);
179 static struct rc_src_register
swizzle_zzzz(struct rc_src_register reg
)
181 return swizzle_smear(reg
, RC_SWIZZLE_Z
);
184 static struct rc_src_register
swizzle_wwww(struct rc_src_register reg
)
186 return swizzle_smear(reg
, RC_SWIZZLE_W
);
189 static void transform_ABS(struct radeon_compiler
* c
,
190 struct rc_instruction
* inst
)
192 struct rc_src_register src
= inst
->U
.I
.SrcReg
[0];
194 src
.Negate
= RC_MASK_NONE
;
195 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, src
);
196 rc_remove_instruction(inst
);
199 static void transform_CEIL(struct radeon_compiler
* c
,
200 struct rc_instruction
* inst
)
203 * ceil(x) = -floor(-x)
205 * After inlining floor:
206 * ceil(x) = -(-x-frac(-x))
208 * After simplification:
209 * ceil(x) = x+frac(-x)
212 int tempreg
= rc_find_free_temporary(c
);
213 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), negate(inst
->U
.I
.SrcReg
[0]));
214 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
215 inst
->U
.I
.SrcReg
[0], srcreg(RC_FILE_TEMPORARY
, tempreg
));
216 rc_remove_instruction(inst
);
219 static void transform_DP3(struct radeon_compiler
* c
,
220 struct rc_instruction
* inst
)
222 struct rc_src_register src0
= inst
->U
.I
.SrcReg
[0];
223 struct rc_src_register src1
= inst
->U
.I
.SrcReg
[1];
224 src0
.Negate
&= ~RC_MASK_W
;
225 src0
.Swizzle
&= ~(7 << (3 * 3));
226 src0
.Swizzle
|= RC_SWIZZLE_ZERO
<< (3 * 3);
227 src1
.Negate
&= ~RC_MASK_W
;
228 src1
.Swizzle
&= ~(7 << (3 * 3));
229 src1
.Swizzle
|= RC_SWIZZLE_ZERO
<< (3 * 3);
230 emit2(c
, inst
->Prev
, RC_OPCODE_DP4
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, src0
, src1
);
231 rc_remove_instruction(inst
);
234 static void transform_DPH(struct radeon_compiler
* c
,
235 struct rc_instruction
* inst
)
237 struct rc_src_register src0
= inst
->U
.I
.SrcReg
[0];
238 src0
.Negate
&= ~RC_MASK_W
;
239 src0
.Swizzle
&= ~(7 << (3 * 3));
240 src0
.Swizzle
|= RC_SWIZZLE_ONE
<< (3 * 3);
241 emit2(c
, inst
->Prev
, RC_OPCODE_DP4
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, src0
, inst
->U
.I
.SrcReg
[1]);
242 rc_remove_instruction(inst
);
246 * [1, src0.y*src1.y, src0.z, src1.w]
247 * So basically MUL with lotsa swizzling.
249 static void transform_DST(struct radeon_compiler
* c
,
250 struct rc_instruction
* inst
)
252 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
253 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_ONE
, RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_ONE
),
254 swizzle(inst
->U
.I
.SrcReg
[1], RC_SWIZZLE_ONE
, RC_SWIZZLE_Y
, RC_SWIZZLE_ONE
, RC_SWIZZLE_W
));
255 rc_remove_instruction(inst
);
258 static void transform_FLR(struct radeon_compiler
* c
,
259 struct rc_instruction
* inst
)
261 int tempreg
= rc_find_free_temporary(c
);
262 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[0]);
263 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
264 inst
->U
.I
.SrcReg
[0], negate(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
265 rc_remove_instruction(inst
);
269 * Definition of LIT (from ARB_fragment_program):
271 * tmp = VectorLoad(op0);
272 * if (tmp.x < 0) tmp.x = 0;
273 * if (tmp.y < 0) tmp.y = 0;
274 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
275 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
278 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
281 * The longest path of computation is the one leading to result.z,
282 * consisting of 5 operations. This implementation of LIT takes
283 * 5 slots, if the subsequent optimization passes are clever enough
284 * to pair instructions correctly.
286 static void transform_LIT(struct radeon_compiler
* c
,
287 struct rc_instruction
* inst
)
289 unsigned int constant
;
290 unsigned int constant_swizzle
;
292 struct rc_src_register srctemp
;
294 constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
, -127.999999, &constant_swizzle
);
296 if (inst
->U
.I
.DstReg
.WriteMask
!= RC_MASK_XYZW
|| inst
->U
.I
.DstReg
.File
!= RC_FILE_TEMPORARY
) {
297 struct rc_instruction
* inst_mov
;
299 inst_mov
= emit1(c
, inst
,
300 RC_OPCODE_MOV
, 0, inst
->U
.I
.DstReg
,
301 srcreg(RC_FILE_TEMPORARY
, rc_find_free_temporary(c
)));
303 inst
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
304 inst
->U
.I
.DstReg
.Index
= inst_mov
->U
.I
.SrcReg
[0].Index
;
305 inst
->U
.I
.DstReg
.WriteMask
= RC_MASK_XYZW
;
308 temp
= inst
->U
.I
.DstReg
.Index
;
309 srctemp
= srcreg(RC_FILE_TEMPORARY
, temp
);
311 /* tmp.x = max(0.0, Src.x); */
312 /* tmp.y = max(0.0, Src.y); */
313 /* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
314 emit2(c
, inst
->Prev
, RC_OPCODE_MAX
, 0,
315 dstregtmpmask(temp
, RC_MASK_XYW
),
317 swizzle(srcreg(RC_FILE_CONSTANT
, constant
),
318 RC_SWIZZLE_ZERO
, RC_SWIZZLE_ZERO
, RC_SWIZZLE_ZERO
, constant_swizzle
&3));
319 emit2(c
, inst
->Prev
, RC_OPCODE_MIN
, 0,
320 dstregtmpmask(temp
, RC_MASK_Z
),
321 swizzle_wwww(srctemp
),
322 negate(srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
)));
324 /* tmp.w = Pow(tmp.y, tmp.w) */
325 emit1(c
, inst
->Prev
, RC_OPCODE_LG2
, 0,
326 dstregtmpmask(temp
, RC_MASK_W
),
327 swizzle_yyyy(srctemp
));
328 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0,
329 dstregtmpmask(temp
, RC_MASK_W
),
330 swizzle_wwww(srctemp
),
331 swizzle_zzzz(srctemp
));
332 emit1(c
, inst
->Prev
, RC_OPCODE_EX2
, 0,
333 dstregtmpmask(temp
, RC_MASK_W
),
334 swizzle_wwww(srctemp
));
336 /* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
337 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
,
338 dstregtmpmask(temp
, RC_MASK_Z
),
339 negate(swizzle_xxxx(srctemp
)),
340 swizzle_wwww(srctemp
),
343 /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
344 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, inst
->U
.I
.SaturateMode
,
345 dstregtmpmask(temp
, RC_MASK_XYW
),
346 swizzle(srctemp
, RC_SWIZZLE_ONE
, RC_SWIZZLE_X
, RC_SWIZZLE_ONE
, RC_SWIZZLE_ONE
));
348 rc_remove_instruction(inst
);
351 static void transform_LRP(struct radeon_compiler
* c
,
352 struct rc_instruction
* inst
)
354 int tempreg
= rc_find_free_temporary(c
);
356 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0,
357 dstreg(RC_FILE_TEMPORARY
, tempreg
),
358 inst
->U
.I
.SrcReg
[1], negate(inst
->U
.I
.SrcReg
[2]));
359 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, inst
->U
.I
.SaturateMode
,
361 inst
->U
.I
.SrcReg
[0], srcreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[2]);
363 rc_remove_instruction(inst
);
366 static void transform_POW(struct radeon_compiler
* c
,
367 struct rc_instruction
* inst
)
369 int tempreg
= rc_find_free_temporary(c
);
370 struct rc_dst_register tempdst
= dstreg(RC_FILE_TEMPORARY
, tempreg
);
371 struct rc_src_register tempsrc
= srcreg(RC_FILE_TEMPORARY
, tempreg
);
372 tempdst
.WriteMask
= RC_MASK_W
;
373 tempsrc
.Swizzle
= RC_SWIZZLE_WWWW
;
375 emit1(c
, inst
->Prev
, RC_OPCODE_LG2
, 0, tempdst
, swizzle_xxxx(inst
->U
.I
.SrcReg
[0]));
376 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, tempdst
, tempsrc
, swizzle_xxxx(inst
->U
.I
.SrcReg
[1]));
377 emit1(c
, inst
->Prev
, RC_OPCODE_EX2
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, tempsrc
);
379 rc_remove_instruction(inst
);
382 static void transform_RSQ(struct radeon_compiler
* c
,
383 struct rc_instruction
* inst
)
385 inst
->U
.I
.SrcReg
[0] = absolute(inst
->U
.I
.SrcReg
[0]);
388 static void transform_SEQ(struct radeon_compiler
* c
,
389 struct rc_instruction
* inst
)
391 int tempreg
= rc_find_free_temporary(c
);
393 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
394 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
395 negate(absolute(srcreg(RC_FILE_TEMPORARY
, tempreg
))), builtin_zero
, builtin_one
);
397 rc_remove_instruction(inst
);
400 static void transform_SFL(struct radeon_compiler
* c
,
401 struct rc_instruction
* inst
)
403 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, builtin_zero
);
404 rc_remove_instruction(inst
);
407 static void transform_SGE(struct radeon_compiler
* c
,
408 struct rc_instruction
* inst
)
410 int tempreg
= rc_find_free_temporary(c
);
412 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
413 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
414 srcreg(RC_FILE_TEMPORARY
, tempreg
), builtin_zero
, builtin_one
);
416 rc_remove_instruction(inst
);
419 static void transform_SGT(struct radeon_compiler
* c
,
420 struct rc_instruction
* inst
)
422 int tempreg
= rc_find_free_temporary(c
);
424 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), negate(inst
->U
.I
.SrcReg
[0]), inst
->U
.I
.SrcReg
[1]);
425 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
426 srcreg(RC_FILE_TEMPORARY
, tempreg
), builtin_one
, builtin_zero
);
428 rc_remove_instruction(inst
);
431 static void transform_SLE(struct radeon_compiler
* c
,
432 struct rc_instruction
* inst
)
434 int tempreg
= rc_find_free_temporary(c
);
436 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), negate(inst
->U
.I
.SrcReg
[0]), inst
->U
.I
.SrcReg
[1]);
437 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
438 srcreg(RC_FILE_TEMPORARY
, tempreg
), builtin_zero
, builtin_one
);
440 rc_remove_instruction(inst
);
443 static void transform_SLT(struct radeon_compiler
* c
,
444 struct rc_instruction
* inst
)
446 int tempreg
= rc_find_free_temporary(c
);
448 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
449 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
450 srcreg(RC_FILE_TEMPORARY
, tempreg
), builtin_one
, builtin_zero
);
452 rc_remove_instruction(inst
);
455 static void transform_SNE(struct radeon_compiler
* c
,
456 struct rc_instruction
* inst
)
458 int tempreg
= rc_find_free_temporary(c
);
460 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
461 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
462 negate(absolute(srcreg(RC_FILE_TEMPORARY
, tempreg
))), builtin_one
, builtin_zero
);
464 rc_remove_instruction(inst
);
467 static void transform_SUB(struct radeon_compiler
* c
,
468 struct rc_instruction
* inst
)
470 inst
->U
.I
.Opcode
= RC_OPCODE_ADD
;
471 inst
->U
.I
.SrcReg
[1] = negate(inst
->U
.I
.SrcReg
[1]);
474 static void transform_SWZ(struct radeon_compiler
* c
,
475 struct rc_instruction
* inst
)
477 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
480 static void transform_XPD(struct radeon_compiler
* c
,
481 struct rc_instruction
* inst
)
483 int tempreg
= rc_find_free_temporary(c
);
485 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
),
486 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_W
),
487 swizzle(inst
->U
.I
.SrcReg
[1], RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_W
));
488 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
489 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_W
),
490 swizzle(inst
->U
.I
.SrcReg
[1], RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_W
),
491 negate(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
493 rc_remove_instruction(inst
);
498 * Can be used as a transformation for @ref radeonClauseLocalTransform,
499 * no userData necessary.
501 * Eliminates the following ALU instructions:
502 * ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
504 * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
506 * Transforms RSQ to Radeon's native RSQ by explicitly taking the absolute value of its source operand.
509 * @note should be applicable to R300 and R500 fragment programs.
511 int radeonTransformALU(
512 struct radeon_compiler
* c
,
513 struct rc_instruction
* inst
,
516 switch(inst
->U
.I
.Opcode
) {
517 case RC_OPCODE_ABS
: transform_ABS(c
, inst
); return 1;
518 case RC_OPCODE_CEIL
: transform_CEIL(c
, inst
); return 1;
519 case RC_OPCODE_DPH
: transform_DPH(c
, inst
); return 1;
520 case RC_OPCODE_DST
: transform_DST(c
, inst
); return 1;
521 case RC_OPCODE_FLR
: transform_FLR(c
, inst
); return 1;
522 case RC_OPCODE_LIT
: transform_LIT(c
, inst
); return 1;
523 case RC_OPCODE_LRP
: transform_LRP(c
, inst
); return 1;
524 case RC_OPCODE_POW
: transform_POW(c
, inst
); return 1;
525 case RC_OPCODE_RSQ
: transform_RSQ(c
, inst
); return 1;
526 case RC_OPCODE_SEQ
: transform_SEQ(c
, inst
); return 1;
527 case RC_OPCODE_SFL
: transform_SFL(c
, inst
); return 1;
528 case RC_OPCODE_SGE
: transform_SGE(c
, inst
); return 1;
529 case RC_OPCODE_SGT
: transform_SGT(c
, inst
); return 1;
530 case RC_OPCODE_SLE
: transform_SLE(c
, inst
); return 1;
531 case RC_OPCODE_SLT
: transform_SLT(c
, inst
); return 1;
532 case RC_OPCODE_SNE
: transform_SNE(c
, inst
); return 1;
533 case RC_OPCODE_SUB
: transform_SUB(c
, inst
); return 1;
534 case RC_OPCODE_SWZ
: transform_SWZ(c
, inst
); return 1;
535 case RC_OPCODE_XPD
: transform_XPD(c
, inst
); return 1;
542 static void transform_r300_vertex_ABS(struct radeon_compiler
* c
,
543 struct rc_instruction
* inst
)
545 /* Note: r500 can take absolute values, but r300 cannot. */
546 inst
->U
.I
.Opcode
= RC_OPCODE_MAX
;
547 inst
->U
.I
.SrcReg
[1] = inst
->U
.I
.SrcReg
[0];
548 inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
551 static void transform_r300_vertex_CMP(struct radeon_compiler
* c
,
552 struct rc_instruction
* inst
)
554 /* There is no decent CMP available, so let's rig one up.
555 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
556 * The following sequence consumes two temps and two extra slots
557 * (the second temp and the second slot is consumed by transform_LRP),
558 * but should be equivalent:
560 * SLT tmp0, src0, 0.0
561 * LRP dst, tmp0, src1, src2
563 * Yes, I know, I'm a mad scientist. ~ C. & M. */
564 int tempreg0
= rc_find_free_temporary(c
);
566 /* SLT tmp0, src0, 0.0 */
567 emit2(c
, inst
->Prev
, RC_OPCODE_SLT
, 0,
568 dstreg(RC_FILE_TEMPORARY
, tempreg0
),
569 inst
->U
.I
.SrcReg
[0], builtin_zero
);
571 /* LRP dst, tmp0, src1, src2 */
573 emit3(c
, inst
->Prev
, RC_OPCODE_LRP
, 0,
575 srcreg(RC_FILE_TEMPORARY
, tempreg0
), inst
->U
.I
.SrcReg
[1], inst
->U
.I
.SrcReg
[2]));
577 rc_remove_instruction(inst
);
580 static void transform_r300_vertex_fix_LIT(struct radeon_compiler
* c
,
581 struct rc_instruction
* inst
)
583 int tempreg
= rc_find_free_temporary(c
);
584 unsigned constant_swizzle
;
585 int constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
,
586 0.0000000000000000001,
590 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, 0,
591 dstreg(RC_FILE_TEMPORARY
, tempreg
),
592 inst
->U
.I
.SrcReg
[0]);
594 /* MAX dst.z, src, 0.00...001 */
595 emit2(c
, inst
->Prev
, RC_OPCODE_MAX
, 0,
596 dstregtmpmask(tempreg
, RC_MASK_Y
),
597 srcreg(RC_FILE_TEMPORARY
, tempreg
),
598 srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
));
600 inst
->U
.I
.SrcReg
[0] = srcreg(RC_FILE_TEMPORARY
, tempreg
);
603 static void transform_r300_vertex_SGT(struct radeon_compiler
* c
,
604 struct rc_instruction
* inst
)
606 /* x > y <==> -x < -y */
607 inst
->U
.I
.Opcode
= RC_OPCODE_SLT
;
608 inst
->U
.I
.SrcReg
[0].Negate
^= RC_MASK_XYZW
;
609 inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
612 static void transform_r300_vertex_SLE(struct radeon_compiler
* c
,
613 struct rc_instruction
* inst
)
615 /* x <= y <==> -x >= -y */
616 inst
->U
.I
.Opcode
= RC_OPCODE_SGE
;
617 inst
->U
.I
.SrcReg
[0].Negate
^= RC_MASK_XYZW
;
618 inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
622 * For use with radeonLocalTransform, this transforms non-native ALU
623 * instructions of the r300 up to r500 vertex engine.
625 int r300_transform_vertex_alu(
626 struct radeon_compiler
* c
,
627 struct rc_instruction
* inst
,
630 switch(inst
->U
.I
.Opcode
) {
631 case RC_OPCODE_ABS
: transform_r300_vertex_ABS(c
, inst
); return 1;
632 case RC_OPCODE_CEIL
: transform_CEIL(c
, inst
); return 1;
633 case RC_OPCODE_CMP
: transform_r300_vertex_CMP(c
, inst
); return 1;
634 case RC_OPCODE_DP3
: transform_DP3(c
, inst
); return 1;
635 case RC_OPCODE_DPH
: transform_DPH(c
, inst
); return 1;
636 case RC_OPCODE_FLR
: transform_FLR(c
, inst
); return 1;
637 case RC_OPCODE_LIT
: transform_r300_vertex_fix_LIT(c
, inst
); return 1;
638 case RC_OPCODE_LRP
: transform_LRP(c
, inst
); return 1;
639 case RC_OPCODE_SFL
: transform_SFL(c
, inst
); return 1;
640 case RC_OPCODE_SGT
: transform_r300_vertex_SGT(c
, inst
); return 1;
641 case RC_OPCODE_SLE
: transform_r300_vertex_SLE(c
, inst
); return 1;
642 case RC_OPCODE_SUB
: transform_SUB(c
, inst
); return 1;
643 case RC_OPCODE_SWZ
: transform_SWZ(c
, inst
); return 1;
644 case RC_OPCODE_XPD
: transform_XPD(c
, inst
); return 1;
650 static void sincos_constants(struct radeon_compiler
* c
, unsigned int *constants
)
652 static const float SinCosConsts
[2][4] = {
654 1.273239545, /* 4/PI */
655 -0.405284735, /* -4/(PI*PI) */
656 3.141592654, /* PI */
662 0.159154943, /* 1/(2*PI) */
663 6.283185307 /* 2*PI */
668 for(i
= 0; i
< 2; ++i
)
669 constants
[i
] = rc_constants_add_immediate_vec4(&c
->Program
.Constants
, SinCosConsts
[i
]);
673 * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
675 * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
676 * MAD tmp.x, tmp.y, |src|, tmp.x
677 * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
678 * MAD dest, tmp.y, weight, tmp.x
680 static void sin_approx(
681 struct radeon_compiler
* c
, struct rc_instruction
* inst
,
682 struct rc_dst_register dst
, struct rc_src_register src
, const unsigned int* constants
)
684 unsigned int tempreg
= rc_find_free_temporary(c
);
686 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
688 srcreg(RC_FILE_CONSTANT
, constants
[0]));
689 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_X
),
690 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
691 absolute(swizzle_xxxx(src
)),
692 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
693 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_Y
),
694 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
695 absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
))),
696 negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
))));
697 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dst
,
698 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
699 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[0])),
700 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
704 * Translate the trigonometric functions COS, SIN, and SCS
705 * using only the basic instructions
706 * MOV, ADD, MUL, MAD, FRC
708 int radeonTransformTrigSimple(struct radeon_compiler
* c
,
709 struct rc_instruction
* inst
,
712 if (inst
->U
.I
.Opcode
!= RC_OPCODE_COS
&&
713 inst
->U
.I
.Opcode
!= RC_OPCODE_SIN
&&
714 inst
->U
.I
.Opcode
!= RC_OPCODE_SCS
)
717 unsigned int constants
[2];
718 unsigned int tempreg
= rc_find_free_temporary(c
);
720 sincos_constants(c
, constants
);
722 if (inst
->U
.I
.Opcode
== RC_OPCODE_COS
) {
723 /* MAD tmp.x, src, 1/(2*PI), 0.75 */
724 /* FRC tmp.x, tmp.x */
725 /* MAD tmp.z, tmp.x, 2*PI, -PI */
726 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
727 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
728 swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[1])),
729 swizzle_xxxx(srcreg(RC_FILE_CONSTANT
, constants
[1])));
730 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
731 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
732 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
733 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
734 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[1])),
735 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[0]))));
737 sin_approx(c
, inst
, inst
->U
.I
.DstReg
,
738 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
740 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_SIN
) {
741 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
742 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
743 swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[1])),
744 swizzle_yyyy(srcreg(RC_FILE_CONSTANT
, constants
[1])));
745 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
746 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
747 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
748 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
749 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[1])),
750 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[0]))));
752 sin_approx(c
, inst
, inst
->U
.I
.DstReg
,
753 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
756 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
757 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
758 swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[1])),
759 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_W
));
760 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
761 srcreg(RC_FILE_TEMPORARY
, tempreg
));
762 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
763 srcreg(RC_FILE_TEMPORARY
, tempreg
),
764 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[1])),
765 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[0]))));
767 struct rc_dst_register dst
= inst
->U
.I
.DstReg
;
769 dst
.WriteMask
= inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_X
;
770 sin_approx(c
, inst
, dst
,
771 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
774 dst
.WriteMask
= inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_Y
;
775 sin_approx(c
, inst
, dst
,
776 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
780 rc_remove_instruction(inst
);
787 * Transform the trigonometric functions COS, SIN, and SCS
788 * to include pre-scaling by 1/(2*PI) and taking the fractional
789 * part, so that the input to COS and SIN is always in the range [0,1).
790 * SCS is replaced by one COS and one SIN instruction.
792 * @warning This transformation implicitly changes the semantics of SIN and COS!
794 int radeonTransformTrigScale(struct radeon_compiler
* c
,
795 struct rc_instruction
* inst
,
798 if (inst
->U
.I
.Opcode
!= RC_OPCODE_COS
&&
799 inst
->U
.I
.Opcode
!= RC_OPCODE_SIN
&&
800 inst
->U
.I
.Opcode
!= RC_OPCODE_SCS
)
803 static const float RCP_2PI
= 0.15915494309189535;
805 unsigned int constant
;
806 unsigned int constant_swizzle
;
808 temp
= rc_find_free_temporary(c
);
809 constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
, RCP_2PI
, &constant_swizzle
);
811 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dstregtmpmask(temp
, RC_MASK_W
),
812 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
813 srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
));
814 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(temp
, RC_MASK_W
),
815 srcreg(RC_FILE_TEMPORARY
, temp
));
817 if (inst
->U
.I
.Opcode
== RC_OPCODE_COS
) {
818 emit1(c
, inst
->Prev
, RC_OPCODE_COS
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
819 srcregswz(RC_FILE_TEMPORARY
, temp
, RC_SWIZZLE_WWWW
));
820 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_SIN
) {
821 emit1(c
, inst
->Prev
, RC_OPCODE_SIN
, inst
->U
.I
.SaturateMode
,
822 inst
->U
.I
.DstReg
, srcregswz(RC_FILE_TEMPORARY
, temp
, RC_SWIZZLE_WWWW
));
823 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_SCS
) {
824 struct rc_dst_register moddst
= inst
->U
.I
.DstReg
;
826 if (inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_X
) {
827 moddst
.WriteMask
= RC_MASK_X
;
828 emit1(c
, inst
->Prev
, RC_OPCODE_COS
, inst
->U
.I
.SaturateMode
, moddst
,
829 srcregswz(RC_FILE_TEMPORARY
, temp
, RC_SWIZZLE_WWWW
));
831 if (inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_Y
) {
832 moddst
.WriteMask
= RC_MASK_Y
;
833 emit1(c
, inst
->Prev
, RC_OPCODE_SIN
, inst
->U
.I
.SaturateMode
, moddst
,
834 srcregswz(RC_FILE_TEMPORARY
, temp
, RC_SWIZZLE_WWWW
));
838 rc_remove_instruction(inst
);
844 * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
845 * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
846 * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
848 * @warning This explicitly changes the form of DDX and DDY!
851 int radeonTransformDeriv(struct radeon_compiler
* c
,
852 struct rc_instruction
* inst
,
855 if (inst
->U
.I
.Opcode
!= RC_OPCODE_DDX
&& inst
->U
.I
.Opcode
!= RC_OPCODE_DDY
)
858 inst
->U
.I
.SrcReg
[1].Swizzle
= RC_SWIZZLE_1111
;
859 inst
->U
.I
.SrcReg
[1].Negate
= RC_MASK_XYZW
;