/*
 * Copyright (C) 2008 Nicolai Haehnle.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial
 * portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * Shareable transformations that transform "special" ALU instructions
 * into ALU instructions that are supported by hardware.
 */
#include "radeon_program_alu.h"

#include "radeon_compiler.h"
41 static struct rc_instruction
*emit1(
42 struct radeon_compiler
* c
, struct rc_instruction
* after
,
43 rc_opcode Opcode
, rc_saturate_mode Saturate
, struct rc_dst_register DstReg
,
44 struct rc_src_register SrcReg
)
46 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
48 fpi
->U
.I
.Opcode
= Opcode
;
49 fpi
->U
.I
.SaturateMode
= Saturate
;
50 fpi
->U
.I
.DstReg
= DstReg
;
51 fpi
->U
.I
.SrcReg
[0] = SrcReg
;
55 static struct rc_instruction
*emit2(
56 struct radeon_compiler
* c
, struct rc_instruction
* after
,
57 rc_opcode Opcode
, rc_saturate_mode Saturate
, struct rc_dst_register DstReg
,
58 struct rc_src_register SrcReg0
, struct rc_src_register SrcReg1
)
60 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
62 fpi
->U
.I
.Opcode
= Opcode
;
63 fpi
->U
.I
.SaturateMode
= Saturate
;
64 fpi
->U
.I
.DstReg
= DstReg
;
65 fpi
->U
.I
.SrcReg
[0] = SrcReg0
;
66 fpi
->U
.I
.SrcReg
[1] = SrcReg1
;
70 static struct rc_instruction
*emit3(
71 struct radeon_compiler
* c
, struct rc_instruction
* after
,
72 rc_opcode Opcode
, rc_saturate_mode Saturate
, struct rc_dst_register DstReg
,
73 struct rc_src_register SrcReg0
, struct rc_src_register SrcReg1
,
74 struct rc_src_register SrcReg2
)
76 struct rc_instruction
*fpi
= rc_insert_new_instruction(c
, after
);
78 fpi
->U
.I
.Opcode
= Opcode
;
79 fpi
->U
.I
.SaturateMode
= Saturate
;
80 fpi
->U
.I
.DstReg
= DstReg
;
81 fpi
->U
.I
.SrcReg
[0] = SrcReg0
;
82 fpi
->U
.I
.SrcReg
[1] = SrcReg1
;
83 fpi
->U
.I
.SrcReg
[2] = SrcReg2
;
87 static struct rc_dst_register
dstreg(int file
, int index
)
89 struct rc_dst_register dst
;
92 dst
.WriteMask
= RC_MASK_XYZW
;
97 static struct rc_dst_register
dstregtmpmask(int index
, int mask
)
99 struct rc_dst_register dst
= {0};
100 dst
.File
= RC_FILE_TEMPORARY
;
102 dst
.WriteMask
= mask
;
107 static const struct rc_src_register builtin_zero
= {
108 .File
= RC_FILE_NONE
,
110 .Swizzle
= RC_SWIZZLE_0000
112 static const struct rc_src_register builtin_one
= {
113 .File
= RC_FILE_NONE
,
115 .Swizzle
= RC_SWIZZLE_1111
117 static const struct rc_src_register srcreg_undefined
= {
118 .File
= RC_FILE_NONE
,
120 .Swizzle
= RC_SWIZZLE_XYZW
123 static struct rc_src_register
srcreg(int file
, int index
)
125 struct rc_src_register src
= srcreg_undefined
;
131 static struct rc_src_register
srcregswz(int file
, int index
, int swz
)
133 struct rc_src_register src
= srcreg_undefined
;
140 static struct rc_src_register
absolute(struct rc_src_register reg
)
142 struct rc_src_register newreg
= reg
;
144 newreg
.Negate
= RC_MASK_NONE
;
148 static struct rc_src_register
negate(struct rc_src_register reg
)
150 struct rc_src_register newreg
= reg
;
151 newreg
.Negate
= newreg
.Negate
^ RC_MASK_XYZW
;
155 static struct rc_src_register
swizzle(struct rc_src_register reg
,
156 rc_swizzle x
, rc_swizzle y
, rc_swizzle z
, rc_swizzle w
)
158 struct rc_src_register swizzled
= reg
;
159 swizzled
.Swizzle
= combine_swizzles4(reg
.Swizzle
, x
, y
, z
, w
);
163 static struct rc_src_register
swizzle_smear(struct rc_src_register reg
,
166 return swizzle(reg
, x
, x
, x
, x
);
169 static struct rc_src_register
swizzle_xxxx(struct rc_src_register reg
)
171 return swizzle_smear(reg
, RC_SWIZZLE_X
);
174 static struct rc_src_register
swizzle_yyyy(struct rc_src_register reg
)
176 return swizzle_smear(reg
, RC_SWIZZLE_Y
);
179 static struct rc_src_register
swizzle_zzzz(struct rc_src_register reg
)
181 return swizzle_smear(reg
, RC_SWIZZLE_Z
);
184 static struct rc_src_register
swizzle_wwww(struct rc_src_register reg
)
186 return swizzle_smear(reg
, RC_SWIZZLE_W
);
189 static void transform_ABS(struct radeon_compiler
* c
,
190 struct rc_instruction
* inst
)
192 struct rc_src_register src
= inst
->U
.I
.SrcReg
[0];
194 src
.Negate
= RC_MASK_NONE
;
195 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, src
);
196 rc_remove_instruction(inst
);
199 static void transform_CEIL(struct radeon_compiler
* c
,
200 struct rc_instruction
* inst
)
203 * ceil(x) = -floor(-x)
205 * After inlining floor:
206 * ceil(x) = -(-x-frac(-x))
208 * After simplification:
209 * ceil(x) = x+frac(-x)
212 int tempreg
= rc_find_free_temporary(c
);
213 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), negate(inst
->U
.I
.SrcReg
[0]));
214 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
215 inst
->U
.I
.SrcReg
[0], srcreg(RC_FILE_TEMPORARY
, tempreg
));
216 rc_remove_instruction(inst
);
219 static void transform_DP3(struct radeon_compiler
* c
,
220 struct rc_instruction
* inst
)
222 struct rc_src_register src0
= inst
->U
.I
.SrcReg
[0];
223 struct rc_src_register src1
= inst
->U
.I
.SrcReg
[1];
224 src0
.Negate
&= ~RC_MASK_W
;
225 src0
.Swizzle
&= ~(7 << (3 * 3));
226 src0
.Swizzle
|= RC_SWIZZLE_ZERO
<< (3 * 3);
227 src1
.Negate
&= ~RC_MASK_W
;
228 src1
.Swizzle
&= ~(7 << (3 * 3));
229 src1
.Swizzle
|= RC_SWIZZLE_ZERO
<< (3 * 3);
230 emit2(c
, inst
->Prev
, RC_OPCODE_DP4
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, src0
, src1
);
231 rc_remove_instruction(inst
);
234 static void transform_DPH(struct radeon_compiler
* c
,
235 struct rc_instruction
* inst
)
237 struct rc_src_register src0
= inst
->U
.I
.SrcReg
[0];
238 src0
.Negate
&= ~RC_MASK_W
;
239 src0
.Swizzle
&= ~(7 << (3 * 3));
240 src0
.Swizzle
|= RC_SWIZZLE_ONE
<< (3 * 3);
241 emit2(c
, inst
->Prev
, RC_OPCODE_DP4
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, src0
, inst
->U
.I
.SrcReg
[1]);
242 rc_remove_instruction(inst
);
246 * [1, src0.y*src1.y, src0.z, src1.w]
247 * So basically MUL with lotsa swizzling.
249 static void transform_DST(struct radeon_compiler
* c
,
250 struct rc_instruction
* inst
)
252 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
253 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_ONE
, RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_ONE
),
254 swizzle(inst
->U
.I
.SrcReg
[1], RC_SWIZZLE_ONE
, RC_SWIZZLE_Y
, RC_SWIZZLE_ONE
, RC_SWIZZLE_W
));
255 rc_remove_instruction(inst
);
258 static void transform_FLR(struct radeon_compiler
* c
,
259 struct rc_instruction
* inst
)
261 int tempreg
= rc_find_free_temporary(c
);
262 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[0]);
263 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
264 inst
->U
.I
.SrcReg
[0], negate(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
265 rc_remove_instruction(inst
);
269 * Definition of LIT (from ARB_fragment_program):
271 * tmp = VectorLoad(op0);
272 * if (tmp.x < 0) tmp.x = 0;
273 * if (tmp.y < 0) tmp.y = 0;
274 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
275 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
278 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
281 * The longest path of computation is the one leading to result.z,
282 * consisting of 5 operations. This implementation of LIT takes
283 * 5 slots, if the subsequent optimization passes are clever enough
284 * to pair instructions correctly.
286 static void transform_LIT(struct radeon_compiler
* c
,
287 struct rc_instruction
* inst
)
289 unsigned int constant
;
290 unsigned int constant_swizzle
;
292 struct rc_src_register srctemp
;
294 constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
, -127.999999, &constant_swizzle
);
296 if (inst
->U
.I
.DstReg
.WriteMask
!= RC_MASK_XYZW
|| inst
->U
.I
.DstReg
.File
!= RC_FILE_TEMPORARY
) {
297 struct rc_instruction
* inst_mov
;
299 inst_mov
= emit1(c
, inst
,
300 RC_OPCODE_MOV
, 0, inst
->U
.I
.DstReg
,
301 srcreg(RC_FILE_TEMPORARY
, rc_find_free_temporary(c
)));
303 inst
->U
.I
.DstReg
.File
= RC_FILE_TEMPORARY
;
304 inst
->U
.I
.DstReg
.Index
= inst_mov
->U
.I
.SrcReg
[0].Index
;
305 inst
->U
.I
.DstReg
.WriteMask
= RC_MASK_XYZW
;
308 temp
= inst
->U
.I
.DstReg
.Index
;
309 srctemp
= srcreg(RC_FILE_TEMPORARY
, temp
);
311 /* tmp.x = max(0.0, Src.x); */
312 /* tmp.y = max(0.0, Src.y); */
313 /* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
314 emit2(c
, inst
->Prev
, RC_OPCODE_MAX
, 0,
315 dstregtmpmask(temp
, RC_MASK_XYW
),
317 swizzle(srcreg(RC_FILE_CONSTANT
, constant
),
318 RC_SWIZZLE_ZERO
, RC_SWIZZLE_ZERO
, RC_SWIZZLE_ZERO
, constant_swizzle
&3));
319 emit2(c
, inst
->Prev
, RC_OPCODE_MIN
, 0,
320 dstregtmpmask(temp
, RC_MASK_Z
),
321 swizzle_wwww(srctemp
),
322 negate(srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
)));
324 /* tmp.w = Pow(tmp.y, tmp.w) */
325 emit1(c
, inst
->Prev
, RC_OPCODE_LG2
, 0,
326 dstregtmpmask(temp
, RC_MASK_W
),
327 swizzle_yyyy(srctemp
));
328 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0,
329 dstregtmpmask(temp
, RC_MASK_W
),
330 swizzle_wwww(srctemp
),
331 swizzle_zzzz(srctemp
));
332 emit1(c
, inst
->Prev
, RC_OPCODE_EX2
, 0,
333 dstregtmpmask(temp
, RC_MASK_W
),
334 swizzle_wwww(srctemp
));
336 /* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
337 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
,
338 dstregtmpmask(temp
, RC_MASK_Z
),
339 negate(swizzle_xxxx(srctemp
)),
340 swizzle_wwww(srctemp
),
343 /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
344 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, inst
->U
.I
.SaturateMode
,
345 dstregtmpmask(temp
, RC_MASK_XYW
),
346 swizzle(srctemp
, RC_SWIZZLE_ONE
, RC_SWIZZLE_X
, RC_SWIZZLE_ONE
, RC_SWIZZLE_ONE
));
348 rc_remove_instruction(inst
);
351 static void transform_LRP(struct radeon_compiler
* c
,
352 struct rc_instruction
* inst
)
354 int tempreg
= rc_find_free_temporary(c
);
356 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0,
357 dstreg(RC_FILE_TEMPORARY
, tempreg
),
358 inst
->U
.I
.SrcReg
[1], negate(inst
->U
.I
.SrcReg
[2]));
359 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, inst
->U
.I
.SaturateMode
,
361 inst
->U
.I
.SrcReg
[0], srcreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[2]);
363 rc_remove_instruction(inst
);
366 static void transform_POW(struct radeon_compiler
* c
,
367 struct rc_instruction
* inst
)
369 int tempreg
= rc_find_free_temporary(c
);
370 struct rc_dst_register tempdst
= dstreg(RC_FILE_TEMPORARY
, tempreg
);
371 struct rc_src_register tempsrc
= srcreg(RC_FILE_TEMPORARY
, tempreg
);
372 tempdst
.WriteMask
= RC_MASK_W
;
373 tempsrc
.Swizzle
= RC_SWIZZLE_WWWW
;
375 emit1(c
, inst
->Prev
, RC_OPCODE_LG2
, 0, tempdst
, swizzle_xxxx(inst
->U
.I
.SrcReg
[0]));
376 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, tempdst
, tempsrc
, swizzle_xxxx(inst
->U
.I
.SrcReg
[1]));
377 emit1(c
, inst
->Prev
, RC_OPCODE_EX2
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, tempsrc
);
379 rc_remove_instruction(inst
);
382 static void transform_RSQ(struct radeon_compiler
* c
,
383 struct rc_instruction
* inst
)
385 inst
->U
.I
.SrcReg
[0] = absolute(inst
->U
.I
.SrcReg
[0]);
388 static void transform_SEQ(struct radeon_compiler
* c
,
389 struct rc_instruction
* inst
)
391 int tempreg
= rc_find_free_temporary(c
);
393 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
394 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
395 negate(absolute(srcreg(RC_FILE_TEMPORARY
, tempreg
))), builtin_zero
, builtin_one
);
397 rc_remove_instruction(inst
);
400 static void transform_SFL(struct radeon_compiler
* c
,
401 struct rc_instruction
* inst
)
403 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
, builtin_zero
);
404 rc_remove_instruction(inst
);
407 static void transform_SGE(struct radeon_compiler
* c
,
408 struct rc_instruction
* inst
)
410 int tempreg
= rc_find_free_temporary(c
);
412 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
413 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
414 srcreg(RC_FILE_TEMPORARY
, tempreg
), builtin_zero
, builtin_one
);
416 rc_remove_instruction(inst
);
419 static void transform_SGT(struct radeon_compiler
* c
,
420 struct rc_instruction
* inst
)
422 int tempreg
= rc_find_free_temporary(c
);
424 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), negate(inst
->U
.I
.SrcReg
[0]), inst
->U
.I
.SrcReg
[1]);
425 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
426 srcreg(RC_FILE_TEMPORARY
, tempreg
), builtin_one
, builtin_zero
);
428 rc_remove_instruction(inst
);
431 static void transform_SLE(struct radeon_compiler
* c
,
432 struct rc_instruction
* inst
)
434 int tempreg
= rc_find_free_temporary(c
);
436 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), negate(inst
->U
.I
.SrcReg
[0]), inst
->U
.I
.SrcReg
[1]);
437 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
438 srcreg(RC_FILE_TEMPORARY
, tempreg
), builtin_zero
, builtin_one
);
440 rc_remove_instruction(inst
);
443 static void transform_SLT(struct radeon_compiler
* c
,
444 struct rc_instruction
* inst
)
446 int tempreg
= rc_find_free_temporary(c
);
448 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
449 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
450 srcreg(RC_FILE_TEMPORARY
, tempreg
), builtin_one
, builtin_zero
);
452 rc_remove_instruction(inst
);
455 static void transform_SNE(struct radeon_compiler
* c
,
456 struct rc_instruction
* inst
)
458 int tempreg
= rc_find_free_temporary(c
);
460 emit2(c
, inst
->Prev
, RC_OPCODE_ADD
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
), inst
->U
.I
.SrcReg
[0], negate(inst
->U
.I
.SrcReg
[1]));
461 emit3(c
, inst
->Prev
, RC_OPCODE_CMP
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
462 negate(absolute(srcreg(RC_FILE_TEMPORARY
, tempreg
))), builtin_one
, builtin_zero
);
464 rc_remove_instruction(inst
);
467 static void transform_SUB(struct radeon_compiler
* c
,
468 struct rc_instruction
* inst
)
470 inst
->U
.I
.Opcode
= RC_OPCODE_ADD
;
471 inst
->U
.I
.SrcReg
[1] = negate(inst
->U
.I
.SrcReg
[1]);
474 static void transform_SWZ(struct radeon_compiler
* c
,
475 struct rc_instruction
* inst
)
477 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
480 static void transform_XPD(struct radeon_compiler
* c
,
481 struct rc_instruction
* inst
)
483 int tempreg
= rc_find_free_temporary(c
);
485 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dstreg(RC_FILE_TEMPORARY
, tempreg
),
486 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_W
),
487 swizzle(inst
->U
.I
.SrcReg
[1], RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_W
));
488 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
489 swizzle(inst
->U
.I
.SrcReg
[0], RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_W
),
490 swizzle(inst
->U
.I
.SrcReg
[1], RC_SWIZZLE_Z
, RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_W
),
491 negate(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
493 rc_remove_instruction(inst
);
498 * Can be used as a transformation for @ref radeonClauseLocalTransform,
499 * no userData necessary.
501 * Eliminates the following ALU instructions:
502 * ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
504 * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
506 * Transforms RSQ to Radeon's native RSQ by explicitly setting
509 * @note should be applicable to R300 and R500 fragment programs.
511 int radeonTransformALU(
512 struct radeon_compiler
* c
,
513 struct rc_instruction
* inst
,
516 switch(inst
->U
.I
.Opcode
) {
517 case RC_OPCODE_ABS
: transform_ABS(c
, inst
); return 1;
518 case RC_OPCODE_CEIL
: transform_CEIL(c
, inst
); return 1;
519 case RC_OPCODE_DPH
: transform_DPH(c
, inst
); return 1;
520 case RC_OPCODE_DST
: transform_DST(c
, inst
); return 1;
521 case RC_OPCODE_FLR
: transform_FLR(c
, inst
); return 1;
522 case RC_OPCODE_LIT
: transform_LIT(c
, inst
); return 1;
523 case RC_OPCODE_LRP
: transform_LRP(c
, inst
); return 1;
524 case RC_OPCODE_POW
: transform_POW(c
, inst
); return 1;
525 case RC_OPCODE_RSQ
: transform_RSQ(c
, inst
); return 1;
526 case RC_OPCODE_SEQ
: transform_SEQ(c
, inst
); return 1;
527 case RC_OPCODE_SFL
: transform_SFL(c
, inst
); return 1;
528 case RC_OPCODE_SGE
: transform_SGE(c
, inst
); return 1;
529 case RC_OPCODE_SGT
: transform_SGT(c
, inst
); return 1;
530 case RC_OPCODE_SLE
: transform_SLE(c
, inst
); return 1;
531 case RC_OPCODE_SLT
: transform_SLT(c
, inst
); return 1;
532 case RC_OPCODE_SNE
: transform_SNE(c
, inst
); return 1;
533 case RC_OPCODE_SUB
: transform_SUB(c
, inst
); return 1;
534 case RC_OPCODE_SWZ
: transform_SWZ(c
, inst
); return 1;
535 case RC_OPCODE_XPD
: transform_XPD(c
, inst
); return 1;
542 static void transform_r300_vertex_ABS(struct radeon_compiler
* c
,
543 struct rc_instruction
* inst
)
545 /* Note: r500 can take absolute values, but r300 cannot. */
546 inst
->U
.I
.Opcode
= RC_OPCODE_MAX
;
547 inst
->U
.I
.SrcReg
[1] = inst
->U
.I
.SrcReg
[0];
548 inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
551 static void transform_r300_vertex_CMP(struct radeon_compiler
* c
,
552 struct rc_instruction
* inst
)
554 /* There is no decent CMP available, so let's rig one up.
555 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
556 * The following sequence consumes two temps and two extra slots
557 * (the second temp and the second slot is consumed by transform_LRP),
558 * but should be equivalent:
560 * SLT tmp0, src0, 0.0
561 * LRP dst, tmp0, src1, src2
563 * Yes, I know, I'm a mad scientist. ~ C. & M. */
564 int tempreg0
= rc_find_free_temporary(c
);
566 /* SLT tmp0, src0, 0.0 */
567 emit2(c
, inst
->Prev
, RC_OPCODE_SLT
, 0,
568 dstreg(RC_FILE_TEMPORARY
, tempreg0
),
569 inst
->U
.I
.SrcReg
[0], builtin_zero
);
571 /* LRP dst, tmp0, src1, src2 */
573 emit3(c
, inst
->Prev
, RC_OPCODE_LRP
, 0,
575 srcreg(RC_FILE_TEMPORARY
, tempreg0
), inst
->U
.I
.SrcReg
[1], inst
->U
.I
.SrcReg
[2]));
577 rc_remove_instruction(inst
);
580 static void transform_r300_vertex_fix_LIT(struct radeon_compiler
* c
,
581 struct rc_instruction
* inst
)
583 int tempreg
= rc_find_free_temporary(c
);
584 unsigned constant_swizzle
;
585 int constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
,
586 0.0000000000000000001,
590 emit1(c
, inst
->Prev
, RC_OPCODE_MOV
, 0,
591 dstreg(RC_FILE_TEMPORARY
, tempreg
),
592 inst
->U
.I
.SrcReg
[0]);
594 /* MAX dst.z, src, 0.00...001 */
595 emit2(c
, inst
->Prev
, RC_OPCODE_MAX
, 0,
596 dstregtmpmask(tempreg
, RC_MASK_Y
),
597 srcreg(RC_FILE_TEMPORARY
, tempreg
),
598 srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
));
600 inst
->U
.I
.SrcReg
[0] = srcreg(RC_FILE_TEMPORARY
, tempreg
);
603 static void transform_r300_vertex_SEQ(struct radeon_compiler
*c
,
604 struct rc_instruction
*inst
)
606 /* x = y <==> x >= y && y >= x */
607 int tmp
= rc_find_free_temporary(c
);
610 emit2(c
, inst
->Prev
, RC_OPCODE_SGE
, 0,
611 dstregtmpmask(tmp
, inst
->U
.I
.DstReg
.WriteMask
),
613 inst
->U
.I
.SrcReg
[1]);
616 emit2(c
, inst
->Prev
, RC_OPCODE_SGE
, 0,
619 inst
->U
.I
.SrcReg
[0]);
622 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0,
624 srcreg(RC_FILE_TEMPORARY
, tmp
),
625 srcreg(inst
->U
.I
.DstReg
.File
, inst
->U
.I
.DstReg
.Index
));
627 rc_remove_instruction(inst
);
630 static void transform_r300_vertex_SNE(struct radeon_compiler
*c
,
631 struct rc_instruction
*inst
)
633 /* x != y <==> x < y || y < x */
634 int tmp
= rc_find_free_temporary(c
);
637 emit2(c
, inst
->Prev
, RC_OPCODE_SLT
, 0,
638 dstregtmpmask(tmp
, inst
->U
.I
.DstReg
.WriteMask
),
640 inst
->U
.I
.SrcReg
[1]);
643 emit2(c
, inst
->Prev
, RC_OPCODE_SLT
, 0,
646 inst
->U
.I
.SrcReg
[0]);
648 /* x || y = max(x, y) */
649 emit2(c
, inst
->Prev
, RC_OPCODE_MAX
, 0,
651 srcreg(RC_FILE_TEMPORARY
, tmp
),
652 srcreg(inst
->U
.I
.DstReg
.File
, inst
->U
.I
.DstReg
.Index
));
654 rc_remove_instruction(inst
);
657 static void transform_r300_vertex_SGT(struct radeon_compiler
* c
,
658 struct rc_instruction
* inst
)
660 /* x > y <==> -x < -y */
661 inst
->U
.I
.Opcode
= RC_OPCODE_SLT
;
662 inst
->U
.I
.SrcReg
[0].Negate
^= RC_MASK_XYZW
;
663 inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
666 static void transform_r300_vertex_SLE(struct radeon_compiler
* c
,
667 struct rc_instruction
* inst
)
669 /* x <= y <==> -x >= -y */
670 inst
->U
.I
.Opcode
= RC_OPCODE_SGE
;
671 inst
->U
.I
.SrcReg
[0].Negate
^= RC_MASK_XYZW
;
672 inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
676 * For use with radeonLocalTransform, this transforms non-native ALU
677 * instructions of the r300 up to r500 vertex engine.
679 int r300_transform_vertex_alu(
680 struct radeon_compiler
* c
,
681 struct rc_instruction
* inst
,
684 switch(inst
->U
.I
.Opcode
) {
685 case RC_OPCODE_ABS
: transform_r300_vertex_ABS(c
, inst
); return 1;
686 case RC_OPCODE_CEIL
: transform_CEIL(c
, inst
); return 1;
687 case RC_OPCODE_CMP
: transform_r300_vertex_CMP(c
, inst
); return 1;
688 case RC_OPCODE_DP3
: transform_DP3(c
, inst
); return 1;
689 case RC_OPCODE_DPH
: transform_DPH(c
, inst
); return 1;
690 case RC_OPCODE_FLR
: transform_FLR(c
, inst
); return 1;
691 case RC_OPCODE_LIT
: transform_r300_vertex_fix_LIT(c
, inst
); return 1;
692 case RC_OPCODE_LRP
: transform_LRP(c
, inst
); return 1;
695 transform_r300_vertex_SEQ(c
, inst
);
699 case RC_OPCODE_SFL
: transform_SFL(c
, inst
); return 1;
700 case RC_OPCODE_SGT
: transform_r300_vertex_SGT(c
, inst
); return 1;
701 case RC_OPCODE_SLE
: transform_r300_vertex_SLE(c
, inst
); return 1;
704 transform_r300_vertex_SNE(c
, inst
);
708 case RC_OPCODE_SUB
: transform_SUB(c
, inst
); return 1;
709 case RC_OPCODE_SWZ
: transform_SWZ(c
, inst
); return 1;
710 case RC_OPCODE_XPD
: transform_XPD(c
, inst
); return 1;
716 static void sincos_constants(struct radeon_compiler
* c
, unsigned int *constants
)
718 static const float SinCosConsts
[2][4] = {
720 1.273239545, /* 4/PI */
721 -0.405284735, /* -4/(PI*PI) */
722 3.141592654, /* PI */
728 0.159154943, /* 1/(2*PI) */
729 6.283185307 /* 2*PI */
734 for(i
= 0; i
< 2; ++i
)
735 constants
[i
] = rc_constants_add_immediate_vec4(&c
->Program
.Constants
, SinCosConsts
[i
]);
739 * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
741 * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
742 * MAD tmp.x, tmp.y, |src|, tmp.x
743 * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
744 * MAD dest, tmp.y, weight, tmp.x
746 static void sin_approx(
747 struct radeon_compiler
* c
, struct rc_instruction
* inst
,
748 struct rc_dst_register dst
, struct rc_src_register src
, const unsigned int* constants
)
750 unsigned int tempreg
= rc_find_free_temporary(c
);
752 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
754 srcreg(RC_FILE_CONSTANT
, constants
[0]));
755 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_X
),
756 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
757 absolute(swizzle_xxxx(src
)),
758 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
759 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_Y
),
760 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
761 absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
))),
762 negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
))));
763 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dst
,
764 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
765 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[0])),
766 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
770 * Translate the trigonometric functions COS, SIN, and SCS
771 * using only the basic instructions
772 * MOV, ADD, MUL, MAD, FRC
774 int radeonTransformTrigSimple(struct radeon_compiler
* c
,
775 struct rc_instruction
* inst
,
778 if (inst
->U
.I
.Opcode
!= RC_OPCODE_COS
&&
779 inst
->U
.I
.Opcode
!= RC_OPCODE_SIN
&&
780 inst
->U
.I
.Opcode
!= RC_OPCODE_SCS
)
783 unsigned int constants
[2];
784 unsigned int tempreg
= rc_find_free_temporary(c
);
786 sincos_constants(c
, constants
);
788 if (inst
->U
.I
.Opcode
== RC_OPCODE_COS
) {
789 /* MAD tmp.x, src, 1/(2*PI), 0.75 */
790 /* FRC tmp.x, tmp.x */
791 /* MAD tmp.z, tmp.x, 2*PI, -PI */
792 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
793 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
794 swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[1])),
795 swizzle_xxxx(srcreg(RC_FILE_CONSTANT
, constants
[1])));
796 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
797 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
798 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
799 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
800 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[1])),
801 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[0]))));
803 sin_approx(c
, inst
, inst
->U
.I
.DstReg
,
804 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
806 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_SIN
) {
807 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
808 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
809 swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[1])),
810 swizzle_yyyy(srcreg(RC_FILE_CONSTANT
, constants
[1])));
811 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
812 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)));
813 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_W
),
814 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
815 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[1])),
816 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[0]))));
818 sin_approx(c
, inst
, inst
->U
.I
.DstReg
,
819 swizzle_wwww(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
822 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
823 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
824 swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[1])),
825 swizzle(srcreg(RC_FILE_CONSTANT
, constants
[1]), RC_SWIZZLE_X
, RC_SWIZZLE_Y
, RC_SWIZZLE_Z
, RC_SWIZZLE_W
));
826 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
827 srcreg(RC_FILE_TEMPORARY
, tempreg
));
828 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(tempreg
, RC_MASK_XY
),
829 srcreg(RC_FILE_TEMPORARY
, tempreg
),
830 swizzle_wwww(srcreg(RC_FILE_CONSTANT
, constants
[1])),
831 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT
, constants
[0]))));
833 struct rc_dst_register dst
= inst
->U
.I
.DstReg
;
835 dst
.WriteMask
= inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_X
;
836 sin_approx(c
, inst
, dst
,
837 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
840 dst
.WriteMask
= inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_Y
;
841 sin_approx(c
, inst
, dst
,
842 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY
, tempreg
)),
846 rc_remove_instruction(inst
);
851 static void r300_transform_SIN_COS_SCS(struct radeon_compiler
*c
,
852 struct rc_instruction
*inst
,
855 if (inst
->U
.I
.Opcode
== RC_OPCODE_COS
) {
856 emit1(c
, inst
->Prev
, RC_OPCODE_COS
, inst
->U
.I
.SaturateMode
, inst
->U
.I
.DstReg
,
857 srcregswz(RC_FILE_TEMPORARY
, srctmp
, RC_SWIZZLE_WWWW
));
858 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_SIN
) {
859 emit1(c
, inst
->Prev
, RC_OPCODE_SIN
, inst
->U
.I
.SaturateMode
,
860 inst
->U
.I
.DstReg
, srcregswz(RC_FILE_TEMPORARY
, srctmp
, RC_SWIZZLE_WWWW
));
861 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_SCS
) {
862 struct rc_dst_register moddst
= inst
->U
.I
.DstReg
;
864 if (inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_X
) {
865 moddst
.WriteMask
= RC_MASK_X
;
866 emit1(c
, inst
->Prev
, RC_OPCODE_COS
, inst
->U
.I
.SaturateMode
, moddst
,
867 srcregswz(RC_FILE_TEMPORARY
, srctmp
, RC_SWIZZLE_WWWW
));
869 if (inst
->U
.I
.DstReg
.WriteMask
& RC_MASK_Y
) {
870 moddst
.WriteMask
= RC_MASK_Y
;
871 emit1(c
, inst
->Prev
, RC_OPCODE_SIN
, inst
->U
.I
.SaturateMode
, moddst
,
872 srcregswz(RC_FILE_TEMPORARY
, srctmp
, RC_SWIZZLE_WWWW
));
876 rc_remove_instruction(inst
);
881 * Transform the trigonometric functions COS, SIN, and SCS
882 * to include pre-scaling by 1/(2*PI) and taking the fractional
883 * part, so that the input to COS and SIN is always in the range [0,1).
884 * SCS is replaced by one COS and one SIN instruction.
886 * @warning This transformation implicitly changes the semantics of SIN and COS!
888 int radeonTransformTrigScale(struct radeon_compiler
* c
,
889 struct rc_instruction
* inst
,
892 if (inst
->U
.I
.Opcode
!= RC_OPCODE_COS
&&
893 inst
->U
.I
.Opcode
!= RC_OPCODE_SIN
&&
894 inst
->U
.I
.Opcode
!= RC_OPCODE_SCS
)
897 static const float RCP_2PI
= 0.15915494309189535;
899 unsigned int constant
;
900 unsigned int constant_swizzle
;
902 temp
= rc_find_free_temporary(c
);
903 constant
= rc_constants_add_immediate_scalar(&c
->Program
.Constants
, RCP_2PI
, &constant_swizzle
);
905 emit2(c
, inst
->Prev
, RC_OPCODE_MUL
, 0, dstregtmpmask(temp
, RC_MASK_W
),
906 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
907 srcregswz(RC_FILE_CONSTANT
, constant
, constant_swizzle
));
908 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(temp
, RC_MASK_W
),
909 srcreg(RC_FILE_TEMPORARY
, temp
));
911 r300_transform_SIN_COS_SCS(c
, inst
, temp
);
916 * Transform the trigonometric functions COS, SIN, and SCS
917 * so that the input to COS and SIN is always in the range [-PI, PI].
918 * SCS is replaced by one COS and one SIN instruction.
920 int r300_transform_trig_scale_vertex(struct radeon_compiler
*c
,
921 struct rc_instruction
*inst
,
924 if (inst
->U
.I
.Opcode
!= RC_OPCODE_COS
&&
925 inst
->U
.I
.Opcode
!= RC_OPCODE_SIN
&&
926 inst
->U
.I
.Opcode
!= RC_OPCODE_SCS
)
929 /* Repeat x in the range [-PI, PI]:
931 * repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI
934 static const float cons
[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};
936 unsigned int constant
;
938 temp
= rc_find_free_temporary(c
);
939 constant
= rc_constants_add_immediate_vec4(&c
->Program
.Constants
, cons
);
941 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(temp
, RC_MASK_W
),
942 swizzle_xxxx(inst
->U
.I
.SrcReg
[0]),
943 srcregswz(RC_FILE_CONSTANT
, constant
, RC_SWIZZLE_XXXX
),
944 srcregswz(RC_FILE_CONSTANT
, constant
, RC_SWIZZLE_YYYY
));
945 emit1(c
, inst
->Prev
, RC_OPCODE_FRC
, 0, dstregtmpmask(temp
, RC_MASK_W
),
946 srcreg(RC_FILE_TEMPORARY
, temp
));
947 emit3(c
, inst
->Prev
, RC_OPCODE_MAD
, 0, dstregtmpmask(temp
, RC_MASK_W
),
948 srcreg(RC_FILE_TEMPORARY
, temp
),
949 srcregswz(RC_FILE_CONSTANT
, constant
, RC_SWIZZLE_ZZZZ
),
950 srcregswz(RC_FILE_CONSTANT
, constant
, RC_SWIZZLE_WWWW
));
952 r300_transform_SIN_COS_SCS(c
, inst
, temp
);
957 * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
958 * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
959 * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
961 * @warning This explicitly changes the form of DDX and DDY!
964 int radeonTransformDeriv(struct radeon_compiler
* c
,
965 struct rc_instruction
* inst
,
968 if (inst
->U
.I
.Opcode
!= RC_OPCODE_DDX
&& inst
->U
.I
.Opcode
!= RC_OPCODE_DDY
)
971 inst
->U
.I
.SrcReg
[1].Swizzle
= RC_SWIZZLE_1111
;
972 inst
->U
.I
.SrcReg
[1].Negate
= RC_MASK_XYZW
;
979 * KILP - > KIL -abs(Temp[0].x)
982 * This needs to be done in its own pass, because it modifies the instructions
983 * before and after KILP.
985 void radeonTransformKILP(struct radeon_compiler
* c
)
987 struct rc_instruction
* inst
;
988 for (inst
= c
->Program
.Instructions
.Next
;
989 inst
!= &c
->Program
.Instructions
; inst
= inst
->Next
) {
991 if (inst
->U
.I
.Opcode
!= RC_OPCODE_KILP
992 || inst
->Prev
->U
.I
.Opcode
!= RC_OPCODE_IF
993 || inst
->Next
->U
.I
.Opcode
!= RC_OPCODE_ENDIF
) {
996 inst
->U
.I
.Opcode
= RC_OPCODE_KIL
;
997 inst
->U
.I
.SrcReg
[0] = negate(absolute(inst
->Prev
->U
.I
.SrcReg
[0]));
1000 rc_remove_instruction(inst
->Prev
);
1002 rc_remove_instruction(inst
->Next
);