r300/compiler: refactor fragment shader compilation
[mesa.git] / src / mesa / drivers / dri / r300 / compiler / radeon_program_alu.c
1 /*
2 * Copyright (C) 2008 Nicolai Haehnle.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28 /**
29 * @file
30 *
31 * Shareable transformations that transform "special" ALU instructions
32 * into ALU instructions that are supported by hardware.
33 *
34 */
35
36 #include "radeon_program_alu.h"
37
38 #include "radeon_compiler.h"
39
40
41 static struct rc_instruction *emit1(
42 struct radeon_compiler * c, struct rc_instruction * after,
43 rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
44 struct rc_src_register SrcReg)
45 {
46 struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
47
48 fpi->U.I.Opcode = Opcode;
49 fpi->U.I.SaturateMode = Saturate;
50 fpi->U.I.DstReg = DstReg;
51 fpi->U.I.SrcReg[0] = SrcReg;
52 return fpi;
53 }
54
55 static struct rc_instruction *emit2(
56 struct radeon_compiler * c, struct rc_instruction * after,
57 rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
58 struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
59 {
60 struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
61
62 fpi->U.I.Opcode = Opcode;
63 fpi->U.I.SaturateMode = Saturate;
64 fpi->U.I.DstReg = DstReg;
65 fpi->U.I.SrcReg[0] = SrcReg0;
66 fpi->U.I.SrcReg[1] = SrcReg1;
67 return fpi;
68 }
69
70 static struct rc_instruction *emit3(
71 struct radeon_compiler * c, struct rc_instruction * after,
72 rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
73 struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
74 struct rc_src_register SrcReg2)
75 {
76 struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
77
78 fpi->U.I.Opcode = Opcode;
79 fpi->U.I.SaturateMode = Saturate;
80 fpi->U.I.DstReg = DstReg;
81 fpi->U.I.SrcReg[0] = SrcReg0;
82 fpi->U.I.SrcReg[1] = SrcReg1;
83 fpi->U.I.SrcReg[2] = SrcReg2;
84 return fpi;
85 }
86
87 static struct rc_dst_register dstreg(int file, int index)
88 {
89 struct rc_dst_register dst;
90 dst.File = file;
91 dst.Index = index;
92 dst.WriteMask = RC_MASK_XYZW;
93 dst.RelAddr = 0;
94 return dst;
95 }
96
97 static struct rc_dst_register dstregtmpmask(int index, int mask)
98 {
99 struct rc_dst_register dst = {0};
100 dst.File = RC_FILE_TEMPORARY;
101 dst.Index = index;
102 dst.WriteMask = mask;
103 dst.RelAddr = 0;
104 return dst;
105 }
106
107 static const struct rc_src_register builtin_zero = {
108 .File = RC_FILE_NONE,
109 .Index = 0,
110 .Swizzle = RC_SWIZZLE_0000
111 };
112 static const struct rc_src_register builtin_one = {
113 .File = RC_FILE_NONE,
114 .Index = 0,
115 .Swizzle = RC_SWIZZLE_1111
116 };
117 static const struct rc_src_register srcreg_undefined = {
118 .File = RC_FILE_NONE,
119 .Index = 0,
120 .Swizzle = RC_SWIZZLE_XYZW
121 };
122
123 static struct rc_src_register srcreg(int file, int index)
124 {
125 struct rc_src_register src = srcreg_undefined;
126 src.File = file;
127 src.Index = index;
128 return src;
129 }
130
131 static struct rc_src_register srcregswz(int file, int index, int swz)
132 {
133 struct rc_src_register src = srcreg_undefined;
134 src.File = file;
135 src.Index = index;
136 src.Swizzle = swz;
137 return src;
138 }
139
140 static struct rc_src_register absolute(struct rc_src_register reg)
141 {
142 struct rc_src_register newreg = reg;
143 newreg.Abs = 1;
144 newreg.Negate = RC_MASK_NONE;
145 return newreg;
146 }
147
148 static struct rc_src_register negate(struct rc_src_register reg)
149 {
150 struct rc_src_register newreg = reg;
151 newreg.Negate = newreg.Negate ^ RC_MASK_XYZW;
152 return newreg;
153 }
154
155 static struct rc_src_register swizzle(struct rc_src_register reg,
156 rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w)
157 {
158 struct rc_src_register swizzled = reg;
159 swizzled.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w);
160 return swizzled;
161 }
162
163 static struct rc_src_register swizzle_smear(struct rc_src_register reg,
164 rc_swizzle x)
165 {
166 return swizzle(reg, x, x, x, x);
167 }
168
169 static struct rc_src_register swizzle_xxxx(struct rc_src_register reg)
170 {
171 return swizzle_smear(reg, RC_SWIZZLE_X);
172 }
173
174 static struct rc_src_register swizzle_yyyy(struct rc_src_register reg)
175 {
176 return swizzle_smear(reg, RC_SWIZZLE_Y);
177 }
178
179 static struct rc_src_register swizzle_zzzz(struct rc_src_register reg)
180 {
181 return swizzle_smear(reg, RC_SWIZZLE_Z);
182 }
183
184 static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
185 {
186 return swizzle_smear(reg, RC_SWIZZLE_W);
187 }
188
189 static void transform_ABS(struct radeon_compiler* c,
190 struct rc_instruction* inst)
191 {
192 struct rc_src_register src = inst->U.I.SrcReg[0];
193 src.Abs = 1;
194 src.Negate = RC_MASK_NONE;
195 emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode, inst->U.I.DstReg, src);
196 rc_remove_instruction(inst);
197 }
198
199 static void transform_CEIL(struct radeon_compiler* c,
200 struct rc_instruction* inst)
201 {
202 /* Assuming:
203 * ceil(x) = -floor(-x)
204 *
205 * After inlining floor:
206 * ceil(x) = -(-x-frac(-x))
207 *
208 * After simplification:
209 * ceil(x) = x+frac(-x)
210 */
211
212 int tempreg = rc_find_free_temporary(c);
213 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]));
214 emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
215 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, tempreg));
216 rc_remove_instruction(inst);
217 }
218
219 static void transform_DP2(struct radeon_compiler* c,
220 struct rc_instruction* inst)
221 {
222 struct rc_src_register src0 = inst->U.I.SrcReg[0];
223 struct rc_src_register src1 = inst->U.I.SrcReg[1];
224 src0.Negate &= ~(RC_MASK_Z | RC_MASK_W);
225 src0.Swizzle &= ~(63 << (3 * 2));
226 src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
227 src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
228 src1.Swizzle &= ~(63 << (3 * 2));
229 src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
230 emit2(c, inst->Prev, RC_OPCODE_DP3, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
231 rc_remove_instruction(inst);
232 }
233
234 static void transform_DPH(struct radeon_compiler* c,
235 struct rc_instruction* inst)
236 {
237 struct rc_src_register src0 = inst->U.I.SrcReg[0];
238 src0.Negate &= ~RC_MASK_W;
239 src0.Swizzle &= ~(7 << (3 * 3));
240 src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
241 emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
242 rc_remove_instruction(inst);
243 }
244
245 /**
246 * [1, src0.y*src1.y, src0.z, src1.w]
247 * So basically MUL with lotsa swizzling.
248 */
249 static void transform_DST(struct radeon_compiler* c,
250 struct rc_instruction* inst)
251 {
252 emit2(c, inst->Prev, RC_OPCODE_MUL, inst->U.I.SaturateMode, inst->U.I.DstReg,
253 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
254 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
255 rc_remove_instruction(inst);
256 }
257
258 static void transform_FLR(struct radeon_compiler* c,
259 struct rc_instruction* inst)
260 {
261 int tempreg = rc_find_free_temporary(c);
262 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0]);
263 emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
264 inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, tempreg)));
265 rc_remove_instruction(inst);
266 }
267
268 /**
269 * Definition of LIT (from ARB_fragment_program):
270 *
271 * tmp = VectorLoad(op0);
272 * if (tmp.x < 0) tmp.x = 0;
273 * if (tmp.y < 0) tmp.y = 0;
274 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
275 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
276 * result.x = 1.0;
277 * result.y = tmp.x;
278 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
279 * result.w = 1.0;
280 *
281 * The longest path of computation is the one leading to result.z,
282 * consisting of 5 operations. This implementation of LIT takes
283 * 5 slots, if the subsequent optimization passes are clever enough
284 * to pair instructions correctly.
285 */
286 static void transform_LIT(struct radeon_compiler* c,
287 struct rc_instruction* inst)
288 {
289 unsigned int constant;
290 unsigned int constant_swizzle;
291 unsigned int temp;
292 struct rc_src_register srctemp;
293
294 constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);
295
296 if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
297 struct rc_instruction * inst_mov;
298
299 inst_mov = emit1(c, inst,
300 RC_OPCODE_MOV, 0, inst->U.I.DstReg,
301 srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));
302
303 inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
304 inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
305 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
306 }
307
308 temp = inst->U.I.DstReg.Index;
309 srctemp = srcreg(RC_FILE_TEMPORARY, temp);
310
311 /* tmp.x = max(0.0, Src.x); */
312 /* tmp.y = max(0.0, Src.y); */
313 /* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
314 emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
315 dstregtmpmask(temp, RC_MASK_XYW),
316 inst->U.I.SrcReg[0],
317 swizzle(srcreg(RC_FILE_CONSTANT, constant),
318 RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
319 emit2(c, inst->Prev, RC_OPCODE_MIN, 0,
320 dstregtmpmask(temp, RC_MASK_Z),
321 swizzle_wwww(srctemp),
322 negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
323
324 /* tmp.w = Pow(tmp.y, tmp.w) */
325 emit1(c, inst->Prev, RC_OPCODE_LG2, 0,
326 dstregtmpmask(temp, RC_MASK_W),
327 swizzle_yyyy(srctemp));
328 emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
329 dstregtmpmask(temp, RC_MASK_W),
330 swizzle_wwww(srctemp),
331 swizzle_zzzz(srctemp));
332 emit1(c, inst->Prev, RC_OPCODE_EX2, 0,
333 dstregtmpmask(temp, RC_MASK_W),
334 swizzle_wwww(srctemp));
335
336 /* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
337 emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode,
338 dstregtmpmask(temp, RC_MASK_Z),
339 negate(swizzle_xxxx(srctemp)),
340 swizzle_wwww(srctemp),
341 builtin_zero);
342
343 /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
344 emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode,
345 dstregtmpmask(temp, RC_MASK_XYW),
346 swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
347
348 rc_remove_instruction(inst);
349 }
350
351 static void transform_LRP(struct radeon_compiler* c,
352 struct rc_instruction* inst)
353 {
354 int tempreg = rc_find_free_temporary(c);
355
356 emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
357 dstreg(RC_FILE_TEMPORARY, tempreg),
358 inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
359 emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode,
360 inst->U.I.DstReg,
361 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[2]);
362
363 rc_remove_instruction(inst);
364 }
365
366 static void transform_POW(struct radeon_compiler* c,
367 struct rc_instruction* inst)
368 {
369 int tempreg = rc_find_free_temporary(c);
370 struct rc_dst_register tempdst = dstreg(RC_FILE_TEMPORARY, tempreg);
371 struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempreg);
372 tempdst.WriteMask = RC_MASK_W;
373 tempsrc.Swizzle = RC_SWIZZLE_WWWW;
374
375 emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
376 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
377 emit1(c, inst->Prev, RC_OPCODE_EX2, inst->U.I.SaturateMode, inst->U.I.DstReg, tempsrc);
378
379 rc_remove_instruction(inst);
380 }
381
382 static void transform_RSQ(struct radeon_compiler* c,
383 struct rc_instruction* inst)
384 {
385 inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]);
386 }
387
388 static void transform_SEQ(struct radeon_compiler* c,
389 struct rc_instruction* inst)
390 {
391 int tempreg = rc_find_free_temporary(c);
392
393 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
394 emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
395 negate(absolute(srcreg(RC_FILE_TEMPORARY, tempreg))), builtin_zero, builtin_one);
396
397 rc_remove_instruction(inst);
398 }
399
400 static void transform_SFL(struct radeon_compiler* c,
401 struct rc_instruction* inst)
402 {
403 emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode, inst->U.I.DstReg, builtin_zero);
404 rc_remove_instruction(inst);
405 }
406
407 static void transform_SGE(struct radeon_compiler* c,
408 struct rc_instruction* inst)
409 {
410 int tempreg = rc_find_free_temporary(c);
411
412 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
413 emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
414 srcreg(RC_FILE_TEMPORARY, tempreg), builtin_zero, builtin_one);
415
416 rc_remove_instruction(inst);
417 }
418
419 static void transform_SGT(struct radeon_compiler* c,
420 struct rc_instruction* inst)
421 {
422 int tempreg = rc_find_free_temporary(c);
423
424 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
425 emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
426 srcreg(RC_FILE_TEMPORARY, tempreg), builtin_one, builtin_zero);
427
428 rc_remove_instruction(inst);
429 }
430
431 static void transform_SLE(struct radeon_compiler* c,
432 struct rc_instruction* inst)
433 {
434 int tempreg = rc_find_free_temporary(c);
435
436 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
437 emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
438 srcreg(RC_FILE_TEMPORARY, tempreg), builtin_zero, builtin_one);
439
440 rc_remove_instruction(inst);
441 }
442
443 static void transform_SLT(struct radeon_compiler* c,
444 struct rc_instruction* inst)
445 {
446 int tempreg = rc_find_free_temporary(c);
447
448 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
449 emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
450 srcreg(RC_FILE_TEMPORARY, tempreg), builtin_one, builtin_zero);
451
452 rc_remove_instruction(inst);
453 }
454
455 static void transform_SNE(struct radeon_compiler* c,
456 struct rc_instruction* inst)
457 {
458 int tempreg = rc_find_free_temporary(c);
459
460 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
461 emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
462 negate(absolute(srcreg(RC_FILE_TEMPORARY, tempreg))), builtin_one, builtin_zero);
463
464 rc_remove_instruction(inst);
465 }
466
467 static void transform_SSG(struct radeon_compiler* c,
468 struct rc_instruction* inst)
469 {
470 /* result = sign(x)
471 *
472 * CMP tmp0, -x, 1, 0
473 * CMP tmp1, x, 1, 0
474 * ADD result, tmp0, -tmp1;
475 */
476 unsigned tmp0, tmp1;
477
478 /* 0 < x */
479 tmp0 = rc_find_free_temporary(c);
480 emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
481 dstregtmpmask(tmp0, inst->U.I.DstReg.WriteMask),
482 negate(inst->U.I.SrcReg[0]),
483 builtin_one,
484 builtin_zero);
485
486 /* x < 0 */
487 tmp1 = rc_find_free_temporary(c);
488 emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
489 dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
490 inst->U.I.SrcReg[0],
491 builtin_one,
492 builtin_zero);
493
494 /* Either both are zero, or one of them is one and the other is zero. */
495 /* result = tmp0 - tmp1 */
496 emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
497 inst->U.I.DstReg,
498 srcreg(RC_FILE_TEMPORARY, tmp0),
499 negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
500
501 rc_remove_instruction(inst);
502 }
503
504 static void transform_SUB(struct radeon_compiler* c,
505 struct rc_instruction* inst)
506 {
507 inst->U.I.Opcode = RC_OPCODE_ADD;
508 inst->U.I.SrcReg[1] = negate(inst->U.I.SrcReg[1]);
509 }
510
511 static void transform_SWZ(struct radeon_compiler* c,
512 struct rc_instruction* inst)
513 {
514 inst->U.I.Opcode = RC_OPCODE_MOV;
515 }
516
517 static void transform_XPD(struct radeon_compiler* c,
518 struct rc_instruction* inst)
519 {
520 int tempreg = rc_find_free_temporary(c);
521
522 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstreg(RC_FILE_TEMPORARY, tempreg),
523 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
524 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
525 emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode, inst->U.I.DstReg,
526 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
527 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
528 negate(srcreg(RC_FILE_TEMPORARY, tempreg)));
529
530 rc_remove_instruction(inst);
531 }
532
533
534 /**
535 * Can be used as a transformation for @ref radeonClauseLocalTransform,
536 * no userData necessary.
537 *
538 * Eliminates the following ALU instructions:
539 * ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
540 * using:
541 * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
542 *
543 * Transforms RSQ to Radeon's native RSQ by explicitly setting
544 * absolute value.
545 *
546 * @note should be applicable to R300 and R500 fragment programs.
547 */
548 int radeonTransformALU(
549 struct radeon_compiler * c,
550 struct rc_instruction* inst,
551 void* unused)
552 {
553 switch(inst->U.I.Opcode) {
554 case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
555 case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
556 case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
557 case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
558 case RC_OPCODE_DST: transform_DST(c, inst); return 1;
559 case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
560 case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
561 case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
562 case RC_OPCODE_POW: transform_POW(c, inst); return 1;
563 case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
564 case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
565 case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
566 case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;
567 case RC_OPCODE_SGT: transform_SGT(c, inst); return 1;
568 case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
569 case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
570 case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
571 case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
572 case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
573 case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
574 case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
575 default:
576 return 0;
577 }
578 }
579
580
581 static void transform_r300_vertex_ABS(struct radeon_compiler* c,
582 struct rc_instruction* inst)
583 {
584 /* Note: r500 can take absolute values, but r300 cannot. */
585 inst->U.I.Opcode = RC_OPCODE_MAX;
586 inst->U.I.SrcReg[1] = inst->U.I.SrcReg[0];
587 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
588 }
589
590 static void transform_r300_vertex_CMP(struct radeon_compiler* c,
591 struct rc_instruction* inst)
592 {
593 /* There is no decent CMP available, so let's rig one up.
594 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
595 * The following sequence consumes two temps and two extra slots
596 * (the second temp and the second slot is consumed by transform_LRP),
597 * but should be equivalent:
598 *
599 * SLT tmp0, src0, 0.0
600 * LRP dst, tmp0, src1, src2
601 *
602 * Yes, I know, I'm a mad scientist. ~ C. & M. */
603 int tempreg0 = rc_find_free_temporary(c);
604
605 /* SLT tmp0, src0, 0.0 */
606 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
607 dstreg(RC_FILE_TEMPORARY, tempreg0),
608 inst->U.I.SrcReg[0], builtin_zero);
609
610 /* LRP dst, tmp0, src1, src2 */
611 transform_LRP(c,
612 emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
613 inst->U.I.DstReg,
614 srcreg(RC_FILE_TEMPORARY, tempreg0), inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]));
615
616 rc_remove_instruction(inst);
617 }
618
619 static void transform_r300_vertex_DP2(struct radeon_compiler* c,
620 struct rc_instruction* inst)
621 {
622 struct rc_instruction *next_inst = inst->Next;
623 transform_DP2(c, inst);
624 next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4;
625 }
626
627 static void transform_r300_vertex_DP3(struct radeon_compiler* c,
628 struct rc_instruction* inst)
629 {
630 struct rc_src_register src0 = inst->U.I.SrcReg[0];
631 struct rc_src_register src1 = inst->U.I.SrcReg[1];
632 src0.Negate &= ~RC_MASK_W;
633 src0.Swizzle &= ~(7 << (3 * 3));
634 src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
635 src1.Negate &= ~RC_MASK_W;
636 src1.Swizzle &= ~(7 << (3 * 3));
637 src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
638 emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
639 rc_remove_instruction(inst);
640 }
641
642 static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
643 struct rc_instruction* inst)
644 {
645 int tempreg = rc_find_free_temporary(c);
646 unsigned constant_swizzle;
647 int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
648 0.0000000000000000001,
649 &constant_swizzle);
650
651 /* MOV dst, src */
652 emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
653 dstreg(RC_FILE_TEMPORARY, tempreg),
654 inst->U.I.SrcReg[0]);
655
656 /* MAX dst.z, src, 0.00...001 */
657 emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
658 dstregtmpmask(tempreg, RC_MASK_Y),
659 srcreg(RC_FILE_TEMPORARY, tempreg),
660 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
661
662 inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, tempreg);
663 }
664
665 static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
666 struct rc_instruction *inst)
667 {
668 /* x = y <==> x >= y && y >= x */
669 int tmp = rc_find_free_temporary(c);
670
671 /* x <= y */
672 emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
673 dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
674 inst->U.I.SrcReg[0],
675 inst->U.I.SrcReg[1]);
676
677 /* y <= x */
678 emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
679 inst->U.I.DstReg,
680 inst->U.I.SrcReg[1],
681 inst->U.I.SrcReg[0]);
682
683 /* x && y = x * y */
684 emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
685 inst->U.I.DstReg,
686 srcreg(RC_FILE_TEMPORARY, tmp),
687 srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
688
689 rc_remove_instruction(inst);
690 }
691
692 static void transform_r300_vertex_SNE(struct radeon_compiler *c,
693 struct rc_instruction *inst)
694 {
695 /* x != y <==> x < y || y < x */
696 int tmp = rc_find_free_temporary(c);
697
698 /* x < y */
699 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
700 dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
701 inst->U.I.SrcReg[0],
702 inst->U.I.SrcReg[1]);
703
704 /* y < x */
705 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
706 inst->U.I.DstReg,
707 inst->U.I.SrcReg[1],
708 inst->U.I.SrcReg[0]);
709
710 /* x || y = max(x, y) */
711 emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
712 inst->U.I.DstReg,
713 srcreg(RC_FILE_TEMPORARY, tmp),
714 srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
715
716 rc_remove_instruction(inst);
717 }
718
719 static void transform_r300_vertex_SGT(struct radeon_compiler* c,
720 struct rc_instruction* inst)
721 {
722 /* x > y <==> -x < -y */
723 inst->U.I.Opcode = RC_OPCODE_SLT;
724 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
725 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
726 }
727
728 static void transform_r300_vertex_SLE(struct radeon_compiler* c,
729 struct rc_instruction* inst)
730 {
731 /* x <= y <==> -x >= -y */
732 inst->U.I.Opcode = RC_OPCODE_SGE;
733 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
734 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
735 }
736
737 static void transform_r300_vertex_SSG(struct radeon_compiler* c,
738 struct rc_instruction* inst)
739 {
740 /* result = sign(x)
741 *
742 * SLT tmp0, 0, x;
743 * SLT tmp1, x, 0;
744 * ADD result, tmp0, -tmp1;
745 */
746 unsigned tmp0, tmp1;
747
748 /* 0 < x */
749 tmp0 = rc_find_free_temporary(c);
750 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
751 dstregtmpmask(tmp0, inst->U.I.DstReg.WriteMask),
752 builtin_zero,
753 inst->U.I.SrcReg[0]);
754
755 /* x < 0 */
756 tmp1 = rc_find_free_temporary(c);
757 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
758 dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
759 inst->U.I.SrcReg[0],
760 builtin_zero);
761
762 /* Either both are zero, or one of them is one and the other is zero. */
763 /* result = tmp0 - tmp1 */
764 emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
765 inst->U.I.DstReg,
766 srcreg(RC_FILE_TEMPORARY, tmp0),
767 negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
768
769 rc_remove_instruction(inst);
770 }
771
772 /**
773 * For use with rc_local_transform, this transforms non-native ALU
774 * instructions of the r300 up to r500 vertex engine.
775 */
776 int r300_transform_vertex_alu(
777 struct radeon_compiler * c,
778 struct rc_instruction* inst,
779 void* unused)
780 {
781 switch(inst->U.I.Opcode) {
782 case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
783 case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
784 case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
785 case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
786 case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
787 case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
788 case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
789 case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;
790 case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
791 case RC_OPCODE_SEQ:
792 if (!c->is_r500) {
793 transform_r300_vertex_SEQ(c, inst);
794 return 1;
795 }
796 return 0;
797 case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
798 case RC_OPCODE_SGT: transform_r300_vertex_SGT(c, inst); return 1;
799 case RC_OPCODE_SLE: transform_r300_vertex_SLE(c, inst); return 1;
800 case RC_OPCODE_SNE:
801 if (!c->is_r500) {
802 transform_r300_vertex_SNE(c, inst);
803 return 1;
804 }
805 return 0;
806 case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
807 case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
808 case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
809 case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
810 default:
811 return 0;
812 }
813 }
814
815 static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
816 {
817 static const float SinCosConsts[2][4] = {
818 {
819 1.273239545, /* 4/PI */
820 -0.405284735, /* -4/(PI*PI) */
821 3.141592654, /* PI */
822 0.2225 /* weight */
823 },
824 {
825 0.75,
826 0.5,
827 0.159154943, /* 1/(2*PI) */
828 6.283185307 /* 2*PI */
829 }
830 };
831 int i;
832
833 for(i = 0; i < 2; ++i)
834 constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);
835 }
836
837 /**
838 * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
839 *
840 * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
841 * MAD tmp.x, tmp.y, |src|, tmp.x
842 * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
843 * MAD dest, tmp.y, weight, tmp.x
844 */
845 static void sin_approx(
846 struct radeon_compiler* c, struct rc_instruction * inst,
847 struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
848 {
849 unsigned int tempreg = rc_find_free_temporary(c);
850
851 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),
852 swizzle_xxxx(src),
853 srcreg(RC_FILE_CONSTANT, constants[0]));
854 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),
855 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
856 absolute(swizzle_xxxx(src)),
857 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
858 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),
859 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
860 absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),
861 negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));
862 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst,
863 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
864 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),
865 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
866 }
867
868 /**
869 * Translate the trigonometric functions COS, SIN, and SCS
870 * using only the basic instructions
871 * MOV, ADD, MUL, MAD, FRC
872 */
873 int r300_transform_trig_simple(struct radeon_compiler* c,
874 struct rc_instruction* inst,
875 void* unused)
876 {
877 if (inst->U.I.Opcode != RC_OPCODE_COS &&
878 inst->U.I.Opcode != RC_OPCODE_SIN &&
879 inst->U.I.Opcode != RC_OPCODE_SCS)
880 return 0;
881
882 unsigned int constants[2];
883 unsigned int tempreg = rc_find_free_temporary(c);
884
885 sincos_constants(c, constants);
886
887 if (inst->U.I.Opcode == RC_OPCODE_COS) {
888 /* MAD tmp.x, src, 1/(2*PI), 0.75 */
889 /* FRC tmp.x, tmp.x */
890 /* MAD tmp.z, tmp.x, 2*PI, -PI */
891 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
892 swizzle_xxxx(inst->U.I.SrcReg[0]),
893 swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
894 swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));
895 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
896 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
897 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
898 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
899 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
900 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
901
902 sin_approx(c, inst, inst->U.I.DstReg,
903 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
904 constants);
905 } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
906 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
907 swizzle_xxxx(inst->U.I.SrcReg[0]),
908 swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
909 swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));
910 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
911 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
912 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
913 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
914 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
915 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
916
917 sin_approx(c, inst, inst->U.I.DstReg,
918 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
919 constants);
920 } else {
921 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
922 swizzle_xxxx(inst->U.I.SrcReg[0]),
923 swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
924 swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
925 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_XY),
926 srcreg(RC_FILE_TEMPORARY, tempreg));
927 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
928 srcreg(RC_FILE_TEMPORARY, tempreg),
929 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
930 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
931
932 struct rc_dst_register dst = inst->U.I.DstReg;
933
934 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
935 sin_approx(c, inst, dst,
936 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
937 constants);
938
939 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
940 sin_approx(c, inst, dst,
941 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
942 constants);
943 }
944
945 rc_remove_instruction(inst);
946
947 return 1;
948 }
949
950 static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,
951 struct rc_instruction *inst,
952 unsigned srctmp)
953 {
954 if (inst->U.I.Opcode == RC_OPCODE_COS) {
955 emit1(c, inst->Prev, RC_OPCODE_COS, inst->U.I.SaturateMode, inst->U.I.DstReg,
956 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
957 } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
958 emit1(c, inst->Prev, RC_OPCODE_SIN, inst->U.I.SaturateMode,
959 inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
960 } else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
961 struct rc_dst_register moddst = inst->U.I.DstReg;
962
963 if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
964 moddst.WriteMask = RC_MASK_X;
965 emit1(c, inst->Prev, RC_OPCODE_COS, inst->U.I.SaturateMode, moddst,
966 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
967 }
968 if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
969 moddst.WriteMask = RC_MASK_Y;
970 emit1(c, inst->Prev, RC_OPCODE_SIN, inst->U.I.SaturateMode, moddst,
971 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
972 }
973 }
974
975 rc_remove_instruction(inst);
976 }
977
978
979 /**
980 * Transform the trigonometric functions COS, SIN, and SCS
981 * to include pre-scaling by 1/(2*PI) and taking the fractional
982 * part, so that the input to COS and SIN is always in the range [0,1).
983 * SCS is replaced by one COS and one SIN instruction.
984 *
985 * @warning This transformation implicitly changes the semantics of SIN and COS!
986 */
987 int radeonTransformTrigScale(struct radeon_compiler* c,
988 struct rc_instruction* inst,
989 void* unused)
990 {
991 if (inst->U.I.Opcode != RC_OPCODE_COS &&
992 inst->U.I.Opcode != RC_OPCODE_SIN &&
993 inst->U.I.Opcode != RC_OPCODE_SCS)
994 return 0;
995
996 static const float RCP_2PI = 0.15915494309189535;
997 unsigned int temp;
998 unsigned int constant;
999 unsigned int constant_swizzle;
1000
1001 temp = rc_find_free_temporary(c);
1002 constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
1003
1004 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(temp, RC_MASK_W),
1005 swizzle_xxxx(inst->U.I.SrcReg[0]),
1006 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
1007 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1008 srcreg(RC_FILE_TEMPORARY, temp));
1009
1010 r300_transform_SIN_COS_SCS(c, inst, temp);
1011 return 1;
1012 }
1013
1014 /**
1015 * Transform the trigonometric functions COS, SIN, and SCS
1016 * so that the input to COS and SIN is always in the range [-PI, PI].
1017 * SCS is replaced by one COS and one SIN instruction.
1018 */
1019 int r300_transform_trig_scale_vertex(struct radeon_compiler *c,
1020 struct rc_instruction *inst,
1021 void *unused)
1022 {
1023 if (inst->U.I.Opcode != RC_OPCODE_COS &&
1024 inst->U.I.Opcode != RC_OPCODE_SIN &&
1025 inst->U.I.Opcode != RC_OPCODE_SCS)
1026 return 0;
1027
1028 /* Repeat x in the range [-PI, PI]:
1029 *
1030 * repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI
1031 */
1032
1033 static const float cons[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};
1034 unsigned int temp;
1035 unsigned int constant;
1036
1037 temp = rc_find_free_temporary(c);
1038 constant = rc_constants_add_immediate_vec4(&c->Program.Constants, cons);
1039
1040 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1041 swizzle_xxxx(inst->U.I.SrcReg[0]),
1042 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_XXXX),
1043 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_YYYY));
1044 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1045 srcreg(RC_FILE_TEMPORARY, temp));
1046 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1047 srcreg(RC_FILE_TEMPORARY, temp),
1048 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_ZZZZ),
1049 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_WWWW));
1050
1051 r300_transform_SIN_COS_SCS(c, inst, temp);
1052 return 1;
1053 }
1054
1055 /**
1056 * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
1057 * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
1058 * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
1059 *
1060 * @warning This explicitly changes the form of DDX and DDY!
1061 */
1062
1063 int radeonTransformDeriv(struct radeon_compiler* c,
1064 struct rc_instruction* inst,
1065 void* unused)
1066 {
1067 if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)
1068 return 0;
1069
1070 inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_1111;
1071 inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
1072
1073 return 1;
1074 }
1075
1076 /**
1077 * IF Temp[0].x -\
1078 * KILP - > KIL -abs(Temp[0].x)
1079 * ENDIF -/
1080 *
1081 * This needs to be done in its own pass, because it modifies the instructions
1082 * before and after KILP.
1083 */
1084 void rc_transform_KILP(struct radeon_compiler * c, void *user)
1085 {
1086 struct rc_instruction * inst;
1087 for (inst = c->Program.Instructions.Next;
1088 inst != &c->Program.Instructions; inst = inst->Next) {
1089
1090 if (inst->U.I.Opcode != RC_OPCODE_KILP)
1091 continue;
1092
1093 inst->U.I.Opcode = RC_OPCODE_KIL;
1094
1095 if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
1096 || inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
1097 inst->U.I.SrcReg[0] = negate(builtin_one);
1098 } else {
1099
1100 inst->U.I.SrcReg[0] =
1101 negate(absolute(inst->Prev->U.I.SrcReg[0]));
1102 /* Remove IF */
1103 rc_remove_instruction(inst->Prev);
1104 /* Remove ENDIF */
1105 rc_remove_instruction(inst->Next);
1106 }
1107 }
1108 }