r300/compiler: Fix bug when lowering KILP on r300 cards
[mesa.git] / src / gallium / drivers / r300 / compiler / radeon_program_alu.c
1 /*
2 * Copyright (C) 2008 Nicolai Haehnle.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28 /**
29 * @file
30 *
31 * Shareable transformations that transform "special" ALU instructions
32 * into ALU instructions that are supported by hardware.
33 *
34 */
35
36 #include "radeon_program_alu.h"
37
38 #include "radeon_compiler.h"
39 #include "radeon_compiler_util.h"
40
41
42 static struct rc_instruction *emit1(
43 struct radeon_compiler * c, struct rc_instruction * after,
44 rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
45 struct rc_src_register SrcReg)
46 {
47 struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
48
49 fpi->U.I.Opcode = Opcode;
50 fpi->U.I.SaturateMode = Saturate;
51 fpi->U.I.DstReg = DstReg;
52 fpi->U.I.SrcReg[0] = SrcReg;
53 return fpi;
54 }
55
56 static struct rc_instruction *emit2(
57 struct radeon_compiler * c, struct rc_instruction * after,
58 rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
59 struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
60 {
61 struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
62
63 fpi->U.I.Opcode = Opcode;
64 fpi->U.I.SaturateMode = Saturate;
65 fpi->U.I.DstReg = DstReg;
66 fpi->U.I.SrcReg[0] = SrcReg0;
67 fpi->U.I.SrcReg[1] = SrcReg1;
68 return fpi;
69 }
70
71 static struct rc_instruction *emit3(
72 struct radeon_compiler * c, struct rc_instruction * after,
73 rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
74 struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
75 struct rc_src_register SrcReg2)
76 {
77 struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
78
79 fpi->U.I.Opcode = Opcode;
80 fpi->U.I.SaturateMode = Saturate;
81 fpi->U.I.DstReg = DstReg;
82 fpi->U.I.SrcReg[0] = SrcReg0;
83 fpi->U.I.SrcReg[1] = SrcReg1;
84 fpi->U.I.SrcReg[2] = SrcReg2;
85 return fpi;
86 }
87
88 static struct rc_dst_register dstregtmpmask(int index, int mask)
89 {
90 struct rc_dst_register dst = {0, 0, 0};
91 dst.File = RC_FILE_TEMPORARY;
92 dst.Index = index;
93 dst.WriteMask = mask;
94 return dst;
95 }
96
/*
 * Register-less source operands (RC_FILE_NONE): the value of each channel
 * comes from the swizzle alone.
 */

/* All four channels use the RC_SWIZZLE_0000 swizzle. */
static const struct rc_src_register builtin_zero = {
	.File = RC_FILE_NONE,
	.Index = 0,
	.Swizzle = RC_SWIZZLE_0000
};
/* All four channels use the RC_SWIZZLE_1111 swizzle. */
static const struct rc_src_register builtin_one = {
	.File = RC_FILE_NONE,
	.Index = 0,
	.Swizzle = RC_SWIZZLE_1111
};

/* All four channels use the RC_SWIZZLE_HHHH swizzle (used as .5 by transform_ROUND). */
static const struct rc_src_register builtin_half = {
	.File = RC_FILE_NONE,
	.Index = 0,
	.Swizzle = RC_SWIZZLE_HHHH
};

/* Template for srcreg()/srcregswz(): identity XYZW swizzle, no file yet. */
static const struct rc_src_register srcreg_undefined = {
	.File = RC_FILE_NONE,
	.Index = 0,
	.Swizzle = RC_SWIZZLE_XYZW
};
119
120 static struct rc_src_register srcreg(int file, int index)
121 {
122 struct rc_src_register src = srcreg_undefined;
123 src.File = file;
124 src.Index = index;
125 return src;
126 }
127
128 static struct rc_src_register srcregswz(int file, int index, int swz)
129 {
130 struct rc_src_register src = srcreg_undefined;
131 src.File = file;
132 src.Index = index;
133 src.Swizzle = swz;
134 return src;
135 }
136
137 static struct rc_src_register absolute(struct rc_src_register reg)
138 {
139 struct rc_src_register newreg = reg;
140 newreg.Abs = 1;
141 newreg.Negate = RC_MASK_NONE;
142 return newreg;
143 }
144
145 static struct rc_src_register negate(struct rc_src_register reg)
146 {
147 struct rc_src_register newreg = reg;
148 newreg.Negate = newreg.Negate ^ RC_MASK_XYZW;
149 return newreg;
150 }
151
152 static struct rc_src_register swizzle(struct rc_src_register reg,
153 rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w)
154 {
155 struct rc_src_register swizzled = reg;
156 swizzled.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w);
157 return swizzled;
158 }
159
160 static struct rc_src_register swizzle_smear(struct rc_src_register reg,
161 rc_swizzle x)
162 {
163 return swizzle(reg, x, x, x, x);
164 }
165
166 static struct rc_src_register swizzle_xxxx(struct rc_src_register reg)
167 {
168 return swizzle_smear(reg, RC_SWIZZLE_X);
169 }
170
171 static struct rc_src_register swizzle_yyyy(struct rc_src_register reg)
172 {
173 return swizzle_smear(reg, RC_SWIZZLE_Y);
174 }
175
176 static struct rc_src_register swizzle_zzzz(struct rc_src_register reg)
177 {
178 return swizzle_smear(reg, RC_SWIZZLE_Z);
179 }
180
181 static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
182 {
183 return swizzle_smear(reg, RC_SWIZZLE_W);
184 }
185
186 static int is_dst_safe_to_reuse(struct rc_instruction *inst)
187 {
188 const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
189 unsigned i;
190
191 assert(info->HasDstReg);
192
193 if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
194 return 0;
195
196 for (i = 0; i < info->NumSrcRegs; i++) {
197 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
198 inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)
199 return 0;
200 }
201
202 return 1;
203 }
204
205 static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,
206 struct rc_instruction *inst)
207 {
208 unsigned tmp;
209
210 if (is_dst_safe_to_reuse(inst))
211 tmp = inst->U.I.DstReg.Index;
212 else
213 tmp = rc_find_free_temporary(c);
214
215 return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
216 }
217
218 static void transform_ABS(struct radeon_compiler* c,
219 struct rc_instruction* inst)
220 {
221 struct rc_src_register src = inst->U.I.SrcReg[0];
222 src.Abs = 1;
223 src.Negate = RC_MASK_NONE;
224 emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode, inst->U.I.DstReg, src);
225 rc_remove_instruction(inst);
226 }
227
228 static void transform_CEIL(struct radeon_compiler* c,
229 struct rc_instruction* inst)
230 {
231 /* Assuming:
232 * ceil(x) = -floor(-x)
233 *
234 * After inlining floor:
235 * ceil(x) = -(-x-frac(-x))
236 *
237 * After simplification:
238 * ceil(x) = x+frac(-x)
239 */
240
241 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
242 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
243 emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
244 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
245 rc_remove_instruction(inst);
246 }
247
248 static void transform_CLAMP(struct radeon_compiler *c,
249 struct rc_instruction *inst)
250 {
251 /* CLAMP dst, src, min, max
252 * into:
253 * MIN tmp, src, max
254 * MAX dst, tmp, min
255 */
256 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
257 emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
258 inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
259 emit2(c, inst->Prev, RC_OPCODE_MAX, inst->U.I.SaturateMode, inst->U.I.DstReg,
260 srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
261 rc_remove_instruction(inst);
262 }
263
264 static void transform_DP2(struct radeon_compiler* c,
265 struct rc_instruction* inst)
266 {
267 struct rc_src_register src0 = inst->U.I.SrcReg[0];
268 struct rc_src_register src1 = inst->U.I.SrcReg[1];
269 src0.Negate &= ~(RC_MASK_Z | RC_MASK_W);
270 src0.Swizzle &= ~(63 << (3 * 2));
271 src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
272 src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
273 src1.Swizzle &= ~(63 << (3 * 2));
274 src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
275 emit2(c, inst->Prev, RC_OPCODE_DP3, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
276 rc_remove_instruction(inst);
277 }
278
279 static void transform_DPH(struct radeon_compiler* c,
280 struct rc_instruction* inst)
281 {
282 struct rc_src_register src0 = inst->U.I.SrcReg[0];
283 src0.Negate &= ~RC_MASK_W;
284 src0.Swizzle &= ~(7 << (3 * 3));
285 src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
286 emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
287 rc_remove_instruction(inst);
288 }
289
290 /**
291 * [1, src0.y*src1.y, src0.z, src1.w]
292 * So basically MUL with lotsa swizzling.
293 */
294 static void transform_DST(struct radeon_compiler* c,
295 struct rc_instruction* inst)
296 {
297 emit2(c, inst->Prev, RC_OPCODE_MUL, inst->U.I.SaturateMode, inst->U.I.DstReg,
298 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
299 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
300 rc_remove_instruction(inst);
301 }
302
303 static void transform_FLR(struct radeon_compiler* c,
304 struct rc_instruction* inst)
305 {
306 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
307 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
308 emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
309 inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
310 rc_remove_instruction(inst);
311 }
312
/**
 * Definition of LIT (from ARB_fragment_program):
 *
 * tmp = VectorLoad(op0);
 * if (tmp.x < 0) tmp.x = 0;
 * if (tmp.y < 0) tmp.y = 0;
 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
 * result.x = 1.0;
 * result.y = tmp.x;
 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
 * result.w = 1.0;
 *
 * The longest path of computation is the one leading to result.z,
 * consisting of 5 operations. This implementation of LIT takes
 * 5 slots, if the subsequent optimization passes are clever enough
 * to pair instructions correctly.
 *
 * The whole computation is carried out in one full-mask temporary.  When
 * the instruction's destination is not such a register, a fresh temporary
 * is used instead and a trailing MOV copies it to the real destination.
 */
static void transform_LIT(struct radeon_compiler* c,
	struct rc_instruction* inst)
{
	unsigned int constant;
	unsigned int constant_swizzle;
	unsigned int temp;
	struct rc_src_register srctemp;

	/* NOTE(review): if the immediate is stored as a 32-bit float,
	 * -127.999999 presumably rounds to exactly -128.0 -- confirm. */
	constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);

	if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
		struct rc_instruction * inst_mov;

		/* Inserted AFTER inst: copies the scratch temporary to the
		 * original destination (original write mask preserved). */
		inst_mov = emit1(c, inst,
			RC_OPCODE_MOV, 0, inst->U.I.DstReg,
			srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));

		/* Redirect the LIT itself to the scratch temporary. */
		inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
		inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
		inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
	}

	temp = inst->U.I.DstReg.Index;
	srctemp = srcreg(RC_FILE_TEMPORARY, temp);

	/* tmp.x = max(Src.x, 0.0); */
	/* tmp.y = max(Src.y, 0.0); */
	/* tmp.w = max(Src.w, -128+eps); */
	/* (presumably constant_swizzle&3 selects the channel that holds the
	 * scalar immediate -- confirm against rc_constants_add_immediate_scalar) */
	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
		dstregtmpmask(temp, RC_MASK_XYW),
		inst->U.I.SrcReg[0],
		swizzle(srcreg(RC_FILE_CONSTANT, constant),
			RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
	/* tmp.z = min(tmp.w, 128-eps); the clamped exponent now lives in tmp.z */
	emit2(c, inst->Prev, RC_OPCODE_MIN, 0,
		dstregtmpmask(temp, RC_MASK_Z),
		swizzle_wwww(srctemp),
		negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));

	/* tmp.w = Pow(tmp.y, tmp.z), computed as EX2(LG2(tmp.y) * tmp.z) */
	emit1(c, inst->Prev, RC_OPCODE_LG2, 0,
		dstregtmpmask(temp, RC_MASK_W),
		swizzle_yyyy(srctemp));
	emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
		dstregtmpmask(temp, RC_MASK_W),
		swizzle_wwww(srctemp),
		swizzle_zzzz(srctemp));
	emit1(c, inst->Prev, RC_OPCODE_EX2, 0,
		dstregtmpmask(temp, RC_MASK_W),
		swizzle_wwww(srctemp));

	/* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode,
		dstregtmpmask(temp, RC_MASK_Z),
		negate(swizzle_xxxx(srctemp)),
		swizzle_wwww(srctemp),
		builtin_zero);

	/* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
	emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode,
		dstregtmpmask(temp, RC_MASK_XYW),
		swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));

	rc_remove_instruction(inst);
}
395
396 static void transform_LRP(struct radeon_compiler* c,
397 struct rc_instruction* inst)
398 {
399 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
400
401 emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
402 dst,
403 inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
404 emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode,
405 inst->U.I.DstReg,
406 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
407
408 rc_remove_instruction(inst);
409 }
410
411 static void transform_POW(struct radeon_compiler* c,
412 struct rc_instruction* inst)
413 {
414 struct rc_dst_register tempdst = try_to_reuse_dst(c, inst);
415 struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
416 tempdst.WriteMask = RC_MASK_W;
417 tempsrc.Swizzle = RC_SWIZZLE_WWWW;
418
419 emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
420 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
421 emit1(c, inst->Prev, RC_OPCODE_EX2, inst->U.I.SaturateMode, inst->U.I.DstReg, tempsrc);
422
423 rc_remove_instruction(inst);
424 }
425
426 /* dst = ROUND(src) :
427 * add = src + .5
428 * frac = FRC(add)
429 * dst = add - frac
430 *
431 * According to the GLSL spec, the implementor can decide which way to round
432 * when the fraction is .5. We round down for .5.
433 *
434 */
435 static void transform_ROUND(struct radeon_compiler* c,
436 struct rc_instruction* inst)
437 {
438 unsigned int mask = inst->U.I.DstReg.WriteMask;
439 unsigned int frac_index, add_index;
440 struct rc_dst_register frac_dst, add_dst;
441 struct rc_src_register frac_src, add_src;
442
443 /* add = src + .5 */
444 add_index = rc_find_free_temporary(c);
445 add_dst = dstregtmpmask(add_index, mask);
446 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, add_dst, inst->U.I.SrcReg[0],
447 builtin_half);
448 add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index);
449
450
451 /* frac = FRC(add) */
452 frac_index = rc_find_free_temporary(c);
453 frac_dst = dstregtmpmask(frac_index, mask);
454 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, frac_dst, add_src);
455 frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index);
456
457 /* dst = add - frac */
458 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, inst->U.I.DstReg,
459 add_src, negate(frac_src));
460 rc_remove_instruction(inst);
461 }
462
463 static void transform_RSQ(struct radeon_compiler* c,
464 struct rc_instruction* inst)
465 {
466 inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]);
467 }
468
469 static void transform_SEQ(struct radeon_compiler* c,
470 struct rc_instruction* inst)
471 {
472 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
473
474 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
475 emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
476 negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
477
478 rc_remove_instruction(inst);
479 }
480
481 static void transform_SFL(struct radeon_compiler* c,
482 struct rc_instruction* inst)
483 {
484 emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode, inst->U.I.DstReg, builtin_zero);
485 rc_remove_instruction(inst);
486 }
487
488 static void transform_SGE(struct radeon_compiler* c,
489 struct rc_instruction* inst)
490 {
491 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
492
493 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
494 emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
495 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
496
497 rc_remove_instruction(inst);
498 }
499
500 static void transform_SGT(struct radeon_compiler* c,
501 struct rc_instruction* inst)
502 {
503 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
504
505 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
506 emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
507 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
508
509 rc_remove_instruction(inst);
510 }
511
512 static void transform_SLE(struct radeon_compiler* c,
513 struct rc_instruction* inst)
514 {
515 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
516
517 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
518 emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
519 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
520
521 rc_remove_instruction(inst);
522 }
523
524 static void transform_SLT(struct radeon_compiler* c,
525 struct rc_instruction* inst)
526 {
527 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
528
529 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
530 emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
531 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
532
533 rc_remove_instruction(inst);
534 }
535
536 static void transform_SNE(struct radeon_compiler* c,
537 struct rc_instruction* inst)
538 {
539 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
540
541 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
542 emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
543 negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
544
545 rc_remove_instruction(inst);
546 }
547
548 static void transform_SSG(struct radeon_compiler* c,
549 struct rc_instruction* inst)
550 {
551 /* result = sign(x)
552 *
553 * CMP tmp0, -x, 1, 0
554 * CMP tmp1, x, 1, 0
555 * ADD result, tmp0, -tmp1;
556 */
557 struct rc_dst_register dst0;
558 unsigned tmp1;
559
560 /* 0 < x */
561 dst0 = try_to_reuse_dst(c, inst);
562 emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
563 dst0,
564 negate(inst->U.I.SrcReg[0]),
565 builtin_one,
566 builtin_zero);
567
568 /* x < 0 */
569 tmp1 = rc_find_free_temporary(c);
570 emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
571 dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
572 inst->U.I.SrcReg[0],
573 builtin_one,
574 builtin_zero);
575
576 /* Either both are zero, or one of them is one and the other is zero. */
577 /* result = tmp0 - tmp1 */
578 emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
579 inst->U.I.DstReg,
580 srcreg(RC_FILE_TEMPORARY, dst0.Index),
581 negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
582
583 rc_remove_instruction(inst);
584 }
585
586 static void transform_SUB(struct radeon_compiler* c,
587 struct rc_instruction* inst)
588 {
589 inst->U.I.Opcode = RC_OPCODE_ADD;
590 inst->U.I.SrcReg[1] = negate(inst->U.I.SrcReg[1]);
591 }
592
593 static void transform_SWZ(struct radeon_compiler* c,
594 struct rc_instruction* inst)
595 {
596 inst->U.I.Opcode = RC_OPCODE_MOV;
597 }
598
599 static void transform_XPD(struct radeon_compiler* c,
600 struct rc_instruction* inst)
601 {
602 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
603
604 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
605 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
606 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
607 emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode, inst->U.I.DstReg,
608 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
609 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
610 negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
611
612 rc_remove_instruction(inst);
613 }
614
615
616 /**
617 * Can be used as a transformation for @ref radeonClauseLocalTransform,
618 * no userData necessary.
619 *
620 * Eliminates the following ALU instructions:
621 * ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
622 * using:
623 * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
624 *
625 * Transforms RSQ to Radeon's native RSQ by explicitly setting
626 * absolute value.
627 *
628 * @note should be applicable to R300 and R500 fragment programs.
629 */
630 int radeonTransformALU(
631 struct radeon_compiler * c,
632 struct rc_instruction* inst,
633 void* unused)
634 {
635 switch(inst->U.I.Opcode) {
636 case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
637 case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
638 case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
639 case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
640 case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
641 case RC_OPCODE_DST: transform_DST(c, inst); return 1;
642 case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
643 case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
644 case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
645 case RC_OPCODE_POW: transform_POW(c, inst); return 1;
646 case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1;
647 case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
648 case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
649 case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
650 case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;
651 case RC_OPCODE_SGT: transform_SGT(c, inst); return 1;
652 case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
653 case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
654 case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
655 case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
656 case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
657 case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
658 case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
659 default:
660 return 0;
661 }
662 }
663
664
665 static void transform_r300_vertex_ABS(struct radeon_compiler* c,
666 struct rc_instruction* inst)
667 {
668 /* Note: r500 can take absolute values, but r300 cannot. */
669 inst->U.I.Opcode = RC_OPCODE_MAX;
670 inst->U.I.SrcReg[1] = inst->U.I.SrcReg[0];
671 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
672 }
673
674 static void transform_r300_vertex_CMP(struct radeon_compiler* c,
675 struct rc_instruction* inst)
676 {
677 /* There is no decent CMP available, so let's rig one up.
678 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
679 * The following sequence consumes zero to two temps and two extra slots
680 * (the second temp and the second slot is consumed by transform_LRP),
681 * but should be equivalent:
682 *
683 * SLT tmp0, src0, 0.0
684 * LRP dst, tmp0, src1, src2
685 *
686 * Yes, I know, I'm a mad scientist. ~ C. & M. */
687 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
688
689 /* SLT tmp0, src0, 0.0 */
690 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
691 dst,
692 inst->U.I.SrcReg[0], builtin_zero);
693
694 /* LRP dst, tmp0, src1, src2 */
695 transform_LRP(c,
696 emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
697 inst->U.I.DstReg,
698 srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]));
699
700 rc_remove_instruction(inst);
701 }
702
703 static void transform_r300_vertex_DP2(struct radeon_compiler* c,
704 struct rc_instruction* inst)
705 {
706 struct rc_instruction *next_inst = inst->Next;
707 transform_DP2(c, inst);
708 next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4;
709 }
710
711 static void transform_r300_vertex_DP3(struct radeon_compiler* c,
712 struct rc_instruction* inst)
713 {
714 struct rc_src_register src0 = inst->U.I.SrcReg[0];
715 struct rc_src_register src1 = inst->U.I.SrcReg[1];
716 src0.Negate &= ~RC_MASK_W;
717 src0.Swizzle &= ~(7 << (3 * 3));
718 src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
719 src1.Negate &= ~RC_MASK_W;
720 src1.Swizzle &= ~(7 << (3 * 3));
721 src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
722 emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
723 rc_remove_instruction(inst);
724 }
725
726 static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
727 struct rc_instruction* inst)
728 {
729 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
730 unsigned constant_swizzle;
731 int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
732 0.0000000000000000001,
733 &constant_swizzle);
734
735 /* MOV dst, src */
736 dst.WriteMask = RC_MASK_XYZW;
737 emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
738 dst,
739 inst->U.I.SrcReg[0]);
740
741 /* MAX dst.y, src, 0.00...001 */
742 emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
743 dstregtmpmask(dst.Index, RC_MASK_Y),
744 srcreg(RC_FILE_TEMPORARY, dst.Index),
745 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
746
747 inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
748 }
749
750 static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
751 struct rc_instruction *inst)
752 {
753 /* x = y <==> x >= y && y >= x */
754 int tmp = rc_find_free_temporary(c);
755
756 /* x <= y */
757 emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
758 dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
759 inst->U.I.SrcReg[0],
760 inst->U.I.SrcReg[1]);
761
762 /* y <= x */
763 emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
764 inst->U.I.DstReg,
765 inst->U.I.SrcReg[1],
766 inst->U.I.SrcReg[0]);
767
768 /* x && y = x * y */
769 emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
770 inst->U.I.DstReg,
771 srcreg(RC_FILE_TEMPORARY, tmp),
772 srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
773
774 rc_remove_instruction(inst);
775 }
776
777 static void transform_r300_vertex_SNE(struct radeon_compiler *c,
778 struct rc_instruction *inst)
779 {
780 /* x != y <==> x < y || y < x */
781 int tmp = rc_find_free_temporary(c);
782
783 /* x < y */
784 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
785 dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
786 inst->U.I.SrcReg[0],
787 inst->U.I.SrcReg[1]);
788
789 /* y < x */
790 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
791 inst->U.I.DstReg,
792 inst->U.I.SrcReg[1],
793 inst->U.I.SrcReg[0]);
794
795 /* x || y = max(x, y) */
796 emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
797 inst->U.I.DstReg,
798 srcreg(RC_FILE_TEMPORARY, tmp),
799 srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
800
801 rc_remove_instruction(inst);
802 }
803
804 static void transform_r300_vertex_SGT(struct radeon_compiler* c,
805 struct rc_instruction* inst)
806 {
807 /* x > y <==> -x < -y */
808 inst->U.I.Opcode = RC_OPCODE_SLT;
809 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
810 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
811 }
812
813 static void transform_r300_vertex_SLE(struct radeon_compiler* c,
814 struct rc_instruction* inst)
815 {
816 /* x <= y <==> -x >= -y */
817 inst->U.I.Opcode = RC_OPCODE_SGE;
818 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
819 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
820 }
821
822 static void transform_r300_vertex_SSG(struct radeon_compiler* c,
823 struct rc_instruction* inst)
824 {
825 /* result = sign(x)
826 *
827 * SLT tmp0, 0, x;
828 * SLT tmp1, x, 0;
829 * ADD result, tmp0, -tmp1;
830 */
831 struct rc_dst_register dst0 = try_to_reuse_dst(c, inst);
832 unsigned tmp1;
833
834 /* 0 < x */
835 dst0 = try_to_reuse_dst(c, inst);
836 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
837 dst0,
838 builtin_zero,
839 inst->U.I.SrcReg[0]);
840
841 /* x < 0 */
842 tmp1 = rc_find_free_temporary(c);
843 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
844 dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
845 inst->U.I.SrcReg[0],
846 builtin_zero);
847
848 /* Either both are zero, or one of them is one and the other is zero. */
849 /* result = tmp0 - tmp1 */
850 emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
851 inst->U.I.DstReg,
852 srcreg(RC_FILE_TEMPORARY, dst0.Index),
853 negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
854
855 rc_remove_instruction(inst);
856 }
857
858 /**
859 * For use with rc_local_transform, this transforms non-native ALU
860 * instructions of the r300 up to r500 vertex engine.
861 */
862 int r300_transform_vertex_alu(
863 struct radeon_compiler * c,
864 struct rc_instruction* inst,
865 void* unused)
866 {
867 switch(inst->U.I.Opcode) {
868 case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
869 case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
870 case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
871 case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
872 case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
873 case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
874 case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
875 case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
876 case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;
877 case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
878 case RC_OPCODE_SEQ:
879 if (!c->is_r500) {
880 transform_r300_vertex_SEQ(c, inst);
881 return 1;
882 }
883 return 0;
884 case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
885 case RC_OPCODE_SGT: transform_r300_vertex_SGT(c, inst); return 1;
886 case RC_OPCODE_SLE: transform_r300_vertex_SLE(c, inst); return 1;
887 case RC_OPCODE_SNE:
888 if (!c->is_r500) {
889 transform_r300_vertex_SNE(c, inst);
890 return 1;
891 }
892 return 0;
893 case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
894 case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
895 case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
896 case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
897 default:
898 return 0;
899 }
900 }
901
902 static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
903 {
904 static const float SinCosConsts[2][4] = {
905 {
906 1.273239545, /* 4/PI */
907 -0.405284735, /* -4/(PI*PI) */
908 3.141592654, /* PI */
909 0.2225 /* weight */
910 },
911 {
912 0.75,
913 0.5,
914 0.159154943, /* 1/(2*PI) */
915 6.283185307 /* 2*PI */
916 }
917 };
918 int i;
919
920 for(i = 0; i < 2; ++i)
921 constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);
922 }
923
924 /**
925 * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
926 *
927 * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
928 * MAD tmp.x, tmp.y, |src|, tmp.x
929 * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
930 * MAD dest, tmp.y, weight, tmp.x
931 */
932 static void sin_approx(
933 struct radeon_compiler* c, struct rc_instruction * inst,
934 struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
935 {
936 unsigned int tempreg = rc_find_free_temporary(c);
937
938 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),
939 swizzle_xxxx(src),
940 srcreg(RC_FILE_CONSTANT, constants[0]));
941 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),
942 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
943 absolute(swizzle_xxxx(src)),
944 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
945 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),
946 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
947 absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),
948 negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));
949 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst,
950 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
951 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),
952 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
953 }
954
955 /**
956 * Translate the trigonometric functions COS, SIN, and SCS
957 * using only the basic instructions
958 * MOV, ADD, MUL, MAD, FRC
959 */
960 int r300_transform_trig_simple(struct radeon_compiler* c,
961 struct rc_instruction* inst,
962 void* unused)
963 {
964 unsigned int constants[2];
965 unsigned int tempreg;
966
967 if (inst->U.I.Opcode != RC_OPCODE_COS &&
968 inst->U.I.Opcode != RC_OPCODE_SIN &&
969 inst->U.I.Opcode != RC_OPCODE_SCS)
970 return 0;
971
972 tempreg = rc_find_free_temporary(c);
973
974 sincos_constants(c, constants);
975
976 if (inst->U.I.Opcode == RC_OPCODE_COS) {
977 /* MAD tmp.x, src, 1/(2*PI), 0.75 */
978 /* FRC tmp.x, tmp.x */
979 /* MAD tmp.z, tmp.x, 2*PI, -PI */
980 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
981 swizzle_xxxx(inst->U.I.SrcReg[0]),
982 swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
983 swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));
984 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
985 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
986 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
987 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
988 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
989 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
990
991 sin_approx(c, inst, inst->U.I.DstReg,
992 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
993 constants);
994 } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
995 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
996 swizzle_xxxx(inst->U.I.SrcReg[0]),
997 swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
998 swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));
999 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
1000 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
1001 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1002 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1003 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1004 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1005
1006 sin_approx(c, inst, inst->U.I.DstReg,
1007 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1008 constants);
1009 } else {
1010 struct rc_dst_register dst;
1011
1012 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1013 swizzle_xxxx(inst->U.I.SrcReg[0]),
1014 swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1015 swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
1016 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1017 srcreg(RC_FILE_TEMPORARY, tempreg));
1018 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1019 srcreg(RC_FILE_TEMPORARY, tempreg),
1020 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1021 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1022
1023 dst = inst->U.I.DstReg;
1024
1025 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
1026 sin_approx(c, inst, dst,
1027 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
1028 constants);
1029
1030 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
1031 sin_approx(c, inst, dst,
1032 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
1033 constants);
1034 }
1035
1036 rc_remove_instruction(inst);
1037
1038 return 1;
1039 }
1040
1041 static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,
1042 struct rc_instruction *inst,
1043 unsigned srctmp)
1044 {
1045 if (inst->U.I.Opcode == RC_OPCODE_COS) {
1046 emit1(c, inst->Prev, RC_OPCODE_COS, inst->U.I.SaturateMode, inst->U.I.DstReg,
1047 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1048 } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1049 emit1(c, inst->Prev, RC_OPCODE_SIN, inst->U.I.SaturateMode,
1050 inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1051 } else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
1052 struct rc_dst_register moddst = inst->U.I.DstReg;
1053
1054 if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
1055 moddst.WriteMask = RC_MASK_X;
1056 emit1(c, inst->Prev, RC_OPCODE_COS, inst->U.I.SaturateMode, moddst,
1057 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1058 }
1059 if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
1060 moddst.WriteMask = RC_MASK_Y;
1061 emit1(c, inst->Prev, RC_OPCODE_SIN, inst->U.I.SaturateMode, moddst,
1062 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1063 }
1064 }
1065
1066 rc_remove_instruction(inst);
1067 }
1068
1069
1070 /**
1071 * Transform the trigonometric functions COS, SIN, and SCS
1072 * to include pre-scaling by 1/(2*PI) and taking the fractional
1073 * part, so that the input to COS and SIN is always in the range [0,1).
1074 * SCS is replaced by one COS and one SIN instruction.
1075 *
1076 * @warning This transformation implicitly changes the semantics of SIN and COS!
1077 */
1078 int radeonTransformTrigScale(struct radeon_compiler* c,
1079 struct rc_instruction* inst,
1080 void* unused)
1081 {
1082 static const float RCP_2PI = 0.15915494309189535;
1083 unsigned int temp;
1084 unsigned int constant;
1085 unsigned int constant_swizzle;
1086
1087 if (inst->U.I.Opcode != RC_OPCODE_COS &&
1088 inst->U.I.Opcode != RC_OPCODE_SIN &&
1089 inst->U.I.Opcode != RC_OPCODE_SCS)
1090 return 0;
1091
1092 temp = rc_find_free_temporary(c);
1093 constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
1094
1095 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(temp, RC_MASK_W),
1096 swizzle_xxxx(inst->U.I.SrcReg[0]),
1097 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
1098 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1099 srcreg(RC_FILE_TEMPORARY, temp));
1100
1101 r300_transform_SIN_COS_SCS(c, inst, temp);
1102 return 1;
1103 }
1104
1105 /**
1106 * Transform the trigonometric functions COS, SIN, and SCS
1107 * so that the input to COS and SIN is always in the range [-PI, PI].
1108 * SCS is replaced by one COS and one SIN instruction.
1109 */
1110 int r300_transform_trig_scale_vertex(struct radeon_compiler *c,
1111 struct rc_instruction *inst,
1112 void *unused)
1113 {
1114 static const float cons[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};
1115 unsigned int temp;
1116 unsigned int constant;
1117
1118 if (inst->U.I.Opcode != RC_OPCODE_COS &&
1119 inst->U.I.Opcode != RC_OPCODE_SIN &&
1120 inst->U.I.Opcode != RC_OPCODE_SCS)
1121 return 0;
1122
1123 /* Repeat x in the range [-PI, PI]:
1124 *
1125 * repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI
1126 */
1127
1128 temp = rc_find_free_temporary(c);
1129 constant = rc_constants_add_immediate_vec4(&c->Program.Constants, cons);
1130
1131 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1132 swizzle_xxxx(inst->U.I.SrcReg[0]),
1133 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_XXXX),
1134 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_YYYY));
1135 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1136 srcreg(RC_FILE_TEMPORARY, temp));
1137 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1138 srcreg(RC_FILE_TEMPORARY, temp),
1139 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_ZZZZ),
1140 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_WWWW));
1141
1142 r300_transform_SIN_COS_SCS(c, inst, temp);
1143 return 1;
1144 }
1145
1146 /**
1147 * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
1148 * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
1149 * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
1150 *
1151 * @warning This explicitly changes the form of DDX and DDY!
1152 */
1153
1154 int radeonTransformDeriv(struct radeon_compiler* c,
1155 struct rc_instruction* inst,
1156 void* unused)
1157 {
1158 if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)
1159 return 0;
1160
1161 inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_1111;
1162 inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
1163
1164 return 1;
1165 }
1166
1167 /**
1168 * IF Temp[0].x -> IF Temp[0].x
1169 * ... -> ...
1170 * KILP -> KIL -abs(Temp[0].x)
1171 * ... -> ...
1172 * ENDIF -> ENDIF
1173 *
1174 * === OR ===
1175 *
1176 * IF Temp[0].x -\
1177 * KILP - > KIL -abs(Temp[0].x)
1178 * ENDIF -/
1179 *
1180 * === OR ===
1181 *
1182 * IF Temp[0].x -> IF Temp[0].x
1183 * ... -> ...
1184 * ELSE -> ELSE
1185 * ... -> ...
1186 * KILP -> KIL -abs(Temp[0].x)
1187 * ... -> ...
1188 * ENDIF -> ENDIF
1189 *
1190 * === OR ===
1191 *
1192 * KILP -> KIL -none.1111
1193 *
1194 * This needs to be done in its own pass, because it might modify the
1195 * instructions before and after KILP.
1196 */
1197 void rc_transform_KILP(struct radeon_compiler * c, void *user)
1198 {
1199 struct rc_instruction * inst;
1200 for (inst = c->Program.Instructions.Next;
1201 inst != &c->Program.Instructions; inst = inst->Next) {
1202 struct rc_instruction * if_inst;
1203 unsigned in_if = 0;
1204
1205 if (inst->U.I.Opcode != RC_OPCODE_KILP)
1206 continue;
1207
1208 for (if_inst = inst->Prev; if_inst != &c->Program.Instructions;
1209 if_inst = if_inst->Prev) {
1210
1211 if (if_inst->U.I.Opcode == RC_OPCODE_IF) {
1212 in_if = 1;
1213 break;
1214 }
1215 }
1216
1217 inst->U.I.Opcode = RC_OPCODE_KIL;
1218
1219 if (!in_if) {
1220 inst->U.I.SrcReg[0] = negate(builtin_one);
1221 } else {
1222 /* This should work even if the KILP is inside the ELSE
1223 * block, because -0.0 is considered negative. */
1224 inst->U.I.SrcReg[0] =
1225 negate(absolute(if_inst->U.I.SrcReg[0]));
1226
1227 if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
1228 && inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
1229
1230 /* Optimize the special case:
1231 * IF Temp[0].x
1232 * KILP
1233 * ENDIF
1234 */
1235
1236 /* Remove IF */
1237 rc_remove_instruction(inst->Prev);
1238 /* Remove ENDIF */
1239 rc_remove_instruction(inst->Next);
1240 }
1241 }
1242 }
1243 }