r300/compiler: Clear empty registers after constant folding
[mesa.git] / src / mesa / drivers / dri / r300 / compiler / radeon_optimize.c
1 /*
2 * Copyright (C) 2009 Nicolai Haehnle.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28 #include "radeon_dataflow.h"
29
30 #include "radeon_compiler.h"
31 #include "radeon_swizzle.h"
32
/**
 * Bookkeeping shared by the presubtract peephole passes while they walk the
 * instructions that follow a candidate instruction.
 */
struct peephole_state {
	/** The candidate instruction whose result the pass tries to replace. */
	struct rc_instruction * Inst;

	/**
	 * Bitmask of the destination components of Inst that are still
	 * "alive", i.e. that no instruction scanned after Inst has written to.
	 */
	unsigned int WriteMask;
};
40
/**
 * Callback invoked by presub_helper() to rewrite one source operand of a
 * reader. Arguments, in order: the peephole state, the instruction being
 * rewritten, and the index of the source register to replace.
 */
typedef void (*rc_presub_replace_fn)(
	struct peephole_state *,
	struct rc_instruction *,
	unsigned int);
44
45 static struct rc_src_register chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
46 {
47 struct rc_src_register combine;
48 combine.File = inner.File;
49 combine.Index = inner.Index;
50 combine.RelAddr = inner.RelAddr;
51 if (outer.Abs) {
52 combine.Abs = 1;
53 combine.Negate = outer.Negate;
54 } else {
55 combine.Abs = inner.Abs;
56 combine.Negate = 0;
57 for(unsigned int chan = 0; chan < 4; ++chan) {
58 unsigned int swz = GET_SWZ(outer.Swizzle, chan);
59 if (swz < 4)
60 combine.Negate |= GET_BIT(inner.Negate, swz) << chan;
61 }
62 combine.Negate ^= outer.Negate;
63 }
64 combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
65 return combine;
66 }
67
/**
 * State threaded through the read/write scan callbacks while copy_propagate()
 * decides whether a MOV can be removed.
 */
struct copy_propagate_state {
	/** Compiler that owns the program being optimized. */
	struct radeon_compiler * C;

	/** The MOV instruction that is a candidate for removal. */
	struct rc_instruction * Mov;

	/** Set as soon as some reader cannot be redirected to Mov's source. */
	unsigned int Conflict:1;

	/** Set once Mov's source has been overwritten after the Mov. */
	unsigned int SourceClobbered:1;

	/** Components of Mov's destination that still hold the value written by Mov. */
	unsigned int MovMask:4;

	/** Components of Mov's destination that are known *not* to come from Mov. */
	unsigned int DefinedMask:4;

	/** Components of Mov's source register that Mov actually reads. */
	unsigned int SourcedMask:4;

	/** Branch depth relative to Mov; a negative value means Mov's block has been left. */
	int BranchDepth;
};
88
89 /**
90 * This is a callback function that is meant to be passed to
91 * rc_for_all_reads_mask. This function will be called once for each source
92 * register in inst.
93 * @param inst The instruction that the source register belongs to.
94 * @param file The register file of the source register.
95 * @param index The index of the source register.
96 * @param mask The components of the source register that are being read from.
97 */
98 static void copy_propagate_scan_read(void * data, struct rc_instruction * inst,
99 rc_register_file file, unsigned int index, unsigned int mask)
100 {
101 struct copy_propagate_state * s = data;
102
103 /* XXX This could probably be handled better. */
104 if (file == RC_FILE_ADDRESS) {
105 s->Conflict = 1;
106 return;
107 }
108
109 if (file != RC_FILE_TEMPORARY || index != s->Mov->U.I.DstReg.Index)
110 return;
111
112 /* These instructions cannot read from the constants file.
113 * see radeonTransformTEX()
114 */
115 if(s->Mov->U.I.SrcReg[0].File != RC_FILE_TEMPORARY &&
116 s->Mov->U.I.SrcReg[0].File != RC_FILE_INPUT &&
117 (inst->U.I.Opcode == RC_OPCODE_TEX ||
118 inst->U.I.Opcode == RC_OPCODE_TXB ||
119 inst->U.I.Opcode == RC_OPCODE_TXP ||
120 inst->U.I.Opcode == RC_OPCODE_KIL)){
121 s->Conflict = 1;
122 return;
123 }
124 if ((mask & s->MovMask) == mask) {
125 if (s->SourceClobbered) {
126 s->Conflict = 1;
127 }
128 } else if ((mask & s->DefinedMask) == mask) {
129 /* read from something entirely written by other instruction: this is okay */
130 } else {
131 /* read from component combination that is not well-defined without
132 * the MOV: cannot remove it */
133 s->Conflict = 1;
134 }
135 }
136
137 static void copy_propagate_scan_write(void * data, struct rc_instruction * inst,
138 rc_register_file file, unsigned int index, unsigned int mask)
139 {
140 struct copy_propagate_state * s = data;
141
142 if (s->BranchDepth < 0)
143 return;
144
145 if (file == s->Mov->U.I.DstReg.File && index == s->Mov->U.I.DstReg.Index) {
146 s->MovMask &= ~mask;
147 if (s->BranchDepth == 0)
148 s->DefinedMask |= mask;
149 else
150 s->DefinedMask &= ~mask;
151 }
152 if (file == s->Mov->U.I.SrcReg[0].File && index == s->Mov->U.I.SrcReg[0].Index) {
153 if (mask & s->SourcedMask)
154 s->SourceClobbered = 1;
155 } else if (s->Mov->U.I.SrcReg[0].RelAddr && file == RC_FILE_ADDRESS) {
156 s->SourceClobbered = 1;
157 }
158 }
159
160 static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * inst_mov)
161 {
162 struct copy_propagate_state s;
163
164 if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY ||
165 inst_mov->U.I.DstReg.RelAddr ||
166 inst_mov->U.I.WriteALUResult ||
167 inst_mov->U.I.SaturateMode)
168 return;
169
170 memset(&s, 0, sizeof(s));
171 s.C = c;
172 s.Mov = inst_mov;
173 s.MovMask = inst_mov->U.I.DstReg.WriteMask;
174 s.DefinedMask = RC_MASK_XYZW & ~s.MovMask;
175
176 for(unsigned int chan = 0; chan < 4; ++chan) {
177 unsigned int swz = GET_SWZ(inst_mov->U.I.SrcReg[0].Swizzle, chan);
178 s.SourcedMask |= (1 << swz) & RC_MASK_XYZW;
179 }
180
181 /* 1st pass: Check whether all subsequent readers can be changed */
182 for(struct rc_instruction * inst = inst_mov->Next;
183 inst != &c->Program.Instructions;
184 inst = inst->Next) {
185 const struct rc_opcode_info * info = rc_get_opcode_info(inst->U.I.Opcode);
186 /* XXX In the future we might be able to make the optimizer
187 * smart enough to handle loops. */
188 if(inst->U.I.Opcode == RC_OPCODE_BGNLOOP
189 || inst->U.I.Opcode == RC_OPCODE_ENDLOOP){
190 return;
191 }
192
193 /* It is possible to do copy propigation in this situation,
194 * just not right now, see peephole_add_presub_inv() */
195 if (inst_mov->U.I.PreSub.Opcode != RC_PRESUB_NONE &&
196 (info->NumSrcRegs > 2 || info->HasTexture)) {
197 return;
198 }
199
200 rc_for_all_reads_mask(inst, copy_propagate_scan_read, &s);
201 rc_for_all_writes_mask(inst, copy_propagate_scan_write, &s);
202 if (s.Conflict)
203 return;
204
205 if (s.BranchDepth >= 0) {
206 if (inst->U.I.Opcode == RC_OPCODE_IF) {
207 s.BranchDepth++;
208 } else if (inst->U.I.Opcode == RC_OPCODE_ENDIF
209 || inst->U.I.Opcode == RC_OPCODE_ELSE) {
210 s.BranchDepth--;
211 if (s.BranchDepth < 0) {
212 s.DefinedMask &= ~s.MovMask;
213 s.MovMask = 0;
214 }
215 }
216 }
217 }
218
219 if (s.Conflict)
220 return;
221
222 /* 2nd pass: We can satisfy all readers, so switch them over all at once */
223 s.MovMask = inst_mov->U.I.DstReg.WriteMask;
224 s.BranchDepth = 0;
225
226 for(struct rc_instruction * inst = inst_mov->Next;
227 inst != &c->Program.Instructions;
228 inst = inst->Next) {
229 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
230 for(unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
231 if (inst->U.I.SrcReg[src].File == RC_FILE_TEMPORARY &&
232 inst->U.I.SrcReg[src].Index == s.Mov->U.I.DstReg.Index) {
233 unsigned int refmask = 0;
234
235 for(unsigned int chan = 0; chan < 4; ++chan) {
236 unsigned int swz = GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan);
237 refmask |= (1 << swz) & RC_MASK_XYZW;
238 }
239
240 if ((refmask & s.MovMask) == refmask) {
241 inst->U.I.SrcReg[src] = chain_srcregs(inst->U.I.SrcReg[src], s.Mov->U.I.SrcReg[0]);
242 if (s.Mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
243 inst->U.I.PreSub = s.Mov->U.I.PreSub;
244 }
245 }
246 }
247
248 if (opcode->HasDstReg) {
249 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY &&
250 inst->U.I.DstReg.Index == s.Mov->U.I.DstReg.Index) {
251 s.MovMask &= ~inst->U.I.DstReg.WriteMask;
252 }
253 }
254
255 if (s.BranchDepth >= 0) {
256 if (inst->U.I.Opcode == RC_OPCODE_IF) {
257 s.BranchDepth++;
258 } else if (inst->U.I.Opcode == RC_OPCODE_ENDIF
259 || inst->U.I.Opcode == RC_OPCODE_ELSE) {
260 s.BranchDepth--;
261 if (s.BranchDepth < 0)
262 break; /* no more readers after this point */
263 }
264 }
265 }
266
267 /* Finally, remove the original MOV instruction */
268 rc_remove_instruction(inst_mov);
269 }
270
271 /**
272 * Check if a source register is actually always the same
273 * swizzle constant.
274 */
275 static int is_src_uniform_constant(struct rc_src_register src,
276 rc_swizzle * pswz, unsigned int * pnegate)
277 {
278 int have_used = 0;
279
280 if (src.File != RC_FILE_NONE) {
281 *pswz = 0;
282 return 0;
283 }
284
285 for(unsigned int chan = 0; chan < 4; ++chan) {
286 unsigned int swz = GET_SWZ(src.Swizzle, chan);
287 if (swz < 4) {
288 *pswz = 0;
289 return 0;
290 }
291 if (swz == RC_SWIZZLE_UNUSED)
292 continue;
293
294 if (!have_used) {
295 *pswz = swz;
296 *pnegate = GET_BIT(src.Negate, chan);
297 have_used = 1;
298 } else {
299 if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
300 *pswz = 0;
301 return 0;
302 }
303 }
304 }
305
306 return 1;
307 }
308
309 static void constant_folding_mad(struct rc_instruction * inst)
310 {
311 rc_swizzle swz;
312 unsigned int negate;
313
314 if (is_src_uniform_constant(inst->U.I.SrcReg[2], &swz, &negate)) {
315 if (swz == RC_SWIZZLE_ZERO) {
316 inst->U.I.Opcode = RC_OPCODE_MUL;
317 return;
318 }
319 }
320
321 if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
322 if (swz == RC_SWIZZLE_ONE) {
323 inst->U.I.Opcode = RC_OPCODE_ADD;
324 if (negate)
325 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
326 inst->U.I.SrcReg[1] = inst->U.I.SrcReg[2];
327 return;
328 } else if (swz == RC_SWIZZLE_ZERO) {
329 inst->U.I.Opcode = RC_OPCODE_MOV;
330 inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
331 return;
332 }
333 }
334
335 if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
336 if (swz == RC_SWIZZLE_ONE) {
337 inst->U.I.Opcode = RC_OPCODE_ADD;
338 if (negate)
339 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
340 inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
341 return;
342 } else if (swz == RC_SWIZZLE_ZERO) {
343 inst->U.I.Opcode = RC_OPCODE_MOV;
344 inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
345 return;
346 }
347 }
348 }
349
350 static void constant_folding_mul(struct rc_instruction * inst)
351 {
352 rc_swizzle swz;
353 unsigned int negate;
354
355 if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
356 if (swz == RC_SWIZZLE_ONE) {
357 inst->U.I.Opcode = RC_OPCODE_MOV;
358 inst->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
359 if (negate)
360 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
361 return;
362 } else if (swz == RC_SWIZZLE_ZERO) {
363 inst->U.I.Opcode = RC_OPCODE_MOV;
364 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
365 return;
366 }
367 }
368
369 if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
370 if (swz == RC_SWIZZLE_ONE) {
371 inst->U.I.Opcode = RC_OPCODE_MOV;
372 if (negate)
373 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
374 return;
375 } else if (swz == RC_SWIZZLE_ZERO) {
376 inst->U.I.Opcode = RC_OPCODE_MOV;
377 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
378 return;
379 }
380 }
381 }
382
383 static void constant_folding_add(struct rc_instruction * inst)
384 {
385 rc_swizzle swz;
386 unsigned int negate;
387
388 if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
389 if (swz == RC_SWIZZLE_ZERO) {
390 inst->U.I.Opcode = RC_OPCODE_MOV;
391 inst->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
392 return;
393 }
394 }
395
396 if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
397 if (swz == RC_SWIZZLE_ZERO) {
398 inst->U.I.Opcode = RC_OPCODE_MOV;
399 return;
400 }
401 }
402 }
403
404 /**
405 * Replace 0.0, 1.0 and 0.5 immediate constants by their
406 * respective swizzles. Simplify instructions like ADD dst, src, 0;
407 */
408 static void constant_folding(struct radeon_compiler * c, struct rc_instruction * inst)
409 {
410 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
411 unsigned int i;
412
413 /* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
414 for(unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
415 struct rc_constant * constant;
416 struct rc_src_register newsrc;
417 int have_real_reference;
418
419 if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT ||
420 inst->U.I.SrcReg[src].RelAddr ||
421 inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
422 continue;
423
424 constant =
425 &c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];
426
427 if (constant->Type != RC_CONSTANT_IMMEDIATE)
428 continue;
429
430 newsrc = inst->U.I.SrcReg[src];
431 have_real_reference = 0;
432 for(unsigned int chan = 0; chan < 4; ++chan) {
433 unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
434 unsigned int newswz;
435 float imm;
436 float baseimm;
437
438 if (swz >= 4)
439 continue;
440
441 imm = constant->u.Immediate[swz];
442 baseimm = imm;
443 if (imm < 0.0)
444 baseimm = -baseimm;
445
446 if (baseimm == 0.0) {
447 newswz = RC_SWIZZLE_ZERO;
448 } else if (baseimm == 1.0) {
449 newswz = RC_SWIZZLE_ONE;
450 } else if (baseimm == 0.5 && c->has_half_swizzles) {
451 newswz = RC_SWIZZLE_HALF;
452 } else {
453 have_real_reference = 1;
454 continue;
455 }
456
457 SET_SWZ(newsrc.Swizzle, chan, newswz);
458 if (imm < 0.0 && !newsrc.Abs)
459 newsrc.Negate ^= 1 << chan;
460 }
461
462 if (!have_real_reference) {
463 newsrc.File = RC_FILE_NONE;
464 newsrc.Index = 0;
465 }
466
467 /* don't make the swizzle worse */
468 if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc) &&
469 c->SwizzleCaps->IsNative(inst->U.I.Opcode, inst->U.I.SrcReg[src]))
470 continue;
471
472 inst->U.I.SrcReg[src] = newsrc;
473 }
474
475 /* Simplify instructions based on constants */
476 if (inst->U.I.Opcode == RC_OPCODE_MAD)
477 constant_folding_mad(inst);
478
479 /* note: MAD can simplify to MUL or ADD */
480 if (inst->U.I.Opcode == RC_OPCODE_MUL)
481 constant_folding_mul(inst);
482 else if (inst->U.I.Opcode == RC_OPCODE_ADD)
483 constant_folding_add(inst);
484
485 /* In case this instruction has been converted, make sure all of the
486 * registers that are no longer used are empty. */
487 opcode = rc_get_opcode_info(inst->U.I.Opcode);
488 for(i = opcode->NumSrcRegs; i < 3; i++) {
489 memset(&inst->U.I.SrcReg[i], 0, sizeof(struct rc_src_register));
490 }
491 }
492
493 /**
494 * If src and dst use the same register, this function returns a writemask that
495 * indicates wich components are read by src. Otherwise zero is returned.
496 */
497 static unsigned int src_reads_dst_mask(struct rc_src_register src,
498 struct rc_dst_register dst)
499 {
500 unsigned int mask = 0;
501 unsigned int i;
502 if (dst.File != src.File || dst.Index != src.Index) {
503 return 0;
504 }
505
506 for(i = 0; i < 4; i++) {
507 mask |= 1 << GET_SWZ(src.Swizzle, i);
508 }
509 mask &= RC_MASK_XYZW;
510
511 return mask;
512 }
513
514 /* Return 1 if the source registers has a constant swizzle (e.g. 0, 0.5, 1.0)
515 * in any of its channels. Return 0 otherwise. */
516 static int src_has_const_swz(struct rc_src_register src) {
517 int chan;
518 for(chan = 0; chan < 4; chan++) {
519 unsigned int swz = GET_SWZ(src.Swizzle, chan);
520 if (swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF
521 || swz == RC_SWIZZLE_ONE) {
522 return 1;
523 }
524 }
525 return 0;
526 }
527
528 static void peephole_scan_write(void * data, struct rc_instruction * inst,
529 rc_register_file file, unsigned int index, unsigned int mask)
530 {
531 struct peephole_state * s = data;
532 if(s->Inst->U.I.DstReg.File == file
533 && s->Inst->U.I.DstReg.Index == index) {
534 unsigned int common_mask = s->WriteMask & mask;
535 s->WriteMask &= ~common_mask;
536 }
537 }
538
539 static int presub_helper(
540 struct radeon_compiler * c,
541 struct peephole_state * s,
542 rc_presubtract_op presub_opcode,
543 rc_presub_replace_fn presub_replace)
544 {
545 struct rc_instruction * inst;
546 unsigned int can_remove = 0;
547 unsigned int cant_sub = 0;
548
549 for(inst = s->Inst->Next; inst != &c->Program.Instructions;
550 inst = inst->Next) {
551 unsigned int i;
552 unsigned char can_use_presub = 1;
553 const struct rc_opcode_info * info =
554 rc_get_opcode_info(inst->U.I.Opcode);
555 /* XXX: There are some situations where instructions
556 * with more than 2 src registers can use the
557 * presubtract select, but to keep things simple we
558 * will disable presubtract on these instructions for
559 * now. */
560 if (info->NumSrcRegs > 2 || info->HasTexture) {
561 can_use_presub = 0;
562 }
563
564 /* We can't use more than one presubtract value in an
565 * instruction, unless the two prsubtract operations
566 * are the same and read from the same registers. */
567 if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE) {
568 if (inst->U.I.PreSub.Opcode != presub_opcode
569 || inst->U.I.PreSub.SrcReg[0].File !=
570 s->Inst->U.I.SrcReg[1].File
571 || inst->U.I.PreSub.SrcReg[0].Index !=
572 s->Inst->U.I.SrcReg[1].Index) {
573 can_use_presub = 0;
574 }
575 }
576
577 /* Even if the instruction can't use a presubtract operation
578 * we still need to check if the instruction reads from
579 * s->Inst->U.I.DstReg, because if it does we must not
580 * remove s->Inst. */
581 for(i = 0; i < info->NumSrcRegs; i++) {
582 unsigned int mask = src_reads_dst_mask(
583 inst->U.I.SrcReg[i], s->Inst->U.I.DstReg);
584 /* XXX We could be more aggressive here using
585 * presubtract. It is okay if SrcReg[i] only reads
586 * from some of the mask components. */
587 if(s->Inst->U.I.DstReg.WriteMask != mask) {
588 if (s->Inst->U.I.DstReg.WriteMask & mask) {
589 can_remove = 0;
590 break;
591 } else {
592 continue;
593 }
594 }
595 if (cant_sub || !can_use_presub) {
596 can_remove = 0;
597 break;
598 }
599 presub_replace(s, inst, i);
600 can_remove = 1;
601 }
602 if(!can_remove)
603 break;
604 rc_for_all_writes_mask(inst, peephole_scan_write, s);
605 /* If all components of inst_add's destination register have
606 * been written to by subsequent instructions, the original
607 * value of the destination register is no longer valid and
608 * we can't keep doing substitutions. */
609 if (!s->WriteMask){
610 break;
611 }
612 /* Make this instruction doesn't write to the presubtract source. */
613 if (inst->U.I.DstReg.WriteMask &
614 src_reads_dst_mask(s->Inst->U.I.SrcReg[1],
615 inst->U.I.DstReg)
616 || src_reads_dst_mask(s->Inst->U.I.SrcReg[0],
617 inst->U.I.DstReg)
618 || info->IsFlowControl) {
619 cant_sub = 1;
620 }
621 }
622 return can_remove;
623 }
624
625 /* This function assumes that s->Inst->U.I.SrcReg[0] and
626 * s->Inst->U.I.SrcReg[1] aren't both negative. */
627 static void presub_replace_add(struct peephole_state *s,
628 struct rc_instruction * inst,
629 unsigned int src_index)
630 {
631 rc_presubtract_op presub_opcode;
632 if (s->Inst->U.I.SrcReg[1].Negate || s->Inst->U.I.SrcReg[0].Negate)
633 presub_opcode = RC_PRESUB_SUB;
634 else
635 presub_opcode = RC_PRESUB_ADD;
636
637 if (s->Inst->U.I.SrcReg[1].Negate) {
638 inst->U.I.PreSub.SrcReg[0] = s->Inst->U.I.SrcReg[1];
639 inst->U.I.PreSub.SrcReg[1] = s->Inst->U.I.SrcReg[0];
640 } else {
641 inst->U.I.PreSub.SrcReg[0] = s->Inst->U.I.SrcReg[0];
642 inst->U.I.PreSub.SrcReg[1] = s->Inst->U.I.SrcReg[1];
643 }
644 inst->U.I.PreSub.SrcReg[0].Negate = 0;
645 inst->U.I.PreSub.SrcReg[1].Negate = 0;
646 inst->U.I.PreSub.Opcode = presub_opcode;
647 inst->U.I.SrcReg[src_index] = chain_srcregs(inst->U.I.SrcReg[src_index],
648 inst->U.I.PreSub.SrcReg[0]);
649 inst->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
650 inst->U.I.SrcReg[src_index].Index = presub_opcode;
651 }
652
653 static int is_presub_candidate(struct rc_instruction * inst)
654 {
655 const struct rc_opcode_info * info = rc_get_opcode_info(inst->U.I.Opcode);
656 unsigned int i;
657
658 if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE || inst->U.I.SaturateMode)
659 return 0;
660
661 for(i = 0; i < info->NumSrcRegs; i++) {
662 if (src_reads_dst_mask(inst->U.I.SrcReg[i], inst->U.I.DstReg))
663 return 0;
664 }
665 return 1;
666 }
667
668 static int peephole_add_presub_add(
669 struct radeon_compiler * c,
670 struct rc_instruction * inst_add)
671 {
672 struct rc_src_register * src0 = NULL;
673 struct rc_src_register * src1 = NULL;
674 unsigned int i;
675 struct peephole_state s;
676
677 if (!is_presub_candidate(inst_add))
678 return 0;
679
680 if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
681 return 0;
682
683 /* src0 and src1 can't have absolute values only one can be negative and they must be all negative or all positive. */
684 for (i = 0; i < 2; i++) {
685 if (inst_add->U.I.SrcReg[i].Abs)
686 return 0;
687 if ((inst_add->U.I.SrcReg[i].Negate
688 & inst_add->U.I.DstReg.WriteMask) ==
689 inst_add->U.I.DstReg.WriteMask) {
690 src0 = &inst_add->U.I.SrcReg[i];
691 } else if (!src1) {
692 src1 = &inst_add->U.I.SrcReg[i];
693 } else {
694 src0 = &inst_add->U.I.SrcReg[i];
695 }
696 }
697
698 if (!src1)
699 return 0;
700
701 s.Inst = inst_add;
702 s.WriteMask = inst_add->U.I.DstReg.WriteMask;
703 if (presub_helper(c, &s, RC_PRESUB_ADD, presub_replace_add)) {
704 rc_remove_instruction(inst_add);
705 return 1;
706 }
707 return 0;
708 }
709
710 static void presub_replace_inv(struct peephole_state * s,
711 struct rc_instruction * inst,
712 unsigned int src_index)
713 {
714 /* We must be careful not to modify s->Inst, since it
715 * is possible it will remain part of the program.
716 * XXX Maybe pass a struct instead of a pointer for s->Inst.*/
717 inst->U.I.PreSub.SrcReg[0] = s->Inst->U.I.SrcReg[1];
718 inst->U.I.PreSub.SrcReg[0].Negate = 0;
719 inst->U.I.PreSub.Opcode = RC_PRESUB_INV;
720 inst->U.I.SrcReg[src_index] = chain_srcregs(inst->U.I.SrcReg[src_index],
721 inst->U.I.PreSub.SrcReg[0]);
722
723 inst->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
724 inst->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
725 }
726
727 /**
728 * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
729 * Use the presubtract 1 - src0 for all readers of TEMP[0]. The first source
730 * of the add instruction must have the constatnt 1 swizzle. This function
731 * does not check const registers to see if their value is 1.0, so it should
732 * be called after the constant_folding optimization.
733 * @return
734 * 0 if the ADD instruction is still part of the program.
735 * 1 if the ADD instruction is no longer part of the program.
736 */
737 static int peephole_add_presub_inv(
738 struct radeon_compiler * c,
739 struct rc_instruction * inst_add)
740 {
741 unsigned int i, swz, mask;
742 struct peephole_state s;
743
744 if (!is_presub_candidate(inst_add))
745 return 0;
746
747 mask = inst_add->U.I.DstReg.WriteMask;
748
749 /* Check if src0 is 1. */
750 /* XXX It would be nice to use is_src_uniform_constant here, but that
751 * function only works if the register's file is RC_FILE_NONE */
752 for(i = 0; i < 4; i++ ) {
753 swz = GET_SWZ(inst_add->U.I.SrcReg[0].Swizzle, i);
754 if(((1 << i) & inst_add->U.I.DstReg.WriteMask)
755 && swz != RC_SWIZZLE_ONE) {
756 return 0;
757 }
758 }
759
760 /* Check src1. */
761 if ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) !=
762 inst_add->U.I.DstReg.WriteMask
763 || inst_add->U.I.SrcReg[1].Abs
764 || (inst_add->U.I.SrcReg[1].File != RC_FILE_TEMPORARY
765 && inst_add->U.I.SrcReg[1].File != RC_FILE_CONSTANT)
766 || src_has_const_swz(inst_add->U.I.SrcReg[1])) {
767
768 return 0;
769 }
770
771 /* Setup the peephole_state information. */
772 s.Inst = inst_add;
773 s.WriteMask = inst_add->U.I.DstReg.WriteMask;
774
775 if (presub_helper(c, &s, RC_PRESUB_INV, presub_replace_inv)) {
776 rc_remove_instruction(inst_add);
777 return 1;
778 }
779 return 0;
780 }
781
782 /**
783 * @return
784 * 0 if inst is still part of the program.
785 * 1 if inst is no longer part of the program.
786 */
787 static int peephole(struct radeon_compiler * c, struct rc_instruction * inst)
788 {
789 switch(inst->U.I.Opcode){
790 case RC_OPCODE_ADD:
791 if (c->has_presub) {
792 if(peephole_add_presub_inv(c, inst))
793 return 1;
794 if(peephole_add_presub_add(c, inst))
795 return 1;
796 }
797 break;
798 default:
799 break;
800 }
801 return 0;
802 }
803
804 void rc_optimize(struct radeon_compiler * c, void *user)
805 {
806 struct rc_instruction * inst = c->Program.Instructions.Next;
807 while(inst != &c->Program.Instructions) {
808 struct rc_instruction * cur = inst;
809 inst = inst->Next;
810
811 constant_folding(c, cur);
812
813 if(peephole(c, cur))
814 continue;
815
816 if (cur->U.I.Opcode == RC_OPCODE_MOV) {
817 copy_propagate(c, cur);
818 /* cur may no longer be part of the program */
819 }
820 }
821 }