nv50/ir: can't have predication and immediates
src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_target.h"
#include "codegen/nv50_ir_build_util.h"

extern "C" {
#include "util/u_math.h"
}

namespace nv50_ir {

bool
Instruction::isNop() const
{
   if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
      return true;
   if (terminator || join) // XXX: should terminator imply flow ?
      return false;
   if (op == OP_ATOM)
      return false;
   if (!fixed && op == OP_NOP)
      return true;

   if (defExists(0) && def(0).rep()->reg.data.id < 0) {
      for (int d = 1; defExists(d); ++d)
         if (def(d).rep()->reg.data.id >= 0)
            WARN("part of vector result is unused !\n");
      return true;
   }

   if (op == OP_MOV || op == OP_UNION) {
      if (!getDef(0)->equals(getSrc(0)))
         return false;
      if (op == OP_UNION)
         if (!def(0).rep()->equals(getSrc(1)))
            return false;
      return true;
   }

   return false;
}

bool Instruction::isDead() const
{
   if (op == OP_STORE ||
       op == OP_EXPORT ||
       op == OP_ATOM ||
       op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB ||
       op == OP_WRSV)
      return false;

   for (int d = 0; defExists(d); ++d)
      if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
         return false;

   if (terminator || asFlow())
      return false;
   if (fixed)
      return false;

   return true;
}

// =============================================================================

class CopyPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

// Propagate all MOVs forward to make subsequent optimization easier, except if
// the sources stem from a phi, in which case we don't want to mess up potential
// swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
bool
CopyPropagation::visit(BasicBlock *bb)
{
   Instruction *mov, *si, *next;

   for (mov = bb->getEntry(); mov; mov = next) {
      next = mov->next;
      if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
         continue;
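      // A predicated MOV only conditionally overwrites its destination, so
      // uses of its def cannot simply be rewritten to its source.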
      if (mov->getPredicate())
         continue;
      if (mov->def(0).getFile() != mov->src(0).getFile())
         continue;
      si = mov->getSrc(0)->getInsn();
      if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
         // propagate
         mov->def(0).replace(mov->getSrc(0), false);
         delete_Instruction(prog, mov);
      }
   }
   return true;
}

// =============================================================================

class MergeSplits : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

// For SPLIT / MERGE pairs that operate on the same registers, replace the
// post-merge def with the SPLIT's source.
bool
MergeSplits::visit(BasicBlock *bb)
{
   Instruction *i, *next, *si;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;
      if (i->op != OP_MERGE || typeSizeof(i->dType) != 8)
         continue;
      si = i->getSrc(0)->getInsn();
      if (si->op != OP_SPLIT || si != i->getSrc(1)->getInsn())
         continue;
      i->def(0).replace(si->getSrc(0), false);
      delete_Instruction(prog, i);
   }

   return true;
}

// =============================================================================

class LoadPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   void checkSwapSrc01(Instruction *);

   bool isCSpaceLoad(Instruction *);
   bool isImmdLoad(Instruction *);
   bool isAttribOrSharedLoad(Instruction *);
};

bool
LoadPropagation::isCSpaceLoad(Instruction *ld)
{
   return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
}

bool
LoadPropagation::isImmdLoad(Instruction *ld)
{
   if (!ld || (ld->op != OP_MOV) ||
       ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
      return false;
   return ld->src(0).getFile() == FILE_IMMEDIATE;
}

bool
LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
{
   return ld &&
      (ld->op == OP_VFETCH ||
       (ld->op == OP_LOAD &&
        (ld->src(0).getFile() == FILE_SHADER_INPUT ||
         ld->src(0).getFile() == FILE_MEMORY_SHARED)));
}

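// Swap the sources of a commutative (or SET/SLCT) instruction so that the
// operand a load can be folded into ends up in the slot that supports it:
// c[]-space and immediate loads go to src1, attribute/shared loads to src0.
// For comparisons, the condition code is adjusted to match the swap.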
void
LoadPropagation::checkSwapSrc01(Instruction *insn)
{
   if (!prog->getTarget()->getOpInfo(insn).commutative)
      if (insn->op != OP_SET && insn->op != OP_SLCT)
         return;
   if (insn->src(1).getFile() != FILE_GPR)
      return;

   Instruction *i0 = insn->getSrc(0)->getInsn();
   Instruction *i1 = insn->getSrc(1)->getInsn();

   if (isCSpaceLoad(i0)) {
      if (!isCSpaceLoad(i1))
         insn->swapSources(0, 1);
      else
         return;
   } else
   if (isImmdLoad(i0)) {
      if (!isCSpaceLoad(i1) && !isImmdLoad(i1))
         insn->swapSources(0, 1);
      else
         return;
   } else
   if (isAttribOrSharedLoad(i1)) {
      if (!isAttribOrSharedLoad(i0))
         insn->swapSources(0, 1);
      else
         return;
   } else {
      return;
   }

   if (insn->op == OP_SET || insn->op == OP_SET_AND ||
       insn->op == OP_SET_OR || insn->op == OP_SET_XOR)
      insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SLCT)
      insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
}

bool
LoadPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (i->op == OP_CALL) // calls have args as sources, they must be in regs
         continue;

      if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
         continue;

      if (i->srcExists(1))
         checkSwapSrc01(i);

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *ld = i->getSrc(s)->getInsn();

         if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
            continue;
         if (!targ->insnCanLoad(i, s, ld))
            continue;

         // propagate !
         i->setSrc(s, ld->getSrc(0));
         if (ld->src(0).isIndirect(0))
            i->setIndirect(s, 0, ld->getIndirect(0, 0));

         if (ld->getDef(0)->refCount() == 0)
            delete_Instruction(prog, ld);
      }
   }
   return true;
}

// =============================================================================

class IndirectPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

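// Fold address arithmetic (an ADD/SUB of an immediate to the address
// register, or a MOV of an immediate) into the constant offset of an
// indirect access, as far as the target can encode the resulting offset.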
bool
IndirectPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *insn;
         ImmediateValue imm;
         if (!i->src(s).isIndirect(0))
            continue;
         insn = i->getIndirect(s, 0)->getInsn();
         if (!insn)
            continue;
         if (insn->op == OP_ADD && !isFloatType(insn->dType)) {
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         } else if (insn->op == OP_SUB && !isFloatType(insn->dType)) {
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, -imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset -= imm.reg.data.u32;
         } else if (insn->op == OP_MOV) {
            if (!insn->src(0).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, NULL);
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         }
      }
   }
   return true;
}

// =============================================================================

// Evaluate constant expressions.
class ConstantFolding : public Pass
{
public:
   bool foldAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void expr(Instruction *, ImmediateValue&, ImmediateValue&);
   void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
   void opnd(Instruction *, ImmediateValue&, int s);

   void unary(Instruction *, const ImmediateValue&);

   void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);

   CmpInstruction *findOriginForTestWithZero(Value *);

   unsigned int foldCount;

   BuildUtil bld;
};

// TODO: remember generated immediates and only revisit these
bool
ConstantFolding::foldAll(Program *prog)
{
   unsigned int iterCount = 0;
   do {
      foldCount = 0;
      if (!run(prog))
         return false;
   } while (foldCount && ++iterCount < 2);
   return true;
}

bool
ConstantFolding::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;
      if (i->op == OP_MOV || i->op == OP_CALL)
         continue;

      ImmediateValue src0, src1, src2;

      if (i->srcExists(2) &&
          i->src(0).getImmediate(src0) &&
          i->src(1).getImmediate(src1) &&
          i->src(2).getImmediate(src2))
         expr(i, src0, src1, src2);
      else
      if (i->srcExists(1) &&
          i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
         expr(i, src0, src1);
      else
      if (i->srcExists(0) && i->src(0).getImmediate(src0))
         opnd(i, src0, 0);
      else
      if (i->srcExists(1) && i->src(1).getImmediate(src1))
         opnd(i, src1, 1);
   }
   return true;
}

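// Find the comparison that ultimately produced the value being tested against
// zero, looking through MOVs and through "AND 1.0" used as a float boolean.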
CmpInstruction *
ConstantFolding::findOriginForTestWithZero(Value *value)
{
   if (!value)
      return NULL;
   Instruction *insn = value->getInsn();

   if (insn->asCmp() && insn->op != OP_SLCT)
      return insn->asCmp();

   /* Sometimes mov's will sneak in as a result of other folding. This gets
    * cleaned up later.
    */
   if (insn->op == OP_MOV)
      return findOriginForTestWithZero(insn->getSrc(0));

   /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
   if (insn->op == OP_AND) {
      int s = 0;
      ImmediateValue imm;
      if (!insn->src(s).getImmediate(imm)) {
         s = 1;
         if (!insn->src(s).getImmediate(imm))
            return NULL;
      }
      if (imm.reg.data.f32 != 1.0f)
         return NULL;
      /* TODO: Come up with a way to handle the condition being inverted */
      if (insn->src(!s).mod != Modifier(0))
         return NULL;
      return findOriginForTestWithZero(insn->getSrc(!s));
   }

   return NULL;
}

void
Modifier::applyTo(ImmediateValue& imm) const
{
   if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. b128)
      return;
   switch (imm.reg.type) {
   case TYPE_F32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f32 = fabsf(imm.reg.data.f32);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f32 = -imm.reg.data.f32;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f32 < 0.0f)
            imm.reg.data.f32 = 0.0f;
         else
         if (imm.reg.data.f32 > 1.0f)
            imm.reg.data.f32 = 1.0f;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   case TYPE_S8: // NOTE: will be extended
   case TYPE_S16:
   case TYPE_S32:
   case TYPE_U8: // NOTE: treated as signed
   case TYPE_U16:
   case TYPE_U32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
            imm.reg.data.s32 : -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.s32 = -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NOT)
         imm.reg.data.s32 = ~imm.reg.data.s32;
      break;

   case TYPE_F64:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f64 = fabs(imm.reg.data.f64);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f64 = -imm.reg.data.f64;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f64 < 0.0)
            imm.reg.data.f64 = 0.0;
         else
         if (imm.reg.data.f64 > 1.0)
            imm.reg.data.f64 = 1.0;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   default:
      assert(!"invalid/unhandled type");
      imm.reg.data.u64 = 0;
      break;
   }
}

operation
Modifier::getOp() const
{
   switch (bits) {
   case NV50_IR_MOD_ABS: return OP_ABS;
   case NV50_IR_MOD_NEG: return OP_NEG;
   case NV50_IR_MOD_SAT: return OP_SAT;
   case NV50_IR_MOD_NOT: return OP_NOT;
   case 0:
      return OP_MOV;
   default:
      return OP_CVT;
   }
}

void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0, ImmediateValue &imm1)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
   struct Storage res;
   DataType type = i->dType;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_MAD:
   case OP_FMA:
   case OP_MUL:
      if (i->dnz && i->dType == TYPE_F32) {
         if (!isfinite(a->data.f32))
            a->data.f32 = 0.0f;
         if (!isfinite(b->data.f32))
            b->data.f32 = 0.0f;
      }
      switch (i->dType) {
      case TYPE_F32:
         res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor);
         break;
      case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
      case TYPE_S32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.s32 = ((int64_t)a->data.s32 * b->data.s32) >> 32;
            break;
         }
         /* fallthrough */
      case TYPE_U32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32) >> 32;
            break;
         }
         res.data.u32 = a->data.u32 * b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_DIV:
      if (b->data.u32 == 0)
         break;
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
      case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_ADD:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_POW:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
      default:
         return;
      }
      break;
   case OP_MAX:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_MIN:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_AND:
      res.data.u64 = a->data.u64 & b->data.u64;
      break;
   case OP_OR:
      res.data.u64 = a->data.u64 | b->data.u64;
      break;
   case OP_XOR:
      res.data.u64 = a->data.u64 ^ b->data.u64;
      break;
   case OP_SHL:
      res.data.u32 = a->data.u32 << b->data.u32;
      break;
   case OP_SHR:
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SLCT:
      if (a->data.u32 != b->data.u32)
         return;
      res.data.u32 = a->data.u32;
      break;
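   // EXTBF: src1 packs the bitfield as (width << 8) | offset, e.g. 0x0810
   // extracts 8 bits starting at bit 16. Shift left first to drop the high
   // bits, then right (arithmetic or logical per dType) to extend the result.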
   case OP_EXTBF: {
      int offset = b->data.u32 & 0xff;
      int width = (b->data.u32 >> 8) & 0xff;
      int rshift = offset;
      int lshift = 0;
      if (width == 0) {
         res.data.u32 = 0;
         break;
      }
      if (width + offset < 32) {
         rshift = 32 - width;
         lshift = 32 - width - offset;
      }
      if (i->subOp == NV50_IR_SUBOP_EXTBF_REV)
         res.data.u32 = util_bitreverse(a->data.u32);
      else
         res.data.u32 = a->data.u32;
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = (res.data.s32 << lshift) >> rshift; break;
      case TYPE_U32: res.data.u32 = (res.data.u32 << lshift) >> rshift; break;
      default:
         return;
      }
      break;
   }
   case OP_POPCNT:
      res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
      break;
   case OP_PFETCH:
      // The two arguments to pfetch are logically added together. Normally
      // the second argument will not be constant, but that can happen.
      res.data.u32 = a->data.u32 + b->data.u32;
      type = TYPE_U32;
      break;
   case OP_MERGE:
      switch (i->dType) {
      case TYPE_U64:
      case TYPE_S64:
      case TYPE_F64:
         res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
         break;
      default:
         return;
      }
      break;
   default:
      return;
   }
   ++foldCount;

   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);
   i->postFactor = 0;

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);

   i->getSrc(0)->reg.data = res.data;
   i->getSrc(0)->reg.type = type;
   i->getSrc(0)->reg.size = typeSizeof(type);

   switch (i->op) {
   case OP_MAD:
   case OP_FMA: {
      i->op = OP_ADD;

      /* Move the immediate to the second arg, otherwise the ADD operation
       * won't be emittable
       */
      i->setSrc(1, i->getSrc(0));
      i->setSrc(0, i->getSrc(2));
      i->src(0).mod = i->src(2).mod;
      i->setSrc(2, NULL);

      ImmediateValue src0;
      if (i->src(0).getImmediate(src0))
         expr(i, src0, *i->getSrc(1)->asImm());
      if (i->saturate && !prog->getTarget()->isSatSupported(i)) {
         bld.setPosition(i, false);
         i->setSrc(1, bld.loadImm(NULL, res.data.u32));
      }
      break;
   }
   case OP_PFETCH:
      // Leave PFETCH alone... we just folded its 2 args into 1.
      break;
   default:
      i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */
      break;
   }
   i->subOp = 0;
}

void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0,
                      ImmediateValue &imm1,
                      ImmediateValue &imm2)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg, *const c = &imm2.reg;
   struct Storage res;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_INSBF: {
      int offset = b->data.u32 & 0xff;
      int width = (b->data.u32 >> 8) & 0xff;
      unsigned bitmask = ((1 << width) - 1) << offset;
      res.data.u32 = ((a->data.u32 << offset) & bitmask) | (c->data.u32 & ~bitmask);
      break;
   }
   case OP_MAD:
   case OP_FMA: {
      switch (i->dType) {
      case TYPE_F32:
         res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor) +
            c->data.f32;
         break;
      case TYPE_F64:
         res.data.f64 = a->data.f64 * b->data.f64 + c->data.f64;
         break;
      case TYPE_S32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.s32 = ((int64_t)a->data.s32 * b->data.s32 >> 32) + c->data.s32;
            break;
         }
         /* fallthrough */
      case TYPE_U32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32 >> 32) + c->data.u32;
            break;
         }
         res.data.u32 = a->data.u32 * b->data.u32 + c->data.u32;
         break;
      default:
         return;
      }
      break;
   }
   default:
      return;
   }

   ++foldCount;
   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);
   i->src(2).mod = Modifier(0);

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);
   i->setSrc(2, NULL);

   i->getSrc(0)->reg.data = res.data;
   i->getSrc(0)->reg.type = i->dType;
   i->getSrc(0)->reg.size = typeSizeof(i->dType);

   i->op = OP_MOV;
}

void
ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
{
   Storage res;

   if (i->dType != TYPE_F32)
      return;
   switch (i->op) {
   case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
   case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
   case OP_SAT: res.data.f32 = CLAMP(imm.reg.data.f32, 0.0f, 1.0f); break;
   case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
   case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
   case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
   case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
   case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
   case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
   case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
   case OP_PRESIN:
   case OP_PREEX2:
      // these should be handled in subsequent OP_SIN/COS/EX2
      res.data.f32 = imm.reg.data.f32;
      break;
   default:
      return;
   }
   i->op = OP_MOV;
   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
   i->src(0).mod = Modifier(0);
}

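// Collapse a chain of two f32 multiplies where one factor is an immediate:
// either fold the two immediates into one, or, where the target supports it,
// turn a multiply by a power of two into a post-factor on the other MUL.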
void
ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
                                        const int s, ImmediateValue& imm2)
{
   const int t = s ? 0 : 1;
   Instruction *insn;
   Instruction *mul1 = NULL; // mul1 before mul2
   int e = 0;
   float f = imm2.reg.data.f32 * exp2f(mul2->postFactor);
   ImmediateValue imm1;

   assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);

   if (mul2->getSrc(t)->refCount() == 1) {
      insn = mul2->getSrc(t)->getInsn();
      if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
         mul1 = insn;
      if (mul1 && !mul1->saturate) {
         int s1;

         if (mul1->src(s1 = 0).getImmediate(imm1) ||
             mul1->src(s1 = 1).getImmediate(imm1)) {
            bld.setPosition(mul1, false);
            // a = mul r, imm1
            // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
            mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
            mul1->src(s1).mod = Modifier(0);
            mul2->def(0).replace(mul1->getDef(0), false);
            mul1->saturate = mul2->saturate;
         } else
         if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
            // c = mul a, b
            // d = mul c, imm -> d = mul_x_imm a, b
            mul1->postFactor = e;
            mul2->def(0).replace(mul1->getDef(0), false);
            if (f < 0)
               mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
            mul1->saturate = mul2->saturate;
         }
         return;
      }
   }
   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
      // b = mul a, imm
      // d = mul b, c -> d = mul_x_imm a, c
      int s2, t2;
      insn = (*mul2->getDef(0)->uses.begin())->getInsn();
      if (!insn)
         return;
      mul1 = mul2;
      mul2 = NULL;
      s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
      t2 = s2 ? 0 : 1;
      if (insn->op == OP_MUL && insn->dType == TYPE_F32)
         if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
            mul2 = insn;
      if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
         mul2->postFactor = e;
         mul2->setSrc(s2, mul1->src(t));
         if (f < 0)
            mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
      }
   }
}

void
ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
{
   const int t = !s;
   const operation op = i->op;
   Instruction *newi = i;

   switch (i->op) {
   case OP_MUL:
      if (i->dType == TYPE_F32)
         tryCollapseChainedMULs(i, s, imm0);

      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
         assert(!isFloatType(i->sType));
         if (imm0.isInteger(1) && i->dType == TYPE_S32) {
            bld.setPosition(i, false);
            // Need to set to the sign value, which is a compare.
            newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0),
                             TYPE_S32, i->getSrc(t), bld.mkImm(0));
            delete_Instruction(prog, i);
         } else if (imm0.isInteger(0) || imm0.isInteger(1)) {
            // The high bits can't be set in this case (either mul by 0 or
            // unsigned by 1)
            i->op = OP_MOV;
            i->subOp = 0;
            i->setSrc(0, new_ImmediateValue(prog, 0u));
            i->src(0).mod = Modifier(0);
            i->setSrc(1, NULL);
         } else if (!imm0.isNegative() && imm0.isPow2()) {
            // Translate into a shift
            imm0.applyLog2();
            i->op = OP_SHR;
            i->subOp = 0;
            imm0.reg.data.u32 = 32 - imm0.reg.data.u32;
            i->setSrc(0, i->getSrc(t));
            i->src(0).mod = i->src(t).mod;
            i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
            i->src(1).mod = 0;
         }
      } else
      if (imm0.isInteger(0)) {
         i->op = OP_MOV;
         i->setSrc(0, new_ImmediateValue(prog, 0u));
         i->src(0).mod = Modifier(0);
         i->postFactor = 0;
         i->setSrc(1, NULL);
      } else
      if (!i->postFactor && (imm0.isInteger(1) || imm0.isInteger(-1))) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = i->src(t).mod.getOp();
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
            i->src(1).mod = 0;
         }
         if (i->op != OP_CVT)
            i->src(0).mod = 0;
         i->setSrc(1, NULL);
      } else
      if (!i->postFactor && (imm0.isInteger(2) || imm0.isInteger(-2))) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = OP_ADD;
         i->setSrc(s, i->getSrc(t));
         i->src(s).mod = i->src(t).mod;
      } else
      if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
         i->op = OP_SHL;
         imm0.applyLog2();
         i->setSrc(0, i->getSrc(t));
         i->src(0).mod = i->src(t).mod;
         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
         i->src(1).mod = 0;
      } else
      if (i->postFactor && i->sType == TYPE_F32) {
         /* Can't emit a postfactor with an immediate, have to fold it in */
         i->setSrc(s, new_ImmediateValue(
                      prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
         i->postFactor = 0;
      }
      break;
   case OP_MAD:
      if (imm0.isInteger(0)) {
         i->setSrc(0, i->getSrc(2));
         i->src(0).mod = i->src(2).mod;
         i->setSrc(1, NULL);
         i->setSrc(2, NULL);
         i->op = i->src(0).mod.getOp();
         if (i->op != OP_CVT)
            i->src(0).mod = 0;
      } else
      if (i->subOp != NV50_IR_SUBOP_MUL_HIGH &&
          (imm0.isInteger(1) || imm0.isInteger(-1))) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
         }
         i->setSrc(1, i->getSrc(2));
         i->src(1).mod = i->src(2).mod;
         i->setSrc(2, NULL);
         i->op = OP_ADD;
      }
      break;
   case OP_ADD:
      if (i->usesFlags())
         break;
      if (imm0.isInteger(0)) {
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
         }
         i->setSrc(1, NULL);
         i->op = i->src(0).mod.getOp();
         if (i->op != OP_CVT)
            i->src(0).mod = Modifier(0);
      }
      break;

   case OP_DIV:
      if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
         break;
      bld.setPosition(i, false);
      if (imm0.reg.data.u32 == 0) {
         break;
      } else
      if (imm0.reg.data.u32 == 1) {
         i->op = OP_MOV;
         i->setSrc(1, NULL);
      } else
      if (i->dType == TYPE_U32 && imm0.isPow2()) {
         i->op = OP_SHR;
         i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
      } else
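      // Unsigned division by an arbitrary constant d becomes a multiply-high
      // by a "magic" reciprocal m (cf. Granlund & Montgomery, "Division by
      // Invariant Integers using Multiplication"):
      //   q = (mulhi(n, m) + ((n - mulhi(n, m)) >> r)) >> s
      // E.g. for d == 7: l = 3, m = 0x24924925, r = 1, s = 2.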
      if (i->dType == TYPE_U32) {
         Instruction *mul;
         Value *tA, *tB;
         const uint32_t d = imm0.reg.data.u32;
         uint32_t m;
         int r, s;
         uint32_t l = util_logbase2(d);
         if (((uint32_t)1 << l) < d)
            ++l;
         m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
         r = l ? 1 : 0;
         s = l ? (l - 1) : 0;

         tA = bld.getSSA();
         tB = bld.getSSA();
         mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
                         bld.loadImm(NULL, m));
         mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
         bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
         tA = bld.getSSA();
         if (r)
            bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
         else
            tA = tB;
         tB = s ? bld.getSSA() : i->getDef(0);
         newi = bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
         if (s)
            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));

         delete_Instruction(prog, i);
      } else
      if (imm0.reg.data.s32 == -1) {
         i->op = OP_NEG;
         i->setSrc(1, NULL);
      } else {
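         // Signed variant of the same trick: compute mulhi(n, m) + n, shift
         // it right by (l - 1), then correct by subtracting the sign (0/-1)
         // of n; the result is negated again if d is negative.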
         LValue *tA, *tB;
         LValue *tD;
         const int32_t d = imm0.reg.data.s32;
         int32_t m;
         int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
         if ((1 << l) < abs(d))
            ++l;
         if (!l)
            l = 1;
         m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);

         tA = bld.getSSA();
         tB = bld.getSSA();
         bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
                   i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
         if (l > 1)
            bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
         else
            tB = tA;
         tA = bld.getSSA();
         bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, TYPE_S32, i->getSrc(0), bld.mkImm(0));
         tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
         newi = bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
         if (d < 0)
            bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tD);

         delete_Instruction(prog, i);
      }
      break;

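   // x % 2^k == x & (2^k - 1) for unsigned x, e.g. x % 16 -> x & 15.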
   case OP_MOD:
      if (i->sType == TYPE_U32 && imm0.isPow2()) {
         bld.setPosition(i, false);
         i->op = OP_AND;
         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
      }
      break;

   case OP_SET: // TODO: SET_AND,OR,XOR
   {
      /* This optimizes the case where the output of a set is being compared
       * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
       * can be a lot cleverer in our comparison.
       */
      CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
      CondCode cc, ccZ;
      if (imm0.reg.data.u32 != 0 || !si)
         return;
      cc = si->setCond;
      ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
      // We do everything assuming var (cmp) 0, reverse the condition if 0 is
      // first.
      if (s == 0)
         ccZ = reverseCondCode(ccZ);
      // If there is a negative modifier, we need to undo that, by flipping
      // the comparison to zero.
      if (i->src(t).mod.neg())
         ccZ = reverseCondCode(ccZ);
      // If this is a signed comparison, we expect the input to be a regular
      // boolean, i.e. 0/-1. However the rest of the logic assumes that true
      // is positive, so just flip the sign.
      if (i->sType == TYPE_S32) {
         assert(!isFloatType(si->dType));
         ccZ = reverseCondCode(ccZ);
      }
      switch (ccZ) {
      case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
      case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
      case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
      case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
      case CC_GT: break; // bool > 0 -- bool
      case CC_NE: break; // bool != 0 -- bool
      default:
         return;
      }

      // Update the condition of this SET to be identical to the origin set,
      // but with the updated condition code. The original SET should get
      // DCE'd, ideally.
      i->op = si->op;
      i->asCmp()->setCond = cc;
      i->setSrc(0, si->src(0));
      i->setSrc(1, si->src(1));
      if (si->srcExists(2))
         i->setSrc(2, si->src(2));
      i->sType = si->sType;
   }
   break;

   case OP_AND:
   {
      Instruction *src = i->getSrc(t)->getInsn();
      ImmediateValue imm1;
      if (imm0.reg.data.u32 == 0) {
         i->op = OP_MOV;
         i->setSrc(0, new_ImmediateValue(prog, 0u));
         i->src(0).mod = Modifier(0);
         i->setSrc(1, NULL);
      } else if (imm0.reg.data.u32 == ~0U) {
         i->op = i->src(t).mod.getOp();
         if (t) {
            i->setSrc(0, i->getSrc(t));
            i->src(0).mod = i->src(t).mod;
         }
         i->setSrc(1, NULL);
      } else if (src->asCmp()) {
         CmpInstruction *cmp = src->asCmp();
         if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
            return;
         if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
            return;
         if (imm0.reg.data.f32 != 1.0)
            return;
         if (cmp->dType != TYPE_U32)
            return;

         cmp->dType = TYPE_F32;
         if (i->src(t).mod != Modifier(0)) {
            assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
            i->src(t).mod = Modifier(0);
            cmp->setCond = inverseCondCode(cmp->setCond);
         }
         i->op = OP_MOV;
         i->setSrc(s, NULL);
         if (t) {
            i->setSrc(0, i->getSrc(t));
            i->setSrc(t, NULL);
         }
      } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
                 src->op == OP_SHR &&
                 src->src(1).getImmediate(imm1) &&
                 i->src(t).mod == Modifier(0) &&
                 util_is_power_of_two(imm0.reg.data.u32 + 1)) {
         // low byte = offset, high byte = width
         uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
         i->op = OP_EXTBF;
         i->setSrc(0, src->getSrc(0));
         i->setSrc(1, new_ImmediateValue(prog, ext));
      }
   }
   break;

   case OP_SHL:
   {
      if (s != 1 || i->src(0).mod != Modifier(0))
         break;
      // try to concatenate shifts
      Instruction *si = i->getSrc(0)->getInsn();
      if (!si)
         break;
      ImmediateValue imm1;
      switch (si->op) {
      case OP_SHL:
         if (si->src(1).getImmediate(imm1)) {
            bld.setPosition(i, false);
            i->setSrc(0, si->getSrc(0));
            i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
         }
         break;
      case OP_MUL:
         int muls;
         if (isFloatType(si->dType))
            return;
         if (si->src(1).getImmediate(imm1))
            muls = 1;
         else if (si->src(0).getImmediate(imm1))
            muls = 0;
         else
            return;

         bld.setPosition(i, false);
         i->op = OP_MUL;
         i->setSrc(0, si->getSrc(!muls));
         i->setSrc(1, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
         break;
      case OP_SUB:
      case OP_ADD:
         int adds;
         if (isFloatType(si->dType))
            return;
         if (si->op != OP_SUB && si->src(0).getImmediate(imm1))
            adds = 0;
         else if (si->src(1).getImmediate(imm1))
            adds = 1;
         else
            return;
         // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))

         // This is more operations, but if one of x, y is an immediate, then
         // we can get a situation where (a) we can use ISCADD, or (b)
         // propagate the add bit into an indirect load.
         bld.setPosition(i, false);
         i->op = si->op;
         i->setSrc(adds, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
         i->setSrc(!adds, bld.mkOp2v(OP_SHL, i->dType,
                                     bld.getSSA(i->def(0).getSize(), i->def(0).getFile()),
                                     si->getSrc(!adds),
                                     bld.mkImm(imm0.reg.data.u32)));
         break;
      default:
         return;
      }
   }
   break;

   case OP_ABS:
   case OP_NEG:
   case OP_SAT:
   case OP_LG2:
   case OP_RCP:
   case OP_SQRT:
   case OP_RSQ:
   case OP_PRESIN:
   case OP_SIN:
   case OP_COS:
   case OP_PREEX2:
   case OP_EX2:
      unary(i, imm0);
      break;
   case OP_BFIND: {
      int32_t res;
      switch (i->dType) {
      case TYPE_S32: res = util_last_bit_signed(imm0.reg.data.s32) - 1; break;
      case TYPE_U32: res = util_last_bit(imm0.reg.data.u32) - 1; break;
      default:
         return;
      }
      if (i->subOp == NV50_IR_SUBOP_BFIND_SAMT && res >= 0)
         res = 31 - res;
      bld.setPosition(i, false); /* make sure bld is init'ed */
      i->setSrc(0, bld.mkImm(res));
      i->setSrc(1, NULL);
      i->op = OP_MOV;
      i->subOp = 0;
      break;
   }
   case OP_POPCNT: {
      // Only deal with 1-arg POPCNT here
      if (i->srcExists(1))
         break;
      uint32_t res = util_bitcount(imm0.reg.data.u32);
      i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
      i->setSrc(1, NULL);
      i->op = OP_MOV;
      break;
   }
   case OP_CVT: {
      Storage res;

      // TODO: handle 64-bit values properly
      if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
         return;

      // TODO: handle single byte/word extractions
      if (i->subOp)
         return;

      bld.setPosition(i, true); /* make sure bld is init'ed */

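// Fold the conversion for each integer destination type: (fmin, fmax) clamp
// float sources when saturating, (imin, imax) signed and (umin, umax)
// unsigned integer sources.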
#define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
   case type: \
      switch (i->sType) { \
      case TYPE_F64: \
         res.data.dst = util_iround(i->saturate ? \
                                    CLAMP(imm0.reg.data.f64, fmin, fmax) : \
                                    imm0.reg.data.f64); \
         break; \
      case TYPE_F32: \
         res.data.dst = util_iround(i->saturate ? \
                                    CLAMP(imm0.reg.data.f32, fmin, fmax) : \
                                    imm0.reg.data.f32); \
         break; \
      case TYPE_S32: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.s32, imin, imax) : \
                        imm0.reg.data.s32; \
         break; \
      case TYPE_U32: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.u32, umin, umax) : \
                        imm0.reg.data.u32; \
         break; \
      case TYPE_S16: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.s16, imin, imax) : \
                        imm0.reg.data.s16; \
         break; \
      case TYPE_U16: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.u16, umin, umax) : \
                        imm0.reg.data.u16; \
         break; \
      default: return; \
      } \
      i->setSrc(0, bld.mkImm(res.data.dst)); \
      break

      switch(i->dType) {
      CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
      CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
      CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
      CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
      case TYPE_F32:
         switch (i->sType) {
         case TYPE_F64:
            res.data.f32 = i->saturate ?
               CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
               imm0.reg.data.f64;
            break;
         case TYPE_F32:
            res.data.f32 = i->saturate ?
               CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
               imm0.reg.data.f32;
            break;
         case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
         case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
         case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
         case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
         default:
            return;
         }
         i->setSrc(0, bld.mkImm(res.data.f32));
         break;
      case TYPE_F64:
         switch (i->sType) {
         case TYPE_F64:
            res.data.f64 = i->saturate ?
               CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
               imm0.reg.data.f64;
            break;
         case TYPE_F32:
            res.data.f64 = i->saturate ?
               CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
               imm0.reg.data.f32;
            break;
         case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
         case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
         case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
         case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
         default:
            return;
         }
         i->setSrc(0, bld.mkImm(res.data.f64));
         break;
      default:
         return;
      }
#undef CASE

      i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
      i->op = OP_MOV;
      i->saturate = 0;
      i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
      break;
   }
   default:
      return;
   }
   if (newi->op != op)
      foldCount++;
}

// =============================================================================

// Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
class ModifierFolding : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

bool
ModifierFolding::visit(BasicBlock *bb)
{
   const Target *target = prog->getTarget();

   Instruction *i, *next, *mi;
   Modifier mod;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (0 && i->op == OP_SUB) {
         // turn "sub" into "add neg" (do we really want this ?)
         i->op = OP_ADD;
         i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
      }

      for (int s = 0; s < 3 && i->srcExists(s); ++s) {
         mi = i->getSrc(s)->getInsn();
         if (!mi ||
             mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
            continue;
         if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
            if ((i->op != OP_ADD &&
                 i->op != OP_MUL) ||
                (mi->op != OP_ABS &&
                 mi->op != OP_NEG))
               continue;
         } else
         if (i->sType != mi->dType) {
            continue;
         }
         if ((mod = Modifier(mi->op)) == Modifier(0))
            continue;
         mod *= mi->src(0).mod;

         if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
            // abs neg [abs] = abs
            mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
         } else
         if ((i->op == OP_NEG) && mod.neg()) {
            assert(s == 0);
            // neg as both opcode and modifier on same insn is prohibited
            // neg neg abs = abs, neg neg = identity
            mod = mod & Modifier(~NV50_IR_MOD_NEG);
            i->op = mod.getOp();
            mod = mod & Modifier(~NV50_IR_MOD_ABS);
            if (mod == Modifier(0))
               i->op = OP_MOV;
         }

         if (target->isModSupported(i, s, mod)) {
            i->setSrc(s, mi->getSrc(0));
            i->src(s).mod *= mod;
         }
      }

      if (i->op == OP_SAT) {
         mi = i->getSrc(0)->getInsn();
         if (mi &&
             mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
            mi->saturate = 1;
            mi->setDef(0, i->getDef(0));
            delete_Instruction(prog, i);
         }
      }
   }

   return true;
}

// =============================================================================

// MUL + ADD -> MAD/FMA
// MIN/MAX(a, a) -> a, etc.
// SLCT(a, b, const) -> cc(const) ? a : b
// RCP(RCP(a)) -> a
// MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
class AlgebraicOpt : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   void handleABS(Instruction *);
   bool handleADD(Instruction *);
   bool tryADDToMADOrSAD(Instruction *, operation toOp);
   void handleMINMAX(Instruction *);
   void handleRCP(Instruction *);
   void handleSLCT(Instruction *);
   void handleLOGOP(Instruction *);
   void handleCVT_NEG(Instruction *);
   void handleCVT_EXTBF(Instruction *);
   void handleSUCLAMP(Instruction *);

   BuildUtil bld;
};

void
AlgebraicOpt::handleABS(Instruction *abs)
{
   Instruction *sub = abs->getSrc(0)->getInsn();
   DataType ty;
   if (!sub ||
       !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
      return;
   // expect not to have mods yet, if we do, bail
   if (sub->src(0).mod || sub->src(1).mod)
      return;
   // hidden conversion ?
   ty = intTypeToSigned(sub->dType);
   if (abs->dType != abs->sType || ty != abs->sType)
      return;

   if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
       sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
       sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
      return;

   Value *src0 = sub->getSrc(0);
   Value *src1 = sub->getSrc(1);

   if (sub->op == OP_ADD) {
      Instruction *neg = sub->getSrc(1)->getInsn();
      if (neg && neg->op != OP_NEG) {
         neg = sub->getSrc(0)->getInsn();
         src0 = sub->getSrc(1);
      }
      if (!neg || neg->op != OP_NEG ||
          neg->dType != neg->sType || neg->sType != ty)
         return;
      src1 = neg->getSrc(0);
   }

   // found ABS(SUB)
   abs->moveSources(1, 2); // move sources >=1 up by 2
   abs->op = OP_SAD;
   abs->setType(sub->dType);
   abs->setSrc(0, src0);
   abs->setSrc(1, src1);
   bld.setPosition(abs, false);
   abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
}

bool
AlgebraicOpt::handleADD(Instruction *add)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return false;

   bool changed = false;
   if (!changed && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
      changed = tryADDToMADOrSAD(add, OP_MAD);
   if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
      changed = tryADDToMADOrSAD(add, OP_SAD);
   return changed;
}

// ADD(SAD(a,b,0), c) -> SAD(a,b,c)
// ADD(MUL(a,b), c) -> MAD(a,b,c)
bool
AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   Value *src;
   int s;
   const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
   const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
   Modifier mod[4];

   if (src0->refCount() == 1 &&
       src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
      s = 0;
   else
   if (src1->refCount() == 1 &&
       src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
      s = 1;
   else
      return false;

   src = add->getSrc(s);

   if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb)
      return false;

   if (src->getInsn()->postFactor)
      return false;
   if (toOp == OP_SAD) {
      ImmediateValue imm;
      if (!src->getInsn()->src(2).getImmediate(imm))
         return false;
      if (!imm.isInteger(0))
         return false;
   }

   if (typeSizeof(add->dType) != typeSizeof(src->getInsn()->dType) ||
       isFloatType(add->dType) != isFloatType(src->getInsn()->dType))
      return false;

   mod[0] = add->src(0).mod;
   mod[1] = add->src(1).mod;
   mod[2] = src->getUniqueInsn()->src(0).mod;
   mod[3] = src->getUniqueInsn()->src(1).mod;

   if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
      return false;

   add->op = toOp;
   add->subOp = src->getInsn()->subOp; // potentially mul-high
   add->dType = src->getInsn()->dType; // sign matters for imad hi
   add->sType = src->getInsn()->sType;

   add->setSrc(2, add->src(s ? 0 : 1));

   add->setSrc(0, src->getInsn()->getSrc(0));
   add->src(0).mod = mod[2] ^ mod[s];
   add->setSrc(1, src->getInsn()->getSrc(1));
   add->src(1).mod = mod[3];

   return true;
}

void
AlgebraicOpt::handleMINMAX(Instruction *minmax)
{
   Value *src0 = minmax->getSrc(0);
   Value *src1 = minmax->getSrc(1);

   if (src0 != src1 || src0->reg.file != FILE_GPR)
      return;
   if (minmax->src(0).mod == minmax->src(1).mod) {
      if (minmax->def(0).mayReplace(minmax->src(0))) {
         minmax->def(0).replace(minmax->src(0), false);
         minmax->bb->remove(minmax);
      } else {
         minmax->op = OP_CVT;
         minmax->setSrc(1, NULL);
      }
   } else {
      // TODO:
      // min(x, -x) = -abs(x)
      // min(x, -abs(x)) = -abs(x)
      // min(x, abs(x)) = x
      // max(x, -abs(x)) = x
      // max(x, abs(x)) = abs(x)
      // max(x, -x) = abs(x)
   }
}

void
AlgebraicOpt::handleRCP(Instruction *rcp)
{
   Instruction *si = rcp->getSrc(0)->getUniqueInsn();

   if (si && si->op == OP_RCP) {
      Modifier mod = rcp->src(0).mod * si->src(0).mod;
      rcp->op = mod.getOp();
      rcp->setSrc(0, si->getSrc(0));
   }
}

void
AlgebraicOpt::handleSLCT(Instruction *slct)
{
   if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
      if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
         slct->setSrc(0, slct->getSrc(1));
   } else
   if (slct->getSrc(0) != slct->getSrc(1)) {
      return;
   }
   slct->op = OP_MOV;
   slct->setSrc(1, NULL);
   slct->setSrc(2, NULL);
}

void
AlgebraicOpt::handleLOGOP(Instruction *logop)
{
   Value *src0 = logop->getSrc(0);
   Value *src1 = logop->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (src0 == src1) {
      if ((logop->op == OP_AND || logop->op == OP_OR) &&
          logop->def(0).mayReplace(logop->src(0))) {
         logop->def(0).replace(logop->src(0), false);
         delete_Instruction(prog, logop);
      }
   } else {
      // try AND(SET, SET) -> SET_AND(SET)
      Instruction *set0 = src0->getInsn();
      Instruction *set1 = src1->getInsn();

      if (!set0 || set0->fixed || !set1 || set1->fixed)
         return;
      if (set1->op != OP_SET) {
         Instruction *xchg = set0;
         set0 = set1;
         set1 = xchg;
         if (set1->op != OP_SET)
            return;
      }
      operation redOp = (logop->op == OP_AND ? OP_SET_AND :
                         logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
      if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
         return;
      if (set0->op != OP_SET &&
          set0->op != OP_SET_AND &&
          set0->op != OP_SET_OR &&
          set0->op != OP_SET_XOR)
         return;
      if (set0->getDef(0)->refCount() > 1 &&
          set1->getDef(0)->refCount() > 1)
         return;
      if (set0->getPredicate() || set1->getPredicate())
         return;
      // check that they don't source each other
      for (int s = 0; s < 2; ++s)
         if (set0->getSrc(s) == set1->getDef(0) ||
             set1->getSrc(s) == set0->getDef(0))
            return;

      set0 = cloneForward(func, set0);
      set1 = cloneShallow(func, set1);
      logop->bb->insertAfter(logop, set1);
      logop->bb->insertAfter(logop, set0);

      set0->dType = TYPE_U8;
      set0->getDef(0)->reg.file = FILE_PREDICATE;
      set0->getDef(0)->reg.size = 1;
      set1->setSrc(2, set0->getDef(0));
      set1->op = redOp;
      set1->setDef(0, logop->getDef(0));
      delete_Instruction(prog, logop);
   }
}

// F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
// nv50:
// F2I(NEG(I2F(ABS(SET))))
void
AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   if (cvt->sType != TYPE_F32 ||
       cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
      return;
   if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
      return;
   if (insn->src(0).mod != Modifier(0))
      return;
   insn = insn->getSrc(0)->getInsn();

   // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
   if (insn && insn->op == OP_CVT &&
       insn->dType == TYPE_F32 &&
       insn->sType == TYPE_S32) {
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
          insn->src(0).mod)
         return;
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
         return;
   } else
   if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
      return;
   }

   Instruction *bset = cloneShallow(func, insn);
   bset->dType = TYPE_U32;
   bset->setDef(0, cvt->getDef(0));
   cvt->bb->insertAfter(cvt, bset);
   delete_Instruction(prog, cvt);
}

// Some shaders extract packed bytes out of words and convert them to
// e.g. float. The Fermi+ CVT instruction can extract those directly, as can
// nv50 for word sizes.
//
// CVT(EXTBF(x, byte/word))
// CVT(AND(bytemask, x))
// CVT(AND(bytemask, SHR(x, 8/16/24)))
// CVT(SHR(x, 16/24))
void
AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   ImmediateValue imm;
   Value *arg = NULL;
   unsigned width, offset;
   if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
      return;
   if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
      width = (imm.reg.data.u32 >> 8) & 0xff;
      offset = imm.reg.data.u32 & 0xff;
      arg = insn->getSrc(0);

      if (width != 8 && width != 16)
         return;
      if (width == 8 && offset & 0x7)
         return;
      if (width == 16 && offset & 0xf)
         return;
   } else if (insn->op == OP_AND) {
      int s;
      if (insn->src(0).getImmediate(imm))
         s = 0;
      else if (insn->src(1).getImmediate(imm))
         s = 1;
      else
         return;

      if (imm.reg.data.u32 == 0xff)
         width = 8;
      else if (imm.reg.data.u32 == 0xffff)
         width = 16;
      else
         return;

      arg = insn->getSrc(!s);
      Instruction *shift = arg->getInsn();
      offset = 0;
      if (shift && shift->op == OP_SHR &&
          shift->sType == cvt->sType &&
          shift->src(1).getImmediate(imm) &&
          ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
           (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
         arg = shift->getSrc(0);
         offset = imm.reg.data.u32;
      }
   } else if (insn->op == OP_SHR &&
              insn->sType == cvt->sType &&
              insn->src(1).getImmediate(imm)) {
      arg = insn->getSrc(0);
      if (imm.reg.data.u32 == 24) {
         width = 8;
         offset = 24;
      } else if (imm.reg.data.u32 == 16) {
         width = 16;
         offset = 16;
      } else {
         return;
      }
   }

   if (!arg)
      return;

   // Irrespective of what came earlier, we can undo a shift on the argument
   // by adjusting the offset.
   Instruction *shift = arg->getInsn();
   if (shift && shift->op == OP_SHL &&
       shift->src(1).getImmediate(imm) &&
       ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
        (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
       imm.reg.data.u32 <= offset) {
      arg = shift->getSrc(0);
      offset -= imm.reg.data.u32;
   }

   // The unpackSnorm lowering still leaves a few shifts behind, but it's too
   // annoying to detect them.

   if (width == 8) {
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
   } else {
      assert(width == 16);
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
   }
   cvt->setSrc(0, arg);
   cvt->subOp = offset >> 3;
}

// SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
void
AlgebraicOpt::handleSUCLAMP(Instruction *insn)
{
   ImmediateValue imm;
   int32_t val = insn->getSrc(2)->asImm()->reg.data.s32;
   int s;
   Instruction *add;

   assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR);

   // look for ADD (TODO: only count references by non-SUCLAMP)
   if (insn->getSrc(0)->refCount() > 1)
      return;
   add = insn->getSrc(0)->getInsn();
   if (!add || add->op != OP_ADD ||
       (add->dType != TYPE_U32 &&
        add->dType != TYPE_S32))
      return;

   // look for immediate
   for (s = 0; s < 2; ++s)
      if (add->src(s).getImmediate(imm))
         break;
   if (s >= 2)
      return;
   s = s ? 0 : 1;
   // determine if immediate fits
   val += imm.reg.data.s32;
   if (val > 31 || val < -32)
      return;
   // determine if other addend fits
   if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0))
      return;

   bld.setPosition(insn, false); // make sure bld is init'ed
   // replace sources
   insn->setSrc(2, bld.mkImm(val));
   insn->setSrc(0, add->getSrc(s));
}

bool
AlgebraicOpt::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;
      switch (i->op) {
      case OP_ABS:
         handleABS(i);
         break;
      case OP_ADD:
         handleADD(i);
         break;
      case OP_RCP:
         handleRCP(i);
         break;
      case OP_MIN:
      case OP_MAX:
         handleMINMAX(i);
         break;
      case OP_SLCT:
         handleSLCT(i);
         break;
      case OP_AND:
      case OP_OR:
      case OP_XOR:
         handleLOGOP(i);
         break;
      case OP_CVT:
         handleCVT_NEG(i);
         if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
            handleCVT_EXTBF(i);
         break;
      case OP_SUCLAMP:
         handleSUCLAMP(i);
         break;
      default:
         break;
      }
   }

   return true;
}

// =============================================================================

1980 static inline void
1981 updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
1982 {
1983 if (offset != ldst->getSrc(0)->reg.data.offset) {
1984 if (ldst->getSrc(0)->refCount() > 1)
1985 ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
1986 ldst->getSrc(0)->reg.data.offset = offset;
1987 }
1988 }
1989
1990 // Combine loads and stores, forward stores to loads where possible.
1991 class MemoryOpt : public Pass
1992 {
1993 private:
1994 class Record
1995 {
1996 public:
1997 Record *next;
1998 Instruction *insn;
1999 const Value *rel[2];
2000 const Value *base;
2001 int32_t offset;
2002 int8_t fileIndex;
2003 uint8_t size;
2004 bool locked;
2005 Record *prev;
2006
2007 bool overlaps(const Instruction *ldst) const;
2008
2009 inline void link(Record **);
2010 inline void unlink(Record **);
2011 inline void set(const Instruction *ldst);
2012 };
2013
2014 public:
2015 MemoryOpt();
2016
2017 Record *loads[DATA_FILE_COUNT];
2018 Record *stores[DATA_FILE_COUNT];
2019
2020 MemoryPool recordPool;
2021
2022 private:
2023 virtual bool visit(BasicBlock *);
2024 bool runOpt(BasicBlock *);
2025
2026 Record **getList(const Instruction *);
2027
2028 Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;
2029
2030 // merge @insn into load/store instruction from @rec
2031 bool combineLd(Record *rec, Instruction *ld);
2032 bool combineSt(Record *rec, Instruction *st);
2033
2034 bool replaceLdFromLd(Instruction *ld, Record *ldRec);
2035 bool replaceLdFromSt(Instruction *ld, Record *stRec);
2036 bool replaceStFromSt(Instruction *restrict st, Record *stRec);
2037
2038 void addRecord(Instruction *ldst);
2039 void purgeRecords(Instruction *const st, DataFile);
2040 void lockStores(Instruction *const ld);
2041 void reset();
2042
2043 private:
2044 Record *prevRecord;
2045 };
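// Loads and stores are tracked per data file in loads[]/stores[]; a Record
// captures an access's base symbol, indirect sources, offset and size so
// that later ld/st in the same block can be matched against it.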
2046
2047 MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
2048 {
2049 for (int i = 0; i < DATA_FILE_COUNT; ++i) {
2050 loads[i] = NULL;
2051 stores[i] = NULL;
2052 }
2053 prevRecord = NULL;
2054 }
2055
2056 void
2057 MemoryOpt::reset()
2058 {
2059 for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
2060 Record *it, *next;
2061 for (it = loads[i]; it; it = next) {
2062 next = it->next;
2063 recordPool.release(it);
2064 }
2065 loads[i] = NULL;
2066 for (it = stores[i]; it; it = next) {
2067 next = it->next;
2068 recordPool.release(it);
2069 }
2070 stores[i] = NULL;
2071 }
2072 }
2073
2074 bool
2075 MemoryOpt::combineLd(Record *rec, Instruction *ld)
2076 {
2077 int32_t offRc = rec->offset;
2078 int32_t offLd = ld->getSrc(0)->reg.data.offset;
2079 int sizeRc = rec->size;
2080 int sizeLd = typeSizeof(ld->dType);
2081 int size = sizeRc + sizeLd;
2082 int d, j;
2083
2084 if (!prog->getTarget()->
2085 isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
2086 return false;
2087 // no unaligned loads
2088 if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
2089 ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
2090 return false;
2091
2092 assert(sizeRc + sizeLd <= 16 && offRc != offLd);
2093
2094 for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);
2095
2096 if (offLd < offRc) {
2097 int sz;
2098 for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
2099 // d: nr of definitions in ld
2100 // j: nr of definitions in rec->insn, move:
2101 for (d = d + j - 1; j > 0; --j, --d)
2102 rec->insn->setDef(d, rec->insn->getDef(j - 1));
2103
2104 if (rec->insn->getSrc(0)->refCount() > 1)
2105 rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
2106 rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;
2107
2108 d = 0;
2109 } else {
2110 d = j;
2111 }
2112 // move definitions of @ld to @rec->insn
2113 for (j = 0; sizeLd; ++j, ++d) {
2114 sizeLd -= ld->getDef(j)->reg.size;
2115 rec->insn->setDef(d, ld->getDef(j));
2116 }
2117
2118 rec->size = size;
2119 rec->insn->getSrc(0)->reg.size = size;
2120 rec->insn->setType(typeOfSize(size));
2121
2122 delete_Instruction(prog, ld);
2123
2124 return true;
2125 }
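// Illustrative example (made-up offsets and names): combining
//   ld u32 $r0 l[0x10]   (rec->insn)
//   ld u32 $r1 l[0x14]   (ld)
// yields a single
//   ld b64 { $r0 $r1 } l[0x10]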
2126
2127 bool
2128 MemoryOpt::combineSt(Record *rec, Instruction *st)
2129 {
2130 int32_t offRc = rec->offset;
2131 int32_t offSt = st->getSrc(0)->reg.data.offset;
2132 int sizeRc = rec->size;
2133 int sizeSt = typeSizeof(st->dType);
2134 int s = sizeSt / 4;
2135 int size = sizeRc + sizeSt;
2136 int j, k;
2137 Value *src[4]; // no modifiers in ValueRef allowed for st
2138 Value *extra[3];
2139
2140 if (!prog->getTarget()->
2141 isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
2142 return false;
2143 if (size == 8 && MIN2(offRc, offSt) & 0x7)
2144 return false;
2145
2146 st->takeExtraSources(0, extra); // save predicate and indirect address
2147
2148 if (offRc < offSt) {
2149 // save values from @st
2150 for (s = 0; sizeSt; ++s) {
2151 sizeSt -= st->getSrc(s + 1)->reg.size;
2152 src[s] = st->getSrc(s + 1);
2153 }
2154 // set record's values as low sources of @st
2155 for (j = 1; sizeRc; ++j) {
2156 sizeRc -= rec->insn->getSrc(j)->reg.size;
2157 st->setSrc(j, rec->insn->getSrc(j));
2158 }
2159 // set saved values as high sources of @st
2160 for (k = j, j = 0; j < s; ++j)
2161 st->setSrc(k++, src[j]);
2162
2163 updateLdStOffset(st, offRc, func);
2164 } else {
2165 for (j = 1; sizeSt; ++j)
2166 sizeSt -= st->getSrc(j)->reg.size;
2167 for (s = 1; sizeRc; ++j, ++s) {
2168 sizeRc -= rec->insn->getSrc(s)->reg.size;
2169 st->setSrc(j, rec->insn->getSrc(s));
2170 }
2171 rec->offset = offSt;
2172 }
2173 st->putExtraSources(0, extra); // restore pointer and predicate
2174
2175 delete_Instruction(prog, rec->insn);
2176 rec->insn = st;
2177 rec->size = size;
2178 rec->insn->getSrc(0)->reg.size = size;
2179 rec->insn->setType(typeOfSize(size));
2180 return true;
2181 }
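// The store analogue, again illustrative:
//   st u32 l[0x10] $r0
//   st u32 l[0x14] $r1
// becomes
//   st b64 l[0x10] { $r0 $r1 }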
2182
2183 void
2184 MemoryOpt::Record::set(const Instruction *ldst)
2185 {
2186 const Symbol *mem = ldst->getSrc(0)->asSym();
2187 fileIndex = mem->reg.fileIndex;
2188 rel[0] = ldst->getIndirect(0, 0);
2189 rel[1] = ldst->getIndirect(0, 1);
2190 offset = mem->reg.data.offset;
2191 base = mem->getBase();
2192 size = typeSizeof(ldst->sType);
2193 }
2194
2195 void
2196 MemoryOpt::Record::link(Record **list)
2197 {
2198 next = *list;
2199 if (next)
2200 next->prev = this;
2201 prev = NULL;
2202 *list = this;
2203 }
2204
2205 void
2206 MemoryOpt::Record::unlink(Record **list)
2207 {
2208 if (next)
2209 next->prev = prev;
2210 if (prev)
2211 prev->next = next;
2212 else
2213 *list = next;
2214 }
2215
2216 MemoryOpt::Record **
2217 MemoryOpt::getList(const Instruction *insn)
2218 {
2219 if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
2220 return &loads[insn->src(0).getFile()];
2221 return &stores[insn->src(0).getFile()];
2222 }
2223
2224 void
2225 MemoryOpt::addRecord(Instruction *i)
2226 {
2227 Record **list = getList(i);
2228 Record *it = reinterpret_cast<Record *>(recordPool.allocate());
2229
2230 it->link(list);
2231 it->set(i);
2232 it->insn = i;
2233 it->locked = false;
2234 }
2235
2236 MemoryOpt::Record *
2237 MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
2238 {
2239 const Symbol *sym = insn->getSrc(0)->asSym();
2240 const int size = typeSizeof(insn->sType);
2241 Record *rec = NULL;
2242 Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];
2243
2244 for (; it; it = it->next) {
2245 if (it->locked && insn->op != OP_LOAD)
2246 continue;
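// only consider records within the same aligned 16-byte window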
2247 if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
2248 it->rel[0] != insn->getIndirect(0, 0) ||
2249 it->fileIndex != sym->reg.fileIndex ||
2250 it->rel[1] != insn->getIndirect(0, 1))
2251 continue;
2252
2253 if (it->offset < sym->reg.data.offset) {
2254 if (it->offset + it->size >= sym->reg.data.offset) {
2255 isAdj = (it->offset + it->size == sym->reg.data.offset);
2256 if (!isAdj)
2257 return it;
2258 if (!(it->offset & 0x7))
2259 rec = it;
2260 }
2261 } else {
2262 isAdj = it->offset != sym->reg.data.offset;
2263 if (size <= it->size && !isAdj)
2264 return it;
2265 else
2266 if (!(sym->reg.data.offset & 0x7))
2267 if (it->offset - size <= sym->reg.data.offset)
2268 rec = it;
2269 }
2270 }
2271 return rec;
2272 }
2273
2274 bool
2275 MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
2276 {
2277 Instruction *st = rec->insn;
2278 int32_t offSt = rec->offset;
2279 int32_t offLd = ld->getSrc(0)->reg.data.offset;
2280 int d, s;
2281
2282 for (s = 1; offSt != offLd && st->srcExists(s); ++s)
2283 offSt += st->getSrc(s)->reg.size;
2284 if (offSt != offLd)
2285 return false;
2286
2287 for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
2288 if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
2289 return false;
2290 if (st->getSrc(s)->reg.file != FILE_GPR)
2291 return false;
2292 ld->def(d).replace(st->src(s), false);
2293 }
2294 ld->bb->remove(ld);
2295 return true;
2296 }
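// Store-to-load forwarding, illustratively: after
//   st u32 l[0x10] $r0
// a matching
//   ld u32 $r1 l[0x10]
// is removed, with uses of $r1 rewritten to read $r0.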
2297
2298 bool
2299 MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
2300 {
2301 Instruction *ldR = rec->insn;
2302 int32_t offR = rec->offset;
2303 int32_t offE = ldE->getSrc(0)->reg.data.offset;
2304 int dR, dE;
2305
2306 assert(offR <= offE);
2307 for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
2308 offR += ldR->getDef(dR)->reg.size;
2309 if (offR != offE)
2310 return false;
2311
2312 for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
2313 if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
2314 return false;
2315 ldE->def(dE).replace(ldR->getDef(dR), false);
2316 }
2317
2318 delete_Instruction(prog, ldE);
2319 return true;
2320 }
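// Redundant-load elimination, illustratively: a second
//   ld u32 $r2 l[0x10]
// following an earlier ld u32 $r0 l[0x10] is deleted, with uses of $r2
// rewritten to $r0.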
2321
2322 bool
2323 MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
2324 {
2325 const Instruction *const ri = rec->insn;
2326 Value *extra[3];
2327
2328 int32_t offS = st->getSrc(0)->reg.data.offset;
2329 int32_t offR = rec->offset;
2330 int32_t endS = offS + typeSizeof(st->dType);
2331 int32_t endR = offR + typeSizeof(ri->dType);
2332
2333 rec->size = MAX2(endS, endR) - MIN2(offS, offR);
2334
2335 st->takeExtraSources(0, extra);
2336
2337 if (offR < offS) {
2338 Value *vals[10];
2339 int s, n;
2340 int k = 0;
2341 // get non-replaced sources of ri
2342 for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
2343 vals[k++] = ri->getSrc(s);
2344 n = s;
2345 // get replaced sources of st
2346 for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
2347 vals[k++] = st->getSrc(s);
2348 // skip replaced sources of ri
2349 for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
2350 // get non-replaced sources after values covered by st
2351 for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
2352 vals[k++] = ri->getSrc(s);
2353 assert((unsigned int)k <= Elements(vals));
2354 for (s = 0; s < k; ++s)
2355 st->setSrc(s + 1, vals[s]);
2356 st->setSrc(0, ri->getSrc(0));
2357 } else
2358 if (endR > endS) {
2359 int j, s;
2360 for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
2361 for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
2362 for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
2363 st->setSrc(s++, ri->getSrc(j));
2364 }
2365 st->putExtraSources(0, extra);
2366
2367 delete_Instruction(prog, rec->insn);
2368
2369 rec->insn = st;
2370 rec->offset = st->getSrc(0)->reg.data.offset;
2371
2372 st->setType(typeOfSize(rec->size));
2373
2374 return true;
2375 }
2376
2377 bool
2378 MemoryOpt::Record::overlaps(const Instruction *ldst) const
2379 {
2380 Record that;
2381 that.set(ldst);
2382
2383 if (this->fileIndex != that.fileIndex)
2384 return false;
2385
2386 if (this->rel[0] || that.rel[0])
2387 return this->base == that.base;
2388 return
2389 (this->offset < that.offset + that.size) &&
2390 (this->offset + this->size > that.offset);
2391 }
2392
2393 // Once @ld has read a location, stores that affect its result must no
2394 // longer be eliminated when later stores to the same location appear,
2395 // nor may they be merged with later stores.
2396 // The stored value can, however, still be used to determine the value
2397 // returned by future loads.
2398 void
2399 MemoryOpt::lockStores(Instruction *const ld)
2400 {
2401 for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
2402 if (!r->locked && r->overlaps(ld))
2403 r->locked = true;
2404 }
2405
2406 // Prior loads from the location of @st are no longer valid.
2407 // Stores to the location of @st may no longer be used to derive
2408 // the value at it nor be coalesced into later stores.
2409 void
2410 MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
2411 {
2412 if (st)
2413 f = st->src(0).getFile();
2414
2415 for (Record *r = loads[f]; r; r = r->next)
2416 if (!st || r->overlaps(st))
2417 r->unlink(&loads[f]);
2418
2419 for (Record *r = stores[f]; r; r = r->next)
2420 if (!st || r->overlaps(st))
2421 r->unlink(&stores[f]);
2422 }
2423
2424 bool
2425 MemoryOpt::visit(BasicBlock *bb)
2426 {
2427 bool ret = runOpt(bb);
2428 // Run again: one pass won't combine four 32-bit ld/st into a single 128-bit
2429 // ld/st where 96-bit memory operations are forbidden.
2430 if (ret)
2431 ret = runOpt(bb);
2432 return ret;
2433 }
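// Illustration of why two passes help (assuming b128 access is legal for
// the file): pass one turns
//   ld u32 $r0 l[0x0] ; ld u32 $r1 l[0x4] ; ld u32 $r2 l[0x8] ; ld u32 $r3 l[0xc]
// into two b64 loads, rejecting the intermediate 96-bit combination, and
// pass two merges those into a single b128 load.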
2434
2435 bool
2436 MemoryOpt::runOpt(BasicBlock *bb)
2437 {
2438 Instruction *ldst, *next;
2439 Record *rec;
2440 bool isAdjacent = true;
2441
2442 for (ldst = bb->getEntry(); ldst; ldst = next) {
2443 bool keep = true;
2444 bool isLoad = true;
2445 next = ldst->next;
2446
2447 if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
2448 if (ldst->isDead()) {
2449 // might have been produced by earlier optimization
2450 delete_Instruction(prog, ldst);
2451 continue;
2452 }
2453 } else
2454 if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
2455 isLoad = false;
2456 } else {
2457 // TODO: maybe have all fixed ops act as barrier ?
2458 if (ldst->op == OP_CALL ||
2459 ldst->op == OP_BAR ||
2460 ldst->op == OP_MEMBAR) {
2461 purgeRecords(NULL, FILE_MEMORY_LOCAL);
2462 purgeRecords(NULL, FILE_MEMORY_GLOBAL);
2463 purgeRecords(NULL, FILE_MEMORY_SHARED);
2464 purgeRecords(NULL, FILE_SHADER_OUTPUT);
2465 } else
2466 if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
2467 if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
2468 purgeRecords(NULL, FILE_MEMORY_LOCAL);
2469 purgeRecords(NULL, FILE_MEMORY_GLOBAL);
2470 purgeRecords(NULL, FILE_MEMORY_SHARED);
2471 } else {
2472 purgeRecords(NULL, ldst->src(0).getFile());
2473 }
2474 } else
2475 if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
2476 purgeRecords(NULL, FILE_SHADER_OUTPUT);
2477 }
2478 continue;
2479 }
2480 if (ldst->getPredicate()) // TODO: handle predicated ld/st
2481 continue;
2482 if (ldst->perPatch) // TODO: create separate per-patch lists
2483 continue;
2484
2485 if (isLoad) {
2486 DataFile file = ldst->src(0).getFile();
2487
2488 // for a ld from l[]/g[], look for a previous store to eliminate the reload
2489 if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
2490 // TODO: shared memory ?
2491 rec = findRecord(ldst, false, isAdjacent);
2492 if (rec && !isAdjacent)
2493 keep = !replaceLdFromSt(ldst, rec);
2494 }
2495
2496 // or look for ld from the same location and replace this one
2497 rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
2498 if (rec) {
2499 if (!isAdjacent)
2500 keep = !replaceLdFromLd(ldst, rec);
2501 else
2502 // or combine a previous load with this one
2503 keep = !combineLd(rec, ldst);
2504 }
2505 if (keep)
2506 lockStores(ldst);
2507 } else {
2508 rec = findRecord(ldst, false, isAdjacent);
2509 if (rec) {
2510 if (!isAdjacent)
2511 keep = !replaceStFromSt(ldst, rec);
2512 else
2513 keep = !combineSt(rec, ldst);
2514 }
2515 if (keep)
2516 purgeRecords(ldst, DATA_FILE_COUNT);
2517 }
2518 if (keep)
2519 addRecord(ldst);
2520 }
2521 reset();
2522
2523 return true;
2524 }
2525
2526 // =============================================================================
2527
2528 // Turn control flow into predicated instructions (after register allocation !).
2529 // TODO:
2530 // Could move this to before register allocation on NVC0 and also handle nested
2531 // constructs.
2532 class FlatteningPass : public Pass
2533 {
2534 private:
2535 virtual bool visit(Function *);
2536 virtual bool visit(BasicBlock *);
2537
2538 bool tryPredicateConditional(BasicBlock *);
2539 void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
2540 void tryPropagateBranch(BasicBlock *);
2541 inline bool isConstantCondition(Value *pred);
2542 inline bool mayPredicate(const Instruction *, const Value *pred) const;
2543 inline void removeFlow(Instruction *);
2544
2545 uint8_t gpr_unit;
2546 };
2547
2548 bool
2549 FlatteningPass::isConstantCondition(Value *pred)
2550 {
2551 Instruction *insn = pred->getUniqueInsn();
2552 assert(insn);
2553 if (insn->op != OP_SET || insn->srcExists(2))
2554 return false;
2555
2556 for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
2557 Instruction *ld = insn->getSrc(s)->getUniqueInsn();
2558 DataFile file;
2559 if (ld) {
2560 if (ld->op != OP_MOV && ld->op != OP_LOAD)
2561 return false;
2562 if (ld->src(0).isIndirect(0))
2563 return false;
2564 file = ld->src(0).getFile();
2565 } else {
2566 file = insn->src(s).getFile();
2567 // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
2568 // in register "units", which can vary between targets.
2569 if (file == FILE_GPR) {
2570 Value *v = insn->getSrc(s);
2571 int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
2572 int units = bytes >> gpr_unit;
2573 if (units > prog->maxGPR)
2574 file = FILE_IMMEDIATE;
2575 }
2576 }
2577 if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
2578 return false;
2579 }
2580 return true;
2581 }
2582
2583 void
2584 FlatteningPass::removeFlow(Instruction *insn)
2585 {
2586 FlowInstruction *term = insn ? insn->asFlow() : NULL;
2587 if (!term)
2588 return;
2589 Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();
2590
2591 if (term->op == OP_BRA) {
2592 // TODO: this might get more difficult when we get arbitrary BRAs
2593 if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
2594 return;
2595 } else
2596 if (term->op != OP_JOIN)
2597 return;
2598
2599 Value *pred = term->getPredicate();
2600
2601 delete_Instruction(prog, term);
2602
2603 if (pred && pred->refCount() == 0) {
2604 Instruction *pSet = pred->getUniqueInsn();
2605 pred->join->reg.data.id = -1; // deallocate
2606 if (pSet->isDead())
2607 delete_Instruction(prog, pSet);
2608 }
2609 }
2610
2611 void
2612 FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
2613 {
2614 for (Instruction *i = bb->getEntry(); i; i = i->next) {
2615 if (i->isNop())
2616 continue;
2617 assert(!i->getPredicate());
2618 i->setPredicate(cc, pred);
2619 }
2620 removeFlow(bb->getExit());
2621 }
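// e.g. (illustrative syntax) with pred = $p0:
//   add f32 $r0 $r1 $r2   ->   @$p0 add f32 $r0 $r1 $r2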
2622
2623 bool
2624 FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
2625 {
2626 if (insn->isPseudo())
2627 return true;
2628 // TODO: calls where we don't know which registers are modified
2629
2630 if (!prog->getTarget()->mayPredicate(insn, pred))
2631 return false;
2632 for (int d = 0; insn->defExists(d); ++d)
2633 if (insn->getDef(d)->equals(pred))
2634 return false;
2635 return true;
2636 }
2637
2638 // If we jump to BRA/JOIN/EXIT, replace the jump with it.
2639 // NOTE: We do not update the CFG anymore here !
2640 //
2641 // TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
2642 // BB:0
2643 // @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
2644 // BB:1
2645 // bra BB:3
2646 // BB:2
2647 // ...
2648 // BB:3
2649 // ...
2650 void
2651 FlatteningPass::tryPropagateBranch(BasicBlock *bb)
2652 {
2653 for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
2654 BasicBlock *bf = i->asFlow()->target.bb;
2655
2656 if (bf->getInsnCount() != 1)
2657 continue;
2658
2659 FlowInstruction *bra = i->asFlow();
2660 FlowInstruction *rep = bf->getExit()->asFlow();
2661
2662 if (!rep || rep->getPredicate())
2663 continue;
2664 if (rep->op != OP_BRA &&
2665 rep->op != OP_JOIN &&
2666 rep->op != OP_EXIT)
2667 continue;
2668
2669 // TODO: If there are multiple branches to @rep, only the first would
2670 // be replaced, so only remove them after this pass is done ?
2671 // Also, need to check all incident blocks for fall-through exits and
2672 // add the branch there.
2673 bra->op = rep->op;
2674 bra->target.bb = rep->target.bb;
2675 if (bf->cfg.incidentCount() == 1)
2676 bf->remove(rep);
2677 }
2678 }
2679
2680 bool
2681 FlatteningPass::visit(Function *fn)
2682 {
2683 gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);
2684
2685 return true;
2686 }
2687
2688 bool
2689 FlatteningPass::visit(BasicBlock *bb)
2690 {
2691 if (tryPredicateConditional(bb))
2692 return true;
2693
2694 // try to attach join to previous instruction
2695 if (prog->getTarget()->hasJoin) {
2696 Instruction *insn = bb->getExit();
2697 if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
2698 insn = insn->prev;
2699 if (insn && !insn->getPredicate() &&
2700 !insn->asFlow() &&
2701 insn->op != OP_TEXBAR &&
2702 !isTextureOp(insn->op) && // probably just nve4
2703 !isSurfaceOp(insn->op) && // not confirmed
2704 insn->op != OP_LINTERP && // probably just nve4
2705 insn->op != OP_PINTERP && // probably just nve4
2706 ((insn->op != OP_LOAD && insn->op != OP_STORE) ||
2707 (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
2708 !insn->isNop()) {
2709 insn->join = 1;
2710 bb->remove(bb->getExit());
2711 return true;
2712 }
2713 }
2714 }
2715
2716 tryPropagateBranch(bb);
2717
2718 return true;
2719 }
2720
2721 bool
2722 FlatteningPass::tryPredicateConditional(BasicBlock *bb)
2723 {
2724 BasicBlock *bL = NULL, *bR = NULL;
2725 unsigned int nL = 0, nR = 0, limit = 12;
2726 Instruction *insn;
2727 unsigned int mask;
2728
2729 mask = bb->initiatesSimpleConditional();
2730 if (!mask)
2731 return false;
2732
2733 assert(bb->getExit());
2734 Value *pred = bb->getExit()->getPredicate();
2735 assert(pred);
2736
2737 if (isConstantCondition(pred))
2738 limit = 4;
2739
2740 Graph::EdgeIterator ei = bb->cfg.outgoing();
2741
2742 if (mask & 1) {
2743 bL = BasicBlock::get(ei.getNode());
2744 for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
2745 if (!mayPredicate(insn, pred))
2746 return false;
2747 if (nL > limit)
2748 return false; // too long, do a real branch
2749 }
2750 ei.next();
2751
2752 if (mask & 2) {
2753 bR = BasicBlock::get(ei.getNode());
2754 for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
2755 if (!mayPredicate(insn, pred))
2756 return false;
2757 if (nR > limit)
2758 return false; // too long, do a real branch
2759 }
2760
2761 if (bL)
2762 predicateInstructions(bL, pred, bb->getExit()->cc);
2763 if (bR)
2764 predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));
2765
2766 if (bb->joinAt) {
2767 bb->remove(bb->joinAt);
2768 bb->joinAt = NULL;
2769 }
2770 removeFlow(bb->getExit()); // delete the branch/join at the fork point
2771
2772 // remove potential join operations at the end of the conditional
2773 if (prog->getTarget()->joinAnterior) {
2774 bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
2775 if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
2776 removeFlow(bb->getEntry());
2777 }
2778
2779 return true;
2780 }
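// Net effect, sketched: with the fork's predicate $p0, one side's
// instructions run as @$p0 ..., the other side's as @!$p0 ..., and the
// branch and join at the fork point are removed.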
2781
2782 // =============================================================================
2783
2784 // Fold an immediate into MAD; must be done after register allocation due to
2785 // the constraint SDST == SSRC2.
2786 // TODO:
2787 // Does NVC0+ have other situations where this pass makes sense?
2788 class NV50PostRaConstantFolding : public Pass
2789 {
2790 private:
2791 virtual bool visit(BasicBlock *);
2792 };
2793
2794 bool
2795 NV50PostRaConstantFolding::visit(BasicBlock *bb)
2796 {
2797 Value *vtmp;
2798 Instruction *def;
2799
2800 for (Instruction *i = bb->getFirst(); i; i = i->next) {
2801 switch (i->op) {
2802 case OP_MAD:
2803 if (i->def(0).getFile() != FILE_GPR ||
2804 i->src(0).getFile() != FILE_GPR ||
2805 i->src(1).getFile() != FILE_GPR ||
2806 i->src(2).getFile() != FILE_GPR ||
2807 i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id ||
2808 !isFloatType(i->dType))
2809 break;
2810
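// presumably the immediate form's register fields are narrower, so only
// $r0..$r63 can be used with it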
2811 if (i->getDef(0)->reg.data.id >= 64 ||
2812 i->getSrc(0)->reg.data.id >= 64)
2813 break;
2814
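// the immediate encoding presumably leaves no room for a predicate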
2815 if (i->getPredicate())
2816 break;
2817
2818 def = i->getSrc(1)->getInsn();
2819 if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
2820 vtmp = i->getSrc(1);
2821 i->setSrc(1, def->getSrc(0));
2822
2823 /* There's no post-RA dead code elimination, so do it here
2824 * XXX: if we add more code-removing post-RA passes, we might
2825 * want to create a post-RA dead-code elim pass */
2826 if (vtmp->refCount() == 0)
2827 delete_Instruction(bb->getProgram(), def);
2828
2829 break;
2830 }
2831 break;
2832 default:
2833 break;
2834 }
2835 }
2836
2837 return true;
2838 }
2839
2840 // =============================================================================
2841
2842 // Common subexpression elimination. Stupid O(n^2) implementation.
2843 class LocalCSE : public Pass
2844 {
2845 private:
2846 virtual bool visit(BasicBlock *);
2847
2848 inline bool tryReplace(Instruction **, Instruction *);
2849
2850 DLList ops[OP_LAST + 1];
2851 };
2852
2853 class GlobalCSE : public Pass
2854 {
2855 private:
2856 virtual bool visit(BasicBlock *);
2857 };
2858
2859 bool
2860 Instruction::isActionEqual(const Instruction *that) const
2861 {
2862 if (this->op != that->op ||
2863 this->dType != that->dType ||
2864 this->sType != that->sType)
2865 return false;
2866 if (this->cc != that->cc)
2867 return false;
2868
2869 if (this->asTex()) {
2870 if (memcmp(&this->asTex()->tex,
2871 &that->asTex()->tex,
2872 sizeof(this->asTex()->tex)))
2873 return false;
2874 } else
2875 if (this->asCmp()) {
2876 if (this->asCmp()->setCond != that->asCmp()->setCond)
2877 return false;
2878 } else
2879 if (this->asFlow()) {
2880 return false;
2881 } else {
2882 if (this->ipa != that->ipa ||
2883 this->lanes != that->lanes ||
2884 this->perPatch != that->perPatch)
2885 return false;
2886 if (this->postFactor != that->postFactor)
2887 return false;
2888 }
2889
2890 if (this->subOp != that->subOp ||
2891 this->saturate != that->saturate ||
2892 this->rnd != that->rnd ||
2893 this->ftz != that->ftz ||
2894 this->dnz != that->dnz ||
2895 this->cache != that->cache ||
2896 this->mask != that->mask)
2897 return false;
2898
2899 return true;
2900 }
2901
2902 bool
2903 Instruction::isResultEqual(const Instruction *that) const
2904 {
2905 unsigned int d, s;
2906
2907 // NOTE: location of discard only affects tex with liveOnly and quadops
2908 if (!this->defExists(0) && this->op != OP_DISCARD)
2909 return false;
2910
2911 if (!isActionEqual(that))
2912 return false;
2913
2914 if (this->predSrc != that->predSrc)
2915 return false;
2916
2917 for (d = 0; this->defExists(d); ++d) {
2918 if (!that->defExists(d) ||
2919 !this->getDef(d)->equals(that->getDef(d), false))
2920 return false;
2921 }
2922 if (that->defExists(d))
2923 return false;
2924
2925 for (s = 0; this->srcExists(s); ++s) {
2926 if (!that->srcExists(s))
2927 return false;
2928 if (this->src(s).mod != that->src(s).mod)
2929 return false;
2930 if (!this->getSrc(s)->equals(that->getSrc(s), true))
2931 return false;
2932 }
2933 if (that->srcExists(s))
2934 return false;
2935
2936 if (op == OP_LOAD || op == OP_VFETCH) {
2937 switch (src(0).getFile()) {
2938 case FILE_MEMORY_CONST:
2939 case FILE_SHADER_INPUT:
2940 return true;
2941 case FILE_SHADER_OUTPUT:
2942 return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
2943 default:
2944 return false;
2945 }
2946 }
2947
2948 return true;
2949 }
2950
2951 // pull common expressions from the different incoming blocks into this one
2952 bool
2953 GlobalCSE::visit(BasicBlock *bb)
2954 {
2955 Instruction *phi, *next, *ik;
2956 int s;
2957
2958 // TODO: maybe do this with OP_UNION, too
2959
2960 for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
2961 next = phi->next;
2962 if (phi->getSrc(0)->refCount() > 1)
2963 continue;
2964 ik = phi->getSrc(0)->getInsn();
2965 if (!ik)
2966 continue; // probably a function input
2967 for (s = 1; phi->srcExists(s); ++s) {
2968 if (phi->getSrc(s)->refCount() > 1)
2969 break;
2970 if (!phi->getSrc(s)->getInsn() ||
2971 !phi->getSrc(s)->getInsn()->isResultEqual(ik))
2972 break;
2973 }
2974 if (!phi->srcExists(s)) {
2975 Instruction *entry = bb->getEntry();
2976 ik->bb->remove(ik);
2977 if (!entry || entry->op != OP_JOIN)
2978 bb->insertHead(ik);
2979 else
2980 bb->insertAfter(entry, ik);
2981 ik->setDef(0, phi->getDef(0));
2982 delete_Instruction(prog, phi);
2983 }
2984 }
2985
2986 return true;
2987 }
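// Illustrative case: both predecessors compute the same value,
//   BB:1: add f32 $r0 $r1 $r2     BB:2: add f32 $r3 $r1 $r2
//   BB:3: phi f32 $r4 $r0 $r3
// so one add is moved to the head of BB:3 to define $r4 directly, the phi
// is deleted, and the leftover duplicate is left for dead code elimination.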
2988
2989 bool
2990 LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
2991 {
2992 Instruction *old = *ptr;
2993
2994 // TODO: maybe relax this later (causes trouble with OP_UNION)
2995 if (i->isPredicated())
2996 return false;
2997
2998 if (!old->isResultEqual(i))
2999 return false;
3000
3001 for (int d = 0; old->defExists(d); ++d)
3002 old->def(d).replace(i->getDef(d), false);
3003 delete_Instruction(prog, old);
3004 *ptr = NULL;
3005 return true;
3006 }
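// e.g. (illustrative) the later of two identical "add f32 ..." in a block
// is deleted, with uses of its result rewritten to the earlier add's def.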
3007
3008 bool
3009 LocalCSE::visit(BasicBlock *bb)
3010 {
3011 unsigned int replaced;
3012
3013 do {
3014 Instruction *ir, *next;
3015
3016 replaced = 0;
3017
3018 // will need to know the order of instructions
3019 int serial = 0;
3020 for (ir = bb->getFirst(); ir; ir = ir->next)
3021 ir->serial = serial++;
3022
3023 for (ir = bb->getEntry(); ir; ir = next) {
3024 int s;
3025 Value *src = NULL;
3026
3027 next = ir->next;
3028
3029 if (ir->fixed) {
3030 ops[ir->op].insert(ir);
3031 continue;
3032 }
3033
3034 for (s = 0; ir->srcExists(s); ++s)
3035 if (ir->getSrc(s)->asLValue())
3036 if (!src || ir->getSrc(s)->refCount() < src->refCount())
3037 src = ir->getSrc(s);
3038
3039 if (src) {
3040 for (Value::UseIterator it = src->uses.begin();
3041 it != src->uses.end(); ++it) {
3042 Instruction *ik = (*it)->getInsn();
3043 if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
3044 if (tryReplace(&ir, ik))
3045 break;
3046 }
3047 } else {
3048 DLLIST_FOR_EACH(&ops[ir->op], iter)
3049 {
3050 Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
3051 if (tryReplace(&ir, ik))
3052 break;
3053 }
3054 }
3055
3056 if (ir)
3057 ops[ir->op].insert(ir);
3058 else
3059 ++replaced;
3060 }
3061 for (unsigned int i = 0; i <= OP_LAST; ++i)
3062 ops[i].clear();
3063
3064 } while (replaced);
3065
3066 return true;
3067 }
3068
3069 // =============================================================================
3070
3071 // Remove computations of unused values.
3072 class DeadCodeElim : public Pass
3073 {
3074 public:
3075 bool buryAll(Program *);
3076
3077 private:
3078 virtual bool visit(BasicBlock *);
3079
3080 void checkSplitLoad(Instruction *ld); // for partially dead loads
3081
3082 unsigned int deadCount;
3083 };
3084
3085 bool
3086 DeadCodeElim::buryAll(Program *prog)
3087 {
3088 do {
3089 deadCount = 0;
3090 if (!this->run(prog, false, false))
3091 return false;
3092 } while (deadCount);
3093
3094 return true;
3095 }
3096
3097 bool
3098 DeadCodeElim::visit(BasicBlock *bb)
3099 {
3100 Instruction *next;
3101
3102 for (Instruction *i = bb->getFirst(); i; i = next) {
3103 next = i->next;
3104 if (i->isDead()) {
3105 ++deadCount;
3106 delete_Instruction(prog, i);
3107 } else
3108 if (i->defExists(1) && (i->op == OP_VFETCH || i->op == OP_LOAD)) {
3109 checkSplitLoad(i);
3110 } else
3111 if (i->defExists(0) && !i->getDef(0)->refCount()) {
3112 if (i->op == OP_ATOM ||
3113 i->op == OP_SUREDP ||
3114 i->op == OP_SUREDB)
3115 i->setDef(0, NULL);
3116 }
3117 }
3118 return true;
3119 }
3120
3121 // Each load can go into up to 4 destinations, any of which might potentially
3122 // be dead (i.e. a hole). These can always be split into 2 loads, independent
3123 // of where the holes are. We find the first contiguous region, put it into
3124 // the first load, and then put the second contiguous region into the second
3125 // load. There can be at most 2 contiguous regions.
3126 //
3127 // Note that there are some restrictions, for example it's not possible to do
3128 // a 64-bit load that's not 64-bit aligned, so such a load has to be split
3129 // up. Also hardware doesn't support 96-bit loads, so those also have to be
3130 // split into a 64-bit and 32-bit load.
3131 void
3132 DeadCodeElim::checkSplitLoad(Instruction *ld1)
3133 {
3134 Instruction *ld2 = NULL; // can get at most 2 loads
3135 Value *def1[4];
3136 Value *def2[4];
3137 int32_t addr1, addr2;
3138 int32_t size1, size2;
3139 int d, n1, n2;
3140 uint32_t mask = 0xffffffff;
3141
3142 for (d = 0; ld1->defExists(d); ++d)
3143 if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
3144 mask &= ~(1 << d);
3145 if (mask == 0xffffffff)
3146 return;
3147
3148 addr1 = ld1->getSrc(0)->reg.data.offset;
3149 n1 = n2 = 0;
3150 size1 = size2 = 0;
3151
3152 // Compute address/width for first load
3153 for (d = 0; ld1->defExists(d); ++d) {
3154 if (mask & (1 << d)) {
3155 if (size1 && (addr1 & 0x7))
3156 break;
3157 def1[n1] = ld1->getDef(d);
3158 size1 += def1[n1++]->reg.size;
3159 } else
3160 if (!n1) {
3161 addr1 += ld1->getDef(d)->reg.size;
3162 } else {
3163 break;
3164 }
3165 }
3166
3167 // Scale back the size of the first load until it can be loaded. This
3168 // typically happens for TYPE_B96 loads.
3169 while (n1 &&
3170 !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
3171 typeOfSize(size1))) {
3172 size1 -= def1[--n1]->reg.size;
3173 d--;
3174 }
3175
3176 // Compute address/width for second load
3177 for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
3178 if (mask & (1 << d)) {
3179 assert(!size2 || !(addr2 & 0x7));
3180 def2[n2] = ld1->getDef(d);
3181 size2 += def2[n2++]->reg.size;
3182 } else if (!n2) {
3183 addr2 += ld1->getDef(d)->reg.size;
3185 } else {
3186 break;
3187 }
3188 }
3189
3190 // Make sure that we've processed all the values
3191 for (; ld1->defExists(d); ++d)
3192 assert(!(mask & (1 << d)));
3193
3194 updateLdStOffset(ld1, addr1, func);
3195 ld1->setType(typeOfSize(size1));
3196 for (d = 0; d < 4; ++d)
3197 ld1->setDef(d, (d < n1) ? def1[d] : NULL);
3198
3199 if (!n2)
3200 return;
3201
3202 ld2 = cloneShallow(func, ld1);
3203 updateLdStOffset(ld2, addr2, func);
3204 ld2->setType(typeOfSize(size2));
3205 for (d = 0; d < 4; ++d)
3206 ld2->setDef(d, (d < n2) ? def2[d] : NULL);
3207
3208 ld1->bb->insertAfter(ld1, ld2);
3209 }
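// Illustrative split (made-up offsets): a b128 load whose second 32-bit
// component is dead,
//   ld b128 { $r0 (dead) $r2 $r3 } l[0x0]
// becomes
//   ld u32 $r0 l[0x0]
//   ld b64 { $r2 $r3 } l[0x8]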
3210
3211 // =============================================================================
3212
3213 #define RUN_PASS(l, n, f) \
3214 if (level >= (l)) { \
3215 if (dbgFlags & NV50_IR_DEBUG_VERBOSE) \
3216 INFO("PEEPHOLE: %s\n", #n); \
3217 n pass; \
3218 if (!pass.f(this)) \
3219 return false; \
3220 }
3221
3222 bool
3223 Program::optimizeSSA(int level)
3224 {
3225 RUN_PASS(1, DeadCodeElim, buryAll);
3226 RUN_PASS(1, CopyPropagation, run);
3227 RUN_PASS(1, MergeSplits, run);
3228 RUN_PASS(2, GlobalCSE, run);
3229 RUN_PASS(1, LocalCSE, run);
3230 RUN_PASS(2, AlgebraicOpt, run);
3231 RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
3232 RUN_PASS(1, ConstantFolding, foldAll);
3233 RUN_PASS(1, LoadPropagation, run);
3234 RUN_PASS(1, IndirectPropagation, run);
3235 RUN_PASS(2, MemoryOpt, run);
3236 RUN_PASS(2, LocalCSE, run);
3237 RUN_PASS(0, DeadCodeElim, buryAll);
3238
3239 return true;
3240 }
3241
3242 bool
3243 Program::optimizePostRA(int level)
3244 {
3245 RUN_PASS(2, FlatteningPass, run);
3246 if (getTarget()->getChipset() < 0xc0)
3247 RUN_PASS(2, NV50PostRaConstantFolding, run);
3248
3249 return true;
3250 }
3251
3252 }