2 * Copyright 2011 Christoph Bumiller
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 #include "nv50_ir_target.h"
25 #include "nv50_ir_build_util.h"
28 #include "util/u_math.h"
34 Instruction::isNop() const
36 if (op
== OP_CONSTRAINT
|| op
== OP_PHI
)
38 if (terminator
|| join
) // XXX: should terminator imply flow ?
40 if (!fixed
&& op
== OP_NOP
)
43 if (def
[0].exists() && def
[0].rep()->reg
.data
.id
< 0) {
44 for (int d
= 1; defExists(d
); ++d
)
45 if (def
[d
].rep()->reg
.data
.id
>= 0)
46 WARN("part of vector result is unused !\n");
50 if (op
== OP_MOV
|| op
== OP_UNION
) {
51 if (!def
[0].rep()->equals(getSrc(0)))
54 if (!def
[0].rep()->equals(getSrc(1)))
62 bool Instruction::isDead() const
68 for (int d
= 0; defExists(d
); ++d
)
69 if (getDef(d
)->refCount() || getDef(d
)->reg
.data
.id
>= 0)
72 if (terminator
|| asFlow())
80 // =============================================================================
82 class CopyPropagation
: public Pass
85 virtual bool visit(BasicBlock
*);
88 // Propagate all MOVs forward to make subsequent optimization easier, except if
89 // the sources stem from a phi, in which case we don't want to mess up potential
90 // swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
92 CopyPropagation::visit(BasicBlock
*bb
)
94 Instruction
*mov
, *si
, *next
;
96 for (mov
= bb
->getEntry(); mov
; mov
= next
) {
98 if (mov
->op
!= OP_MOV
|| mov
->fixed
|| !mov
->getSrc(0)->asLValue())
100 si
= mov
->getSrc(0)->getInsn();
101 if (mov
->getDef(0)->reg
.data
.id
< 0 && si
&& si
->op
!= OP_PHI
) {
103 mov
->def
[0].replace(mov
->getSrc(0), false);
104 delete_Instruction(prog
, mov
);
110 // =============================================================================
112 class LoadPropagation
: public Pass
115 virtual bool visit(BasicBlock
*);
117 void checkSwapSrc01(Instruction
*);
119 bool isCSpaceLoad(Instruction
*);
120 bool isImmd32Load(Instruction
*);
124 LoadPropagation::isCSpaceLoad(Instruction
*ld
)
126 return ld
&& ld
->op
== OP_LOAD
&& ld
->src
[0].getFile() == FILE_MEMORY_CONST
;
130 LoadPropagation::isImmd32Load(Instruction
*ld
)
132 if (!ld
|| (ld
->op
!= OP_MOV
) || (typeSizeof(ld
->dType
) != 4))
134 return ld
->src
[0].getFile() == FILE_IMMEDIATE
;
138 LoadPropagation::checkSwapSrc01(Instruction
*insn
)
140 if (!prog
->getTarget()->getOpInfo(insn
).commutative
)
141 if (insn
->op
!= OP_SET
&& insn
->op
!= OP_SLCT
)
143 if (insn
->src
[1].getFile() != FILE_GPR
)
146 Instruction
*i0
= insn
->getSrc(0)->getInsn();
147 Instruction
*i1
= insn
->getSrc(1)->getInsn();
149 if (isCSpaceLoad(i0
)) {
150 if (!isCSpaceLoad(i1
))
151 insn
->swapSources(0, 1);
155 if (isImmd32Load(i0
)) {
156 if (!isCSpaceLoad(i1
) && !isImmd32Load(i1
))
157 insn
->swapSources(0, 1);
164 if (insn
->op
== OP_SET
)
165 insn
->asCmp()->setCond
= reverseCondCode(insn
->asCmp()->setCond
);
167 if (insn
->op
== OP_SLCT
)
168 insn
->asCmp()->setCond
= inverseCondCode(insn
->asCmp()->setCond
);
172 LoadPropagation::visit(BasicBlock
*bb
)
174 const Target
*targ
= prog
->getTarget();
177 for (Instruction
*i
= bb
->getEntry(); i
; i
= next
) {
183 for (int s
= 0; i
->srcExists(s
); ++s
) {
184 Instruction
*ld
= i
->getSrc(s
)->getInsn();
186 if (!ld
|| ld
->fixed
|| (ld
->op
!= OP_LOAD
&& ld
->op
!= OP_MOV
))
188 if (!targ
->insnCanLoad(i
, s
, ld
))
192 i
->setSrc(s
, ld
->getSrc(0));
193 if (ld
->src
[0].isIndirect(0))
194 i
->setIndirect(s
, 0, ld
->getIndirect(0, 0));
196 if (ld
->getDef(0)->refCount() == 0)
197 delete_Instruction(prog
, ld
);
203 // =============================================================================
205 // Evaluate constant expressions.
206 class ConstantFolding
: public Pass
209 bool foldAll(Program
*);
212 virtual bool visit(BasicBlock
*);
214 void expr(Instruction
*, ImmediateValue
*, ImmediateValue
*);
215 void opnd(Instruction
*, ImmediateValue
*, int s
);
217 void unary(Instruction
*, const ImmediateValue
&);
219 // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET
220 CmpInstruction
*findOriginForTestWithZero(Value
*);
222 unsigned int foldCount
;
227 // TODO: remember generated immediates and only revisit these
229 ConstantFolding::foldAll(Program
*prog
)
231 unsigned int iterCount
= 0;
236 } while (foldCount
&& ++iterCount
< 2);
241 ConstantFolding::visit(BasicBlock
*bb
)
243 Instruction
*i
, *next
;
245 for (i
= bb
->getEntry(); i
; i
= next
) {
247 if (i
->op
== OP_MOV
) // continue early, MOV appears frequently
250 ImmediateValue
*src0
= i
->src
[0].getImmediate();
251 ImmediateValue
*src1
= i
->src
[1].getImmediate();
266 ConstantFolding::findOriginForTestWithZero(Value
*value
)
270 Instruction
*insn
= value
->getInsn();
272 while (insn
&& insn
->op
!= OP_SET
) {
273 Instruction
*next
= NULL
;
278 next
= insn
->getSrc(0)->getInsn();
279 if (insn
->sType
!= next
->dType
)
283 next
= insn
->getSrc(0)->getInsn();
290 return insn
? insn
->asCmp() : NULL
;
294 Modifier::applyTo(ImmediateValue
& imm
) const
296 switch (imm
.reg
.type
) {
298 if (bits
& NV50_IR_MOD_ABS
)
299 imm
.reg
.data
.f32
= fabsf(imm
.reg
.data
.f32
);
300 if (bits
& NV50_IR_MOD_NEG
)
301 imm
.reg
.data
.f32
= -imm
.reg
.data
.f32
;
302 if (bits
& NV50_IR_MOD_SAT
) {
303 if (imm
.reg
.data
.f32
< 0.0f
)
304 imm
.reg
.data
.f32
= 0.0f
;
306 if (imm
.reg
.data
.f32
> 1.0f
)
307 imm
.reg
.data
.f32
= 1.0f
;
309 assert(!(bits
& NV50_IR_MOD_NOT
));
312 case TYPE_S8
: // NOTE: will be extended
315 case TYPE_U8
: // NOTE: treated as signed
318 if (bits
& NV50_IR_MOD_ABS
)
319 imm
.reg
.data
.s32
= (imm
.reg
.data
.s32
>= 0) ?
320 imm
.reg
.data
.s32
: -imm
.reg
.data
.s32
;
321 if (bits
& NV50_IR_MOD_NEG
)
322 imm
.reg
.data
.s32
= -imm
.reg
.data
.s32
;
323 if (bits
& NV50_IR_MOD_NOT
)
324 imm
.reg
.data
.s32
= ~imm
.reg
.data
.s32
;
328 if (bits
& NV50_IR_MOD_ABS
)
329 imm
.reg
.data
.f64
= fabs(imm
.reg
.data
.f64
);
330 if (bits
& NV50_IR_MOD_NEG
)
331 imm
.reg
.data
.f64
= -imm
.reg
.data
.f64
;
332 if (bits
& NV50_IR_MOD_SAT
) {
333 if (imm
.reg
.data
.f64
< 0.0)
334 imm
.reg
.data
.f64
= 0.0;
336 if (imm
.reg
.data
.f64
> 1.0)
337 imm
.reg
.data
.f64
= 1.0;
339 assert(!(bits
& NV50_IR_MOD_NOT
));
343 assert(!"invalid/unhandled type");
344 imm
.reg
.data
.u64
= 0;
350 Modifier::getOp() const
353 case NV50_IR_MOD_ABS
: return OP_ABS
;
354 case NV50_IR_MOD_NEG
: return OP_NEG
;
355 case NV50_IR_MOD_SAT
: return OP_SAT
;
356 case NV50_IR_MOD_NOT
: return OP_NOT
;
365 ConstantFolding::expr(Instruction
*i
,
366 ImmediateValue
*src0
, ImmediateValue
*src1
)
368 ImmediateValue
imm0(src0
, i
->sType
);
369 ImmediateValue
imm1(src1
, i
->sType
);
371 struct Storage
*const a
= &imm0
.reg
, *const b
= &imm1
.reg
;
373 i
->src
[0].mod
.applyTo(imm0
);
374 i
->src
[1].mod
.applyTo(imm1
);
380 if (i
->dnz
&& i
->dType
== TYPE_F32
) {
381 if (!isfinite(a
->data
.f32
))
383 if (!isfinite(b
->data
.f32
))
387 case TYPE_F32
: res
.data
.f32
= a
->data
.f32
* b
->data
.f32
; break;
388 case TYPE_F64
: res
.data
.f64
= a
->data
.f64
* b
->data
.f64
; break;
390 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
* b
->data
.u32
; break;
396 if (b
->data
.u32
== 0)
399 case TYPE_F32
: res
.data
.f32
= a
->data
.f32
/ b
->data
.f32
; break;
400 case TYPE_F64
: res
.data
.f64
= a
->data
.f64
/ b
->data
.f64
; break;
401 case TYPE_S32
: res
.data
.s32
= a
->data
.s32
/ b
->data
.s32
; break;
402 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
/ b
->data
.u32
; break;
409 case TYPE_F32
: res
.data
.f32
= a
->data
.f32
+ b
->data
.f32
; break;
410 case TYPE_F64
: res
.data
.f64
= a
->data
.f64
+ b
->data
.f64
; break;
412 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
+ b
->data
.u32
; break;
419 case TYPE_F32
: res
.data
.f32
= pow(a
->data
.f32
, b
->data
.f32
); break;
420 case TYPE_F64
: res
.data
.f64
= pow(a
->data
.f64
, b
->data
.f64
); break;
427 case TYPE_F32
: res
.data
.f32
= MAX2(a
->data
.f32
, b
->data
.f32
); break;
428 case TYPE_F64
: res
.data
.f64
= MAX2(a
->data
.f64
, b
->data
.f64
); break;
429 case TYPE_S32
: res
.data
.s32
= MAX2(a
->data
.s32
, b
->data
.s32
); break;
430 case TYPE_U32
: res
.data
.u32
= MAX2(a
->data
.u32
, b
->data
.u32
); break;
437 case TYPE_F32
: res
.data
.f32
= MIN2(a
->data
.f32
, b
->data
.f32
); break;
438 case TYPE_F64
: res
.data
.f64
= MIN2(a
->data
.f64
, b
->data
.f64
); break;
439 case TYPE_S32
: res
.data
.s32
= MIN2(a
->data
.s32
, b
->data
.s32
); break;
440 case TYPE_U32
: res
.data
.u32
= MIN2(a
->data
.u32
, b
->data
.u32
); break;
446 res
.data
.u64
= a
->data
.u64
& b
->data
.u64
;
449 res
.data
.u64
= a
->data
.u64
| b
->data
.u64
;
452 res
.data
.u64
= a
->data
.u64
^ b
->data
.u64
;
455 res
.data
.u32
= a
->data
.u32
<< b
->data
.u32
;
459 case TYPE_S32
: res
.data
.s32
= a
->data
.s32
>> b
->data
.u32
; break;
460 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
>> b
->data
.u32
; break;
466 if (a
->data
.u32
!= b
->data
.u32
)
468 res
.data
.u32
= a
->data
.u32
;
475 i
->src
[0].mod
= Modifier(0);
476 i
->src
[1].mod
= Modifier(0);
478 i
->setSrc(0, new_ImmediateValue(i
->bb
->getProgram(), res
.data
.u32
));
481 i
->getSrc(0)->reg
.data
= res
.data
;
483 if (i
->op
== OP_MAD
|| i
->op
== OP_FMA
) {
486 i
->setSrc(1, i
->getSrc(0));
487 i
->setSrc(0, i
->getSrc(2));
490 i
->src
[1].mod
= i
->src
[2].mod
;
492 src0
= i
->src
[0].getImmediate();
494 expr(i
, src0
, i
->getSrc(1)->asImm());
501 ConstantFolding::unary(Instruction
*i
, const ImmediateValue
&imm
)
505 if (i
->dType
!= TYPE_F32
)
508 case OP_NEG
: res
.data
.f32
= -imm
.reg
.data
.f32
; break;
509 case OP_ABS
: res
.data
.f32
= fabsf(imm
.reg
.data
.f32
); break;
510 case OP_RCP
: res
.data
.f32
= 1.0f
/ imm
.reg
.data
.f32
; break;
511 case OP_RSQ
: res
.data
.f32
= 1.0f
/ sqrtf(imm
.reg
.data
.f32
); break;
512 case OP_LG2
: res
.data
.f32
= log2f(imm
.reg
.data
.f32
); break;
513 case OP_EX2
: res
.data
.f32
= exp2f(imm
.reg
.data
.f32
); break;
514 case OP_SIN
: res
.data
.f32
= sinf(imm
.reg
.data
.f32
); break;
515 case OP_COS
: res
.data
.f32
= cosf(imm
.reg
.data
.f32
); break;
516 case OP_SQRT
: res
.data
.f32
= sqrtf(imm
.reg
.data
.f32
); break;
519 // these should be handled in subsequent OP_SIN/COS/EX2
520 res
.data
.f32
= imm
.reg
.data
.f32
;
526 i
->setSrc(0, new_ImmediateValue(i
->bb
->getProgram(), res
.data
.f32
));
527 i
->src
[0].mod
= Modifier(0);
531 ConstantFolding::opnd(Instruction
*i
, ImmediateValue
*src
, int s
)
534 const operation op
= i
->op
;
536 ImmediateValue
imm(src
, i
->sType
);
538 i
->src
[s
].mod
.applyTo(imm
);
542 if (i
->dType
== TYPE_F32
&& i
->getSrc(t
)->refCount() == 1) {
543 Instruction
*si
= i
->getSrc(t
)->getUniqueInsn();
545 if (si
&& si
->op
== OP_MUL
) {
546 float f
= imm
.reg
.data
.f32
;
548 if (si
->src
[1].getImmediate()) {
549 f
*= si
->src
[1].getImmediate()->reg
.data
.f32
;
550 si
->setSrc(1, new_ImmediateValue(prog
, f
));
551 i
->def
[0].replace(i
->getSrc(t
), false);
555 if (f
== 0.125f
) fac
= -3;
557 if (f
== 0.250f
) fac
= -2;
559 if (f
== 0.500f
) fac
= -1;
561 if (f
== 2.000f
) fac
= +1;
563 if (f
== 4.000f
) fac
= +2;
565 if (f
== 8.000f
) fac
= +3;
569 // FIXME: allowed & modifier
570 si
->postFactor
= fac
;
571 i
->def
[0].replace(i
->getSrc(t
), false);
577 if (imm
.isInteger(0)) {
579 i
->setSrc(0, i
->getSrc(s
));
582 if (imm
.isInteger(1) || imm
.isInteger(-1)) {
583 if (imm
.isNegative())
584 i
->src
[t
].mod
= i
->src
[t
].mod
^ Modifier(NV50_IR_MOD_NEG
);
585 i
->op
= i
->src
[t
].mod
.getOp();
587 i
->setSrc(0, i
->getSrc(1));
588 i
->src
[0].mod
= i
->src
[1].mod
;
595 if (imm
.isInteger(2) || imm
.isInteger(-2)) {
596 if (imm
.isNegative())
597 i
->src
[t
].mod
= i
->src
[t
].mod
^ Modifier(NV50_IR_MOD_NEG
);
599 i
->setSrc(s
, i
->getSrc(t
));
600 i
->src
[s
].mod
= i
->src
[t
].mod
;
602 if (!isFloatType(i
->sType
) && !imm
.isNegative() && imm
.isPow2()) {
605 i
->setSrc(1, new_ImmediateValue(prog
, imm
.reg
.data
.u32
));
609 if (imm
.isInteger(0)) {
611 i
->setSrc(0, i
->getSrc(1));
612 i
->src
[0].mod
= i
->src
[1].mod
;
615 i
->op
= i
->src
[0].mod
.getOp();
617 i
->src
[0].mod
= Modifier(0);
622 if (s
!= 1 || (i
->dType
!= TYPE_S32
&& i
->dType
!= TYPE_U32
))
624 bld
.setPosition(i
, false);
625 if (imm
.reg
.data
.u32
== 0) {
628 if (imm
.reg
.data
.u32
== 1) {
632 if (i
->dType
== TYPE_U32
&& imm
.isPow2()) {
634 i
->setSrc(1, bld
.mkImm(util_logbase2(imm
.reg
.data
.u32
)));
636 if (i
->dType
== TYPE_U32
) {
639 const uint32_t d
= imm
.reg
.data
.u32
;
642 uint32_t l
= util_logbase2(d
);
643 if (((uint32_t)1 << l
) < d
)
645 m
= (((uint64_t)1 << 32) * (((uint64_t)1 << l
) - d
)) / d
+ 1;
651 mul
= bld
.mkOp2(OP_MUL
, TYPE_U32
, tA
, i
->getSrc(0),
652 bld
.loadImm(NULL
, m
));
653 mul
->subOp
= NV50_IR_SUBOP_MUL_HIGH
;
654 bld
.mkOp2(OP_SUB
, TYPE_U32
, tB
, i
->getSrc(0), tA
);
657 bld
.mkOp2(OP_SHR
, TYPE_U32
, tA
, tB
, bld
.mkImm(r
));
660 tB
= s
? bld
.getSSA() : i
->getDef(0);
661 bld
.mkOp2(OP_ADD
, TYPE_U32
, tB
, mul
->getDef(0), tA
);
663 bld
.mkOp2(OP_SHR
, TYPE_U32
, i
->getDef(0), tB
, bld
.mkImm(s
));
665 delete_Instruction(prog
, i
);
667 if (imm
.reg
.data
.s32
== -1) {
673 const int32_t d
= imm
.reg
.data
.s32
;
675 int32_t l
= util_logbase2(static_cast<unsigned>(abs(d
)));
676 if ((1 << l
) < abs(d
))
680 m
= ((uint64_t)1 << (32 + l
- 1)) / abs(d
) + 1 - ((uint64_t)1 << 32);
684 bld
.mkOp3(OP_MAD
, TYPE_S32
, tA
, i
->getSrc(0), bld
.loadImm(NULL
, m
),
685 i
->getSrc(0))->subOp
= NV50_IR_SUBOP_MUL_HIGH
;
687 bld
.mkOp2(OP_SHR
, TYPE_S32
, tB
, tA
, bld
.mkImm(l
- 1));
691 bld
.mkCmp(OP_SET
, CC_LT
, TYPE_S32
, tA
, i
->getSrc(0), bld
.mkImm(0));
692 tD
= (d
< 0) ? bld
.getSSA() : i
->getDef(0)->asLValue();
693 bld
.mkOp2(OP_SUB
, TYPE_U32
, tD
, tB
, tA
);
695 bld
.mkOp1(OP_NEG
, TYPE_S32
, i
->getDef(0), tB
);
697 delete_Instruction(prog
, i
);
701 case OP_SET
: // TODO: SET_AND,OR,XOR
703 CmpInstruction
*si
= findOriginForTestWithZero(i
->getSrc(t
));
705 if (i
->src
[t
].mod
!= Modifier(0))
707 if (imm
.reg
.data
.u32
!= 0 || !si
|| si
->op
!= OP_SET
)
710 ccZ
= (CondCode
)((unsigned int)i
->asCmp()->setCond
& ~CC_U
);
712 ccZ
= reverseCondCode(ccZ
);
714 case CC_LT
: cc
= CC_FL
; break;
715 case CC_GE
: cc
= CC_TR
; break;
716 case CC_EQ
: cc
= inverseCondCode(cc
); break;
717 case CC_LE
: cc
= inverseCondCode(cc
); break;
723 i
->asCmp()->setCond
= cc
;
724 i
->setSrc(0, si
->src
[0]);
725 i
->setSrc(1, si
->src
[1]);
726 i
->sType
= si
->sType
;
732 if (s
!= 1 || i
->src
[0].mod
!= Modifier(0))
734 // try to concatenate shifts
735 Instruction
*si
= i
->getSrc(0)->getInsn();
737 si
->op
!= OP_SHL
|| si
->src
[1].mod
!= Modifier(0))
739 ImmediateValue
*siImm
= si
->src
[1].getImmediate();
741 bld
.setPosition(i
, false);
742 i
->setSrc(0, si
->getSrc(0));
743 i
->setSrc(1, bld
.loadImm(NULL
,
744 imm
.reg
.data
.u32
+ siImm
->reg
.data
.u32
));
769 // =============================================================================
771 // Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
772 class ModifierFolding
: public Pass
775 virtual bool visit(BasicBlock
*);
779 ModifierFolding::visit(BasicBlock
*bb
)
781 const Target
*target
= prog
->getTarget();
783 Instruction
*i
, *next
, *mi
;
786 for (i
= bb
->getEntry(); i
; i
= next
) {
789 if (0 && i
->op
== OP_SUB
) {
790 // turn "sub" into "add neg" (do we really want this ?)
792 i
->src
[0].mod
= i
->src
[0].mod
^ Modifier(NV50_IR_MOD_NEG
);
795 for (int s
= 0; s
< 3 && i
->srcExists(s
); ++s
) {
796 mi
= i
->getSrc(s
)->getInsn();
798 mi
->predSrc
>= 0 || mi
->getDef(0)->refCount() > 8)
800 if (i
->sType
== TYPE_U32
&& mi
->dType
== TYPE_S32
) {
801 if ((i
->op
!= OP_ADD
&&
807 if (i
->sType
!= mi
->dType
) {
810 if ((mod
= Modifier(mi
->op
)) == Modifier(0))
812 mod
= mod
* mi
->src
[0].mod
;
814 if ((i
->op
== OP_ABS
) || i
->src
[s
].mod
.abs()) {
815 // abs neg [abs] = abs
816 mod
= mod
& Modifier(~(NV50_IR_MOD_NEG
| NV50_IR_MOD_ABS
));
818 if ((i
->op
== OP_NEG
) && mod
.neg()) {
820 // neg as both opcode and modifier on same insn is prohibited
821 // neg neg abs = abs, neg neg = identity
822 mod
= mod
& Modifier(~NV50_IR_MOD_NEG
);
824 mod
= mod
& Modifier(~NV50_IR_MOD_ABS
);
825 if (mod
== Modifier(0))
829 if (target
->isModSupported(i
, s
, mod
)) {
830 i
->setSrc(s
, mi
->getSrc(0));
831 i
->src
[s
].mod
= i
->src
[s
].mod
* mod
;
835 if (i
->op
== OP_SAT
) {
836 mi
= i
->getSrc(0)->getInsn();
838 mi
->getDef(0)->refCount() <= 1 && target
->isSatSupported(mi
)) {
840 mi
->setDef(0, i
->getDef(0));
841 delete_Instruction(prog
, i
);
849 // =============================================================================
851 // MUL + ADD -> MAD/FMA
852 // MIN/MAX(a, a) -> a, etc.
853 // SLCT(a, b, const) -> cc(const) ? a : b
855 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
856 class AlgebraicOpt
: public Pass
859 virtual bool visit(BasicBlock
*);
861 void handleADD(Instruction
*);
862 void handleMINMAX(Instruction
*);
863 void handleRCP(Instruction
*);
864 void handleSLCT(Instruction
*);
865 void handleLOGOP(Instruction
*);
866 void handleCVT(Instruction
*);
870 AlgebraicOpt::handleADD(Instruction
*add
)
872 Value
*src0
= add
->getSrc(0);
873 Value
*src1
= add
->getSrc(1);
878 if (!prog
->getTarget()->isOpSupported(OP_MAD
, add
->dType
))
881 if (src0
->reg
.file
!= FILE_GPR
|| src1
->reg
.file
!= FILE_GPR
)
884 if (src0
->refCount() == 1 &&
885 src0
->getUniqueInsn() && src0
->getUniqueInsn()->op
== OP_MUL
)
888 if (src1
->refCount() == 1 &&
889 src1
->getUniqueInsn() && src1
->getUniqueInsn()->op
== OP_MUL
)
894 if ((src0
->getUniqueInsn() && src0
->getUniqueInsn()->bb
!= add
->bb
) ||
895 (src1
->getUniqueInsn() && src1
->getUniqueInsn()->bb
!= add
->bb
))
898 src
= add
->getSrc(s
);
900 mod
[0] = add
->src
[0].mod
;
901 mod
[1] = add
->src
[1].mod
;
902 mod
[2] = src
->getUniqueInsn()->src
[0].mod
;
903 mod
[3] = src
->getUniqueInsn()->src
[1].mod
;
905 if (((mod
[0] | mod
[1]) | (mod
[2] | mod
[3])) & Modifier(~NV50_IR_MOD_NEG
))
909 add
->subOp
= src
->getInsn()->subOp
; // potentially mul-high
911 add
->setSrc(2, add
->src
[s
? 0 : 1]);
913 add
->setSrc(0, src
->getInsn()->getSrc(0));
914 add
->src
[0].mod
= mod
[2] ^ mod
[s
];
915 add
->setSrc(1, src
->getInsn()->getSrc(1));
916 add
->src
[1].mod
= mod
[3];
920 AlgebraicOpt::handleMINMAX(Instruction
*minmax
)
922 Value
*src0
= minmax
->getSrc(0);
923 Value
*src1
= minmax
->getSrc(1);
925 if (src0
!= src1
|| src0
->reg
.file
!= FILE_GPR
)
927 if (minmax
->src
[0].mod
== minmax
->src
[1].mod
) {
928 if (minmax
->src
[0].mod
) {
930 minmax
->setSrc(1, NULL
);
932 minmax
->def
[0].replace(minmax
->getSrc(0), false);
933 minmax
->bb
->remove(minmax
);
937 // min(x, -x) = -abs(x)
938 // min(x, -abs(x)) = -abs(x)
939 // min(x, abs(x)) = x
940 // max(x, -abs(x)) = x
941 // max(x, abs(x)) = abs(x)
942 // max(x, -x) = abs(x)
947 AlgebraicOpt::handleRCP(Instruction
*rcp
)
949 Instruction
*si
= rcp
->getSrc(0)->getUniqueInsn();
951 if (si
&& si
->op
== OP_RCP
) {
952 Modifier mod
= rcp
->src
[0].mod
* si
->src
[0].mod
;
953 rcp
->op
= mod
.getOp();
954 rcp
->setSrc(0, si
->getSrc(0));
959 AlgebraicOpt::handleSLCT(Instruction
*slct
)
961 if (slct
->getSrc(2)->reg
.file
== FILE_IMMEDIATE
) {
962 if (slct
->getSrc(2)->asImm()->compare(slct
->asCmp()->setCond
, 0.0f
))
963 slct
->setSrc(0, slct
->getSrc(1));
965 if (slct
->getSrc(0) != slct
->getSrc(1)) {
969 slct
->setSrc(1, NULL
);
970 slct
->setSrc(2, NULL
);
974 AlgebraicOpt::handleLOGOP(Instruction
*logop
)
976 Value
*src0
= logop
->getSrc(0);
977 Value
*src1
= logop
->getSrc(1);
979 if (src0
->reg
.file
!= FILE_GPR
|| src1
->reg
.file
!= FILE_GPR
)
983 if (logop
->src
[0].mod
!= Modifier(0) ||
984 logop
->src
[1].mod
!= Modifier(0))
986 if (logop
->op
== OP_AND
|| logop
->op
== OP_OR
) {
987 logop
->def
[0].replace(logop
->getSrc(0), false);
988 delete_Instruction(prog
, logop
);
991 // try AND(SET, SET) -> SET_AND(SET)
992 Instruction
*set0
= src0
->getInsn();
993 Instruction
*set1
= src1
->getInsn();
995 if (!set0
|| set0
->fixed
|| !set1
|| set1
->fixed
)
997 if (set1
->op
!= OP_SET
) {
998 Instruction
*xchg
= set0
;
1001 if (set1
->op
!= OP_SET
)
1004 if (set0
->op
!= OP_SET
&&
1005 set0
->op
!= OP_SET_AND
&&
1006 set0
->op
!= OP_SET_OR
&&
1007 set0
->op
!= OP_SET_XOR
)
1009 if (set0
->getDef(0)->refCount() > 1 &&
1010 set1
->getDef(0)->refCount() > 1)
1012 if (set0
->getPredicate() || set1
->getPredicate())
1014 // check that they don't source each other
1015 for (int s
= 0; s
< 2; ++s
)
1016 if (set0
->getSrc(s
) == set1
->getDef(0) ||
1017 set1
->getSrc(s
) == set0
->getDef(0))
1020 set0
= set0
->clone(true);
1021 set1
= set1
->clone(false);
1022 logop
->bb
->insertAfter(logop
, set1
);
1023 logop
->bb
->insertAfter(logop
, set0
);
1025 set0
->dType
= TYPE_U8
;
1026 set0
->getDef(0)->reg
.file
= FILE_PREDICATE
;
1027 set0
->getDef(0)->reg
.size
= 1;
1028 set1
->setSrc(2, set0
->getDef(0));
1029 switch (logop
->op
) {
1030 case OP_AND
: set1
->op
= OP_SET_AND
; break;
1031 case OP_OR
: set1
->op
= OP_SET_OR
; break;
1032 case OP_XOR
: set1
->op
= OP_SET_XOR
; break;
1037 set1
->setDef(0, logop
->getDef(0));
1038 delete_Instruction(prog
, logop
);
1042 // F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
1044 AlgebraicOpt::handleCVT(Instruction
*cvt
)
1046 if (cvt
->sType
!= TYPE_F32
||
1047 cvt
->dType
!= TYPE_S32
|| cvt
->src
[0].mod
!= Modifier(0))
1049 Instruction
*insn
= cvt
->getSrc(0)->getInsn();
1050 if (!insn
|| insn
->op
!= OP_NEG
|| insn
->dType
!= TYPE_F32
)
1052 if (insn
->src
[0].mod
!= Modifier(0))
1054 insn
= insn
->getSrc(0)->getInsn();
1055 if (!insn
|| insn
->op
!= OP_SET
|| insn
->dType
!= TYPE_F32
)
1058 Instruction
*bset
= insn
->clone(false);
1059 bset
->dType
= TYPE_U32
;
1060 bset
->setDef(0, cvt
->getDef(0));
1061 cvt
->bb
->insertAfter(cvt
, bset
);
1062 delete_Instruction(prog
, cvt
);
1066 AlgebraicOpt::visit(BasicBlock
*bb
)
1069 for (Instruction
*i
= bb
->getEntry(); i
; i
= next
) {
1101 // =============================================================================
1104 updateLdStOffset(Instruction
*ldst
, int32_t offset
, Function
*fn
)
1106 if (offset
!= ldst
->getSrc(0)->reg
.data
.offset
) {
1107 if (ldst
->getSrc(0)->refCount() > 1)
1108 ldst
->setSrc(0, ldst
->getSrc(0)->clone(fn
));
1109 ldst
->getSrc(0)->reg
.data
.offset
= offset
;
1113 // Combine loads and stores, forward stores to loads where possible.
1114 class MemoryOpt
: public Pass
1122 const Value
*rel
[2];
1130 bool overlaps(const Instruction
*ldst
) const;
1132 inline void link(Record
**);
1133 inline void unlink(Record
**);
1134 inline void set(const Instruction
*ldst
);
1140 Record
*loads
[DATA_FILE_COUNT
];
1141 Record
*stores
[DATA_FILE_COUNT
];
1143 MemoryPool recordPool
;
1146 virtual bool visit(BasicBlock
*);
1147 bool runOpt(BasicBlock
*);
1149 Record
**getList(const Instruction
*);
1151 Record
*findRecord(const Instruction
*, bool load
, bool& isAdjacent
) const;
1153 // merge @insn into load/store instruction from @rec
1154 bool combineLd(Record
*rec
, Instruction
*ld
);
1155 bool combineSt(Record
*rec
, Instruction
*st
);
1157 bool replaceLdFromLd(Instruction
*ld
, Record
*ldRec
);
1158 bool replaceLdFromSt(Instruction
*ld
, Record
*stRec
);
1159 bool replaceStFromSt(Instruction
*restrict st
, Record
*stRec
);
1161 void addRecord(Instruction
*ldst
);
1162 void purgeRecords(Instruction
*const st
, DataFile
);
1163 void lockStores(Instruction
*const ld
);
1170 MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record
), 6)
1172 for (int i
= 0; i
< DATA_FILE_COUNT
; ++i
) {
1182 for (unsigned int i
= 0; i
< DATA_FILE_COUNT
; ++i
) {
1184 for (it
= loads
[i
]; it
; it
= next
) {
1186 recordPool
.release(it
);
1189 for (it
= stores
[i
]; it
; it
= next
) {
1191 recordPool
.release(it
);
1198 MemoryOpt::combineLd(Record
*rec
, Instruction
*ld
)
1200 int32_t offRc
= rec
->offset
;
1201 int32_t offLd
= ld
->getSrc(0)->reg
.data
.offset
;
1202 int sizeRc
= rec
->size
;
1203 int sizeLd
= typeSizeof(ld
->dType
);
1204 int size
= sizeRc
+ sizeLd
;
1207 // only VFETCH can do a 96 byte load
1208 if (ld
->op
!= OP_VFETCH
&& size
== 12)
1210 // no unaligned loads
1211 if (((size
== 0x8) && (MIN2(offLd
, offRc
) & 0x7)) ||
1212 ((size
== 0xc) && (MIN2(offLd
, offRc
) & 0xf)))
1215 assert(sizeRc
+ sizeLd
<= 16 && offRc
!= offLd
);
1217 for (j
= 0; sizeRc
; sizeRc
-= rec
->insn
->getDef(j
)->reg
.size
, ++j
);
1219 if (offLd
< offRc
) {
1221 for (sz
= 0, d
= 0; sz
< sizeLd
; sz
+= ld
->getDef(d
)->reg
.size
, ++d
);
1222 // d: nr of definitions in ld
1223 // j: nr of definitions in rec->insn, move:
1224 for (d
= d
+ j
- 1; j
> 0; --j
, --d
)
1225 rec
->insn
->setDef(d
, rec
->insn
->getDef(j
- 1));
1227 if (rec
->insn
->getSrc(0)->refCount() > 1)
1228 rec
->insn
->setSrc(0, rec
->insn
->getSrc(0)->clone(func
));
1229 rec
->offset
= rec
->insn
->getSrc(0)->reg
.data
.offset
= offLd
;
1235 // move definitions of @ld to @rec->insn
1236 for (j
= 0; sizeLd
; ++j
, ++d
) {
1237 sizeLd
-= ld
->getDef(j
)->reg
.size
;
1238 rec
->insn
->setDef(d
, ld
->getDef(j
));
1242 rec
->insn
->setType(typeOfSize(size
));
1244 delete_Instruction(prog
, ld
);
1250 MemoryOpt::combineSt(Record
*rec
, Instruction
*st
)
1252 int32_t offRc
= rec
->offset
;
1253 int32_t offSt
= st
->getSrc(0)->reg
.data
.offset
;
1254 int sizeRc
= rec
->size
;
1255 int sizeSt
= typeSizeof(st
->dType
);
1257 int size
= sizeRc
+ sizeSt
;
1259 Value
*src
[4]; // no modifiers in ValueRef allowed for st
1262 if (size
== 12) // XXX: check if EXPORT a[] can do this after all
1264 if (size
== 8 && MIN2(offRc
, offSt
) & 0x7)
1267 st
->takeExtraSources(0, extra
); // save predicate and indirect address
1269 if (offRc
< offSt
) {
1270 // save values from @st
1271 for (s
= 0; sizeSt
; ++s
) {
1272 sizeSt
-= st
->getSrc(s
+ 1)->reg
.size
;
1273 src
[s
] = st
->getSrc(s
+ 1);
1275 // set record's values as low sources of @st
1276 for (j
= 1; sizeRc
; ++j
) {
1277 sizeRc
-= st
->getSrc(j
)->reg
.size
;
1278 st
->setSrc(j
, rec
->insn
->getSrc(j
));
1280 // set saved values as high sources of @st
1281 for (k
= j
, j
= 0; j
< s
; ++j
)
1282 st
->setSrc(k
++, src
[j
]);
1284 updateLdStOffset(st
, offRc
, func
);
1286 for (j
= 1; sizeSt
; ++j
)
1287 sizeSt
-= st
->getSrc(j
)->reg
.size
;
1288 for (s
= 1; sizeRc
; ++j
, ++s
) {
1289 sizeRc
-= rec
->insn
->getSrc(s
)->reg
.size
;
1290 st
->setSrc(j
, rec
->insn
->getSrc(s
));
1292 rec
->offset
= offSt
;
1294 st
->putExtraSources(0, extra
); // restore pointer and predicate
1296 delete_Instruction(prog
, rec
->insn
);
1299 rec
->insn
->setType(typeOfSize(size
));
1304 MemoryOpt::Record::set(const Instruction
*ldst
)
1306 const Symbol
*mem
= ldst
->getSrc(0)->asSym();
1307 fileIndex
= mem
->reg
.fileIndex
;
1308 rel
[0] = ldst
->getIndirect(0, 0);
1309 rel
[1] = ldst
->getIndirect(0, 1);
1310 offset
= mem
->reg
.data
.offset
;
1311 base
= mem
->getBase();
1312 size
= typeSizeof(ldst
->sType
);
1316 MemoryOpt::Record::link(Record
**list
)
1326 MemoryOpt::Record::unlink(Record
**list
)
1336 MemoryOpt::Record
**
1337 MemoryOpt::getList(const Instruction
*insn
)
1339 if (insn
->op
== OP_LOAD
|| insn
->op
== OP_VFETCH
)
1340 return &loads
[insn
->src
[0].getFile()];
1341 return &stores
[insn
->src
[0].getFile()];
1345 MemoryOpt::addRecord(Instruction
*i
)
1347 Record
**list
= getList(i
);
1348 Record
*it
= reinterpret_cast<Record
*>(recordPool
.allocate());
1357 MemoryOpt::findRecord(const Instruction
*insn
, bool load
, bool& isAdj
) const
1359 const Symbol
*sym
= insn
->getSrc(0)->asSym();
1360 const int size
= typeSizeof(insn
->sType
);
1362 Record
*it
= load
? loads
[sym
->reg
.file
] : stores
[sym
->reg
.file
];
1364 for (; it
; it
= it
->next
) {
1365 if (it
->locked
&& insn
->op
!= OP_LOAD
)
1367 if ((it
->offset
>> 4) != (sym
->reg
.data
.offset
>> 4) ||
1368 it
->rel
[0] != insn
->getIndirect(0, 0) ||
1369 it
->fileIndex
!= sym
->reg
.fileIndex
||
1370 it
->rel
[1] != insn
->getIndirect(0, 1))
1373 if (it
->offset
< sym
->reg
.data
.offset
) {
1374 if (it
->offset
+ it
->size
>= sym
->reg
.data
.offset
) {
1375 isAdj
= (it
->offset
+ it
->size
== sym
->reg
.data
.offset
);
1378 if (!(it
->offset
& 0x7))
1382 isAdj
= it
->offset
!= sym
->reg
.data
.offset
;
1383 if (size
<= it
->size
&& !isAdj
)
1386 if (!(sym
->reg
.data
.offset
& 0x7))
1387 if (it
->offset
- size
<= sym
->reg
.data
.offset
)
1395 MemoryOpt::replaceLdFromSt(Instruction
*ld
, Record
*rec
)
1397 Instruction
*st
= rec
->insn
;
1398 int32_t offSt
= rec
->offset
;
1399 int32_t offLd
= ld
->getSrc(0)->reg
.data
.offset
;
1402 for (s
= 1; offSt
!= offLd
&& st
->srcExists(s
); ++s
)
1403 offSt
+= st
->getSrc(s
)->reg
.size
;
1407 for (d
= 0; ld
->defExists(d
) && st
->srcExists(s
); ++d
, ++s
) {
1408 if (ld
->getDef(d
)->reg
.size
!= st
->getSrc(s
)->reg
.size
)
1410 if (st
->getSrc(s
)->reg
.file
!= FILE_GPR
)
1412 ld
->def
[d
].replace(st
->getSrc(s
), false);
1419 MemoryOpt::replaceLdFromLd(Instruction
*ldE
, Record
*rec
)
1421 Instruction
*ldR
= rec
->insn
;
1422 int32_t offR
= rec
->offset
;
1423 int32_t offE
= ldE
->getSrc(0)->reg
.data
.offset
;
1426 assert(offR
<= offE
);
1427 for (dR
= 0; offR
< offE
&& ldR
->defExists(dR
); ++dR
)
1428 offR
+= ldR
->getDef(dR
)->reg
.size
;
1432 for (dE
= 0; ldE
->defExists(dE
) && ldR
->defExists(dR
); ++dE
, ++dR
) {
1433 if (ldE
->getDef(dE
)->reg
.size
!= ldR
->getDef(dR
)->reg
.size
)
1435 ldE
->def
[dE
].replace(ldR
->getDef(dR
), false);
1438 delete_Instruction(prog
, ldE
);
1443 MemoryOpt::replaceStFromSt(Instruction
*restrict st
, Record
*rec
)
1445 const Instruction
*const ri
= rec
->insn
;
1448 int32_t offS
= st
->getSrc(0)->reg
.data
.offset
;
1449 int32_t offR
= rec
->offset
;
1450 int32_t endS
= offS
+ typeSizeof(st
->dType
);
1451 int32_t endR
= offR
+ typeSizeof(ri
->dType
);
1453 rec
->size
= MAX2(endS
, endR
) - MIN2(offS
, offR
);
1455 st
->takeExtraSources(0, extra
);
1461 // get non-replaced sources of ri
1462 for (s
= 1; offR
< offS
; offR
+= ri
->getSrc(s
)->reg
.size
, ++s
)
1463 vals
[k
++] = ri
->getSrc(s
);
1465 // get replaced sources of st
1466 for (s
= 1; st
->srcExists(s
); offS
+= st
->getSrc(s
)->reg
.size
, ++s
)
1467 vals
[k
++] = st
->getSrc(s
);
1468 // skip replaced sources of ri
1469 for (s
= n
; offR
< endS
; offR
+= ri
->getSrc(s
)->reg
.size
, ++s
);
1470 // get non-replaced sources after values covered by st
1471 for (; offR
< endR
; offR
+= ri
->getSrc(s
)->reg
.size
, ++s
)
1472 vals
[k
++] = ri
->getSrc(s
);
1473 for (s
= 0; s
< k
; ++s
)
1474 st
->setSrc(s
+ 1, vals
[s
]);
1475 st
->setSrc(0, ri
->getSrc(0));
1479 for (j
= 1; offR
< endS
; offR
+= ri
->getSrc(j
++)->reg
.size
);
1480 for (s
= 1; offS
< endS
; offS
+= st
->getSrc(s
++)->reg
.size
);
1481 for (; offR
< endR
; offR
+= ri
->getSrc(j
++)->reg
.size
)
1482 st
->setSrc(s
++, ri
->getSrc(j
));
1484 st
->putExtraSources(0, extra
);
1486 delete_Instruction(prog
, rec
->insn
);
1489 rec
->offset
= st
->getSrc(0)->reg
.data
.offset
;
1491 st
->setType(typeOfSize(rec
->size
));
1497 MemoryOpt::Record::overlaps(const Instruction
*ldst
) const
1502 if (this->fileIndex
!= that
.fileIndex
)
1505 if (this->rel
[0] || that
.rel
[0])
1506 return this->base
== that
.base
;
1508 (this->offset
< that
.offset
+ that
.size
) &&
1509 (this->offset
+ this->size
> that
.offset
);
1512 // We must not eliminate stores that affect the result of @ld if
1513 // we find later stores to the same location, and we may no longer
1514 // merge them with later stores.
1515 // The stored value can, however, still be used to determine the value
1516 // returned by future loads.
1518 MemoryOpt::lockStores(Instruction
*const ld
)
1520 for (Record
*r
= stores
[ld
->src
[0].getFile()]; r
; r
= r
->next
)
1521 if (!r
->locked
&& r
->overlaps(ld
))
1525 // Prior loads from the location of @st are no longer valid.
1526 // Stores to the location of @st may no longer be used to derive
1527 // the value at it nor be coalesced into later stores.
1529 MemoryOpt::purgeRecords(Instruction
*const st
, DataFile f
)
1532 f
= st
->src
[0].getFile();
1534 for (Record
*r
= loads
[f
]; r
; r
= r
->next
)
1535 if (!st
|| r
->overlaps(st
))
1536 r
->unlink(&loads
[f
]);
1538 for (Record
*r
= stores
[f
]; r
; r
= r
->next
)
1539 if (!st
|| r
->overlaps(st
))
1540 r
->unlink(&stores
[f
]);
1544 MemoryOpt::visit(BasicBlock
*bb
)
1546 bool ret
= runOpt(bb
);
1547 // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
1548 // where 96 bit memory operations are forbidden.
1555 MemoryOpt::runOpt(BasicBlock
*bb
)
1557 Instruction
*ldst
, *next
;
1559 bool isAdjacent
= true;
1561 for (ldst
= bb
->getEntry(); ldst
; ldst
= next
) {
1566 if (ldst
->op
== OP_LOAD
|| ldst
->op
== OP_VFETCH
) {
1567 if (ldst
->isDead()) {
1568 // might have been produced by earlier optimization
1569 delete_Instruction(prog
, ldst
);
1573 if (ldst
->op
== OP_STORE
|| ldst
->op
== OP_EXPORT
) {
1576 // TODO: maybe have all fixed ops act as barrier ?
1577 if (ldst
->op
== OP_CALL
) {
1578 purgeRecords(NULL
, FILE_MEMORY_LOCAL
);
1579 purgeRecords(NULL
, FILE_MEMORY_GLOBAL
);
1580 purgeRecords(NULL
, FILE_MEMORY_SHARED
);
1581 purgeRecords(NULL
, FILE_SHADER_OUTPUT
);
1583 if (ldst
->op
== OP_EMIT
|| ldst
->op
== OP_RESTART
) {
1584 purgeRecords(NULL
, FILE_SHADER_OUTPUT
);
1588 if (ldst
->getPredicate()) // TODO: handle predicated ld/st
1592 DataFile file
= ldst
->src
[0].getFile();
1594 // if ld l[]/g[] look for previous store to eliminate the reload
1595 if (file
== FILE_MEMORY_GLOBAL
|| file
== FILE_MEMORY_LOCAL
) {
1596 // TODO: shared memory ?
1597 rec
= findRecord(ldst
, false, isAdjacent
);
1598 if (rec
&& !isAdjacent
)
1599 keep
= !replaceLdFromSt(ldst
, rec
);
1602 // or look for ld from the same location and replace this one
1603 rec
= keep
? findRecord(ldst
, true, isAdjacent
) : NULL
;
1606 keep
= !replaceLdFromLd(ldst
, rec
);
1608 // or combine a previous load with this one
1609 keep
= !combineLd(rec
, ldst
);
1614 rec
= findRecord(ldst
, false, isAdjacent
);
1617 keep
= !replaceStFromSt(ldst
, rec
);
1619 keep
= !combineSt(rec
, ldst
);
1622 purgeRecords(ldst
, DATA_FILE_COUNT
);
1632 // =============================================================================
1634 // Turn control flow into predicated instructions (after register allocation !).
1636 // Could move this to before register allocation on NVC0 and also handle nested
1638 class FlatteningPass
: public Pass
1641 virtual bool visit(BasicBlock
*);
1643 bool tryPredicateConditional(BasicBlock
*);
1644 void predicateInstructions(BasicBlock
*, Value
*pred
, CondCode cc
);
1645 void tryPropagateBranch(BasicBlock
*);
1646 inline bool isConstantCondition(Value
*pred
);
1647 inline bool mayPredicate(const Instruction
*, const Value
*pred
) const;
1648 inline void removeFlow(Instruction
*);
1652 FlatteningPass::isConstantCondition(Value
*pred
)
1654 Instruction
*insn
= pred
->getUniqueInsn();
1656 if (insn
->op
!= OP_SET
|| insn
->srcExists(2))
1659 for (int s
= 0; s
< 2 && insn
->srcExists(s
); ++s
) {
1660 Instruction
*ld
= insn
->getSrc(s
)->getUniqueInsn();
1663 if (ld
->op
!= OP_MOV
&& ld
->op
!= OP_LOAD
)
1665 if (ld
->src
[0].isIndirect(0))
1667 file
= ld
->src
[0].getFile();
1669 file
= insn
->src
[s
].getFile();
1670 // catch $r63 on NVC0
1671 if (file
== FILE_GPR
&& insn
->getSrc(s
)->reg
.data
.id
> prog
->maxGPR
)
1672 file
= FILE_IMMEDIATE
;
1674 if (file
!= FILE_IMMEDIATE
&& file
!= FILE_MEMORY_CONST
)
1681 FlatteningPass::removeFlow(Instruction
*insn
)
1683 FlowInstruction
*term
= insn
? insn
->asFlow() : NULL
;
1686 Graph::Edge::Type ty
= term
->bb
->cfg
.outgoing().getType();
1688 if (term
->op
== OP_BRA
) {
1689 // TODO: this might get more difficult when we get arbitrary BRAs
1690 if (ty
== Graph::Edge::CROSS
|| ty
== Graph::Edge::BACK
)
1693 if (term
->op
!= OP_JOIN
)
1696 delete_Instruction(prog
, term
);
1698 Value
*pred
= term
->getPredicate();
1700 if (pred
&& pred
->refCount() == 0) {
1701 Instruction
*pSet
= pred
->getUniqueInsn();
1702 pred
->join
->reg
.data
.id
= -1; // deallocate
1704 delete_Instruction(prog
, pSet
);
1709 FlatteningPass::predicateInstructions(BasicBlock
*bb
, Value
*pred
, CondCode cc
)
1711 for (Instruction
*i
= bb
->getEntry(); i
; i
= i
->next
) {
1714 assert(!i
->getPredicate());
1715 i
->setPredicate(cc
, pred
);
1717 removeFlow(bb
->getExit());
1721 FlatteningPass::mayPredicate(const Instruction
*insn
, const Value
*pred
) const
1723 if (insn
->isPseudo())
1725 // TODO: calls where we don't know which registers are modified
1727 if (!prog
->getTarget()->mayPredicate(insn
, pred
))
1729 for (int d
= 0; insn
->defExists(d
); ++d
)
1730 if (insn
->getDef(d
)->equals(pred
))
1735 // If we conditionally skip over or to a branch instruction, replace it.
1736 // NOTE: We do not update the CFG anymore here !
1738 FlatteningPass::tryPropagateBranch(BasicBlock
*bb
)
1740 BasicBlock
*bf
= NULL
;
1743 if (bb
->cfg
.outgoingCount() != 2)
1745 if (!bb
->getExit() || bb
->getExit()->op
!= OP_BRA
)
1747 Graph::EdgeIterator ei
= bb
->cfg
.outgoing();
1749 for (i
= 0; !ei
.end(); ++i
, ei
.next()) {
1750 bf
= BasicBlock::get(ei
.getNode());
1751 if (bf
->getInsnCount() == 1)
1754 if (ei
.end() || !bf
->getExit())
1756 FlowInstruction
*bra
= bb
->getExit()->asFlow();
1757 FlowInstruction
*rep
= bf
->getExit()->asFlow();
1759 if (rep
->getPredicate())
1761 if (rep
->op
!= OP_BRA
&&
1762 rep
->op
!= OP_JOIN
&&
1767 bra
->target
.bb
= rep
->target
.bb
;
1768 if (i
) // 2nd out block means branch not taken
1769 bra
->cc
= inverseCondCode(bra
->cc
);
1774 FlatteningPass::visit(BasicBlock
*bb
)
1776 if (tryPredicateConditional(bb
))
1779 // try to attach join to previous instruction
1780 Instruction
*insn
= bb
->getExit();
1781 if (insn
&& insn
->op
== OP_JOIN
&& !insn
->getPredicate()) {
1783 if (insn
&& !insn
->getPredicate() && !insn
->asFlow() && !insn
->isNop()) {
1785 bb
->remove(bb
->getExit());
1790 tryPropagateBranch(bb
);
1796 FlatteningPass::tryPredicateConditional(BasicBlock
*bb
)
1798 BasicBlock
*bL
= NULL
, *bR
= NULL
;
1799 unsigned int nL
= 0, nR
= 0, limit
= 12;
1803 mask
= bb
->initiatesSimpleConditional();
1807 assert(bb
->getExit());
1808 Value
*pred
= bb
->getExit()->getPredicate();
1811 if (isConstantCondition(pred
))
1814 Graph::EdgeIterator ei
= bb
->cfg
.outgoing();
1817 bL
= BasicBlock::get(ei
.getNode());
1818 for (insn
= bL
->getEntry(); insn
; insn
= insn
->next
, ++nL
)
1819 if (!mayPredicate(insn
, pred
))
1822 return false; // too long, do a real branch
1827 bR
= BasicBlock::get(ei
.getNode());
1828 for (insn
= bR
->getEntry(); insn
; insn
= insn
->next
, ++nR
)
1829 if (!mayPredicate(insn
, pred
))
1832 return false; // too long, do a real branch
1836 predicateInstructions(bL
, pred
, bb
->getExit()->cc
);
1838 predicateInstructions(bR
, pred
, inverseCondCode(bb
->getExit()->cc
));
1841 bb
->remove(bb
->joinAt
);
1844 removeFlow(bb
->getExit()); // delete the branch/join at the fork point
1846 // remove potential join operations at the end of the conditional
1847 if (prog
->getTarget()->joinAnterior
) {
1848 bb
= BasicBlock::get((bL
? bL
: bR
)->cfg
.outgoing().getNode());
1849 if (bb
->getEntry() && bb
->getEntry()->op
== OP_JOIN
)
1850 removeFlow(bb
->getEntry());
1856 // =============================================================================
1858 // Common subexpression elimination. Stupid O^2 implementation.
1859 class LocalCSE
: public Pass
1862 virtual bool visit(BasicBlock
*);
1864 inline bool tryReplace(Instruction
**, Instruction
*);
1866 DLList ops
[OP_LAST
+ 1];
1869 class GlobalCSE
: public Pass
1872 virtual bool visit(BasicBlock
*);
1876 Instruction::isActionEqual(const Instruction
*that
) const
1878 if (this->op
!= that
->op
||
1879 this->dType
!= that
->dType
||
1880 this->sType
!= that
->sType
)
1882 if (this->cc
!= that
->cc
)
1885 if (this->asTex()) {
1886 if (memcmp(&this->asTex()->tex
,
1887 &that
->asTex()->tex
,
1888 sizeof(this->asTex()->tex
)))
1891 if (this->asCmp()) {
1892 if (this->asCmp()->setCond
!= that
->asCmp()->setCond
)
1895 if (this->asFlow()) {
1898 if (this->atomic
!= that
->atomic
||
1899 this->ipa
!= that
->ipa
||
1900 this->lanes
!= that
->lanes
||
1901 this->perPatch
!= that
->perPatch
)
1903 if (this->postFactor
!= that
->postFactor
)
1907 if (this->subOp
!= that
->subOp
||
1908 this->saturate
!= that
->saturate
||
1909 this->rnd
!= that
->rnd
||
1910 this->ftz
!= that
->ftz
||
1911 this->dnz
!= that
->dnz
||
1912 this->cache
!= that
->cache
)
1919 Instruction::isResultEqual(const Instruction
*that
) const
1923 // NOTE: location of discard only affects tex with liveOnly and quadops
1924 if (!this->defExists(0) && this->op
!= OP_DISCARD
)
1927 if (!isActionEqual(that
))
1930 if (this->predSrc
!= that
->predSrc
)
1933 for (d
= 0; this->defExists(d
); ++d
) {
1934 if (!that
->defExists(d
) ||
1935 !this->getDef(d
)->equals(that
->getDef(d
), false))
1938 if (that
->defExists(d
))
1941 for (s
= 0; this->srcExists(s
); ++s
) {
1942 if (!that
->srcExists(s
))
1944 if (this->src
[s
].mod
!= that
->src
[s
].mod
)
1946 if (!this->getSrc(s
)->equals(that
->getSrc(s
), true))
1949 if (that
->srcExists(s
))
1952 if (op
== OP_LOAD
|| op
== OP_VFETCH
) {
1953 switch (src
[0].getFile()) {
1954 case FILE_MEMORY_CONST
:
1955 case FILE_SHADER_INPUT
:
1965 // pull through common expressions from different in-blocks
1967 GlobalCSE::visit(BasicBlock
*bb
)
1969 Instruction
*phi
, *next
, *ik
;
1972 for (phi
= bb
->getPhi(); phi
&& phi
->op
== OP_PHI
; phi
= next
) {
1974 if (phi
->getSrc(0)->refCount() > 1)
1976 ik
= phi
->getSrc(0)->getInsn();
1977 for (s
= 1; phi
->srcExists(s
); ++s
) {
1978 if (phi
->getSrc(s
)->refCount() > 1)
1980 if (!phi
->getSrc(s
)->getInsn()->isResultEqual(ik
))
1983 if (!phi
->srcExists(s
)) {
1984 Instruction
*entry
= bb
->getEntry();
1986 if (!entry
|| entry
->op
!= OP_JOIN
)
1989 bb
->insertAfter(entry
, ik
);
1990 ik
->setDef(0, phi
->getDef(0));
1991 delete_Instruction(prog
, phi
);
1999 LocalCSE::tryReplace(Instruction
**ptr
, Instruction
*i
)
2001 Instruction
*old
= *ptr
;
2002 if (!old
->isResultEqual(i
))
2004 for (int d
= 0; old
->defExists(d
); ++d
)
2005 old
->def
[d
].replace(i
->getDef(d
), false);
2006 delete_Instruction(prog
, old
);
2012 LocalCSE::visit(BasicBlock
*bb
)
2014 unsigned int replaced
;
2017 Instruction
*ir
, *next
;
2021 // will need to know the order of instructions
2023 for (ir
= bb
->getEntry(); ir
; ir
= ir
->next
)
2024 ir
->serial
= serial
++;
2026 for (ir
= bb
->getEntry(); ir
; ir
= next
) {
2033 ops
[ir
->op
].insert(ir
);
2037 for (s
= 0; ir
->srcExists(s
); ++s
)
2038 if (ir
->getSrc(s
)->asLValue())
2039 if (!src
|| ir
->getSrc(s
)->refCount() < src
->refCount())
2040 src
= ir
->getSrc(s
);
2043 for (ValueRef::Iterator refs
= src
->uses
->iterator(); !refs
.end();
2045 Instruction
*ik
= refs
.get()->getInsn();
2046 if (ik
->serial
< ir
->serial
&& ik
->bb
== ir
->bb
)
2047 if (tryReplace(&ir
, ik
))
2051 DLLIST_FOR_EACH(&ops
[ir
->op
], iter
)
2053 Instruction
*ik
= reinterpret_cast<Instruction
*>(iter
.get());
2054 if (tryReplace(&ir
, ik
))
2060 ops
[ir
->op
].insert(ir
);
2064 for (unsigned int i
= 0; i
<= OP_LAST
; ++i
)
2072 // =============================================================================
2074 // Remove computations of unused values.
2075 class DeadCodeElim
: public Pass
2078 bool buryAll(Program
*);
2081 virtual bool visit(BasicBlock
*);
2083 void checkSplitLoad(Instruction
*ld
); // for partially dead loads
2085 unsigned int deadCount
;
2089 DeadCodeElim::buryAll(Program
*prog
)
2093 if (!this->run(prog
, false, false))
2095 } while (deadCount
);
2101 DeadCodeElim::visit(BasicBlock
*bb
)
2105 for (Instruction
*i
= bb
->getFirst(); i
; i
= next
) {
2109 delete_Instruction(prog
, i
);
2111 if (i
->defExists(1) && (i
->op
== OP_VFETCH
|| i
->op
== OP_LOAD
)) {
2119 DeadCodeElim::checkSplitLoad(Instruction
*ld1
)
2121 Instruction
*ld2
= NULL
; // can get at most 2 loads
2124 int32_t addr1
, addr2
;
2125 int32_t size1
, size2
;
2127 uint32_t mask
= 0xffffffff;
2129 for (d
= 0; ld1
->defExists(d
); ++d
)
2130 if (!ld1
->getDef(d
)->refCount() && ld1
->getDef(d
)->reg
.data
.id
< 0)
2132 if (mask
== 0xffffffff)
2135 addr1
= ld1
->getSrc(0)->reg
.data
.offset
;
2138 for (d
= 0; ld1
->defExists(d
); ++d
) {
2139 if (mask
& (1 << d
)) {
2140 if (size1
&& (addr1
& 0x7))
2142 def1
[n1
] = ld1
->getDef(d
);
2143 size1
+= def1
[n1
++]->reg
.size
;
2146 addr1
+= ld1
->getDef(d
)->reg
.size
;
2151 for (addr2
= addr1
+ size1
; ld1
->defExists(d
); ++d
) {
2152 if (mask
& (1 << d
)) {
2153 def2
[n2
] = ld1
->getDef(d
);
2154 size2
+= def2
[n2
++]->reg
.size
;
2157 addr2
+= ld1
->getDef(d
)->reg
.size
;
2161 updateLdStOffset(ld1
, addr1
, func
);
2162 ld1
->setType(typeOfSize(size1
));
2163 for (d
= 0; d
< 4; ++d
)
2164 ld1
->setDef(d
, (d
< n1
) ? def1
[d
] : NULL
);
2169 ld2
= ld1
->clone(false);
2170 updateLdStOffset(ld2
, addr2
, func
);
2171 ld2
->setType(typeOfSize(size2
));
2172 for (d
= 0; d
< 4; ++d
)
2173 ld2
->setDef(d
, (d
< n2
) ? def2
[d
] : NULL
);
2175 ld1
->bb
->insertAfter(ld1
, ld2
);
2178 // =============================================================================
2180 #define RUN_PASS(l, n, f) \
2181 if (level >= (l)) { \
2182 if (dbgFlags & NV50_IR_DEBUG_VERBOSE) \
2183 INFO("PEEPHOLE: %s\n", #n); \
2185 if (!pass.f(this)) \
2190 Program::optimizeSSA(int level
)
2192 RUN_PASS(1, DeadCodeElim
, buryAll
);
2193 RUN_PASS(1, CopyPropagation
, run
);
2194 RUN_PASS(2, GlobalCSE
, run
);
2195 RUN_PASS(1, LocalCSE
, run
);
2196 RUN_PASS(2, AlgebraicOpt
, run
);
2197 RUN_PASS(2, ModifierFolding
, run
); // before load propagation -> less checks
2198 RUN_PASS(1, ConstantFolding
, foldAll
);
2199 RUN_PASS(1, LoadPropagation
, run
);
2200 RUN_PASS(2, MemoryOpt
, run
);
2201 RUN_PASS(2, LocalCSE
, run
);
2202 RUN_PASS(0, DeadCodeElim
, buryAll
);
2207 Program::optimizePostRA(int level
)
2209 RUN_PASS(2, FlatteningPass
, run
);