2 * Copyright 2011 Christoph Bumiller
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 #include "nv50_ir_target.h"
25 #include "nv50_ir_build_util.h"
28 #include "util/u_math.h"
34 Instruction::isNop() const
36 if (op
== OP_CONSTRAINT
|| op
== OP_PHI
)
38 if (terminator
|| join
) // XXX: should terminator imply flow ?
40 if (!fixed
&& op
== OP_NOP
)
43 if (defExists(0) && def(0).rep()->reg
.data
.id
< 0) {
44 for (int d
= 1; defExists(d
); ++d
)
45 if (def(d
).rep()->reg
.data
.id
>= 0)
46 WARN("part of vector result is unused !\n");
50 if (op
== OP_MOV
|| op
== OP_UNION
) {
51 if (!def(0).rep()->equals(getSrc(0)))
54 if (!def(0).rep()->equals(getSrc(1)))
62 bool Instruction::isDead() const
68 for (int d
= 0; defExists(d
); ++d
)
69 if (getDef(d
)->refCount() || getDef(d
)->reg
.data
.id
>= 0)
72 if (terminator
|| asFlow())
80 // =============================================================================
82 class CopyPropagation
: public Pass
85 virtual bool visit(BasicBlock
*);
88 // Propagate all MOVs forward to make subsequent optimization easier, except if
89 // the sources stem from a phi, in which case we don't want to mess up potential
90 // swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
92 CopyPropagation::visit(BasicBlock
*bb
)
94 Instruction
*mov
, *si
, *next
;
96 for (mov
= bb
->getEntry(); mov
; mov
= next
) {
98 if (mov
->op
!= OP_MOV
|| mov
->fixed
|| !mov
->getSrc(0)->asLValue())
100 si
= mov
->getSrc(0)->getInsn();
101 if (mov
->getDef(0)->reg
.data
.id
< 0 && si
&& si
->op
!= OP_PHI
) {
103 mov
->def(0).replace(mov
->getSrc(0), false);
104 delete_Instruction(prog
, mov
);
110 // =============================================================================
112 class LoadPropagation
: public Pass
115 virtual bool visit(BasicBlock
*);
117 void checkSwapSrc01(Instruction
*);
119 bool isCSpaceLoad(Instruction
*);
120 bool isImmd32Load(Instruction
*);
124 LoadPropagation::isCSpaceLoad(Instruction
*ld
)
126 return ld
&& ld
->op
== OP_LOAD
&& ld
->src(0).getFile() == FILE_MEMORY_CONST
;
130 LoadPropagation::isImmd32Load(Instruction
*ld
)
132 if (!ld
|| (ld
->op
!= OP_MOV
) || (typeSizeof(ld
->dType
) != 4))
134 return ld
->src(0).getFile() == FILE_IMMEDIATE
;
138 LoadPropagation::checkSwapSrc01(Instruction
*insn
)
140 if (!prog
->getTarget()->getOpInfo(insn
).commutative
)
141 if (insn
->op
!= OP_SET
&& insn
->op
!= OP_SLCT
)
143 if (insn
->src(1).getFile() != FILE_GPR
)
146 Instruction
*i0
= insn
->getSrc(0)->getInsn();
147 Instruction
*i1
= insn
->getSrc(1)->getInsn();
149 if (isCSpaceLoad(i0
)) {
150 if (!isCSpaceLoad(i1
))
151 insn
->swapSources(0, 1);
155 if (isImmd32Load(i0
)) {
156 if (!isCSpaceLoad(i1
) && !isImmd32Load(i1
))
157 insn
->swapSources(0, 1);
164 if (insn
->op
== OP_SET
)
165 insn
->asCmp()->setCond
= reverseCondCode(insn
->asCmp()->setCond
);
167 if (insn
->op
== OP_SLCT
)
168 insn
->asCmp()->setCond
= inverseCondCode(insn
->asCmp()->setCond
);
172 LoadPropagation::visit(BasicBlock
*bb
)
174 const Target
*targ
= prog
->getTarget();
177 for (Instruction
*i
= bb
->getEntry(); i
; i
= next
) {
183 for (int s
= 0; i
->srcExists(s
); ++s
) {
184 Instruction
*ld
= i
->getSrc(s
)->getInsn();
186 if (!ld
|| ld
->fixed
|| (ld
->op
!= OP_LOAD
&& ld
->op
!= OP_MOV
))
188 if (!targ
->insnCanLoad(i
, s
, ld
))
192 i
->setSrc(s
, ld
->getSrc(0));
193 if (ld
->src(0).isIndirect(0))
194 i
->setIndirect(s
, 0, ld
->getIndirect(0, 0));
196 if (ld
->getDef(0)->refCount() == 0)
197 delete_Instruction(prog
, ld
);
203 // =============================================================================
205 // Evaluate constant expressions.
206 class ConstantFolding
: public Pass
209 bool foldAll(Program
*);
212 virtual bool visit(BasicBlock
*);
214 void expr(Instruction
*, ImmediateValue
*, ImmediateValue
*);
215 void opnd(Instruction
*, ImmediateValue
*, int s
);
217 void unary(Instruction
*, const ImmediateValue
&);
219 void tryCollapseChainedMULs(Instruction
*, const int s
, ImmediateValue
&);
221 // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET
222 CmpInstruction
*findOriginForTestWithZero(Value
*);
224 unsigned int foldCount
;
229 // TODO: remember generated immediates and only revisit these
231 ConstantFolding::foldAll(Program
*prog
)
233 unsigned int iterCount
= 0;
238 } while (foldCount
&& ++iterCount
< 2);
243 ConstantFolding::visit(BasicBlock
*bb
)
245 Instruction
*i
, *next
;
247 for (i
= bb
->getEntry(); i
; i
= next
) {
249 if (i
->op
== OP_MOV
) // continue early, MOV appears frequently
252 ImmediateValue
*src0
= i
->srcExists(0) ? i
->src(0).getImmediate() : NULL
;
253 ImmediateValue
*src1
= i
->srcExists(1) ? i
->src(1).getImmediate() : NULL
;
268 ConstantFolding::findOriginForTestWithZero(Value
*value
)
272 Instruction
*insn
= value
->getInsn();
274 while (insn
&& insn
->op
!= OP_SET
) {
275 Instruction
*next
= NULL
;
280 next
= insn
->getSrc(0)->getInsn();
281 if (insn
->sType
!= next
->dType
)
285 next
= insn
->getSrc(0)->getInsn();
292 return insn
? insn
->asCmp() : NULL
;
296 Modifier::applyTo(ImmediateValue
& imm
) const
298 switch (imm
.reg
.type
) {
300 if (bits
& NV50_IR_MOD_ABS
)
301 imm
.reg
.data
.f32
= fabsf(imm
.reg
.data
.f32
);
302 if (bits
& NV50_IR_MOD_NEG
)
303 imm
.reg
.data
.f32
= -imm
.reg
.data
.f32
;
304 if (bits
& NV50_IR_MOD_SAT
) {
305 if (imm
.reg
.data
.f32
< 0.0f
)
306 imm
.reg
.data
.f32
= 0.0f
;
308 if (imm
.reg
.data
.f32
> 1.0f
)
309 imm
.reg
.data
.f32
= 1.0f
;
311 assert(!(bits
& NV50_IR_MOD_NOT
));
314 case TYPE_S8
: // NOTE: will be extended
317 case TYPE_U8
: // NOTE: treated as signed
320 if (bits
& NV50_IR_MOD_ABS
)
321 imm
.reg
.data
.s32
= (imm
.reg
.data
.s32
>= 0) ?
322 imm
.reg
.data
.s32
: -imm
.reg
.data
.s32
;
323 if (bits
& NV50_IR_MOD_NEG
)
324 imm
.reg
.data
.s32
= -imm
.reg
.data
.s32
;
325 if (bits
& NV50_IR_MOD_NOT
)
326 imm
.reg
.data
.s32
= ~imm
.reg
.data
.s32
;
330 if (bits
& NV50_IR_MOD_ABS
)
331 imm
.reg
.data
.f64
= fabs(imm
.reg
.data
.f64
);
332 if (bits
& NV50_IR_MOD_NEG
)
333 imm
.reg
.data
.f64
= -imm
.reg
.data
.f64
;
334 if (bits
& NV50_IR_MOD_SAT
) {
335 if (imm
.reg
.data
.f64
< 0.0)
336 imm
.reg
.data
.f64
= 0.0;
338 if (imm
.reg
.data
.f64
> 1.0)
339 imm
.reg
.data
.f64
= 1.0;
341 assert(!(bits
& NV50_IR_MOD_NOT
));
345 assert(!"invalid/unhandled type");
346 imm
.reg
.data
.u64
= 0;
352 Modifier::getOp() const
355 case NV50_IR_MOD_ABS
: return OP_ABS
;
356 case NV50_IR_MOD_NEG
: return OP_NEG
;
357 case NV50_IR_MOD_SAT
: return OP_SAT
;
358 case NV50_IR_MOD_NOT
: return OP_NOT
;
367 ConstantFolding::expr(Instruction
*i
,
368 ImmediateValue
*src0
, ImmediateValue
*src1
)
370 ImmediateValue
imm0(src0
, i
->sType
);
371 ImmediateValue
imm1(src1
, i
->sType
);
373 struct Storage
*const a
= &imm0
.reg
, *const b
= &imm1
.reg
;
375 i
->src(0).mod
.applyTo(imm0
);
376 i
->src(1).mod
.applyTo(imm1
);
382 if (i
->dnz
&& i
->dType
== TYPE_F32
) {
383 if (!isfinite(a
->data
.f32
))
385 if (!isfinite(b
->data
.f32
))
389 case TYPE_F32
: res
.data
.f32
= a
->data
.f32
* b
->data
.f32
; break;
390 case TYPE_F64
: res
.data
.f64
= a
->data
.f64
* b
->data
.f64
; break;
392 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
* b
->data
.u32
; break;
398 if (b
->data
.u32
== 0)
401 case TYPE_F32
: res
.data
.f32
= a
->data
.f32
/ b
->data
.f32
; break;
402 case TYPE_F64
: res
.data
.f64
= a
->data
.f64
/ b
->data
.f64
; break;
403 case TYPE_S32
: res
.data
.s32
= a
->data
.s32
/ b
->data
.s32
; break;
404 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
/ b
->data
.u32
; break;
411 case TYPE_F32
: res
.data
.f32
= a
->data
.f32
+ b
->data
.f32
; break;
412 case TYPE_F64
: res
.data
.f64
= a
->data
.f64
+ b
->data
.f64
; break;
414 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
+ b
->data
.u32
; break;
421 case TYPE_F32
: res
.data
.f32
= pow(a
->data
.f32
, b
->data
.f32
); break;
422 case TYPE_F64
: res
.data
.f64
= pow(a
->data
.f64
, b
->data
.f64
); break;
429 case TYPE_F32
: res
.data
.f32
= MAX2(a
->data
.f32
, b
->data
.f32
); break;
430 case TYPE_F64
: res
.data
.f64
= MAX2(a
->data
.f64
, b
->data
.f64
); break;
431 case TYPE_S32
: res
.data
.s32
= MAX2(a
->data
.s32
, b
->data
.s32
); break;
432 case TYPE_U32
: res
.data
.u32
= MAX2(a
->data
.u32
, b
->data
.u32
); break;
439 case TYPE_F32
: res
.data
.f32
= MIN2(a
->data
.f32
, b
->data
.f32
); break;
440 case TYPE_F64
: res
.data
.f64
= MIN2(a
->data
.f64
, b
->data
.f64
); break;
441 case TYPE_S32
: res
.data
.s32
= MIN2(a
->data
.s32
, b
->data
.s32
); break;
442 case TYPE_U32
: res
.data
.u32
= MIN2(a
->data
.u32
, b
->data
.u32
); break;
448 res
.data
.u64
= a
->data
.u64
& b
->data
.u64
;
451 res
.data
.u64
= a
->data
.u64
| b
->data
.u64
;
454 res
.data
.u64
= a
->data
.u64
^ b
->data
.u64
;
457 res
.data
.u32
= a
->data
.u32
<< b
->data
.u32
;
461 case TYPE_S32
: res
.data
.s32
= a
->data
.s32
>> b
->data
.u32
; break;
462 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
>> b
->data
.u32
; break;
468 if (a
->data
.u32
!= b
->data
.u32
)
470 res
.data
.u32
= a
->data
.u32
;
477 i
->src(0).mod
= Modifier(0);
478 i
->src(1).mod
= Modifier(0);
480 i
->setSrc(0, new_ImmediateValue(i
->bb
->getProgram(), res
.data
.u32
));
483 i
->getSrc(0)->reg
.data
= res
.data
;
485 if (i
->op
== OP_MAD
|| i
->op
== OP_FMA
) {
488 i
->setSrc(1, i
->getSrc(0));
489 i
->src(1).mod
= i
->src(2).mod
;
490 i
->setSrc(0, i
->getSrc(2));
493 src0
= i
->src(0).getImmediate();
495 expr(i
, src0
, i
->getSrc(1)->asImm());
502 ConstantFolding::unary(Instruction
*i
, const ImmediateValue
&imm
)
506 if (i
->dType
!= TYPE_F32
)
509 case OP_NEG
: res
.data
.f32
= -imm
.reg
.data
.f32
; break;
510 case OP_ABS
: res
.data
.f32
= fabsf(imm
.reg
.data
.f32
); break;
511 case OP_RCP
: res
.data
.f32
= 1.0f
/ imm
.reg
.data
.f32
; break;
512 case OP_RSQ
: res
.data
.f32
= 1.0f
/ sqrtf(imm
.reg
.data
.f32
); break;
513 case OP_LG2
: res
.data
.f32
= log2f(imm
.reg
.data
.f32
); break;
514 case OP_EX2
: res
.data
.f32
= exp2f(imm
.reg
.data
.f32
); break;
515 case OP_SIN
: res
.data
.f32
= sinf(imm
.reg
.data
.f32
); break;
516 case OP_COS
: res
.data
.f32
= cosf(imm
.reg
.data
.f32
); break;
517 case OP_SQRT
: res
.data
.f32
= sqrtf(imm
.reg
.data
.f32
); break;
520 // these should be handled in subsequent OP_SIN/COS/EX2
521 res
.data
.f32
= imm
.reg
.data
.f32
;
527 i
->setSrc(0, new_ImmediateValue(i
->bb
->getProgram(), res
.data
.f32
));
528 i
->src(0).mod
= Modifier(0);
532 ConstantFolding::tryCollapseChainedMULs(Instruction
*mul2
,
533 const int s
, ImmediateValue
& imm2
)
535 const int t
= s
? 0 : 1;
537 Instruction
*mul1
= NULL
; // mul1 before mul2
539 float f
= imm2
.reg
.data
.f32
;
541 assert(mul2
->op
== OP_MUL
&& mul2
->dType
== TYPE_F32
);
543 if (mul2
->getSrc(t
)->refCount() == 1) {
544 insn
= mul2
->getSrc(t
)->getInsn();
545 if (insn
->op
== OP_MUL
&& insn
->dType
== TYPE_F32
)
549 ImmediateValue
*imm
= mul1
->src(s1
).getImmediate();
552 imm
= mul1
->src(s1
).getImmediate();
555 bld
.setPosition(mul1
, false);
557 // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
558 ImmediateValue
imm1(mul1
->src(s1
).getImmediate(), TYPE_F32
);
559 mul1
->src(s1
).mod
.applyTo(imm1
);
560 mul1
->src(s1
).mod
= Modifier(0);
561 mul1
->setSrc(s1
, bld
.loadImm(NULL
, f
* imm1
.reg
.data
.f32
));
562 mul2
->def(0).replace(mul1
->getDef(0), false);
564 if (prog
->getTarget()->isPostMultiplySupported(OP_MUL
, f
, e
)) {
566 // d = mul c, imm -> d = mul_x_imm a, b
567 mul1
->postFactor
= e
;
568 mul2
->def(0).replace(mul1
->getDef(0), false);
570 mul1
->src(0).mod
= mul1
->src(0).mod
^ Modifier(NV50_IR_MOD_NEG
);
575 if (mul2
->getDef(0)->refCount() == 1) {
577 // d = mul b, c -> d = mul_x_imm a, c
579 insn
= mul2
->getDef(0)->uses
.front()->getInsn();
584 s2
= insn
->getSrc(0) == mul1
->getDef(0) ? 0 : 1;
586 if (insn
->op
== OP_MUL
&& insn
->dType
== TYPE_F32
)
587 if (!insn
->src(t2
).getImmediate())
589 if (mul2
&& prog
->getTarget()->isPostMultiplySupported(OP_MUL
, f
, e
)) {
590 mul2
->postFactor
= e
;
591 mul2
->setSrc(s2
, mul1
->src(t
));
593 mul2
->src(s2
).mod
= mul2
->src(s2
).mod
^ Modifier(NV50_IR_MOD_NEG
);
599 ConstantFolding::opnd(Instruction
*i
, ImmediateValue
*src
, int s
)
602 const operation op
= i
->op
;
604 ImmediateValue
imm(src
, i
->sType
);
606 i
->src(s
).mod
.applyTo(imm
);
610 if (i
->dType
== TYPE_F32
)
611 tryCollapseChainedMULs(i
, s
, imm
);
613 if (imm
.isInteger(0)) {
615 i
->setSrc(0, i
->getSrc(s
));
618 if (imm
.isInteger(1) || imm
.isInteger(-1)) {
619 if (imm
.isNegative())
620 i
->src(t
).mod
= i
->src(t
).mod
^ Modifier(NV50_IR_MOD_NEG
);
621 i
->op
= i
->src(t
).mod
.getOp();
623 i
->setSrc(0, i
->getSrc(1));
624 i
->src(0).mod
= i
->src(1).mod
;
631 if (imm
.isInteger(2) || imm
.isInteger(-2)) {
632 if (imm
.isNegative())
633 i
->src(t
).mod
= i
->src(t
).mod
^ Modifier(NV50_IR_MOD_NEG
);
635 i
->setSrc(s
, i
->getSrc(t
));
636 i
->src(s
).mod
= i
->src(t
).mod
;
638 if (!isFloatType(i
->sType
) && !imm
.isNegative() && imm
.isPow2()) {
641 i
->setSrc(1, new_ImmediateValue(prog
, imm
.reg
.data
.u32
));
645 if (imm
.isInteger(0)) {
647 i
->setSrc(0, i
->getSrc(1));
648 i
->src(0).mod
= i
->src(1).mod
;
651 i
->op
= i
->src(0).mod
.getOp();
653 i
->src(0).mod
= Modifier(0);
658 if (s
!= 1 || (i
->dType
!= TYPE_S32
&& i
->dType
!= TYPE_U32
))
660 bld
.setPosition(i
, false);
661 if (imm
.reg
.data
.u32
== 0) {
664 if (imm
.reg
.data
.u32
== 1) {
668 if (i
->dType
== TYPE_U32
&& imm
.isPow2()) {
670 i
->setSrc(1, bld
.mkImm(util_logbase2(imm
.reg
.data
.u32
)));
672 if (i
->dType
== TYPE_U32
) {
675 const uint32_t d
= imm
.reg
.data
.u32
;
678 uint32_t l
= util_logbase2(d
);
679 if (((uint32_t)1 << l
) < d
)
681 m
= (((uint64_t)1 << 32) * (((uint64_t)1 << l
) - d
)) / d
+ 1;
687 mul
= bld
.mkOp2(OP_MUL
, TYPE_U32
, tA
, i
->getSrc(0),
688 bld
.loadImm(NULL
, m
));
689 mul
->subOp
= NV50_IR_SUBOP_MUL_HIGH
;
690 bld
.mkOp2(OP_SUB
, TYPE_U32
, tB
, i
->getSrc(0), tA
);
693 bld
.mkOp2(OP_SHR
, TYPE_U32
, tA
, tB
, bld
.mkImm(r
));
696 tB
= s
? bld
.getSSA() : i
->getDef(0);
697 bld
.mkOp2(OP_ADD
, TYPE_U32
, tB
, mul
->getDef(0), tA
);
699 bld
.mkOp2(OP_SHR
, TYPE_U32
, i
->getDef(0), tB
, bld
.mkImm(s
));
701 delete_Instruction(prog
, i
);
703 if (imm
.reg
.data
.s32
== -1) {
709 const int32_t d
= imm
.reg
.data
.s32
;
711 int32_t l
= util_logbase2(static_cast<unsigned>(abs(d
)));
712 if ((1 << l
) < abs(d
))
716 m
= ((uint64_t)1 << (32 + l
- 1)) / abs(d
) + 1 - ((uint64_t)1 << 32);
720 bld
.mkOp3(OP_MAD
, TYPE_S32
, tA
, i
->getSrc(0), bld
.loadImm(NULL
, m
),
721 i
->getSrc(0))->subOp
= NV50_IR_SUBOP_MUL_HIGH
;
723 bld
.mkOp2(OP_SHR
, TYPE_S32
, tB
, tA
, bld
.mkImm(l
- 1));
727 bld
.mkCmp(OP_SET
, CC_LT
, TYPE_S32
, tA
, i
->getSrc(0), bld
.mkImm(0));
728 tD
= (d
< 0) ? bld
.getSSA() : i
->getDef(0)->asLValue();
729 bld
.mkOp2(OP_SUB
, TYPE_U32
, tD
, tB
, tA
);
731 bld
.mkOp1(OP_NEG
, TYPE_S32
, i
->getDef(0), tB
);
733 delete_Instruction(prog
, i
);
738 if (i
->sType
== TYPE_U32
&& imm
.isPow2()) {
739 bld
.setPosition(i
, false);
741 i
->setSrc(1, bld
.loadImm(NULL
, imm
.reg
.data
.u32
- 1));
745 case OP_SET
: // TODO: SET_AND,OR,XOR
747 CmpInstruction
*si
= findOriginForTestWithZero(i
->getSrc(t
));
749 if (i
->src(t
).mod
!= Modifier(0))
751 if (imm
.reg
.data
.u32
!= 0 || !si
|| si
->op
!= OP_SET
)
754 ccZ
= (CondCode
)((unsigned int)i
->asCmp()->setCond
& ~CC_U
);
756 ccZ
= reverseCondCode(ccZ
);
758 case CC_LT
: cc
= CC_FL
; break;
759 case CC_GE
: cc
= CC_TR
; break;
760 case CC_EQ
: cc
= inverseCondCode(cc
); break;
761 case CC_LE
: cc
= inverseCondCode(cc
); break;
767 i
->asCmp()->setCond
= cc
;
768 i
->setSrc(0, si
->src(0));
769 i
->setSrc(1, si
->src(1));
770 i
->sType
= si
->sType
;
776 if (s
!= 1 || i
->src(0).mod
!= Modifier(0))
778 // try to concatenate shifts
779 Instruction
*si
= i
->getSrc(0)->getInsn();
781 si
->op
!= OP_SHL
|| si
->src(1).mod
!= Modifier(0))
783 ImmediateValue
*siImm
= si
->src(1).getImmediate();
785 bld
.setPosition(i
, false);
786 i
->setSrc(0, si
->getSrc(0));
787 i
->setSrc(1, bld
.loadImm(NULL
,
788 imm
.reg
.data
.u32
+ siImm
->reg
.data
.u32
));
813 // =============================================================================
815 // Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
816 class ModifierFolding
: public Pass
819 virtual bool visit(BasicBlock
*);
823 ModifierFolding::visit(BasicBlock
*bb
)
825 const Target
*target
= prog
->getTarget();
827 Instruction
*i
, *next
, *mi
;
830 for (i
= bb
->getEntry(); i
; i
= next
) {
833 if (0 && i
->op
== OP_SUB
) {
834 // turn "sub" into "add neg" (do we really want this ?)
836 i
->src(0).mod
= i
->src(0).mod
^ Modifier(NV50_IR_MOD_NEG
);
839 for (int s
= 0; s
< 3 && i
->srcExists(s
); ++s
) {
840 mi
= i
->getSrc(s
)->getInsn();
842 mi
->predSrc
>= 0 || mi
->getDef(0)->refCount() > 8)
844 if (i
->sType
== TYPE_U32
&& mi
->dType
== TYPE_S32
) {
845 if ((i
->op
!= OP_ADD
&&
851 if (i
->sType
!= mi
->dType
) {
854 if ((mod
= Modifier(mi
->op
)) == Modifier(0))
856 mod
= mod
* mi
->src(0).mod
;
858 if ((i
->op
== OP_ABS
) || i
->src(s
).mod
.abs()) {
859 // abs neg [abs] = abs
860 mod
= mod
& Modifier(~(NV50_IR_MOD_NEG
| NV50_IR_MOD_ABS
));
862 if ((i
->op
== OP_NEG
) && mod
.neg()) {
864 // neg as both opcode and modifier on same insn is prohibited
865 // neg neg abs = abs, neg neg = identity
866 mod
= mod
& Modifier(~NV50_IR_MOD_NEG
);
868 mod
= mod
& Modifier(~NV50_IR_MOD_ABS
);
869 if (mod
== Modifier(0))
873 if (target
->isModSupported(i
, s
, mod
)) {
874 i
->setSrc(s
, mi
->getSrc(0));
875 i
->src(s
).mod
= i
->src(s
).mod
* mod
;
879 if (i
->op
== OP_SAT
) {
880 mi
= i
->getSrc(0)->getInsn();
882 mi
->getDef(0)->refCount() <= 1 && target
->isSatSupported(mi
)) {
884 mi
->setDef(0, i
->getDef(0));
885 delete_Instruction(prog
, i
);
893 // =============================================================================
895 // MUL + ADD -> MAD/FMA
896 // MIN/MAX(a, a) -> a, etc.
897 // SLCT(a, b, const) -> cc(const) ? a : b
899 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
900 class AlgebraicOpt
: public Pass
903 virtual bool visit(BasicBlock
*);
905 void handleADD(Instruction
*);
906 void handleMINMAX(Instruction
*);
907 void handleRCP(Instruction
*);
908 void handleSLCT(Instruction
*);
909 void handleLOGOP(Instruction
*);
910 void handleCVT(Instruction
*);
914 AlgebraicOpt::handleADD(Instruction
*add
)
916 Value
*src0
= add
->getSrc(0);
917 Value
*src1
= add
->getSrc(1);
922 if (!prog
->getTarget()->isOpSupported(OP_MAD
, add
->dType
))
925 if (src0
->reg
.file
!= FILE_GPR
|| src1
->reg
.file
!= FILE_GPR
)
928 if (src0
->refCount() == 1 &&
929 src0
->getUniqueInsn() && src0
->getUniqueInsn()->op
== OP_MUL
)
932 if (src1
->refCount() == 1 &&
933 src1
->getUniqueInsn() && src1
->getUniqueInsn()->op
== OP_MUL
)
938 if ((src0
->getUniqueInsn() && src0
->getUniqueInsn()->bb
!= add
->bb
) ||
939 (src1
->getUniqueInsn() && src1
->getUniqueInsn()->bb
!= add
->bb
))
942 src
= add
->getSrc(s
);
944 if (src
->getInsn()->postFactor
)
947 mod
[0] = add
->src(0).mod
;
948 mod
[1] = add
->src(1).mod
;
949 mod
[2] = src
->getUniqueInsn()->src(0).mod
;
950 mod
[3] = src
->getUniqueInsn()->src(1).mod
;
952 if (((mod
[0] | mod
[1]) | (mod
[2] | mod
[3])) & Modifier(~NV50_IR_MOD_NEG
))
956 add
->subOp
= src
->getInsn()->subOp
; // potentially mul-high
958 add
->setSrc(2, add
->src(s
? 0 : 1));
960 add
->setSrc(0, src
->getInsn()->getSrc(0));
961 add
->src(0).mod
= mod
[2] ^ mod
[s
];
962 add
->setSrc(1, src
->getInsn()->getSrc(1));
963 add
->src(1).mod
= mod
[3];
967 AlgebraicOpt::handleMINMAX(Instruction
*minmax
)
969 Value
*src0
= minmax
->getSrc(0);
970 Value
*src1
= minmax
->getSrc(1);
972 if (src0
!= src1
|| src0
->reg
.file
!= FILE_GPR
)
974 if (minmax
->src(0).mod
== minmax
->src(1).mod
) {
975 if (minmax
->src(0).mod
) {
977 minmax
->setSrc(1, NULL
);
979 minmax
->def(0).replace(minmax
->getSrc(0), false);
980 minmax
->bb
->remove(minmax
);
984 // min(x, -x) = -abs(x)
985 // min(x, -abs(x)) = -abs(x)
986 // min(x, abs(x)) = x
987 // max(x, -abs(x)) = x
988 // max(x, abs(x)) = abs(x)
989 // max(x, -x) = abs(x)
994 AlgebraicOpt::handleRCP(Instruction
*rcp
)
996 Instruction
*si
= rcp
->getSrc(0)->getUniqueInsn();
998 if (si
&& si
->op
== OP_RCP
) {
999 Modifier mod
= rcp
->src(0).mod
* si
->src(0).mod
;
1000 rcp
->op
= mod
.getOp();
1001 rcp
->setSrc(0, si
->getSrc(0));
1006 AlgebraicOpt::handleSLCT(Instruction
*slct
)
1008 if (slct
->getSrc(2)->reg
.file
== FILE_IMMEDIATE
) {
1009 if (slct
->getSrc(2)->asImm()->compare(slct
->asCmp()->setCond
, 0.0f
))
1010 slct
->setSrc(0, slct
->getSrc(1));
1012 if (slct
->getSrc(0) != slct
->getSrc(1)) {
1016 slct
->setSrc(1, NULL
);
1017 slct
->setSrc(2, NULL
);
1021 AlgebraicOpt::handleLOGOP(Instruction
*logop
)
1023 Value
*src0
= logop
->getSrc(0);
1024 Value
*src1
= logop
->getSrc(1);
1026 if (src0
->reg
.file
!= FILE_GPR
|| src1
->reg
.file
!= FILE_GPR
)
1030 if (logop
->src(0).mod
!= Modifier(0) ||
1031 logop
->src(1).mod
!= Modifier(0))
1033 if (logop
->op
== OP_AND
|| logop
->op
== OP_OR
) {
1034 logop
->def(0).replace(logop
->getSrc(0), false);
1035 delete_Instruction(prog
, logop
);
1038 // try AND(SET, SET) -> SET_AND(SET)
1039 Instruction
*set0
= src0
->getInsn();
1040 Instruction
*set1
= src1
->getInsn();
1042 if (!set0
|| set0
->fixed
|| !set1
|| set1
->fixed
)
1044 if (set1
->op
!= OP_SET
) {
1045 Instruction
*xchg
= set0
;
1048 if (set1
->op
!= OP_SET
)
1051 if (set0
->op
!= OP_SET
&&
1052 set0
->op
!= OP_SET_AND
&&
1053 set0
->op
!= OP_SET_OR
&&
1054 set0
->op
!= OP_SET_XOR
)
1056 if (set0
->getDef(0)->refCount() > 1 &&
1057 set1
->getDef(0)->refCount() > 1)
1059 if (set0
->getPredicate() || set1
->getPredicate())
1061 // check that they don't source each other
1062 for (int s
= 0; s
< 2; ++s
)
1063 if (set0
->getSrc(s
) == set1
->getDef(0) ||
1064 set1
->getSrc(s
) == set0
->getDef(0))
1067 set0
= set0
->clone(true);
1068 set1
= set1
->clone(false);
1069 logop
->bb
->insertAfter(logop
, set1
);
1070 logop
->bb
->insertAfter(logop
, set0
);
1072 set0
->dType
= TYPE_U8
;
1073 set0
->getDef(0)->reg
.file
= FILE_PREDICATE
;
1074 set0
->getDef(0)->reg
.size
= 1;
1075 set1
->setSrc(2, set0
->getDef(0));
1076 switch (logop
->op
) {
1077 case OP_AND
: set1
->op
= OP_SET_AND
; break;
1078 case OP_OR
: set1
->op
= OP_SET_OR
; break;
1079 case OP_XOR
: set1
->op
= OP_SET_XOR
; break;
1084 set1
->setDef(0, logop
->getDef(0));
1085 delete_Instruction(prog
, logop
);
1089 // F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
1091 AlgebraicOpt::handleCVT(Instruction
*cvt
)
1093 if (cvt
->sType
!= TYPE_F32
||
1094 cvt
->dType
!= TYPE_S32
|| cvt
->src(0).mod
!= Modifier(0))
1096 Instruction
*insn
= cvt
->getSrc(0)->getInsn();
1097 if (!insn
|| insn
->op
!= OP_NEG
|| insn
->dType
!= TYPE_F32
)
1099 if (insn
->src(0).mod
!= Modifier(0))
1101 insn
= insn
->getSrc(0)->getInsn();
1102 if (!insn
|| insn
->op
!= OP_SET
|| insn
->dType
!= TYPE_F32
)
1105 Instruction
*bset
= insn
->clone(false);
1106 bset
->dType
= TYPE_U32
;
1107 bset
->setDef(0, cvt
->getDef(0));
1108 cvt
->bb
->insertAfter(cvt
, bset
);
1109 delete_Instruction(prog
, cvt
);
1113 AlgebraicOpt::visit(BasicBlock
*bb
)
1116 for (Instruction
*i
= bb
->getEntry(); i
; i
= next
) {
1148 // =============================================================================
1151 updateLdStOffset(Instruction
*ldst
, int32_t offset
, Function
*fn
)
1153 if (offset
!= ldst
->getSrc(0)->reg
.data
.offset
) {
1154 if (ldst
->getSrc(0)->refCount() > 1)
1155 ldst
->setSrc(0, ldst
->getSrc(0)->clone(fn
));
1156 ldst
->getSrc(0)->reg
.data
.offset
= offset
;
1160 // Combine loads and stores, forward stores to loads where possible.
1161 class MemoryOpt
: public Pass
1169 const Value
*rel
[2];
1177 bool overlaps(const Instruction
*ldst
) const;
1179 inline void link(Record
**);
1180 inline void unlink(Record
**);
1181 inline void set(const Instruction
*ldst
);
1187 Record
*loads
[DATA_FILE_COUNT
];
1188 Record
*stores
[DATA_FILE_COUNT
];
1190 MemoryPool recordPool
;
1193 virtual bool visit(BasicBlock
*);
1194 bool runOpt(BasicBlock
*);
1196 Record
**getList(const Instruction
*);
1198 Record
*findRecord(const Instruction
*, bool load
, bool& isAdjacent
) const;
1200 // merge @insn into load/store instruction from @rec
1201 bool combineLd(Record
*rec
, Instruction
*ld
);
1202 bool combineSt(Record
*rec
, Instruction
*st
);
1204 bool replaceLdFromLd(Instruction
*ld
, Record
*ldRec
);
1205 bool replaceLdFromSt(Instruction
*ld
, Record
*stRec
);
1206 bool replaceStFromSt(Instruction
*restrict st
, Record
*stRec
);
1208 void addRecord(Instruction
*ldst
);
1209 void purgeRecords(Instruction
*const st
, DataFile
);
1210 void lockStores(Instruction
*const ld
);
1217 MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record
), 6)
1219 for (int i
= 0; i
< DATA_FILE_COUNT
; ++i
) {
1229 for (unsigned int i
= 0; i
< DATA_FILE_COUNT
; ++i
) {
1231 for (it
= loads
[i
]; it
; it
= next
) {
1233 recordPool
.release(it
);
1236 for (it
= stores
[i
]; it
; it
= next
) {
1238 recordPool
.release(it
);
1245 MemoryOpt::combineLd(Record
*rec
, Instruction
*ld
)
1247 int32_t offRc
= rec
->offset
;
1248 int32_t offLd
= ld
->getSrc(0)->reg
.data
.offset
;
1249 int sizeRc
= rec
->size
;
1250 int sizeLd
= typeSizeof(ld
->dType
);
1251 int size
= sizeRc
+ sizeLd
;
1254 if (!prog
->getTarget()->
1255 isAccessSupported(ld
->getSrc(0)->reg
.file
, typeOfSize(size
)))
1257 // no unaligned loads
1258 if (((size
== 0x8) && (MIN2(offLd
, offRc
) & 0x7)) ||
1259 ((size
== 0xc) && (MIN2(offLd
, offRc
) & 0xf)))
1262 assert(sizeRc
+ sizeLd
<= 16 && offRc
!= offLd
);
1264 for (j
= 0; sizeRc
; sizeRc
-= rec
->insn
->getDef(j
)->reg
.size
, ++j
);
1266 if (offLd
< offRc
) {
1268 for (sz
= 0, d
= 0; sz
< sizeLd
; sz
+= ld
->getDef(d
)->reg
.size
, ++d
);
1269 // d: nr of definitions in ld
1270 // j: nr of definitions in rec->insn, move:
1271 for (d
= d
+ j
- 1; j
> 0; --j
, --d
)
1272 rec
->insn
->setDef(d
, rec
->insn
->getDef(j
- 1));
1274 if (rec
->insn
->getSrc(0)->refCount() > 1)
1275 rec
->insn
->setSrc(0, rec
->insn
->getSrc(0)->clone(func
));
1276 rec
->offset
= rec
->insn
->getSrc(0)->reg
.data
.offset
= offLd
;
1282 // move definitions of @ld to @rec->insn
1283 for (j
= 0; sizeLd
; ++j
, ++d
) {
1284 sizeLd
-= ld
->getDef(j
)->reg
.size
;
1285 rec
->insn
->setDef(d
, ld
->getDef(j
));
1289 rec
->insn
->setType(typeOfSize(size
));
1291 delete_Instruction(prog
, ld
);
1297 MemoryOpt::combineSt(Record
*rec
, Instruction
*st
)
1299 int32_t offRc
= rec
->offset
;
1300 int32_t offSt
= st
->getSrc(0)->reg
.data
.offset
;
1301 int sizeRc
= rec
->size
;
1302 int sizeSt
= typeSizeof(st
->dType
);
1304 int size
= sizeRc
+ sizeSt
;
1306 Value
*src
[4]; // no modifiers in ValueRef allowed for st
1309 if (!prog
->getTarget()->
1310 isAccessSupported(st
->getSrc(0)->reg
.file
, typeOfSize(size
)))
1312 if (size
== 8 && MIN2(offRc
, offSt
) & 0x7)
1315 st
->takeExtraSources(0, extra
); // save predicate and indirect address
1317 if (offRc
< offSt
) {
1318 // save values from @st
1319 for (s
= 0; sizeSt
; ++s
) {
1320 sizeSt
-= st
->getSrc(s
+ 1)->reg
.size
;
1321 src
[s
] = st
->getSrc(s
+ 1);
1323 // set record's values as low sources of @st
1324 for (j
= 1; sizeRc
; ++j
) {
1325 sizeRc
-= st
->getSrc(j
)->reg
.size
;
1326 st
->setSrc(j
, rec
->insn
->getSrc(j
));
1328 // set saved values as high sources of @st
1329 for (k
= j
, j
= 0; j
< s
; ++j
)
1330 st
->setSrc(k
++, src
[j
]);
1332 updateLdStOffset(st
, offRc
, func
);
1334 for (j
= 1; sizeSt
; ++j
)
1335 sizeSt
-= st
->getSrc(j
)->reg
.size
;
1336 for (s
= 1; sizeRc
; ++j
, ++s
) {
1337 sizeRc
-= rec
->insn
->getSrc(s
)->reg
.size
;
1338 st
->setSrc(j
, rec
->insn
->getSrc(s
));
1340 rec
->offset
= offSt
;
1342 st
->putExtraSources(0, extra
); // restore pointer and predicate
1344 delete_Instruction(prog
, rec
->insn
);
1347 rec
->insn
->setType(typeOfSize(size
));
1352 MemoryOpt::Record::set(const Instruction
*ldst
)
1354 const Symbol
*mem
= ldst
->getSrc(0)->asSym();
1355 fileIndex
= mem
->reg
.fileIndex
;
1356 rel
[0] = ldst
->getIndirect(0, 0);
1357 rel
[1] = ldst
->getIndirect(0, 1);
1358 offset
= mem
->reg
.data
.offset
;
1359 base
= mem
->getBase();
1360 size
= typeSizeof(ldst
->sType
);
1364 MemoryOpt::Record::link(Record
**list
)
1374 MemoryOpt::Record::unlink(Record
**list
)
1384 MemoryOpt::Record
**
1385 MemoryOpt::getList(const Instruction
*insn
)
1387 if (insn
->op
== OP_LOAD
|| insn
->op
== OP_VFETCH
)
1388 return &loads
[insn
->src(0).getFile()];
1389 return &stores
[insn
->src(0).getFile()];
1393 MemoryOpt::addRecord(Instruction
*i
)
1395 Record
**list
= getList(i
);
1396 Record
*it
= reinterpret_cast<Record
*>(recordPool
.allocate());
1405 MemoryOpt::findRecord(const Instruction
*insn
, bool load
, bool& isAdj
) const
1407 const Symbol
*sym
= insn
->getSrc(0)->asSym();
1408 const int size
= typeSizeof(insn
->sType
);
1410 Record
*it
= load
? loads
[sym
->reg
.file
] : stores
[sym
->reg
.file
];
1412 for (; it
; it
= it
->next
) {
1413 if (it
->locked
&& insn
->op
!= OP_LOAD
)
1415 if ((it
->offset
>> 4) != (sym
->reg
.data
.offset
>> 4) ||
1416 it
->rel
[0] != insn
->getIndirect(0, 0) ||
1417 it
->fileIndex
!= sym
->reg
.fileIndex
||
1418 it
->rel
[1] != insn
->getIndirect(0, 1))
1421 if (it
->offset
< sym
->reg
.data
.offset
) {
1422 if (it
->offset
+ it
->size
>= sym
->reg
.data
.offset
) {
1423 isAdj
= (it
->offset
+ it
->size
== sym
->reg
.data
.offset
);
1426 if (!(it
->offset
& 0x7))
1430 isAdj
= it
->offset
!= sym
->reg
.data
.offset
;
1431 if (size
<= it
->size
&& !isAdj
)
1434 if (!(sym
->reg
.data
.offset
& 0x7))
1435 if (it
->offset
- size
<= sym
->reg
.data
.offset
)
1443 MemoryOpt::replaceLdFromSt(Instruction
*ld
, Record
*rec
)
1445 Instruction
*st
= rec
->insn
;
1446 int32_t offSt
= rec
->offset
;
1447 int32_t offLd
= ld
->getSrc(0)->reg
.data
.offset
;
1450 for (s
= 1; offSt
!= offLd
&& st
->srcExists(s
); ++s
)
1451 offSt
+= st
->getSrc(s
)->reg
.size
;
1455 for (d
= 0; ld
->defExists(d
) && st
->srcExists(s
); ++d
, ++s
) {
1456 if (ld
->getDef(d
)->reg
.size
!= st
->getSrc(s
)->reg
.size
)
1458 if (st
->getSrc(s
)->reg
.file
!= FILE_GPR
)
1460 ld
->def(d
).replace(st
->getSrc(s
), false);
1467 MemoryOpt::replaceLdFromLd(Instruction
*ldE
, Record
*rec
)
1469 Instruction
*ldR
= rec
->insn
;
1470 int32_t offR
= rec
->offset
;
1471 int32_t offE
= ldE
->getSrc(0)->reg
.data
.offset
;
1474 assert(offR
<= offE
);
1475 for (dR
= 0; offR
< offE
&& ldR
->defExists(dR
); ++dR
)
1476 offR
+= ldR
->getDef(dR
)->reg
.size
;
1480 for (dE
= 0; ldE
->defExists(dE
) && ldR
->defExists(dR
); ++dE
, ++dR
) {
1481 if (ldE
->getDef(dE
)->reg
.size
!= ldR
->getDef(dR
)->reg
.size
)
1483 ldE
->def(dE
).replace(ldR
->getDef(dR
), false);
1486 delete_Instruction(prog
, ldE
);
1491 MemoryOpt::replaceStFromSt(Instruction
*restrict st
, Record
*rec
)
1493 const Instruction
*const ri
= rec
->insn
;
1496 int32_t offS
= st
->getSrc(0)->reg
.data
.offset
;
1497 int32_t offR
= rec
->offset
;
1498 int32_t endS
= offS
+ typeSizeof(st
->dType
);
1499 int32_t endR
= offR
+ typeSizeof(ri
->dType
);
1501 rec
->size
= MAX2(endS
, endR
) - MIN2(offS
, offR
);
1503 st
->takeExtraSources(0, extra
);
1509 // get non-replaced sources of ri
1510 for (s
= 1; offR
< offS
; offR
+= ri
->getSrc(s
)->reg
.size
, ++s
)
1511 vals
[k
++] = ri
->getSrc(s
);
1513 // get replaced sources of st
1514 for (s
= 1; st
->srcExists(s
); offS
+= st
->getSrc(s
)->reg
.size
, ++s
)
1515 vals
[k
++] = st
->getSrc(s
);
1516 // skip replaced sources of ri
1517 for (s
= n
; offR
< endS
; offR
+= ri
->getSrc(s
)->reg
.size
, ++s
);
1518 // get non-replaced sources after values covered by st
1519 for (; offR
< endR
; offR
+= ri
->getSrc(s
)->reg
.size
, ++s
)
1520 vals
[k
++] = ri
->getSrc(s
);
1521 assert(k
<= Elements(vals
));
1522 for (s
= 0; s
< k
; ++s
)
1523 st
->setSrc(s
+ 1, vals
[s
]);
1524 st
->setSrc(0, ri
->getSrc(0));
1528 for (j
= 1; offR
< endS
; offR
+= ri
->getSrc(j
++)->reg
.size
);
1529 for (s
= 1; offS
< endS
; offS
+= st
->getSrc(s
++)->reg
.size
);
1530 for (; offR
< endR
; offR
+= ri
->getSrc(j
++)->reg
.size
)
1531 st
->setSrc(s
++, ri
->getSrc(j
));
1533 st
->putExtraSources(0, extra
);
1535 delete_Instruction(prog
, rec
->insn
);
1538 rec
->offset
= st
->getSrc(0)->reg
.data
.offset
;
1540 st
->setType(typeOfSize(rec
->size
));
1546 MemoryOpt::Record::overlaps(const Instruction
*ldst
) const
1551 if (this->fileIndex
!= that
.fileIndex
)
1554 if (this->rel
[0] || that
.rel
[0])
1555 return this->base
== that
.base
;
1557 (this->offset
< that
.offset
+ that
.size
) &&
1558 (this->offset
+ this->size
> that
.offset
);
1561 // We must not eliminate stores that affect the result of @ld if
1562 // we find later stores to the same location, and we may no longer
1563 // merge them with later stores.
1564 // The stored value can, however, still be used to determine the value
1565 // returned by future loads.
1567 MemoryOpt::lockStores(Instruction
*const ld
)
1569 for (Record
*r
= stores
[ld
->src(0).getFile()]; r
; r
= r
->next
)
1570 if (!r
->locked
&& r
->overlaps(ld
))
1574 // Prior loads from the location of @st are no longer valid.
1575 // Stores to the location of @st may no longer be used to derive
1576 // the value at it nor be coalesced into later stores.
1578 MemoryOpt::purgeRecords(Instruction
*const st
, DataFile f
)
1581 f
= st
->src(0).getFile();
1583 for (Record
*r
= loads
[f
]; r
; r
= r
->next
)
1584 if (!st
|| r
->overlaps(st
))
1585 r
->unlink(&loads
[f
]);
1587 for (Record
*r
= stores
[f
]; r
; r
= r
->next
)
1588 if (!st
|| r
->overlaps(st
))
1589 r
->unlink(&stores
[f
]);
1593 MemoryOpt::visit(BasicBlock
*bb
)
1595 bool ret
= runOpt(bb
);
1596 // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
1597 // where 96 bit memory operations are forbidden.
1604 MemoryOpt::runOpt(BasicBlock
*bb
)
1606 Instruction
*ldst
, *next
;
1608 bool isAdjacent
= true;
1610 for (ldst
= bb
->getEntry(); ldst
; ldst
= next
) {
1615 if (ldst
->op
== OP_LOAD
|| ldst
->op
== OP_VFETCH
) {
1616 if (ldst
->isDead()) {
1617 // might have been produced by earlier optimization
1618 delete_Instruction(prog
, ldst
);
1622 if (ldst
->op
== OP_STORE
|| ldst
->op
== OP_EXPORT
) {
1625 // TODO: maybe have all fixed ops act as barrier ?
1626 if (ldst
->op
== OP_CALL
) {
1627 purgeRecords(NULL
, FILE_MEMORY_LOCAL
);
1628 purgeRecords(NULL
, FILE_MEMORY_GLOBAL
);
1629 purgeRecords(NULL
, FILE_MEMORY_SHARED
);
1630 purgeRecords(NULL
, FILE_SHADER_OUTPUT
);
1632 if (ldst
->op
== OP_EMIT
|| ldst
->op
== OP_RESTART
) {
1633 purgeRecords(NULL
, FILE_SHADER_OUTPUT
);
1637 if (ldst
->getPredicate()) // TODO: handle predicated ld/st
1641 DataFile file
= ldst
->src(0).getFile();
1643 // if ld l[]/g[] look for previous store to eliminate the reload
1644 if (file
== FILE_MEMORY_GLOBAL
|| file
== FILE_MEMORY_LOCAL
) {
1645 // TODO: shared memory ?
1646 rec
= findRecord(ldst
, false, isAdjacent
);
1647 if (rec
&& !isAdjacent
)
1648 keep
= !replaceLdFromSt(ldst
, rec
);
1651 // or look for ld from the same location and replace this one
1652 rec
= keep
? findRecord(ldst
, true, isAdjacent
) : NULL
;
1655 keep
= !replaceLdFromLd(ldst
, rec
);
1657 // or combine a previous load with this one
1658 keep
= !combineLd(rec
, ldst
);
1663 rec
= findRecord(ldst
, false, isAdjacent
);
1666 keep
= !replaceStFromSt(ldst
, rec
);
1668 keep
= !combineSt(rec
, ldst
);
1671 purgeRecords(ldst
, DATA_FILE_COUNT
);
1681 // =============================================================================
1683 // Turn control flow into predicated instructions (after register allocation !).
1685 // Could move this to before register allocation on NVC0 and also handle nested
1687 class FlatteningPass
: public Pass
1690 virtual bool visit(BasicBlock
*);
1692 bool tryPredicateConditional(BasicBlock
*);
1693 void predicateInstructions(BasicBlock
*, Value
*pred
, CondCode cc
);
1694 void tryPropagateBranch(BasicBlock
*);
1695 inline bool isConstantCondition(Value
*pred
);
1696 inline bool mayPredicate(const Instruction
*, const Value
*pred
) const;
1697 inline void removeFlow(Instruction
*);
1701 FlatteningPass::isConstantCondition(Value
*pred
)
1703 Instruction
*insn
= pred
->getUniqueInsn();
1705 if (insn
->op
!= OP_SET
|| insn
->srcExists(2))
1708 for (int s
= 0; s
< 2 && insn
->srcExists(s
); ++s
) {
1709 Instruction
*ld
= insn
->getSrc(s
)->getUniqueInsn();
1712 if (ld
->op
!= OP_MOV
&& ld
->op
!= OP_LOAD
)
1714 if (ld
->src(0).isIndirect(0))
1716 file
= ld
->src(0).getFile();
1718 file
= insn
->src(s
).getFile();
1719 // catch $r63 on NVC0
1720 if (file
== FILE_GPR
&& insn
->getSrc(s
)->reg
.data
.id
> prog
->maxGPR
)
1721 file
= FILE_IMMEDIATE
;
1723 if (file
!= FILE_IMMEDIATE
&& file
!= FILE_MEMORY_CONST
)
1730 FlatteningPass::removeFlow(Instruction
*insn
)
1732 FlowInstruction
*term
= insn
? insn
->asFlow() : NULL
;
1735 Graph::Edge::Type ty
= term
->bb
->cfg
.outgoing().getType();
1737 if (term
->op
== OP_BRA
) {
1738 // TODO: this might get more difficult when we get arbitrary BRAs
1739 if (ty
== Graph::Edge::CROSS
|| ty
== Graph::Edge::BACK
)
1742 if (term
->op
!= OP_JOIN
)
1745 delete_Instruction(prog
, term
);
1747 Value
*pred
= term
->getPredicate();
1749 if (pred
&& pred
->refCount() == 0) {
1750 Instruction
*pSet
= pred
->getUniqueInsn();
1751 pred
->join
->reg
.data
.id
= -1; // deallocate
1753 delete_Instruction(prog
, pSet
);
1758 FlatteningPass::predicateInstructions(BasicBlock
*bb
, Value
*pred
, CondCode cc
)
1760 for (Instruction
*i
= bb
->getEntry(); i
; i
= i
->next
) {
1763 assert(!i
->getPredicate());
1764 i
->setPredicate(cc
, pred
);
1766 removeFlow(bb
->getExit());
1770 FlatteningPass::mayPredicate(const Instruction
*insn
, const Value
*pred
) const
1772 if (insn
->isPseudo())
1774 // TODO: calls where we don't know which registers are modified
1776 if (!prog
->getTarget()->mayPredicate(insn
, pred
))
1778 for (int d
= 0; insn
->defExists(d
); ++d
)
1779 if (insn
->getDef(d
)->equals(pred
))
1784 // If we conditionally skip over or to a branch instruction, replace it.
1785 // NOTE: We do not update the CFG anymore here !
1787 FlatteningPass::tryPropagateBranch(BasicBlock
*bb
)
1789 BasicBlock
*bf
= NULL
;
1792 if (bb
->cfg
.outgoingCount() != 2)
1794 if (!bb
->getExit() || bb
->getExit()->op
!= OP_BRA
)
1796 Graph::EdgeIterator ei
= bb
->cfg
.outgoing();
1798 for (i
= 0; !ei
.end(); ++i
, ei
.next()) {
1799 bf
= BasicBlock::get(ei
.getNode());
1800 if (bf
->getInsnCount() == 1)
1803 if (ei
.end() || !bf
->getExit())
1805 FlowInstruction
*bra
= bb
->getExit()->asFlow();
1806 FlowInstruction
*rep
= bf
->getExit()->asFlow();
1808 if (rep
->getPredicate())
1810 if (rep
->op
!= OP_BRA
&&
1811 rep
->op
!= OP_JOIN
&&
1816 bra
->target
.bb
= rep
->target
.bb
;
1817 if (i
) // 2nd out block means branch not taken
1818 bra
->cc
= inverseCondCode(bra
->cc
);
1823 FlatteningPass::visit(BasicBlock
*bb
)
1825 if (tryPredicateConditional(bb
))
1828 // try to attach join to previous instruction
1829 Instruction
*insn
= bb
->getExit();
1830 if (insn
&& insn
->op
== OP_JOIN
&& !insn
->getPredicate()) {
1832 if (insn
&& !insn
->getPredicate() && !insn
->asFlow() && !insn
->isNop()) {
1834 bb
->remove(bb
->getExit());
1839 tryPropagateBranch(bb
);
1845 FlatteningPass::tryPredicateConditional(BasicBlock
*bb
)
1847 BasicBlock
*bL
= NULL
, *bR
= NULL
;
1848 unsigned int nL
= 0, nR
= 0, limit
= 12;
1852 mask
= bb
->initiatesSimpleConditional();
1856 assert(bb
->getExit());
1857 Value
*pred
= bb
->getExit()->getPredicate();
1860 if (isConstantCondition(pred
))
1863 Graph::EdgeIterator ei
= bb
->cfg
.outgoing();
1866 bL
= BasicBlock::get(ei
.getNode());
1867 for (insn
= bL
->getEntry(); insn
; insn
= insn
->next
, ++nL
)
1868 if (!mayPredicate(insn
, pred
))
1871 return false; // too long, do a real branch
1876 bR
= BasicBlock::get(ei
.getNode());
1877 for (insn
= bR
->getEntry(); insn
; insn
= insn
->next
, ++nR
)
1878 if (!mayPredicate(insn
, pred
))
1881 return false; // too long, do a real branch
1885 predicateInstructions(bL
, pred
, bb
->getExit()->cc
);
1887 predicateInstructions(bR
, pred
, inverseCondCode(bb
->getExit()->cc
));
1890 bb
->remove(bb
->joinAt
);
1893 removeFlow(bb
->getExit()); // delete the branch/join at the fork point
1895 // remove potential join operations at the end of the conditional
1896 if (prog
->getTarget()->joinAnterior
) {
1897 bb
= BasicBlock::get((bL
? bL
: bR
)->cfg
.outgoing().getNode());
1898 if (bb
->getEntry() && bb
->getEntry()->op
== OP_JOIN
)
1899 removeFlow(bb
->getEntry());
1905 // =============================================================================
1907 // Common subexpression elimination. Stupid O^2 implementation.
1908 class LocalCSE
: public Pass
1911 virtual bool visit(BasicBlock
*);
1913 inline bool tryReplace(Instruction
**, Instruction
*);
1915 DLList ops
[OP_LAST
+ 1];
1918 class GlobalCSE
: public Pass
1921 virtual bool visit(BasicBlock
*);
1925 Instruction::isActionEqual(const Instruction
*that
) const
1927 if (this->op
!= that
->op
||
1928 this->dType
!= that
->dType
||
1929 this->sType
!= that
->sType
)
1931 if (this->cc
!= that
->cc
)
1934 if (this->asTex()) {
1935 if (memcmp(&this->asTex()->tex
,
1936 &that
->asTex()->tex
,
1937 sizeof(this->asTex()->tex
)))
1940 if (this->asCmp()) {
1941 if (this->asCmp()->setCond
!= that
->asCmp()->setCond
)
1944 if (this->asFlow()) {
1947 if (this->atomic
!= that
->atomic
||
1948 this->ipa
!= that
->ipa
||
1949 this->lanes
!= that
->lanes
||
1950 this->perPatch
!= that
->perPatch
)
1952 if (this->postFactor
!= that
->postFactor
)
1956 if (this->subOp
!= that
->subOp
||
1957 this->saturate
!= that
->saturate
||
1958 this->rnd
!= that
->rnd
||
1959 this->ftz
!= that
->ftz
||
1960 this->dnz
!= that
->dnz
||
1961 this->cache
!= that
->cache
)
1968 Instruction::isResultEqual(const Instruction
*that
) const
1972 // NOTE: location of discard only affects tex with liveOnly and quadops
1973 if (!this->defExists(0) && this->op
!= OP_DISCARD
)
1976 if (!isActionEqual(that
))
1979 if (this->predSrc
!= that
->predSrc
)
1982 for (d
= 0; this->defExists(d
); ++d
) {
1983 if (!that
->defExists(d
) ||
1984 !this->getDef(d
)->equals(that
->getDef(d
), false))
1987 if (that
->defExists(d
))
1990 for (s
= 0; this->srcExists(s
); ++s
) {
1991 if (!that
->srcExists(s
))
1993 if (this->src(s
).mod
!= that
->src(s
).mod
)
1995 if (!this->getSrc(s
)->equals(that
->getSrc(s
), true))
1998 if (that
->srcExists(s
))
2001 if (op
== OP_LOAD
|| op
== OP_VFETCH
) {
2002 switch (src(0).getFile()) {
2003 case FILE_MEMORY_CONST
:
2004 case FILE_SHADER_INPUT
:
2014 // pull through common expressions from different in-blocks
2016 GlobalCSE::visit(BasicBlock
*bb
)
2018 Instruction
*phi
, *next
, *ik
;
2021 for (phi
= bb
->getPhi(); phi
&& phi
->op
== OP_PHI
; phi
= next
) {
2023 if (phi
->getSrc(0)->refCount() > 1)
2025 ik
= phi
->getSrc(0)->getInsn();
2026 for (s
= 1; phi
->srcExists(s
); ++s
) {
2027 if (phi
->getSrc(s
)->refCount() > 1)
2029 if (!phi
->getSrc(s
)->getInsn()->isResultEqual(ik
))
2032 if (!phi
->srcExists(s
)) {
2033 Instruction
*entry
= bb
->getEntry();
2035 if (!entry
|| entry
->op
!= OP_JOIN
)
2038 bb
->insertAfter(entry
, ik
);
2039 ik
->setDef(0, phi
->getDef(0));
2040 delete_Instruction(prog
, phi
);
2048 LocalCSE::tryReplace(Instruction
**ptr
, Instruction
*i
)
2050 Instruction
*old
= *ptr
;
2051 if (!old
->isResultEqual(i
))
2053 for (int d
= 0; old
->defExists(d
); ++d
)
2054 old
->def(d
).replace(i
->getDef(d
), false);
2055 delete_Instruction(prog
, old
);
2061 LocalCSE::visit(BasicBlock
*bb
)
2063 unsigned int replaced
;
2066 Instruction
*ir
, *next
;
2070 // will need to know the order of instructions
2072 for (ir
= bb
->getEntry(); ir
; ir
= ir
->next
)
2073 ir
->serial
= serial
++;
2075 for (ir
= bb
->getEntry(); ir
; ir
= next
) {
2082 ops
[ir
->op
].insert(ir
);
2086 for (s
= 0; ir
->srcExists(s
); ++s
)
2087 if (ir
->getSrc(s
)->asLValue())
2088 if (!src
|| ir
->getSrc(s
)->refCount() < src
->refCount())
2089 src
= ir
->getSrc(s
);
2092 for (Value::UseIterator it
= src
->uses
.begin();
2093 it
!= src
->uses
.end(); ++it
) {
2094 Instruction
*ik
= (*it
)->getInsn();
2095 if (ik
&& ik
->serial
< ir
->serial
&& ik
->bb
== ir
->bb
)
2096 if (tryReplace(&ir
, ik
))
2100 DLLIST_FOR_EACH(&ops
[ir
->op
], iter
)
2102 Instruction
*ik
= reinterpret_cast<Instruction
*>(iter
.get());
2103 if (tryReplace(&ir
, ik
))
2109 ops
[ir
->op
].insert(ir
);
2113 for (unsigned int i
= 0; i
<= OP_LAST
; ++i
)
2121 // =============================================================================
2123 // Remove computations of unused values.
2124 class DeadCodeElim
: public Pass
2127 bool buryAll(Program
*);
2130 virtual bool visit(BasicBlock
*);
2132 void checkSplitLoad(Instruction
*ld
); // for partially dead loads
2134 unsigned int deadCount
;
2138 DeadCodeElim::buryAll(Program
*prog
)
2142 if (!this->run(prog
, false, false))
2144 } while (deadCount
);
2150 DeadCodeElim::visit(BasicBlock
*bb
)
2154 for (Instruction
*i
= bb
->getFirst(); i
; i
= next
) {
2158 delete_Instruction(prog
, i
);
2160 if (i
->defExists(1) && (i
->op
== OP_VFETCH
|| i
->op
== OP_LOAD
)) {
2168 DeadCodeElim::checkSplitLoad(Instruction
*ld1
)
2170 Instruction
*ld2
= NULL
; // can get at most 2 loads
2173 int32_t addr1
, addr2
;
2174 int32_t size1
, size2
;
2176 uint32_t mask
= 0xffffffff;
2178 for (d
= 0; ld1
->defExists(d
); ++d
)
2179 if (!ld1
->getDef(d
)->refCount() && ld1
->getDef(d
)->reg
.data
.id
< 0)
2181 if (mask
== 0xffffffff)
2184 addr1
= ld1
->getSrc(0)->reg
.data
.offset
;
2187 for (d
= 0; ld1
->defExists(d
); ++d
) {
2188 if (mask
& (1 << d
)) {
2189 if (size1
&& (addr1
& 0x7))
2191 def1
[n1
] = ld1
->getDef(d
);
2192 size1
+= def1
[n1
++]->reg
.size
;
2195 addr1
+= ld1
->getDef(d
)->reg
.size
;
2200 for (addr2
= addr1
+ size1
; ld1
->defExists(d
); ++d
) {
2201 if (mask
& (1 << d
)) {
2202 def2
[n2
] = ld1
->getDef(d
);
2203 size2
+= def2
[n2
++]->reg
.size
;
2206 addr2
+= ld1
->getDef(d
)->reg
.size
;
2210 updateLdStOffset(ld1
, addr1
, func
);
2211 ld1
->setType(typeOfSize(size1
));
2212 for (d
= 0; d
< 4; ++d
)
2213 ld1
->setDef(d
, (d
< n1
) ? def1
[d
] : NULL
);
2218 ld2
= ld1
->clone(false);
2219 updateLdStOffset(ld2
, addr2
, func
);
2220 ld2
->setType(typeOfSize(size2
));
2221 for (d
= 0; d
< 4; ++d
)
2222 ld2
->setDef(d
, (d
< n2
) ? def2
[d
] : NULL
);
2224 ld1
->bb
->insertAfter(ld1
, ld2
);
// =============================================================================

// Instantiate pass @n and invoke its member @f on this Program when the
// optimization level is at least @l; abort the pipeline on failure.
#define RUN_PASS(l, n, f)                    \
   if (level >= (l)) {                       \
      if (dbgFlags & NV50_IR_DEBUG_VERBOSE)  \
         INFO("PEEPHOLE: %s\n", #n);         \
      n pass;                                \
      if (!pass.f(this))                     \
         return false;                       \
   }
2239 Program::optimizeSSA(int level
)
2241 RUN_PASS(1, DeadCodeElim
, buryAll
);
2242 RUN_PASS(1, CopyPropagation
, run
);
2243 RUN_PASS(2, GlobalCSE
, run
);
2244 RUN_PASS(1, LocalCSE
, run
);
2245 RUN_PASS(2, AlgebraicOpt
, run
);
2246 RUN_PASS(2, ModifierFolding
, run
); // before load propagation -> less checks
2247 RUN_PASS(1, ConstantFolding
, foldAll
);
2248 RUN_PASS(1, LoadPropagation
, run
);
2249 RUN_PASS(2, MemoryOpt
, run
);
2250 RUN_PASS(2, LocalCSE
, run
);
2251 RUN_PASS(0, DeadCodeElim
, buryAll
);
2256 Program::optimizePostRA(int level
)
2258 RUN_PASS(2, FlatteningPass
, run
);