2 * Copyright 2011 Christoph Bumiller
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 #include "nv50_ir_target.h"
25 #include "nv50_ir_build_util.h"
28 #include "util/u_math.h"
34 Instruction::isNop() const
36 if (op
== OP_CONSTRAINT
|| op
== OP_PHI
)
38 if (terminator
|| join
) // XXX: should terminator imply flow ?
40 if (!fixed
&& op
== OP_NOP
)
43 if (defExists(0) && def
[0].rep()->reg
.data
.id
< 0) {
44 for (int d
= 1; defExists(d
); ++d
)
45 if (def
[d
].rep()->reg
.data
.id
>= 0)
46 WARN("part of vector result is unused !\n");
50 if (op
== OP_MOV
|| op
== OP_UNION
) {
51 if (!def
[0].rep()->equals(getSrc(0)))
54 if (!def
[0].rep()->equals(getSrc(1)))
62 bool Instruction::isDead() const
68 for (int d
= 0; defExists(d
); ++d
)
69 if (getDef(d
)->refCount() || getDef(d
)->reg
.data
.id
>= 0)
72 if (terminator
|| asFlow())
80 // =============================================================================
82 class CopyPropagation
: public Pass
85 virtual bool visit(BasicBlock
*);
88 // Propagate all MOVs forward to make subsequent optimization easier, except if
89 // the sources stem from a phi, in which case we don't want to mess up potential
90 // swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
92 CopyPropagation::visit(BasicBlock
*bb
)
94 Instruction
*mov
, *si
, *next
;
96 for (mov
= bb
->getEntry(); mov
; mov
= next
) {
98 if (mov
->op
!= OP_MOV
|| mov
->fixed
|| !mov
->getSrc(0)->asLValue())
100 si
= mov
->getSrc(0)->getInsn();
101 if (mov
->getDef(0)->reg
.data
.id
< 0 && si
&& si
->op
!= OP_PHI
) {
103 mov
->def
[0].replace(mov
->getSrc(0), false);
104 delete_Instruction(prog
, mov
);
110 // =============================================================================
112 class LoadPropagation
: public Pass
115 virtual bool visit(BasicBlock
*);
117 void checkSwapSrc01(Instruction
*);
119 bool isCSpaceLoad(Instruction
*);
120 bool isImmd32Load(Instruction
*);
124 LoadPropagation::isCSpaceLoad(Instruction
*ld
)
126 return ld
&& ld
->op
== OP_LOAD
&& ld
->src
[0].getFile() == FILE_MEMORY_CONST
;
130 LoadPropagation::isImmd32Load(Instruction
*ld
)
132 if (!ld
|| (ld
->op
!= OP_MOV
) || (typeSizeof(ld
->dType
) != 4))
134 return ld
->src
[0].getFile() == FILE_IMMEDIATE
;
138 LoadPropagation::checkSwapSrc01(Instruction
*insn
)
140 if (!prog
->getTarget()->getOpInfo(insn
).commutative
)
141 if (insn
->op
!= OP_SET
&& insn
->op
!= OP_SLCT
)
143 if (insn
->src
[1].getFile() != FILE_GPR
)
146 Instruction
*i0
= insn
->getSrc(0)->getInsn();
147 Instruction
*i1
= insn
->getSrc(1)->getInsn();
149 if (isCSpaceLoad(i0
)) {
150 if (!isCSpaceLoad(i1
))
151 insn
->swapSources(0, 1);
155 if (isImmd32Load(i0
)) {
156 if (!isCSpaceLoad(i1
) && !isImmd32Load(i1
))
157 insn
->swapSources(0, 1);
164 if (insn
->op
== OP_SET
)
165 insn
->asCmp()->setCond
= reverseCondCode(insn
->asCmp()->setCond
);
167 if (insn
->op
== OP_SLCT
)
168 insn
->asCmp()->setCond
= inverseCondCode(insn
->asCmp()->setCond
);
172 LoadPropagation::visit(BasicBlock
*bb
)
174 const Target
*targ
= prog
->getTarget();
177 for (Instruction
*i
= bb
->getEntry(); i
; i
= next
) {
183 for (int s
= 0; i
->srcExists(s
); ++s
) {
184 Instruction
*ld
= i
->getSrc(s
)->getInsn();
186 if (!ld
|| ld
->fixed
|| (ld
->op
!= OP_LOAD
&& ld
->op
!= OP_MOV
))
188 if (!targ
->insnCanLoad(i
, s
, ld
))
192 i
->setSrc(s
, ld
->getSrc(0));
193 if (ld
->src
[0].isIndirect(0))
194 i
->setIndirect(s
, 0, ld
->getIndirect(0, 0));
196 if (ld
->getDef(0)->refCount() == 0)
197 delete_Instruction(prog
, ld
);
203 // =============================================================================
205 // Evaluate constant expressions.
206 class ConstantFolding
: public Pass
209 bool foldAll(Program
*);
212 virtual bool visit(BasicBlock
*);
214 void expr(Instruction
*, ImmediateValue
*, ImmediateValue
*);
215 void opnd(Instruction
*, ImmediateValue
*, int s
);
217 void unary(Instruction
*, const ImmediateValue
&);
219 void tryCollapseChainedMULs(Instruction
*, const int s
, ImmediateValue
&);
221 // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET
222 CmpInstruction
*findOriginForTestWithZero(Value
*);
224 unsigned int foldCount
;
229 // TODO: remember generated immediates and only revisit these
231 ConstantFolding::foldAll(Program
*prog
)
233 unsigned int iterCount
= 0;
238 } while (foldCount
&& ++iterCount
< 2);
243 ConstantFolding::visit(BasicBlock
*bb
)
245 Instruction
*i
, *next
;
247 for (i
= bb
->getEntry(); i
; i
= next
) {
249 if (i
->op
== OP_MOV
) // continue early, MOV appears frequently
252 ImmediateValue
*src0
= i
->srcExists(0) ? i
->src
[0].getImmediate() : NULL
;
253 ImmediateValue
*src1
= i
->srcExists(1) ? i
->src
[1].getImmediate() : NULL
;
268 ConstantFolding::findOriginForTestWithZero(Value
*value
)
272 Instruction
*insn
= value
->getInsn();
274 while (insn
&& insn
->op
!= OP_SET
) {
275 Instruction
*next
= NULL
;
280 next
= insn
->getSrc(0)->getInsn();
281 if (insn
->sType
!= next
->dType
)
285 next
= insn
->getSrc(0)->getInsn();
292 return insn
? insn
->asCmp() : NULL
;
296 Modifier::applyTo(ImmediateValue
& imm
) const
298 switch (imm
.reg
.type
) {
300 if (bits
& NV50_IR_MOD_ABS
)
301 imm
.reg
.data
.f32
= fabsf(imm
.reg
.data
.f32
);
302 if (bits
& NV50_IR_MOD_NEG
)
303 imm
.reg
.data
.f32
= -imm
.reg
.data
.f32
;
304 if (bits
& NV50_IR_MOD_SAT
) {
305 if (imm
.reg
.data
.f32
< 0.0f
)
306 imm
.reg
.data
.f32
= 0.0f
;
308 if (imm
.reg
.data
.f32
> 1.0f
)
309 imm
.reg
.data
.f32
= 1.0f
;
311 assert(!(bits
& NV50_IR_MOD_NOT
));
314 case TYPE_S8
: // NOTE: will be extended
317 case TYPE_U8
: // NOTE: treated as signed
320 if (bits
& NV50_IR_MOD_ABS
)
321 imm
.reg
.data
.s32
= (imm
.reg
.data
.s32
>= 0) ?
322 imm
.reg
.data
.s32
: -imm
.reg
.data
.s32
;
323 if (bits
& NV50_IR_MOD_NEG
)
324 imm
.reg
.data
.s32
= -imm
.reg
.data
.s32
;
325 if (bits
& NV50_IR_MOD_NOT
)
326 imm
.reg
.data
.s32
= ~imm
.reg
.data
.s32
;
330 if (bits
& NV50_IR_MOD_ABS
)
331 imm
.reg
.data
.f64
= fabs(imm
.reg
.data
.f64
);
332 if (bits
& NV50_IR_MOD_NEG
)
333 imm
.reg
.data
.f64
= -imm
.reg
.data
.f64
;
334 if (bits
& NV50_IR_MOD_SAT
) {
335 if (imm
.reg
.data
.f64
< 0.0)
336 imm
.reg
.data
.f64
= 0.0;
338 if (imm
.reg
.data
.f64
> 1.0)
339 imm
.reg
.data
.f64
= 1.0;
341 assert(!(bits
& NV50_IR_MOD_NOT
));
345 assert(!"invalid/unhandled type");
346 imm
.reg
.data
.u64
= 0;
352 Modifier::getOp() const
355 case NV50_IR_MOD_ABS
: return OP_ABS
;
356 case NV50_IR_MOD_NEG
: return OP_NEG
;
357 case NV50_IR_MOD_SAT
: return OP_SAT
;
358 case NV50_IR_MOD_NOT
: return OP_NOT
;
367 ConstantFolding::expr(Instruction
*i
,
368 ImmediateValue
*src0
, ImmediateValue
*src1
)
370 ImmediateValue
imm0(src0
, i
->sType
);
371 ImmediateValue
imm1(src1
, i
->sType
);
373 struct Storage
*const a
= &imm0
.reg
, *const b
= &imm1
.reg
;
375 i
->src
[0].mod
.applyTo(imm0
);
376 i
->src
[1].mod
.applyTo(imm1
);
382 if (i
->dnz
&& i
->dType
== TYPE_F32
) {
383 if (!isfinite(a
->data
.f32
))
385 if (!isfinite(b
->data
.f32
))
389 case TYPE_F32
: res
.data
.f32
= a
->data
.f32
* b
->data
.f32
; break;
390 case TYPE_F64
: res
.data
.f64
= a
->data
.f64
* b
->data
.f64
; break;
392 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
* b
->data
.u32
; break;
398 if (b
->data
.u32
== 0)
401 case TYPE_F32
: res
.data
.f32
= a
->data
.f32
/ b
->data
.f32
; break;
402 case TYPE_F64
: res
.data
.f64
= a
->data
.f64
/ b
->data
.f64
; break;
403 case TYPE_S32
: res
.data
.s32
= a
->data
.s32
/ b
->data
.s32
; break;
404 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
/ b
->data
.u32
; break;
411 case TYPE_F32
: res
.data
.f32
= a
->data
.f32
+ b
->data
.f32
; break;
412 case TYPE_F64
: res
.data
.f64
= a
->data
.f64
+ b
->data
.f64
; break;
414 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
+ b
->data
.u32
; break;
421 case TYPE_F32
: res
.data
.f32
= pow(a
->data
.f32
, b
->data
.f32
); break;
422 case TYPE_F64
: res
.data
.f64
= pow(a
->data
.f64
, b
->data
.f64
); break;
429 case TYPE_F32
: res
.data
.f32
= MAX2(a
->data
.f32
, b
->data
.f32
); break;
430 case TYPE_F64
: res
.data
.f64
= MAX2(a
->data
.f64
, b
->data
.f64
); break;
431 case TYPE_S32
: res
.data
.s32
= MAX2(a
->data
.s32
, b
->data
.s32
); break;
432 case TYPE_U32
: res
.data
.u32
= MAX2(a
->data
.u32
, b
->data
.u32
); break;
439 case TYPE_F32
: res
.data
.f32
= MIN2(a
->data
.f32
, b
->data
.f32
); break;
440 case TYPE_F64
: res
.data
.f64
= MIN2(a
->data
.f64
, b
->data
.f64
); break;
441 case TYPE_S32
: res
.data
.s32
= MIN2(a
->data
.s32
, b
->data
.s32
); break;
442 case TYPE_U32
: res
.data
.u32
= MIN2(a
->data
.u32
, b
->data
.u32
); break;
448 res
.data
.u64
= a
->data
.u64
& b
->data
.u64
;
451 res
.data
.u64
= a
->data
.u64
| b
->data
.u64
;
454 res
.data
.u64
= a
->data
.u64
^ b
->data
.u64
;
457 res
.data
.u32
= a
->data
.u32
<< b
->data
.u32
;
461 case TYPE_S32
: res
.data
.s32
= a
->data
.s32
>> b
->data
.u32
; break;
462 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
>> b
->data
.u32
; break;
468 if (a
->data
.u32
!= b
->data
.u32
)
470 res
.data
.u32
= a
->data
.u32
;
477 i
->src
[0].mod
= Modifier(0);
478 i
->src
[1].mod
= Modifier(0);
480 i
->setSrc(0, new_ImmediateValue(i
->bb
->getProgram(), res
.data
.u32
));
483 i
->getSrc(0)->reg
.data
= res
.data
;
485 if (i
->op
== OP_MAD
|| i
->op
== OP_FMA
) {
488 i
->setSrc(1, i
->getSrc(0));
489 i
->setSrc(0, i
->getSrc(2));
492 i
->src
[1].mod
= i
->src
[2].mod
;
494 src0
= i
->src
[0].getImmediate();
496 expr(i
, src0
, i
->getSrc(1)->asImm());
503 ConstantFolding::unary(Instruction
*i
, const ImmediateValue
&imm
)
507 if (i
->dType
!= TYPE_F32
)
510 case OP_NEG
: res
.data
.f32
= -imm
.reg
.data
.f32
; break;
511 case OP_ABS
: res
.data
.f32
= fabsf(imm
.reg
.data
.f32
); break;
512 case OP_RCP
: res
.data
.f32
= 1.0f
/ imm
.reg
.data
.f32
; break;
513 case OP_RSQ
: res
.data
.f32
= 1.0f
/ sqrtf(imm
.reg
.data
.f32
); break;
514 case OP_LG2
: res
.data
.f32
= log2f(imm
.reg
.data
.f32
); break;
515 case OP_EX2
: res
.data
.f32
= exp2f(imm
.reg
.data
.f32
); break;
516 case OP_SIN
: res
.data
.f32
= sinf(imm
.reg
.data
.f32
); break;
517 case OP_COS
: res
.data
.f32
= cosf(imm
.reg
.data
.f32
); break;
518 case OP_SQRT
: res
.data
.f32
= sqrtf(imm
.reg
.data
.f32
); break;
521 // these should be handled in subsequent OP_SIN/COS/EX2
522 res
.data
.f32
= imm
.reg
.data
.f32
;
528 i
->setSrc(0, new_ImmediateValue(i
->bb
->getProgram(), res
.data
.f32
));
529 i
->src
[0].mod
= Modifier(0);
533 ConstantFolding::tryCollapseChainedMULs(Instruction
*mul2
,
534 const int s
, ImmediateValue
& imm2
)
536 const int t
= s
? 0 : 1;
538 Instruction
*mul1
= NULL
; // mul1 before mul2
540 float f
= imm2
.reg
.data
.f32
;
542 assert(mul2
->op
== OP_MUL
&& mul2
->dType
== TYPE_F32
);
544 if (mul2
->getSrc(t
)->refCount() == 1) {
545 insn
= mul2
->getSrc(t
)->getInsn();
546 if (insn
->op
== OP_MUL
&& insn
->dType
== TYPE_F32
)
550 ImmediateValue
*imm
= mul1
->src
[s1
].getImmediate();
553 imm
= mul1
->src
[s1
].getImmediate();
556 bld
.setPosition(mul1
, false);
558 // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
559 ImmediateValue
imm1(mul1
->src
[s1
].getImmediate(), TYPE_F32
);
560 mul1
->src
[s1
].mod
.applyTo(imm1
);
561 mul1
->src
[s1
].mod
= Modifier(0);
562 mul1
->setSrc(s1
, bld
.loadImm(NULL
, f
* imm1
.reg
.data
.f32
));
563 mul2
->def
[0].replace(mul1
->getDef(0), false);
565 if (prog
->getTarget()->isPostMultiplySupported(OP_MUL
, f
, e
)) {
567 // d = mul c, imm -> d = mul_x_imm a, b
568 mul1
->postFactor
= e
;
569 mul2
->def
[0].replace(mul1
->getDef(0), false);
571 mul1
->src
[0].mod
= mul1
->src
[0].mod
^ Modifier(NV50_IR_MOD_NEG
);
576 if (mul2
->getDef(0)->refCount() == 1) {
578 // d = mul b, c -> d = mul_x_imm a, c
580 insn
= mul2
->getDef(0)->uses
.front()->getInsn();
585 s2
= insn
->getSrc(0) == mul1
->getDef(0) ? 0 : 1;
587 if (insn
->op
== OP_MUL
&& insn
->dType
== TYPE_F32
)
588 if (!insn
->src
[t2
].getImmediate())
590 if (mul2
&& prog
->getTarget()->isPostMultiplySupported(OP_MUL
, f
, e
)) {
591 mul2
->postFactor
= e
;
592 mul2
->setSrc(s2
, mul1
->src
[t
]);
594 mul2
->src
[s2
].mod
= mul2
->src
[s2
].mod
^ Modifier(NV50_IR_MOD_NEG
);
600 ConstantFolding::opnd(Instruction
*i
, ImmediateValue
*src
, int s
)
603 const operation op
= i
->op
;
605 ImmediateValue
imm(src
, i
->sType
);
607 i
->src
[s
].mod
.applyTo(imm
);
611 if (i
->dType
== TYPE_F32
)
612 tryCollapseChainedMULs(i
, s
, imm
);
614 if (imm
.isInteger(0)) {
616 i
->setSrc(0, i
->getSrc(s
));
619 if (imm
.isInteger(1) || imm
.isInteger(-1)) {
620 if (imm
.isNegative())
621 i
->src
[t
].mod
= i
->src
[t
].mod
^ Modifier(NV50_IR_MOD_NEG
);
622 i
->op
= i
->src
[t
].mod
.getOp();
624 i
->setSrc(0, i
->getSrc(1));
625 i
->src
[0].mod
= i
->src
[1].mod
;
632 if (imm
.isInteger(2) || imm
.isInteger(-2)) {
633 if (imm
.isNegative())
634 i
->src
[t
].mod
= i
->src
[t
].mod
^ Modifier(NV50_IR_MOD_NEG
);
636 i
->setSrc(s
, i
->getSrc(t
));
637 i
->src
[s
].mod
= i
->src
[t
].mod
;
639 if (!isFloatType(i
->sType
) && !imm
.isNegative() && imm
.isPow2()) {
642 i
->setSrc(1, new_ImmediateValue(prog
, imm
.reg
.data
.u32
));
646 if (imm
.isInteger(0)) {
648 i
->setSrc(0, i
->getSrc(1));
649 i
->src
[0].mod
= i
->src
[1].mod
;
652 i
->op
= i
->src
[0].mod
.getOp();
654 i
->src
[0].mod
= Modifier(0);
659 if (s
!= 1 || (i
->dType
!= TYPE_S32
&& i
->dType
!= TYPE_U32
))
661 bld
.setPosition(i
, false);
662 if (imm
.reg
.data
.u32
== 0) {
665 if (imm
.reg
.data
.u32
== 1) {
669 if (i
->dType
== TYPE_U32
&& imm
.isPow2()) {
671 i
->setSrc(1, bld
.mkImm(util_logbase2(imm
.reg
.data
.u32
)));
673 if (i
->dType
== TYPE_U32
) {
676 const uint32_t d
= imm
.reg
.data
.u32
;
679 uint32_t l
= util_logbase2(d
);
680 if (((uint32_t)1 << l
) < d
)
682 m
= (((uint64_t)1 << 32) * (((uint64_t)1 << l
) - d
)) / d
+ 1;
688 mul
= bld
.mkOp2(OP_MUL
, TYPE_U32
, tA
, i
->getSrc(0),
689 bld
.loadImm(NULL
, m
));
690 mul
->subOp
= NV50_IR_SUBOP_MUL_HIGH
;
691 bld
.mkOp2(OP_SUB
, TYPE_U32
, tB
, i
->getSrc(0), tA
);
694 bld
.mkOp2(OP_SHR
, TYPE_U32
, tA
, tB
, bld
.mkImm(r
));
697 tB
= s
? bld
.getSSA() : i
->getDef(0);
698 bld
.mkOp2(OP_ADD
, TYPE_U32
, tB
, mul
->getDef(0), tA
);
700 bld
.mkOp2(OP_SHR
, TYPE_U32
, i
->getDef(0), tB
, bld
.mkImm(s
));
702 delete_Instruction(prog
, i
);
704 if (imm
.reg
.data
.s32
== -1) {
710 const int32_t d
= imm
.reg
.data
.s32
;
712 int32_t l
= util_logbase2(static_cast<unsigned>(abs(d
)));
713 if ((1 << l
) < abs(d
))
717 m
= ((uint64_t)1 << (32 + l
- 1)) / abs(d
) + 1 - ((uint64_t)1 << 32);
721 bld
.mkOp3(OP_MAD
, TYPE_S32
, tA
, i
->getSrc(0), bld
.loadImm(NULL
, m
),
722 i
->getSrc(0))->subOp
= NV50_IR_SUBOP_MUL_HIGH
;
724 bld
.mkOp2(OP_SHR
, TYPE_S32
, tB
, tA
, bld
.mkImm(l
- 1));
728 bld
.mkCmp(OP_SET
, CC_LT
, TYPE_S32
, tA
, i
->getSrc(0), bld
.mkImm(0));
729 tD
= (d
< 0) ? bld
.getSSA() : i
->getDef(0)->asLValue();
730 bld
.mkOp2(OP_SUB
, TYPE_U32
, tD
, tB
, tA
);
732 bld
.mkOp1(OP_NEG
, TYPE_S32
, i
->getDef(0), tB
);
734 delete_Instruction(prog
, i
);
739 if (i
->sType
== TYPE_U32
&& imm
.isPow2()) {
740 bld
.setPosition(i
, false);
742 i
->setSrc(1, bld
.loadImm(NULL
, imm
.reg
.data
.u32
- 1));
746 case OP_SET
: // TODO: SET_AND,OR,XOR
748 CmpInstruction
*si
= findOriginForTestWithZero(i
->getSrc(t
));
750 if (i
->src
[t
].mod
!= Modifier(0))
752 if (imm
.reg
.data
.u32
!= 0 || !si
|| si
->op
!= OP_SET
)
755 ccZ
= (CondCode
)((unsigned int)i
->asCmp()->setCond
& ~CC_U
);
757 ccZ
= reverseCondCode(ccZ
);
759 case CC_LT
: cc
= CC_FL
; break;
760 case CC_GE
: cc
= CC_TR
; break;
761 case CC_EQ
: cc
= inverseCondCode(cc
); break;
762 case CC_LE
: cc
= inverseCondCode(cc
); break;
768 i
->asCmp()->setCond
= cc
;
769 i
->setSrc(0, si
->src
[0]);
770 i
->setSrc(1, si
->src
[1]);
771 i
->sType
= si
->sType
;
777 if (s
!= 1 || i
->src
[0].mod
!= Modifier(0))
779 // try to concatenate shifts
780 Instruction
*si
= i
->getSrc(0)->getInsn();
782 si
->op
!= OP_SHL
|| si
->src
[1].mod
!= Modifier(0))
784 ImmediateValue
*siImm
= si
->src
[1].getImmediate();
786 bld
.setPosition(i
, false);
787 i
->setSrc(0, si
->getSrc(0));
788 i
->setSrc(1, bld
.loadImm(NULL
,
789 imm
.reg
.data
.u32
+ siImm
->reg
.data
.u32
));
814 // =============================================================================
816 // Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
817 class ModifierFolding
: public Pass
820 virtual bool visit(BasicBlock
*);
824 ModifierFolding::visit(BasicBlock
*bb
)
826 const Target
*target
= prog
->getTarget();
828 Instruction
*i
, *next
, *mi
;
831 for (i
= bb
->getEntry(); i
; i
= next
) {
834 if (0 && i
->op
== OP_SUB
) {
835 // turn "sub" into "add neg" (do we really want this ?)
837 i
->src
[0].mod
= i
->src
[0].mod
^ Modifier(NV50_IR_MOD_NEG
);
840 for (int s
= 0; s
< 3 && i
->srcExists(s
); ++s
) {
841 mi
= i
->getSrc(s
)->getInsn();
843 mi
->predSrc
>= 0 || mi
->getDef(0)->refCount() > 8)
845 if (i
->sType
== TYPE_U32
&& mi
->dType
== TYPE_S32
) {
846 if ((i
->op
!= OP_ADD
&&
852 if (i
->sType
!= mi
->dType
) {
855 if ((mod
= Modifier(mi
->op
)) == Modifier(0))
857 mod
= mod
* mi
->src
[0].mod
;
859 if ((i
->op
== OP_ABS
) || i
->src
[s
].mod
.abs()) {
860 // abs neg [abs] = abs
861 mod
= mod
& Modifier(~(NV50_IR_MOD_NEG
| NV50_IR_MOD_ABS
));
863 if ((i
->op
== OP_NEG
) && mod
.neg()) {
865 // neg as both opcode and modifier on same insn is prohibited
866 // neg neg abs = abs, neg neg = identity
867 mod
= mod
& Modifier(~NV50_IR_MOD_NEG
);
869 mod
= mod
& Modifier(~NV50_IR_MOD_ABS
);
870 if (mod
== Modifier(0))
874 if (target
->isModSupported(i
, s
, mod
)) {
875 i
->setSrc(s
, mi
->getSrc(0));
876 i
->src
[s
].mod
= i
->src
[s
].mod
* mod
;
880 if (i
->op
== OP_SAT
) {
881 mi
= i
->getSrc(0)->getInsn();
883 mi
->getDef(0)->refCount() <= 1 && target
->isSatSupported(mi
)) {
885 mi
->setDef(0, i
->getDef(0));
886 delete_Instruction(prog
, i
);
894 // =============================================================================
896 // MUL + ADD -> MAD/FMA
897 // MIN/MAX(a, a) -> a, etc.
898 // SLCT(a, b, const) -> cc(const) ? a : b
900 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
901 class AlgebraicOpt
: public Pass
904 virtual bool visit(BasicBlock
*);
906 void handleADD(Instruction
*);
907 void handleMINMAX(Instruction
*);
908 void handleRCP(Instruction
*);
909 void handleSLCT(Instruction
*);
910 void handleLOGOP(Instruction
*);
911 void handleCVT(Instruction
*);
915 AlgebraicOpt::handleADD(Instruction
*add
)
917 Value
*src0
= add
->getSrc(0);
918 Value
*src1
= add
->getSrc(1);
923 if (!prog
->getTarget()->isOpSupported(OP_MAD
, add
->dType
))
926 if (src0
->reg
.file
!= FILE_GPR
|| src1
->reg
.file
!= FILE_GPR
)
929 if (src0
->refCount() == 1 &&
930 src0
->getUniqueInsn() && src0
->getUniqueInsn()->op
== OP_MUL
)
933 if (src1
->refCount() == 1 &&
934 src1
->getUniqueInsn() && src1
->getUniqueInsn()->op
== OP_MUL
)
939 if ((src0
->getUniqueInsn() && src0
->getUniqueInsn()->bb
!= add
->bb
) ||
940 (src1
->getUniqueInsn() && src1
->getUniqueInsn()->bb
!= add
->bb
))
943 src
= add
->getSrc(s
);
945 if (src
->getInsn()->postFactor
)
948 mod
[0] = add
->src
[0].mod
;
949 mod
[1] = add
->src
[1].mod
;
950 mod
[2] = src
->getUniqueInsn()->src
[0].mod
;
951 mod
[3] = src
->getUniqueInsn()->src
[1].mod
;
953 if (((mod
[0] | mod
[1]) | (mod
[2] | mod
[3])) & Modifier(~NV50_IR_MOD_NEG
))
957 add
->subOp
= src
->getInsn()->subOp
; // potentially mul-high
959 add
->setSrc(2, add
->src
[s
? 0 : 1]);
961 add
->setSrc(0, src
->getInsn()->getSrc(0));
962 add
->src
[0].mod
= mod
[2] ^ mod
[s
];
963 add
->setSrc(1, src
->getInsn()->getSrc(1));
964 add
->src
[1].mod
= mod
[3];
968 AlgebraicOpt::handleMINMAX(Instruction
*minmax
)
970 Value
*src0
= minmax
->getSrc(0);
971 Value
*src1
= minmax
->getSrc(1);
973 if (src0
!= src1
|| src0
->reg
.file
!= FILE_GPR
)
975 if (minmax
->src
[0].mod
== minmax
->src
[1].mod
) {
976 if (minmax
->src
[0].mod
) {
978 minmax
->setSrc(1, NULL
);
980 minmax
->def
[0].replace(minmax
->getSrc(0), false);
981 minmax
->bb
->remove(minmax
);
985 // min(x, -x) = -abs(x)
986 // min(x, -abs(x)) = -abs(x)
987 // min(x, abs(x)) = x
988 // max(x, -abs(x)) = x
989 // max(x, abs(x)) = abs(x)
990 // max(x, -x) = abs(x)
995 AlgebraicOpt::handleRCP(Instruction
*rcp
)
997 Instruction
*si
= rcp
->getSrc(0)->getUniqueInsn();
999 if (si
&& si
->op
== OP_RCP
) {
1000 Modifier mod
= rcp
->src
[0].mod
* si
->src
[0].mod
;
1001 rcp
->op
= mod
.getOp();
1002 rcp
->setSrc(0, si
->getSrc(0));
1007 AlgebraicOpt::handleSLCT(Instruction
*slct
)
1009 if (slct
->getSrc(2)->reg
.file
== FILE_IMMEDIATE
) {
1010 if (slct
->getSrc(2)->asImm()->compare(slct
->asCmp()->setCond
, 0.0f
))
1011 slct
->setSrc(0, slct
->getSrc(1));
1013 if (slct
->getSrc(0) != slct
->getSrc(1)) {
1017 slct
->setSrc(1, NULL
);
1018 slct
->setSrc(2, NULL
);
1022 AlgebraicOpt::handleLOGOP(Instruction
*logop
)
1024 Value
*src0
= logop
->getSrc(0);
1025 Value
*src1
= logop
->getSrc(1);
1027 if (src0
->reg
.file
!= FILE_GPR
|| src1
->reg
.file
!= FILE_GPR
)
1031 if (logop
->src
[0].mod
!= Modifier(0) ||
1032 logop
->src
[1].mod
!= Modifier(0))
1034 if (logop
->op
== OP_AND
|| logop
->op
== OP_OR
) {
1035 logop
->def
[0].replace(logop
->getSrc(0), false);
1036 delete_Instruction(prog
, logop
);
1039 // try AND(SET, SET) -> SET_AND(SET)
1040 Instruction
*set0
= src0
->getInsn();
1041 Instruction
*set1
= src1
->getInsn();
1043 if (!set0
|| set0
->fixed
|| !set1
|| set1
->fixed
)
1045 if (set1
->op
!= OP_SET
) {
1046 Instruction
*xchg
= set0
;
1049 if (set1
->op
!= OP_SET
)
1052 if (set0
->op
!= OP_SET
&&
1053 set0
->op
!= OP_SET_AND
&&
1054 set0
->op
!= OP_SET_OR
&&
1055 set0
->op
!= OP_SET_XOR
)
1057 if (set0
->getDef(0)->refCount() > 1 &&
1058 set1
->getDef(0)->refCount() > 1)
1060 if (set0
->getPredicate() || set1
->getPredicate())
1062 // check that they don't source each other
1063 for (int s
= 0; s
< 2; ++s
)
1064 if (set0
->getSrc(s
) == set1
->getDef(0) ||
1065 set1
->getSrc(s
) == set0
->getDef(0))
1068 set0
= set0
->clone(true);
1069 set1
= set1
->clone(false);
1070 logop
->bb
->insertAfter(logop
, set1
);
1071 logop
->bb
->insertAfter(logop
, set0
);
1073 set0
->dType
= TYPE_U8
;
1074 set0
->getDef(0)->reg
.file
= FILE_PREDICATE
;
1075 set0
->getDef(0)->reg
.size
= 1;
1076 set1
->setSrc(2, set0
->getDef(0));
1077 switch (logop
->op
) {
1078 case OP_AND
: set1
->op
= OP_SET_AND
; break;
1079 case OP_OR
: set1
->op
= OP_SET_OR
; break;
1080 case OP_XOR
: set1
->op
= OP_SET_XOR
; break;
1085 set1
->setDef(0, logop
->getDef(0));
1086 delete_Instruction(prog
, logop
);
1090 // F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
1092 AlgebraicOpt::handleCVT(Instruction
*cvt
)
1094 if (cvt
->sType
!= TYPE_F32
||
1095 cvt
->dType
!= TYPE_S32
|| cvt
->src
[0].mod
!= Modifier(0))
1097 Instruction
*insn
= cvt
->getSrc(0)->getInsn();
1098 if (!insn
|| insn
->op
!= OP_NEG
|| insn
->dType
!= TYPE_F32
)
1100 if (insn
->src
[0].mod
!= Modifier(0))
1102 insn
= insn
->getSrc(0)->getInsn();
1103 if (!insn
|| insn
->op
!= OP_SET
|| insn
->dType
!= TYPE_F32
)
1106 Instruction
*bset
= insn
->clone(false);
1107 bset
->dType
= TYPE_U32
;
1108 bset
->setDef(0, cvt
->getDef(0));
1109 cvt
->bb
->insertAfter(cvt
, bset
);
1110 delete_Instruction(prog
, cvt
);
1114 AlgebraicOpt::visit(BasicBlock
*bb
)
1117 for (Instruction
*i
= bb
->getEntry(); i
; i
= next
) {
1149 // =============================================================================
1152 updateLdStOffset(Instruction
*ldst
, int32_t offset
, Function
*fn
)
1154 if (offset
!= ldst
->getSrc(0)->reg
.data
.offset
) {
1155 if (ldst
->getSrc(0)->refCount() > 1)
1156 ldst
->setSrc(0, ldst
->getSrc(0)->clone(fn
));
1157 ldst
->getSrc(0)->reg
.data
.offset
= offset
;
1161 // Combine loads and stores, forward stores to loads where possible.
1162 class MemoryOpt
: public Pass
1170 const Value
*rel
[2];
1178 bool overlaps(const Instruction
*ldst
) const;
1180 inline void link(Record
**);
1181 inline void unlink(Record
**);
1182 inline void set(const Instruction
*ldst
);
1188 Record
*loads
[DATA_FILE_COUNT
];
1189 Record
*stores
[DATA_FILE_COUNT
];
1191 MemoryPool recordPool
;
1194 virtual bool visit(BasicBlock
*);
1195 bool runOpt(BasicBlock
*);
1197 Record
**getList(const Instruction
*);
1199 Record
*findRecord(const Instruction
*, bool load
, bool& isAdjacent
) const;
1201 // merge @insn into load/store instruction from @rec
1202 bool combineLd(Record
*rec
, Instruction
*ld
);
1203 bool combineSt(Record
*rec
, Instruction
*st
);
1205 bool replaceLdFromLd(Instruction
*ld
, Record
*ldRec
);
1206 bool replaceLdFromSt(Instruction
*ld
, Record
*stRec
);
1207 bool replaceStFromSt(Instruction
*restrict st
, Record
*stRec
);
1209 void addRecord(Instruction
*ldst
);
1210 void purgeRecords(Instruction
*const st
, DataFile
);
1211 void lockStores(Instruction
*const ld
);
1218 MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record
), 6)
1220 for (int i
= 0; i
< DATA_FILE_COUNT
; ++i
) {
1230 for (unsigned int i
= 0; i
< DATA_FILE_COUNT
; ++i
) {
1232 for (it
= loads
[i
]; it
; it
= next
) {
1234 recordPool
.release(it
);
1237 for (it
= stores
[i
]; it
; it
= next
) {
1239 recordPool
.release(it
);
1246 MemoryOpt::combineLd(Record
*rec
, Instruction
*ld
)
1248 int32_t offRc
= rec
->offset
;
1249 int32_t offLd
= ld
->getSrc(0)->reg
.data
.offset
;
1250 int sizeRc
= rec
->size
;
1251 int sizeLd
= typeSizeof(ld
->dType
);
1252 int size
= sizeRc
+ sizeLd
;
1255 if (!prog
->getTarget()->
1256 isAccessSupported(ld
->getSrc(0)->reg
.file
, typeOfSize(size
)))
1258 // no unaligned loads
1259 if (((size
== 0x8) && (MIN2(offLd
, offRc
) & 0x7)) ||
1260 ((size
== 0xc) && (MIN2(offLd
, offRc
) & 0xf)))
1263 assert(sizeRc
+ sizeLd
<= 16 && offRc
!= offLd
);
1265 for (j
= 0; sizeRc
; sizeRc
-= rec
->insn
->getDef(j
)->reg
.size
, ++j
);
1267 if (offLd
< offRc
) {
1269 for (sz
= 0, d
= 0; sz
< sizeLd
; sz
+= ld
->getDef(d
)->reg
.size
, ++d
);
1270 // d: nr of definitions in ld
1271 // j: nr of definitions in rec->insn, move:
1272 for (d
= d
+ j
- 1; j
> 0; --j
, --d
)
1273 rec
->insn
->setDef(d
, rec
->insn
->getDef(j
- 1));
1275 if (rec
->insn
->getSrc(0)->refCount() > 1)
1276 rec
->insn
->setSrc(0, rec
->insn
->getSrc(0)->clone(func
));
1277 rec
->offset
= rec
->insn
->getSrc(0)->reg
.data
.offset
= offLd
;
1283 // move definitions of @ld to @rec->insn
1284 for (j
= 0; sizeLd
; ++j
, ++d
) {
1285 sizeLd
-= ld
->getDef(j
)->reg
.size
;
1286 rec
->insn
->setDef(d
, ld
->getDef(j
));
1290 rec
->insn
->setType(typeOfSize(size
));
1292 delete_Instruction(prog
, ld
);
1298 MemoryOpt::combineSt(Record
*rec
, Instruction
*st
)
1300 int32_t offRc
= rec
->offset
;
1301 int32_t offSt
= st
->getSrc(0)->reg
.data
.offset
;
1302 int sizeRc
= rec
->size
;
1303 int sizeSt
= typeSizeof(st
->dType
);
1305 int size
= sizeRc
+ sizeSt
;
1307 Value
*src
[4]; // no modifiers in ValueRef allowed for st
1310 if (!prog
->getTarget()->
1311 isAccessSupported(st
->getSrc(0)->reg
.file
, typeOfSize(size
)))
1313 if (size
== 8 && MIN2(offRc
, offSt
) & 0x7)
1316 st
->takeExtraSources(0, extra
); // save predicate and indirect address
1318 if (offRc
< offSt
) {
1319 // save values from @st
1320 for (s
= 0; sizeSt
; ++s
) {
1321 sizeSt
-= st
->getSrc(s
+ 1)->reg
.size
;
1322 src
[s
] = st
->getSrc(s
+ 1);
1324 // set record's values as low sources of @st
1325 for (j
= 1; sizeRc
; ++j
) {
1326 sizeRc
-= st
->getSrc(j
)->reg
.size
;
1327 st
->setSrc(j
, rec
->insn
->getSrc(j
));
1329 // set saved values as high sources of @st
1330 for (k
= j
, j
= 0; j
< s
; ++j
)
1331 st
->setSrc(k
++, src
[j
]);
1333 updateLdStOffset(st
, offRc
, func
);
1335 for (j
= 1; sizeSt
; ++j
)
1336 sizeSt
-= st
->getSrc(j
)->reg
.size
;
1337 for (s
= 1; sizeRc
; ++j
, ++s
) {
1338 sizeRc
-= rec
->insn
->getSrc(s
)->reg
.size
;
1339 st
->setSrc(j
, rec
->insn
->getSrc(s
));
1341 rec
->offset
= offSt
;
1343 st
->putExtraSources(0, extra
); // restore pointer and predicate
1345 delete_Instruction(prog
, rec
->insn
);
1348 rec
->insn
->setType(typeOfSize(size
));
1353 MemoryOpt::Record::set(const Instruction
*ldst
)
1355 const Symbol
*mem
= ldst
->getSrc(0)->asSym();
1356 fileIndex
= mem
->reg
.fileIndex
;
1357 rel
[0] = ldst
->getIndirect(0, 0);
1358 rel
[1] = ldst
->getIndirect(0, 1);
1359 offset
= mem
->reg
.data
.offset
;
1360 base
= mem
->getBase();
1361 size
= typeSizeof(ldst
->sType
);
1365 MemoryOpt::Record::link(Record
**list
)
1375 MemoryOpt::Record::unlink(Record
**list
)
1385 MemoryOpt::Record
**
1386 MemoryOpt::getList(const Instruction
*insn
)
1388 if (insn
->op
== OP_LOAD
|| insn
->op
== OP_VFETCH
)
1389 return &loads
[insn
->src
[0].getFile()];
1390 return &stores
[insn
->src
[0].getFile()];
1394 MemoryOpt::addRecord(Instruction
*i
)
1396 Record
**list
= getList(i
);
1397 Record
*it
= reinterpret_cast<Record
*>(recordPool
.allocate());
1406 MemoryOpt::findRecord(const Instruction
*insn
, bool load
, bool& isAdj
) const
1408 const Symbol
*sym
= insn
->getSrc(0)->asSym();
1409 const int size
= typeSizeof(insn
->sType
);
1411 Record
*it
= load
? loads
[sym
->reg
.file
] : stores
[sym
->reg
.file
];
1413 for (; it
; it
= it
->next
) {
1414 if (it
->locked
&& insn
->op
!= OP_LOAD
)
1416 if ((it
->offset
>> 4) != (sym
->reg
.data
.offset
>> 4) ||
1417 it
->rel
[0] != insn
->getIndirect(0, 0) ||
1418 it
->fileIndex
!= sym
->reg
.fileIndex
||
1419 it
->rel
[1] != insn
->getIndirect(0, 1))
1422 if (it
->offset
< sym
->reg
.data
.offset
) {
1423 if (it
->offset
+ it
->size
>= sym
->reg
.data
.offset
) {
1424 isAdj
= (it
->offset
+ it
->size
== sym
->reg
.data
.offset
);
1427 if (!(it
->offset
& 0x7))
1431 isAdj
= it
->offset
!= sym
->reg
.data
.offset
;
1432 if (size
<= it
->size
&& !isAdj
)
1435 if (!(sym
->reg
.data
.offset
& 0x7))
1436 if (it
->offset
- size
<= sym
->reg
.data
.offset
)
1444 MemoryOpt::replaceLdFromSt(Instruction
*ld
, Record
*rec
)
1446 Instruction
*st
= rec
->insn
;
1447 int32_t offSt
= rec
->offset
;
1448 int32_t offLd
= ld
->getSrc(0)->reg
.data
.offset
;
1451 for (s
= 1; offSt
!= offLd
&& st
->srcExists(s
); ++s
)
1452 offSt
+= st
->getSrc(s
)->reg
.size
;
1456 for (d
= 0; ld
->defExists(d
) && st
->srcExists(s
); ++d
, ++s
) {
1457 if (ld
->getDef(d
)->reg
.size
!= st
->getSrc(s
)->reg
.size
)
1459 if (st
->getSrc(s
)->reg
.file
!= FILE_GPR
)
1461 ld
->def
[d
].replace(st
->getSrc(s
), false);
1468 MemoryOpt::replaceLdFromLd(Instruction
*ldE
, Record
*rec
)
1470 Instruction
*ldR
= rec
->insn
;
1471 int32_t offR
= rec
->offset
;
1472 int32_t offE
= ldE
->getSrc(0)->reg
.data
.offset
;
1475 assert(offR
<= offE
);
1476 for (dR
= 0; offR
< offE
&& ldR
->defExists(dR
); ++dR
)
1477 offR
+= ldR
->getDef(dR
)->reg
.size
;
1481 for (dE
= 0; ldE
->defExists(dE
) && ldR
->defExists(dR
); ++dE
, ++dR
) {
1482 if (ldE
->getDef(dE
)->reg
.size
!= ldR
->getDef(dR
)->reg
.size
)
1484 ldE
->def
[dE
].replace(ldR
->getDef(dR
), false);
1487 delete_Instruction(prog
, ldE
);
1492 MemoryOpt::replaceStFromSt(Instruction
*restrict st
, Record
*rec
)
1494 const Instruction
*const ri
= rec
->insn
;
1497 int32_t offS
= st
->getSrc(0)->reg
.data
.offset
;
1498 int32_t offR
= rec
->offset
;
1499 int32_t endS
= offS
+ typeSizeof(st
->dType
);
1500 int32_t endR
= offR
+ typeSizeof(ri
->dType
);
1502 rec
->size
= MAX2(endS
, endR
) - MIN2(offS
, offR
);
1504 st
->takeExtraSources(0, extra
);
1510 // get non-replaced sources of ri
1511 for (s
= 1; offR
< offS
; offR
+= ri
->getSrc(s
)->reg
.size
, ++s
)
1512 vals
[k
++] = ri
->getSrc(s
);
1514 // get replaced sources of st
1515 for (s
= 1; st
->srcExists(s
); offS
+= st
->getSrc(s
)->reg
.size
, ++s
)
1516 vals
[k
++] = st
->getSrc(s
);
1517 // skip replaced sources of ri
1518 for (s
= n
; offR
< endS
; offR
+= ri
->getSrc(s
)->reg
.size
, ++s
);
1519 // get non-replaced sources after values covered by st
1520 for (; offR
< endR
; offR
+= ri
->getSrc(s
)->reg
.size
, ++s
)
1521 vals
[k
++] = ri
->getSrc(s
);
1522 assert(k
<= Elements(vals
));
1523 for (s
= 0; s
< k
; ++s
)
1524 st
->setSrc(s
+ 1, vals
[s
]);
1525 st
->setSrc(0, ri
->getSrc(0));
1529 for (j
= 1; offR
< endS
; offR
+= ri
->getSrc(j
++)->reg
.size
);
1530 for (s
= 1; offS
< endS
; offS
+= st
->getSrc(s
++)->reg
.size
);
1531 for (; offR
< endR
; offR
+= ri
->getSrc(j
++)->reg
.size
)
1532 st
->setSrc(s
++, ri
->getSrc(j
));
1534 st
->putExtraSources(0, extra
);
1536 delete_Instruction(prog
, rec
->insn
);
1539 rec
->offset
= st
->getSrc(0)->reg
.data
.offset
;
1541 st
->setType(typeOfSize(rec
->size
));
1547 MemoryOpt::Record::overlaps(const Instruction
*ldst
) const
1552 if (this->fileIndex
!= that
.fileIndex
)
1555 if (this->rel
[0] || that
.rel
[0])
1556 return this->base
== that
.base
;
1558 (this->offset
< that
.offset
+ that
.size
) &&
1559 (this->offset
+ this->size
> that
.offset
);
1562 // We must not eliminate stores that affect the result of @ld if
1563 // we find later stores to the same location, and we may no longer
1564 // merge them with later stores.
1565 // The stored value can, however, still be used to determine the value
1566 // returned by future loads.
1568 MemoryOpt::lockStores(Instruction
*const ld
)
1570 for (Record
*r
= stores
[ld
->src
[0].getFile()]; r
; r
= r
->next
)
1571 if (!r
->locked
&& r
->overlaps(ld
))
1575 // Prior loads from the location of @st are no longer valid.
1576 // Stores to the location of @st may no longer be used to derive
1577 // the value at it nor be coalesced into later stores.
1579 MemoryOpt::purgeRecords(Instruction
*const st
, DataFile f
)
1582 f
= st
->src
[0].getFile();
1584 for (Record
*r
= loads
[f
]; r
; r
= r
->next
)
1585 if (!st
|| r
->overlaps(st
))
1586 r
->unlink(&loads
[f
]);
1588 for (Record
*r
= stores
[f
]; r
; r
= r
->next
)
1589 if (!st
|| r
->overlaps(st
))
1590 r
->unlink(&stores
[f
]);
1594 MemoryOpt::visit(BasicBlock
*bb
)
1596 bool ret
= runOpt(bb
);
1597 // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
1598 // where 96 bit memory operations are forbidden.
1605 MemoryOpt::runOpt(BasicBlock
*bb
)
1607 Instruction
*ldst
, *next
;
1609 bool isAdjacent
= true;
1611 for (ldst
= bb
->getEntry(); ldst
; ldst
= next
) {
1616 if (ldst
->op
== OP_LOAD
|| ldst
->op
== OP_VFETCH
) {
1617 if (ldst
->isDead()) {
1618 // might have been produced by earlier optimization
1619 delete_Instruction(prog
, ldst
);
1623 if (ldst
->op
== OP_STORE
|| ldst
->op
== OP_EXPORT
) {
1626 // TODO: maybe have all fixed ops act as barrier ?
1627 if (ldst
->op
== OP_CALL
) {
1628 purgeRecords(NULL
, FILE_MEMORY_LOCAL
);
1629 purgeRecords(NULL
, FILE_MEMORY_GLOBAL
);
1630 purgeRecords(NULL
, FILE_MEMORY_SHARED
);
1631 purgeRecords(NULL
, FILE_SHADER_OUTPUT
);
1633 if (ldst
->op
== OP_EMIT
|| ldst
->op
== OP_RESTART
) {
1634 purgeRecords(NULL
, FILE_SHADER_OUTPUT
);
1638 if (ldst
->getPredicate()) // TODO: handle predicated ld/st
1642 DataFile file
= ldst
->src
[0].getFile();
1644 // if ld l[]/g[] look for previous store to eliminate the reload
1645 if (file
== FILE_MEMORY_GLOBAL
|| file
== FILE_MEMORY_LOCAL
) {
1646 // TODO: shared memory ?
1647 rec
= findRecord(ldst
, false, isAdjacent
);
1648 if (rec
&& !isAdjacent
)
1649 keep
= !replaceLdFromSt(ldst
, rec
);
1652 // or look for ld from the same location and replace this one
1653 rec
= keep
? findRecord(ldst
, true, isAdjacent
) : NULL
;
1656 keep
= !replaceLdFromLd(ldst
, rec
);
1658 // or combine a previous load with this one
1659 keep
= !combineLd(rec
, ldst
);
1664 rec
= findRecord(ldst
, false, isAdjacent
);
1667 keep
= !replaceStFromSt(ldst
, rec
);
1669 keep
= !combineSt(rec
, ldst
);
1672 purgeRecords(ldst
, DATA_FILE_COUNT
);
1682 // =============================================================================
1684 // Turn control flow into predicated instructions (after register allocation !).
1686 // Could move this to before register allocation on NVC0 and also handle nested
1688 class FlatteningPass
: public Pass
1691 virtual bool visit(BasicBlock
*);
1693 bool tryPredicateConditional(BasicBlock
*);
1694 void predicateInstructions(BasicBlock
*, Value
*pred
, CondCode cc
);
1695 void tryPropagateBranch(BasicBlock
*);
1696 inline bool isConstantCondition(Value
*pred
);
1697 inline bool mayPredicate(const Instruction
*, const Value
*pred
) const;
1698 inline void removeFlow(Instruction
*);
1702 FlatteningPass::isConstantCondition(Value
*pred
)
1704 Instruction
*insn
= pred
->getUniqueInsn();
1706 if (insn
->op
!= OP_SET
|| insn
->srcExists(2))
1709 for (int s
= 0; s
< 2 && insn
->srcExists(s
); ++s
) {
1710 Instruction
*ld
= insn
->getSrc(s
)->getUniqueInsn();
1713 if (ld
->op
!= OP_MOV
&& ld
->op
!= OP_LOAD
)
1715 if (ld
->src
[0].isIndirect(0))
1717 file
= ld
->src
[0].getFile();
1719 file
= insn
->src
[s
].getFile();
1720 // catch $r63 on NVC0
1721 if (file
== FILE_GPR
&& insn
->getSrc(s
)->reg
.data
.id
> prog
->maxGPR
)
1722 file
= FILE_IMMEDIATE
;
1724 if (file
!= FILE_IMMEDIATE
&& file
!= FILE_MEMORY_CONST
)
1731 FlatteningPass::removeFlow(Instruction
*insn
)
1733 FlowInstruction
*term
= insn
? insn
->asFlow() : NULL
;
1736 Graph::Edge::Type ty
= term
->bb
->cfg
.outgoing().getType();
1738 if (term
->op
== OP_BRA
) {
1739 // TODO: this might get more difficult when we get arbitrary BRAs
1740 if (ty
== Graph::Edge::CROSS
|| ty
== Graph::Edge::BACK
)
1743 if (term
->op
!= OP_JOIN
)
1746 delete_Instruction(prog
, term
);
1748 Value
*pred
= term
->getPredicate();
1750 if (pred
&& pred
->refCount() == 0) {
1751 Instruction
*pSet
= pred
->getUniqueInsn();
1752 pred
->join
->reg
.data
.id
= -1; // deallocate
1754 delete_Instruction(prog
, pSet
);
1759 FlatteningPass::predicateInstructions(BasicBlock
*bb
, Value
*pred
, CondCode cc
)
1761 for (Instruction
*i
= bb
->getEntry(); i
; i
= i
->next
) {
1764 assert(!i
->getPredicate());
1765 i
->setPredicate(cc
, pred
);
1767 removeFlow(bb
->getExit());
1771 FlatteningPass::mayPredicate(const Instruction
*insn
, const Value
*pred
) const
1773 if (insn
->isPseudo())
1775 // TODO: calls where we don't know which registers are modified
1777 if (!prog
->getTarget()->mayPredicate(insn
, pred
))
1779 for (int d
= 0; insn
->defExists(d
); ++d
)
1780 if (insn
->getDef(d
)->equals(pred
))
1785 // If we conditionally skip over or to a branch instruction, replace it.
1786 // NOTE: We do not update the CFG anymore here !
1788 FlatteningPass::tryPropagateBranch(BasicBlock
*bb
)
1790 BasicBlock
*bf
= NULL
;
1793 if (bb
->cfg
.outgoingCount() != 2)
1795 if (!bb
->getExit() || bb
->getExit()->op
!= OP_BRA
)
1797 Graph::EdgeIterator ei
= bb
->cfg
.outgoing();
1799 for (i
= 0; !ei
.end(); ++i
, ei
.next()) {
1800 bf
= BasicBlock::get(ei
.getNode());
1801 if (bf
->getInsnCount() == 1)
1804 if (ei
.end() || !bf
->getExit())
1806 FlowInstruction
*bra
= bb
->getExit()->asFlow();
1807 FlowInstruction
*rep
= bf
->getExit()->asFlow();
1809 if (rep
->getPredicate())
1811 if (rep
->op
!= OP_BRA
&&
1812 rep
->op
!= OP_JOIN
&&
1817 bra
->target
.bb
= rep
->target
.bb
;
1818 if (i
) // 2nd out block means branch not taken
1819 bra
->cc
= inverseCondCode(bra
->cc
);
1824 FlatteningPass::visit(BasicBlock
*bb
)
1826 if (tryPredicateConditional(bb
))
1829 // try to attach join to previous instruction
1830 Instruction
*insn
= bb
->getExit();
1831 if (insn
&& insn
->op
== OP_JOIN
&& !insn
->getPredicate()) {
1833 if (insn
&& !insn
->getPredicate() && !insn
->asFlow() && !insn
->isNop()) {
1835 bb
->remove(bb
->getExit());
1840 tryPropagateBranch(bb
);
1846 FlatteningPass::tryPredicateConditional(BasicBlock
*bb
)
1848 BasicBlock
*bL
= NULL
, *bR
= NULL
;
1849 unsigned int nL
= 0, nR
= 0, limit
= 12;
1853 mask
= bb
->initiatesSimpleConditional();
1857 assert(bb
->getExit());
1858 Value
*pred
= bb
->getExit()->getPredicate();
1861 if (isConstantCondition(pred
))
1864 Graph::EdgeIterator ei
= bb
->cfg
.outgoing();
1867 bL
= BasicBlock::get(ei
.getNode());
1868 for (insn
= bL
->getEntry(); insn
; insn
= insn
->next
, ++nL
)
1869 if (!mayPredicate(insn
, pred
))
1872 return false; // too long, do a real branch
1877 bR
= BasicBlock::get(ei
.getNode());
1878 for (insn
= bR
->getEntry(); insn
; insn
= insn
->next
, ++nR
)
1879 if (!mayPredicate(insn
, pred
))
1882 return false; // too long, do a real branch
1886 predicateInstructions(bL
, pred
, bb
->getExit()->cc
);
1888 predicateInstructions(bR
, pred
, inverseCondCode(bb
->getExit()->cc
));
1891 bb
->remove(bb
->joinAt
);
1894 removeFlow(bb
->getExit()); // delete the branch/join at the fork point
1896 // remove potential join operations at the end of the conditional
1897 if (prog
->getTarget()->joinAnterior
) {
1898 bb
= BasicBlock::get((bL
? bL
: bR
)->cfg
.outgoing().getNode());
1899 if (bb
->getEntry() && bb
->getEntry()->op
== OP_JOIN
)
1900 removeFlow(bb
->getEntry());
1906 // =============================================================================
1908 // Common subexpression elimination. Stupid O^2 implementation.
1909 class LocalCSE
: public Pass
1912 virtual bool visit(BasicBlock
*);
1914 inline bool tryReplace(Instruction
**, Instruction
*);
1916 DLList ops
[OP_LAST
+ 1];
1919 class GlobalCSE
: public Pass
1922 virtual bool visit(BasicBlock
*);
1926 Instruction::isActionEqual(const Instruction
*that
) const
1928 if (this->op
!= that
->op
||
1929 this->dType
!= that
->dType
||
1930 this->sType
!= that
->sType
)
1932 if (this->cc
!= that
->cc
)
1935 if (this->asTex()) {
1936 if (memcmp(&this->asTex()->tex
,
1937 &that
->asTex()->tex
,
1938 sizeof(this->asTex()->tex
)))
1941 if (this->asCmp()) {
1942 if (this->asCmp()->setCond
!= that
->asCmp()->setCond
)
1945 if (this->asFlow()) {
1948 if (this->atomic
!= that
->atomic
||
1949 this->ipa
!= that
->ipa
||
1950 this->lanes
!= that
->lanes
||
1951 this->perPatch
!= that
->perPatch
)
1953 if (this->postFactor
!= that
->postFactor
)
1957 if (this->subOp
!= that
->subOp
||
1958 this->saturate
!= that
->saturate
||
1959 this->rnd
!= that
->rnd
||
1960 this->ftz
!= that
->ftz
||
1961 this->dnz
!= that
->dnz
||
1962 this->cache
!= that
->cache
)
1969 Instruction::isResultEqual(const Instruction
*that
) const
1973 // NOTE: location of discard only affects tex with liveOnly and quadops
1974 if (!this->defExists(0) && this->op
!= OP_DISCARD
)
1977 if (!isActionEqual(that
))
1980 if (this->predSrc
!= that
->predSrc
)
1983 for (d
= 0; this->defExists(d
); ++d
) {
1984 if (!that
->defExists(d
) ||
1985 !this->getDef(d
)->equals(that
->getDef(d
), false))
1988 if (that
->defExists(d
))
1991 for (s
= 0; this->srcExists(s
); ++s
) {
1992 if (!that
->srcExists(s
))
1994 if (this->src
[s
].mod
!= that
->src
[s
].mod
)
1996 if (!this->getSrc(s
)->equals(that
->getSrc(s
), true))
1999 if (that
->srcExists(s
))
2002 if (op
== OP_LOAD
|| op
== OP_VFETCH
) {
2003 switch (src
[0].getFile()) {
2004 case FILE_MEMORY_CONST
:
2005 case FILE_SHADER_INPUT
:
2015 // pull through common expressions from different in-blocks
2017 GlobalCSE::visit(BasicBlock
*bb
)
2019 Instruction
*phi
, *next
, *ik
;
2022 for (phi
= bb
->getPhi(); phi
&& phi
->op
== OP_PHI
; phi
= next
) {
2024 if (phi
->getSrc(0)->refCount() > 1)
2026 ik
= phi
->getSrc(0)->getInsn();
2027 for (s
= 1; phi
->srcExists(s
); ++s
) {
2028 if (phi
->getSrc(s
)->refCount() > 1)
2030 if (!phi
->getSrc(s
)->getInsn()->isResultEqual(ik
))
2033 if (!phi
->srcExists(s
)) {
2034 Instruction
*entry
= bb
->getEntry();
2036 if (!entry
|| entry
->op
!= OP_JOIN
)
2039 bb
->insertAfter(entry
, ik
);
2040 ik
->setDef(0, phi
->getDef(0));
2041 delete_Instruction(prog
, phi
);
2049 LocalCSE::tryReplace(Instruction
**ptr
, Instruction
*i
)
2051 Instruction
*old
= *ptr
;
2052 if (!old
->isResultEqual(i
))
2054 for (int d
= 0; old
->defExists(d
); ++d
)
2055 old
->def
[d
].replace(i
->getDef(d
), false);
2056 delete_Instruction(prog
, old
);
2062 LocalCSE::visit(BasicBlock
*bb
)
2064 unsigned int replaced
;
2067 Instruction
*ir
, *next
;
2071 // will need to know the order of instructions
2073 for (ir
= bb
->getEntry(); ir
; ir
= ir
->next
)
2074 ir
->serial
= serial
++;
2076 for (ir
= bb
->getEntry(); ir
; ir
= next
) {
2083 ops
[ir
->op
].insert(ir
);
2087 for (s
= 0; ir
->srcExists(s
); ++s
)
2088 if (ir
->getSrc(s
)->asLValue())
2089 if (!src
|| ir
->getSrc(s
)->refCount() < src
->refCount())
2090 src
= ir
->getSrc(s
);
2093 for (Value::UseIterator it
= src
->uses
.begin();
2094 it
!= src
->uses
.end(); ++it
) {
2095 Instruction
*ik
= (*it
)->getInsn();
2096 if (ik
&& ik
->serial
< ir
->serial
&& ik
->bb
== ir
->bb
)
2097 if (tryReplace(&ir
, ik
))
2101 DLLIST_FOR_EACH(&ops
[ir
->op
], iter
)
2103 Instruction
*ik
= reinterpret_cast<Instruction
*>(iter
.get());
2104 if (tryReplace(&ir
, ik
))
2110 ops
[ir
->op
].insert(ir
);
2114 for (unsigned int i
= 0; i
<= OP_LAST
; ++i
)
2122 // =============================================================================
2124 // Remove computations of unused values.
2125 class DeadCodeElim
: public Pass
2128 bool buryAll(Program
*);
2131 virtual bool visit(BasicBlock
*);
2133 void checkSplitLoad(Instruction
*ld
); // for partially dead loads
2135 unsigned int deadCount
;
2139 DeadCodeElim::buryAll(Program
*prog
)
2143 if (!this->run(prog
, false, false))
2145 } while (deadCount
);
2151 DeadCodeElim::visit(BasicBlock
*bb
)
2155 for (Instruction
*i
= bb
->getFirst(); i
; i
= next
) {
2159 delete_Instruction(prog
, i
);
2161 if (i
->defExists(1) && (i
->op
== OP_VFETCH
|| i
->op
== OP_LOAD
)) {
2169 DeadCodeElim::checkSplitLoad(Instruction
*ld1
)
2171 Instruction
*ld2
= NULL
; // can get at most 2 loads
2174 int32_t addr1
, addr2
;
2175 int32_t size1
, size2
;
2177 uint32_t mask
= 0xffffffff;
2179 for (d
= 0; ld1
->defExists(d
); ++d
)
2180 if (!ld1
->getDef(d
)->refCount() && ld1
->getDef(d
)->reg
.data
.id
< 0)
2182 if (mask
== 0xffffffff)
2185 addr1
= ld1
->getSrc(0)->reg
.data
.offset
;
2188 for (d
= 0; ld1
->defExists(d
); ++d
) {
2189 if (mask
& (1 << d
)) {
2190 if (size1
&& (addr1
& 0x7))
2192 def1
[n1
] = ld1
->getDef(d
);
2193 size1
+= def1
[n1
++]->reg
.size
;
2196 addr1
+= ld1
->getDef(d
)->reg
.size
;
2201 for (addr2
= addr1
+ size1
; ld1
->defExists(d
); ++d
) {
2202 if (mask
& (1 << d
)) {
2203 def2
[n2
] = ld1
->getDef(d
);
2204 size2
+= def2
[n2
++]->reg
.size
;
2207 addr2
+= ld1
->getDef(d
)->reg
.size
;
2211 updateLdStOffset(ld1
, addr1
, func
);
2212 ld1
->setType(typeOfSize(size1
));
2213 for (d
= 0; d
< 4; ++d
)
2214 ld1
->setDef(d
, (d
< n1
) ? def1
[d
] : NULL
);
2219 ld2
= ld1
->clone(false);
2220 updateLdStOffset(ld2
, addr2
, func
);
2221 ld2
->setType(typeOfSize(size2
));
2222 for (d
= 0; d
< 4; ++d
)
2223 ld2
->setDef(d
, (d
< n2
) ? def2
[d
] : NULL
);
2225 ld1
->bb
->insertAfter(ld1
, ld2
);
2228 // =============================================================================
2230 #define RUN_PASS(l, n, f) \
2231 if (level >= (l)) { \
2232 if (dbgFlags & NV50_IR_DEBUG_VERBOSE) \
2233 INFO("PEEPHOLE: %s\n", #n); \
2235 if (!pass.f(this)) \
2240 Program::optimizeSSA(int level
)
2242 RUN_PASS(1, DeadCodeElim
, buryAll
);
2243 RUN_PASS(1, CopyPropagation
, run
);
2244 RUN_PASS(2, GlobalCSE
, run
);
2245 RUN_PASS(1, LocalCSE
, run
);
2246 RUN_PASS(2, AlgebraicOpt
, run
);
2247 RUN_PASS(2, ModifierFolding
, run
); // before load propagation -> less checks
2248 RUN_PASS(1, ConstantFolding
, foldAll
);
2249 RUN_PASS(1, LoadPropagation
, run
);
2250 RUN_PASS(2, MemoryOpt
, run
);
2251 RUN_PASS(2, LocalCSE
, run
);
2252 RUN_PASS(0, DeadCodeElim
, buryAll
);
2257 Program::optimizePostRA(int level
)
2259 RUN_PASS(2, FlatteningPass
, run
);