/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_target.h"
#include "codegen/nv50_ir_build_util.h"

#include "util/u_math.h"
bool
Instruction::isNop() const
{
   if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
      return true;

   if (terminator || join) // XXX: should terminator imply flow ?
      return false;

   if (!fixed && op == OP_NOP)
      return true;

   if (defExists(0) && def(0).rep()->reg.data.id < 0) {
      for (int d = 1; defExists(d); ++d)
         if (def(d).rep()->reg.data.id >= 0)
            WARN("part of vector result is unused !\n");
      return true;
   }

   if (op == OP_MOV || op == OP_UNION) {
      if (!getDef(0)->equals(getSrc(0)))
         return false;
      if (op == OP_UNION)
         if (!def(0).rep()->equals(getSrc(1)))
            return false;
      return true;
   }

   return false;
}
bool Instruction::isDead() const
{
   if (op == OP_STORE ||
       op == OP_EXPORT ||
       op == OP_ATOM ||
       op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB ||
       op == OP_WRSV)
      return false;

   for (int d = 0; defExists(d); ++d)
      if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
         return false;

   if (terminator || asFlow())
      return false;
   if (fixed)
      return false;

   return true;
}
// =============================================================================

class CopyPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

// Propagate all MOVs forward to make subsequent optimization easier, except if
// the sources stem from a phi, in which case we don't want to mess up potential
// swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
bool
CopyPropagation::visit(BasicBlock *bb)
{
   Instruction *mov, *si, *next;

   for (mov = bb->getEntry(); mov; mov = next) {
      next = mov->next;
      if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
         continue;
      if (mov->getPredicate())
         continue;
      if (mov->def(0).getFile() != mov->src(0).getFile())
         continue;
      si = mov->getSrc(0)->getInsn();
      if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
         // propagate
         mov->def(0).replace(mov->getSrc(0), false);
         delete_Instruction(prog, mov);
      }
   }

   return true;
}
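// Illustrative before/after (register names are hypothetical):
//
//    mov u32 %r2, %r1
//    add u32 %r3, %r2, %r4   ->   add u32 %r3, %r1, %r4
//
// The MOV itself is removed once %r2 has no remaining users.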
// =============================================================================

class MergeSplits : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

// For SPLIT / MERGE pairs that operate on the same registers, replace the
// post-merge def with the SPLIT's source.
bool
MergeSplits::visit(BasicBlock *bb)
{
   Instruction *i, *next, *si;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;
      if (i->op != OP_MERGE || typeSizeof(i->dType) != 8)
         continue;
      si = i->getSrc(0)->getInsn();
      if (si->op != OP_SPLIT || si != i->getSrc(1)->getInsn())
         continue;
      i->def(0).replace(si->getSrc(0), false);
      delete_Instruction(prog, i);
   }

   return true;
}
// =============================================================================

class LoadPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   void checkSwapSrc01(Instruction *);

   bool isCSpaceLoad(Instruction *);
   bool isImmdLoad(Instruction *);
   bool isAttribOrSharedLoad(Instruction *);
};

bool
LoadPropagation::isCSpaceLoad(Instruction *ld)
{
   return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
}

bool
LoadPropagation::isImmdLoad(Instruction *ld)
{
   if (!ld || (ld->op != OP_MOV) ||
       ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
      return false;

   // A 0 can be replaced with a register, so it doesn't count as an immediate.
   ImmediateValue val;
   return ld->src(0).getImmediate(val) && !val.isInteger(0);
}

bool
LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
{
   return ld &&
      (ld->op == OP_VFETCH ||
       (ld->op == OP_LOAD &&
        (ld->src(0).getFile() == FILE_SHADER_INPUT ||
         ld->src(0).getFile() == FILE_MEMORY_SHARED)));
}
void
LoadPropagation::checkSwapSrc01(Instruction *insn)
{
   const Target *targ = prog->getTarget();
   if (!targ->getOpInfo(insn).commutative)
      if (insn->op != OP_SET && insn->op != OP_SLCT && insn->op != OP_SUB)
         return;
   if (insn->src(1).getFile() != FILE_GPR)
      return;
   // This is the special OP_SET used for alphatesting, we can't reverse its
   // arguments as that will confuse the fixup code.
   if (insn->op == OP_SET && insn->subOp)
      return;

   Instruction *i0 = insn->getSrc(0)->getInsn();
   Instruction *i1 = insn->getSrc(1)->getInsn();

   // Swap sources to inline the less frequently used source. That way,
   // optimistically, it will eventually be able to remove the instruction.
   int i0refs = insn->getSrc(0)->refCount();
   int i1refs = insn->getSrc(1)->refCount();

   if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
      if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
          !targ->insnCanLoad(insn, 1, i1) ||
          i0refs < i1refs)
         insn->swapSources(0, 1);
      else
         return;
   } else
   if (isAttribOrSharedLoad(i1)) {
      if (!isAttribOrSharedLoad(i0))
         insn->swapSources(0, 1);
      else
         return;
   } else {
      return;
   }

   if (insn->op == OP_SET || insn->op == OP_SET_AND ||
       insn->op == OP_SET_OR || insn->op == OP_SET_XOR)
      insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SLCT)
      insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SUB) {
      insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
      insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
   }
}
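// Illustrative example (hypothetical operands): with a constant-buffer load
// feeding src0 of a SET, swapping the sources lets the load be inlined into
// source slot 1, and the condition code is reversed to preserve semantics:
//
//    set lt f32 %r2, c0[0x10], %r1   ->   set gt f32 %r2, %r1, c0[0x10]
//
// For OP_SUB, negating both source modifiers compensates for the swap instead.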
bool
LoadPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (i->op == OP_CALL) // calls have args as sources, they must be in regs
         continue;

      if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
         continue;

      if (i->srcExists(1))
         checkSwapSrc01(i);

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *ld = i->getSrc(s)->getInsn();

         if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
            continue;
         if (!targ->insnCanLoad(i, s, ld))
            continue;

         // propagate !
         i->setSrc(s, ld->getSrc(0));
         if (ld->src(0).isIndirect(0))
            i->setIndirect(s, 0, ld->getIndirect(0, 0));

         if (ld->getDef(0)->refCount() == 0)
            delete_Instruction(prog, ld);
      }
   }
   return true;
}
// =============================================================================

class IndirectPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

bool
IndirectPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *insn;
         ImmediateValue imm;
         if (!i->src(s).isIndirect(0))
            continue;
         insn = i->getIndirect(s, 0)->getInsn();
         if (!insn)
            continue;
         if (insn->op == OP_ADD && !isFloatType(insn->dType)) {
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         } else if (insn->op == OP_SUB && !isFloatType(insn->dType)) {
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, -imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset -= imm.reg.data.u32;
         } else if (insn->op == OP_MOV) {
            if (!insn->src(0).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, NULL);
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         }
      }
   }
   return true;
}
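// Illustrative example (hypothetical IR): if the address register comes from
// "add $a0 = $a1 + 0x10" and the target can encode the combined offset, the
// ADD folds away:
//
//    ld u32 %r0, c0[$a0 + 0x4]   ->   ld u32 %r0, c0[$a1 + 0x14]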
// =============================================================================

// Evaluate constant expressions.
class ConstantFolding : public Pass
{
public:
   bool foldAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void expr(Instruction *, ImmediateValue&, ImmediateValue&);
   void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
   void opnd(Instruction *, ImmediateValue&, int s);
   void opnd3(Instruction *, ImmediateValue&);

   void unary(Instruction *, const ImmediateValue&);

   void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);

   CmpInstruction *findOriginForTestWithZero(Value *);

   unsigned int foldCount;

   BuildUtil bld;
};

// TODO: remember generated immediates and only revisit these
bool
ConstantFolding::foldAll(Program *prog)
{
   unsigned int iterCount = 0;
   do {
      foldCount = 0;
      run(prog, false, false);
   } while (foldCount && ++iterCount < 2);
   return true;
}
bool
ConstantFolding::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;
      if (i->op == OP_MOV || i->op == OP_CALL)
         continue;

      ImmediateValue src0, src1, src2;

      if (i->srcExists(2) &&
          i->src(0).getImmediate(src0) &&
          i->src(1).getImmediate(src1) &&
          i->src(2).getImmediate(src2))
         expr(i, src0, src1, src2);
      else
      if (i->srcExists(1) &&
          i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
         expr(i, src0, src1);
      else
      if (i->srcExists(0) && i->src(0).getImmediate(src0))
         opnd(i, src0, 0);
      else
      if (i->srcExists(1) && i->src(1).getImmediate(src1))
         opnd(i, src1, 1);
      if (i->srcExists(2) && i->src(2).getImmediate(src2))
         opnd3(i, src2);
   }
   return true;
}
CmpInstruction *
ConstantFolding::findOriginForTestWithZero(Value *value)
{
   if (!value)
      return NULL;
   Instruction *insn = value->getInsn();
   if (!insn)
      return NULL;

   if (insn->asCmp() && insn->op != OP_SLCT)
      return insn->asCmp();

   /* Sometimes mov's will sneak in as a result of other folding. This gets
    * cleaned up later.
    */
   if (insn->op == OP_MOV)
      return findOriginForTestWithZero(insn->getSrc(0));

   /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
   if (insn->op == OP_AND) {
      int s = 0;
      ImmediateValue imm;
      if (!insn->src(s).getImmediate(imm)) {
         s = 1;
         if (!insn->src(s).getImmediate(imm))
            return NULL;
      }
      if (imm.reg.data.f32 != 1.0f)
         return NULL;
      /* TODO: Come up with a way to handle the condition being inverted */
      if (insn->src(!s).mod != Modifier(0))
         return NULL;
      return findOriginForTestWithZero(insn->getSrc(!s));
   }

   return NULL;
}
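// Illustrative chain (hypothetical IR): for "set ne u32 %r1, %r0, 0" where
// %r0 = mov(set lt f32 ...), this returns the inner SET, letting opnd() fold
// the comparison against zero directly into the origin's condition code.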
void
Modifier::applyTo(ImmediateValue& imm) const
{
   if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. b128)
      return;
   switch (imm.reg.type) {
   case TYPE_F32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f32 = fabsf(imm.reg.data.f32);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f32 = -imm.reg.data.f32;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f32 < 0.0f)
            imm.reg.data.f32 = 0.0f;
         else
         if (imm.reg.data.f32 > 1.0f)
            imm.reg.data.f32 = 1.0f;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   case TYPE_S8: // NOTE: will be extended
   case TYPE_S16:
   case TYPE_S32:
   case TYPE_U8: // NOTE: treated as signed
   case TYPE_U16:
   case TYPE_U32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
            imm.reg.data.s32 : -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.s32 = -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NOT)
         imm.reg.data.s32 = ~imm.reg.data.s32;
      break;

   case TYPE_F64:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f64 = fabs(imm.reg.data.f64);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f64 = -imm.reg.data.f64;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f64 < 0.0)
            imm.reg.data.f64 = 0.0;
         else
         if (imm.reg.data.f64 > 1.0)
            imm.reg.data.f64 = 1.0;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   default:
      assert(!"invalid/unhandled type");
      imm.reg.data.u64 = 0;
      break;
   }
}
operation
Modifier::getOp() const
{
   switch (bits) {
   case 0: return OP_MOV;
   case NV50_IR_MOD_ABS: return OP_ABS;
   case NV50_IR_MOD_NEG: return OP_NEG;
   case NV50_IR_MOD_SAT: return OP_SAT;
   case NV50_IR_MOD_NOT: return OP_NOT;
   default:
      return OP_CVT;
   }
}
void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0, ImmediateValue &imm1)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
   struct Storage res;
   DataType type = i->dType;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_MAD:
   case OP_FMA:
   case OP_MUL:
      if (i->dnz && i->dType == TYPE_F32) {
         if (!isfinite(a->data.f32))
            a->data.f32 = 0.0f;
         if (!isfinite(b->data.f32))
            b->data.f32 = 0.0f;
      }
      switch (i->dType) {
      case TYPE_F32:
         res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor);
         break;
      case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
      case TYPE_S32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.s32 = ((int64_t)a->data.s32 * b->data.s32) >> 32;
            break;
         }
         /* fallthrough */
      case TYPE_U32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32) >> 32;
            break;
         }
         res.data.u32 = a->data.u32 * b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_DIV:
      if (b->data.u32 == 0)
         break;
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
      case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_ADD:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SUB:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 - b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 - b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 - b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_POW:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
      default:
         return;
      }
      break;
   case OP_MAX:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_MIN:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_AND:
      res.data.u64 = a->data.u64 & b->data.u64;
      break;
   case OP_OR:
      res.data.u64 = a->data.u64 | b->data.u64;
      break;
   case OP_XOR:
      res.data.u64 = a->data.u64 ^ b->data.u64;
      break;
   case OP_SHL:
      res.data.u32 = a->data.u32 << b->data.u32;
      break;
   case OP_SHR:
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SLCT:
      if (a->data.u32 != b->data.u32)
         return;
      res.data.u32 = a->data.u32;
      break;
   case OP_EXTBF: {
      int offset = b->data.u32 & 0xff;
      int width = (b->data.u32 >> 8) & 0xff;
      int rshift = offset;
      int lshift = 0;
      if (width == 0) {
         res.data.u32 = 0;
         break;
      }
      if (width + offset < 32) {
         rshift = 32 - width;
         lshift = 32 - width - offset;
      }
      if (i->subOp == NV50_IR_SUBOP_EXTBF_REV)
         res.data.u32 = util_bitreverse(a->data.u32);
      else
         res.data.u32 = a->data.u32;
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = (res.data.s32 << lshift) >> rshift; break;
      case TYPE_U32: res.data.u32 = (res.data.u32 << lshift) >> rshift; break;
      default:
         return;
      }
      break;
   }
   case OP_POPCNT:
      res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
      break;
   case OP_PFETCH:
      // The two arguments to pfetch are logically added together. Normally
      // the second argument will not be constant, but that can happen.
      res.data.u32 = a->data.u32 + b->data.u32;
      break;
   case OP_MERGE:
      switch (i->dType) {
      case TYPE_U64:
      case TYPE_S64:
      case TYPE_F64:
         res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
         break;
      default:
         return;
      }
      break;
   default:
      return;
   }
   ++foldCount;

   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);
   i->postFactor = 0;

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);

   i->getSrc(0)->reg.data = res.data;
   i->getSrc(0)->reg.type = type;
   i->getSrc(0)->reg.size = typeSizeof(type);

   switch (i->op) {
   case OP_MAD:
   case OP_FMA: {
      ImmediateValue src0, src1 = *i->getSrc(0)->asImm();

      // Move the immediate into position 1, where we know it might be
      // emittable. However it might not be anyways, as there may be other
      // restrictions, so move it into a separate LValue.
      bld.setPosition(i, false);
      i->op = OP_ADD;
      i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
      i->setSrc(0, i->getSrc(2));
      i->src(0).mod = i->src(2).mod;
      i->setSrc(2, NULL);

      if (i->src(0).getImmediate(src0))
         expr(i, src0, src1);
      else
         opnd(i, src1, 1);
      break;
   }
   case OP_PFETCH:
      // Leave PFETCH alone... we just folded its 2 args into 1.
      break;
   default:
      i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */
      break;
   }
}
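// Worked example of the integer fold above (names illustrative): a mul-high
// of 0x80000000 by 4 computes ((uint64_t)0x80000000 * 4) >> 32 = 2, so the
// instruction is rewritten into a MOV of the immediate 2.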
void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0,
                      ImmediateValue &imm1,
                      ImmediateValue &imm2)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg, *const c = &imm2.reg;
   struct Storage res;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_INSBF: {
      int offset = b->data.u32 & 0xff;
      int width = (b->data.u32 >> 8) & 0xff;
      unsigned bitmask = ((1 << width) - 1) << offset;
      res.data.u32 = ((a->data.u32 << offset) & bitmask) | (c->data.u32 & ~bitmask);
      break;
   }
   case OP_MAD:
   case OP_FMA: {
      switch (i->dType) {
      case TYPE_F32:
         res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor) +
            c->data.f32;
         break;
      case TYPE_F64:
         res.data.f64 = a->data.f64 * b->data.f64 + c->data.f64;
         break;
      case TYPE_S32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.s32 = ((int64_t)a->data.s32 * b->data.s32 >> 32) + c->data.s32;
            break;
         }
         /* fallthrough */
      case TYPE_U32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32 >> 32) + c->data.u32;
            break;
         }
         res.data.u32 = a->data.u32 * b->data.u32 + c->data.u32;
         break;
      default:
         return;
      }
      break;
   }
   default:
      return;
   }

   ++foldCount;
   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);
   i->src(2).mod = Modifier(0);

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);
   i->setSrc(2, NULL);

   i->getSrc(0)->reg.data = res.data;
   i->getSrc(0)->reg.type = i->dType;
   i->getSrc(0)->reg.size = typeSizeof(i->dType);

   i->op = OP_MOV;
}
void
ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
{
   Storage res;

   if (i->dType != TYPE_F32)
      return;
   switch (i->op) {
   case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
   case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
   case OP_SAT: res.data.f32 = CLAMP(imm.reg.data.f32, 0.0f, 1.0f); break;
   case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
   case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
   case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
   case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
   case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
   case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
   case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
   case OP_PRESIN:
   case OP_PREEX2:
      // these should be handled in subsequent OP_SIN/COS/EX2
      res.data.f32 = imm.reg.data.f32;
      break;
   default:
      return;
   }
   i->op = OP_MOV;
   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
   i->src(0).mod = Modifier(0);
}
void
ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
                                        const int s, ImmediateValue& imm2)
{
   const int t = s ? 0 : 1;
   Instruction *insn;
   Instruction *mul1 = NULL; // mul1 before mul2
   int e = 0;
   float f = imm2.reg.data.f32 * exp2f(mul2->postFactor);
   ImmediateValue imm1;

   assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);

   if (mul2->getSrc(t)->refCount() == 1) {
      insn = mul2->getSrc(t)->getInsn();
      if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
         mul1 = insn;
      if (mul1 && !mul1->saturate) {
         int s1;

         if (mul1->src(s1 = 0).getImmediate(imm1) ||
             mul1->src(s1 = 1).getImmediate(imm1)) {
            bld.setPosition(mul1, false);
            // a = mul r, imm1
            // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
            mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
            mul1->src(s1).mod = Modifier(0);
            mul2->def(0).replace(mul1->getDef(0), false);
            mul1->saturate = mul2->saturate;
         } else
         if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
            // c = mul a, b
            // d = mul c, imm -> d = mul_x_imm a, b
            mul1->postFactor = e;
            mul2->def(0).replace(mul1->getDef(0), false);
            if (f < 0)
               mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
            mul1->saturate = mul2->saturate;
         }
         return;
      }
   }
   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
      // b = mul a, imm
      // d = mul b, c -> d = mul_x_imm a, c
      int s2, t2;
      insn = (*mul2->getDef(0)->uses.begin())->getInsn();
      if (!insn)
         return;
      mul1 = mul2;
      mul2 = NULL;
      s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
      t2 = s2 ? 0 : 1;
      if (insn->op == OP_MUL && insn->dType == TYPE_F32)
         if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
            mul2 = insn;
      if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
         mul2->postFactor = e;
         mul2->setSrc(s2, mul1->src(t));
         if (f < 0)
            mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
      }
   }
}
void
ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
{
   switch (i->op) {
   case OP_MAD:
   case OP_FMA:
      if (imm2.isInteger(0)) {
         i->op = OP_MUL;
         i->setSrc(2, NULL);
         foldCount++;
         return;
      }
      break;
   default:
      return;
   }
}
void
ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
{
   const Target *target = prog->getTarget();
   const int t = !s;
   const operation op = i->op;
   Instruction *newi = i;

   switch (i->op) {
   case OP_MUL:
      if (i->dType == TYPE_F32)
         tryCollapseChainedMULs(i, s, imm0);

      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
         assert(!isFloatType(i->sType));
         if (imm0.isInteger(1) && i->dType == TYPE_S32) {
            bld.setPosition(i, false);
            // Need to set to the sign value, which is a compare.
            newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0),
                             TYPE_S32, i->getSrc(t), bld.mkImm(0));
            delete_Instruction(prog, i);
         } else if (imm0.isInteger(0) || imm0.isInteger(1)) {
            // The high bits can't be set in this case (either mul by 0 or
            // unsigned by 1).
            i->op = OP_MOV;
            i->subOp = 0;
            i->setSrc(0, new_ImmediateValue(prog, 0u));
            i->src(0).mod = Modifier(0);
            i->setSrc(1, NULL);
         } else if (!imm0.isNegative() && imm0.isPow2()) {
            // Translate into a shift
            imm0.reg.data.u32 = util_logbase2(imm0.reg.data.u32);
            i->op = OP_SHR;
            i->dType = TYPE_U32;
            i->subOp = 0;
            imm0.reg.data.u32 = 32 - imm0.reg.data.u32;
            i->setSrc(0, i->getSrc(t));
            i->src(0).mod = i->src(t).mod;
            i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
            i->src(1).mod = Modifier(0);
         }
      } else
      if (imm0.isInteger(0)) {
         i->op = OP_MOV;
         i->setSrc(0, new_ImmediateValue(prog, 0u));
         i->src(0).mod = Modifier(0);
         i->setSrc(1, NULL);
      } else
      if (!i->postFactor && (imm0.isInteger(1) || imm0.isInteger(-1))) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = i->src(t).mod.getOp();
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
            i->src(1).mod = Modifier(0);
         }
         if (i->op != OP_CVT)
            i->src(0).mod = Modifier(0);
         i->setSrc(1, NULL);
      } else
      if (!i->postFactor && (imm0.isInteger(2) || imm0.isInteger(-2))) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = OP_ADD;
         i->setSrc(s, i->getSrc(t));
         i->src(s).mod = i->src(t).mod;
      } else
      if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
         i->op = OP_SHL;
         imm0.reg.data.u32 = util_logbase2(imm0.reg.data.u32);
         i->setSrc(0, i->getSrc(t));
         i->src(0).mod = i->src(t).mod;
         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
         i->src(1).mod = Modifier(0);
      } else
      if (i->postFactor && i->sType == TYPE_F32) {
         /* Can't emit a postfactor with an immediate, have to fold it in */
         i->setSrc(s, new_ImmediateValue(
                      prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
         i->postFactor = 0;
      }
      break;
   case OP_MAD:
      if (imm0.isInteger(0)) {
         i->setSrc(0, i->getSrc(2));
         i->src(0).mod = i->src(2).mod;
         i->setSrc(1, NULL);
         i->setSrc(2, NULL);
         i->op = i->src(0).mod.getOp();
         if (i->op != OP_CVT)
            i->src(0).mod = Modifier(0);
      } else
      if (i->subOp != NV50_IR_SUBOP_MUL_HIGH &&
          (imm0.isInteger(1) || imm0.isInteger(-1))) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
         }
         i->setSrc(1, i->getSrc(2));
         i->src(1).mod = i->src(2).mod;
         i->setSrc(2, NULL);
         i->op = OP_ADD;
      } else
      if (s == 1 && !imm0.isNegative() && imm0.isPow2() &&
          target->isOpSupported(i->op, i->dType)) {
         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
      }
      break;
   case OP_ADD:
      if (imm0.isInteger(0)) {
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
         }
         i->setSrc(1, NULL);
         i->op = i->src(0).mod.getOp();
         if (i->op != OP_CVT)
            i->src(0).mod = Modifier(0);
      }
      break;
!= 1 || (i
->dType
!= TYPE_S32
&& i
->dType
!= TYPE_U32
))
1046 bld
.setPosition(i
, false);
1047 if (imm0
.reg
.data
.u32
== 0) {
1050 if (imm0
.reg
.data
.u32
== 1) {
1054 if (i
->dType
== TYPE_U32
&& imm0
.isPow2()) {
1056 i
->setSrc(1, bld
.mkImm(util_logbase2(imm0
.reg
.data
.u32
)));
1058 if (i
->dType
== TYPE_U32
) {
1061 const uint32_t d
= imm0
.reg
.data
.u32
;
1064 uint32_t l
= util_logbase2(d
);
1065 if (((uint32_t)1 << l
) < d
)
1067 m
= (((uint64_t)1 << 32) * (((uint64_t)1 << l
) - d
)) / d
+ 1;
1069 s
= l
? (l
- 1) : 0;
1073 mul
= bld
.mkOp2(OP_MUL
, TYPE_U32
, tA
, i
->getSrc(0),
1074 bld
.loadImm(NULL
, m
));
1075 mul
->subOp
= NV50_IR_SUBOP_MUL_HIGH
;
1076 bld
.mkOp2(OP_SUB
, TYPE_U32
, tB
, i
->getSrc(0), tA
);
1079 bld
.mkOp2(OP_SHR
, TYPE_U32
, tA
, tB
, bld
.mkImm(r
));
1082 tB
= s
? bld
.getSSA() : i
->getDef(0);
1083 newi
= bld
.mkOp2(OP_ADD
, TYPE_U32
, tB
, mul
->getDef(0), tA
);
1085 bld
.mkOp2(OP_SHR
, TYPE_U32
, i
->getDef(0), tB
, bld
.mkImm(s
));
1087 delete_Instruction(prog
, i
);
1089 if (imm0
.reg
.data
.s32
== -1) {
1095 const int32_t d
= imm0
.reg
.data
.s32
;
1097 int32_t l
= util_logbase2(static_cast<unsigned>(abs(d
)));
1098 if ((1 << l
) < abs(d
))
1102 m
= ((uint64_t)1 << (32 + l
- 1)) / abs(d
) + 1 - ((uint64_t)1 << 32);
1106 bld
.mkOp3(OP_MAD
, TYPE_S32
, tA
, i
->getSrc(0), bld
.loadImm(NULL
, m
),
1107 i
->getSrc(0))->subOp
= NV50_IR_SUBOP_MUL_HIGH
;
1109 bld
.mkOp2(OP_SHR
, TYPE_S32
, tB
, tA
, bld
.mkImm(l
- 1));
1113 bld
.mkCmp(OP_SET
, CC_LT
, TYPE_S32
, tA
, TYPE_S32
, i
->getSrc(0), bld
.mkImm(0));
1114 tD
= (d
< 0) ? bld
.getSSA() : i
->getDef(0)->asLValue();
1115 newi
= bld
.mkOp2(OP_SUB
, TYPE_U32
, tD
, tB
, tA
);
1117 bld
.mkOp1(OP_NEG
, TYPE_S32
, i
->getDef(0), tB
);
1119 delete_Instruction(prog
, i
);
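      // Worked example for the unsigned path above, d = 7: l = ceil(log2(7)) = 3,
      // m = (2^32 * (2^3 - 7)) / 7 + 1 = 613566757, r = 1, s = 2. For x = 100,
      // hi = (x * m) >> 32 = 14 and (hi + ((x - hi) >> 1)) >> 2 = 14 = 100 / 7.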
   case OP_MOD:
      if (i->sType == TYPE_U32 && imm0.isPow2()) {
         bld.setPosition(i, false);
         i->op = OP_AND;
         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
      }
      break;
   case OP_SET: // TODO: SET_AND,OR,XOR
   {
      /* This optimizes the case where the output of a set is being compared
       * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
       * can be a lot cleverer in our comparison.
       */
      CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
      CondCode cc, ccZ;
      if (imm0.reg.data.u32 != 0 || !si)
         return;
      cc = si->setCond;
      ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
      // We do everything assuming var (cmp) 0, reverse the condition if 0 is
      // the first argument.
      if (s == 0)
         ccZ = reverseCondCode(ccZ);
      // If there is a negative modifier, we need to undo that, by flipping
      // the comparison to zero.
      if (i->src(t).mod.neg())
         ccZ = reverseCondCode(ccZ);
      // If this is a signed comparison, we expect the input to be a regular
      // boolean, i.e. 0/-1. However the rest of the logic assumes that true
      // is positive, so just flip the sign.
      if (i->sType == TYPE_S32) {
         assert(!isFloatType(si->dType));
         ccZ = reverseCondCode(ccZ);
      }
      switch (ccZ) {
      case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
      case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
      case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
      case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
      case CC_GT: break; // bool > 0 -- bool
      case CC_NE: break; // bool != 0 -- bool
      default:
         return;
      }
      // Update the condition of this SET to be identical to the origin set,
      // but with the updated condition code. The original SET should get
      // dead-code eliminated afterwards.
      i->op = si->op;
      i->asCmp()->setCond = cc;
      i->setSrc(0, si->src(0));
      i->setSrc(1, si->src(1));
      if (si->srcExists(2))
         i->setSrc(2, si->src(2));
      i->sType = si->sType;
      break;
   }
   case OP_AND:
   {
      Instruction *src = i->getSrc(t)->getInsn();
      ImmediateValue imm1;
      if (imm0.reg.data.u32 == 0) {
         i->op = OP_MOV;
         i->setSrc(0, new_ImmediateValue(prog, 0u));
         i->src(0).mod = Modifier(0);
         i->setSrc(1, NULL);
      } else if (imm0.reg.data.u32 == ~0U) {
         i->op = i->src(t).mod.getOp();
         if (t) {
            i->setSrc(0, i->getSrc(t));
            i->src(0).mod = i->src(t).mod;
         }
         i->setSrc(1, NULL);
      } else if (src->asCmp()) {
         CmpInstruction *cmp = src->asCmp();
         if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
            return;
         if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
            return;
         if (imm0.reg.data.f32 != 1.0)
            return;
         if (cmp->dType != TYPE_U32)
            return;

         cmp->dType = TYPE_F32;
         if (i->src(t).mod != Modifier(0)) {
            assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
            i->src(t).mod = Modifier(0);
            cmp->setCond = inverseCondCode(cmp->setCond);
         }
         i->op = OP_MOV;
         i->setSrc(s, NULL);
         if (t) {
            i->setSrc(0, i->getSrc(t));
            i->setSrc(t, NULL);
         }
      } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
                 src->op == OP_SHR &&
                 src->src(1).getImmediate(imm1) &&
                 i->src(t).mod == Modifier(0) &&
                 util_is_power_of_two(imm0.reg.data.u32 + 1)) {
         // low byte = offset, high byte = width
         uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
         i->op = OP_EXTBF;
         i->setSrc(0, src->getSrc(0));
         i->setSrc(1, new_ImmediateValue(prog, ext));
      }
      break;
   }
!= 1 || i
->src(0).mod
!= Modifier(0))
1239 // try to concatenate shifts
1240 Instruction
*si
= i
->getSrc(0)->getInsn();
1243 ImmediateValue imm1
;
1246 if (si
->src(1).getImmediate(imm1
)) {
1247 bld
.setPosition(i
, false);
1248 i
->setSrc(0, si
->getSrc(0));
1249 i
->setSrc(1, bld
.loadImm(NULL
, imm0
.reg
.data
.u32
+ imm1
.reg
.data
.u32
));
1253 if (si
->src(1).getImmediate(imm1
) && imm0
.reg
.data
.u32
== imm1
.reg
.data
.u32
) {
1254 bld
.setPosition(i
, false);
1256 i
->setSrc(0, si
->getSrc(0));
1257 i
->setSrc(1, bld
.loadImm(NULL
, ~((1 << imm0
.reg
.data
.u32
) - 1)));
1262 if (isFloatType(si
->dType
))
1264 if (si
->src(1).getImmediate(imm1
))
1266 else if (si
->src(0).getImmediate(imm1
))
1271 bld
.setPosition(i
, false);
1273 i
->setSrc(0, si
->getSrc(!muls
));
1274 i
->setSrc(1, bld
.loadImm(NULL
, imm1
.reg
.data
.u32
<< imm0
.reg
.data
.u32
));
1279 if (isFloatType(si
->dType
))
1281 if (si
->op
!= OP_SUB
&& si
->src(0).getImmediate(imm1
))
1283 else if (si
->src(1).getImmediate(imm1
))
1287 if (si
->src(!adds
).mod
!= Modifier(0))
1289 // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))
1291 // This is more operations, but if one of x, y is an immediate, then
1292 // we can get a situation where (a) we can use ISCADD, or (b)
1293 // propagate the add bit into an indirect load.
1294 bld
.setPosition(i
, false);
1296 i
->setSrc(adds
, bld
.loadImm(NULL
, imm1
.reg
.data
.u32
<< imm0
.reg
.data
.u32
));
1297 i
->setSrc(!adds
, bld
.mkOp2v(OP_SHL
, i
->dType
,
1298 bld
.getSSA(i
->def(0).getSize(), i
->def(0).getFile()),
1300 bld
.mkImm(imm0
.reg
.data
.u32
)));
   case OP_ABS:
   case OP_NEG:
   case OP_SAT:
   case OP_LG2:
   case OP_RCP:
   case OP_SQRT:
   case OP_RSQ:
   case OP_EX2:
   case OP_SIN:
   case OP_COS:
   case OP_PRESIN:
   case OP_PREEX2:
      unary(i, imm0);
      break;
   case OP_BFIND: {
      int32_t res;
      switch (i->dType) {
      case TYPE_S32: res = util_last_bit_signed(imm0.reg.data.s32) - 1; break;
      case TYPE_U32: res = util_last_bit(imm0.reg.data.u32) - 1; break;
      default:
         return;
      }
      if (i->subOp == NV50_IR_SUBOP_BFIND_SAMT && res >= 0)
         res = 31 - res;
      bld.setPosition(i, false); /* make sure bld is init'ed */
      i->op = OP_MOV;
      i->setSrc(0, bld.mkImm(res));
      i->setSrc(1, NULL);
      break;
   }
   case OP_POPCNT: {
      // Only deal with 1-arg POPCNT here
      if (i->srcExists(1))
         break;
      uint32_t res = util_bitcount(imm0.reg.data.u32);
      i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
      i->setSrc(1, NULL);
      i->op = OP_MOV;
      break;
   }
   case OP_CVT: {
      Storage res;

      // TODO: handle 64-bit values properly
      if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
         return;

      // TODO: handle single byte/word extractions
      if (i->subOp)
         return;

      bld.setPosition(i, true); /* make sure bld is init'ed */

#define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
   case type: \
      switch (i->sType) { \
      case TYPE_F64: \
         res.data.dst = util_iround(i->saturate ? \
                                    CLAMP(imm0.reg.data.f64, fmin, fmax) : \
                                    imm0.reg.data.f64); \
         break; \
      case TYPE_F32: \
         res.data.dst = util_iround(i->saturate ? \
                                    CLAMP(imm0.reg.data.f32, fmin, fmax) : \
                                    imm0.reg.data.f32); \
         break; \
      case TYPE_S32: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.s32, imin, imax) : \
                        imm0.reg.data.s32; \
         break; \
      case TYPE_U32: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.u32, umin, umax) : \
                        imm0.reg.data.u32; \
         break; \
      case TYPE_S16: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.s16, imin, imax) : \
                        imm0.reg.data.s16; \
         break; \
      case TYPE_U16: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.u16, umin, umax) : \
                        imm0.reg.data.u16; \
         break; \
      default: return; \
      } \
      i->setSrc(0, bld.mkImm(res.data.dst)); \
      break

      switch (i->dType) {
      CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
      CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
      CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
      CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
      case TYPE_F32:
         switch (i->sType) {
         case TYPE_F64:
            res.data.f32 = i->saturate ?
               CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
               imm0.reg.data.f64;
            break;
         case TYPE_F32:
            res.data.f32 = i->saturate ?
               CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
               imm0.reg.data.f32;
            break;
         case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
         case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
         case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
         case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
         default:
            return;
         }
         i->setSrc(0, bld.mkImm(res.data.f32));
         break;
      case TYPE_F64:
         switch (i->sType) {
         case TYPE_F64:
            res.data.f64 = i->saturate ?
               CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
               imm0.reg.data.f64;
            break;
         case TYPE_F32:
            res.data.f64 = i->saturate ?
               CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
               imm0.reg.data.f32;
            break;
         case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
         case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
         case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
         case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
         default:
            return;
         }
         i->setSrc(0, bld.mkImm(res.data.f64));
         break;
      default:
         return;
      }
#undef CASE

      i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
      i->op = OP_MOV;
      i->saturate = 0;
      i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
      break;
   }
   default:
      return;
   }
   if (newi->op != op)
      foldCount++;
}
// =============================================================================

// Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
class ModifierFolding : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

bool
ModifierFolding::visit(BasicBlock *bb)
{
   const Target *target = prog->getTarget();

   Instruction *i, *next, *mi;
   Modifier mod;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (0 && i->op == OP_SUB) {
         // turn "sub" into "add neg" (do we really want this ?)
         i->op = OP_ADD;
         i->src(1).mod = i->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
      }

      for (int s = 0; s < 3 && i->srcExists(s); ++s) {
         mi = i->getSrc(s)->getInsn();
         if (!mi ||
             mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
            continue;
         if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
            if ((i->op != OP_ADD &&
                 i->op != OP_MUL) ||
                (mi->op != OP_ABS &&
                 mi->op != OP_NEG))
               continue;
         } else
         if (i->sType != mi->dType) {
            continue;
         }
         if ((mod = Modifier(mi->op)) == Modifier(0))
            continue;
         mod *= mi->src(0).mod;

         if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
            // abs neg [abs] = abs
            mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
            mod = mod | Modifier(NV50_IR_MOD_ABS);
         } else
         if ((i->op == OP_NEG) && mod.neg()) {
            // neg as both opcode and modifier on same insn is prohibited
            // neg neg abs = abs, neg neg = identity
            mod = mod & Modifier(~NV50_IR_MOD_NEG);
            i->op = mod.getOp();
            mod = mod & Modifier(~NV50_IR_MOD_ABS);
            if (mod == Modifier(0))
               i->op = OP_MOV;
         }

         if (target->isModSupported(i, s, mod)) {
            i->setSrc(s, mi->getSrc(0));
            i->src(s).mod *= mod;
         }
      }

      if (i->op == OP_SAT) {
         mi = i->getSrc(0)->getInsn();
         if (mi &&
             mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
            mi->saturate = 1;
            mi->setDef(0, i->getDef(0));
            delete_Instruction(prog, i);
         }
      }
   }

   return true;
}
1547 // MUL + ADD -> MAD/FMA
1548 // MIN/MAX(a, a) -> a, etc.
1549 // SLCT(a, b, const) -> cc(const) ? a : b
1551 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
1552 class AlgebraicOpt
: public Pass
1555 virtual bool visit(BasicBlock
*);
1557 void handleABS(Instruction
*);
1558 bool handleADD(Instruction
*);
1559 bool tryADDToMADOrSAD(Instruction
*, operation toOp
);
1560 void handleMINMAX(Instruction
*);
1561 void handleRCP(Instruction
*);
1562 void handleSLCT(Instruction
*);
1563 void handleLOGOP(Instruction
*);
1564 void handleCVT_NEG(Instruction
*);
1565 void handleCVT_CVT(Instruction
*);
1566 void handleCVT_EXTBF(Instruction
*);
1567 void handleSUCLAMP(Instruction
*);
1568 void handleNEG(Instruction
*);
void
AlgebraicOpt::handleABS(Instruction *abs)
{
   Instruction *sub = abs->getSrc(0)->getInsn();
   DataType ty;
   if (!sub ||
       !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
      return;
   // expect not to have mods yet, if we do, bail
   if (sub->src(0).mod || sub->src(1).mod)
      return;
   // hidden conversion ?
   ty = intTypeToSigned(sub->dType);
   if (abs->dType != abs->sType || ty != abs->sType)
      return;

   if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
       sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
       sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
      return;

   Value *src0 = sub->getSrc(0);
   Value *src1 = sub->getSrc(1);

   if (sub->op == OP_ADD) {
      Instruction *neg = sub->getSrc(1)->getInsn();
      if (neg && neg->op != OP_NEG) {
         neg = sub->getSrc(0)->getInsn();
         src0 = sub->getSrc(1);
         src1 = sub->getSrc(0);
      }
      if (!neg || neg->op != OP_NEG ||
          neg->dType != neg->sType || neg->sType != ty)
         return;
      src1 = neg->getSrc(0);
   }

   // found ABS(SUB))
   abs->moveSources(1, 2); // move sources >=1 up by 2
   abs->op = OP_SAD;
   abs->setType(sub->dType);
   abs->setSrc(0, src0);
   abs->setSrc(1, src1);
   bld.setPosition(abs, false);
   abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
}
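// Illustrative rewrite (hypothetical operands): an integer abs-of-difference
// becomes a sum-of-absolute-differences with a zero accumulator,
//
//    sub s32 %r2, %r0, %r1 ; abs s32 %r3, %r2   ->   sad s32 %r3, %r0, %r1, 0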
bool
AlgebraicOpt::handleADD(Instruction *add)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return false;

   bool changed = false;
   if (!changed && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
      changed = tryADDToMADOrSAD(add, OP_MAD);
   if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
      changed = tryADDToMADOrSAD(add, OP_SAD);
   return changed;
}
// ADD(SAD(a,b,0), c) -> SAD(a,b,c)
// ADD(MUL(a,b), c) -> MAD(a,b,c)
bool
AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   Value *src;
   int s;
   const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
   const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
   Modifier mod[4];

   if (src0->refCount() == 1 &&
       src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
      s = 0;
   else
   if (src1->refCount() == 1 &&
       src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
      s = 1;
   else
      return false;

   src = add->getSrc(s);

   if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb)
      return false;

   if (src->getInsn()->saturate || src->getInsn()->postFactor ||
       src->getInsn()->dnz)
      return false;

   if (toOp == OP_SAD) {
      ImmediateValue imm;
      if (!src->getInsn()->src(2).getImmediate(imm))
         return false;
      if (!imm.isInteger(0))
         return false;
   }

   if (typeSizeof(add->dType) != typeSizeof(src->getInsn()->dType) ||
       isFloatType(add->dType) != isFloatType(src->getInsn()->dType))
      return false;

   mod[0] = add->src(0).mod;
   mod[1] = add->src(1).mod;
   mod[2] = src->getUniqueInsn()->src(0).mod;
   mod[3] = src->getUniqueInsn()->src(1).mod;

   if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
      return false;

   add->op = toOp;
   add->subOp = src->getInsn()->subOp; // potentially mul-high
   add->dType = src->getInsn()->dType; // sign matters for imad hi
   add->sType = src->getInsn()->sType;

   add->setSrc(2, add->src(s ? 0 : 1));

   add->setSrc(0, src->getInsn()->getSrc(0));
   add->src(0).mod = mod[2] ^ mod[s];
   add->setSrc(1, src->getInsn()->getSrc(1));
   add->src(1).mod = mod[3];

   return true;
}
void
AlgebraicOpt::handleMINMAX(Instruction *minmax)
{
   Value *src0 = minmax->getSrc(0);
   Value *src1 = minmax->getSrc(1);

   if (src0 != src1 || src0->reg.file != FILE_GPR)
      return;
   if (minmax->src(0).mod == minmax->src(1).mod) {
      if (minmax->def(0).mayReplace(minmax->src(0))) {
         minmax->def(0).replace(minmax->src(0), false);
         minmax->bb->remove(minmax);
      } else {
         minmax->op = OP_CVT;
         minmax->setSrc(1, NULL);
      }
   } else {
      // TODO:
      // min(x, -x) = -abs(x)
      // min(x, -abs(x)) = -abs(x)
      // min(x, abs(x)) = x
      // max(x, -abs(x)) = x
      // max(x, abs(x)) = abs(x)
      // max(x, -x) = abs(x)
   }
}
void
AlgebraicOpt::handleRCP(Instruction *rcp)
{
   Instruction *si = rcp->getSrc(0)->getUniqueInsn();

   if (si && si->op == OP_RCP) {
      Modifier mod = rcp->src(0).mod * si->src(0).mod;
      rcp->op = mod.getOp();
      rcp->setSrc(0, si->getSrc(0));
   }
}
void
AlgebraicOpt::handleSLCT(Instruction *slct)
{
   if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
      if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
         slct->setSrc(0, slct->getSrc(1));
   } else
   if (slct->getSrc(0) != slct->getSrc(1)) {
      return;
   }
   slct->op = OP_MOV;
   slct->setSrc(1, NULL);
   slct->setSrc(2, NULL);
}
void
AlgebraicOpt::handleLOGOP(Instruction *logop)
{
   Value *src0 = logop->getSrc(0);
   Value *src1 = logop->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (src0 == src1) {
      if ((logop->op == OP_AND || logop->op == OP_OR) &&
          logop->def(0).mayReplace(logop->src(0))) {
         logop->def(0).replace(logop->src(0), false);
         delete_Instruction(prog, logop);
      }
   } else {
      // try AND(SET, SET) -> SET_AND(SET)
      Instruction *set0 = src0->getInsn();
      Instruction *set1 = src1->getInsn();

      if (!set0 || set0->fixed || !set1 || set1->fixed)
         return;
      if (set1->op != OP_SET) {
         Instruction *xchg = set0;
         set0 = set1;
         set1 = xchg;
         if (set1->op != OP_SET)
            return;
      }
      operation redOp = (logop->op == OP_AND ? OP_SET_AND :
                         logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
      if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
         return;
      if (set0->op != OP_SET &&
          set0->op != OP_SET_AND &&
          set0->op != OP_SET_OR &&
          set0->op != OP_SET_XOR)
         return;
      if (set0->getDef(0)->refCount() > 1 &&
          set1->getDef(0)->refCount() > 1)
         return;
      if (set0->getPredicate() || set1->getPredicate())
         return;
      // check that they don't source each other
      for (int s = 0; s < 2; ++s)
         if (set0->getSrc(s) == set1->getDef(0) ||
             set1->getSrc(s) == set0->getDef(0))
            return;

      set0 = cloneForward(func, set0);
      set1 = cloneShallow(func, set1);
      logop->bb->insertAfter(logop, set1);
      logop->bb->insertAfter(logop, set0);

      set0->dType = TYPE_U8;
      set0->getDef(0)->reg.file = FILE_PREDICATE;
      set0->getDef(0)->reg.size = 1;
      set1->setSrc(2, set0->getDef(0));
      set1->op = redOp;
      set1->setDef(0, logop->getDef(0));
      delete_Instruction(prog, logop);
   }
}
// F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
// nv50:
//   F2I(NEG(I2F(ABS(SET))))
void
AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   if (cvt->sType != TYPE_F32 ||
       cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
      return;
   if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
      return;
   if (insn->src(0).mod != Modifier(0))
      return;
   insn = insn->getSrc(0)->getInsn();

   // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
   if (insn && insn->op == OP_CVT &&
       insn->dType == TYPE_F32 &&
       insn->sType == TYPE_S32) {
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
          insn->src(0).mod)
         return;
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
         return;
   } else
   if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
      return;
   }

   Instruction *bset = cloneShallow(func, insn);
   bset->dType = TYPE_U32;
   bset->setDef(0, cvt->getDef(0));
   cvt->bb->insertAfter(cvt, bset);
   delete_Instruction(prog, cvt);
}
// F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
// does a type conversion, this becomes trickier as there might be range
// changes/etc. We could handle those in theory as long as the range was being
// reduced or kept the same.
void
AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   RoundMode rnd = insn->rnd;

   if (insn->saturate ||
       insn->subOp ||
       insn->dType != insn->sType ||
       insn->dType != cvt->sType)
      return;

   switch (insn->op) {
   case OP_CEIL:
      rnd = ROUND_PI;
      break;
   case OP_FLOOR:
      rnd = ROUND_MI;
      break;
   case OP_TRUNC:
      rnd = ROUND_ZI;
      break;
   case OP_CVT:
      break;
   default:
      return;
   }

   if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
      rnd = (RoundMode)(rnd & 3);

   cvt->rnd = rnd;
   cvt->setSrc(0, insn->getSrc(0));
   cvt->src(0).mod *= insn->src(0).mod;
   cvt->sType = insn->sType;
}
// Some shaders extract packed bytes out of words and convert them to
// e.g. float. The Fermi+ CVT instruction can extract those directly, as can
// nv50 for word sizes.
//
// CVT(EXTBF(x, byte/word))
// CVT(AND(bytemask, x))
// CVT(AND(bytemask, SHR(x, 8/16/24)))
// CVT(SHR(x, 16/24))
void
AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   ImmediateValue imm;
   Value *arg = NULL;
   unsigned width, offset;
   if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
      return;
   if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
      width = (imm.reg.data.u32 >> 8) & 0xff;
      offset = imm.reg.data.u32 & 0xff;
      arg = insn->getSrc(0);

      if (width != 8 && width != 16)
         return;
      if (width == 8 && offset & 0x7)
         return;
      if (width == 16 && offset & 0xf)
         return;
   } else if (insn->op == OP_AND) {
      int s;
      if (insn->src(0).getImmediate(imm))
         s = 0;
      else if (insn->src(1).getImmediate(imm))
         s = 1;
      else
         return;

      if (imm.reg.data.u32 == 0xff)
         width = 8;
      else if (imm.reg.data.u32 == 0xffff)
         width = 16;
      else
         return;

      arg = insn->getSrc(!s);
      Instruction *shift = arg->getInsn();
      offset = 0;
      if (shift && shift->op == OP_SHR &&
          shift->sType == cvt->sType &&
          shift->src(1).getImmediate(imm) &&
          ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
           (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
         arg = shift->getSrc(0);
         offset = imm.reg.data.u32;
      }
      // We just AND'd the high bits away, which means this is effectively an
      // unsigned value.
      cvt->sType = TYPE_U32;
   } else if (insn->op == OP_SHR &&
              insn->sType == cvt->sType &&
              insn->src(1).getImmediate(imm)) {
      arg = insn->getSrc(0);
      if (imm.reg.data.u32 == 24) {
         width = 8;
         offset = 24;
      } else if (imm.reg.data.u32 == 16) {
         width = 16;
         offset = 16;
      } else {
         return;
      }
   } else {
      return;
   }

   // Irrespective of what came earlier, we can undo a shift on the argument
   // by adjusting the offset.
   Instruction *shift = arg->getInsn();
   if (shift && shift->op == OP_SHL &&
       shift->src(1).getImmediate(imm) &&
       ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
        (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
       imm.reg.data.u32 <= offset) {
      arg = shift->getSrc(0);
      offset -= imm.reg.data.u32;
   }

   // The unpackSnorm lowering still leaves a few shifts behind, but it's too
   // annoying to detect them.

   if (width == 8) {
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
   } else {
      assert(width == 16);
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
   }
   cvt->setSrc(0, arg);
   cvt->subOp = offset >> 3;
}
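// Illustrative example: CVT(AND(0xff, SHR(x, 8))) collapses into one CVT with
// sType TYPE_U8 and subOp = offset >> 3 = 1, i.e. the conversion reads byte 1
// of x directly.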
// SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
void
AlgebraicOpt::handleSUCLAMP(Instruction *insn)
{
   ImmediateValue imm;
   int32_t val = insn->getSrc(2)->asImm()->reg.data.s32;
   int s;
   Instruction *add;

   assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR);

   // look for ADD (TODO: only count references by non-SUCLAMP)
   if (insn->getSrc(0)->refCount() > 1)
      return;
   add = insn->getSrc(0)->getInsn();
   if (!add || add->op != OP_ADD ||
       (add->dType != TYPE_U32 &&
        add->dType != TYPE_S32))
      return;

   // look for immediate
   for (s = 0; s < 2; ++s)
      if (add->src(s).getImmediate(imm))
         break;
   if (s >= 2)
      return;
   s = s ? 0 : 1;
   // determine if immediate fits
   val += imm.reg.data.s32;
   if (val > 31 || val < -32)
      return;
   // determine if other addend fits
   if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0))
      return;

   bld.setPosition(insn, false); // make sure bld is init'ed
   // replace sources
   insn->setSrc(2, bld.mkImm(val));
   insn->setSrc(0, add->getSrc(s));
}
// NEG(AND(SET, 1)) -> SET
void
AlgebraicOpt::handleNEG(Instruction *i) {
   Instruction *src = i->getSrc(0)->getInsn();
   ImmediateValue imm;
   int b;

   if (isFloatType(i->sType) || !src || src->op != OP_AND)
      return;

   if (src->src(0).getImmediate(imm))
      b = 1;
   else if (src->src(1).getImmediate(imm))
      b = 0;
   else
      return;

   if (!imm.isInteger(1))
      return;

   Instruction *set = src->getSrc(b)->getInsn();
   if ((set->op == OP_SET || set->op == OP_SET_AND ||
        set->op == OP_SET_OR || set->op == OP_SET_XOR) &&
       !isFloatType(set->dType)) {
      i->def(0).replace(set->getDef(0), false);
   }
}
bool
AlgebraicOpt::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;
      switch (i->op) {
      case OP_ABS:
         handleABS(i);
         break;
      case OP_ADD:
         handleADD(i);
         break;
      case OP_RCP:
         handleRCP(i);
         break;
      case OP_MIN:
      case OP_MAX:
         handleMINMAX(i);
         break;
      case OP_SLCT:
         handleSLCT(i);
         break;
      case OP_AND:
      case OP_OR:
      case OP_XOR:
         handleLOGOP(i);
         break;
      case OP_CVT:
         handleCVT_NEG(i);
         handleCVT_CVT(i);
         if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
            handleCVT_EXTBF(i);
         break;
      case OP_SUCLAMP:
         handleSUCLAMP(i);
         break;
      case OP_NEG:
         handleNEG(i);
         break;
      default:
         break;
      }
   }

   return true;
}
// =============================================================================

static inline void
updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
{
   if (offset != ldst->getSrc(0)->reg.data.offset) {
      if (ldst->getSrc(0)->refCount() > 1)
         ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
      ldst->getSrc(0)->reg.data.offset = offset;
   }
}
// Combine loads and stores, forward stores to loads where possible.
class MemoryOpt : public Pass
{
private:
   class Record
   {
   public:
      Record *next;
      Record *prev;
      Instruction *insn;
      const Value *rel[2];
      const Value *base;
      int32_t offset;
      int8_t fileIndex;
      uint8_t size;
      bool locked;

      bool overlaps(const Instruction *ldst) const;

      inline void link(Record **);
      inline void unlink(Record **);
      inline void set(const Instruction *ldst);
   };

public:
   MemoryOpt();

   Record *loads[DATA_FILE_COUNT];
   Record *stores[DATA_FILE_COUNT];

   MemoryPool recordPool;

private:
   virtual bool visit(BasicBlock *);
   bool runOpt(BasicBlock *);

   Record **getList(const Instruction *);

   Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;

   // merge @insn into load/store instruction from @rec
   bool combineLd(Record *rec, Instruction *ld);
   bool combineSt(Record *rec, Instruction *st);

   bool replaceLdFromLd(Instruction *ld, Record *ldRec);
   bool replaceLdFromSt(Instruction *ld, Record *stRec);
   bool replaceStFromSt(Instruction *restrict st, Record *stRec);

   void addRecord(Instruction *ldst);
   void purgeRecords(Instruction *const st, DataFile);
   void lockStores(Instruction *const ld);
   void reset();
};
MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
{
   for (int i = 0; i < DATA_FILE_COUNT; ++i) {
      loads[i] = NULL;
      stores[i] = NULL;
   }
}

void
MemoryOpt::reset()
{
   for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
      Record *it, *next;
      for (it = loads[i]; it; it = next) {
         next = it->next;
         recordPool.release(it);
      }
      loads[i] = NULL;
      for (it = stores[i]; it; it = next) {
         next = it->next;
         recordPool.release(it);
      }
      stores[i] = NULL;
   }
}
bool
MemoryOpt::combineLd(Record *rec, Instruction *ld)
{
   int32_t offRc = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeLd = typeSizeof(ld->dType);
   int size = sizeRc + sizeLd;
   int d, j;

   if (!prog->getTarget()->
       isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned loads
   if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
       ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
      return false;
   // for compute indirect loads are not guaranteed to be aligned
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

   assert(sizeRc + sizeLd <= 16 && offRc != offLd);

   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);

   if (offLd < offRc) {
      int sz;
      for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
      // d: nr of definitions in ld
      // j: nr of definitions in rec->insn, move:
      for (d = d + j - 1; j > 0; --j, --d)
         rec->insn->setDef(d, rec->insn->getDef(j - 1));

      if (rec->insn->getSrc(0)->refCount() > 1)
         rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
      rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;

      d = 0;
   } else {
      d = j;
   }
   // move definitions of @ld to @rec->insn
   for (j = 0; sizeLd; ++j, ++d) {
      sizeLd -= ld->getDef(j)->reg.size;
      rec->insn->setDef(d, ld->getDef(j));
   }

   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));

   delete_Instruction(prog, ld);

   return true;
}
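// Illustrative example (hypothetical offsets): two adjacent 32-bit loads from
// c0[0x0] and c0[0x4] merge into a single 64-bit load carrying both defs,
// provided the target supports the wider access and the alignment rules hold.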
bool
MemoryOpt::combineSt(Record *rec, Instruction *st)
{
   int32_t offRc = rec->offset;
   int32_t offSt = st->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeSt = typeSizeof(st->dType);
   int size = sizeRc + sizeSt;
   int s, j, k;
   Value *src[4]; // no modifiers in ValueRef allowed for st
   Value *extra[3];

   if (!prog->getTarget()->
       isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned stores
   if (size == 8 && MIN2(offRc, offSt) & 0x7)
      return false;
   // for compute indirect stores are not guaranteed to be aligned
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

   st->takeExtraSources(0, extra); // save predicate and indirect address

   if (offRc < offSt) {
      // save values from @st
      for (s = 0; sizeSt; ++s) {
         sizeSt -= st->getSrc(s + 1)->reg.size;
         src[s] = st->getSrc(s + 1);
      }
      // set record's values as low sources of @st
      for (j = 1; sizeRc; ++j) {
         sizeRc -= rec->insn->getSrc(j)->reg.size;
         st->setSrc(j, rec->insn->getSrc(j));
      }
      // set saved values as high sources of @st
      for (k = j, j = 0; j < s; ++j)
         st->setSrc(k++, src[j]);

      updateLdStOffset(st, offRc, func);
   } else {
      for (j = 1; sizeSt; ++j)
         sizeSt -= st->getSrc(j)->reg.size;
      for (s = 1; sizeRc; ++j, ++s) {
         sizeRc -= rec->insn->getSrc(s)->reg.size;
         st->setSrc(j, rec->insn->getSrc(s));
      }
      rec->offset = offSt;
   }
   st->putExtraSources(0, extra); // restore pointer and predicate

   delete_Instruction(prog, rec->insn);

   rec->insn = st;
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));
   return true;
}
void
MemoryOpt::Record::set(const Instruction *ldst)
{
   const Symbol *mem = ldst->getSrc(0)->asSym();
   fileIndex = mem->reg.fileIndex;
   rel[0] = ldst->getIndirect(0, 0);
   rel[1] = ldst->getIndirect(0, 1);
   offset = mem->reg.data.offset;
   base = mem->getBase();
   size = typeSizeof(ldst->sType);
}
void
MemoryOpt::Record::link(Record **list)
{
   next = *list;
   if (next)
      next->prev = this;
   prev = NULL;
   *list = this;
}

void
MemoryOpt::Record::unlink(Record **list)
{
   if (next)
      next->prev = prev;
   if (prev)
      prev->next = next;
   else
      *list = next;
}
MemoryOpt::Record **
MemoryOpt::getList(const Instruction *insn)
{
   if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
      return &loads[insn->src(0).getFile()];
   return &stores[insn->src(0).getFile()];
}
*i
)
2375 Record
**list
= getList(i
);
2376 Record
*it
= reinterpret_cast<Record
*>(recordPool
.allocate());
MemoryOpt::Record *
MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
{
   const Symbol *sym = insn->getSrc(0)->asSym();
   const int size = typeSizeof(insn->sType);

   Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];

   for (; it; it = it->next) {
      if (it->locked && insn->op != OP_LOAD)
         continue;
      if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
          it->rel[0] != insn->getIndirect(0, 0) ||
          it->fileIndex != sym->reg.fileIndex ||
          it->rel[1] != insn->getIndirect(0, 1))
         continue;

      if (it->offset < sym->reg.data.offset) {
         if (it->offset + it->size >= sym->reg.data.offset) {
            isAdj = (it->offset + it->size == sym->reg.data.offset);
            if (!isAdj)
               return it;
            if (!(it->offset & 0x7))
               return it;
         }
      } else {
         isAdj = it->offset != sym->reg.data.offset;
         if (size <= it->size && !isAdj)
            return it;
         else
         if (!(sym->reg.data.offset & 0x7))
            if (it->offset - size <= sym->reg.data.offset)
               return it;
      }
   }
   return NULL;
}
bool
MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
{
   Instruction *st = rec->insn;
   int32_t offSt = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int d, s;

   for (s = 1; offSt != offLd && st->srcExists(s); ++s)
      offSt += st->getSrc(s)->reg.size;
   if (offSt != offLd)
      return false;

   for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
      if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
         return false;
      if (st->getSrc(s)->reg.file != FILE_GPR)
         return false;
      ld->def(d).replace(st->src(s), false);
   }
   ld->bb->remove(ld);
   return true;
}
bool
MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
{
   Instruction *ldR = rec->insn;
   int32_t offR = rec->offset;
   int32_t offE = ldE->getSrc(0)->reg.data.offset;
   int dR, dE;

   assert(offR <= offE);
   for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
      offR += ldR->getDef(dR)->reg.size;
   if (offR != offE)
      return false;

   for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
      if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
         return false;
      ldE->def(dE).replace(ldR->getDef(dR), false);
   }

   delete_Instruction(prog, ldE);
   return true;
}

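// Merge store @st with the overlapping recorded store @rec so that a single
// instruction writes the union of both ranges; @st survives and the recorded
// store is deleted, with the later store's values taking precedence where the
// ranges overlap. Illustrative sketch (sizes made up): a st u32 at +0x0
// followed by a st u64 at +0x0 collapses into the single u64 store.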
bool
MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
{
   const Instruction *const ri = rec->insn;
   Value *extra[3];

   int32_t offS = st->getSrc(0)->reg.data.offset;
   int32_t offR = rec->offset;
   int32_t endS = offS + typeSizeof(st->dType);
   int32_t endR = offR + typeSizeof(ri->dType);

   rec->size = MAX2(endS, endR) - MIN2(offS, offR);

   st->takeExtraSources(0, extra);

   if (offR < offS) {
      Value *vals[10];
      int s, n;
      int k = 0;
      // get non-replaced sources of ri
      for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      n = s;
      // get replaced sources of st
      for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
         vals[k++] = st->getSrc(s);
      // skip replaced sources of ri
      for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
      // get non-replaced sources after values covered by st
      for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      assert((unsigned int)k <= ARRAY_SIZE(vals));
      for (s = 0; s < k; ++s)
         st->setSrc(s + 1, vals[s]);
      st->setSrc(0, ri->getSrc(0));
   } else
   if (endR > endS) {
      int j, s;
      for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
      for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
      for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
         st->setSrc(s++, ri->getSrc(j));
   }
   st->putExtraSources(0, extra);

   delete_Instruction(prog, rec->insn);

   rec->insn = st;
   rec->offset = st->getSrc(0)->reg.data.offset;
   st->setType(typeOfSize(rec->size));
   return true;
}

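// Conservative overlap test: with indirect addressing we cannot compare
// offsets, so any two accesses relative to the same base symbol are treated
// as overlapping; otherwise the [offset, offset + size) ranges are compared.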
bool
MemoryOpt::Record::overlaps(const Instruction *ldst) const
{
   Record that;
   that.set(ldst);

   if (this->fileIndex != that.fileIndex)
      return false;

   if (this->rel[0] || that.rel[0])
      return this->base == that.base;
   return
      (this->offset < that.offset + that.size) &&
      (this->offset + this->size > that.offset);
}

// Stores that affect the result of @ld must not be eliminated when we later
// find stores to the same location, nor may they be merged with such later
// stores.
// The stored value can, however, still be used to determine the value
// returned by future loads.
void
MemoryOpt::lockStores(Instruction *const ld)
{
   for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
      if (!r->locked && r->overlaps(ld))
         r->locked = true;
}

// Prior loads from the location of @st are no longer valid.
// Stores to the location of @st may no longer be used to derive
// the value at it nor be coalesced into later stores.
void
MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
{
   if (st)
      f = st->src(0).getFile();

   for (Record *r = loads[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&loads[f]);

   for (Record *r = stores[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&stores[f]);
}

bool
MemoryOpt::visit(BasicBlock *bb)
{
   bool ret = runOpt(bb);
   // Run again, one pass won't combine four 32 bit ld/st into a single
   // 128 bit ld/st where 96 bit memory operations are forbidden.
   if (ret)
      ret = runOpt(bb);
   return ret;
}

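// Illustrative sketch of why the rerun above helps (offsets made up): the
// first pass merges st u32 +0x0 with st u32 +0x4 into st u64 +0x0, and
// st u32 +0x8 with st u32 +0xc into st u64 +0x8; only the second pass can
// merge the two u64 stores into one 128 bit store, since merging u64 with
// u32 directly would have required an unsupported 96 bit access.
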
bool
MemoryOpt::runOpt(BasicBlock *bb)
{
   Instruction *ldst, *next;
   Record *rec;
   bool isAdjacent = true;

   for (ldst = bb->getEntry(); ldst; ldst = next) {
      bool keep = true;
      bool isLoad = true;
      next = ldst->next;

      if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
         if (ldst->isDead()) {
            // might have been produced by earlier optimization
            delete_Instruction(prog, ldst);
            continue;
         }
      } else
      if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
         if (typeSizeof(ldst->dType) == 4 &&
             ldst->src(1).getFile() == FILE_GPR &&
             ldst->getSrc(1)->getInsn()->op == OP_NOP) {
            delete_Instruction(prog, ldst);
            continue;
         }
         isLoad = false;
      } else {
         // TODO: maybe have all fixed ops act as barrier ?
         if (ldst->op == OP_CALL ||
             ldst->op == OP_BAR ||
             ldst->op == OP_MEMBAR) {
            purgeRecords(NULL, FILE_MEMORY_LOCAL);
            purgeRecords(NULL, FILE_MEMORY_GLOBAL);
            purgeRecords(NULL, FILE_MEMORY_SHARED);
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         } else
         if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
            if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
               purgeRecords(NULL, FILE_MEMORY_LOCAL);
               purgeRecords(NULL, FILE_MEMORY_GLOBAL);
               purgeRecords(NULL, FILE_MEMORY_SHARED);
            } else {
               purgeRecords(NULL, ldst->src(0).getFile());
            }
         } else
         if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         }
         continue;
      }
      if (ldst->getPredicate()) // TODO: handle predicated ld/st
         continue;
      if (ldst->perPatch) // TODO: create separate per-patch lists
         continue;

      if (isLoad) {
         DataFile file = ldst->src(0).getFile();

         // if ld l[]/g[] look for previous store to eliminate the reload
         if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
            // TODO: shared memory ?
            rec = findRecord(ldst, false, isAdjacent);
            if (rec && !isAdjacent)
               keep = !replaceLdFromSt(ldst, rec);
         }

         // or look for ld from the same location and replace this one
         rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
         if (rec) {
            if (!isAdjacent)
               keep = !replaceLdFromLd(ldst, rec);
            else
               // or combine a previous load with this one
               keep = !combineLd(rec, ldst);
         }
         if (keep)
            lockStores(ldst);
      } else {
         rec = findRecord(ldst, false, isAdjacent);
         if (rec) {
            if (!isAdjacent)
               keep = !replaceStFromSt(ldst, rec);
            else
               keep = !combineSt(rec, ldst);
         }
         if (keep)
            purgeRecords(ldst, DATA_FILE_COUNT);
      }
      if (keep)
         addRecord(ldst);
   }
   reset();

   return true;
}

// =============================================================================

// Turn control flow into predicated instructions (after register allocation !).
// Could move this to before register allocation on NVC0 and also handle nested
// constructs.
class FlatteningPass : public Pass
{
private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   bool tryPredicateConditional(BasicBlock *);
   void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
   void tryPropagateBranch(BasicBlock *);
   inline bool isConstantCondition(Value *pred);
   inline bool mayPredicate(const Instruction *, const Value *pred) const;
   inline void removeFlow(Instruction *);

   uint8_t gpr_unit;
};

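// Illustrative sketch of the flattening transform (block and register names
// made up):
//    @!p0 bra BB:2            @p0 mul $r0, $r1, $r2
//    mul $r0, $r1, $r2   ->   @p0 add $r0, $r0, $r3
//    add $r0, $r0, $r3
//  BB:2
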
bool
FlatteningPass::isConstantCondition(Value *pred)
{
   Instruction *insn = pred->getUniqueInsn();
   assert(insn);
   if (insn->op != OP_SET || insn->srcExists(2))
      return false;

   for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
      Instruction *ld = insn->getSrc(s)->getUniqueInsn();
      DataFile file;
      if (ld) {
         if (ld->op != OP_MOV && ld->op != OP_LOAD)
            return false;
         if (ld->src(0).isIndirect(0))
            return false;
         file = ld->src(0).getFile();
      } else {
         file = insn->src(s).getFile();
         // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
         // in register "units", which can vary between targets.
         if (file == FILE_GPR) {
            Value *v = insn->getSrc(s);
            int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
            int units = bytes >> gpr_unit;
            if (units > prog->maxGPR)
               file = FILE_IMMEDIATE;
         }
      }
      if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
         return false;
   }
   return true;
}

void
FlatteningPass::removeFlow(Instruction *insn)
{
   FlowInstruction *term = insn ? insn->asFlow() : NULL;
   if (!term)
      return;
   Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();

   if (term->op == OP_BRA) {
      // TODO: this might get more difficult when we get arbitrary BRAs
      if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
         return;
   } else
   if (term->op != OP_JOIN)
      return;

   Value *pred = term->getPredicate();

   delete_Instruction(prog, term);

   if (pred && pred->refCount() == 0) {
      Instruction *pSet = pred->getUniqueInsn();
      pred->join->reg.data.id = -1; // deallocate
      if (pSet->isDead())
         delete_Instruction(prog, pSet);
   }
}

void
FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
{
   for (Instruction *i = bb->getEntry(); i; i = i->next) {
      if (i->isNop())
         continue;
      assert(!i->getPredicate());
      i->setPredicate(cc, pred);
   }
   removeFlow(bb->getExit());
}

bool
FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
{
   if (insn->isPseudo())
      return true;
   // TODO: calls where we don't know which registers are modified

   if (!prog->getTarget()->mayPredicate(insn, pred))
      return false;
   for (int d = 0; insn->defExists(d); ++d)
      if (insn->getDef(d)->equals(pred))
         return false;
   return true;
}

// If we jump to BRA/RET/EXIT, replace the jump with it.
// NOTE: We do not update the CFG anymore here !
//
// TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
//  @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
void
FlatteningPass::tryPropagateBranch(BasicBlock *bb)
{
   for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
      BasicBlock *bf = i->asFlow()->target.bb;

      if (bf->getInsnCount() != 1)
         continue;

      FlowInstruction *bra = i->asFlow();
      FlowInstruction *rep = bf->getExit()->asFlow();

      if (!rep || rep->getPredicate())
         continue;
      if (rep->op != OP_BRA &&
          rep->op != OP_JOIN &&
          rep->op != OP_EXIT)
         continue;

      // TODO: If there are multiple branches to @rep, only the first would
      // be replaced, so only remove them after this pass is done ?
      // Also, need to check all incident blocks for fall-through exits and
      // add the branch there.
      bra->op = rep->op;
      bra->target.bb = rep->target.bb;
      if (bf->cfg.incidentCount() == 1)
         bf->remove(rep);
   }
}

bool
FlatteningPass::visit(Function *fn)
{
   gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);

   return true;
}

bool
FlatteningPass::visit(BasicBlock *bb)
{
   if (tryPredicateConditional(bb))
      return true;

   // try to attach join to previous instruction
   if (prog->getTarget()->hasJoin) {
      Instruction *insn = bb->getExit();
      if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
         insn = insn->prev;
         if (insn && !insn->getPredicate() &&
             !insn->asFlow() &&
             insn->op != OP_TEXBAR &&
             !isTextureOp(insn->op) && // probably just nve4
             !isSurfaceOp(insn->op) && // not confirmed
             insn->op != OP_LINTERP && // probably just nve4
             insn->op != OP_PINTERP && // probably just nve4
             ((insn->op != OP_LOAD && insn->op != OP_STORE && insn->op != OP_ATOM) ||
              (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
             !insn->isNop()) {
            insn->join = 1;
            bb->remove(bb->getExit());
         }
      }
   }

   tryPropagateBranch(bb);

   return true;
}

bool
FlatteningPass::tryPredicateConditional(BasicBlock *bb)
{
   BasicBlock *bL = NULL, *bR = NULL;
   unsigned int nL = 0, nR = 0, limit = 12;
   Instruction *insn;
   unsigned int mask;

   mask = bb->initiatesSimpleConditional();
   if (!mask)
      return false;

   assert(bb->getExit());
   Value *pred = bb->getExit()->getPredicate();
   assert(pred);

   if (isConstantCondition(pred))
      limit = 4;

   Graph::EdgeIterator ei = bb->cfg.outgoing();

   if (mask & 1) {
      bL = BasicBlock::get(ei.getNode());
      for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
         if (!mayPredicate(insn, pred))
            return false;
      if (nL > limit)
         return false; // too long, do a real branch
   }
   ei.next();

   if (mask & 2) {
      bR = BasicBlock::get(ei.getNode());
      for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
         if (!mayPredicate(insn, pred))
            return false;
      if (nR > limit)
         return false; // too long, do a real branch
   }

   if (bL)
      predicateInstructions(bL, pred, bb->getExit()->cc);
   if (bR)
      predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));

   if (bb->joinAt) {
      bb->remove(bb->joinAt);
      bb->joinAt = NULL;
   }
   removeFlow(bb->getExit()); // delete the branch/join at the fork point

   // remove potential join operations at the end of the conditional
   if (prog->getTarget()->joinAnterior) {
      bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
      if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
         removeFlow(bb->getEntry());
   }

   return true;
}

// =============================================================================

// Fold Immediate into MAD; must be done after register allocation due to
// constraint SDST == SSRC2
// TODO: Does NVC0+ have other situations where this pass makes sense?
class NV50PostRaConstantFolding : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

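// Illustrative sketch (register numbers made up):
//    mov $r1, 0x3f800000
//    mad $r0, $r2, $r1, $r0   ->   mad $r0, $r2, 0x3f800000, $r0
// which leaves the mov dead so post_ra_dead() below can delete it.
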
static bool
post_ra_dead(Instruction *i)
{
   for (int d = 0; i->defExists(d); ++d)
      if (i->getDef(d)->refCount())
         return false;
   return true;
}

bool
NV50PostRaConstantFolding::visit(BasicBlock *bb)
{
   Value *vtmp;
   Instruction *def;

   for (Instruction *i = bb->getFirst(); i; i = i->next) {
      switch (i->op) {
      case OP_MAD:
         if (i->def(0).getFile() != FILE_GPR ||
             i->src(0).getFile() != FILE_GPR ||
             i->src(1).getFile() != FILE_GPR ||
             i->src(2).getFile() != FILE_GPR ||
             i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
            break;

         if (i->getDef(0)->reg.data.id >= 64 ||
             i->getSrc(0)->reg.data.id >= 64)
            break;

         if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
            break;

         if (i->getPredicate())
            break;

         def = i->getSrc(1)->getInsn();
         if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
            def = def->getSrc(0)->getInsn();
         if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
            vtmp = i->getSrc(1);
            if (isFloatType(i->sType)) {
               i->setSrc(1, def->getSrc(0));
            } else {
               ImmediateValue val;
               bool ret = def->src(0).getImmediate(val);
               assert(ret);
               if (i->getSrc(1)->reg.data.id & 1)
                  val.reg.data.u32 >>= 16;
               val.reg.data.u32 &= 0xffff;
               i->setSrc(1, new_ImmediateValue(bb->getProgram(), val.reg.data.u32));
            }

            /* There's no post-RA dead code elimination, so do it here
             * XXX: if we add more code-removing post-RA passes, we might
             *      want to create a post-RA dead-code elim pass */
            if (post_ra_dead(vtmp->getInsn())) {
               Value *src = vtmp->getInsn()->getSrc(0);
               // Careful -- splits will have already been removed from the
               // functions. Don't double-delete.
               if (vtmp->getInsn()->bb)
                  delete_Instruction(prog, vtmp->getInsn());
               if (src->getInsn() && post_ra_dead(src->getInsn()))
                  delete_Instruction(prog, src->getInsn());
            }
         }
         break;
      default:
         break;
      }
   }

   return true;
}

// =============================================================================

// Common subexpression elimination. Stupid O(n^2) implementation.
class LocalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   inline bool tryReplace(Instruction **, Instruction *);

   DLList ops[OP_LAST + 1];
};

class GlobalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

bool
Instruction::isActionEqual(const Instruction *that) const
{
   if (this->op != that->op ||
       this->dType != that->dType ||
       this->sType != that->sType)
      return false;
   if (this->cc != that->cc)
      return false;

   if (this->asTex()) {
      if (memcmp(&this->asTex()->tex,
                 &that->asTex()->tex,
                 sizeof(this->asTex()->tex)))
         return false;
   } else
   if (this->asCmp()) {
      if (this->asCmp()->setCond != that->asCmp()->setCond)
         return false;
   } else
   if (this->asFlow()) {
      return false;
   } else {
      if (this->ipa != that->ipa ||
          this->lanes != that->lanes ||
          this->perPatch != that->perPatch)
         return false;
      if (this->postFactor != that->postFactor)
         return false;
   }

   if (this->subOp != that->subOp ||
       this->saturate != that->saturate ||
       this->rnd != that->rnd ||
       this->ftz != that->ftz ||
       this->dnz != that->dnz ||
       this->cache != that->cache ||
       this->mask != that->mask)
      return false;

   return true;
}

bool
Instruction::isResultEqual(const Instruction *that) const
{
   unsigned int d, s;

   // NOTE: location of discard only affects tex with liveOnly and quadops
   if (!this->defExists(0) && this->op != OP_DISCARD)
      return false;

   if (!isActionEqual(that))
      return false;

   if (this->predSrc != that->predSrc)
      return false;

   for (d = 0; this->defExists(d); ++d) {
      if (!that->defExists(d) ||
          !this->getDef(d)->equals(that->getDef(d), false))
         return false;
   }
   if (that->defExists(d))
      return false;

   for (s = 0; this->srcExists(s); ++s) {
      if (!that->srcExists(s))
         return false;
      if (this->src(s).mod != that->src(s).mod)
         return false;
      if (!this->getSrc(s)->equals(that->getSrc(s), true))
         return false;
   }
   if (that->srcExists(s))
      return false;

   if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
      switch (src(0).getFile()) {
      case FILE_MEMORY_CONST:
      case FILE_SHADER_INPUT:
         return true;
      case FILE_SHADER_OUTPUT:
         return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
      default:
         return false;
      }
   }

   return true;
}

// pull through common expressions from different in-blocks
bool
GlobalCSE::visit(BasicBlock *bb)
{
   Instruction *phi, *next, *ik;
   int s;

   // TODO: maybe do this with OP_UNION, too

   for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
      next = phi->next;
      if (phi->getSrc(0)->refCount() > 1)
         continue;
      ik = phi->getSrc(0)->getInsn();
      if (!ik)
         continue; // probably a function input
      if (ik->defCount(0xff) > 1)
         continue; // too painful to check if we can really push this forward
      for (s = 1; phi->srcExists(s); ++s) {
         if (phi->getSrc(s)->refCount() > 1)
            break;
         if (!phi->getSrc(s)->getInsn() ||
             !phi->getSrc(s)->getInsn()->isResultEqual(ik))
            break;
      }
      if (!phi->srcExists(s)) {
         Instruction *entry = bb->getEntry();
         ik->bb->remove(ik);
         if (!entry || entry->op != OP_JOIN)
            bb->insertHead(ik);
         else
            bb->insertAfter(entry, ik);
         ik->setDef(0, phi->getDef(0));
         delete_Instruction(prog, phi);
      }
   }

   return true;
}

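// Illustrative sketch of the transform above (value names made up): if every
// phi source is a single-use instruction computing the same result, e.g.
//    BB:1: %a = add %x, %y      BB:2: %b = add %x, %y
//    BB:3: %c = phi %a, %b
// the add is moved into BB:3 to define %c directly, and the phi is deleted.
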
bool
LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
{
   Instruction *old = *ptr;

   // TODO: maybe relax this later (causes trouble with OP_UNION)
   if (i->isPredicated())
      return false;

   if (!old->isResultEqual(i))
      return false;

   for (int d = 0; old->defExists(d); ++d)
      old->def(d).replace(i->getDef(d), false);
   delete_Instruction(prog, old);
   *ptr = NULL;

   return true;
}

bool
LocalCSE::visit(BasicBlock *bb)
{
   unsigned int replaced;

   do {
      Instruction *ir, *next;

      replaced = 0;

      // will need to know the order of instructions
      int serial = 0;
      for (ir = bb->getFirst(); ir; ir = ir->next)
         ir->serial = serial++;

      for (ir = bb->getEntry(); ir; ir = next) {
         int s;
         Value *src = NULL;

         next = ir->next;

         if (ir->fixed) {
            ops[ir->op].insert(ir);
            continue;
         }

         for (s = 0; ir->srcExists(s); ++s)
            if (ir->getSrc(s)->asLValue())
               if (!src || ir->getSrc(s)->refCount() < src->refCount())
                  src = ir->getSrc(s);

         if (src) {
            for (Value::UseIterator it = src->uses.begin();
                 it != src->uses.end(); ++it) {
               Instruction *ik = (*it)->getInsn();
               if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
                  if (tryReplace(&ir, ik))
                     break;
            }
         } else {
            DLLIST_FOR_EACH(&ops[ir->op], iter)
            {
               Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
               if (tryReplace(&ir, ik))
                  break;
            }
         }

         if (ir)
            ops[ir->op].insert(ir);
         else
            ++replaced;
      }
      for (unsigned int i = 0; i <= OP_LAST; ++i)
         ops[i].clear();

   } while (replaced);

   return true;
}

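// Note on the search strategy above: when an instruction has an LValue
// source, only the use list of its least-referenced source is scanned for an
// equivalent earlier instruction; the per-opcode lists are consulted only for
// instructions without such a source, which keeps the quadratic worst case
// rare in practice.
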
// =============================================================================

// Remove computations of unused values.
class DeadCodeElim : public Pass
{
public:
   bool buryAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void checkSplitLoad(Instruction *ld); // for partially dead loads

   unsigned int deadCount;
};

bool
DeadCodeElim::buryAll(Program *prog)
{
   do {
      deadCount = 0;
      if (!this->run(prog, false, false))
         return false;
   } while (deadCount);

   return true;
}

bool
DeadCodeElim::visit(BasicBlock *bb)
{
   Instruction *prev;

   for (Instruction *i = bb->getExit(); i; i = prev) {
      prev = i->prev;
      if (i->isDead()) {
         ++deadCount;
         delete_Instruction(prog, i);
      } else
      if (i->defExists(1) &&
          i->subOp == 0 &&
          (i->op == OP_VFETCH || i->op == OP_LOAD)) {
         checkSplitLoad(i);
      } else
      if (i->defExists(0) && !i->getDef(0)->refCount()) {
         if (i->op == OP_ATOM ||
             i->op == OP_SUREDP ||
             i->op == OP_SUREDB) {
            i->setDef(0, NULL);
         } else if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
            i->setDef(0, i->getDef(1));
            i->setDef(1, NULL);
         }
      }
   }
   return true;
}

// Each load can go into up to 4 destinations, any of which might potentially
// be dead (i.e. a hole). These can always be split into 2 loads, independent
// of where the holes are. We find the first contiguous region, put it into
// the first load, and then put the second contiguous region into the second
// load. There can be at most 2 contiguous regions.
//
// Note that there are some restrictions, for example it's not possible to do
// a 64-bit load that's not 64-bit aligned, so such a load has to be split
// up. Also hardware doesn't support 96-bit loads, so those also have to be
// split into a 64-bit and 32-bit load.
void
DeadCodeElim::checkSplitLoad(Instruction *ld1)
{
   Instruction *ld2 = NULL; // can get at most 2 loads
   Value *def1[4];
   Value *def2[4];
   int32_t addr1, addr2;
   int32_t size1, size2;
   int d, n1, n2;
   uint32_t mask = 0xffffffff;

   for (d = 0; ld1->defExists(d); ++d)
      if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
         mask &= ~(1 << d);
   if (mask == 0xffffffff)
      return;

   addr1 = ld1->getSrc(0)->reg.data.offset;
   n1 = n2 = 0;
   size1 = size2 = 0;

   // Compute address/width for first load
   for (d = 0; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         if (size1 && (addr1 & 0x7))
            break;
         def1[n1] = ld1->getDef(d);
         size1 += def1[n1++]->reg.size;
      } else
      if (!n1) {
         addr1 += ld1->getDef(d)->reg.size;
      } else {
         break;
      }
   }

   // Scale back the size of the first load until it can be loaded. This
   // typically happens for TYPE_B96 loads.
   while (n1 &&
          !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
                                                typeOfSize(size1))) {
      size1 -= def1[--n1]->reg.size;
      d--;
   }

   // Compute address/width for second load
   for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         assert(!size2 || !(addr2 & 0x7));
         def2[n2] = ld1->getDef(d);
         size2 += def2[n2++]->reg.size;
      } else
      if (!n2) {
         addr2 += ld1->getDef(d)->reg.size;
      } else {
         break;
      }
   }

   // Make sure that we've processed all the values
   for (; ld1->defExists(d); ++d)
      assert(!(mask & (1 << d)));

   updateLdStOffset(ld1, addr1, func);
   ld1->setType(typeOfSize(size1));
   for (d = 0; d < 4; ++d)
      ld1->setDef(d, (d < n1) ? def1[d] : NULL);

   if (!n2)
      return;

   ld2 = cloneShallow(func, ld1);
   updateLdStOffset(ld2, addr2, func);
   ld2->setType(typeOfSize(size2));
   for (d = 0; d < 4; ++d)
      ld2->setDef(d, (d < n2) ? def2[d] : NULL);

   ld1->bb->insertAfter(ld1, ld2);
}

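// Illustrative sketch of the split above (registers and offsets made up): a
// b128 load of $r0..$r3 at +0x0 whose $r1 is never used becomes a b32 load
// of $r0 at +0x0 plus a b64 load of $r2,$r3 at +0x8.
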
// =============================================================================

#define RUN_PASS(l, n, f)                    \
   if (level >= (l)) {                       \
      if (dbgFlags & NV50_IR_DEBUG_VERBOSE)  \
         INFO("PEEPHOLE: %s\n", #n);         \
      n pass;                                \
      if (!pass.f(this))                     \
         return false;                       \
   }

bool
Program::optimizeSSA(int level)
{
   RUN_PASS(1, DeadCodeElim, buryAll);
   RUN_PASS(1, CopyPropagation, run);
   RUN_PASS(1, MergeSplits, run);
   RUN_PASS(2, GlobalCSE, run);
   RUN_PASS(1, LocalCSE, run);
   RUN_PASS(2, AlgebraicOpt, run);
   RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
   RUN_PASS(1, ConstantFolding, foldAll);
   RUN_PASS(1, LoadPropagation, run);
   RUN_PASS(1, IndirectPropagation, run);
   RUN_PASS(2, MemoryOpt, run);
   RUN_PASS(2, LocalCSE, run);
   RUN_PASS(0, DeadCodeElim, buryAll);

   return true;
}

bool
Program::optimizePostRA(int level)
{
   RUN_PASS(2, FlatteningPass, run);
   if (getTarget()->getChipset() < 0xc0)
      RUN_PASS(2, NV50PostRaConstantFolding, run);

   return true;
}