/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_target.h"
25 #include "codegen/nv50_ir_build_util.h"
28 #include "util/u_math.h"
34 Instruction::isNop() const
36 if (op
== OP_PHI
|| op
== OP_SPLIT
|| op
== OP_MERGE
|| op
== OP_CONSTRAINT
)
38 if (terminator
|| join
) // XXX: should terminator imply flow ?
42 if (!fixed
&& op
== OP_NOP
)
45 if (defExists(0) && def(0).rep()->reg
.data
.id
< 0) {
46 for (int d
= 1; defExists(d
); ++d
)
47 if (def(d
).rep()->reg
.data
.id
>= 0)
48 WARN("part of vector result is unused !\n");
52 if (op
== OP_MOV
|| op
== OP_UNION
) {
53 if (!getDef(0)->equals(getSrc(0)))
56 if (!def(0).rep()->equals(getSrc(1)))
64 bool Instruction::isDead() const
69 op
== OP_SUSTB
|| op
== OP_SUSTP
|| op
== OP_SUREDP
|| op
== OP_SUREDB
||
73 for (int d
= 0; defExists(d
); ++d
)
74 if (getDef(d
)->refCount() || getDef(d
)->reg
.data
.id
>= 0)
77 if (terminator
|| asFlow())
85 // =============================================================================
87 class CopyPropagation
: public Pass
90 virtual bool visit(BasicBlock
*);
93 // Propagate all MOVs forward to make subsequent optimization easier, except if
94 // the sources stem from a phi, in which case we don't want to mess up potential
95 // swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
97 CopyPropagation::visit(BasicBlock
*bb
)
99 Instruction
*mov
, *si
, *next
;
101 for (mov
= bb
->getEntry(); mov
; mov
= next
) {
103 if (mov
->op
!= OP_MOV
|| mov
->fixed
|| !mov
->getSrc(0)->asLValue())
105 if (mov
->getPredicate())
107 if (mov
->def(0).getFile() != mov
->src(0).getFile())
109 si
= mov
->getSrc(0)->getInsn();
110 if (mov
->getDef(0)->reg
.data
.id
< 0 && si
&& si
->op
!= OP_PHI
) {
112 mov
->def(0).replace(mov
->getSrc(0), false);
113 delete_Instruction(prog
, mov
);
119 // =============================================================================
121 class MergeSplits
: public Pass
124 virtual bool visit(BasicBlock
*);
127 // For SPLIT / MERGE pairs that operate on the same registers, replace the
128 // post-merge def with the SPLIT's source.
130 MergeSplits::visit(BasicBlock
*bb
)
132 Instruction
*i
, *next
, *si
;
134 for (i
= bb
->getEntry(); i
; i
= next
) {
136 if (i
->op
!= OP_MERGE
|| typeSizeof(i
->dType
) != 8)
138 si
= i
->getSrc(0)->getInsn();
139 if (si
->op
!= OP_SPLIT
|| si
!= i
->getSrc(1)->getInsn())
141 i
->def(0).replace(si
->getSrc(0), false);
142 delete_Instruction(prog
, i
);
148 // =============================================================================
150 class LoadPropagation
: public Pass
153 virtual bool visit(BasicBlock
*);
155 void checkSwapSrc01(Instruction
*);
157 bool isCSpaceLoad(Instruction
*);
158 bool isImmdLoad(Instruction
*);
159 bool isAttribOrSharedLoad(Instruction
*);
163 LoadPropagation::isCSpaceLoad(Instruction
*ld
)
165 return ld
&& ld
->op
== OP_LOAD
&& ld
->src(0).getFile() == FILE_MEMORY_CONST
;
169 LoadPropagation::isImmdLoad(Instruction
*ld
)
171 if (!ld
|| (ld
->op
!= OP_MOV
) ||
172 ((typeSizeof(ld
->dType
) != 4) && (typeSizeof(ld
->dType
) != 8)))
175 // A 0 can be replaced with a register, so it doesn't count as an immediate.
177 return ld
->src(0).getImmediate(val
) && !val
.isInteger(0);
181 LoadPropagation::isAttribOrSharedLoad(Instruction
*ld
)
184 (ld
->op
== OP_VFETCH
||
185 (ld
->op
== OP_LOAD
&&
186 (ld
->src(0).getFile() == FILE_SHADER_INPUT
||
187 ld
->src(0).getFile() == FILE_MEMORY_SHARED
)));
191 LoadPropagation::checkSwapSrc01(Instruction
*insn
)
193 const Target
*targ
= prog
->getTarget();
194 if (!targ
->getOpInfo(insn
).commutative
) {
195 if (insn
->op
!= OP_SET
&& insn
->op
!= OP_SLCT
&&
196 insn
->op
!= OP_SUB
&& insn
->op
!= OP_XMAD
)
198 // XMAD is only commutative if both the CBCC and MRG flags are not set.
199 if (insn
->op
== OP_XMAD
&&
200 (insn
->subOp
& NV50_IR_SUBOP_XMAD_CMODE_MASK
) == NV50_IR_SUBOP_XMAD_CBCC
)
202 if (insn
->op
== OP_XMAD
&& (insn
->subOp
& NV50_IR_SUBOP_XMAD_MRG
))
205 if (insn
->src(1).getFile() != FILE_GPR
)
207 // This is the special OP_SET used for alphatesting, we can't reverse its
208 // arguments as that will confuse the fixup code.
209 if (insn
->op
== OP_SET
&& insn
->subOp
)
212 Instruction
*i0
= insn
->getSrc(0)->getInsn();
213 Instruction
*i1
= insn
->getSrc(1)->getInsn();
215 // Swap sources to inline the less frequently used source. That way,
216 // optimistically, it will eventually be able to remove the instruction.
217 int i0refs
= insn
->getSrc(0)->refCount();
218 int i1refs
= insn
->getSrc(1)->refCount();
220 if ((isCSpaceLoad(i0
) || isImmdLoad(i0
)) && targ
->insnCanLoad(insn
, 1, i0
)) {
221 if ((!isImmdLoad(i1
) && !isCSpaceLoad(i1
)) ||
222 !targ
->insnCanLoad(insn
, 1, i1
) ||
224 insn
->swapSources(0, 1);
228 if (isAttribOrSharedLoad(i1
)) {
229 if (!isAttribOrSharedLoad(i0
))
230 insn
->swapSources(0, 1);
237 if (insn
->op
== OP_SET
|| insn
->op
== OP_SET_AND
||
238 insn
->op
== OP_SET_OR
|| insn
->op
== OP_SET_XOR
)
239 insn
->asCmp()->setCond
= reverseCondCode(insn
->asCmp()->setCond
);
241 if (insn
->op
== OP_SLCT
)
242 insn
->asCmp()->setCond
= inverseCondCode(insn
->asCmp()->setCond
);
244 if (insn
->op
== OP_SUB
) {
245 insn
->src(0).mod
= insn
->src(0).mod
^ Modifier(NV50_IR_MOD_NEG
);
246 insn
->src(1).mod
= insn
->src(1).mod
^ Modifier(NV50_IR_MOD_NEG
);
248 if (insn
->op
== OP_XMAD
) {
250 uint16_t h1
= (insn
->subOp
>> 1 & NV50_IR_SUBOP_XMAD_H1(0)) |
251 (insn
->subOp
<< 1 & NV50_IR_SUBOP_XMAD_H1(1));
252 insn
->subOp
= (insn
->subOp
& ~NV50_IR_SUBOP_XMAD_H1_MASK
) | h1
;
257 LoadPropagation::visit(BasicBlock
*bb
)
259 const Target
*targ
= prog
->getTarget();
262 for (Instruction
*i
= bb
->getEntry(); i
; i
= next
) {
265 if (i
->op
== OP_CALL
) // calls have args as sources, they must be in regs
268 if (i
->op
== OP_PFETCH
) // pfetch expects arg1 to be a reg
274 for (int s
= 0; i
->srcExists(s
); ++s
) {
275 Instruction
*ld
= i
->getSrc(s
)->getInsn();
277 if (!ld
|| ld
->fixed
|| (ld
->op
!= OP_LOAD
&& ld
->op
!= OP_MOV
))
279 if (!targ
->insnCanLoad(i
, s
, ld
))
283 i
->setSrc(s
, ld
->getSrc(0));
284 if (ld
->src(0).isIndirect(0))
285 i
->setIndirect(s
, 0, ld
->getIndirect(0, 0));
287 if (ld
->getDef(0)->refCount() == 0)
288 delete_Instruction(prog
, ld
);
294 // =============================================================================
296 class IndirectPropagation
: public Pass
299 virtual bool visit(BasicBlock
*);
305 IndirectPropagation::visit(BasicBlock
*bb
)
307 const Target
*targ
= prog
->getTarget();
310 for (Instruction
*i
= bb
->getEntry(); i
; i
= next
) {
313 bld
.setPosition(i
, false);
315 for (int s
= 0; i
->srcExists(s
); ++s
) {
318 if (!i
->src(s
).isIndirect(0))
320 insn
= i
->getIndirect(s
, 0)->getInsn();
323 if (insn
->op
== OP_ADD
&& !isFloatType(insn
->dType
)) {
324 if (insn
->src(0).getFile() != targ
->nativeFile(FILE_ADDRESS
) ||
325 !insn
->src(1).getImmediate(imm
) ||
326 !targ
->insnCanLoadOffset(i
, s
, imm
.reg
.data
.s32
))
328 i
->setIndirect(s
, 0, insn
->getSrc(0));
329 i
->setSrc(s
, cloneShallow(func
, i
->getSrc(s
)));
330 i
->src(s
).get()->reg
.data
.offset
+= imm
.reg
.data
.u32
;
331 } else if (insn
->op
== OP_SUB
&& !isFloatType(insn
->dType
)) {
332 if (insn
->src(0).getFile() != targ
->nativeFile(FILE_ADDRESS
) ||
333 !insn
->src(1).getImmediate(imm
) ||
334 !targ
->insnCanLoadOffset(i
, s
, -imm
.reg
.data
.s32
))
336 i
->setIndirect(s
, 0, insn
->getSrc(0));
337 i
->setSrc(s
, cloneShallow(func
, i
->getSrc(s
)));
338 i
->src(s
).get()->reg
.data
.offset
-= imm
.reg
.data
.u32
;
339 } else if (insn
->op
== OP_MOV
) {
340 if (!insn
->src(0).getImmediate(imm
) ||
341 !targ
->insnCanLoadOffset(i
, s
, imm
.reg
.data
.s32
))
343 i
->setIndirect(s
, 0, NULL
);
344 i
->setSrc(s
, cloneShallow(func
, i
->getSrc(s
)));
345 i
->src(s
).get()->reg
.data
.offset
+= imm
.reg
.data
.u32
;
346 } else if (insn
->op
== OP_SHLADD
) {
347 if (!insn
->src(2).getImmediate(imm
) ||
348 !targ
->insnCanLoadOffset(i
, s
, imm
.reg
.data
.s32
))
350 i
->setIndirect(s
, 0, bld
.mkOp2v(
351 OP_SHL
, TYPE_U32
, bld
.getSSA(), insn
->getSrc(0), insn
->getSrc(1)));
352 i
->setSrc(s
, cloneShallow(func
, i
->getSrc(s
)));
353 i
->src(s
).get()->reg
.data
.offset
+= imm
.reg
.data
.u32
;
360 // =============================================================================
362 // Evaluate constant expressions.
363 class ConstantFolding
: public Pass
366 bool foldAll(Program
*);
369 virtual bool visit(BasicBlock
*);
371 void expr(Instruction
*, ImmediateValue
&, ImmediateValue
&);
372 void expr(Instruction
*, ImmediateValue
&, ImmediateValue
&, ImmediateValue
&);
373 /* true if i was deleted */
374 bool opnd(Instruction
*i
, ImmediateValue
&, int s
);
375 void opnd3(Instruction
*, ImmediateValue
&);
377 void unary(Instruction
*, const ImmediateValue
&);
379 void tryCollapseChainedMULs(Instruction
*, const int s
, ImmediateValue
&);
381 CmpInstruction
*findOriginForTestWithZero(Value
*);
383 bool createMul(DataType ty
, Value
*def
, Value
*a
, int64_t b
, Value
*c
);
385 unsigned int foldCount
;
390 // TODO: remember generated immediates and only revisit these
392 ConstantFolding::foldAll(Program
*prog
)
394 unsigned int iterCount
= 0;
399 } while (foldCount
&& ++iterCount
< 2);
404 ConstantFolding::visit(BasicBlock
*bb
)
406 Instruction
*i
, *next
;
408 for (i
= bb
->getEntry(); i
; i
= next
) {
410 if (i
->op
== OP_MOV
|| i
->op
== OP_CALL
)
413 ImmediateValue src0
, src1
, src2
;
415 if (i
->srcExists(2) &&
416 i
->src(0).getImmediate(src0
) &&
417 i
->src(1).getImmediate(src1
) &&
418 i
->src(2).getImmediate(src2
)) {
419 expr(i
, src0
, src1
, src2
);
421 if (i
->srcExists(1) &&
422 i
->src(0).getImmediate(src0
) && i
->src(1).getImmediate(src1
)) {
425 if (i
->srcExists(0) && i
->src(0).getImmediate(src0
)) {
426 if (opnd(i
, src0
, 0))
429 if (i
->srcExists(1) && i
->src(1).getImmediate(src1
)) {
430 if (opnd(i
, src1
, 1))
433 if (i
->srcExists(2) && i
->src(2).getImmediate(src2
))
440 ConstantFolding::findOriginForTestWithZero(Value
*value
)
444 Instruction
*insn
= value
->getInsn();
448 if (insn
->asCmp() && insn
->op
!= OP_SLCT
)
449 return insn
->asCmp();
451 /* Sometimes mov's will sneak in as a result of other folding. This gets
454 if (insn
->op
== OP_MOV
)
455 return findOriginForTestWithZero(insn
->getSrc(0));
457 /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
458 if (insn
->op
== OP_AND
) {
461 if (!insn
->src(s
).getImmediate(imm
)) {
463 if (!insn
->src(s
).getImmediate(imm
))
466 if (imm
.reg
.data
.f32
!= 1.0f
)
468 /* TODO: Come up with a way to handle the condition being inverted */
469 if (insn
->src(!s
).mod
!= Modifier(0))
471 return findOriginForTestWithZero(insn
->getSrc(!s
));
478 Modifier::applyTo(ImmediateValue
& imm
) const
480 if (!bits
) // avoid failure if imm.reg.type is unhandled (e.g. b128)
482 switch (imm
.reg
.type
) {
484 if (bits
& NV50_IR_MOD_ABS
)
485 imm
.reg
.data
.f32
= fabsf(imm
.reg
.data
.f32
);
486 if (bits
& NV50_IR_MOD_NEG
)
487 imm
.reg
.data
.f32
= -imm
.reg
.data
.f32
;
488 if (bits
& NV50_IR_MOD_SAT
) {
489 if (imm
.reg
.data
.f32
< 0.0f
)
490 imm
.reg
.data
.f32
= 0.0f
;
492 if (imm
.reg
.data
.f32
> 1.0f
)
493 imm
.reg
.data
.f32
= 1.0f
;
495 assert(!(bits
& NV50_IR_MOD_NOT
));
498 case TYPE_S8
: // NOTE: will be extended
501 case TYPE_U8
: // NOTE: treated as signed
504 if (bits
& NV50_IR_MOD_ABS
)
505 imm
.reg
.data
.s32
= (imm
.reg
.data
.s32
>= 0) ?
506 imm
.reg
.data
.s32
: -imm
.reg
.data
.s32
;
507 if (bits
& NV50_IR_MOD_NEG
)
508 imm
.reg
.data
.s32
= -imm
.reg
.data
.s32
;
509 if (bits
& NV50_IR_MOD_NOT
)
510 imm
.reg
.data
.s32
= ~imm
.reg
.data
.s32
;
514 if (bits
& NV50_IR_MOD_ABS
)
515 imm
.reg
.data
.f64
= fabs(imm
.reg
.data
.f64
);
516 if (bits
& NV50_IR_MOD_NEG
)
517 imm
.reg
.data
.f64
= -imm
.reg
.data
.f64
;
518 if (bits
& NV50_IR_MOD_SAT
) {
519 if (imm
.reg
.data
.f64
< 0.0)
520 imm
.reg
.data
.f64
= 0.0;
522 if (imm
.reg
.data
.f64
> 1.0)
523 imm
.reg
.data
.f64
= 1.0;
525 assert(!(bits
& NV50_IR_MOD_NOT
));
529 assert(!"invalid/unhandled type");
530 imm
.reg
.data
.u64
= 0;
536 Modifier::getOp() const
539 case NV50_IR_MOD_ABS
: return OP_ABS
;
540 case NV50_IR_MOD_NEG
: return OP_NEG
;
541 case NV50_IR_MOD_SAT
: return OP_SAT
;
542 case NV50_IR_MOD_NOT
: return OP_NOT
;
551 ConstantFolding::expr(Instruction
*i
,
552 ImmediateValue
&imm0
, ImmediateValue
&imm1
)
554 struct Storage
*const a
= &imm0
.reg
, *const b
= &imm1
.reg
;
556 DataType type
= i
->dType
;
558 memset(&res
.data
, 0, sizeof(res
.data
));
564 if (i
->dnz
&& i
->dType
== TYPE_F32
) {
565 if (!isfinite(a
->data
.f32
))
567 if (!isfinite(b
->data
.f32
))
572 res
.data
.f32
= a
->data
.f32
* b
->data
.f32
* exp2f(i
->postFactor
);
574 case TYPE_F64
: res
.data
.f64
= a
->data
.f64
* b
->data
.f64
; break;
576 if (i
->subOp
== NV50_IR_SUBOP_MUL_HIGH
) {
577 res
.data
.s32
= ((int64_t)a
->data
.s32
* b
->data
.s32
) >> 32;
582 if (i
->subOp
== NV50_IR_SUBOP_MUL_HIGH
) {
583 res
.data
.u32
= ((uint64_t)a
->data
.u32
* b
->data
.u32
) >> 32;
586 res
.data
.u32
= a
->data
.u32
* b
->data
.u32
; break;
592 if (b
->data
.u32
== 0)
595 case TYPE_F32
: res
.data
.f32
= a
->data
.f32
/ b
->data
.f32
; break;
596 case TYPE_F64
: res
.data
.f64
= a
->data
.f64
/ b
->data
.f64
; break;
597 case TYPE_S32
: res
.data
.s32
= a
->data
.s32
/ b
->data
.s32
; break;
598 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
/ b
->data
.u32
; break;
605 case TYPE_F32
: res
.data
.f32
= a
->data
.f32
+ b
->data
.f32
; break;
606 case TYPE_F64
: res
.data
.f64
= a
->data
.f64
+ b
->data
.f64
; break;
608 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
+ b
->data
.u32
; break;
615 case TYPE_F32
: res
.data
.f32
= a
->data
.f32
- b
->data
.f32
; break;
616 case TYPE_F64
: res
.data
.f64
= a
->data
.f64
- b
->data
.f64
; break;
618 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
- b
->data
.u32
; break;
625 case TYPE_F32
: res
.data
.f32
= pow(a
->data
.f32
, b
->data
.f32
); break;
626 case TYPE_F64
: res
.data
.f64
= pow(a
->data
.f64
, b
->data
.f64
); break;
633 case TYPE_F32
: res
.data
.f32
= MAX2(a
->data
.f32
, b
->data
.f32
); break;
634 case TYPE_F64
: res
.data
.f64
= MAX2(a
->data
.f64
, b
->data
.f64
); break;
635 case TYPE_S32
: res
.data
.s32
= MAX2(a
->data
.s32
, b
->data
.s32
); break;
636 case TYPE_U32
: res
.data
.u32
= MAX2(a
->data
.u32
, b
->data
.u32
); break;
643 case TYPE_F32
: res
.data
.f32
= MIN2(a
->data
.f32
, b
->data
.f32
); break;
644 case TYPE_F64
: res
.data
.f64
= MIN2(a
->data
.f64
, b
->data
.f64
); break;
645 case TYPE_S32
: res
.data
.s32
= MIN2(a
->data
.s32
, b
->data
.s32
); break;
646 case TYPE_U32
: res
.data
.u32
= MIN2(a
->data
.u32
, b
->data
.u32
); break;
652 res
.data
.u64
= a
->data
.u64
& b
->data
.u64
;
655 res
.data
.u64
= a
->data
.u64
| b
->data
.u64
;
658 res
.data
.u64
= a
->data
.u64
^ b
->data
.u64
;
661 res
.data
.u32
= a
->data
.u32
<< b
->data
.u32
;
665 case TYPE_S32
: res
.data
.s32
= a
->data
.s32
>> b
->data
.u32
; break;
666 case TYPE_U32
: res
.data
.u32
= a
->data
.u32
>> b
->data
.u32
; break;
672 if (a
->data
.u32
!= b
->data
.u32
)
674 res
.data
.u32
= a
->data
.u32
;
677 int offset
= b
->data
.u32
& 0xff;
678 int width
= (b
->data
.u32
>> 8) & 0xff;
685 if (width
+ offset
< 32) {
687 lshift
= 32 - width
- offset
;
689 if (i
->subOp
== NV50_IR_SUBOP_EXTBF_REV
)
690 res
.data
.u32
= util_bitreverse(a
->data
.u32
);
692 res
.data
.u32
= a
->data
.u32
;
694 case TYPE_S32
: res
.data
.s32
= (res
.data
.s32
<< lshift
) >> rshift
; break;
695 case TYPE_U32
: res
.data
.u32
= (res
.data
.u32
<< lshift
) >> rshift
; break;
702 res
.data
.u32
= util_bitcount(a
->data
.u32
& b
->data
.u32
);
705 // The two arguments to pfetch are logically added together. Normally
706 // the second argument will not be constant, but that can happen.
707 res
.data
.u32
= a
->data
.u32
+ b
->data
.u32
;
715 res
.data
.u64
= (((uint64_t)b
->data
.u32
) << 32) | a
->data
.u32
;
726 i
->src(0).mod
= Modifier(0);
727 i
->src(1).mod
= Modifier(0);
730 i
->setSrc(0, new_ImmediateValue(i
->bb
->getProgram(), res
.data
.u32
));
733 i
->getSrc(0)->reg
.data
= res
.data
;
734 i
->getSrc(0)->reg
.type
= type
;
735 i
->getSrc(0)->reg
.size
= typeSizeof(type
);
740 ImmediateValue src0
, src1
= *i
->getSrc(0)->asImm();
742 // Move the immediate into position 1, where we know it might be
743 // emittable. However it might not be anyways, as there may be other
744 // restrictions, so move it into a separate LValue.
745 bld
.setPosition(i
, false);
748 i
->setSrc(1, bld
.mkMov(bld
.getSSA(type
), i
->getSrc(0), type
)->getDef(0));
749 i
->setSrc(0, i
->getSrc(2));
750 i
->src(0).mod
= i
->src(2).mod
;
753 if (i
->src(0).getImmediate(src0
))
760 // Leave PFETCH alone... we just folded its 2 args into 1.
763 i
->op
= i
->saturate
? OP_SAT
: OP_MOV
;
765 unary(i
, *i
->getSrc(0)->asImm());
772 ConstantFolding::expr(Instruction
*i
,
773 ImmediateValue
&imm0
,
774 ImmediateValue
&imm1
,
775 ImmediateValue
&imm2
)
777 struct Storage
*const a
= &imm0
.reg
, *const b
= &imm1
.reg
, *const c
= &imm2
.reg
;
780 memset(&res
.data
, 0, sizeof(res
.data
));
784 int offset
= b
->data
.u32
& 0xff;
785 int width
= (b
->data
.u32
>> 8) & 0xff;
786 unsigned bitmask
= ((1 << width
) - 1) << offset
;
787 res
.data
.u32
= ((a
->data
.u32
<< offset
) & bitmask
) | (c
->data
.u32
& ~bitmask
);
794 res
.data
.f32
= a
->data
.f32
* b
->data
.f32
* exp2f(i
->postFactor
) +
798 res
.data
.f64
= a
->data
.f64
* b
->data
.f64
+ c
->data
.f64
;
801 if (i
->subOp
== NV50_IR_SUBOP_MUL_HIGH
) {
802 res
.data
.s32
= ((int64_t)a
->data
.s32
* b
->data
.s32
>> 32) + c
->data
.s32
;
807 if (i
->subOp
== NV50_IR_SUBOP_MUL_HIGH
) {
808 res
.data
.u32
= ((uint64_t)a
->data
.u32
* b
->data
.u32
>> 32) + c
->data
.u32
;
811 res
.data
.u32
= a
->data
.u32
* b
->data
.u32
+ c
->data
.u32
;
819 res
.data
.u32
= (a
->data
.u32
<< b
->data
.u32
) + c
->data
.u32
;
826 i
->src(0).mod
= Modifier(0);
827 i
->src(1).mod
= Modifier(0);
828 i
->src(2).mod
= Modifier(0);
830 i
->setSrc(0, new_ImmediateValue(i
->bb
->getProgram(), res
.data
.u32
));
834 i
->getSrc(0)->reg
.data
= res
.data
;
835 i
->getSrc(0)->reg
.type
= i
->dType
;
836 i
->getSrc(0)->reg
.size
= typeSizeof(i
->dType
);
842 ConstantFolding::unary(Instruction
*i
, const ImmediateValue
&imm
)
846 if (i
->dType
!= TYPE_F32
)
849 case OP_NEG
: res
.data
.f32
= -imm
.reg
.data
.f32
; break;
850 case OP_ABS
: res
.data
.f32
= fabsf(imm
.reg
.data
.f32
); break;
851 case OP_SAT
: res
.data
.f32
= SATURATE(imm
.reg
.data
.f32
); break;
852 case OP_RCP
: res
.data
.f32
= 1.0f
/ imm
.reg
.data
.f32
; break;
853 case OP_RSQ
: res
.data
.f32
= 1.0f
/ sqrtf(imm
.reg
.data
.f32
); break;
854 case OP_LG2
: res
.data
.f32
= log2f(imm
.reg
.data
.f32
); break;
855 case OP_EX2
: res
.data
.f32
= exp2f(imm
.reg
.data
.f32
); break;
856 case OP_SIN
: res
.data
.f32
= sinf(imm
.reg
.data
.f32
); break;
857 case OP_COS
: res
.data
.f32
= cosf(imm
.reg
.data
.f32
); break;
858 case OP_SQRT
: res
.data
.f32
= sqrtf(imm
.reg
.data
.f32
); break;
861 // these should be handled in subsequent OP_SIN/COS/EX2
862 res
.data
.f32
= imm
.reg
.data
.f32
;
868 i
->setSrc(0, new_ImmediateValue(i
->bb
->getProgram(), res
.data
.f32
));
869 i
->src(0).mod
= Modifier(0);
873 ConstantFolding::tryCollapseChainedMULs(Instruction
*mul2
,
874 const int s
, ImmediateValue
& imm2
)
876 const int t
= s
? 0 : 1;
878 Instruction
*mul1
= NULL
; // mul1 before mul2
880 float f
= imm2
.reg
.data
.f32
* exp2f(mul2
->postFactor
);
883 assert(mul2
->op
== OP_MUL
&& mul2
->dType
== TYPE_F32
);
885 if (mul2
->getSrc(t
)->refCount() == 1) {
886 insn
= mul2
->getSrc(t
)->getInsn();
887 if (!mul2
->src(t
).mod
&& insn
->op
== OP_MUL
&& insn
->dType
== TYPE_F32
)
889 if (mul1
&& !mul1
->saturate
) {
892 if (mul1
->src(s1
= 0).getImmediate(imm1
) ||
893 mul1
->src(s1
= 1).getImmediate(imm1
)) {
894 bld
.setPosition(mul1
, false);
896 // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
897 mul1
->setSrc(s1
, bld
.loadImm(NULL
, f
* imm1
.reg
.data
.f32
));
898 mul1
->src(s1
).mod
= Modifier(0);
899 mul2
->def(0).replace(mul1
->getDef(0), false);
900 mul1
->saturate
= mul2
->saturate
;
902 if (prog
->getTarget()->isPostMultiplySupported(OP_MUL
, f
, e
)) {
904 // d = mul c, imm -> d = mul_x_imm a, b
905 mul1
->postFactor
= e
;
906 mul2
->def(0).replace(mul1
->getDef(0), false);
908 mul1
->src(0).mod
*= Modifier(NV50_IR_MOD_NEG
);
909 mul1
->saturate
= mul2
->saturate
;
914 if (mul2
->getDef(0)->refCount() == 1 && !mul2
->saturate
) {
916 // d = mul b, c -> d = mul_x_imm a, c
918 insn
= (*mul2
->getDef(0)->uses
.begin())->getInsn();
923 s2
= insn
->getSrc(0) == mul1
->getDef(0) ? 0 : 1;
925 if (insn
->op
== OP_MUL
&& insn
->dType
== TYPE_F32
)
926 if (!insn
->src(s2
).mod
&& !insn
->src(t2
).getImmediate(imm1
))
928 if (mul2
&& prog
->getTarget()->isPostMultiplySupported(OP_MUL
, f
, e
)) {
929 mul2
->postFactor
= e
;
930 mul2
->setSrc(s2
, mul1
->src(t
));
932 mul2
->src(s2
).mod
*= Modifier(NV50_IR_MOD_NEG
);
938 ConstantFolding::opnd3(Instruction
*i
, ImmediateValue
&imm2
)
943 if (imm2
.isInteger(0)) {
951 if (imm2
.isInteger(0)) {
964 ConstantFolding::createMul(DataType ty
, Value
*def
, Value
*a
, int64_t b
, Value
*c
)
966 const Target
*target
= prog
->getTarget();
967 int64_t absB
= llabs(b
);
969 //a * (2^shl) -> a << shl
970 if (b
>= 0 && util_is_power_of_two_or_zero64(b
)) {
971 int shl
= util_logbase2_64(b
);
973 Value
*res
= c
? bld
.getSSA(typeSizeof(ty
)) : def
;
974 bld
.mkOp2(OP_SHL
, ty
, res
, a
, bld
.mkImm(shl
));
976 bld
.mkOp2(OP_ADD
, ty
, def
, res
, c
);
981 //a * (2^shl + 1) -> a << shl + a
982 //a * -(2^shl + 1) -> -a << shl + a
983 //a * (2^shl - 1) -> a << shl - a
984 //a * -(2^shl - 1) -> -a << shl - a
985 if (typeSizeof(ty
) == 4 &&
986 (util_is_power_of_two_or_zero64(absB
- 1) ||
987 util_is_power_of_two_or_zero64(absB
+ 1)) &&
988 target
->isOpSupported(OP_SHLADD
, TYPE_U32
)) {
989 bool subA
= util_is_power_of_two_or_zero64(absB
+ 1);
990 int shl
= subA
? util_logbase2_64(absB
+ 1) : util_logbase2_64(absB
- 1);
992 Value
*res
= c
? bld
.getSSA() : def
;
993 Instruction
*insn
= bld
.mkOp3(OP_SHLADD
, TYPE_U32
, res
, a
, bld
.mkImm(shl
), a
);
995 insn
->src(0).mod
= Modifier(NV50_IR_MOD_NEG
);
997 insn
->src(2).mod
= Modifier(NV50_IR_MOD_NEG
);
1000 bld
.mkOp2(OP_ADD
, TYPE_U32
, def
, res
, c
);
1005 if (typeSizeof(ty
) == 4 && b
>= 0 && b
<= 0xffff &&
1006 target
->isOpSupported(OP_XMAD
, TYPE_U32
)) {
1007 Value
*tmp
= bld
.mkOp3v(OP_XMAD
, TYPE_U32
, bld
.getSSA(),
1008 a
, bld
.mkImm((uint32_t)b
), c
? c
: bld
.mkImm(0));
1009 bld
.mkOp3(OP_XMAD
, TYPE_U32
, def
, a
, bld
.mkImm((uint32_t)b
), tmp
)->subOp
=
1010 NV50_IR_SUBOP_XMAD_PSL
| NV50_IR_SUBOP_XMAD_H1(0);
1019 ConstantFolding::opnd(Instruction
*i
, ImmediateValue
&imm0
, int s
)
1022 const operation op
= i
->op
;
1023 Instruction
*newi
= i
;
1024 bool deleted
= false;
1028 bld
.setPosition(i
, false);
1030 uint8_t size
= i
->getDef(0)->reg
.size
;
1031 uint8_t bitsize
= size
* 8;
1032 uint32_t mask
= (1ULL << bitsize
) - 1;
1033 assert(bitsize
<= 32);
1035 uint64_t val
= imm0
.reg
.data
.u64
;
1036 for (int8_t d
= 0; i
->defExists(d
); ++d
) {
1037 Value
*def
= i
->getDef(d
);
1038 assert(def
->reg
.size
== size
);
1040 newi
= bld
.mkMov(def
, bld
.mkImm((uint32_t)(val
& mask
)), TYPE_U32
);
1043 delete_Instruction(prog
, i
);
1048 if (i
->dType
== TYPE_F32
&& !i
->precise
)
1049 tryCollapseChainedMULs(i
, s
, imm0
);
1051 if (i
->subOp
== NV50_IR_SUBOP_MUL_HIGH
) {
1052 assert(!isFloatType(i
->sType
));
1053 if (imm0
.isInteger(1) && i
->dType
== TYPE_S32
) {
1054 bld
.setPosition(i
, false);
1055 // Need to set to the sign value, which is a compare.
1056 newi
= bld
.mkCmp(OP_SET
, CC_LT
, TYPE_S32
, i
->getDef(0),
1057 TYPE_S32
, i
->getSrc(t
), bld
.mkImm(0));
1058 delete_Instruction(prog
, i
);
1060 } else if (imm0
.isInteger(0) || imm0
.isInteger(1)) {
1061 // The high bits can't be set in this case (either mul by 0 or
1065 i
->setSrc(0, new_ImmediateValue(prog
, 0u));
1066 i
->src(0).mod
= Modifier(0);
1068 } else if (!imm0
.isNegative() && imm0
.isPow2()) {
1069 // Translate into a shift
1073 imm0
.reg
.data
.u32
= 32 - imm0
.reg
.data
.u32
;
1074 i
->setSrc(0, i
->getSrc(t
));
1075 i
->src(0).mod
= i
->src(t
).mod
;
1076 i
->setSrc(1, new_ImmediateValue(prog
, imm0
.reg
.data
.u32
));
1080 if (imm0
.isInteger(0)) {
1082 i
->setSrc(0, new_ImmediateValue(prog
, 0u));
1083 i
->src(0).mod
= Modifier(0);
1087 if (!i
->postFactor
&& (imm0
.isInteger(1) || imm0
.isInteger(-1))) {
1088 if (imm0
.isNegative())
1089 i
->src(t
).mod
= i
->src(t
).mod
^ Modifier(NV50_IR_MOD_NEG
);
1090 i
->op
= i
->src(t
).mod
.getOp();
1092 i
->setSrc(0, i
->getSrc(1));
1093 i
->src(0).mod
= i
->src(1).mod
;
1096 if (i
->op
!= OP_CVT
)
1100 if (!i
->postFactor
&& (imm0
.isInteger(2) || imm0
.isInteger(-2))) {
1101 if (imm0
.isNegative())
1102 i
->src(t
).mod
= i
->src(t
).mod
^ Modifier(NV50_IR_MOD_NEG
);
1105 i
->setSrc(s
, i
->getSrc(t
));
1106 i
->src(s
).mod
= i
->src(t
).mod
;
1108 if (!isFloatType(i
->dType
) && !i
->src(t
).mod
) {
1109 bld
.setPosition(i
, false);
1110 int64_t b
= typeSizeof(i
->dType
) == 8 ? imm0
.reg
.data
.s64
: imm0
.reg
.data
.s32
;
1111 if (createMul(i
->dType
, i
->getDef(0), i
->getSrc(t
), b
, NULL
)) {
1112 delete_Instruction(prog
, i
);
1116 if (i
->postFactor
&& i
->sType
== TYPE_F32
) {
1117 /* Can't emit a postfactor with an immediate, have to fold it in */
1118 i
->setSrc(s
, new_ImmediateValue(
1119 prog
, imm0
.reg
.data
.f32
* exp2f(i
->postFactor
)));
1125 if (imm0
.isInteger(0)) {
1126 i
->setSrc(0, i
->getSrc(2));
1127 i
->src(0).mod
= i
->src(2).mod
;
1130 i
->op
= i
->src(0).mod
.getOp();
1131 if (i
->op
!= OP_CVT
)
1134 if (i
->subOp
!= NV50_IR_SUBOP_MUL_HIGH
&&
1135 (imm0
.isInteger(1) || imm0
.isInteger(-1))) {
1136 if (imm0
.isNegative())
1137 i
->src(t
).mod
= i
->src(t
).mod
^ Modifier(NV50_IR_MOD_NEG
);
1139 i
->setSrc(0, i
->getSrc(1));
1140 i
->src(0).mod
= i
->src(1).mod
;
1142 i
->setSrc(1, i
->getSrc(2));
1143 i
->src(1).mod
= i
->src(2).mod
;
1148 if (!isFloatType(i
->dType
) && !i
->subOp
&& !i
->src(t
).mod
&& !i
->src(2).mod
) {
1149 bld
.setPosition(i
, false);
1150 int64_t b
= typeSizeof(i
->dType
) == 8 ? imm0
.reg
.data
.s64
: imm0
.reg
.data
.s32
;
1151 if (createMul(i
->dType
, i
->getDef(0), i
->getSrc(t
), b
, i
->getSrc(2))) {
1152 delete_Instruction(prog
, i
);
1158 if (imm0
.isInteger(0) && s
== 0 && typeSizeof(i
->dType
) == 8 &&
1159 !isFloatType(i
->dType
))
1165 if (imm0
.isInteger(0)) {
1167 i
->setSrc(0, i
->getSrc(1));
1168 i
->src(0).mod
= i
->src(1).mod
;
1169 if (i
->op
== OP_SUB
)
1170 i
->src(0).mod
= i
->src(0).mod
^ Modifier(NV50_IR_MOD_NEG
);
1173 i
->op
= i
->src(0).mod
.getOp();
1174 if (i
->op
!= OP_CVT
)
1175 i
->src(0).mod
= Modifier(0);
1180 if (s
!= 1 || (i
->dType
!= TYPE_S32
&& i
->dType
!= TYPE_U32
))
1182 bld
.setPosition(i
, false);
1183 if (imm0
.reg
.data
.u32
== 0) {
1186 if (imm0
.reg
.data
.u32
== 1) {
1190 if (i
->dType
== TYPE_U32
&& imm0
.isPow2()) {
1192 i
->setSrc(1, bld
.mkImm(util_logbase2(imm0
.reg
.data
.u32
)));
1194 if (i
->dType
== TYPE_U32
) {
1197 const uint32_t d
= imm0
.reg
.data
.u32
;
1200 uint32_t l
= util_logbase2(d
);
1201 if (((uint32_t)1 << l
) < d
)
1203 m
= (((uint64_t)1 << 32) * (((uint64_t)1 << l
) - d
)) / d
+ 1;
1205 s
= l
? (l
- 1) : 0;
1209 mul
= bld
.mkOp2(OP_MUL
, TYPE_U32
, tA
, i
->getSrc(0),
1210 bld
.loadImm(NULL
, m
));
1211 mul
->subOp
= NV50_IR_SUBOP_MUL_HIGH
;
1212 bld
.mkOp2(OP_SUB
, TYPE_U32
, tB
, i
->getSrc(0), tA
);
1215 bld
.mkOp2(OP_SHR
, TYPE_U32
, tA
, tB
, bld
.mkImm(r
));
1218 tB
= s
? bld
.getSSA() : i
->getDef(0);
1219 newi
= bld
.mkOp2(OP_ADD
, TYPE_U32
, tB
, mul
->getDef(0), tA
);
1221 bld
.mkOp2(OP_SHR
, TYPE_U32
, i
->getDef(0), tB
, bld
.mkImm(s
));
1223 delete_Instruction(prog
, i
);
1226 if (imm0
.reg
.data
.s32
== -1) {
1232 const int32_t d
= imm0
.reg
.data
.s32
;
1234 int32_t l
= util_logbase2(static_cast<unsigned>(abs(d
)));
1235 if ((1 << l
) < abs(d
))
1239 m
= ((uint64_t)1 << (32 + l
- 1)) / abs(d
) + 1 - ((uint64_t)1 << 32);
1243 bld
.mkOp3(OP_MAD
, TYPE_S32
, tA
, i
->getSrc(0), bld
.loadImm(NULL
, m
),
1244 i
->getSrc(0))->subOp
= NV50_IR_SUBOP_MUL_HIGH
;
1246 bld
.mkOp2(OP_SHR
, TYPE_S32
, tB
, tA
, bld
.mkImm(l
- 1));
1250 bld
.mkCmp(OP_SET
, CC_LT
, TYPE_S32
, tA
, TYPE_S32
, i
->getSrc(0), bld
.mkImm(0));
1251 tD
= (d
< 0) ? bld
.getSSA() : i
->getDef(0)->asLValue();
1252 newi
= bld
.mkOp2(OP_SUB
, TYPE_U32
, tD
, tB
, tA
);
1254 bld
.mkOp1(OP_NEG
, TYPE_S32
, i
->getDef(0), tB
);
1256 delete_Instruction(prog
, i
);
1262 if (s
== 1 && imm0
.isPow2()) {
1263 bld
.setPosition(i
, false);
1264 if (i
->sType
== TYPE_U32
) {
1266 i
->setSrc(1, bld
.loadImm(NULL
, imm0
.reg
.data
.u32
- 1));
1267 } else if (i
->sType
== TYPE_S32
) {
1268 // Do it on the absolute value of the input, and then restore the
1269 // sign. The only odd case is MIN_INT, but that should work out
1270 // as well, since MIN_INT mod any power of 2 is 0.
1272 // Technically we don't have to do any of this since MOD is
1273 // undefined with negative arguments in GLSL, but this seems like
1274 // the nice thing to do.
1275 Value
*abs
= bld
.mkOp1v(OP_ABS
, TYPE_S32
, bld
.getSSA(), i
->getSrc(0));
1276 Value
*neg
, *v1
, *v2
;
1277 bld
.mkCmp(OP_SET
, CC_LT
, TYPE_S32
,
1278 (neg
= bld
.getSSA(1, prog
->getTarget()->nativeFile(FILE_PREDICATE
))),
1279 TYPE_S32
, i
->getSrc(0), bld
.loadImm(NULL
, 0));
1280 Value
*mod
= bld
.mkOp2v(OP_AND
, TYPE_U32
, bld
.getSSA(), abs
,
1281 bld
.loadImm(NULL
, imm0
.reg
.data
.u32
- 1));
1282 bld
.mkOp1(OP_NEG
, TYPE_S32
, (v1
= bld
.getSSA()), mod
)
1283 ->setPredicate(CC_P
, neg
);
1284 bld
.mkOp1(OP_MOV
, TYPE_S32
, (v2
= bld
.getSSA()), mod
)
1285 ->setPredicate(CC_NOT_P
, neg
);
1286 newi
= bld
.mkOp2(OP_UNION
, TYPE_S32
, i
->getDef(0), v1
, v2
);
1288 delete_Instruction(prog
, i
);
1291 } else if (s
== 1) {
1292 // In this case, we still want the optimized lowering that we get
1293 // from having division by an immediate.
1295 // a % b == a - (a/b) * b
1296 bld
.setPosition(i
, false);
1297 Value
*div
= bld
.mkOp2v(OP_DIV
, i
->sType
, bld
.getSSA(),
1298 i
->getSrc(0), i
->getSrc(1));
1299 newi
= bld
.mkOp2(OP_ADD
, i
->sType
, i
->getDef(0), i
->getSrc(0),
1300 bld
.mkOp2v(OP_MUL
, i
->sType
, bld
.getSSA(), div
, i
->getSrc(1)));
1301 // TODO: Check that target supports this. In this case, we know that
1303 newi
->src(1).mod
= Modifier(NV50_IR_MOD_NEG
);
1305 delete_Instruction(prog
, i
);
1310 case OP_SET
: // TODO: SET_AND,OR,XOR
1312 /* This optimizes the case where the output of a set is being compared
1313 * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
1314 * can be a lot cleverer in our comparison.
1316 CmpInstruction
*si
= findOriginForTestWithZero(i
->getSrc(t
));
1318 if (imm0
.reg
.data
.u32
!= 0 || !si
)
1321 ccZ
= (CondCode
)((unsigned int)i
->asCmp()->setCond
& ~CC_U
);
1322 // We do everything assuming var (cmp) 0, reverse the condition if 0 is
1325 ccZ
= reverseCondCode(ccZ
);
1326 // If there is a negative modifier, we need to undo that, by flipping
1327 // the comparison to zero.
1328 if (i
->src(t
).mod
.neg())
1329 ccZ
= reverseCondCode(ccZ
);
1330 // If this is a signed comparison, we expect the input to be a regular
1331 // boolean, i.e. 0/-1. However the rest of the logic assumes that true
1332 // is positive, so just flip the sign.
1333 if (i
->sType
== TYPE_S32
) {
1334 assert(!isFloatType(si
->dType
));
1335 ccZ
= reverseCondCode(ccZ
);
1338 case CC_LT
: cc
= CC_FL
; break; // bool < 0 -- this is never true
1339 case CC_GE
: cc
= CC_TR
; break; // bool >= 0 -- this is always true
1340 case CC_EQ
: cc
= inverseCondCode(cc
); break; // bool == 0 -- !bool
1341 case CC_LE
: cc
= inverseCondCode(cc
); break; // bool <= 0 -- !bool
1342 case CC_GT
: break; // bool > 0 -- bool
1343 case CC_NE
: break; // bool != 0 -- bool
1348 // Update the condition of this SET to be identical to the origin set,
1349 // but with the updated condition code. The original SET should get
1352 i
->asCmp()->setCond
= cc
;
1353 i
->setSrc(0, si
->src(0));
1354 i
->setSrc(1, si
->src(1));
1355 if (si
->srcExists(2))
1356 i
->setSrc(2, si
->src(2));
1357 i
->sType
= si
->sType
;
1363 Instruction
*src
= i
->getSrc(t
)->getInsn();
1364 ImmediateValue imm1
;
1365 if (imm0
.reg
.data
.u32
== 0) {
1367 i
->setSrc(0, new_ImmediateValue(prog
, 0u));
1368 i
->src(0).mod
= Modifier(0);
1370 } else if (imm0
.reg
.data
.u32
== ~0U) {
1371 i
->op
= i
->src(t
).mod
.getOp();
1373 i
->setSrc(0, i
->getSrc(t
));
1374 i
->src(0).mod
= i
->src(t
).mod
;
1377 } else if (src
->asCmp()) {
1378 CmpInstruction
*cmp
= src
->asCmp();
1379 if (!cmp
|| cmp
->op
== OP_SLCT
|| cmp
->getDef(0)->refCount() > 1)
1381 if (!prog
->getTarget()->isOpSupported(cmp
->op
, TYPE_F32
))
1383 if (imm0
.reg
.data
.f32
!= 1.0)
1385 if (cmp
->dType
!= TYPE_U32
)
1388 cmp
->dType
= TYPE_F32
;
1389 if (i
->src(t
).mod
!= Modifier(0)) {
1390 assert(i
->src(t
).mod
== Modifier(NV50_IR_MOD_NOT
));
1391 i
->src(t
).mod
= Modifier(0);
1392 cmp
->setCond
= inverseCondCode(cmp
->setCond
);
1397 i
->setSrc(0, i
->getSrc(t
));
1400 } else if (prog
->getTarget()->isOpSupported(OP_EXTBF
, TYPE_U32
) &&
1401 src
->op
== OP_SHR
&&
1402 src
->src(1).getImmediate(imm1
) &&
1403 i
->src(t
).mod
== Modifier(0) &&
1404 util_is_power_of_two_or_zero(imm0
.reg
.data
.u32
+ 1)) {
1405 // low byte = offset, high byte = width
1406 uint32_t ext
= (util_last_bit(imm0
.reg
.data
.u32
) << 8) | imm1
.reg
.data
.u32
;
1408 i
->setSrc(0, src
->getSrc(0));
1409 i
->setSrc(1, new_ImmediateValue(prog
, ext
));
1410 } else if (src
->op
== OP_SHL
&&
1411 src
->src(1).getImmediate(imm1
) &&
1412 i
->src(t
).mod
== Modifier(0) &&
1413 util_is_power_of_two_or_zero(~imm0
.reg
.data
.u32
+ 1) &&
1414 util_last_bit(~imm0
.reg
.data
.u32
) <= imm1
.reg
.data
.u32
) {
1418 i
->setSrc(0, i
->getSrc(t
));
1427 if (s
!= 1 || i
->src(0).mod
!= Modifier(0))
1429 // try to concatenate shifts
1430 Instruction
*si
= i
->getSrc(0)->getInsn();
1433 ImmediateValue imm1
;
1436 if (si
->src(1).getImmediate(imm1
)) {
1437 bld
.setPosition(i
, false);
1438 i
->setSrc(0, si
->getSrc(0));
1439 i
->setSrc(1, bld
.loadImm(NULL
, imm0
.reg
.data
.u32
+ imm1
.reg
.data
.u32
));
1443 if (si
->src(1).getImmediate(imm1
) && imm0
.reg
.data
.u32
== imm1
.reg
.data
.u32
) {
1444 bld
.setPosition(i
, false);
1446 i
->setSrc(0, si
->getSrc(0));
1447 i
->setSrc(1, bld
.loadImm(NULL
, ~((1 << imm0
.reg
.data
.u32
) - 1)));
1452 if (isFloatType(si
->dType
))
1454 if (si
->src(1).getImmediate(imm1
))
1456 else if (si
->src(0).getImmediate(imm1
))
1461 bld
.setPosition(i
, false);
1463 i
->setSrc(0, si
->getSrc(!muls
));
1464 i
->setSrc(1, bld
.loadImm(NULL
, imm1
.reg
.data
.u32
<< imm0
.reg
.data
.u32
));
1469 if (isFloatType(si
->dType
))
1471 if (si
->op
!= OP_SUB
&& si
->src(0).getImmediate(imm1
))
1473 else if (si
->src(1).getImmediate(imm1
))
1477 if (si
->src(!adds
).mod
!= Modifier(0))
1479 // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))
1481 // This is more operations, but if one of x, y is an immediate, then
1482 // we can get a situation where (a) we can use ISCADD, or (b)
1483 // propagate the add bit into an indirect load.
1484 bld
.setPosition(i
, false);
1486 i
->setSrc(adds
, bld
.loadImm(NULL
, imm1
.reg
.data
.u32
<< imm0
.reg
.data
.u32
));
1487 i
->setSrc(!adds
, bld
.mkOp2v(OP_SHL
, i
->dType
,
1488 bld
.getSSA(i
->def(0).getSize(), i
->def(0).getFile()),
1490 bld
.mkImm(imm0
.reg
.data
.u32
)));
1515 case TYPE_S32
: res
= util_last_bit_signed(imm0
.reg
.data
.s32
) - 1; break;
1516 case TYPE_U32
: res
= util_last_bit(imm0
.reg
.data
.u32
) - 1; break;
1520 if (i
->subOp
== NV50_IR_SUBOP_BFIND_SAMT
&& res
>= 0)
1522 bld
.setPosition(i
, false); /* make sure bld is init'ed */
1523 i
->setSrc(0, bld
.mkImm(res
));
1530 // Only deal with 1-arg POPCNT here
1531 if (i
->srcExists(1))
1533 uint32_t res
= util_bitcount(imm0
.reg
.data
.u32
);
1534 i
->setSrc(0, new_ImmediateValue(i
->bb
->getProgram(), res
));
1542 // TODO: handle 64-bit values properly
1543 if (typeSizeof(i
->dType
) == 8 || typeSizeof(i
->sType
) == 8)
1546 // TODO: handle single byte/word extractions
1550 bld
.setPosition(i
, true); /* make sure bld is init'ed */
1552 #define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
1554 switch (i->sType) { \
1556 res.data.dst = util_iround(i->saturate ? \
1557 CLAMP(imm0.reg.data.f64, fmin, fmax) : \
1558 imm0.reg.data.f64); \
1561 res.data.dst = util_iround(i->saturate ? \
1562 CLAMP(imm0.reg.data.f32, fmin, fmax) : \
1563 imm0.reg.data.f32); \
1566 res.data.dst = i->saturate ? \
1567 CLAMP(imm0.reg.data.s32, imin, imax) : \
1568 imm0.reg.data.s32; \
1571 res.data.dst = i->saturate ? \
1572 CLAMP(imm0.reg.data.u32, umin, umax) : \
1573 imm0.reg.data.u32; \
1576 res.data.dst = i->saturate ? \
1577 CLAMP(imm0.reg.data.s16, imin, imax) : \
1578 imm0.reg.data.s16; \
1581 res.data.dst = i->saturate ? \
1582 CLAMP(imm0.reg.data.u16, umin, umax) : \
1583 imm0.reg.data.u16; \
1585 default: return false; \
1587 i->setSrc(0, bld.mkImm(res.data.dst)); \
1591 CASE(TYPE_U16
, u16
, 0, UINT16_MAX
, 0, UINT16_MAX
, 0, UINT16_MAX
);
1592 CASE(TYPE_S16
, s16
, INT16_MIN
, INT16_MAX
, INT16_MIN
, INT16_MAX
, 0, INT16_MAX
);
1593 CASE(TYPE_U32
, u32
, 0, UINT32_MAX
, 0, INT32_MAX
, 0, UINT32_MAX
);
1594 CASE(TYPE_S32
, s32
, INT32_MIN
, INT32_MAX
, INT32_MIN
, INT32_MAX
, 0, INT32_MAX
);
1598 res
.data
.f32
= i
->saturate
?
1599 SATURATE(imm0
.reg
.data
.f64
) :
1603 res
.data
.f32
= i
->saturate
?
1604 SATURATE(imm0
.reg
.data
.f32
) :
1607 case TYPE_U16
: res
.data
.f32
= (float) imm0
.reg
.data
.u16
; break;
1608 case TYPE_U32
: res
.data
.f32
= (float) imm0
.reg
.data
.u32
; break;
1609 case TYPE_S16
: res
.data
.f32
= (float) imm0
.reg
.data
.s16
; break;
1610 case TYPE_S32
: res
.data
.f32
= (float) imm0
.reg
.data
.s32
; break;
1614 i
->setSrc(0, bld
.mkImm(res
.data
.f32
));
1619 res
.data
.f64
= i
->saturate
?
1620 SATURATE(imm0
.reg
.data
.f64
) :
1624 res
.data
.f64
= i
->saturate
?
1625 SATURATE(imm0
.reg
.data
.f32
) :
1628 case TYPE_U16
: res
.data
.f64
= (double) imm0
.reg
.data
.u16
; break;
1629 case TYPE_U32
: res
.data
.f64
= (double) imm0
.reg
.data
.u32
; break;
1630 case TYPE_S16
: res
.data
.f64
= (double) imm0
.reg
.data
.s16
; break;
1631 case TYPE_S32
: res
.data
.f64
= (double) imm0
.reg
.data
.s32
; break;
1635 i
->setSrc(0, bld
.mkImm(res
.data
.f64
));
1642 i
->setType(i
->dType
); /* Remove i->sType, which we don't need anymore */
1645 i
->src(0).mod
= Modifier(0); /* Clear the already applied modifier */
1652 // This can get left behind some of the optimizations which simplify
1653 // saturatable values.
1654 if (newi
->op
== OP_MOV
&& newi
->saturate
) {
1658 if (newi
->src(0).getImmediate(tmp
))
1667 // =============================================================================
1669 // Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
1670 class ModifierFolding
: public Pass
1673 virtual bool visit(BasicBlock
*);
1677 ModifierFolding::visit(BasicBlock
*bb
)
1679 const Target
*target
= prog
->getTarget();
1681 Instruction
*i
, *next
, *mi
;
1684 for (i
= bb
->getEntry(); i
; i
= next
) {
1687 if (0 && i
->op
== OP_SUB
) {
1688 // turn "sub" into "add neg" (do we really want this ?)
1690 i
->src(0).mod
= i
->src(0).mod
^ Modifier(NV50_IR_MOD_NEG
);
1693 for (int s
= 0; s
< 3 && i
->srcExists(s
); ++s
) {
1694 mi
= i
->getSrc(s
)->getInsn();
1696 mi
->predSrc
>= 0 || mi
->getDef(0)->refCount() > 8)
1698 if (i
->sType
== TYPE_U32
&& mi
->dType
== TYPE_S32
) {
1699 if ((i
->op
!= OP_ADD
&&
1701 (mi
->op
!= OP_ABS
&&
1705 if (i
->sType
!= mi
->dType
) {
1708 if ((mod
= Modifier(mi
->op
)) == Modifier(0))
1710 mod
*= mi
->src(0).mod
;
1712 if ((i
->op
== OP_ABS
) || i
->src(s
).mod
.abs()) {
1713 // abs neg [abs] = abs
1714 mod
= mod
& Modifier(~(NV50_IR_MOD_NEG
| NV50_IR_MOD_ABS
));
1716 if ((i
->op
== OP_NEG
) && mod
.neg()) {
1718 // neg as both opcode and modifier on same insn is prohibited
1719 // neg neg abs = abs, neg neg = identity
1720 mod
= mod
& Modifier(~NV50_IR_MOD_NEG
);
1721 i
->op
= mod
.getOp();
1722 mod
= mod
& Modifier(~NV50_IR_MOD_ABS
);
1723 if (mod
== Modifier(0))
1727 if (target
->isModSupported(i
, s
, mod
)) {
1728 i
->setSrc(s
, mi
->getSrc(0));
1729 i
->src(s
).mod
*= mod
;
1733 if (i
->op
== OP_SAT
) {
1734 mi
= i
->getSrc(0)->getInsn();
1736 mi
->getDef(0)->refCount() <= 1 && target
->isSatSupported(mi
)) {
1738 mi
->setDef(0, i
->getDef(0));
1739 delete_Instruction(prog
, i
);
1747 // =============================================================================
1749 // MUL + ADD -> MAD/FMA
1750 // MIN/MAX(a, a) -> a, etc.
1751 // SLCT(a, b, const) -> cc(const) ? a : b
1753 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
1754 // EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
1755 class AlgebraicOpt
: public Pass
1758 virtual bool visit(BasicBlock
*);
1760 void handleABS(Instruction
*);
1761 bool handleADD(Instruction
*);
1762 bool tryADDToMADOrSAD(Instruction
*, operation toOp
);
1763 void handleMINMAX(Instruction
*);
1764 void handleRCP(Instruction
*);
1765 void handleSLCT(Instruction
*);
1766 void handleLOGOP(Instruction
*);
1767 void handleCVT_NEG(Instruction
*);
1768 void handleCVT_CVT(Instruction
*);
1769 void handleCVT_EXTBF(Instruction
*);
1770 void handleSUCLAMP(Instruction
*);
1771 void handleNEG(Instruction
*);
1772 void handleEXTBF_RDSV(Instruction
*);
1778 AlgebraicOpt::handleABS(Instruction
*abs
)
1780 Instruction
*sub
= abs
->getSrc(0)->getInsn();
1783 !prog
->getTarget()->isOpSupported(OP_SAD
, abs
->dType
))
1785 // expect not to have mods yet, if we do, bail
1786 if (sub
->src(0).mod
|| sub
->src(1).mod
)
1788 // hidden conversion ?
1789 ty
= intTypeToSigned(sub
->dType
);
1790 if (abs
->dType
!= abs
->sType
|| ty
!= abs
->sType
)
1793 if ((sub
->op
!= OP_ADD
&& sub
->op
!= OP_SUB
) ||
1794 sub
->src(0).getFile() != FILE_GPR
|| sub
->src(0).mod
||
1795 sub
->src(1).getFile() != FILE_GPR
|| sub
->src(1).mod
)
1798 Value
*src0
= sub
->getSrc(0);
1799 Value
*src1
= sub
->getSrc(1);
1801 if (sub
->op
== OP_ADD
) {
1802 Instruction
*neg
= sub
->getSrc(1)->getInsn();
1803 if (neg
&& neg
->op
!= OP_NEG
) {
1804 neg
= sub
->getSrc(0)->getInsn();
1805 src0
= sub
->getSrc(1);
1807 if (!neg
|| neg
->op
!= OP_NEG
||
1808 neg
->dType
!= neg
->sType
|| neg
->sType
!= ty
)
1810 src1
= neg
->getSrc(0);
1814 abs
->moveSources(1, 2); // move sources >=1 up by 2
1816 abs
->setType(sub
->dType
);
1817 abs
->setSrc(0, src0
);
1818 abs
->setSrc(1, src1
);
1819 bld
.setPosition(abs
, false);
1820 abs
->setSrc(2, bld
.loadImm(bld
.getSSA(typeSizeof(ty
)), 0));
1824 AlgebraicOpt::handleADD(Instruction
*add
)
1826 Value
*src0
= add
->getSrc(0);
1827 Value
*src1
= add
->getSrc(1);
1829 if (src0
->reg
.file
!= FILE_GPR
|| src1
->reg
.file
!= FILE_GPR
)
1832 bool changed
= false;
1833 // we can't optimize to MAD if the add is precise
1834 if (!add
->precise
&& prog
->getTarget()->isOpSupported(OP_MAD
, add
->dType
))
1835 changed
= tryADDToMADOrSAD(add
, OP_MAD
);
1836 if (!changed
&& prog
->getTarget()->isOpSupported(OP_SAD
, add
->dType
))
1837 changed
= tryADDToMADOrSAD(add
, OP_SAD
);
1841 // ADD(SAD(a,b,0), c) -> SAD(a,b,c)
1842 // ADD(MUL(a,b), c) -> MAD(a,b,c)
1844 AlgebraicOpt::tryADDToMADOrSAD(Instruction
*add
, operation toOp
)
1846 Value
*src0
= add
->getSrc(0);
1847 Value
*src1
= add
->getSrc(1);
1850 const operation srcOp
= toOp
== OP_SAD
? OP_SAD
: OP_MUL
;
1851 const Modifier modBad
= Modifier(~((toOp
== OP_MAD
) ? NV50_IR_MOD_NEG
: 0));
1854 if (src0
->refCount() == 1 &&
1855 src0
->getUniqueInsn() && src0
->getUniqueInsn()->op
== srcOp
)
1858 if (src1
->refCount() == 1 &&
1859 src1
->getUniqueInsn() && src1
->getUniqueInsn()->op
== srcOp
)
1864 src
= add
->getSrc(s
);
1866 if (src
->getUniqueInsn() && src
->getUniqueInsn()->bb
!= add
->bb
)
1869 if (src
->getInsn()->saturate
|| src
->getInsn()->postFactor
||
1870 src
->getInsn()->dnz
|| src
->getInsn()->precise
)
1873 if (toOp
== OP_SAD
) {
1875 if (!src
->getInsn()->src(2).getImmediate(imm
))
1877 if (!imm
.isInteger(0))
1881 if (typeSizeof(add
->dType
) != typeSizeof(src
->getInsn()->dType
) ||
1882 isFloatType(add
->dType
) != isFloatType(src
->getInsn()->dType
))
1885 mod
[0] = add
->src(0).mod
;
1886 mod
[1] = add
->src(1).mod
;
1887 mod
[2] = src
->getUniqueInsn()->src(0).mod
;
1888 mod
[3] = src
->getUniqueInsn()->src(1).mod
;
1890 if (((mod
[0] | mod
[1]) | (mod
[2] | mod
[3])) & modBad
)
1894 add
->subOp
= src
->getInsn()->subOp
; // potentially mul-high
1895 add
->dnz
= src
->getInsn()->dnz
;
1896 add
->dType
= src
->getInsn()->dType
; // sign matters for imad hi
1897 add
->sType
= src
->getInsn()->sType
;
1899 add
->setSrc(2, add
->src(s
? 0 : 1));
1901 add
->setSrc(0, src
->getInsn()->getSrc(0));
1902 add
->src(0).mod
= mod
[2] ^ mod
[s
];
1903 add
->setSrc(1, src
->getInsn()->getSrc(1));
1904 add
->src(1).mod
= mod
[3];
// MIN/MAX(a, a) simplification: when both sources are the same GPR value
// with identical modifiers, the min/max is an identity operation.
// NOTE(review): this chunk is a lossy extraction (the embedded original
// line numbers skip, e.g. 1916/1921); the elided lines likely hold the
// early `return` and the else-branch structure — confirm in the full file.
1910 AlgebraicOpt::handleMINMAX(Instruction
*minmax
)
// Both operands of the min/max.
1912 Value
*src0
= minmax
->getSrc(0);
1913 Value
*src1
= minmax
->getSrc(1);
// Only handle the trivial case: literally the same GPR value on both sides.
1915 if (src0
!= src1
|| src0
->reg
.file
!= FILE_GPR
)
// Same value and same modifier on both sides -> the result is that source.
1917 if (minmax
->src(0).mod
== minmax
->src(1).mod
) {
// If the def can be replaced wholesale, drop the instruction entirely ...
1918 if (minmax
->def(0).mayReplace(minmax
->src(0))) {
1919 minmax
->def(0).replace(minmax
->src(0), false);
1920 delete_Instruction(prog
, minmax
);
// ... otherwise demote it to a single-source OP_CVT and drop source 1.
1922 minmax
->op
= OP_CVT
;
1923 minmax
->setSrc(1, NULL
);
1927 // min(x, -x) = -abs(x)
1928 // min(x, -abs(x)) = -abs(x)
1929 // min(x, abs(x)) = x
1930 // max(x, -abs(x)) = x
1931 // max(x, abs(x)) = abs(x)
1932 // max(x, -x) = abs(x)
1937 // rcp(sqrt(a)) = rsq(a)
// Reciprocal chain folding on the unique producer of the RCP's source:
// RCP(RCP(a)) cancels, and the OP_SQRT branch rewires the RCP to read
// sqrt's operand directly (rcp(sqrt(a)) = rsq(a) per the comment above).
// NOTE(review): lossy extraction — the null check on `si` (orig. lines
// 1942-1945) and the opcode rewrite in the OP_SQRT branch (orig. line
// 1951) appear to be elided here; verify against the complete file.
1939 AlgebraicOpt::handleRCP(Instruction
*rcp
)
// The single instruction that produces the RCP operand, if unique.
1941 Instruction
*si
= rcp
->getSrc(0)->getUniqueInsn();
// RCP of RCP: the pair cancels; fold both source modifiers together and
// turn this instruction into whatever op the combined modifier implies.
1946 if (si
->op
== OP_RCP
) {
1947 Modifier mod
= rcp
->src(0).mod
* si
->src(0).mod
;
1948 rcp
->op
= mod
.getOp();
1949 rcp
->setSrc(0, si
->getSrc(0));
1950 } else if (si
->op
== OP_SQRT
) {
// RCP of SQRT: read sqrt's operand directly, folding its modifier in.
1952 rcp
->setSrc(0, si
->getSrc(0));
1953 rcp
->src(0).mod
= rcp
->src(0).mod
* si
->src(0).mod
;
// SLCT(a, b, cond) simplification: with an immediate condition the select
// is decided at compile time; with equal sources the select is redundant.
// NOTE(review): lossy extraction — the `else` arm, the early `return`,
// and the opcode demotion (orig. lines 1963, 1965-1967, 1970) appear to
// be elided here; confirm in the full file.
1958 AlgebraicOpt::handleSLCT(Instruction
*slct
)
// Immediate condition: evaluate setCond against 0.0f now and keep only
// the source that would have been selected.
1960 if (slct
->getSrc(2)->reg
.file
== FILE_IMMEDIATE
) {
1961 if (slct
->getSrc(2)->asImm()->compare(slct
->asCmp()->setCond
, 0.0f
))
1962 slct
->setSrc(0, slct
->getSrc(1));
// Non-immediate condition: only trivial if both selectable sources are
// literally the same value.
1964 if (slct
->getSrc(0) != slct
->getSrc(1)) {
// Reduce to a single-source op: drop the second source and the condition.
1968 slct
->setSrc(1, NULL
);
1969 slct
->setSrc(2, NULL
);
1973 AlgebraicOpt::handleLOGOP(Instruction
*logop
)
1975 Value
*src0
= logop
->getSrc(0);
1976 Value
*src1
= logop
->getSrc(1);
1978 if (src0
->reg
.file
!= FILE_GPR
|| src1
->reg
.file
!= FILE_GPR
)
1982 if ((logop
->op
== OP_AND
|| logop
->op
== OP_OR
) &&
1983 logop
->def(0).mayReplace(logop
->src(0))) {
1984 logop
->def(0).replace(logop
->src(0), false);
1985 delete_Instruction(prog
, logop
);
1988 // try AND(SET, SET) -> SET_AND(SET)
1989 Instruction
*set0
= src0
->getInsn();
1990 Instruction
*set1
= src1
->getInsn();
1992 if (!set0
|| set0
->fixed
|| !set1
|| set1
->fixed
)
1994 if (set1
->op
!= OP_SET
) {
1995 Instruction
*xchg
= set0
;
1998 if (set1
->op
!= OP_SET
)
2001 operation redOp
= (logop
->op
== OP_AND
? OP_SET_AND
:
2002 logop
->op
== OP_XOR
? OP_SET_XOR
: OP_SET_OR
);
2003 if (!prog
->getTarget()->isOpSupported(redOp
, set1
->sType
))
2005 if (set0
->op
!= OP_SET
&&
2006 set0
->op
!= OP_SET_AND
&&
2007 set0
->op
!= OP_SET_OR
&&
2008 set0
->op
!= OP_SET_XOR
)
2010 if (set0
->getDef(0)->refCount() > 1 &&
2011 set1
->getDef(0)->refCount() > 1)
2013 if (set0
->getPredicate() || set1
->getPredicate())
2015 // check that they don't source each other
2016 for (int s
= 0; s
< 2; ++s
)
2017 if (set0
->getSrc(s
) == set1
->getDef(0) ||
2018 set1
->getSrc(s
) == set0
->getDef(0))
2021 set0
= cloneForward(func
, set0
);
2022 set1
= cloneShallow(func
, set1
);
2023 logop
->bb
->insertAfter(logop
, set1
);
2024 logop
->bb
->insertAfter(logop
, set0
);
2026 set0
->dType
= TYPE_U8
;
2027 set0
->getDef(0)->reg
.file
= FILE_PREDICATE
;
2028 set0
->getDef(0)->reg
.size
= 1;
2029 set1
->setSrc(2, set0
->getDef(0));
2031 set1
->setDef(0, logop
->getDef(0));
2032 delete_Instruction(prog
, logop
);
2036 // F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
2038 // F2I(NEG(I2F(ABS(SET))))
2040 AlgebraicOpt::handleCVT_NEG(Instruction
*cvt
)
2042 Instruction
*insn
= cvt
->getSrc(0)->getInsn();
2043 if (cvt
->sType
!= TYPE_F32
||
2044 cvt
->dType
!= TYPE_S32
|| cvt
->src(0).mod
!= Modifier(0))
2046 if (!insn
|| insn
->op
!= OP_NEG
|| insn
->dType
!= TYPE_F32
)
2048 if (insn
->src(0).mod
!= Modifier(0))
2050 insn
= insn
->getSrc(0)->getInsn();
2052 // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
2053 if (insn
&& insn
->op
== OP_CVT
&&
2054 insn
->dType
== TYPE_F32
&&
2055 insn
->sType
== TYPE_S32
) {
2056 insn
= insn
->getSrc(0)->getInsn();
2057 if (!insn
|| insn
->op
!= OP_ABS
|| insn
->sType
!= TYPE_S32
||
2060 insn
= insn
->getSrc(0)->getInsn();
2061 if (!insn
|| insn
->op
!= OP_SET
|| insn
->dType
!= TYPE_U32
)
2064 if (!insn
|| insn
->op
!= OP_SET
|| insn
->dType
!= TYPE_F32
) {
2068 Instruction
*bset
= cloneShallow(func
, insn
);
2069 bset
->dType
= TYPE_U32
;
2070 bset
->setDef(0, cvt
->getDef(0));
2071 cvt
->bb
->insertAfter(cvt
, bset
);
2072 delete_Instruction(prog
, cvt
);
2075 // F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
2076 // does a type conversion, this becomes trickier as there might be range
2077 // changes/etc. We could handle those in theory as long as the range was being
2078 // reduced or kept the same.
2080 AlgebraicOpt::handleCVT_CVT(Instruction
*cvt
)
2082 Instruction
*insn
= cvt
->getSrc(0)->getInsn();
2087 insn
->dType
!= insn
->sType
||
2088 insn
->dType
!= cvt
->sType
)
2091 RoundMode rnd
= insn
->rnd
;
2108 if (!isFloatType(cvt
->dType
) || !isFloatType(insn
->sType
))
2109 rnd
= (RoundMode
)(rnd
& 3);
2112 cvt
->setSrc(0, insn
->getSrc(0));
2113 cvt
->src(0).mod
*= insn
->src(0).mod
;
2114 cvt
->sType
= insn
->sType
;
2117 // Some shaders extract packed bytes out of words and convert them to
2118 // e.g. float. The Fermi+ CVT instruction can extract those directly, as can
2119 // nv50 for word sizes.
2121 // CVT(EXTBF(x, byte/word))
2122 // CVT(AND(bytemask, x))
2123 // CVT(AND(bytemask, SHR(x, 8/16/24)))
2124 // CVT(SHR(x, 16/24))
2126 AlgebraicOpt::handleCVT_EXTBF(Instruction
*cvt
)
2128 Instruction
*insn
= cvt
->getSrc(0)->getInsn();
2131 unsigned width
, offset
;
2132 if ((cvt
->sType
!= TYPE_U32
&& cvt
->sType
!= TYPE_S32
) || !insn
)
2134 if (insn
->op
== OP_EXTBF
&& insn
->src(1).getImmediate(imm
)) {
2135 width
= (imm
.reg
.data
.u32
>> 8) & 0xff;
2136 offset
= imm
.reg
.data
.u32
& 0xff;
2137 arg
= insn
->getSrc(0);
2139 if (width
!= 8 && width
!= 16)
2141 if (width
== 8 && offset
& 0x7)
2143 if (width
== 16 && offset
& 0xf)
2145 } else if (insn
->op
== OP_AND
) {
2147 if (insn
->src(0).getImmediate(imm
))
2149 else if (insn
->src(1).getImmediate(imm
))
2154 if (imm
.reg
.data
.u32
== 0xff)
2156 else if (imm
.reg
.data
.u32
== 0xffff)
2161 arg
= insn
->getSrc(!s
);
2162 Instruction
*shift
= arg
->getInsn();
2164 if (shift
&& shift
->op
== OP_SHR
&&
2165 shift
->sType
== cvt
->sType
&&
2166 shift
->src(1).getImmediate(imm
) &&
2167 ((width
== 8 && (imm
.reg
.data
.u32
& 0x7) == 0) ||
2168 (width
== 16 && (imm
.reg
.data
.u32
& 0xf) == 0))) {
2169 arg
= shift
->getSrc(0);
2170 offset
= imm
.reg
.data
.u32
;
2172 // We just AND'd the high bits away, which means this is effectively an
2174 cvt
->sType
= TYPE_U32
;
2175 } else if (insn
->op
== OP_SHR
&&
2176 insn
->sType
== cvt
->sType
&&
2177 insn
->src(1).getImmediate(imm
)) {
2178 arg
= insn
->getSrc(0);
2179 if (imm
.reg
.data
.u32
== 24) {
2182 } else if (imm
.reg
.data
.u32
== 16) {
2193 // Irrespective of what came earlier, we can undo a shift on the argument
2194 // by adjusting the offset.
2195 Instruction
*shift
= arg
->getInsn();
2196 if (shift
&& shift
->op
== OP_SHL
&&
2197 shift
->src(1).getImmediate(imm
) &&
2198 ((width
== 8 && (imm
.reg
.data
.u32
& 0x7) == 0) ||
2199 (width
== 16 && (imm
.reg
.data
.u32
& 0xf) == 0)) &&
2200 imm
.reg
.data
.u32
<= offset
) {
2201 arg
= shift
->getSrc(0);
2202 offset
-= imm
.reg
.data
.u32
;
2205 // The unpackSnorm lowering still leaves a few shifts behind, but it's too
2206 // annoying to detect them.
2209 cvt
->sType
= cvt
->sType
== TYPE_U32
? TYPE_U8
: TYPE_S8
;
2211 assert(width
== 16);
2212 cvt
->sType
= cvt
->sType
== TYPE_U32
? TYPE_U16
: TYPE_S16
;
2214 cvt
->setSrc(0, arg
);
2215 cvt
->subOp
= offset
>> 3;
2218 // SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
2220 AlgebraicOpt::handleSUCLAMP(Instruction
*insn
)
2223 int32_t val
= insn
->getSrc(2)->asImm()->reg
.data
.s32
;
2227 assert(insn
->srcExists(0) && insn
->src(0).getFile() == FILE_GPR
);
2229 // look for ADD (TODO: only count references by non-SUCLAMP)
2230 if (insn
->getSrc(0)->refCount() > 1)
2232 add
= insn
->getSrc(0)->getInsn();
2233 if (!add
|| add
->op
!= OP_ADD
||
2234 (add
->dType
!= TYPE_U32
&&
2235 add
->dType
!= TYPE_S32
))
2238 // look for immediate
2239 for (s
= 0; s
< 2; ++s
)
2240 if (add
->src(s
).getImmediate(imm
))
2245 // determine if immediate fits
2246 val
+= imm
.reg
.data
.s32
;
2247 if (val
> 31 || val
< -32)
2249 // determine if other addend fits
2250 if (add
->src(s
).getFile() != FILE_GPR
|| add
->src(s
).mod
!= Modifier(0))
2253 bld
.setPosition(insn
, false); // make sure bld is init'ed
2255 insn
->setSrc(2, bld
.mkImm(val
));
2256 insn
->setSrc(0, add
->getSrc(s
));
2259 // NEG(AND(SET, 1)) -> SET
// NEG(AND(SET, 1)) -> SET: negating the AND-with-1 of an integer SET
// result reproduces the SET's value directly (presumably the 0/-1 integer
// boolean encoding — confirm against the SET semantics in the full file).
// NOTE(review): lossy extraction — the early `return`s after the guards
// and the bookkeeping that sets the operand index `b` (orig. lines
// 2263-2278, partially) appear to be elided here.
2261 AlgebraicOpt::handleNEG(Instruction
*i
) {
// Producer of the negated value; only an integer AND is of interest.
2262 Instruction
*src
= i
->getSrc(0)->getInsn();
2266 if (isFloatType(i
->sType
) || !src
|| src
->op
!= OP_AND
)
// Find which AND operand is the immediate; the other one (index b) must
// come from a SET-style instruction.
2269 if (src
->src(0).getImmediate(imm
))
2271 else if (src
->src(1).getImmediate(imm
))
// The mask must be exactly 1 for the identity to hold.
2276 if (!imm
.isInteger(1))
2279 Instruction
*set
= src
->getSrc(b
)->getInsn();
// Any integer SET / SET_AND / SET_OR / SET_XOR qualifies; float SETs are
// excluded by the isFloatType check.
2280 if ((set
->op
== OP_SET
|| set
->op
== OP_SET_AND
||
2281 set
->op
== OP_SET_OR
|| set
->op
== OP_SET_XOR
) &&
2282 !isFloatType(set
->dType
)) {
// Forward the SET's def to all users of the NEG's result.
2283 i
->def(0).replace(set
->getDef(0), false);
2287 // EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
// EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID): a bitfield extract of the
// packed combined thread-ID system value is replaced by reading the
// individual TID component directly.
// NOTE(review): lossy extraction — the early `return`s, the `index`
// assignments for each recognized mask value, and the final opcode
// rewrite (orig. lines 2290, 2294-2319, partially) appear to be elided.
2289 AlgebraicOpt::handleEXTBF_RDSV(Instruction
*i
)
// The unique producer must be an RDSV of SV_COMBINED_TID.
2291 Instruction
*rdsv
= i
->getSrc(0)->getUniqueInsn();
2292 if (rdsv
->op
!= OP_RDSV
||
2293 rdsv
->getSrc(0)->asSym()->reg
.data
.sv
.sv
!= SV_COMBINED_TID
)
2295 // Avoid creating more RDSV instructions
2296 if (rdsv
->getDef(0)->refCount() > 1)
// The extract's offset/width immediate selects which TID component; the
// three recognized encodings presumably map to TID components via the
// elided `index` assignments — TODO confirm in the full file.
2300 if (!i
->src(1).getImmediate(imm
))
2304 if (imm
.isInteger(0x1000))
2307 if (imm
.isInteger(0x0a10))
2310 if (imm
.isInteger(0x061a))
// Rewire this instruction to read the per-component system value.
2315 bld
.setPosition(i
, false);
2318 i
->setSrc(0, bld
.mkSysVal(SV_TID
, index
));
2323 AlgebraicOpt::visit(BasicBlock
*bb
)
2326 for (Instruction
*i
= bb
->getEntry(); i
; i
= next
) {
2353 if (prog
->getTarget()->isOpSupported(OP_EXTBF
, TYPE_U32
))
2363 handleEXTBF_RDSV(i
);
2373 // =============================================================================
2375 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
2376 // MUL(a, b) -> a few XMADs
2377 // MAD/FMA(a, b, c) -> a few XMADs
2378 class LateAlgebraicOpt
: public Pass
2381 virtual bool visit(Instruction
*);
2383 void handleADD(Instruction
*);
2384 void handleMULMAD(Instruction
*);
2385 bool tryADDToSHLADD(Instruction
*);
// Late ADD handling: if the target supports SHLADD for this data type,
// try to merge a feeding SHL into the ADD (see tryADDToSHLADD:
// ADD(SHL(a, b), c) -> SHLADD(a, b, c)).
// NOTE(review): lossy extraction — braces and the early `return`
// (orig. lines 2392, 2395, 2397-2398, 2401) appear to be elided here.
2391 LateAlgebraicOpt::handleADD(Instruction
*add
)
2393 Value
*src0
= add
->getSrc(0);
2394 Value
*src1
= add
->getSrc(1);
// Both addends must live in GPRs for the SHLADD pattern to apply.
2396 if (src0
->reg
.file
!= FILE_GPR
|| src1
->reg
.file
!= FILE_GPR
)
2399 if (prog
->getTarget()->isOpSupported(OP_SHLADD
, add
->dType
))
2400 tryADDToSHLADD(add
);
2403 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
2405 LateAlgebraicOpt::tryADDToSHLADD(Instruction
*add
)
2407 Value
*src0
= add
->getSrc(0);
2408 Value
*src1
= add
->getSrc(1);
2414 if (add
->saturate
|| add
->usesFlags() || typeSizeof(add
->dType
) == 8
2415 || isFloatType(add
->dType
))
2418 if (src0
->getUniqueInsn() && src0
->getUniqueInsn()->op
== OP_SHL
)
2421 if (src1
->getUniqueInsn() && src1
->getUniqueInsn()->op
== OP_SHL
)
2426 src
= add
->getSrc(s
);
2427 shl
= src
->getUniqueInsn();
2429 if (shl
->bb
!= add
->bb
|| shl
->usesFlags() || shl
->subOp
|| shl
->src(0).mod
)
2432 if (!shl
->src(1).getImmediate(imm
))
2435 add
->op
= OP_SHLADD
;
2436 add
->setSrc(2, add
->src(!s
));
2437 // SHL can't have any modifiers, but the ADD source may have had
2438 // one. Preserve it.
2439 add
->setSrc(0, shl
->getSrc(0));
2441 add
->src(0).mod
= add
->src(1).mod
;
2442 add
->setSrc(1, new_ImmediateValue(shl
->bb
->getProgram(), imm
.reg
.data
.u32
));
2443 add
->src(1).mod
= Modifier(0);
2448 // MUL(a, b) -> a few XMADs
2449 // MAD/FMA(a, b, c) -> a few XMADs
2451 LateAlgebraicOpt::handleMULMAD(Instruction
*i
)
2453 // TODO: handle NV50_IR_SUBOP_MUL_HIGH
2454 if (!prog
->getTarget()->isOpSupported(OP_XMAD
, TYPE_U32
))
2456 if (isFloatType(i
->dType
) || typeSizeof(i
->dType
) != 4)
2458 if (i
->subOp
|| i
->usesFlags() || i
->flagsDef
>= 0)
2461 assert(!i
->src(0).mod
);
2462 assert(!i
->src(1).mod
);
2463 assert(i
->op
== OP_MUL
? 1 : !i
->src(2).mod
);
2465 bld
.setPosition(i
, false);
2467 Value
*a
= i
->getSrc(0);
2468 Value
*b
= i
->getSrc(1);
2469 Value
*c
= i
->op
== OP_MUL
? bld
.mkImm(0) : i
->getSrc(2);
2471 Value
*tmp0
= bld
.getSSA();
2472 Value
*tmp1
= bld
.getSSA();
2474 Instruction
*insn
= bld
.mkOp3(OP_XMAD
, TYPE_U32
, tmp0
, b
, a
, c
);
2475 insn
->setPredicate(i
->cc
, i
->getPredicate());
2477 insn
= bld
.mkOp3(OP_XMAD
, TYPE_U32
, tmp1
, b
, a
, bld
.mkImm(0));
2478 insn
->setPredicate(i
->cc
, i
->getPredicate());
2479 insn
->subOp
= NV50_IR_SUBOP_XMAD_MRG
| NV50_IR_SUBOP_XMAD_H1(1);
2481 Value
*pred
= i
->getPredicate();
2482 i
->setPredicate(i
->cc
, NULL
);
2488 i
->subOp
= NV50_IR_SUBOP_XMAD_PSL
| NV50_IR_SUBOP_XMAD_CBCC
;
2489 i
->subOp
|= NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);
2491 i
->setPredicate(i
->cc
, pred
);
2495 LateAlgebraicOpt::visit(Instruction
*i
)
2513 // =============================================================================
2515 // Split 64-bit MUL and MAD
2516 class Split64BitOpPreRA
: public Pass
2519 virtual bool visit(BasicBlock
*);
2520 void split64MulMad(Function
*, Instruction
*, DataType
);
2526 Split64BitOpPreRA::visit(BasicBlock
*bb
)
2528 Instruction
*i
, *next
;
2531 for (i
= bb
->getEntry(); i
; i
= next
) {
2536 case TYPE_U64
: hTy
= TYPE_U32
; break;
2537 case TYPE_S64
: hTy
= TYPE_S32
; break;
2542 if (i
->op
== OP_MAD
|| i
->op
== OP_MUL
)
2543 split64MulMad(func
, i
, hTy
);
2550 Split64BitOpPreRA::split64MulMad(Function
*fn
, Instruction
*i
, DataType hTy
)
2552 assert(i
->op
== OP_MAD
|| i
->op
== OP_MUL
);
2553 assert(!isFloatType(i
->dType
) && !isFloatType(i
->sType
));
2554 assert(typeSizeof(hTy
) == 4);
2556 bld
.setPosition(i
, true);
2558 Value
*zero
= bld
.mkImm(0u);
2559 Value
*carry
= bld
.getSSA(1, FILE_FLAGS
);
2561 // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
2562 // values (a, b and c might be 32-bit values), using 32-bit operations. This
2563 // gives the following operations:
2564 // * `d.low = low(a.low * b.low) (+ c.low)?`
2565 // * `d.high = low(a.high * b.low) + low(a.low * b.high)
2566 // + high(a.low * b.low) (+ c.high)?`
2568 // To compute the high bits, we can split in the following operations:
2569 // * `tmp1 = low(a.high * b.low) (+ c.high)?`
2570 // * `tmp2 = low(a.low * b.high) + tmp1`
2571 // * `d.high = high(a.low * b.low) + tmp2`
2573 // mkSplit put lower bits at index 0 and higher bits at index 1
2576 if (i
->getSrc(0)->reg
.size
== 8)
2577 bld
.mkSplit(op1
, 4, i
->getSrc(0));
2579 op1
[0] = i
->getSrc(0);
2583 if (i
->getSrc(1)->reg
.size
== 8)
2584 bld
.mkSplit(op2
, 4, i
->getSrc(1));
2586 op2
[0] = i
->getSrc(1);
2590 Value
*op3
[2] = { NULL
, NULL
};
2591 if (i
->op
== OP_MAD
) {
2592 if (i
->getSrc(2)->reg
.size
== 8)
2593 bld
.mkSplit(op3
, 4, i
->getSrc(2));
2595 op3
[0] = i
->getSrc(2);
2600 Value
*tmpRes1Hi
= bld
.getSSA();
2601 if (i
->op
== OP_MAD
)
2602 bld
.mkOp3(OP_MAD
, hTy
, tmpRes1Hi
, op1
[1], op2
[0], op3
[1]);
2604 bld
.mkOp2(OP_MUL
, hTy
, tmpRes1Hi
, op1
[1], op2
[0]);
2606 Value
*tmpRes2Hi
= bld
.mkOp3v(OP_MAD
, hTy
, bld
.getSSA(), op1
[0], op2
[1], tmpRes1Hi
);
2608 Value
*def
[2] = { bld
.getSSA(), bld
.getSSA() };
2610 // If it was a MAD, add the carry from the low bits
2611 // It is not needed if it was a MUL, since we added high(a.low * b.low) to
2613 if (i
->op
== OP_MAD
)
2614 bld
.mkOp3(OP_MAD
, hTy
, def
[0], op1
[0], op2
[0], op3
[0])->setFlagsDef(1, carry
);
2616 bld
.mkOp2(OP_MUL
, hTy
, def
[0], op1
[0], op2
[0]);
2618 Instruction
*hiPart3
= bld
.mkOp3(OP_MAD
, hTy
, def
[1], op1
[0], op2
[0], tmpRes2Hi
);
2619 hiPart3
->subOp
= NV50_IR_SUBOP_MUL_HIGH
;
2620 if (i
->op
== OP_MAD
)
2621 hiPart3
->setFlagsSrc(3, carry
);
2623 bld
.mkOp2(OP_MERGE
, i
->dType
, i
->getDef(0), def
[0], def
[1]);
2625 delete_Instruction(fn
->getProgram(), i
);
2628 // =============================================================================
// Rewrites the constant offset baked into a load/store's address symbol.
// If the symbol is shared with other instructions it is shallow-cloned
// first, so only this ldst sees the new offset.
// NOTE(review): lossy extraction — the return-type line and the braces
// (orig. lines 2630, 2632, 2637-2638) appear to be elided here.
2631 updateLdStOffset(Instruction
*ldst
, int32_t offset
, Function
*fn
)
// Nothing to do if the symbol already carries this offset.
2633 if (offset
!= ldst
->getSrc(0)->reg
.data
.offset
) {
// Don't clobber a symbol other users still reference.
2634 if (ldst
->getSrc(0)->refCount() > 1)
2635 ldst
->setSrc(0, cloneShallow(fn
, ldst
->getSrc(0)));
2636 ldst
->getSrc(0)->reg
.data
.offset
= offset
;
2640 // Combine loads and stores, forward stores to loads where possible.
2641 class MemoryOpt
: public Pass
2649 const Value
*rel
[2];
2657 bool overlaps(const Instruction
*ldst
) const;
2659 inline void link(Record
**);
2660 inline void unlink(Record
**);
2661 inline void set(const Instruction
*ldst
);
2667 Record
*loads
[DATA_FILE_COUNT
];
2668 Record
*stores
[DATA_FILE_COUNT
];
2670 MemoryPool recordPool
;
2673 virtual bool visit(BasicBlock
*);
2674 bool runOpt(BasicBlock
*);
2676 Record
**getList(const Instruction
*);
2678 Record
*findRecord(const Instruction
*, bool load
, bool& isAdjacent
) const;
2680 // merge @insn into load/store instruction from @rec
2681 bool combineLd(Record
*rec
, Instruction
*ld
);
2682 bool combineSt(Record
*rec
, Instruction
*st
);
2684 bool replaceLdFromLd(Instruction
*ld
, Record
*ldRec
);
2685 bool replaceLdFromSt(Instruction
*ld
, Record
*stRec
);
2686 bool replaceStFromSt(Instruction
*restrict st
, Record
*stRec
);
2688 void addRecord(Instruction
*ldst
);
2689 void purgeRecords(Instruction
*const st
, DataFile
);
2690 void lockStores(Instruction
*const ld
);
2697 MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record
), 6)
2699 for (int i
= 0; i
< DATA_FILE_COUNT
; ++i
) {
2709 for (unsigned int i
= 0; i
< DATA_FILE_COUNT
; ++i
) {
2711 for (it
= loads
[i
]; it
; it
= next
) {
2713 recordPool
.release(it
);
2716 for (it
= stores
[i
]; it
; it
= next
) {
2718 recordPool
.release(it
);
2725 MemoryOpt::combineLd(Record
*rec
, Instruction
*ld
)
2727 int32_t offRc
= rec
->offset
;
2728 int32_t offLd
= ld
->getSrc(0)->reg
.data
.offset
;
2729 int sizeRc
= rec
->size
;
2730 int sizeLd
= typeSizeof(ld
->dType
);
2731 int size
= sizeRc
+ sizeLd
;
2734 if (!prog
->getTarget()->
2735 isAccessSupported(ld
->getSrc(0)->reg
.file
, typeOfSize(size
)))
2737 // no unaligned loads
2738 if (((size
== 0x8) && (MIN2(offLd
, offRc
) & 0x7)) ||
2739 ((size
== 0xc) && (MIN2(offLd
, offRc
) & 0xf)))
2741 // for compute indirect loads are not guaranteed to be aligned
2742 if (prog
->getType() == Program::TYPE_COMPUTE
&& rec
->rel
[0])
2745 assert(sizeRc
+ sizeLd
<= 16 && offRc
!= offLd
);
2747 // lock any stores that overlap with the load being merged into the
2751 for (j
= 0; sizeRc
; sizeRc
-= rec
->insn
->getDef(j
)->reg
.size
, ++j
);
2753 if (offLd
< offRc
) {
2755 for (sz
= 0, d
= 0; sz
< sizeLd
; sz
+= ld
->getDef(d
)->reg
.size
, ++d
);
2756 // d: nr of definitions in ld
2757 // j: nr of definitions in rec->insn, move:
2758 for (d
= d
+ j
- 1; j
> 0; --j
, --d
)
2759 rec
->insn
->setDef(d
, rec
->insn
->getDef(j
- 1));
2761 if (rec
->insn
->getSrc(0)->refCount() > 1)
2762 rec
->insn
->setSrc(0, cloneShallow(func
, rec
->insn
->getSrc(0)));
2763 rec
->offset
= rec
->insn
->getSrc(0)->reg
.data
.offset
= offLd
;
2769 // move definitions of @ld to @rec->insn
2770 for (j
= 0; sizeLd
; ++j
, ++d
) {
2771 sizeLd
-= ld
->getDef(j
)->reg
.size
;
2772 rec
->insn
->setDef(d
, ld
->getDef(j
));
2776 rec
->insn
->getSrc(0)->reg
.size
= size
;
2777 rec
->insn
->setType(typeOfSize(size
));
2779 delete_Instruction(prog
, ld
);
2785 MemoryOpt::combineSt(Record
*rec
, Instruction
*st
)
2787 int32_t offRc
= rec
->offset
;
2788 int32_t offSt
= st
->getSrc(0)->reg
.data
.offset
;
2789 int sizeRc
= rec
->size
;
2790 int sizeSt
= typeSizeof(st
->dType
);
2792 int size
= sizeRc
+ sizeSt
;
2794 Value
*src
[4]; // no modifiers in ValueRef allowed for st
2797 if (!prog
->getTarget()->
2798 isAccessSupported(st
->getSrc(0)->reg
.file
, typeOfSize(size
)))
2800 // no unaligned stores
2801 if (size
== 8 && MIN2(offRc
, offSt
) & 0x7)
2803 // for compute indirect stores are not guaranteed to be aligned
2804 if (prog
->getType() == Program::TYPE_COMPUTE
&& rec
->rel
[0])
2807 // There's really no great place to put this in a generic manner. Seemingly
2808 // wide stores at 0x60 don't work in GS shaders on SM50+. Don't combine
2810 if (prog
->getTarget()->getChipset() >= NVISA_GM107_CHIPSET
&&
2811 prog
->getType() == Program::TYPE_GEOMETRY
&&
2812 st
->getSrc(0)->reg
.file
== FILE_SHADER_OUTPUT
&&
2813 rec
->rel
[0] == NULL
&&
2814 MIN2(offRc
, offSt
) == 0x60)
2817 // remove any existing load/store records for the store being merged into
2818 // the existing record.
2819 purgeRecords(st
, DATA_FILE_COUNT
);
2821 st
->takeExtraSources(0, extra
); // save predicate and indirect address
2823 if (offRc
< offSt
) {
2824 // save values from @st
2825 for (s
= 0; sizeSt
; ++s
) {
2826 sizeSt
-= st
->getSrc(s
+ 1)->reg
.size
;
2827 src
[s
] = st
->getSrc(s
+ 1);
2829 // set record's values as low sources of @st
2830 for (j
= 1; sizeRc
; ++j
) {
2831 sizeRc
-= rec
->insn
->getSrc(j
)->reg
.size
;
2832 st
->setSrc(j
, rec
->insn
->getSrc(j
));
2834 // set saved values as high sources of @st
2835 for (k
= j
, j
= 0; j
< s
; ++j
)
2836 st
->setSrc(k
++, src
[j
]);
2838 updateLdStOffset(st
, offRc
, func
);
2840 for (j
= 1; sizeSt
; ++j
)
2841 sizeSt
-= st
->getSrc(j
)->reg
.size
;
2842 for (s
= 1; sizeRc
; ++j
, ++s
) {
2843 sizeRc
-= rec
->insn
->getSrc(s
)->reg
.size
;
2844 st
->setSrc(j
, rec
->insn
->getSrc(s
));
2846 rec
->offset
= offSt
;
2848 st
->putExtraSources(0, extra
); // restore pointer and predicate
2850 delete_Instruction(prog
, rec
->insn
);
2853 rec
->insn
->getSrc(0)->reg
.size
= size
;
2854 rec
->insn
->setType(typeOfSize(size
));
2859 MemoryOpt::Record::set(const Instruction
*ldst
)
2861 const Symbol
*mem
= ldst
->getSrc(0)->asSym();
2862 fileIndex
= mem
->reg
.fileIndex
;
2863 rel
[0] = ldst
->getIndirect(0, 0);
2864 rel
[1] = ldst
->getIndirect(0, 1);
2865 offset
= mem
->reg
.data
.offset
;
2866 base
= mem
->getBase();
2867 size
= typeSizeof(ldst
->sType
);
2871 MemoryOpt::Record::link(Record
**list
)
2881 MemoryOpt::Record::unlink(Record
**list
)
2891 MemoryOpt::Record
**
2892 MemoryOpt::getList(const Instruction
*insn
)
2894 if (insn
->op
== OP_LOAD
|| insn
->op
== OP_VFETCH
)
2895 return &loads
[insn
->src(0).getFile()];
2896 return &stores
[insn
->src(0).getFile()];
2900 MemoryOpt::addRecord(Instruction
*i
)
2902 Record
**list
= getList(i
);
2903 Record
*it
= reinterpret_cast<Record
*>(recordPool
.allocate());
2912 MemoryOpt::findRecord(const Instruction
*insn
, bool load
, bool& isAdj
) const
2914 const Symbol
*sym
= insn
->getSrc(0)->asSym();
2915 const int size
= typeSizeof(insn
->sType
);
2917 Record
*it
= load
? loads
[sym
->reg
.file
] : stores
[sym
->reg
.file
];
2919 for (; it
; it
= it
->next
) {
2920 if (it
->locked
&& insn
->op
!= OP_LOAD
&& insn
->op
!= OP_VFETCH
)
2922 if ((it
->offset
>> 4) != (sym
->reg
.data
.offset
>> 4) ||
2923 it
->rel
[0] != insn
->getIndirect(0, 0) ||
2924 it
->fileIndex
!= sym
->reg
.fileIndex
||
2925 it
->rel
[1] != insn
->getIndirect(0, 1))
2928 if (it
->offset
< sym
->reg
.data
.offset
) {
2929 if (it
->offset
+ it
->size
>= sym
->reg
.data
.offset
) {
2930 isAdj
= (it
->offset
+ it
->size
== sym
->reg
.data
.offset
);
2933 if (!(it
->offset
& 0x7))
2937 isAdj
= it
->offset
!= sym
->reg
.data
.offset
;
2938 if (size
<= it
->size
&& !isAdj
)
2941 if (!(sym
->reg
.data
.offset
& 0x7))
2942 if (it
->offset
- size
<= sym
->reg
.data
.offset
)
2950 MemoryOpt::replaceLdFromSt(Instruction
*ld
, Record
*rec
)
2952 Instruction
*st
= rec
->insn
;
2953 int32_t offSt
= rec
->offset
;
2954 int32_t offLd
= ld
->getSrc(0)->reg
.data
.offset
;
2957 for (s
= 1; offSt
!= offLd
&& st
->srcExists(s
); ++s
)
2958 offSt
+= st
->getSrc(s
)->reg
.size
;
2962 for (d
= 0; ld
->defExists(d
) && st
->srcExists(s
); ++d
, ++s
) {
2963 if (ld
->getDef(d
)->reg
.size
!= st
->getSrc(s
)->reg
.size
)
2965 if (st
->getSrc(s
)->reg
.file
!= FILE_GPR
)
2967 ld
->def(d
).replace(st
->src(s
), false);
2974 MemoryOpt::replaceLdFromLd(Instruction
*ldE
, Record
*rec
)
2976 Instruction
*ldR
= rec
->insn
;
2977 int32_t offR
= rec
->offset
;
2978 int32_t offE
= ldE
->getSrc(0)->reg
.data
.offset
;
2981 assert(offR
<= offE
);
2982 for (dR
= 0; offR
< offE
&& ldR
->defExists(dR
); ++dR
)
2983 offR
+= ldR
->getDef(dR
)->reg
.size
;
2987 for (dE
= 0; ldE
->defExists(dE
) && ldR
->defExists(dR
); ++dE
, ++dR
) {
2988 if (ldE
->getDef(dE
)->reg
.size
!= ldR
->getDef(dR
)->reg
.size
)
2990 ldE
->def(dE
).replace(ldR
->getDef(dR
), false);
2993 delete_Instruction(prog
, ldE
);
2998 MemoryOpt::replaceStFromSt(Instruction
*restrict st
, Record
*rec
)
3000 const Instruction
*const ri
= rec
->insn
;
3003 int32_t offS
= st
->getSrc(0)->reg
.data
.offset
;
3004 int32_t offR
= rec
->offset
;
3005 int32_t endS
= offS
+ typeSizeof(st
->dType
);
3006 int32_t endR
= offR
+ typeSizeof(ri
->dType
);
3008 rec
->size
= MAX2(endS
, endR
) - MIN2(offS
, offR
);
3010 st
->takeExtraSources(0, extra
);
3016 // get non-replaced sources of ri
3017 for (s
= 1; offR
< offS
; offR
+= ri
->getSrc(s
)->reg
.size
, ++s
)
3018 vals
[k
++] = ri
->getSrc(s
);
3020 // get replaced sources of st
3021 for (s
= 1; st
->srcExists(s
); offS
+= st
->getSrc(s
)->reg
.size
, ++s
)
3022 vals
[k
++] = st
->getSrc(s
);
3023 // skip replaced sources of ri
3024 for (s
= n
; offR
< endS
; offR
+= ri
->getSrc(s
)->reg
.size
, ++s
);
3025 // get non-replaced sources after values covered by st
3026 for (; offR
< endR
; offR
+= ri
->getSrc(s
)->reg
.size
, ++s
)
3027 vals
[k
++] = ri
->getSrc(s
);
3028 assert((unsigned int)k
<= ARRAY_SIZE(vals
));
3029 for (s
= 0; s
< k
; ++s
)
3030 st
->setSrc(s
+ 1, vals
[s
]);
3031 st
->setSrc(0, ri
->getSrc(0));
3035 for (j
= 1; offR
< endS
; offR
+= ri
->getSrc(j
++)->reg
.size
);
3036 for (s
= 1; offS
< endS
; offS
+= st
->getSrc(s
++)->reg
.size
);
3037 for (; offR
< endR
; offR
+= ri
->getSrc(j
++)->reg
.size
)
3038 st
->setSrc(s
++, ri
->getSrc(j
));
3040 st
->putExtraSources(0, extra
);
3042 delete_Instruction(prog
, rec
->insn
);
3045 rec
->offset
= st
->getSrc(0)->reg
.data
.offset
;
3047 st
->setType(typeOfSize(rec
->size
));
3053 MemoryOpt::Record::overlaps(const Instruction
*ldst
) const
3058 // This assumes that images/buffers can't overlap. They can.
3059 // TODO: Plumb the restrict logic through, and only skip when it's a
3060 // restrict situation, or there can implicitly be no writes.
3061 if (this->fileIndex
!= that
.fileIndex
&& this->rel
[1] == that
.rel
[1])
3064 if (this->rel
[0] || that
.rel
[0])
3065 return this->base
== that
.base
;
3068 (this->offset
< that
.offset
+ that
.size
) &&
3069 (this->offset
+ this->size
> that
.offset
);
3072 // We must not eliminate stores that affect the result of @ld if
3073 // we find later stores to the same location, and we may no longer
3074 // merge them with later stores.
3075 // The stored value can, however, still be used to determine the value
3076 // returned by future loads.
3078 MemoryOpt::lockStores(Instruction
*const ld
)
3080 for (Record
*r
= stores
[ld
->src(0).getFile()]; r
; r
= r
->next
)
3081 if (!r
->locked
&& r
->overlaps(ld
))
3085 // Prior loads from the location of @st are no longer valid.
3086 // Stores to the location of @st may no longer be used to derive
3087 // the value at it nor be coalesced into later stores.
3089 MemoryOpt::purgeRecords(Instruction
*const st
, DataFile f
)
3092 f
= st
->src(0).getFile();
3094 for (Record
*r
= loads
[f
]; r
; r
= r
->next
)
3095 if (!st
|| r
->overlaps(st
))
3096 r
->unlink(&loads
[f
]);
3098 for (Record
*r
= stores
[f
]; r
; r
= r
->next
)
3099 if (!st
|| r
->overlaps(st
))
3100 r
->unlink(&stores
[f
]);
3104 MemoryOpt::visit(BasicBlock
*bb
)
3106 bool ret
= runOpt(bb
);
3107 // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
3108 // where 96 bit memory operations are forbidden.
3115 MemoryOpt::runOpt(BasicBlock
*bb
)
3117 Instruction
*ldst
, *next
;
3119 bool isAdjacent
= true;
3121 for (ldst
= bb
->getEntry(); ldst
; ldst
= next
) {
3126 if (ldst
->op
== OP_LOAD
|| ldst
->op
== OP_VFETCH
) {
3127 if (ldst
->isDead()) {
3128 // might have been produced by earlier optimization
3129 delete_Instruction(prog
, ldst
);
3133 if (ldst
->op
== OP_STORE
|| ldst
->op
== OP_EXPORT
) {
3134 if (typeSizeof(ldst
->dType
) == 4 &&
3135 ldst
->src(1).getFile() == FILE_GPR
&&
3136 ldst
->getSrc(1)->getInsn()->op
== OP_NOP
) {
3137 delete_Instruction(prog
, ldst
);
3142 // TODO: maybe have all fixed ops act as barrier ?
3143 if (ldst
->op
== OP_CALL
||
3144 ldst
->op
== OP_BAR
||
3145 ldst
->op
== OP_MEMBAR
) {
3146 purgeRecords(NULL
, FILE_MEMORY_LOCAL
);
3147 purgeRecords(NULL
, FILE_MEMORY_GLOBAL
);
3148 purgeRecords(NULL
, FILE_MEMORY_SHARED
);
3149 purgeRecords(NULL
, FILE_SHADER_OUTPUT
);
3151 if (ldst
->op
== OP_ATOM
|| ldst
->op
== OP_CCTL
) {
3152 if (ldst
->src(0).getFile() == FILE_MEMORY_GLOBAL
) {
3153 purgeRecords(NULL
, FILE_MEMORY_LOCAL
);
3154 purgeRecords(NULL
, FILE_MEMORY_GLOBAL
);
3155 purgeRecords(NULL
, FILE_MEMORY_SHARED
);
3157 purgeRecords(NULL
, ldst
->src(0).getFile());
3160 if (ldst
->op
== OP_EMIT
|| ldst
->op
== OP_RESTART
) {
3161 purgeRecords(NULL
, FILE_SHADER_OUTPUT
);
3165 if (ldst
->getPredicate()) // TODO: handle predicated ld/st
3167 if (ldst
->perPatch
) // TODO: create separate per-patch lists
3171 DataFile file
= ldst
->src(0).getFile();
3173 // if ld l[]/g[] look for previous store to eliminate the reload
3174 if (file
== FILE_MEMORY_GLOBAL
|| file
== FILE_MEMORY_LOCAL
) {
3175 // TODO: shared memory ?
3176 rec
= findRecord(ldst
, false, isAdjacent
);
3177 if (rec
&& !isAdjacent
)
3178 keep
= !replaceLdFromSt(ldst
, rec
);
3181 // or look for ld from the same location and replace this one
3182 rec
= keep
? findRecord(ldst
, true, isAdjacent
) : NULL
;
3185 keep
= !replaceLdFromLd(ldst
, rec
);
3187 // or combine a previous load with this one
3188 keep
= !combineLd(rec
, ldst
);
3193 rec
= findRecord(ldst
, false, isAdjacent
);
3196 keep
= !replaceStFromSt(ldst
, rec
);
3198 keep
= !combineSt(rec
, ldst
);
3201 purgeRecords(ldst
, DATA_FILE_COUNT
);
3211 // =============================================================================
3213 // Turn control flow into predicated instructions (after register allocation !).
3215 // Could move this to before register allocation on NVC0 and also handle nested
3217 class FlatteningPass
: public Pass
3220 virtual bool visit(Function
*);
3221 virtual bool visit(BasicBlock
*);
3223 bool tryPredicateConditional(BasicBlock
*);
3224 void predicateInstructions(BasicBlock
*, Value
*pred
, CondCode cc
);
3225 void tryPropagateBranch(BasicBlock
*);
3226 inline bool isConstantCondition(Value
*pred
);
3227 inline bool mayPredicate(const Instruction
*, const Value
*pred
) const;
3228 inline void removeFlow(Instruction
*);
3234 FlatteningPass::isConstantCondition(Value
*pred
)
3236 Instruction
*insn
= pred
->getUniqueInsn();
3238 if (insn
->op
!= OP_SET
|| insn
->srcExists(2))
3241 for (int s
= 0; s
< 2 && insn
->srcExists(s
); ++s
) {
3242 Instruction
*ld
= insn
->getSrc(s
)->getUniqueInsn();
3245 if (ld
->op
!= OP_MOV
&& ld
->op
!= OP_LOAD
)
3247 if (ld
->src(0).isIndirect(0))
3249 file
= ld
->src(0).getFile();
3251 file
= insn
->src(s
).getFile();
3252 // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
3253 // in register "units", which can vary between targets.
3254 if (file
== FILE_GPR
) {
3255 Value
*v
= insn
->getSrc(s
);
3256 int bytes
= v
->reg
.data
.id
* MIN2(v
->reg
.size
, 4);
3257 int units
= bytes
>> gpr_unit
;
3258 if (units
> prog
->maxGPR
)
3259 file
= FILE_IMMEDIATE
;
3262 if (file
!= FILE_IMMEDIATE
&& file
!= FILE_MEMORY_CONST
)
3269 FlatteningPass::removeFlow(Instruction
*insn
)
3271 FlowInstruction
*term
= insn
? insn
->asFlow() : NULL
;
3274 Graph::Edge::Type ty
= term
->bb
->cfg
.outgoing().getType();
3276 if (term
->op
== OP_BRA
) {
3277 // TODO: this might get more difficult when we get arbitrary BRAs
3278 if (ty
== Graph::Edge::CROSS
|| ty
== Graph::Edge::BACK
)
3281 if (term
->op
!= OP_JOIN
)
3284 Value
*pred
= term
->getPredicate();
3286 delete_Instruction(prog
, term
);
3288 if (pred
&& pred
->refCount() == 0) {
3289 Instruction
*pSet
= pred
->getUniqueInsn();
3290 pred
->join
->reg
.data
.id
= -1; // deallocate
3292 delete_Instruction(prog
, pSet
);
3297 FlatteningPass::predicateInstructions(BasicBlock
*bb
, Value
*pred
, CondCode cc
)
3299 for (Instruction
*i
= bb
->getEntry(); i
; i
= i
->next
) {
3302 assert(!i
->getPredicate());
3303 i
->setPredicate(cc
, pred
);
3305 removeFlow(bb
->getExit());
3309 FlatteningPass::mayPredicate(const Instruction
*insn
, const Value
*pred
) const
3311 if (insn
->isPseudo())
3313 // TODO: calls where we don't know which registers are modified
3315 if (!prog
->getTarget()->mayPredicate(insn
, pred
))
3317 for (int d
= 0; insn
->defExists(d
); ++d
)
3318 if (insn
->getDef(d
)->equals(pred
))
3323 // If we jump to BRA/RET/EXIT, replace the jump with it.
3324 // NOTE: We do not update the CFG anymore here !
3326 // TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
3328 // @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
3336 FlatteningPass::tryPropagateBranch(BasicBlock
*bb
)
3338 for (Instruction
*i
= bb
->getExit(); i
&& i
->op
== OP_BRA
; i
= i
->prev
) {
3339 BasicBlock
*bf
= i
->asFlow()->target
.bb
;
3341 if (bf
->getInsnCount() != 1)
3344 FlowInstruction
*bra
= i
->asFlow();
3345 FlowInstruction
*rep
= bf
->getExit()->asFlow();
3347 if (!rep
|| rep
->getPredicate())
3349 if (rep
->op
!= OP_BRA
&&
3350 rep
->op
!= OP_JOIN
&&
3354 // TODO: If there are multiple branches to @rep, only the first would
3355 // be replaced, so only remove them after this pass is done ?
3356 // Also, need to check all incident blocks for fall-through exits and
3357 // add the branch there.
3359 bra
->target
.bb
= rep
->target
.bb
;
3360 if (bf
->cfg
.incidentCount() == 1)
3366 FlatteningPass::visit(Function
*fn
)
3368 gpr_unit
= prog
->getTarget()->getFileUnit(FILE_GPR
);
3374 FlatteningPass::visit(BasicBlock
*bb
)
3376 if (tryPredicateConditional(bb
))
3379 // try to attach join to previous instruction
3380 if (prog
->getTarget()->hasJoin
) {
3381 Instruction
*insn
= bb
->getExit();
3382 if (insn
&& insn
->op
== OP_JOIN
&& !insn
->getPredicate()) {
3384 if (insn
&& !insn
->getPredicate() &&
3386 insn
->op
!= OP_DISCARD
&&
3387 insn
->op
!= OP_TEXBAR
&&
3388 !isTextureOp(insn
->op
) && // probably just nve4
3389 !isSurfaceOp(insn
->op
) && // not confirmed
3390 insn
->op
!= OP_LINTERP
&& // probably just nve4
3391 insn
->op
!= OP_PINTERP
&& // probably just nve4
3392 ((insn
->op
!= OP_LOAD
&& insn
->op
!= OP_STORE
&& insn
->op
!= OP_ATOM
) ||
3393 (typeSizeof(insn
->dType
) <= 4 && !insn
->src(0).isIndirect(0))) &&
3396 bb
->remove(bb
->getExit());
3402 tryPropagateBranch(bb
);
3408 FlatteningPass::tryPredicateConditional(BasicBlock
*bb
)
3410 BasicBlock
*bL
= NULL
, *bR
= NULL
;
3411 unsigned int nL
= 0, nR
= 0, limit
= 12;
3415 mask
= bb
->initiatesSimpleConditional();
3419 assert(bb
->getExit());
3420 Value
*pred
= bb
->getExit()->getPredicate();
3423 if (isConstantCondition(pred
))
3426 Graph::EdgeIterator ei
= bb
->cfg
.outgoing();
3429 bL
= BasicBlock::get(ei
.getNode());
3430 for (insn
= bL
->getEntry(); insn
; insn
= insn
->next
, ++nL
)
3431 if (!mayPredicate(insn
, pred
))
3434 return false; // too long, do a real branch
3439 bR
= BasicBlock::get(ei
.getNode());
3440 for (insn
= bR
->getEntry(); insn
; insn
= insn
->next
, ++nR
)
3441 if (!mayPredicate(insn
, pred
))
3444 return false; // too long, do a real branch
3448 predicateInstructions(bL
, pred
, bb
->getExit()->cc
);
3450 predicateInstructions(bR
, pred
, inverseCondCode(bb
->getExit()->cc
));
3453 bb
->remove(bb
->joinAt
);
3456 removeFlow(bb
->getExit()); // delete the branch/join at the fork point
3458 // remove potential join operations at the end of the conditional
3459 if (prog
->getTarget()->joinAnterior
) {
3460 bb
= BasicBlock::get((bL
? bL
: bR
)->cfg
.outgoing().getNode());
3461 if (bb
->getEntry() && bb
->getEntry()->op
== OP_JOIN
)
3462 removeFlow(bb
->getEntry());
3468 // =============================================================================
3470 // Fold Immediate into MAD; must be done after register allocation due to
3471 // constraint SDST == SSRC2
3473 // Does NVC0+ have other situations where this pass makes sense?
3474 class PostRaLoadPropagation
: public Pass
3477 virtual bool visit(Instruction
*);
3479 void handleMADforNV50(Instruction
*);
3480 void handleMADforNVC0(Instruction
*);
3484 post_ra_dead(Instruction
*i
)
3486 for (int d
= 0; i
->defExists(d
); ++d
)
3487 if (i
->getDef(d
)->refCount())
3492 // Fold Immediate into MAD; must be done after register allocation due to
3493 // constraint SDST == SSRC2
3495 PostRaLoadPropagation::handleMADforNV50(Instruction
*i
)
3497 if (i
->def(0).getFile() != FILE_GPR
||
3498 i
->src(0).getFile() != FILE_GPR
||
3499 i
->src(1).getFile() != FILE_GPR
||
3500 i
->src(2).getFile() != FILE_GPR
||
3501 i
->getDef(0)->reg
.data
.id
!= i
->getSrc(2)->reg
.data
.id
)
3504 if (i
->getDef(0)->reg
.data
.id
>= 64 ||
3505 i
->getSrc(0)->reg
.data
.id
>= 64)
3508 if (i
->flagsSrc
>= 0 && i
->getSrc(i
->flagsSrc
)->reg
.data
.id
!= 0)
3511 if (i
->getPredicate())
3515 Instruction
*def
= i
->getSrc(1)->getInsn();
3517 if (def
&& def
->op
== OP_SPLIT
&& typeSizeof(def
->sType
) == 4)
3518 def
= def
->getSrc(0)->getInsn();
3519 if (def
&& def
->op
== OP_MOV
&& def
->src(0).getFile() == FILE_IMMEDIATE
) {
3520 vtmp
= i
->getSrc(1);
3521 if (isFloatType(i
->sType
)) {
3522 i
->setSrc(1, def
->getSrc(0));
3525 // getImmediate() has side-effects on the argument so this *shouldn't*
3526 // be folded into the assert()
3527 ASSERTED
bool ret
= def
->src(0).getImmediate(val
);
3529 if (i
->getSrc(1)->reg
.data
.id
& 1)
3530 val
.reg
.data
.u32
>>= 16;
3531 val
.reg
.data
.u32
&= 0xffff;
3532 i
->setSrc(1, new_ImmediateValue(prog
, val
.reg
.data
.u32
));
3535 /* There's no post-RA dead code elimination, so do it here
3536 * XXX: if we add more code-removing post-RA passes, we might
3537 * want to create a post-RA dead-code elim pass */
3538 if (post_ra_dead(vtmp
->getInsn())) {
3539 Value
*src
= vtmp
->getInsn()->getSrc(0);
3540 // Careful -- splits will have already been removed from the
3541 // functions. Don't double-delete.
3542 if (vtmp
->getInsn()->bb
)
3543 delete_Instruction(prog
, vtmp
->getInsn());
3544 if (src
->getInsn() && post_ra_dead(src
->getInsn()))
3545 delete_Instruction(prog
, src
->getInsn());
3551 PostRaLoadPropagation::handleMADforNVC0(Instruction
*i
)
3553 if (i
->def(0).getFile() != FILE_GPR
||
3554 i
->src(0).getFile() != FILE_GPR
||
3555 i
->src(1).getFile() != FILE_GPR
||
3556 i
->src(2).getFile() != FILE_GPR
||
3557 i
->getDef(0)->reg
.data
.id
!= i
->getSrc(2)->reg
.data
.id
)
3560 // TODO: gm107 can also do this for S32, maybe other chipsets as well
3561 if (i
->dType
!= TYPE_F32
)
3564 if ((i
->src(2).mod
| Modifier(NV50_IR_MOD_NEG
)) != Modifier(NV50_IR_MOD_NEG
))
3570 if (i
->src(0).getImmediate(val
))
3572 else if (i
->src(1).getImmediate(val
))
3577 if ((i
->src(s
).mod
| Modifier(NV50_IR_MOD_NEG
)) != Modifier(NV50_IR_MOD_NEG
))
3581 i
->swapSources(0, 1);
3583 Instruction
*imm
= i
->getSrc(1)->getInsn();
3584 i
->setSrc(1, imm
->getSrc(0));
3585 if (post_ra_dead(imm
))
3586 delete_Instruction(prog
, imm
);
3590 PostRaLoadPropagation::visit(Instruction
*i
)
3595 if (prog
->getTarget()->getChipset() < 0xc0)
3596 handleMADforNV50(i
);
3598 handleMADforNVC0(i
);
3607 // =============================================================================
3609 // Common subexpression elimination. Stupid O^2 implementation.
3610 class LocalCSE
: public Pass
3613 virtual bool visit(BasicBlock
*);
3615 inline bool tryReplace(Instruction
**, Instruction
*);
3617 DLList ops
[OP_LAST
+ 1];
3620 class GlobalCSE
: public Pass
3623 virtual bool visit(BasicBlock
*);
3627 Instruction::isActionEqual(const Instruction
*that
) const
3629 if (this->op
!= that
->op
||
3630 this->dType
!= that
->dType
||
3631 this->sType
!= that
->sType
)
3633 if (this->cc
!= that
->cc
)
3636 if (this->asTex()) {
3637 if (memcmp(&this->asTex()->tex
,
3638 &that
->asTex()->tex
,
3639 sizeof(this->asTex()->tex
)))
3642 if (this->asCmp()) {
3643 if (this->asCmp()->setCond
!= that
->asCmp()->setCond
)
3646 if (this->asFlow()) {
3649 if (this->op
== OP_PHI
&& this->bb
!= that
->bb
) {
3650 /* TODO: we could probably be a bit smarter here by following the
3651 * control flow, but honestly, it is quite painful to check */
3654 if (this->ipa
!= that
->ipa
||
3655 this->lanes
!= that
->lanes
||
3656 this->perPatch
!= that
->perPatch
)
3658 if (this->postFactor
!= that
->postFactor
)
3662 if (this->subOp
!= that
->subOp
||
3663 this->saturate
!= that
->saturate
||
3664 this->rnd
!= that
->rnd
||
3665 this->ftz
!= that
->ftz
||
3666 this->dnz
!= that
->dnz
||
3667 this->cache
!= that
->cache
||
3668 this->mask
!= that
->mask
)
3675 Instruction::isResultEqual(const Instruction
*that
) const
3679 // NOTE: location of discard only affects tex with liveOnly and quadops
3680 if (!this->defExists(0) && this->op
!= OP_DISCARD
)
3683 if (!isActionEqual(that
))
3686 if (this->predSrc
!= that
->predSrc
)
3689 for (d
= 0; this->defExists(d
); ++d
) {
3690 if (!that
->defExists(d
) ||
3691 !this->getDef(d
)->equals(that
->getDef(d
), false))
3694 if (that
->defExists(d
))
3697 for (s
= 0; this->srcExists(s
); ++s
) {
3698 if (!that
->srcExists(s
))
3700 if (this->src(s
).mod
!= that
->src(s
).mod
)
3702 if (!this->getSrc(s
)->equals(that
->getSrc(s
), true))
3705 if (that
->srcExists(s
))
3708 if (op
== OP_LOAD
|| op
== OP_VFETCH
|| op
== OP_ATOM
) {
3709 switch (src(0).getFile()) {
3710 case FILE_MEMORY_CONST
:
3711 case FILE_SHADER_INPUT
:
3713 case FILE_SHADER_OUTPUT
:
3714 return bb
->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL
;
3723 // pull through common expressions from different in-blocks
3725 GlobalCSE::visit(BasicBlock
*bb
)
3727 Instruction
*phi
, *next
, *ik
;
3730 // TODO: maybe do this with OP_UNION, too
3732 for (phi
= bb
->getPhi(); phi
&& phi
->op
== OP_PHI
; phi
= next
) {
3734 if (phi
->getSrc(0)->refCount() > 1)
3736 ik
= phi
->getSrc(0)->getInsn();
3738 continue; // probably a function input
3739 if (ik
->defCount(0xff) > 1)
3740 continue; // too painful to check if we can really push this forward
3741 for (s
= 1; phi
->srcExists(s
); ++s
) {
3742 if (phi
->getSrc(s
)->refCount() > 1)
3744 if (!phi
->getSrc(s
)->getInsn() ||
3745 !phi
->getSrc(s
)->getInsn()->isResultEqual(ik
))
3748 if (!phi
->srcExists(s
)) {
3749 assert(ik
->op
!= OP_PHI
);
3750 Instruction
*entry
= bb
->getEntry();
3752 if (!entry
|| entry
->op
!= OP_JOIN
)
3755 bb
->insertAfter(entry
, ik
);
3756 ik
->setDef(0, phi
->getDef(0));
3757 delete_Instruction(prog
, phi
);
3765 LocalCSE::tryReplace(Instruction
**ptr
, Instruction
*i
)
3767 Instruction
*old
= *ptr
;
3769 // TODO: maybe relax this later (causes trouble with OP_UNION)
3770 if (i
->isPredicated())
3773 if (!old
->isResultEqual(i
))
3776 for (int d
= 0; old
->defExists(d
); ++d
)
3777 old
->def(d
).replace(i
->getDef(d
), false);
3778 delete_Instruction(prog
, old
);
3784 LocalCSE::visit(BasicBlock
*bb
)
3786 unsigned int replaced
;
3789 Instruction
*ir
, *next
;
3793 // will need to know the order of instructions
3795 for (ir
= bb
->getFirst(); ir
; ir
= ir
->next
)
3796 ir
->serial
= serial
++;
3798 for (ir
= bb
->getFirst(); ir
; ir
= next
) {
3805 ops
[ir
->op
].insert(ir
);
3809 for (s
= 0; ir
->srcExists(s
); ++s
)
3810 if (ir
->getSrc(s
)->asLValue())
3811 if (!src
|| ir
->getSrc(s
)->refCount() < src
->refCount())
3812 src
= ir
->getSrc(s
);
3815 for (Value::UseIterator it
= src
->uses
.begin();
3816 it
!= src
->uses
.end(); ++it
) {
3817 Instruction
*ik
= (*it
)->getInsn();
3818 if (ik
&& ik
->bb
== ir
->bb
&& ik
->serial
< ir
->serial
)
3819 if (tryReplace(&ir
, ik
))
3823 DLLIST_FOR_EACH(&ops
[ir
->op
], iter
)
3825 Instruction
*ik
= reinterpret_cast<Instruction
*>(iter
.get());
3826 if (tryReplace(&ir
, ik
))
3832 ops
[ir
->op
].insert(ir
);
3836 for (unsigned int i
= 0; i
<= OP_LAST
; ++i
)
3844 // =============================================================================
3846 // Remove computations of unused values.
3847 class DeadCodeElim
: public Pass
3850 bool buryAll(Program
*);
3853 virtual bool visit(BasicBlock
*);
3855 void checkSplitLoad(Instruction
*ld
); // for partially dead loads
3857 unsigned int deadCount
;
3861 DeadCodeElim::buryAll(Program
*prog
)
3865 if (!this->run(prog
, false, false))
3867 } while (deadCount
);
3873 DeadCodeElim::visit(BasicBlock
*bb
)
3877 for (Instruction
*i
= bb
->getExit(); i
; i
= prev
) {
3881 delete_Instruction(prog
, i
);
3883 if (i
->defExists(1) &&
3885 (i
->op
== OP_VFETCH
|| i
->op
== OP_LOAD
)) {
3888 if (i
->defExists(0) && !i
->getDef(0)->refCount()) {
3889 if (i
->op
== OP_ATOM
||
3890 i
->op
== OP_SUREDP
||
3891 i
->op
== OP_SUREDB
) {
3893 if (i
->op
== OP_ATOM
&& i
->subOp
== NV50_IR_SUBOP_ATOM_EXCH
) {
3894 i
->cache
= CACHE_CV
;
3898 } else if (i
->op
== OP_LOAD
&& i
->subOp
== NV50_IR_SUBOP_LOAD_LOCKED
) {
3899 i
->setDef(0, i
->getDef(1));
3907 // Each load can go into up to 4 destinations, any of which might potentially
3908 // be dead (i.e. a hole). These can always be split into 2 loads, independent
3909 // of where the holes are. We find the first contiguous region, put it into
3910 // the first load, and then put the second contiguous region into the second
3911 // load. There can be at most 2 contiguous regions.
3913 // Note that there are some restrictions, for example it's not possible to do
3914 // a 64-bit load that's not 64-bit aligned, so such a load has to be split
3915 // up. Also hardware doesn't support 96-bit loads, so those also have to be
3916 // split into a 64-bit and 32-bit load.
3918 DeadCodeElim::checkSplitLoad(Instruction
*ld1
)
3920 Instruction
*ld2
= NULL
; // can get at most 2 loads
3923 int32_t addr1
, addr2
;
3924 int32_t size1
, size2
;
3926 uint32_t mask
= 0xffffffff;
3928 for (d
= 0; ld1
->defExists(d
); ++d
)
3929 if (!ld1
->getDef(d
)->refCount() && ld1
->getDef(d
)->reg
.data
.id
< 0)
3931 if (mask
== 0xffffffff)
3934 addr1
= ld1
->getSrc(0)->reg
.data
.offset
;
3938 // Compute address/width for first load
3939 for (d
= 0; ld1
->defExists(d
); ++d
) {
3940 if (mask
& (1 << d
)) {
3941 if (size1
&& (addr1
& 0x7))
3943 def1
[n1
] = ld1
->getDef(d
);
3944 size1
+= def1
[n1
++]->reg
.size
;
3947 addr1
+= ld1
->getDef(d
)->reg
.size
;
3953 // Scale back the size of the first load until it can be loaded. This
3954 // typically happens for TYPE_B96 loads.
3956 !prog
->getTarget()->isAccessSupported(ld1
->getSrc(0)->reg
.file
,
3957 typeOfSize(size1
))) {
3958 size1
-= def1
[--n1
]->reg
.size
;
3962 // Compute address/width for second load
3963 for (addr2
= addr1
+ size1
; ld1
->defExists(d
); ++d
) {
3964 if (mask
& (1 << d
)) {
3965 assert(!size2
|| !(addr2
& 0x7));
3966 def2
[n2
] = ld1
->getDef(d
);
3967 size2
+= def2
[n2
++]->reg
.size
;
3970 addr2
+= ld1
->getDef(d
)->reg
.size
;
3976 // Make sure that we've processed all the values
3977 for (; ld1
->defExists(d
); ++d
)
3978 assert(!(mask
& (1 << d
)));
3980 updateLdStOffset(ld1
, addr1
, func
);
3981 ld1
->setType(typeOfSize(size1
));
3982 for (d
= 0; d
< 4; ++d
)
3983 ld1
->setDef(d
, (d
< n1
) ? def1
[d
] : NULL
);
3988 ld2
= cloneShallow(func
, ld1
);
3989 updateLdStOffset(ld2
, addr2
, func
);
3990 ld2
->setType(typeOfSize(size2
));
3991 for (d
= 0; d
< 4; ++d
)
3992 ld2
->setDef(d
, (d
< n2
) ? def2
[d
] : NULL
);
3994 ld1
->bb
->insertAfter(ld1
, ld2
);
3997 // =============================================================================
3999 #define RUN_PASS(l, n, f) \
4000 if (level >= (l)) { \
4001 if (dbgFlags & NV50_IR_DEBUG_VERBOSE) \
4002 INFO("PEEPHOLE: %s\n", #n); \
4004 if (!pass.f(this)) \
4009 Program::optimizeSSA(int level
)
4011 RUN_PASS(1, DeadCodeElim
, buryAll
);
4012 RUN_PASS(1, CopyPropagation
, run
);
4013 RUN_PASS(1, MergeSplits
, run
);
4014 RUN_PASS(2, GlobalCSE
, run
);
4015 RUN_PASS(1, LocalCSE
, run
);
4016 RUN_PASS(2, AlgebraicOpt
, run
);
4017 RUN_PASS(2, ModifierFolding
, run
); // before load propagation -> less checks
4018 RUN_PASS(1, ConstantFolding
, foldAll
);
4019 RUN_PASS(0, Split64BitOpPreRA
, run
);
4020 RUN_PASS(2, LateAlgebraicOpt
, run
);
4021 RUN_PASS(1, LoadPropagation
, run
);
4022 RUN_PASS(1, IndirectPropagation
, run
);
4023 RUN_PASS(2, MemoryOpt
, run
);
4024 RUN_PASS(2, LocalCSE
, run
);
4025 RUN_PASS(0, DeadCodeElim
, buryAll
);
4031 Program::optimizePostRA(int level
)
4033 RUN_PASS(2, FlatteningPass
, run
);
4034 RUN_PASS(2, PostRaLoadPropagation
, run
);