2 * Copyright 2011 Christoph Bumiller
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_build_util.h"
26 #include "codegen/nv50_ir_target_nvc0.h"
27 #include "codegen/nv50_ir_lowering_nvc0.h"
39 #define QUADOP(q, r, s, t) \
40 ((QOP_##q << 6) | (QOP_##r << 4) | \
41 (QOP_##s << 2) | (QOP_##t << 0))
44 NVC0LegalizeSSA::handleDIV(Instruction
*i
)
46 FlowInstruction
*call
;
49 bld
.setPosition(i
, false);
51 // Generate movs to the input regs for the call we want to generate
52 for (int s
= 0; i
->srcExists(s
); ++s
) {
53 Instruction
*ld
= i
->getSrc(s
)->getInsn();
54 // check if we are moving an immediate, propagate it in that case
55 if (!ld
|| ld
->fixed
|| (ld
->op
!= OP_LOAD
&& ld
->op
!= OP_MOV
) ||
56 !(ld
->src(0).getFile() == FILE_IMMEDIATE
))
57 bld
.mkMovToReg(s
, i
->getSrc(s
));
59 assert(ld
->getSrc(0) != NULL
);
60 bld
.mkMovToReg(s
, ld
->getSrc(0));
61 // Clear the src, to make code elimination possible here before we
62 // delete the instruction i later
65 delete_Instruction(prog
, ld
);
70 case TYPE_U32
: builtin
= NVC0_BUILTIN_DIV_U32
; break;
71 case TYPE_S32
: builtin
= NVC0_BUILTIN_DIV_S32
; break;
75 call
= bld
.mkFlow(OP_CALL
, NULL
, CC_ALWAYS
, NULL
);
76 bld
.mkMovFromReg(i
->getDef(0), i
->op
== OP_DIV
? 0 : 1);
77 bld
.mkClobber(FILE_GPR
, (i
->op
== OP_DIV
) ? 0xe : 0xd, 2);
78 bld
.mkClobber(FILE_PREDICATE
, (i
->dType
== TYPE_S32
) ? 0xf : 0x3, 0);
81 call
->absolute
= call
->builtin
= 1;
82 call
->target
.builtin
= builtin
;
83 delete_Instruction(prog
, i
);
87 NVC0LegalizeSSA::handleRCPRSQLib(Instruction
*i
, Value
*src
[])
89 FlowInstruction
*call
;
93 def
[0] = bld
.mkMovToReg(0, src
[0])->getDef(0);
94 def
[1] = bld
.mkMovToReg(1, src
[1])->getDef(0);
97 builtin
= NVC0_BUILTIN_RCP_F64
;
99 builtin
= NVC0_BUILTIN_RSQ_F64
;
101 call
= bld
.mkFlow(OP_CALL
, NULL
, CC_ALWAYS
, NULL
);
102 def
[0] = bld
.getSSA();
103 def
[1] = bld
.getSSA();
104 bld
.mkMovFromReg(def
[0], 0);
105 bld
.mkMovFromReg(def
[1], 1);
106 bld
.mkClobber(FILE_GPR
, 0x3fc, 2);
107 bld
.mkClobber(FILE_PREDICATE
, i
->op
== OP_RSQ
? 0x3 : 0x1, 0);
108 bld
.mkOp2(OP_MERGE
, TYPE_U64
, i
->getDef(0), def
[0], def
[1]);
111 call
->absolute
= call
->builtin
= 1;
112 call
->target
.builtin
= builtin
;
113 delete_Instruction(prog
, i
);
119 NVC0LegalizeSSA::handleRCPRSQ(Instruction
*i
)
121 assert(i
->dType
== TYPE_F64
);
122 // There are instructions that will compute the high 32 bits of the 64-bit
123 // float. We will just stick 0 in the bottom 32 bits.
125 bld
.setPosition(i
, false);
127 // 1. Take the source and it up.
128 Value
*src
[2], *dst
[2], *def
= i
->getDef(0);
129 bld
.mkSplit(src
, 4, i
->getSrc(0));
131 int chip
= prog
->getTarget()->getChipset();
132 if (chip
>= NVISA_GK104_CHIPSET
) {
133 handleRCPRSQLib(i
, src
);
137 // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
138 dst
[0] = bld
.loadImm(NULL
, 0);
139 dst
[1] = bld
.getSSA();
141 // 3. The new version of the instruction takes the high 32 bits of the
142 // source and outputs the high 32 bits of the destination.
143 i
->setSrc(0, src
[1]);
144 i
->setDef(0, dst
[1]);
145 i
->setType(TYPE_F32
);
146 i
->subOp
= NV50_IR_SUBOP_RCPRSQ_64H
;
148 // 4. Recombine the two dst pieces back into the original destination.
149 bld
.setPosition(i
, true);
150 bld
.mkOp2(OP_MERGE
, TYPE_U64
, def
, dst
[0], dst
[1]);
154 NVC0LegalizeSSA::handleFTZ(Instruction
*i
)
156 // Only want to flush float inputs
157 assert(i
->sType
== TYPE_F32
);
159 // If we're already flushing denorms (and NaN's) to zero, no need for this.
163 // Only certain classes of operations can flush
164 OpClass cls
= prog
->getTarget()->getOpClass(i
->op
);
165 if (cls
!= OPCLASS_ARITH
&& cls
!= OPCLASS_COMPARE
&&
166 cls
!= OPCLASS_CONVERT
)
173 NVC0LegalizeSSA::handleTEXLOD(TexInstruction
*i
)
175 if (i
->tex
.levelZero
)
180 // The LOD argument comes right after the coordinates (before depth bias,
182 int arg
= i
->tex
.target
.getArgCount();
184 // SM30+ stores the indirect handle as a separate arg, which comes before
186 if (prog
->getTarget()->getChipset() >= NVISA_GK104_CHIPSET
&&
187 i
->tex
.rIndirectSrc
>= 0)
189 // SM20 stores indirect handle combined with array coordinate
190 if (prog
->getTarget()->getChipset() < NVISA_GK104_CHIPSET
&&
191 !i
->tex
.target
.isArray() &&
192 i
->tex
.rIndirectSrc
>= 0)
195 if (!i
->src(arg
).getImmediate(lod
) || !lod
.isInteger(0))
200 i
->tex
.levelZero
= true;
201 i
->moveSources(arg
+ 1, -1);
205 NVC0LegalizeSSA::handleShift(Instruction
*lo
)
207 Value
*shift
= lo
->getSrc(1);
208 Value
*dst64
= lo
->getDef(0);
209 Value
*src
[2], *dst
[2];
210 operation op
= lo
->op
;
212 bld
.setPosition(lo
, false);
214 bld
.mkSplit(src
, 4, lo
->getSrc(0));
216 // SM30 and prior don't have the fancy new SHF.L/R ops. So the logic has to
217 // be completely emulated. For SM35+, we can use the more directed SHF
219 if (prog
->getTarget()->getChipset() < NVISA_GK20A_CHIPSET
) {
220 // The strategy here is to handle shifts >= 32 and less than 32 as
224 // If the shift is <= 32, then
225 // (HI,LO) << x = (HI << x | (LO >> (32 - x)), LO << x)
226 // If the shift is > 32, then
227 // (HI,LO) << x = (LO << (x - 32), 0)
230 // If the shift is <= 32, then
231 // (HI,LO) >> x = (HI >> x, (HI << (32 - x)) | LO >> x)
232 // If the shift is > 32, then
233 // (HI,LO) >> x = (0, HI >> (x - 32))
235 // Note that on NVIDIA hardware, a shift > 32 yields a 0 value, which we
236 // can use to our advantage. Also note the structural similarities
237 // between the right/left cases. The main difference is swapping hi/lo
238 // on input and output.
240 Value
*x32_minus_shift
, *pred
, *hi1
, *hi2
;
241 DataType type
= isSignedIntType(lo
->dType
) ? TYPE_S32
: TYPE_U32
;
242 operation antiop
= op
== OP_SHR
? OP_SHL
: OP_SHR
;
244 std::swap(src
[0], src
[1]);
245 bld
.mkOp2(OP_ADD
, TYPE_U32
, (x32_minus_shift
= bld
.getSSA()), shift
, bld
.mkImm(0x20))
246 ->src(0).mod
= Modifier(NV50_IR_MOD_NEG
);
247 bld
.mkCmp(OP_SET
, CC_LE
, TYPE_U8
, (pred
= bld
.getSSA(1, FILE_PREDICATE
)),
248 TYPE_U32
, shift
, bld
.mkImm(32));
249 // Compute HI (shift <= 32)
250 bld
.mkOp2(OP_OR
, TYPE_U32
, (hi1
= bld
.getSSA()),
251 bld
.mkOp2v(op
, TYPE_U32
, bld
.getSSA(), src
[1], shift
),
252 bld
.mkOp2v(antiop
, TYPE_U32
, bld
.getSSA(), src
[0], x32_minus_shift
))
253 ->setPredicate(CC_P
, pred
);
254 // Compute LO (all shift values)
255 bld
.mkOp2(op
, type
, (dst
[0] = bld
.getSSA()), src
[0], shift
);
256 // Compute HI (shift > 32)
257 bld
.mkOp2(op
, type
, (hi2
= bld
.getSSA()), src
[0],
258 bld
.mkOp1v(OP_NEG
, TYPE_S32
, bld
.getSSA(), x32_minus_shift
))
259 ->setPredicate(CC_NOT_P
, pred
);
260 bld
.mkOp2(OP_UNION
, TYPE_U32
, (dst
[1] = bld
.getSSA()), hi1
, hi2
);
262 std::swap(dst
[0], dst
[1]);
263 bld
.mkOp2(OP_MERGE
, TYPE_U64
, dst64
, dst
[0], dst
[1]);
264 delete_Instruction(prog
, lo
);
268 Instruction
*hi
= new_Instruction(func
, op
, TYPE_U32
);
269 lo
->bb
->insertAfter(lo
, hi
);
271 hi
->sType
= lo
->sType
;
272 lo
->dType
= TYPE_U32
;
274 hi
->setDef(0, (dst
[1] = bld
.getSSA()));
275 if (lo
->op
== OP_SHR
)
276 hi
->subOp
|= NV50_IR_SUBOP_SHIFT_HIGH
;
277 lo
->setDef(0, (dst
[0] = bld
.getSSA()));
279 bld
.setPosition(hi
, true);
281 if (lo
->op
== OP_SHL
)
284 hi
->setSrc(0, new_ImmediateValue(prog
, 0u));
285 hi
->setSrc(1, shift
);
286 hi
->setSrc(2, lo
->op
== OP_SHL
? src
[0] : src
[1]);
288 lo
->setSrc(0, src
[0]);
289 lo
->setSrc(1, shift
);
290 lo
->setSrc(2, src
[1]);
292 bld
.mkOp2(OP_MERGE
, TYPE_U64
, dst64
, dst
[0], dst
[1]);
296 NVC0LegalizeSSA::handleSET(CmpInstruction
*cmp
)
298 DataType hTy
= cmp
->sType
== TYPE_S64
? TYPE_S32
: TYPE_U32
;
300 Value
*src0
[2], *src1
[2];
301 bld
.setPosition(cmp
, false);
303 bld
.mkSplit(src0
, 4, cmp
->getSrc(0));
304 bld
.mkSplit(src1
, 4, cmp
->getSrc(1));
305 bld
.mkOp2(OP_SUB
, hTy
, NULL
, src0
[0], src1
[0])
306 ->setFlagsDef(0, (carry
= bld
.getSSA(1, FILE_FLAGS
)));
307 cmp
->setFlagsSrc(cmp
->srcCount(), carry
);
308 cmp
->setSrc(0, src0
[1]);
309 cmp
->setSrc(1, src1
[1]);
314 NVC0LegalizeSSA::handleBREV(Instruction
*i
)
317 i
->subOp
= NV50_IR_SUBOP_EXTBF_REV
;
318 i
->setSrc(1, bld
.mkImm(0x2000));
322 NVC0LegalizeSSA::visit(Function
*fn
)
324 bld
.setProgram(fn
->getProgram());
329 NVC0LegalizeSSA::visit(BasicBlock
*bb
)
332 for (Instruction
*i
= bb
->getEntry(); i
; i
= next
) {
335 if (i
->sType
== TYPE_F32
&& prog
->getType() != Program::TYPE_COMPUTE
)
341 if (i
->sType
!= TYPE_F32
)
346 if (i
->dType
== TYPE_F64
)
351 handleTEXLOD(i
->asTex());
355 if (typeSizeof(i
->sType
) == 8)
362 if (typeSizeof(i
->sType
) == 8 && i
->sType
!= TYPE_F64
)
363 handleSET(i
->asCmp());
375 NVC0LegalizePostRA::NVC0LegalizePostRA(const Program
*prog
)
379 needTexBar(prog
->getTarget()->getChipset() >= 0xe0 &&
380 prog
->getTarget()->getChipset() < 0x110)
385 NVC0LegalizePostRA::insnDominatedBy(const Instruction
*later
,
386 const Instruction
*early
) const
388 if (early
->bb
== later
->bb
)
389 return early
->serial
< later
->serial
;
390 return later
->bb
->dominatedBy(early
->bb
);
394 NVC0LegalizePostRA::addTexUse(std::list
<TexUse
> &uses
,
395 Instruction
*usei
, const Instruction
*texi
)
398 bool dominated
= insnDominatedBy(usei
, texi
);
399 // Uses before the tex have to all be included. Just because an earlier
400 // instruction dominates another instruction doesn't mean that there's no
401 // way to get from the tex to the later instruction. For example you could
402 // have nested loops, with the tex in the inner loop, and uses before it in
403 // both loops - even though the outer loop's instruction would dominate the
404 // inner's, we still want a texbar before the inner loop's instruction.
406 // However we can still use the eliding logic between uses dominated by the
407 // tex instruction, as that is unambiguously correct.
409 for (std::list
<TexUse
>::iterator it
= uses
.begin(); it
!= uses
.end();) {
411 if (insnDominatedBy(usei
, it
->insn
)) {
415 if (insnDominatedBy(it
->insn
, usei
)) {
424 uses
.push_back(TexUse(usei
, texi
, dominated
));
427 // While it might be tempting to use the an algorithm that just looks at tex
428 // uses, not all texture results are guaranteed to be used on all paths. In
429 // the case where along some control flow path a texture result is never used,
430 // we might reuse that register for something else, creating a
431 // write-after-write hazard. So we have to manually look through all
432 // instructions looking for ones that reference the registers in question.
434 NVC0LegalizePostRA::findFirstUses(
435 Instruction
*texi
, std::list
<TexUse
> &uses
)
437 int minGPR
= texi
->def(0).rep()->reg
.data
.id
;
438 int maxGPR
= minGPR
+ texi
->def(0).rep()->reg
.size
/ 4 - 1;
440 unordered_set
<const BasicBlock
*> visited
;
441 findFirstUsesBB(minGPR
, maxGPR
, texi
->next
, texi
, uses
, visited
);
445 NVC0LegalizePostRA::findFirstUsesBB(
446 int minGPR
, int maxGPR
, Instruction
*start
,
447 const Instruction
*texi
, std::list
<TexUse
> &uses
,
448 unordered_set
<const BasicBlock
*> &visited
)
450 const BasicBlock
*bb
= start
->bb
;
452 // We don't process the whole bb the first time around. This is correct,
453 // however we might be in a loop and hit this BB again, and need to process
454 // the full thing. So only mark a bb as visited if we processed it from the
456 if (start
== bb
->getEntry()) {
457 if (visited
.find(bb
) != visited
.end())
462 for (Instruction
*insn
= start
; insn
!= bb
->getExit(); insn
= insn
->next
) {
466 for (int d
= 0; insn
->defExists(d
); ++d
) {
467 const Value
*def
= insn
->def(d
).rep();
468 if (insn
->def(d
).getFile() != FILE_GPR
||
469 def
->reg
.data
.id
+ def
->reg
.size
/ 4 - 1 < minGPR
||
470 def
->reg
.data
.id
> maxGPR
)
472 addTexUse(uses
, insn
, texi
);
476 for (int s
= 0; insn
->srcExists(s
); ++s
) {
477 const Value
*src
= insn
->src(s
).rep();
478 if (insn
->src(s
).getFile() != FILE_GPR
||
479 src
->reg
.data
.id
+ src
->reg
.size
/ 4 - 1 < minGPR
||
480 src
->reg
.data
.id
> maxGPR
)
482 addTexUse(uses
, insn
, texi
);
487 for (Graph::EdgeIterator ei
= bb
->cfg
.outgoing(); !ei
.end(); ei
.next()) {
488 findFirstUsesBB(minGPR
, maxGPR
, BasicBlock::get(ei
.getNode())->getEntry(),
489 texi
, uses
, visited
);
494 // This pass is a bit long and ugly and can probably be optimized.
496 // 1. obtain a list of TEXes and their outputs' first use(s)
497 // 2. calculate the barrier level of each first use (minimal number of TEXes,
498 // over all paths, between the TEX and the use in question)
499 // 3. for each barrier, if all paths from the source TEX to that barrier
500 // contain a barrier of lesser level, it can be culled
502 NVC0LegalizePostRA::insertTextureBarriers(Function
*fn
)
504 std::list
<TexUse
> *uses
;
505 std::vector
<Instruction
*> texes
;
506 std::vector
<int> bbFirstTex
;
507 std::vector
<int> bbFirstUse
;
508 std::vector
<int> texCounts
;
509 std::vector
<TexUse
> useVec
;
512 fn
->orderInstructions(insns
);
514 texCounts
.resize(fn
->allBBlocks
.getSize(), 0);
515 bbFirstTex
.resize(fn
->allBBlocks
.getSize(), insns
.getSize());
516 bbFirstUse
.resize(fn
->allBBlocks
.getSize(), insns
.getSize());
518 // tag BB CFG nodes by their id for later
519 for (ArrayList::Iterator i
= fn
->allBBlocks
.iterator(); !i
.end(); i
.next()) {
520 BasicBlock
*bb
= reinterpret_cast<BasicBlock
*>(i
.get());
522 bb
->cfg
.tag
= bb
->getId();
525 // gather the first uses for each TEX
526 for (int i
= 0; i
< insns
.getSize(); ++i
) {
527 Instruction
*tex
= reinterpret_cast<Instruction
*>(insns
.get(i
));
528 if (isTextureOp(tex
->op
)) {
529 texes
.push_back(tex
);
530 if (!texCounts
.at(tex
->bb
->getId()))
531 bbFirstTex
[tex
->bb
->getId()] = texes
.size() - 1;
532 texCounts
[tex
->bb
->getId()]++;
538 uses
= new std::list
<TexUse
>[texes
.size()];
541 for (size_t i
= 0; i
< texes
.size(); ++i
) {
542 findFirstUses(texes
[i
], uses
[i
]);
545 // determine the barrier level at each use
546 for (size_t i
= 0; i
< texes
.size(); ++i
) {
547 for (std::list
<TexUse
>::iterator u
= uses
[i
].begin(); u
!= uses
[i
].end();
549 BasicBlock
*tb
= texes
[i
]->bb
;
550 BasicBlock
*ub
= u
->insn
->bb
;
553 for (size_t j
= i
+ 1; j
< texes
.size() &&
554 texes
[j
]->bb
== tb
&& texes
[j
]->serial
< u
->insn
->serial
;
558 u
->level
= fn
->cfg
.findLightestPathWeight(&tb
->cfg
,
559 &ub
->cfg
, texCounts
);
561 WARN("Failed to find path TEX -> TEXBAR\n");
565 // this counted all TEXes in the origin block, correct that
566 u
->level
-= i
- bbFirstTex
.at(tb
->getId()) + 1 /* this TEX */;
567 // and did not count the TEXes in the destination block, add those
568 for (size_t j
= bbFirstTex
.at(ub
->getId()); j
< texes
.size() &&
569 texes
[j
]->bb
== ub
&& texes
[j
]->serial
< u
->insn
->serial
;
573 assert(u
->level
>= 0);
574 useVec
.push_back(*u
);
579 // insert the barriers
580 for (size_t i
= 0; i
< useVec
.size(); ++i
) {
581 Instruction
*prev
= useVec
[i
].insn
->prev
;
582 if (useVec
[i
].level
< 0)
584 if (prev
&& prev
->op
== OP_TEXBAR
) {
585 if (prev
->subOp
> useVec
[i
].level
)
586 prev
->subOp
= useVec
[i
].level
;
587 prev
->setSrc(prev
->srcCount(), useVec
[i
].tex
->getDef(0));
589 Instruction
*bar
= new_Instruction(func
, OP_TEXBAR
, TYPE_NONE
);
591 bar
->subOp
= useVec
[i
].level
;
592 // make use explicit to ease latency calculation
593 bar
->setSrc(bar
->srcCount(), useVec
[i
].tex
->getDef(0));
594 useVec
[i
].insn
->bb
->insertBefore(useVec
[i
].insn
, bar
);
598 if (fn
->getProgram()->optLevel
< 3)
601 std::vector
<Limits
> limitT
, limitB
, limitS
; // entry, exit, single
603 limitT
.resize(fn
->allBBlocks
.getSize(), Limits(0, 0));
604 limitB
.resize(fn
->allBBlocks
.getSize(), Limits(0, 0));
605 limitS
.resize(fn
->allBBlocks
.getSize());
607 // cull unneeded barriers (should do that earlier, but for simplicity)
608 IteratorRef bi
= fn
->cfg
.iteratorCFG();
609 // first calculate min/max outstanding TEXes for each BB
610 for (bi
->reset(); !bi
->end(); bi
->next()) {
611 Graph::Node
*n
= reinterpret_cast<Graph::Node
*>(bi
->get());
612 BasicBlock
*bb
= BasicBlock::get(n
);
614 int max
= std::numeric_limits
<int>::max();
615 for (Instruction
*i
= bb
->getFirst(); i
; i
= i
->next
) {
616 if (isTextureOp(i
->op
)) {
618 if (max
< std::numeric_limits
<int>::max())
621 if (i
->op
== OP_TEXBAR
) {
622 min
= MIN2(min
, i
->subOp
);
623 max
= MIN2(max
, i
->subOp
);
626 // limits when looking at an isolated block
627 limitS
[bb
->getId()].min
= min
;
628 limitS
[bb
->getId()].max
= max
;
630 // propagate the min/max values
631 for (unsigned int l
= 0; l
<= fn
->loopNestingBound
; ++l
) {
632 for (bi
->reset(); !bi
->end(); bi
->next()) {
633 Graph::Node
*n
= reinterpret_cast<Graph::Node
*>(bi
->get());
634 BasicBlock
*bb
= BasicBlock::get(n
);
635 const int bbId
= bb
->getId();
636 for (Graph::EdgeIterator ei
= n
->incident(); !ei
.end(); ei
.next()) {
637 BasicBlock
*in
= BasicBlock::get(ei
.getNode());
638 const int inId
= in
->getId();
639 limitT
[bbId
].min
= MAX2(limitT
[bbId
].min
, limitB
[inId
].min
);
640 limitT
[bbId
].max
= MAX2(limitT
[bbId
].max
, limitB
[inId
].max
);
642 // I just hope this is correct ...
643 if (limitS
[bbId
].max
== std::numeric_limits
<int>::max()) {
645 limitB
[bbId
].min
= limitT
[bbId
].min
+ limitS
[bbId
].min
;
646 limitB
[bbId
].max
= limitT
[bbId
].max
+ limitS
[bbId
].min
;
648 // block contained a barrier
649 limitB
[bbId
].min
= MIN2(limitS
[bbId
].max
,
650 limitT
[bbId
].min
+ limitS
[bbId
].min
);
651 limitB
[bbId
].max
= MIN2(limitS
[bbId
].max
,
652 limitT
[bbId
].max
+ limitS
[bbId
].min
);
656 // finally delete unnecessary barriers
657 for (bi
->reset(); !bi
->end(); bi
->next()) {
658 Graph::Node
*n
= reinterpret_cast<Graph::Node
*>(bi
->get());
659 BasicBlock
*bb
= BasicBlock::get(n
);
660 Instruction
*prev
= NULL
;
662 int max
= limitT
[bb
->getId()].max
;
663 for (Instruction
*i
= bb
->getFirst(); i
; i
= next
) {
665 if (i
->op
== OP_TEXBAR
) {
666 if (i
->subOp
>= max
) {
667 delete_Instruction(prog
, i
);
671 if (prev
&& prev
->op
== OP_TEXBAR
&& prev
->subOp
>= max
) {
672 delete_Instruction(prog
, prev
);
677 if (isTextureOp(i
->op
)) {
680 if (i
&& !i
->isNop())
688 NVC0LegalizePostRA::visit(Function
*fn
)
691 insertTextureBarriers(fn
);
693 rZero
= new_LValue(fn
, FILE_GPR
);
694 pOne
= new_LValue(fn
, FILE_PREDICATE
);
695 carry
= new_LValue(fn
, FILE_FLAGS
);
697 rZero
->reg
.data
.id
= (prog
->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET
) ? 255 : 63;
698 carry
->reg
.data
.id
= 0;
699 pOne
->reg
.data
.id
= 7;
705 NVC0LegalizePostRA::replaceZero(Instruction
*i
)
707 for (int s
= 0; i
->srcExists(s
); ++s
) {
708 if (s
== 2 && i
->op
== OP_SUCLAMP
)
710 if (s
== 1 && i
->op
== OP_SHLADD
)
712 ImmediateValue
*imm
= i
->getSrc(s
)->asImm();
714 if (i
->op
== OP_SELP
&& s
== 2) {
716 if (imm
->reg
.data
.u64
== 0)
717 i
->src(s
).mod
= i
->src(s
).mod
^ Modifier(NV50_IR_MOD_NOT
);
718 } else if (imm
->reg
.data
.u64
== 0) {
725 // replace CONT with BRA for single unconditional continue
727 NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock
*bb
)
729 if (bb
->cfg
.incidentCount() != 2 || bb
->getEntry()->op
!= OP_PRECONT
)
731 Graph::EdgeIterator ei
= bb
->cfg
.incident();
732 if (ei
.getType() != Graph::Edge::BACK
)
734 if (ei
.getType() != Graph::Edge::BACK
)
736 BasicBlock
*contBB
= BasicBlock::get(ei
.getNode());
738 if (!contBB
->getExit() || contBB
->getExit()->op
!= OP_CONT
||
739 contBB
->getExit()->getPredicate())
741 contBB
->getExit()->op
= OP_BRA
;
742 bb
->remove(bb
->getEntry()); // delete PRECONT
745 assert(ei
.end() || ei
.getType() != Graph::Edge::BACK
);
749 // replace branches to join blocks with join ops
751 NVC0LegalizePostRA::propagateJoin(BasicBlock
*bb
)
753 if (bb
->getEntry()->op
!= OP_JOIN
|| bb
->getEntry()->asFlow()->limit
)
755 for (Graph::EdgeIterator ei
= bb
->cfg
.incident(); !ei
.end(); ei
.next()) {
756 BasicBlock
*in
= BasicBlock::get(ei
.getNode());
757 Instruction
*exit
= in
->getExit();
759 in
->insertTail(new FlowInstruction(func
, OP_JOIN
, bb
));
760 // there should always be a terminator instruction
761 WARN("inserted missing terminator in BB:%i\n", in
->getId());
763 if (exit
->op
== OP_BRA
) {
765 exit
->asFlow()->limit
= 1; // must-not-propagate marker
768 bb
->remove(bb
->getEntry());
771 // replaces instructions which would end up as f2f or i2i with faster
773 // - fabs(a) -> fadd(0, abs a)
774 // - fneg(a) -> fadd(neg 0, neg a)
775 // - ineg(a) -> iadd(0, neg a)
776 // - fneg(abs a) -> fadd(neg 0, neg abs a)
777 // - sat(a) -> sat add(0, a)
779 NVC0LegalizePostRA::replaceCvt(Instruction
*cvt
)
781 if (!isFloatType(cvt
->sType
) && typeSizeof(cvt
->sType
) != 4)
783 if (cvt
->sType
!= cvt
->dType
)
785 // we could make it work, but in this case we have optimizations disabled
786 // and we don't really care either way.
787 if (cvt
->src(0).getFile() != FILE_GPR
&&
788 cvt
->src(0).getFile() != FILE_MEMORY_CONST
)
797 if (!isFloatType(cvt
->sType
))
800 mod1
= NV50_IR_MOD_ABS
;
803 if (!isFloatType(cvt
->sType
) && cvt
->src(0).mod
)
805 if (isFloatType(cvt
->sType
) &&
806 (cvt
->src(0).mod
&& cvt
->src(0).mod
!= Modifier(NV50_IR_MOD_ABS
)))
809 mod0
= isFloatType(cvt
->sType
) ? NV50_IR_MOD_NEG
: 0;
810 mod1
= cvt
->src(0).mod
== Modifier(NV50_IR_MOD_ABS
) ?
811 NV50_IR_MOD_NEG_ABS
: NV50_IR_MOD_NEG
;
814 if (!isFloatType(cvt
->sType
) && cvt
->src(0).mod
.abs())
817 mod1
= cvt
->src(0).mod
;
818 cvt
->saturate
= true;
825 cvt
->moveSources(0, 1);
826 cvt
->setSrc(0, rZero
);
827 cvt
->src(0).mod
= mod0
;
828 cvt
->src(1).mod
= mod1
;
832 NVC0LegalizePostRA::visit(BasicBlock
*bb
)
834 Instruction
*i
, *next
;
836 // remove pseudo operations and non-fixed no-ops, split 64 bit operations
837 for (i
= bb
->getFirst(); i
; i
= next
) {
839 if (i
->op
== OP_EMIT
|| i
->op
== OP_RESTART
) {
840 if (!i
->getDef(0)->refCount())
842 if (i
->src(0).getFile() == FILE_IMMEDIATE
)
843 i
->setSrc(0, rZero
); // initial value must be 0
849 if (i
->op
== OP_BAR
&& i
->subOp
== NV50_IR_SUBOP_BAR_SYNC
&&
850 prog
->getType() != Program::TYPE_COMPUTE
) {
851 // It seems like barriers are never required for tessellation since
852 // the warp size is 32, and there are always at most 32 tcs threads.
855 if (i
->op
== OP_LOAD
&& i
->subOp
== NV50_IR_SUBOP_LDC_IS
) {
856 int offset
= i
->src(0).get()->reg
.data
.offset
;
857 if (abs(offset
) >= 0x10000)
858 i
->src(0).get()->reg
.fileIndex
+= offset
>> 16;
859 i
->src(0).get()->reg
.data
.offset
= (int)(short)offset
;
861 // TODO: Move this to before register allocation for operations that
862 // need the $c register !
863 if (typeSizeof(i
->sType
) == 8 || typeSizeof(i
->dType
) == 8) {
865 hi
= BuildUtil::split64BitOpPostRA(func
, i
, rZero
, carry
);
870 if (i
->op
!= OP_MOV
&& i
->op
!= OP_PFETCH
)
873 if (i
->op
== OP_SAT
|| i
->op
== OP_NEG
|| i
->op
== OP_ABS
)
880 if (!tryReplaceContWithBra(bb
))
886 NVC0LoweringPass::NVC0LoweringPass(Program
*prog
) : targ(prog
->getTarget())
888 bld
.setProgram(prog
);
892 NVC0LoweringPass::visit(Function
*fn
)
894 if (prog
->getType() == Program::TYPE_GEOMETRY
) {
895 assert(!strncmp(fn
->getName(), "MAIN", 4));
896 // TODO: when we generate actual functions pass this value along somehow
897 bld
.setPosition(BasicBlock::get(fn
->cfg
.getRoot()), false);
898 gpEmitAddress
= bld
.loadImm(NULL
, 0)->asLValue();
900 bld
.setPosition(BasicBlock::get(fn
->cfgExit
)->getExit(), false);
901 if (prog
->getTarget()->getChipset() >= NVISA_GV100_CHIPSET
)
902 bld
.mkOp1(OP_FINAL
, TYPE_NONE
, NULL
, gpEmitAddress
)->fixed
= 1;
903 bld
.mkMovToReg(0, gpEmitAddress
);
910 NVC0LoweringPass::visit(BasicBlock
*bb
)
916 NVC0LoweringPass::loadTexHandle(Value
*ptr
, unsigned int slot
)
918 uint8_t b
= prog
->driver
->io
.auxCBSlot
;
919 uint32_t off
= prog
->driver
->io
.texBindBase
+ slot
* 4;
922 ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(), ptr
, bld
.mkImm(2));
925 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U32
, off
), ptr
);
928 // move array source to first slot, convert to u16, add indirections
930 NVC0LoweringPass::handleTEX(TexInstruction
*i
)
932 const int dim
= i
->tex
.target
.getDim() + i
->tex
.target
.isCube();
933 const int arg
= i
->tex
.target
.getArgCount();
934 const int lyr
= arg
- (i
->tex
.target
.isMS() ? 2 : 1);
935 const int chipset
= prog
->getTarget()->getChipset();
937 /* Only normalize in the non-explicit derivatives case. For explicit
938 * derivatives, this is handled in handleManualTXD.
940 if (i
->tex
.target
.isCube() && i
->dPdx
[0].get() == NULL
) {
943 for (c
= 0; c
< 3; ++c
)
944 src
[c
] = bld
.mkOp1v(OP_ABS
, TYPE_F32
, bld
.getSSA(), i
->getSrc(c
));
945 val
= bld
.getScratch();
946 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[0], src
[1]);
947 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[2], val
);
948 bld
.mkOp1(OP_RCP
, TYPE_F32
, val
, val
);
949 for (c
= 0; c
< 3; ++c
) {
950 i
->setSrc(c
, bld
.mkOp2v(OP_MUL
, TYPE_F32
, bld
.getSSA(),
955 // Arguments to the TEX instruction are a little insane. Even though the
956 // encoding is identical between SM20 and SM30, the arguments mean
957 // different things between Fermi and Kepler+. A lot of arguments are
958 // optional based on flags passed to the instruction. This summarizes the
968 // - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
969 // - other: 4 bits each, single reg
973 // array (+ offsets for txd in upper 16 bits)
978 // offsets (same as fermi, except txd which takes it with array)
995 if (chipset
>= NVISA_GK104_CHIPSET
) {
996 if (i
->tex
.rIndirectSrc
>= 0 || i
->tex
.sIndirectSrc
>= 0) {
997 // XXX this ignores tsc, and assumes a 1:1 mapping
998 assert(i
->tex
.rIndirectSrc
>= 0);
999 if (!i
->tex
.bindless
) {
1000 Value
*hnd
= loadTexHandle(i
->getIndirectR(), i
->tex
.r
);
1003 i
->setIndirectR(hnd
);
1005 i
->setIndirectS(NULL
);
1006 } else if (i
->tex
.r
== i
->tex
.s
|| i
->op
== OP_TXF
) {
1007 if (i
->tex
.r
== 0xffff)
1008 i
->tex
.r
= prog
->driver
->io
.fbtexBindBase
/ 4;
1010 i
->tex
.r
+= prog
->driver
->io
.texBindBase
/ 4;
1011 i
->tex
.s
= 0; // only a single cX[] value possible here
1013 Value
*hnd
= bld
.getScratch();
1014 Value
*rHnd
= loadTexHandle(NULL
, i
->tex
.r
);
1015 Value
*sHnd
= loadTexHandle(NULL
, i
->tex
.s
);
1017 bld
.mkOp3(OP_INSBF
, TYPE_U32
, hnd
, rHnd
, bld
.mkImm(0x1400), sHnd
);
1019 i
->tex
.r
= 0; // not used for indirect tex
1021 i
->setIndirectR(hnd
);
1023 if (i
->tex
.target
.isArray()) {
1024 LValue
*layer
= new_LValue(func
, FILE_GPR
);
1025 Value
*src
= i
->getSrc(lyr
);
1026 const int sat
= (i
->op
== OP_TXF
) ? 1 : 0;
1027 DataType sTy
= (i
->op
== OP_TXF
) ? TYPE_U32
: TYPE_F32
;
1028 bld
.mkCvt(OP_CVT
, TYPE_U16
, layer
, sTy
, src
)->saturate
= sat
;
1029 if (i
->op
!= OP_TXD
|| chipset
< NVISA_GM107_CHIPSET
) {
1030 for (int s
= dim
; s
>= 1; --s
)
1031 i
->setSrc(s
, i
->getSrc(s
- 1));
1032 i
->setSrc(0, layer
);
1034 i
->setSrc(dim
, layer
);
1037 // Move the indirect reference to the first place
1038 if (i
->tex
.rIndirectSrc
>= 0 && (
1039 i
->op
== OP_TXD
|| chipset
< NVISA_GM107_CHIPSET
)) {
1040 Value
*hnd
= i
->getIndirectR();
1042 i
->setIndirectR(NULL
);
1043 i
->moveSources(0, 1);
1045 i
->tex
.rIndirectSrc
= 0;
1046 i
->tex
.sIndirectSrc
= -1;
1048 // Move the indirect reference to right after the coords
1049 else if (i
->tex
.rIndirectSrc
>= 0 && chipset
>= NVISA_GM107_CHIPSET
) {
1050 Value
*hnd
= i
->getIndirectR();
1052 i
->setIndirectR(NULL
);
1053 i
->moveSources(arg
, 1);
1054 i
->setSrc(arg
, hnd
);
1055 i
->tex
.rIndirectSrc
= 0;
1056 i
->tex
.sIndirectSrc
= -1;
1059 // (nvc0) generate and move the tsc/tic/array source to the front
1060 if (i
->tex
.target
.isArray() || i
->tex
.rIndirectSrc
>= 0 || i
->tex
.sIndirectSrc
>= 0) {
1061 LValue
*src
= new_LValue(func
, FILE_GPR
); // 0xttxsaaaa
1063 Value
*ticRel
= i
->getIndirectR();
1064 Value
*tscRel
= i
->getIndirectS();
1066 if (i
->tex
.r
== 0xffff) {
1072 i
->setSrc(i
->tex
.rIndirectSrc
, NULL
);
1074 ticRel
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getScratch(),
1075 ticRel
, bld
.mkImm(i
->tex
.r
));
1078 i
->setSrc(i
->tex
.sIndirectSrc
, NULL
);
1080 tscRel
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getScratch(),
1081 tscRel
, bld
.mkImm(i
->tex
.s
));
1084 Value
*arrayIndex
= i
->tex
.target
.isArray() ? i
->getSrc(lyr
) : NULL
;
1086 for (int s
= dim
; s
>= 1; --s
)
1087 i
->setSrc(s
, i
->getSrc(s
- 1));
1088 i
->setSrc(0, arrayIndex
);
1090 i
->moveSources(0, 1);
1094 int sat
= (i
->op
== OP_TXF
) ? 1 : 0;
1095 DataType sTy
= (i
->op
== OP_TXF
) ? TYPE_U32
: TYPE_F32
;
1096 bld
.mkCvt(OP_CVT
, TYPE_U16
, src
, sTy
, arrayIndex
)->saturate
= sat
;
1098 bld
.loadImm(src
, 0);
1102 bld
.mkOp3(OP_INSBF
, TYPE_U32
, src
, ticRel
, bld
.mkImm(0x0917), src
);
1104 bld
.mkOp3(OP_INSBF
, TYPE_U32
, src
, tscRel
, bld
.mkImm(0x0710), src
);
1109 // For nvc0, the sample id has to be in the second operand, as the offset
1110 // does. Right now we don't know how to pass both in, and this case can't
1111 // happen with OpenGL. On nve0, the sample id is part of the texture
1112 // coordinate argument.
1113 assert(chipset
>= NVISA_GK104_CHIPSET
||
1114 !i
->tex
.useOffsets
|| !i
->tex
.target
.isMS());
1116 // offset is between lod and dc
1117 if (i
->tex
.useOffsets
) {
1119 int s
= i
->srcCount(0xff, true);
1120 if (i
->op
!= OP_TXD
|| chipset
< NVISA_GK104_CHIPSET
) {
1121 if (i
->tex
.target
.isShadow())
1123 if (i
->srcExists(s
)) // move potential predicate out of the way
1124 i
->moveSources(s
, 1);
1125 if (i
->tex
.useOffsets
== 4 && i
->srcExists(s
+ 1))
1126 i
->moveSources(s
+ 1, 1);
1128 if (i
->op
== OP_TXG
) {
1129 // Either there is 1 offset, which goes into the 2 low bytes of the
1130 // first source, or there are 4 offsets, which go into 2 sources (8
1131 // values, 1 byte each).
1132 Value
*offs
[2] = {NULL
, NULL
};
1133 for (n
= 0; n
< i
->tex
.useOffsets
; n
++) {
1134 for (c
= 0; c
< 2; ++c
) {
1135 if ((n
% 2) == 0 && c
== 0)
1136 bld
.mkMov(offs
[n
/ 2] = bld
.getScratch(), i
->offset
[n
][c
].get());
1138 bld
.mkOp3(OP_INSBF
, TYPE_U32
,
1140 i
->offset
[n
][c
].get(),
1141 bld
.mkImm(0x800 | ((n
* 16 + c
* 8) % 32)),
1145 i
->setSrc(s
, offs
[0]);
1147 i
->setSrc(s
+ 1, offs
[1]);
1150 assert(i
->tex
.useOffsets
== 1);
1151 for (c
= 0; c
< 3; ++c
) {
1153 if (!i
->offset
[0][c
].getImmediate(val
))
1154 assert(!"non-immediate offset passed to non-TXG");
1155 imm
|= (val
.reg
.data
.u32
& 0xf) << (c
* 4);
1157 if (i
->op
== OP_TXD
&& chipset
>= NVISA_GK104_CHIPSET
) {
1158 // The offset goes into the upper 16 bits of the array index. So
1159 // create it if it's not already there, and INSBF it if it already
1161 s
= (i
->tex
.rIndirectSrc
>= 0) ? 1 : 0;
1162 if (chipset
>= NVISA_GM107_CHIPSET
)
1164 if (i
->tex
.target
.isArray()) {
1165 Value
*offset
= bld
.getScratch();
1166 bld
.mkOp3(OP_INSBF
, TYPE_U32
, offset
,
1167 bld
.loadImm(NULL
, imm
), bld
.mkImm(0xc10),
1169 i
->setSrc(s
, offset
);
1171 i
->moveSources(s
, 1);
1172 i
->setSrc(s
, bld
.loadImm(NULL
, imm
<< 16));
1175 i
->setSrc(s
, bld
.loadImm(NULL
, imm
));
1184 NVC0LoweringPass::handleManualTXD(TexInstruction
*i
)
1186 // Always done from the l0 perspective. This is the way that NVIDIA's
1187 // driver does it, and doing it from the "current" lane's perpsective
1188 // doesn't seem to always work for reasons that aren't altogether clear,
1189 // even in frag shaders.
1191 // Note that we must move not only the coordinates into lane0, but also all
1192 // ancillary arguments, like array indices and depth compare as they may
1193 // differ between lanes. Offsets for TXD are supposed to be uniform, so we
1194 // leave them alone.
1195 static const uint8_t qOps
[2] =
1196 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) };
1199 Value
*crd
[3], *arr
[2], *shadow
;
1201 Value
*zero
= bld
.loadImm(bld
.getSSA(), 0);
1203 const int dim
= i
->tex
.target
.getDim() + i
->tex
.target
.isCube();
1205 // This function is invoked after handleTEX lowering, so we have to expect
1206 // the arguments in the order that the hw wants them. For Fermi, array and
1207 // indirect are both in the leading arg, while for Kepler, array and
1208 // indirect are separate (and both precede the coordinates). Maxwell is
1209 // handled in a separate function.
1211 if (targ
->getChipset() < NVISA_GK104_CHIPSET
)
1212 array
= i
->tex
.target
.isArray() || i
->tex
.rIndirectSrc
>= 0;
1214 array
= i
->tex
.target
.isArray() + (i
->tex
.rIndirectSrc
>= 0);
1216 i
->op
= OP_TEX
; // no need to clone dPdx/dPdy later
1218 for (c
= 0; c
< dim
; ++c
)
1219 crd
[c
] = bld
.getScratch();
1220 for (c
= 0; c
< array
; ++c
)
1221 arr
[c
] = bld
.getScratch();
1222 shadow
= bld
.getScratch();
1224 for (l
= 0; l
< 4; ++l
) {
1225 Value
*src
[3], *val
;
1227 bld
.mkOp(OP_QUADON
, TYPE_NONE
, NULL
);
1228 // we're using the texture result from lane 0 in all cases, so make sure
1229 // that lane 0 is pointing at the proper array index, indirect value,
1230 // and depth compare.
1232 for (c
= 0; c
< array
; ++c
)
1233 bld
.mkQuadop(0x00, arr
[c
], l
, i
->getSrc(c
), zero
);
1234 if (i
->tex
.target
.isShadow()) {
1235 // The next argument after coords is the depth compare
1236 bld
.mkQuadop(0x00, shadow
, l
, i
->getSrc(array
+ dim
), zero
);
1239 // mov position coordinates from lane l to all lanes
1240 for (c
= 0; c
< dim
; ++c
)
1241 bld
.mkQuadop(0x00, crd
[c
], l
, i
->getSrc(c
+ array
), zero
);
1242 // add dPdx from lane l to lanes dx
1243 for (c
= 0; c
< dim
; ++c
)
1244 bld
.mkQuadop(qOps
[0], crd
[c
], l
, i
->dPdx
[c
].get(), crd
[c
]);
1245 // add dPdy from lane l to lanes dy
1246 for (c
= 0; c
< dim
; ++c
)
1247 bld
.mkQuadop(qOps
[1], crd
[c
], l
, i
->dPdy
[c
].get(), crd
[c
]);
1248 // normalize cube coordinates
1249 if (i
->tex
.target
.isCube()) {
1250 for (c
= 0; c
< 3; ++c
)
1251 src
[c
] = bld
.mkOp1v(OP_ABS
, TYPE_F32
, bld
.getSSA(), crd
[c
]);
1252 val
= bld
.getScratch();
1253 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[0], src
[1]);
1254 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[2], val
);
1255 bld
.mkOp1(OP_RCP
, TYPE_F32
, val
, val
);
1256 for (c
= 0; c
< 3; ++c
)
1257 src
[c
] = bld
.mkOp2v(OP_MUL
, TYPE_F32
, bld
.getSSA(), crd
[c
], val
);
1259 for (c
= 0; c
< dim
; ++c
)
1263 bld
.insert(tex
= cloneForward(func
, i
));
1265 for (c
= 0; c
< array
; ++c
)
1266 tex
->setSrc(c
, arr
[c
]);
1267 if (i
->tex
.target
.isShadow())
1268 tex
->setSrc(array
+ dim
, shadow
);
1270 for (c
= 0; c
< dim
; ++c
)
1271 tex
->setSrc(c
+ array
, src
[c
]);
1272 // broadcast results from lane 0 to all lanes so that the moves *into*
1273 // the target lane pick up the proper value.
1275 for (c
= 0; i
->defExists(c
); ++c
)
1276 bld
.mkQuadop(0x00, tex
->getDef(c
), 0, tex
->getDef(c
), zero
);
1277 bld
.mkOp(OP_QUADPOP
, TYPE_NONE
, NULL
);
1280 for (c
= 0; i
->defExists(c
); ++c
) {
1282 def
[c
][l
] = bld
.getSSA();
1283 mov
= bld
.mkMov(def
[c
][l
], tex
->getDef(c
));
1285 mov
->lanes
= 1 << l
;
1289 for (c
= 0; i
->defExists(c
); ++c
) {
1290 Instruction
*u
= bld
.mkOp(OP_UNION
, TYPE_U32
, i
->getDef(c
));
1291 for (l
= 0; l
< 4; ++l
)
1292 u
->setSrc(l
, def
[c
][l
]);
1300 NVC0LoweringPass::handleTXD(TexInstruction
*txd
)
1302 int dim
= txd
->tex
.target
.getDim() + txd
->tex
.target
.isCube();
1303 unsigned arg
= txd
->tex
.target
.getArgCount();
1304 unsigned expected_args
= arg
;
1305 const int chipset
= prog
->getTarget()->getChipset();
1307 if (chipset
>= NVISA_GK104_CHIPSET
) {
1308 if (!txd
->tex
.target
.isArray() && txd
->tex
.useOffsets
)
1310 if (txd
->tex
.rIndirectSrc
>= 0 || txd
->tex
.sIndirectSrc
>= 0)
1313 if (txd
->tex
.useOffsets
)
1315 if (!txd
->tex
.target
.isArray() && (
1316 txd
->tex
.rIndirectSrc
>= 0 || txd
->tex
.sIndirectSrc
>= 0))
1320 if (expected_args
> 4 ||
1322 txd
->tex
.target
.isShadow())
1326 while (txd
->srcExists(arg
))
1329 txd
->tex
.derivAll
= true;
1330 if (txd
->op
== OP_TEX
)
1331 return handleManualTXD(txd
);
1333 assert(arg
== expected_args
);
1334 for (int c
= 0; c
< dim
; ++c
) {
1335 txd
->setSrc(arg
+ c
* 2 + 0, txd
->dPdx
[c
]);
1336 txd
->setSrc(arg
+ c
* 2 + 1, txd
->dPdy
[c
]);
1337 txd
->dPdx
[c
].set(NULL
);
1338 txd
->dPdy
[c
].set(NULL
);
1341 // In this case we have fewer than 4 "real" arguments, which means that
1342 // handleTEX didn't apply any padding. However we have to make sure that
1343 // the second "group" of arguments still gets padded up to 4.
1344 if (chipset
>= NVISA_GK104_CHIPSET
) {
1345 int s
= arg
+ 2 * dim
;
1346 if (s
>= 4 && s
< 7) {
1347 if (txd
->srcExists(s
)) // move potential predicate out of the way
1348 txd
->moveSources(s
, 7 - s
);
1350 txd
->setSrc(s
++, bld
.loadImm(NULL
, 0));
1358 NVC0LoweringPass::handleTXQ(TexInstruction
*txq
)
1360 const int chipset
= prog
->getTarget()->getChipset();
1361 if (chipset
>= NVISA_GK104_CHIPSET
&& txq
->tex
.rIndirectSrc
< 0)
1362 txq
->tex
.r
+= prog
->driver
->io
.texBindBase
/ 4;
1364 if (txq
->tex
.rIndirectSrc
< 0)
1367 Value
*ticRel
= txq
->getIndirectR();
1369 txq
->setIndirectS(NULL
);
1370 txq
->tex
.sIndirectSrc
= -1;
1374 if (chipset
< NVISA_GK104_CHIPSET
) {
1375 LValue
*src
= new_LValue(func
, FILE_GPR
); // 0xttxsaaaa
1377 txq
->setSrc(txq
->tex
.rIndirectSrc
, NULL
);
1379 ticRel
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getScratch(),
1380 ticRel
, bld
.mkImm(txq
->tex
.r
));
1382 bld
.mkOp2(OP_SHL
, TYPE_U32
, src
, ticRel
, bld
.mkImm(0x17));
1384 txq
->moveSources(0, 1);
1385 txq
->setSrc(0, src
);
1387 Value
*hnd
= loadTexHandle(txq
->getIndirectR(), txq
->tex
.r
);
1391 txq
->setIndirectR(NULL
);
1392 txq
->moveSources(0, 1);
1393 txq
->setSrc(0, hnd
);
1394 txq
->tex
.rIndirectSrc
= 0;
1401 NVC0LoweringPass::handleTXLQ(TexInstruction
*i
)
1403 /* The outputs are inverted compared to what the TGSI instruction
1404 * expects. Take that into account in the mask.
1406 assert((i
->tex
.mask
& ~3) == 0);
1407 if (i
->tex
.mask
== 1)
1409 else if (i
->tex
.mask
== 2)
1412 bld
.setPosition(i
, true);
1414 /* The returned values are not quite what we want:
1415 * (a) convert from s16/u16 to f32
1416 * (b) multiply by 1/256
1418 for (int def
= 0; def
< 2; ++def
) {
1419 if (!i
->defExists(def
))
1421 enum DataType type
= TYPE_S16
;
1422 if (i
->tex
.mask
== 2 || def
> 0)
1424 bld
.mkCvt(OP_CVT
, TYPE_F32
, i
->getDef(def
), type
, i
->getDef(def
));
1425 bld
.mkOp2(OP_MUL
, TYPE_F32
, i
->getDef(def
),
1426 i
->getDef(def
), bld
.loadImm(NULL
, 1.0f
/ 256));
1428 if (i
->tex
.mask
== 3) {
1429 LValue
*t
= new_LValue(func
, FILE_GPR
);
1430 bld
.mkMov(t
, i
->getDef(0));
1431 bld
.mkMov(i
->getDef(0), i
->getDef(1));
1432 bld
.mkMov(i
->getDef(1), t
);
1438 NVC0LoweringPass::handleBUFQ(Instruction
*bufq
)
1441 bufq
->setSrc(0, loadBufLength32(bufq
->getIndirect(0, 1),
1442 bufq
->getSrc(0)->reg
.fileIndex
* 16));
1443 bufq
->setIndirect(0, 0, NULL
);
1444 bufq
->setIndirect(0, 1, NULL
);
1449 NVC0LoweringPass::handleSharedATOMNVE4(Instruction
*atom
)
1451 assert(atom
->src(0).getFile() == FILE_MEMORY_SHARED
);
1453 BasicBlock
*currBB
= atom
->bb
;
1454 BasicBlock
*tryLockBB
= atom
->bb
->splitBefore(atom
, false);
1455 BasicBlock
*joinBB
= atom
->bb
->splitAfter(atom
);
1456 BasicBlock
*setAndUnlockBB
= new BasicBlock(func
);
1457 BasicBlock
*failLockBB
= new BasicBlock(func
);
1459 bld
.setPosition(currBB
, true);
1460 assert(!currBB
->joinAt
);
1461 currBB
->joinAt
= bld
.mkFlow(OP_JOINAT
, joinBB
, CC_ALWAYS
, NULL
);
1463 CmpInstruction
*pred
=
1464 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
1465 TYPE_U32
, bld
.mkImm(0), bld
.mkImm(1));
1467 bld
.mkFlow(OP_BRA
, tryLockBB
, CC_ALWAYS
, NULL
);
1468 currBB
->cfg
.attach(&tryLockBB
->cfg
, Graph::Edge::TREE
);
1470 bld
.setPosition(tryLockBB
, true);
1473 bld
.mkLoad(TYPE_U32
, atom
->getDef(0), atom
->getSrc(0)->asSym(),
1474 atom
->getIndirect(0, 0));
1475 ld
->setDef(1, bld
.getSSA(1, FILE_PREDICATE
));
1476 ld
->subOp
= NV50_IR_SUBOP_LOAD_LOCKED
;
1478 bld
.mkFlow(OP_BRA
, setAndUnlockBB
, CC_P
, ld
->getDef(1));
1479 bld
.mkFlow(OP_BRA
, failLockBB
, CC_ALWAYS
, NULL
);
1480 tryLockBB
->cfg
.attach(&failLockBB
->cfg
, Graph::Edge::CROSS
);
1481 tryLockBB
->cfg
.attach(&setAndUnlockBB
->cfg
, Graph::Edge::TREE
);
1483 tryLockBB
->cfg
.detach(&joinBB
->cfg
);
1486 bld
.setPosition(setAndUnlockBB
, true);
1488 if (atom
->subOp
== NV50_IR_SUBOP_ATOM_EXCH
) {
1489 // Read the old value, and write the new one.
1490 stVal
= atom
->getSrc(1);
1491 } else if (atom
->subOp
== NV50_IR_SUBOP_ATOM_CAS
) {
1492 CmpInstruction
*set
=
1493 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(),
1494 TYPE_U32
, ld
->getDef(0), atom
->getSrc(1));
1496 bld
.mkCmp(OP_SLCT
, CC_NE
, TYPE_U32
, (stVal
= bld
.getSSA()),
1497 TYPE_U32
, atom
->getSrc(2), ld
->getDef(0), set
->getDef(0));
1501 switch (atom
->subOp
) {
1502 case NV50_IR_SUBOP_ATOM_ADD
:
1505 case NV50_IR_SUBOP_ATOM_AND
:
1508 case NV50_IR_SUBOP_ATOM_OR
:
1511 case NV50_IR_SUBOP_ATOM_XOR
:
1514 case NV50_IR_SUBOP_ATOM_MIN
:
1517 case NV50_IR_SUBOP_ATOM_MAX
:
1525 stVal
= bld
.mkOp2v(op
, atom
->dType
, bld
.getSSA(), ld
->getDef(0),
1530 bld
.mkStore(OP_STORE
, TYPE_U32
, atom
->getSrc(0)->asSym(),
1531 atom
->getIndirect(0, 0), stVal
);
1532 st
->setDef(0, pred
->getDef(0));
1533 st
->subOp
= NV50_IR_SUBOP_STORE_UNLOCKED
;
1535 bld
.mkFlow(OP_BRA
, failLockBB
, CC_ALWAYS
, NULL
);
1536 setAndUnlockBB
->cfg
.attach(&failLockBB
->cfg
, Graph::Edge::TREE
);
1538 // Lock until the store has not been performed.
1539 bld
.setPosition(failLockBB
, true);
1540 bld
.mkFlow(OP_BRA
, tryLockBB
, CC_NOT_P
, pred
->getDef(0));
1541 bld
.mkFlow(OP_BRA
, joinBB
, CC_ALWAYS
, NULL
);
1542 failLockBB
->cfg
.attach(&tryLockBB
->cfg
, Graph::Edge::BACK
);
1543 failLockBB
->cfg
.attach(&joinBB
->cfg
, Graph::Edge::TREE
);
1545 bld
.setPosition(joinBB
, false);
1546 bld
.mkFlow(OP_JOIN
, NULL
, CC_ALWAYS
, NULL
)->fixed
= 1;
1550 NVC0LoweringPass::handleSharedATOM(Instruction
*atom
)
1552 assert(atom
->src(0).getFile() == FILE_MEMORY_SHARED
);
1554 BasicBlock
*currBB
= atom
->bb
;
1555 BasicBlock
*tryLockAndSetBB
= atom
->bb
->splitBefore(atom
, false);
1556 BasicBlock
*joinBB
= atom
->bb
->splitAfter(atom
);
1558 bld
.setPosition(currBB
, true);
1559 assert(!currBB
->joinAt
);
1560 currBB
->joinAt
= bld
.mkFlow(OP_JOINAT
, joinBB
, CC_ALWAYS
, NULL
);
1562 bld
.mkFlow(OP_BRA
, tryLockAndSetBB
, CC_ALWAYS
, NULL
);
1563 currBB
->cfg
.attach(&tryLockAndSetBB
->cfg
, Graph::Edge::TREE
);
1565 bld
.setPosition(tryLockAndSetBB
, true);
1568 bld
.mkLoad(TYPE_U32
, atom
->getDef(0), atom
->getSrc(0)->asSym(),
1569 atom
->getIndirect(0, 0));
1570 ld
->setDef(1, bld
.getSSA(1, FILE_PREDICATE
));
1571 ld
->subOp
= NV50_IR_SUBOP_LOAD_LOCKED
;
1574 if (atom
->subOp
== NV50_IR_SUBOP_ATOM_EXCH
) {
1575 // Read the old value, and write the new one.
1576 stVal
= atom
->getSrc(1);
1577 } else if (atom
->subOp
== NV50_IR_SUBOP_ATOM_CAS
) {
1578 CmpInstruction
*set
=
1579 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
1580 TYPE_U32
, ld
->getDef(0), atom
->getSrc(1));
1581 set
->setPredicate(CC_P
, ld
->getDef(1));
1584 bld
.mkOp3(OP_SELP
, TYPE_U32
, bld
.getSSA(), ld
->getDef(0),
1585 atom
->getSrc(2), set
->getDef(0));
1586 selp
->src(2).mod
= Modifier(NV50_IR_MOD_NOT
);
1587 selp
->setPredicate(CC_P
, ld
->getDef(1));
1589 stVal
= selp
->getDef(0);
1593 switch (atom
->subOp
) {
1594 case NV50_IR_SUBOP_ATOM_ADD
:
1597 case NV50_IR_SUBOP_ATOM_AND
:
1600 case NV50_IR_SUBOP_ATOM_OR
:
1603 case NV50_IR_SUBOP_ATOM_XOR
:
1606 case NV50_IR_SUBOP_ATOM_MIN
:
1609 case NV50_IR_SUBOP_ATOM_MAX
:
1618 bld
.mkOp2(op
, atom
->dType
, bld
.getSSA(), ld
->getDef(0),
1620 i
->setPredicate(CC_P
, ld
->getDef(1));
1622 stVal
= i
->getDef(0);
1626 bld
.mkStore(OP_STORE
, TYPE_U32
, atom
->getSrc(0)->asSym(),
1627 atom
->getIndirect(0, 0), stVal
);
1628 st
->setPredicate(CC_P
, ld
->getDef(1));
1629 st
->subOp
= NV50_IR_SUBOP_STORE_UNLOCKED
;
1631 // Loop until the lock is acquired.
1632 bld
.mkFlow(OP_BRA
, tryLockAndSetBB
, CC_NOT_P
, ld
->getDef(1));
1633 tryLockAndSetBB
->cfg
.attach(&tryLockAndSetBB
->cfg
, Graph::Edge::BACK
);
1634 tryLockAndSetBB
->cfg
.attach(&joinBB
->cfg
, Graph::Edge::CROSS
);
1635 bld
.mkFlow(OP_BRA
, joinBB
, CC_ALWAYS
, NULL
);
1639 bld
.setPosition(joinBB
, false);
1640 bld
.mkFlow(OP_JOIN
, NULL
, CC_ALWAYS
, NULL
)->fixed
= 1;
1644 NVC0LoweringPass::handleATOM(Instruction
*atom
)
1647 Value
*ptr
= atom
->getIndirect(0, 0), *ind
= atom
->getIndirect(0, 1), *base
;
1649 switch (atom
->src(0).getFile()) {
1650 case FILE_MEMORY_LOCAL
:
1653 case FILE_MEMORY_SHARED
:
1654 // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
1655 // operations on shared memory. For Maxwell, ATOMS is enough.
1656 if (targ
->getChipset() < NVISA_GK104_CHIPSET
)
1657 handleSharedATOM(atom
);
1658 else if (targ
->getChipset() < NVISA_GM107_CHIPSET
)
1659 handleSharedATOMNVE4(atom
);
1661 case FILE_MEMORY_GLOBAL
:
1664 assert(atom
->src(0).getFile() == FILE_MEMORY_BUFFER
);
1665 base
= loadBufInfo64(ind
, atom
->getSrc(0)->reg
.fileIndex
* 16);
1666 assert(base
->reg
.size
== 8);
1668 base
= bld
.mkOp2v(OP_ADD
, TYPE_U64
, base
, base
, ptr
);
1669 assert(base
->reg
.size
== 8);
1670 atom
->setIndirect(0, 0, base
);
1671 atom
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
1673 // Harden against out-of-bounds accesses
1674 Value
*offset
= bld
.loadImm(NULL
, atom
->getSrc(0)->reg
.data
.offset
+ typeSizeof(atom
->sType
));
1675 Value
*length
= loadBufLength32(ind
, atom
->getSrc(0)->reg
.fileIndex
* 16);
1676 Value
*pred
= new_LValue(func
, FILE_PREDICATE
);
1678 bld
.mkOp2(OP_ADD
, TYPE_U32
, offset
, offset
, ptr
);
1679 bld
.mkCmp(OP_SET
, CC_GT
, TYPE_U32
, pred
, TYPE_U32
, offset
, length
);
1680 atom
->setPredicate(CC_NOT_P
, pred
);
1681 if (atom
->defExists(0)) {
1682 Value
*zero
, *dst
= atom
->getDef(0);
1683 atom
->setDef(0, bld
.getSSA());
1685 bld
.setPosition(atom
, true);
1686 bld
.mkMov((zero
= bld
.getSSA()), bld
.mkImm(0))
1687 ->setPredicate(CC_P
, pred
);
1688 bld
.mkOp2(OP_UNION
, TYPE_U32
, dst
, atom
->getDef(0), zero
);
1694 bld
.mkOp1v(OP_RDSV
, TYPE_U32
, bld
.getScratch(), bld
.mkSysVal(sv
, 0));
1696 atom
->setSrc(0, cloneShallow(func
, atom
->getSrc(0)));
1697 atom
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
1699 base
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, base
, base
, ptr
);
1700 atom
->setIndirect(0, 1, NULL
);
1701 atom
->setIndirect(0, 0, base
);
1707 NVC0LoweringPass::handleCasExch(Instruction
*cas
, bool needCctl
)
1709 if (targ
->getChipset() < NVISA_GM107_CHIPSET
) {
1710 if (cas
->src(0).getFile() == FILE_MEMORY_SHARED
) {
1711 // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
1716 if (cas
->subOp
!= NV50_IR_SUBOP_ATOM_CAS
&&
1717 cas
->subOp
!= NV50_IR_SUBOP_ATOM_EXCH
)
1719 bld
.setPosition(cas
, true);
1722 Instruction
*cctl
= bld
.mkOp1(OP_CCTL
, TYPE_NONE
, NULL
, cas
->getSrc(0));
1723 cctl
->setIndirect(0, 0, cas
->getIndirect(0, 0));
1725 cctl
->subOp
= NV50_IR_SUBOP_CCTL_IV
;
1726 if (cas
->isPredicated())
1727 cctl
->setPredicate(cas
->cc
, cas
->getPredicate());
1730 if (cas
->subOp
== NV50_IR_SUBOP_ATOM_CAS
&&
1731 targ
->getChipset() < NVISA_GV100_CHIPSET
) {
1732 // CAS is crazy. It's 2nd source is a double reg, and the 3rd source
1733 // should be set to the high part of the double reg or bad things will
1734 // happen elsewhere in the universe.
1735 // Also, it sometimes returns the new value instead of the old one
1736 // under mysterious circumstances.
1737 Value
*dreg
= bld
.getSSA(8);
1738 bld
.setPosition(cas
, false);
1739 bld
.mkOp2(OP_MERGE
, TYPE_U64
, dreg
, cas
->getSrc(1), cas
->getSrc(2));
1740 cas
->setSrc(1, dreg
);
1741 cas
->setSrc(2, dreg
);
inline Value *
NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
{
   // Load a 32-bit resource-info word from the driver's auxiliary constant
   // buffer at base + off (ptr optionally indexes the entry).
   uint8_t b = prog->driver->io.auxCBSlot;
   off += base;

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}
inline Value *
NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
{
   // Load a 64-bit resource-info word from the auxiliary constant buffer.
   // Each entry is 16 bytes, hence the indirect index is shifted by 4.
   uint8_t b = prog->driver->io.auxCBSlot;
   off += base;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));

   return bld.
      mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);
}
inline Value *
NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
{
   // Load the 32-bit length field of a resource-info entry; the length lives
   // 8 bytes into the 16-byte entry (after the 64-bit address).
   uint8_t b = prog->driver->io.auxCBSlot;
   off += base;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
}
inline Value *
NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
{
   // Load the 64-bit base address of a shader buffer object.
   return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
}
inline Value *
NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
{
   // Load the 32-bit length of a shader buffer object.
   return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
}
inline Value *
NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
{
   // Load the 64-bit base address of a UBO.
   return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
}
inline Value *
NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
{
   // Load the 32-bit length of a UBO.
   return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
}
inline Value *
NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
{
   // Load a 32-bit multisample-info word from the MS info constant buffer.
   uint8_t b = prog->driver->io.msInfoCBSlot;
   off += prog->driver->io.msInfoBase;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}
inline Value *
NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless)
{
   // Load a 32-bit surface-info word for the given image slot. When the slot
   // is indirect (ptr != NULL), compute the entry offset in the shader.
   uint32_t base = slot * NVC0_SU_INFO__STRIDE;

   // We don't upload surface info for bindless for GM107+
   assert(!bindless || targ->getChipset() < NVISA_GM107_CHIPSET);

   if (ptr) {
      ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));
      if (bindless)
         ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(511));
      else
         ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
      // entries are NVC0_SU_INFO__STRIDE (64) bytes apart
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6));
      base = 0;
   }
   off += base;

   return loadResInfo32(ptr, off, bindless ? prog->driver->io.bindlessBase :
                        prog->driver->io.suInfoBase);
}
Value *
NVC0LoweringPass::loadMsAdjInfo32(TexInstruction::Target target, uint32_t index,
                                  int slot, Value *ind, bool bindless)
{
   // Return the log2 shift to apply to the X (index 0) or Y (index 1)
   // coordinate of a multisampled surface. For bound images this comes from
   // the uploaded surface info; for bindless on GM107+ we query the sample
   // count with TXQ and derive the shifts from it.
   if (!bindless || targ->getChipset() < NVISA_GM107_CHIPSET)
      return loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(index), bindless);

   assert(bindless);

   Value *samples = bld.getSSA();
   // this shouldn't be lowered because it's being inserted before the current instruction
   TexInstruction *tex = new_TexInstruction(func, OP_TXQ);
   tex->tex.target = target;
   tex->tex.query = TXQ_TYPE;
   tex->tex.mask = 0x4;
   tex->tex.r = 0xff;
   tex->tex.s = 0x1f;
   tex->tex.rIndirectSrc = 0;
   tex->setDef(0, samples);
   tex->setSrc(0, ind);
   tex->setSrc(1, bld.loadImm(NULL, 0));
   bld.insert(tex);

   // doesn't work with sample counts other than 1/2/4/8 but they aren't supported
   switch (index) {
   case 0: {
      // x shift: (samples + 2) / 4
      Value *tmp = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), samples, bld.mkImm(2));
      return bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(2));
   }
   case 1: {
      // y shift: 1 if samples > 2, else 0
      Value *tmp = bld.mkCmp(OP_SET, CC_GT, TYPE_U32, bld.getSSA(), TYPE_U32,
                             samples, bld.mkImm(2))->getDef(0);
      return bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(1));
   }
   default: {
      assert(false);
      return NULL;
   }
   }
}
// Pick the SUCLAMP sub-op (clamp mode and bit width) matching the surface
// target and the coordinate component being clamped.
static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
{
   switch (su->tex.target.getEnum()) {
   case TEX_TARGET_BUFFER:      return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
   case TEX_TARGET_RECT:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_1D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_1D_ARRAY:    return (c == 1) ?
                                   NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
                                   NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_2D:          return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   case TEX_TARGET_2D_MS:       return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   case TEX_TARGET_2D_ARRAY:    return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_3D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_CUBE:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_CUBE_ARRAY:  return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   default:
      assert(0);
      return 0;
   }
}
1901 NVC0LoweringPass::handleSUQ(TexInstruction
*suq
)
1903 int mask
= suq
->tex
.mask
;
1904 int dim
= suq
->tex
.target
.getDim();
1905 int arg
= dim
+ (suq
->tex
.target
.isArray() || suq
->tex
.target
.isCube());
1906 Value
*ind
= suq
->getIndirectR();
1907 int slot
= suq
->tex
.r
;
1910 for (c
= 0, d
= 0; c
< 3; ++c
, mask
>>= 1) {
1911 if (c
>= arg
|| !(mask
& 1))
1916 if (c
== 1 && suq
->tex
.target
== TEX_TARGET_1D_ARRAY
) {
1917 offset
= NVC0_SU_INFO_SIZE(2);
1919 offset
= NVC0_SU_INFO_SIZE(c
);
1921 bld
.mkMov(suq
->getDef(d
++), loadSuInfo32(ind
, slot
, offset
, suq
->tex
.bindless
));
1922 if (c
== 2 && suq
->tex
.target
.isCube())
1923 bld
.mkOp2(OP_DIV
, TYPE_U32
, suq
->getDef(d
- 1), suq
->getDef(d
- 1),
1924 bld
.loadImm(NULL
, 6));
1928 if (suq
->tex
.target
.isMS()) {
1929 Value
*ms_x
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_MS(0), suq
->tex
.bindless
);
1930 Value
*ms_y
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_MS(1), suq
->tex
.bindless
);
1931 Value
*ms
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getScratch(), ms_x
, ms_y
);
1932 bld
.mkOp2(OP_SHL
, TYPE_U32
, suq
->getDef(d
++), bld
.loadImm(NULL
, 1), ms
);
1934 bld
.mkMov(suq
->getDef(d
++), bld
.loadImm(NULL
, 1));
1943 NVC0LoweringPass::adjustCoordinatesMS(TexInstruction
*tex
)
1945 const int arg
= tex
->tex
.target
.getArgCount();
1946 int slot
= tex
->tex
.r
;
1948 if (tex
->tex
.target
== TEX_TARGET_2D_MS
)
1949 tex
->tex
.target
= TEX_TARGET_2D
;
1951 if (tex
->tex
.target
== TEX_TARGET_2D_MS_ARRAY
)
1952 tex
->tex
.target
= TEX_TARGET_2D_ARRAY
;
1956 Value
*x
= tex
->getSrc(0);
1957 Value
*y
= tex
->getSrc(1);
1958 Value
*s
= tex
->getSrc(arg
- 1);
1960 Value
*tx
= bld
.getSSA(), *ty
= bld
.getSSA(), *ts
= bld
.getSSA();
1961 Value
*ind
= tex
->getIndirectR();
1963 Value
*ms_x
= loadMsAdjInfo32(tex
->tex
.target
, 0, slot
, ind
, tex
->tex
.bindless
);
1964 Value
*ms_y
= loadMsAdjInfo32(tex
->tex
.target
, 1, slot
, ind
, tex
->tex
.bindless
);
1966 bld
.mkOp2(OP_SHL
, TYPE_U32
, tx
, x
, ms_x
);
1967 bld
.mkOp2(OP_SHL
, TYPE_U32
, ty
, y
, ms_y
);
1969 s
= bld
.mkOp2v(OP_AND
, TYPE_U32
, ts
, s
, bld
.loadImm(NULL
, 0x7));
1970 s
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, ts
, ts
, bld
.mkImm(3));
1972 Value
*dx
= loadMsInfo32(ts
, 0x0);
1973 Value
*dy
= loadMsInfo32(ts
, 0x4);
1975 bld
.mkOp2(OP_ADD
, TYPE_U32
, tx
, tx
, dx
);
1976 bld
.mkOp2(OP_ADD
, TYPE_U32
, ty
, ty
, dy
);
1980 tex
->moveSources(arg
, -1);
// Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
// They're computed from the coordinates using the surface info in c[] space.
void
NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
{
   Instruction *insn;
   const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
   const bool raw =
      su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   const bool array = su->tex.target.isArray() || su->tex.target.isCube();
   const int arg = dim + array;
   int c;
   Value *zero = bld.mkImm(0);
   Value *p1 = NULL;
   Value *v;
   Value *src[3];
   Value *bf, *eau, *off;
   Value *addr, *pred;
   Value *ind = su->getIndirectR();

   off = bld.getScratch(4);
   bf = bld.getScratch(4);
   addr = bld.getSSA(8);
   pred = bld.getScratch(1, FILE_PREDICATE);

   bld.setPosition(su, false);

   adjustCoordinatesMS(su);

   // calculate clamped coordinates
   for (c = 0; c < arg; ++c) {
      int dimc = c;

      if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) {
         // The array index is stored in the Z component for 1D arrays.
         dimc = 2;
      }

      src[c] = bld.getScratch();
      if (c == 0 && raw)
         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X, su->tex.bindless);
      else
         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc), su->tex.bindless);
      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
         ->subOp = getSuClampSubOp(su, dimc);
   }
   for (; c < 3; ++c)
      src[c] = zero;

   if (dim == 2 && !array) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
      src[2] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),
                          v, bld.loadImm(NULL, 16));

      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(2), su->tex.bindless);
      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[2], src[2], v, zero)
         ->subOp = NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   }

   // set predicate output
   if (su->tex.target == TEX_TARGET_BUFFER) {
      src[0]->getInsn()->setFlagsDef(1, pred);
   } else
   if (array) {
      p1 = bld.getSSA(1, FILE_PREDICATE);
      src[dim]->getInsn()->setFlagsDef(1, p1);
   }

   // calculate pixel offset
   if (dim == 1) {
      if (su->tex.target != TEX_TARGET_BUFFER)
         bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
   } else {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
         ->subOp = NV50_IR_SUBOP_MADSP(4,4,8); // u16l u16l u16l

      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
         ->subOp = array ?
         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
   }

   // calculate effective address part 1
   if (su->tex.target == TEX_TARGET_BUFFER) {
      if (raw) {
         bf = src[0];
      } else {
         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);
         bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
            ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
      }
   } else {
      Value *y = src[1];
      Value *z = src[2];
      uint16_t subOp = 0;

      switch (dim) {
      case 1:
         y = zero;
         z = zero;
         break;
      case 2:
         z = off;
         if (!array) {
            z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
            subOp = NV50_IR_SUBOP_SUBFM_3D;
         }
         break;
      default:
         subOp = NV50_IR_SUBOP_SUBFM_3D;
         break;
      }
      insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
      insn->subOp = subOp;
      insn->setFlagsDef(1, pred);
   }

   // part 2
   v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless);

   if (su->tex.target == TEX_TARGET_BUFFER) {
      eau = v;
   } else {
      eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
   }
   // add array layer offset
   if (array) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
      if (dim == 1)
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
            ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
      else
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
            ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
      // combine predicates
      assert(p1);
      bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
   }

   if (atom) {
      Value *lo = bf;
      if (su->tex.target == TEX_TARGET_BUFFER) {
         lo = zero;
         bld.mkMov(off, bf);
      }
      //  bf == g[] address & 0xff
      // eau == g[] address >> 8
      bld.mkOp3(OP_PERMT, TYPE_U32,  bf,   lo, bld.loadImm(NULL, 0x6540), eau);
      bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
   } else
   if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
      // Convert from u32 to u8 address format, which is what the library code
      // doing SULDP currently uses.
      // XXX: can SUEAU do this ?
      // XXX: does it matter that we don't mask high bytes in bf ?
      // Grrr.
      bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
      bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
   }

   bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);

   if (atom && su->tex.target == TEX_TARGET_BUFFER)
      bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);

   // let's just set it 0 for raw access and hope it works
   v = raw ?
      bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);

   // get rid of old coordinate sources, make space for fmt info and predicate
   su->moveSources(arg, 3 - arg);
   // set 64 bit address and 32-bit format sources
   su->setSrc(0, addr);
   su->setSrc(1, v);
   su->setSrc(2, pred);
   su->setIndirectR(NULL);

   // prevent read fault when the image is not actually bound
   CmpInstruction *pred1 =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));

   if (su->op != OP_SUSTP && su->tex.format) {
      const TexInstruction::ImgFormatDesc *format = su->tex.format;
      int blockwidth = format->bits[0] + format->bits[1] +
                       format->bits[2] + format->bits[3];

      // make sure that the format doesn't mismatch
      assert(format->components != 0);
      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0),
                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
                pred1->getDef(0));
   }
   su->setPredicate(CC_NOT_P, pred1->getDef(0));

   // TODO: initialize def values to 0 when the surface operation is not
   // performed (not needed for stores). Also, fix the "address bounds test"
   // subtests from arb_shader_image_load_store-invalid for buffers, because it
   // seems like that the predicate is not correctly set by suclamp.
}
2192 getSrcType(const TexInstruction::ImgFormatDesc
*t
, int c
)
2195 case FLOAT
: return t
->bits
[c
] == 16 ? TYPE_F16
: TYPE_F32
;
2196 case UNORM
: return t
->bits
[c
] == 8 ? TYPE_U8
: TYPE_U16
;
2197 case SNORM
: return t
->bits
[c
] == 8 ? TYPE_S8
: TYPE_S16
;
2199 return (t
->bits
[c
] == 8 ? TYPE_U8
:
2200 (t
->bits
[c
] == 16 ? TYPE_U16
: TYPE_U32
));
2202 return (t
->bits
[c
] == 8 ? TYPE_S8
:
2203 (t
->bits
[c
] == 16 ? TYPE_S16
: TYPE_S32
));
2209 getDestType(const ImgType type
) {
2220 assert(!"Impossible type");
void
NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su, Instruction **loaded)
{
   // Turn a typed surface load into a raw (generic) load plus explicit
   // unpack/convert code for each requested component. When 'loaded' is
   // given, the per-component split loads are patched instead of 'su'.
   const TexInstruction::ImgFormatDesc *format = su->tex.format;
   int width = format->bits[0] + format->bits[1] +
      format->bits[2] + format->bits[3];
   Value *untypedDst[4] = {};
   Value *typedDst[4] = {};

   // We must convert this to a generic load.
   su->op = OP_SULDB;

   su->dType = typeOfSize(width / 8);
   su->sType = TYPE_U8;

   for (int i = 0; i < width / 32; i++)
      untypedDst[i] = bld.getSSA();
   if (width < 32)
      untypedDst[0] = bld.getSSA();

   if (loaded && loaded[0]) {
      for (int i = 0; i < 4; i++) {
         if (loaded[i])
            typedDst[i] = loaded[i]->getDef(0);
      }
   } else {
      for (int i = 0; i < 4; i++) {
         typedDst[i] = su->getDef(i);
      }
   }

   // Set the untyped dsts as the su's destinations
   if (loaded && loaded[0]) {
      for (int i = 0; i < 4; i++)
         if (loaded[i])
            loaded[i]->setDef(0, untypedDst[i]);
   } else {
      for (int i = 0; i < 4; i++)
         su->setDef(i, untypedDst[i]);

      bld.setPosition(su, true);
   }

   // Unpack each component into the typed dsts
   int bits = 0;
   for (int i = 0; i < 4; bits += format->bits[i], i++) {
      if (!typedDst[i])
         continue;

      if (loaded && loaded[0])
         bld.setPosition(loaded[i], true);

      if (i >= format->components) {
         // Missing components read as 0, except alpha which reads as 1.
         if (format->type == FLOAT ||
             format->type == UNORM ||
             format->type == SNORM)
            bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
         else
            bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
         continue;
      }

      // Get just that component's data into the relevant place
      if (format->bits[i] == 32)
         bld.mkMov(typedDst[i], untypedDst[i]);
      else if (format->bits[i] == 16)
         bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
                   getSrcType(format, i), untypedDst[i / 2])
         ->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1);
      else if (format->bits[i] == 8)
         bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
                   getSrcType(format, i), untypedDst[0])->subOp = i;
      else {
         bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32],
                   bld.mkImm((bits % 32) | (format->bits[i] << 8)));
         if (format->type == UNORM || format->type == SNORM)
            bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i), typedDst[i]);
      }

      // Normalize / convert as necessary
      if (format->type == UNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
      else if (format->type == SNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
      else if (format->type == FLOAT && format->bits[i] < 16) {
         // Expand a small float into an f16 bit pattern, then convert to f32.
         bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
         bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]);
      }
   }

   if (format->bgra) {
      std::swap(typedDst[0], typedDst[2]);
   }
}
2321 NVC0LoweringPass::insertOOBSurfaceOpResult(TexInstruction
*su
)
2323 if (!su
->getPredicate())
2326 bld
.setPosition(su
, true);
2328 for (unsigned i
= 0; su
->defExists(i
); ++i
) {
2329 ValueDef
&def
= su
->def(i
);
2331 Instruction
*mov
= bld
.mkMov(bld
.getSSA(), bld
.loadImm(NULL
, 0));
2332 assert(su
->cc
== CC_NOT_P
);
2333 mov
->setPredicate(CC_P
, su
->getPredicate());
2334 Instruction
*uni
= bld
.mkOp2(OP_UNION
, TYPE_U32
, bld
.getSSA(), NULL
, mov
->getDef(0));
2336 def
.replace(uni
->getDef(0), false);
2337 uni
->setSrc(0, def
.get());
2342 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction
*su
)
2344 processSurfaceCoordsNVE4(su
);
2346 if (su
->op
== OP_SULDP
) {
2347 convertSurfaceFormat(su
, NULL
);
2348 insertOOBSurfaceOpResult(su
);
2351 if (su
->op
== OP_SUREDB
|| su
->op
== OP_SUREDP
) {
2352 assert(su
->getPredicate());
2354 bld
.mkOp2v(OP_OR
, TYPE_U8
, bld
.getScratch(1, FILE_PREDICATE
),
2355 su
->getPredicate(), su
->getSrc(2));
2357 Instruction
*red
= bld
.mkOp(OP_ATOM
, su
->dType
, bld
.getSSA());
2358 red
->subOp
= su
->subOp
;
2359 red
->setSrc(0, bld
.mkSymbol(FILE_MEMORY_GLOBAL
, 0, TYPE_U32
, 0));
2360 red
->setSrc(1, su
->getSrc(3));
2361 if (su
->subOp
== NV50_IR_SUBOP_ATOM_CAS
)
2362 red
->setSrc(2, su
->getSrc(4));
2363 red
->setIndirect(0, 0, su
->getSrc(0));
2365 // make sure to initialize dst value when the atomic operation is not
2367 Instruction
*mov
= bld
.mkMov(bld
.getSSA(), bld
.loadImm(NULL
, 0));
2369 assert(su
->cc
== CC_NOT_P
);
2370 red
->setPredicate(su
->cc
, pred
);
2371 mov
->setPredicate(CC_P
, pred
);
2373 bld
.mkOp2(OP_UNION
, TYPE_U32
, su
->getDef(0),
2374 red
->getDef(0), mov
->getDef(0));
2376 delete_Instruction(bld
.getProgram(), su
);
2377 handleCasExch(red
, true);
2380 if (su
->op
== OP_SUSTB
|| su
->op
== OP_SUSTP
)
2381 su
->sType
= (su
->tex
.target
== TEX_TARGET_BUFFER
) ? TYPE_U32
: TYPE_U8
;
2385 NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction
*su
)
2387 const int slot
= su
->tex
.r
;
2388 const int dim
= su
->tex
.target
.getDim();
2389 const int arg
= dim
+ (su
->tex
.target
.isArray() || su
->tex
.target
.isCube());
2391 Value
*zero
= bld
.mkImm(0);
2394 Value
*ind
= su
->getIndirectR();
2396 bld
.setPosition(su
, false);
2398 adjustCoordinatesMS(su
);
2402 ptr
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getSSA(), ind
, bld
.mkImm(su
->tex
.r
));
2403 ptr
= bld
.mkOp2v(OP_AND
, TYPE_U32
, bld
.getSSA(), ptr
, bld
.mkImm(7));
2404 su
->setIndirectR(ptr
);
2407 // get surface coordinates
2408 for (c
= 0; c
< arg
; ++c
)
2409 src
[c
] = su
->getSrc(c
);
2413 // calculate pixel offset
2414 if (su
->op
== OP_SULDP
|| su
->op
== OP_SUREDP
) {
2415 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_BSIZE
, su
->tex
.bindless
);
2416 su
->setSrc(0, bld
.mkOp2v(OP_MUL
, TYPE_U32
, bld
.getSSA(), src
[0], v
));
2419 // add array layer offset
2420 if (su
->tex
.target
.isArray() || su
->tex
.target
.isCube()) {
2421 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_ARRAY
, su
->tex
.bindless
);
2423 su
->setSrc(2, bld
.mkOp2v(OP_MUL
, TYPE_U32
, bld
.getSSA(), src
[2], v
));
2426 // prevent read fault when the image is not actually bound
2427 CmpInstruction
*pred
=
2428 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
2429 TYPE_U32
, bld
.mkImm(0),
2430 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_ADDR
, su
->tex
.bindless
));
2431 if (su
->op
!= OP_SUSTP
&& su
->tex
.format
) {
2432 const TexInstruction::ImgFormatDesc
*format
= su
->tex
.format
;
2433 int blockwidth
= format
->bits
[0] + format
->bits
[1] +
2434 format
->bits
[2] + format
->bits
[3];
2436 assert(format
->components
!= 0);
2437 // make sure that the format doesn't mismatch when it's not FMT_NONE
2438 bld
.mkCmp(OP_SET_OR
, CC_NE
, TYPE_U32
, pred
->getDef(0),
2439 TYPE_U32
, bld
.loadImm(NULL
, blockwidth
/ 8),
2440 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_BSIZE
, su
->tex
.bindless
),
2443 su
->setPredicate(CC_NOT_P
, pred
->getDef(0));
2447 NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction
*su
)
2449 if (su
->tex
.target
== TEX_TARGET_1D_ARRAY
) {
2450 /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY
2451 * will simplify the lowering pass and the texture constraints. */
2452 su
->moveSources(1, 1);
2453 su
->setSrc(1, bld
.loadImm(NULL
, 0));
2454 su
->tex
.target
= TEX_TARGET_2D_ARRAY
;
2457 processSurfaceCoordsNVC0(su
);
2459 if (su
->op
== OP_SULDP
) {
2460 convertSurfaceFormat(su
, NULL
);
2461 insertOOBSurfaceOpResult(su
);
2464 if (su
->op
== OP_SUREDB
|| su
->op
== OP_SUREDP
) {
2465 const int dim
= su
->tex
.target
.getDim();
2466 const int arg
= dim
+ (su
->tex
.target
.isArray() || su
->tex
.target
.isCube());
2467 LValue
*addr
= bld
.getSSA(8);
2468 Value
*def
= su
->getDef(0);
2472 // Set the destination to the address
2473 su
->dType
= TYPE_U64
;
2474 su
->setDef(0, addr
);
2475 su
->setDef(1, su
->getPredicate());
2477 bld
.setPosition(su
, true);
2479 // Perform the atomic op
2480 Instruction
*red
= bld
.mkOp(OP_ATOM
, su
->sType
, bld
.getSSA());
2481 red
->subOp
= su
->subOp
;
2482 red
->setSrc(0, bld
.mkSymbol(FILE_MEMORY_GLOBAL
, 0, su
->sType
, 0));
2483 red
->setSrc(1, su
->getSrc(arg
));
2484 if (red
->subOp
== NV50_IR_SUBOP_ATOM_CAS
)
2485 red
->setSrc(2, su
->getSrc(arg
+ 1));
2486 red
->setIndirect(0, 0, addr
);
2488 // make sure to initialize dst value when the atomic operation is not
2490 Instruction
*mov
= bld
.mkMov(bld
.getSSA(), bld
.loadImm(NULL
, 0));
2492 assert(su
->cc
== CC_NOT_P
);
2493 red
->setPredicate(su
->cc
, su
->getPredicate());
2494 mov
->setPredicate(CC_P
, su
->getPredicate());
2496 bld
.mkOp2(OP_UNION
, TYPE_U32
, def
, red
->getDef(0), mov
->getDef(0));
2498 handleCasExch(red
, false);
2503 NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction
*su
, Instruction
*ret
[4])
2505 const int slot
= su
->tex
.r
;
2506 const int dim
= su
->tex
.target
.getDim();
2507 const bool array
= su
->tex
.target
.isArray() || su
->tex
.target
.isCube();
2508 const int arg
= dim
+ array
;
2509 Value
*ind
= su
->getIndirectR();
2511 Instruction
*pred
= NULL
, *pred2d
= NULL
;
2514 bld
.setPosition(su
, false);
2516 adjustCoordinatesMS(su
);
2518 // add texture handle
2524 pos
= (su
->subOp
== NV50_IR_SUBOP_ATOM_CAS
) ? 2 : 1;
2531 if (dim
== 2 && !array
) {
2532 // This might be a 2d slice of a 3d texture, try to load the z
2535 if (!su
->tex
.bindless
)
2536 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_UNK1C
, su
->tex
.bindless
);
2538 v
= bld
.mkOp2v(OP_SHR
, TYPE_U32
, bld
.getSSA(), ind
, bld
.mkImm(11));
2539 Value
*is_3d
= bld
.mkOp2v(OP_AND
, TYPE_U32
, bld
.getSSA(), v
, bld
.mkImm(1));
2540 pred2d
= bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
2541 TYPE_U32
, bld
.mkImm(0), is_3d
);
2543 bld
.mkOp2(OP_SHR
, TYPE_U32
, v
, v
, bld
.loadImm(NULL
, 16));
2544 su
->moveSources(dim
, 1);
2546 su
->tex
.target
= nv50_ir::TEX_TARGET_3D
;
2550 if (su
->tex
.bindless
)
2551 handle
= bld
.mkOp2v(OP_AND
, TYPE_U32
, bld
.getSSA(), ind
, bld
.mkImm(2047));
2553 handle
= loadTexHandle(ind
, slot
+ 32);
2555 su
->setSrc(arg
+ pos
, handle
);
2557 // The address check doesn't make sense here. The format check could make
2558 // sense but it's a bit of a pain.
2559 if (!su
->tex
.bindless
) {
2560 // prevent read fault when the image is not actually bound
2562 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
2563 TYPE_U32
, bld
.mkImm(0),
2564 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_ADDR
, su
->tex
.bindless
));
2565 if (su
->op
!= OP_SUSTP
&& su
->tex
.format
) {
2566 const TexInstruction::ImgFormatDesc
*format
= su
->tex
.format
;
2567 int blockwidth
= format
->bits
[0] + format
->bits
[1] +
2568 format
->bits
[2] + format
->bits
[3];
2570 assert(format
->components
!= 0);
2571 // make sure that the format doesn't mismatch when it's not FMT_NONE
2572 bld
.mkCmp(OP_SET_OR
, CC_NE
, TYPE_U32
, pred
->getDef(0),
2573 TYPE_U32
, bld
.loadImm(NULL
, blockwidth
/ 8),
2574 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_BSIZE
, su
->tex
.bindless
),
2579 // Now we have "pred" which (optionally) contains whether to do the surface
2580 // op at all, and a "pred2d" which indicates that, in case of doing the
2581 // surface op, we have to create a 2d and 3d version, conditioned on pred2d.
2582 TexInstruction
*su2d
= NULL
;
2584 su2d
= cloneForward(func
, su
)->asTex();
2585 for (unsigned i
= 0; su
->defExists(i
); ++i
)
2586 su2d
->setDef(i
, bld
.getSSA());
2587 su2d
->moveSources(dim
+ 1, -1);
2588 su2d
->tex
.target
= nv50_ir::TEX_TARGET_2D
;
2590 if (pred2d
&& pred
) {
2591 Instruction
*pred3d
= bld
.mkOp2(OP_AND
, TYPE_U8
,
2592 bld
.getSSA(1, FILE_PREDICATE
),
2593 pred
->getDef(0), pred2d
->getDef(0));
2594 pred3d
->src(0).mod
= Modifier(NV50_IR_MOD_NOT
);
2595 pred3d
->src(1).mod
= Modifier(NV50_IR_MOD_NOT
);
2596 su
->setPredicate(CC_P
, pred3d
->getDef(0));
2597 pred2d
= bld
.mkOp2(OP_AND
, TYPE_U8
, bld
.getSSA(1, FILE_PREDICATE
),
2598 pred
->getDef(0), pred2d
->getDef(0));
2599 pred2d
->src(0).mod
= Modifier(NV50_IR_MOD_NOT
);
2601 su
->setPredicate(CC_NOT_P
, pred
->getDef(0));
2602 } else if (pred2d
) {
2603 su
->setPredicate(CC_NOT_P
, pred2d
->getDef(0));
2606 su2d
->setPredicate(CC_P
, pred2d
->getDef(0));
2609 // Create a UNION so that RA assigns the same registers
2610 bld
.setPosition(su
, true);
2611 for (unsigned i
= 0; su
->defExists(i
); ++i
) {
2614 ValueDef
&def
= su
->def(i
);
2615 ValueDef
&def2
= su2d
->def(i
);
2616 Instruction
*mov
= NULL
;
2619 mov
= bld
.mkMov(bld
.getSSA(), bld
.loadImm(NULL
, 0));
2620 mov
->setPredicate(CC_P
, pred
->getDef(0));
2623 Instruction
*uni
= ret
[i
] = bld
.mkOp2(OP_UNION
, TYPE_U32
,
2626 def
.replace(uni
->getDef(0), false);
2627 uni
->setSrc(0, def
.get());
2629 uni
->setSrc(2, mov
->getDef(0));
2632 // Create a UNION so that RA assigns the same registers
2633 bld
.setPosition(su
, true);
2634 for (unsigned i
= 0; su
->defExists(i
); ++i
) {
2637 ValueDef
&def
= su
->def(i
);
2639 Instruction
*mov
= bld
.mkMov(bld
.getSSA(), bld
.loadImm(NULL
, 0));
2640 mov
->setPredicate(CC_P
, pred
->getDef(0));
2642 Instruction
*uni
= ret
[i
] = bld
.mkOp2(OP_UNION
, TYPE_U32
,
2644 NULL
, mov
->getDef(0));
2645 def
.replace(uni
->getDef(0), false);
2646 uni
->setSrc(0, def
.get());
2654 NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction
*su
)
2656 // processSurfaceCoords also takes care of fixing up the outputs and
2657 // union'ing them with 0 as necessary. Additionally it may create a second
2658 // surface which needs some of the similar fixups.
2660 Instruction
*loaded
[4] = {};
2661 TexInstruction
*su2
= processSurfaceCoordsGM107(su
, loaded
);
2663 if (su
->op
== OP_SULDP
) {
2664 convertSurfaceFormat(su
, loaded
);
2667 if (su
->op
== OP_SUREDP
) {
2671 // If we fixed up the type of the regular surface load instruction, we also
2672 // have to fix up the copy.
2675 su2
->dType
= su
->dType
;
2676 su2
->sType
= su
->sType
;
2681 NVC0LoweringPass::handleWRSV(Instruction
*i
)
2687 // must replace, $sreg are not writeable
2688 addr
= targ
->getSVAddress(FILE_SHADER_OUTPUT
, i
->getSrc(0)->asSym());
2691 sym
= bld
.mkSymbol(FILE_SHADER_OUTPUT
, 0, i
->sType
, addr
);
2693 st
= bld
.mkStore(OP_EXPORT
, i
->dType
, sym
, i
->getIndirect(0, 0),
2695 st
->perPatch
= i
->perPatch
;
2697 bld
.getBB()->remove(i
);
2702 NVC0LoweringPass::handleLDST(Instruction
*i
)
2704 if (i
->src(0).getFile() == FILE_SHADER_INPUT
) {
2705 if (prog
->getType() == Program::TYPE_COMPUTE
) {
2706 i
->getSrc(0)->reg
.file
= FILE_MEMORY_CONST
;
2707 i
->getSrc(0)->reg
.fileIndex
= 0;
2709 if (prog
->getType() == Program::TYPE_GEOMETRY
&&
2710 i
->src(0).isIndirect(0)) {
2711 // XXX: this assumes vec4 units
2712 Value
*ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(),
2713 i
->getIndirect(0, 0), bld
.mkImm(4));
2714 i
->setIndirect(0, 0, ptr
);
2718 assert(prog
->getType() != Program::TYPE_FRAGMENT
); // INTERP
2720 } else if (i
->src(0).getFile() == FILE_MEMORY_CONST
) {
2721 int8_t fileIndex
= i
->getSrc(0)->reg
.fileIndex
- 1;
2722 Value
*ind
= i
->getIndirect(0, 1);
2724 if (targ
->getChipset() >= NVISA_GK104_CHIPSET
&&
2725 prog
->getType() == Program::TYPE_COMPUTE
&&
2726 (fileIndex
>= 6 || ind
)) {
2727 // The launch descriptor only allows to set up 8 CBs, but OpenGL
2728 // requires at least 12 UBOs. To bypass this limitation, for constant
2729 // buffers 7+, we store the addrs into the driver constbuf and we
2730 // directly load from the global memory.
2732 // Clamp the UBO index when an indirect access is used to avoid
2733 // loading information from the wrong place in the driver cb.
2734 // TODO - synchronize the max with the driver.
2735 ind
= bld
.mkOp2v(OP_MIN
, TYPE_U32
, bld
.getSSA(),
2736 bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getSSA(),
2737 ind
, bld
.loadImm(NULL
, fileIndex
)),
2738 bld
.loadImm(NULL
, 13));
2742 Value
*offset
= bld
.loadImm(NULL
, i
->getSrc(0)->reg
.data
.offset
+ typeSizeof(i
->sType
));
2743 Value
*ptr
= loadUboInfo64(ind
, fileIndex
* 16);
2744 Value
*length
= loadUboLength32(ind
, fileIndex
* 16);
2745 Value
*pred
= new_LValue(func
, FILE_PREDICATE
);
2746 if (i
->src(0).isIndirect(0)) {
2747 bld
.mkOp2(OP_ADD
, TYPE_U64
, ptr
, ptr
, i
->getIndirect(0, 0));
2748 bld
.mkOp2(OP_ADD
, TYPE_U32
, offset
, offset
, i
->getIndirect(0, 0));
2750 i
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
2751 i
->setIndirect(0, 1, NULL
);
2752 i
->setIndirect(0, 0, ptr
);
2753 bld
.mkCmp(OP_SET
, CC_GT
, TYPE_U32
, pred
, TYPE_U32
, offset
, length
);
2754 i
->setPredicate(CC_NOT_P
, pred
);
2755 Value
*zero
, *dst
= i
->getDef(0);
2756 i
->setDef(0, bld
.getSSA());
2758 bld
.setPosition(i
, true);
2759 bld
.mkMov((zero
= bld
.getSSA()), bld
.mkImm(0))
2760 ->setPredicate(CC_P
, pred
);
2761 bld
.mkOp2(OP_UNION
, TYPE_U32
, dst
, i
->getDef(0), zero
);
2762 } else if (i
->src(0).isIndirect(1)) {
2764 if (i
->src(0).isIndirect(0))
2765 ptr
= bld
.mkOp3v(OP_INSBF
, TYPE_U32
, bld
.getSSA(),
2766 i
->getIndirect(0, 1), bld
.mkImm(0x1010),
2767 i
->getIndirect(0, 0));
2769 ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(),
2770 i
->getIndirect(0, 1), bld
.mkImm(16));
2771 i
->setIndirect(0, 1, NULL
);
2772 i
->setIndirect(0, 0, ptr
);
2773 i
->subOp
= NV50_IR_SUBOP_LDC_IS
;
2775 } else if (i
->src(0).getFile() == FILE_SHADER_OUTPUT
) {
2776 assert(prog
->getType() == Program::TYPE_TESSELLATION_CONTROL
);
2778 } else if (i
->src(0).getFile() == FILE_MEMORY_BUFFER
) {
2779 Value
*ind
= i
->getIndirect(0, 1);
2780 Value
*ptr
= loadBufInfo64(ind
, i
->getSrc(0)->reg
.fileIndex
* 16);
2781 // XXX come up with a way not to do this for EVERY little access but
2782 // rather to batch these up somehow. Unfortunately we've lost the
2783 // information about the field width by the time we get here.
2784 Value
*offset
= bld
.loadImm(NULL
, i
->getSrc(0)->reg
.data
.offset
+ typeSizeof(i
->sType
));
2785 Value
*length
= loadBufLength32(ind
, i
->getSrc(0)->reg
.fileIndex
* 16);
2786 Value
*pred
= new_LValue(func
, FILE_PREDICATE
);
2787 if (i
->src(0).isIndirect(0)) {
2788 bld
.mkOp2(OP_ADD
, TYPE_U64
, ptr
, ptr
, i
->getIndirect(0, 0));
2789 bld
.mkOp2(OP_ADD
, TYPE_U32
, offset
, offset
, i
->getIndirect(0, 0));
2791 i
->setIndirect(0, 1, NULL
);
2792 i
->setIndirect(0, 0, ptr
);
2793 i
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
2794 bld
.mkCmp(OP_SET
, CC_GT
, TYPE_U32
, pred
, TYPE_U32
, offset
, length
);
2795 i
->setPredicate(CC_NOT_P
, pred
);
2796 if (i
->defExists(0)) {
2797 Value
*zero
, *dst
= i
->getDef(0);
2798 i
->setDef(0, bld
.getSSA());
2800 bld
.setPosition(i
, true);
2801 bld
.mkMov((zero
= bld
.getSSA()), bld
.mkImm(0))
2802 ->setPredicate(CC_P
, pred
);
2803 bld
.mkOp2(OP_UNION
, TYPE_U32
, dst
, i
->getDef(0), zero
);
2809 NVC0LoweringPass::readTessCoord(LValue
*dst
, int c
)
2811 Value
*laneid
= bld
.getSSA();
2814 bld
.mkOp1(OP_RDSV
, TYPE_U32
, laneid
, bld
.mkSysVal(SV_LANEID
, 0));
2825 if (prog
->driver_out
->prop
.tp
.domain
!= PIPE_PRIM_TRIANGLES
) {
2826 bld
.mkMov(dst
, bld
.loadImm(NULL
, 0));
2833 bld
.mkFetch(x
, TYPE_F32
, FILE_SHADER_OUTPUT
, 0x2f0, NULL
, laneid
);
2835 bld
.mkFetch(y
, TYPE_F32
, FILE_SHADER_OUTPUT
, 0x2f4, NULL
, laneid
);
2838 bld
.mkOp2(OP_ADD
, TYPE_F32
, dst
, x
, y
);
2839 bld
.mkOp2(OP_SUB
, TYPE_F32
, dst
, bld
.loadImm(NULL
, 1.0f
), dst
);
2844 NVC0LoweringPass::handleRDSV(Instruction
*i
)
2846 Symbol
*sym
= i
->getSrc(0)->asSym();
2847 const SVSemantic sv
= sym
->reg
.data
.sv
.sv
;
2850 uint32_t addr
= targ
->getSVAddress(FILE_SHADER_INPUT
, sym
);
2852 if (addr
>= 0x400) {
2854 if (sym
->reg
.data
.sv
.index
== 3) {
2855 // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
2857 i
->setSrc(0, bld
.mkImm((sv
== SV_NTID
|| sv
== SV_NCTAID
) ? 1 : 0));
2860 // Help CSE combine TID fetches
2861 Value
*tid
= bld
.mkOp1v(OP_RDSV
, TYPE_U32
, bld
.getScratch(),
2862 bld
.mkSysVal(SV_COMBINED_TID
, 0));
2865 switch (sym
->reg
.data
.sv
.index
) {
2866 case 0: i
->setSrc(1, bld
.mkImm(0x1000)); break;
2867 case 1: i
->setSrc(1, bld
.mkImm(0x0a10)); break;
2868 case 2: i
->setSrc(1, bld
.mkImm(0x061a)); break;
2871 if (sv
== SV_VERTEX_COUNT
) {
2872 bld
.setPosition(i
, true);
2873 bld
.mkOp2(OP_EXTBF
, TYPE_U32
, i
->getDef(0), i
->getDef(0), bld
.mkImm(0x808));
2880 assert(prog
->getType() == Program::TYPE_FRAGMENT
);
2881 if (i
->srcExists(1)) {
2882 // Pass offset through to the interpolation logic
2883 ld
= bld
.mkInterp(NV50_IR_INTERP_LINEAR
| NV50_IR_INTERP_OFFSET
,
2884 i
->getDef(0), addr
, NULL
);
2885 ld
->setSrc(1, i
->getSrc(1));
2887 bld
.mkInterp(NV50_IR_INTERP_LINEAR
, i
->getDef(0), addr
, NULL
);
2892 Value
*face
= i
->getDef(0);
2893 bld
.mkInterp(NV50_IR_INTERP_FLAT
, face
, addr
, NULL
);
2894 if (i
->dType
== TYPE_F32
) {
2895 bld
.mkOp2(OP_OR
, TYPE_U32
, face
, face
, bld
.mkImm(0x00000001));
2896 bld
.mkOp1(OP_NEG
, TYPE_S32
, face
, face
);
2897 bld
.mkCvt(OP_CVT
, TYPE_F32
, face
, TYPE_S32
, face
);
2902 assert(prog
->getType() == Program::TYPE_TESSELLATION_EVAL
);
2903 readTessCoord(i
->getDef(0)->asLValue(), i
->getSrc(0)->reg
.data
.sv
.index
);
2908 assert(targ
->getChipset() >= NVISA_GK104_CHIPSET
); // mov $sreg otherwise
2909 if (sym
->reg
.data
.sv
.index
== 3) {
2911 i
->setSrc(0, bld
.mkImm(sv
== SV_GRIDID
? 0 : 1));
2916 addr
+= prog
->driver
->prop
.cp
.gridInfoBase
;
2917 bld
.mkLoad(TYPE_U32
, i
->getDef(0),
2918 bld
.mkSymbol(FILE_MEMORY_CONST
, prog
->driver
->io
.auxCBSlot
,
2919 TYPE_U32
, addr
), NULL
);
2921 case SV_SAMPLE_INDEX
:
2922 // TODO: Properly pass source as an address in the PIX address space
2923 // (which can be of the form [r0+offset]). But this is currently
2925 ld
= bld
.mkOp1(OP_PIXLD
, TYPE_U32
, i
->getDef(0), bld
.mkImm(0));
2926 ld
->subOp
= NV50_IR_SUBOP_PIXLD_SAMPLEID
;
2928 case SV_SAMPLE_POS
: {
2929 Value
*sampleID
= bld
.getScratch();
2930 ld
= bld
.mkOp1(OP_PIXLD
, TYPE_U32
, sampleID
, bld
.mkImm(0));
2931 ld
->subOp
= NV50_IR_SUBOP_PIXLD_SAMPLEID
;
2932 Value
*offset
= calculateSampleOffset(sampleID
);
2934 assert(prog
->driver_out
->prop
.fp
.readsSampleLocations
);
2936 if (targ
->getChipset() >= NVISA_GM200_CHIPSET
) {
2937 bld
.mkLoad(TYPE_F32
,
2940 FILE_MEMORY_CONST
, prog
->driver
->io
.auxCBSlot
,
2941 TYPE_U32
, prog
->driver
->io
.sampleInfoBase
),
2943 bld
.mkOp2(OP_EXTBF
, TYPE_U32
, i
->getDef(0), i
->getDef(0),
2944 bld
.mkImm(0x040c + sym
->reg
.data
.sv
.index
* 16));
2945 bld
.mkCvt(OP_CVT
, TYPE_F32
, i
->getDef(0), TYPE_U32
, i
->getDef(0));
2946 bld
.mkOp2(OP_MUL
, TYPE_F32
, i
->getDef(0), i
->getDef(0), bld
.mkImm(1.0f
/ 16.0f
));
2948 bld
.mkLoad(TYPE_F32
,
2951 FILE_MEMORY_CONST
, prog
->driver
->io
.auxCBSlot
,
2952 TYPE_U32
, prog
->driver
->io
.sampleInfoBase
+
2953 4 * sym
->reg
.data
.sv
.index
),
2958 case SV_SAMPLE_MASK
: {
2959 ld
= bld
.mkOp1(OP_PIXLD
, TYPE_U32
, i
->getDef(0), bld
.mkImm(0));
2960 ld
->subOp
= NV50_IR_SUBOP_PIXLD_COVMASK
;
2961 Instruction
*sampleid
=
2962 bld
.mkOp1(OP_PIXLD
, TYPE_U32
, bld
.getSSA(), bld
.mkImm(0));
2963 sampleid
->subOp
= NV50_IR_SUBOP_PIXLD_SAMPLEID
;
2965 bld
.mkOp2v(OP_AND
, TYPE_U32
, bld
.getSSA(), ld
->getDef(0),
2966 bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(),
2967 bld
.loadImm(NULL
, 1), sampleid
->getDef(0)));
2968 if (prog
->persampleInvocation
) {
2969 bld
.mkMov(i
->getDef(0), masked
);
2971 bld
.mkOp3(OP_SELP
, TYPE_U32
, i
->getDef(0), ld
->getDef(0), masked
,
2978 case SV_BASEINSTANCE
:
2980 ld
= bld
.mkLoad(TYPE_U32
, i
->getDef(0),
2981 bld
.mkSymbol(FILE_MEMORY_CONST
,
2982 prog
->driver
->io
.auxCBSlot
,
2984 prog
->driver
->io
.drawInfoBase
+
2985 4 * (sv
- SV_BASEVERTEX
)),
2989 if (prog
->getType() == Program::TYPE_TESSELLATION_EVAL
&& !i
->perPatch
)
2990 vtx
= bld
.mkOp1v(OP_PFETCH
, TYPE_U32
, bld
.getSSA(), bld
.mkImm(0));
2991 if (prog
->getType() == Program::TYPE_FRAGMENT
) {
2992 bld
.mkInterp(NV50_IR_INTERP_FLAT
, i
->getDef(0), addr
, NULL
);
2994 ld
= bld
.mkFetch(i
->getDef(0), i
->dType
,
2995 FILE_SHADER_INPUT
, addr
, i
->getIndirect(0, 0), vtx
);
2996 ld
->perPatch
= i
->perPatch
;
3000 bld
.getBB()->remove(i
);
3005 NVC0LoweringPass::handleDIV(Instruction
*i
)
3007 if (!isFloatType(i
->dType
))
3009 bld
.setPosition(i
, false);
3010 Instruction
*rcp
= bld
.mkOp1(OP_RCP
, i
->dType
, bld
.getSSA(typeSizeof(i
->dType
)), i
->getSrc(1));
3012 i
->setSrc(1, rcp
->getDef(0));
3017 NVC0LoweringPass::handleMOD(Instruction
*i
)
3019 if (!isFloatType(i
->dType
))
3021 LValue
*value
= bld
.getScratch(typeSizeof(i
->dType
));
3022 bld
.mkOp1(OP_RCP
, i
->dType
, value
, i
->getSrc(1));
3023 bld
.mkOp2(OP_MUL
, i
->dType
, value
, i
->getSrc(0), value
);
3024 bld
.mkOp1(OP_TRUNC
, i
->dType
, value
, value
);
3025 bld
.mkOp2(OP_MUL
, i
->dType
, value
, i
->getSrc(1), value
);
3027 i
->setSrc(1, value
);
3032 NVC0LoweringPass::handleSQRT(Instruction
*i
)
3034 if (targ
->isOpSupported(OP_SQRT
, i
->dType
))
3037 if (i
->dType
== TYPE_F64
) {
3038 Value
*pred
= bld
.getSSA(1, FILE_PREDICATE
);
3039 Value
*zero
= bld
.loadImm(NULL
, 0.0);
3040 Value
*dst
= bld
.getSSA(8);
3041 bld
.mkOp1(OP_RSQ
, i
->dType
, dst
, i
->getSrc(0));
3042 bld
.mkCmp(OP_SET
, CC_LE
, i
->dType
, pred
, i
->dType
, i
->getSrc(0), zero
);
3043 bld
.mkOp3(OP_SELP
, TYPE_U64
, dst
, zero
, dst
, pred
);
3046 // TODO: Handle this properly with a library function
3048 bld
.setPosition(i
, true);
3050 bld
.mkOp1(OP_RCP
, i
->dType
, i
->getDef(0), i
->getDef(0));
3057 NVC0LoweringPass::handlePOW(Instruction
*i
)
3059 LValue
*val
= bld
.getScratch();
3061 bld
.mkOp1(OP_LG2
, TYPE_F32
, val
, i
->getSrc(0));
3062 bld
.mkOp2(OP_MUL
, TYPE_F32
, val
, i
->getSrc(1), val
)->dnz
= 1;
3063 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, val
, val
);
3073 NVC0LoweringPass::handleEXPORT(Instruction
*i
)
3075 if (prog
->getType() == Program::TYPE_FRAGMENT
) {
3076 int id
= i
->getSrc(0)->reg
.data
.offset
/ 4;
3078 if (i
->src(0).isIndirect(0)) // TODO, ugly
3081 i
->subOp
= NV50_IR_SUBOP_MOV_FINAL
;
3082 i
->src(0).set(i
->src(1));
3084 i
->setDef(0, new_LValue(func
, FILE_GPR
));
3085 i
->getDef(0)->reg
.data
.id
= id
;
3087 prog
->maxGPR
= MAX2(prog
->maxGPR
, id
);
3089 if (prog
->getType() == Program::TYPE_GEOMETRY
) {
3090 i
->setIndirect(0, 1, gpEmitAddress
);
3096 NVC0LoweringPass::handleOUT(Instruction
*i
)
3098 Instruction
*prev
= i
->prev
;
3099 ImmediateValue stream
, prevStream
;
3101 // Only merge if the stream ids match. Also, note that the previous
3102 // instruction would have already been lowered, so we take arg1 from it.
3103 if (i
->op
== OP_RESTART
&& prev
&& prev
->op
== OP_EMIT
&&
3104 i
->src(0).getImmediate(stream
) &&
3105 prev
->src(1).getImmediate(prevStream
) &&
3106 stream
.reg
.data
.u32
== prevStream
.reg
.data
.u32
) {
3107 i
->prev
->subOp
= NV50_IR_SUBOP_EMIT_RESTART
;
3108 delete_Instruction(prog
, i
);
3110 assert(gpEmitAddress
);
3111 i
->setDef(0, gpEmitAddress
);
3112 i
->setSrc(1, i
->getSrc(0));
3113 i
->setSrc(0, gpEmitAddress
);
3119 NVC0LoweringPass::calculateSampleOffset(Value
*sampleID
)
3121 Value
*offset
= bld
.getScratch();
3122 if (targ
->getChipset() >= NVISA_GM200_CHIPSET
) {
3123 // Sample location offsets (in bytes) are calculated like so:
3124 // offset = (SV_POSITION.y % 4 * 2) + (SV_POSITION.x % 2)
3125 // offset = offset * 32 + sampleID % 8 * 4;
3126 // which is equivalent to:
3127 // offset = (SV_POSITION.y & 0x3) << 6 + (SV_POSITION.x & 0x1) << 5;
3128 // offset += sampleID << 2
3130 // The second operand (src1) of the INSBF instructions are like so:
3131 // 0xssll where ss is the size and ll is the offset.
3132 // so: dest = src2 | (src0 & (1 << ss - 1)) << ll
3134 // Add sample ID (offset = (sampleID & 0x7) << 2)
3135 bld
.mkOp3(OP_INSBF
, TYPE_U32
, offset
, sampleID
, bld
.mkImm(0x0302), bld
.mkImm(0x0));
3137 Symbol
*xSym
= bld
.mkSysVal(SV_POSITION
, 0);
3138 Symbol
*ySym
= bld
.mkSysVal(SV_POSITION
, 1);
3139 Value
*coord
= bld
.getScratch();
3141 // Add X coordinate (offset |= (SV_POSITION.x & 0x1) << 5)
3142 bld
.mkInterp(NV50_IR_INTERP_LINEAR
, coord
,
3143 targ
->getSVAddress(FILE_SHADER_INPUT
, xSym
), NULL
);
3144 bld
.mkCvt(OP_CVT
, TYPE_U32
, coord
, TYPE_F32
, coord
)
3146 bld
.mkOp3(OP_INSBF
, TYPE_U32
, offset
, coord
, bld
.mkImm(0x0105), offset
);
3148 // Add Y coordinate (offset |= (SV_POSITION.y & 0x3) << 6)
3149 bld
.mkInterp(NV50_IR_INTERP_LINEAR
, coord
,
3150 targ
->getSVAddress(FILE_SHADER_INPUT
, ySym
), NULL
);
3151 bld
.mkCvt(OP_CVT
, TYPE_U32
, coord
, TYPE_F32
, coord
)
3153 bld
.mkOp3(OP_INSBF
, TYPE_U32
, offset
, coord
, bld
.mkImm(0x0206), offset
);
3155 bld
.mkOp2(OP_SHL
, TYPE_U32
, offset
, sampleID
, bld
.mkImm(3));
3160 // Handle programmable sample locations for GM20x+
3162 NVC0LoweringPass::handlePIXLD(Instruction
*i
)
3164 if (i
->subOp
!= NV50_IR_SUBOP_PIXLD_OFFSET
)
3166 if (targ
->getChipset() < NVISA_GM200_CHIPSET
)
3169 assert(prog
->driver_out
->prop
.fp
.readsSampleLocations
);
3171 bld
.mkLoad(TYPE_F32
,
3174 FILE_MEMORY_CONST
, prog
->driver
->io
.auxCBSlot
,
3175 TYPE_U32
, prog
->driver
->io
.sampleInfoBase
),
3176 calculateSampleOffset(i
->getSrc(0)));
3178 bld
.getBB()->remove(i
);
3181 // Generate a binary predicate if an instruction is predicated by
3182 // e.g. an f32 value.
3184 NVC0LoweringPass::checkPredicate(Instruction
*insn
)
3186 Value
*pred
= insn
->getPredicate();
3189 if (!pred
|| pred
->reg
.file
== FILE_PREDICATE
)
3191 pdst
= new_LValue(func
, FILE_PREDICATE
);
3193 // CAUTION: don't use pdst->getInsn, the definition might not be unique,
3194 // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
3196 bld
.mkCmp(OP_SET
, CC_NEU
, insn
->dType
, pdst
, insn
->dType
, bld
.mkImm(0), pred
);
3198 insn
->setPredicate(insn
->cc
, pdst
);
3202 // - add quadop dance for texturing
3203 // - put FP outputs in GPRs
3204 // - convert instruction sequences
3207 NVC0LoweringPass::visit(Instruction
*i
)
3210 bld
.setPosition(i
, false);
3212 if (i
->cc
!= CC_ALWAYS
)
3221 return handleTEX(i
->asTex());
3223 return handleTXD(i
->asTex());
3225 return handleTXLQ(i
->asTex());
3227 return handleTXQ(i
->asTex());
3229 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, i
->getDef(0), i
->getSrc(0));
3230 i
->setSrc(0, i
->getDef(0));
3233 return handlePOW(i
);
3235 return handleDIV(i
);
3237 return handleMOD(i
);
3239 return handleSQRT(i
);
3241 ret
= handleEXPORT(i
);
3245 return handleOUT(i
);
3247 return handleRDSV(i
);
3249 return handleWRSV(i
);
3256 const bool cctl
= i
->src(0).getFile() == FILE_MEMORY_BUFFER
;
3258 handleCasExch(i
, cctl
);
3267 if (targ
->getChipset() >= NVISA_GM107_CHIPSET
)
3268 handleSurfaceOpGM107(i
->asTex());
3269 else if (targ
->getChipset() >= NVISA_GK104_CHIPSET
)
3270 handleSurfaceOpNVE4(i
->asTex());
3272 handleSurfaceOpNVC0(i
->asTex());
3275 handleSUQ(i
->asTex());
3287 /* Kepler+ has a special opcode to compute a new base address to be used
3288 * for indirect loads.
3290 * Maxwell+ has an additional similar requirement for indirect
3291 * interpolation ops in frag shaders.
3293 bool doAfetch
= false;
3294 if (targ
->getChipset() >= NVISA_GK104_CHIPSET
&&
3296 (i
->op
== OP_VFETCH
|| i
->op
== OP_EXPORT
) &&
3297 i
->src(0).isIndirect(0)) {
3300 if (targ
->getChipset() >= NVISA_GM107_CHIPSET
&&
3301 (i
->op
== OP_LINTERP
|| i
->op
== OP_PINTERP
) &&
3302 i
->src(0).isIndirect(0)) {
3307 Value
*addr
= cloneShallow(func
, i
->getSrc(0));
3308 Instruction
*afetch
= bld
.mkOp1(OP_AFETCH
, TYPE_U32
, bld
.getSSA(),
3310 afetch
->setIndirect(0, 0, i
->getIndirect(0, 0));
3311 addr
->reg
.data
.offset
= 0;
3313 i
->setIndirect(0, 0, afetch
->getDef(0));
3320 TargetNVC0::runLegalizePass(Program
*prog
, CGStage stage
) const
3322 if (stage
== CG_STAGE_PRE_SSA
) {
3323 NVC0LoweringPass
pass(prog
);
3324 return pass
.run(prog
, false, true);
3326 if (stage
== CG_STAGE_POST_RA
) {
3327 NVC0LegalizePostRA
pass(prog
);
3328 return pass
.run(prog
, false, true);
3330 if (stage
== CG_STAGE_SSA
) {
3331 NVC0LegalizeSSA pass
;
3332 return pass
.run(prog
, false, true);
3337 } // namespace nv50_ir