2 * Copyright 2011 Christoph Bumiller
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_build_util.h"
26 #include "codegen/nv50_ir_target_nvc0.h"
27 #include "codegen/nv50_ir_lowering_nvc0.h"
// Pack four 2-bit quad-op selectors (one per lane of a quad) into one byte:
// lane 0 lands in bits 7:6, lane 1 in 5:4, lane 2 in 3:2, lane 3 in 1:0.
// The arguments are pasted onto the QOP_ prefix, so callers pass bare
// mnemonics such as MOV2 / ADD / SUBR.
#define QUADOP(q, r, s, t)                      \
   ((QOP_##q << 6) | (QOP_##r << 4) |           \
    (QOP_##s << 2) | (QOP_##t << 0))
44 NVC0LegalizeSSA::handleDIV(Instruction
*i
)
46 FlowInstruction
*call
;
50 bld
.setPosition(i
, false);
51 def
[0] = bld
.mkMovToReg(0, i
->getSrc(0))->getDef(0);
52 def
[1] = bld
.mkMovToReg(1, i
->getSrc(1))->getDef(0);
54 case TYPE_U32
: builtin
= NVC0_BUILTIN_DIV_U32
; break;
55 case TYPE_S32
: builtin
= NVC0_BUILTIN_DIV_S32
; break;
59 call
= bld
.mkFlow(OP_CALL
, NULL
, CC_ALWAYS
, NULL
);
60 bld
.mkMov(i
->getDef(0), def
[(i
->op
== OP_DIV
) ? 0 : 1]);
61 bld
.mkClobber(FILE_GPR
, (i
->op
== OP_DIV
) ? 0xe : 0xd, 2);
62 bld
.mkClobber(FILE_PREDICATE
, (i
->dType
== TYPE_S32
) ? 0xf : 0x3, 0);
65 call
->absolute
= call
->builtin
= 1;
66 call
->target
.builtin
= builtin
;
67 delete_Instruction(prog
, i
);
71 NVC0LegalizeSSA::handleRCPRSQ(Instruction
*i
)
73 assert(i
->dType
== TYPE_F64
);
74 // There are instructions that will compute the high 32 bits of the 64-bit
75 // float. We will just stick 0 in the bottom 32 bits.
77 bld
.setPosition(i
, false);
79 // 1. Take the source and it up.
80 Value
*src
[2], *dst
[2], *def
= i
->getDef(0);
81 bld
.mkSplit(src
, 4, i
->getSrc(0));
83 // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
84 dst
[0] = bld
.loadImm(NULL
, 0);
85 dst
[1] = bld
.getSSA();
87 // 3. The new version of the instruction takes the high 32 bits of the
88 // source and outputs the high 32 bits of the destination.
92 i
->subOp
= NV50_IR_SUBOP_RCPRSQ_64H
;
94 // 4. Recombine the two dst pieces back into the original destination.
95 bld
.setPosition(i
, true);
96 bld
.mkOp2(OP_MERGE
, TYPE_U64
, def
, dst
[0], dst
[1]);
100 NVC0LegalizeSSA::handleFTZ(Instruction
*i
)
102 // Only want to flush float inputs
103 assert(i
->sType
== TYPE_F32
);
105 // If we're already flushing denorms (and NaN's) to zero, no need for this.
109 // Only certain classes of operations can flush
110 OpClass cls
= prog
->getTarget()->getOpClass(i
->op
);
111 if (cls
!= OPCLASS_ARITH
&& cls
!= OPCLASS_COMPARE
&&
112 cls
!= OPCLASS_CONVERT
)
119 NVC0LegalizeSSA::handleTEXLOD(TexInstruction
*i
)
121 if (i
->tex
.levelZero
)
126 // The LOD argument comes right after the coordinates (before depth bias,
128 int arg
= i
->tex
.target
.getArgCount();
130 // SM30+ stores the indirect handle as a separate arg, which comes before
132 if (prog
->getTarget()->getChipset() >= NVISA_GK104_CHIPSET
&&
133 i
->tex
.rIndirectSrc
>= 0)
135 // SM20 stores indirect handle combined with array coordinate
136 if (prog
->getTarget()->getChipset() < NVISA_GK104_CHIPSET
&&
137 !i
->tex
.target
.isArray() &&
138 i
->tex
.rIndirectSrc
>= 0)
141 if (!i
->src(arg
).getImmediate(lod
) || !lod
.isInteger(0))
146 i
->tex
.levelZero
= true;
147 i
->moveSources(arg
+ 1, -1);
151 NVC0LegalizeSSA::handleShift(Instruction
*lo
)
153 Value
*shift
= lo
->getSrc(1);
154 Value
*dst64
= lo
->getDef(0);
155 Value
*src
[2], *dst
[2];
156 operation op
= lo
->op
;
158 bld
.setPosition(lo
, false);
160 bld
.mkSplit(src
, 4, lo
->getSrc(0));
162 // SM30 and prior don't have the fancy new SHF.L/R ops. So the logic has to
163 // be completely emulated. For SM35+, we can use the more directed SHF
165 if (prog
->getTarget()->getChipset() < NVISA_GK20A_CHIPSET
) {
166 // The strategy here is to handle shifts >= 32 and less than 32 as
170 // If the shift is <= 32, then
171 // (HI,LO) << x = (HI << x | (LO >> (32 - x)), LO << x)
172 // If the shift is > 32, then
173 // (HI,LO) << x = (LO << (x - 32), 0)
176 // If the shift is <= 32, then
177 // (HI,LO) >> x = (HI >> x, (HI << (32 - x)) | LO >> x)
178 // If the shift is > 32, then
179 // (HI,LO) >> x = (0, HI >> (x - 32))
181 // Note that on NVIDIA hardware, a shift > 32 yields a 0 value, which we
182 // can use to our advantage. Also note the structural similarities
183 // between the right/left cases. The main difference is swapping hi/lo
184 // on input and output.
186 Value
*x32_minus_shift
, *pred
, *hi1
, *hi2
;
187 DataType type
= isSignedIntType(lo
->dType
) ? TYPE_S32
: TYPE_U32
;
188 operation antiop
= op
== OP_SHR
? OP_SHL
: OP_SHR
;
190 std::swap(src
[0], src
[1]);
191 bld
.mkOp2(OP_ADD
, TYPE_U32
, (x32_minus_shift
= bld
.getSSA()), shift
, bld
.mkImm(0x20))
192 ->src(0).mod
= Modifier(NV50_IR_MOD_NEG
);
193 bld
.mkCmp(OP_SET
, CC_LE
, TYPE_U8
, (pred
= bld
.getSSA(1, FILE_PREDICATE
)),
194 TYPE_U32
, shift
, bld
.mkImm(32));
195 // Compute HI (shift <= 32)
196 bld
.mkOp2(OP_OR
, TYPE_U32
, (hi1
= bld
.getSSA()),
197 bld
.mkOp2v(op
, TYPE_U32
, bld
.getSSA(), src
[1], shift
),
198 bld
.mkOp2v(antiop
, TYPE_U32
, bld
.getSSA(), src
[0], x32_minus_shift
))
199 ->setPredicate(CC_P
, pred
);
200 // Compute LO (all shift values)
201 bld
.mkOp2(op
, type
, (dst
[0] = bld
.getSSA()), src
[0], shift
);
202 // Compute HI (shift > 32)
203 bld
.mkOp2(op
, type
, (hi2
= bld
.getSSA()), src
[1],
204 bld
.mkOp1v(OP_NEG
, TYPE_S32
, bld
.getSSA(), x32_minus_shift
))
205 ->setPredicate(CC_NOT_P
, pred
);
206 bld
.mkOp2(OP_UNION
, TYPE_U32
, (dst
[1] = bld
.getSSA()), hi1
, hi2
);
208 std::swap(dst
[0], dst
[1]);
209 bld
.mkOp2(OP_MERGE
, TYPE_U64
, dst64
, dst
[0], dst
[1]);
210 delete_Instruction(prog
, lo
);
214 Instruction
*hi
= new_Instruction(func
, op
, TYPE_U32
);
215 lo
->bb
->insertAfter(lo
, hi
);
217 hi
->sType
= lo
->sType
;
218 lo
->dType
= TYPE_U32
;
220 hi
->setDef(0, (dst
[1] = bld
.getSSA()));
221 if (lo
->op
== OP_SHR
)
222 hi
->subOp
|= NV50_IR_SUBOP_SHIFT_HIGH
;
223 lo
->setDef(0, (dst
[0] = bld
.getSSA()));
225 bld
.setPosition(hi
, true);
227 if (lo
->op
== OP_SHL
)
230 hi
->setSrc(0, new_ImmediateValue(prog
, 0u));
231 hi
->setSrc(1, shift
);
232 hi
->setSrc(2, lo
->op
== OP_SHL
? src
[0] : src
[1]);
234 lo
->setSrc(0, src
[0]);
235 lo
->setSrc(1, shift
);
236 lo
->setSrc(2, src
[1]);
238 bld
.mkOp2(OP_MERGE
, TYPE_U64
, dst64
, dst
[0], dst
[1]);
242 NVC0LegalizeSSA::handleSET(CmpInstruction
*cmp
)
244 DataType hTy
= cmp
->sType
== TYPE_S64
? TYPE_S32
: TYPE_U32
;
246 Value
*src0
[2], *src1
[2];
247 bld
.setPosition(cmp
, false);
249 bld
.mkSplit(src0
, 4, cmp
->getSrc(0));
250 bld
.mkSplit(src1
, 4, cmp
->getSrc(1));
251 bld
.mkOp2(OP_SUB
, hTy
, NULL
, src0
[0], src1
[0])
252 ->setFlagsDef(1, (carry
= bld
.getSSA(1, FILE_FLAGS
)));
253 cmp
->setFlagsSrc(cmp
->srcCount(), carry
);
254 cmp
->setSrc(0, src0
[1]);
255 cmp
->setSrc(1, src1
[1]);
260 NVC0LegalizeSSA::visit(Function
*fn
)
262 bld
.setProgram(fn
->getProgram());
267 NVC0LegalizeSSA::visit(BasicBlock
*bb
)
270 for (Instruction
*i
= bb
->getEntry(); i
; i
= next
) {
273 if (i
->sType
== TYPE_F32
&& prog
->getType() != Program::TYPE_COMPUTE
)
279 if (i
->sType
!= TYPE_F32
)
284 if (i
->dType
== TYPE_F64
)
289 handleTEXLOD(i
->asTex());
293 if (typeSizeof(i
->sType
) == 8)
300 if (typeSizeof(i
->sType
) == 8 && i
->sType
!= TYPE_F64
)
301 handleSET(i
->asCmp());
310 NVC0LegalizePostRA::NVC0LegalizePostRA(const Program
*prog
)
314 needTexBar(prog
->getTarget()->getChipset() >= 0xe0 &&
315 prog
->getTarget()->getChipset() < 0x110)
320 NVC0LegalizePostRA::insnDominatedBy(const Instruction
*later
,
321 const Instruction
*early
) const
323 if (early
->bb
== later
->bb
)
324 return early
->serial
< later
->serial
;
325 return later
->bb
->dominatedBy(early
->bb
);
329 NVC0LegalizePostRA::addTexUse(std::list
<TexUse
> &uses
,
330 Instruction
*usei
, const Instruction
*texi
)
333 bool dominated
= insnDominatedBy(usei
, texi
);
334 // Uses before the tex have to all be included. Just because an earlier
335 // instruction dominates another instruction doesn't mean that there's no
336 // way to get from the tex to the later instruction. For example you could
337 // have nested loops, with the tex in the inner loop, and uses before it in
338 // both loops - even though the outer loop's instruction would dominate the
339 // inner's, we still want a texbar before the inner loop's instruction.
341 // However we can still use the eliding logic between uses dominated by the
342 // tex instruction, as that is unambiguously correct.
344 for (std::list
<TexUse
>::iterator it
= uses
.begin(); it
!= uses
.end();) {
346 if (insnDominatedBy(usei
, it
->insn
)) {
350 if (insnDominatedBy(it
->insn
, usei
)) {
359 uses
.push_back(TexUse(usei
, texi
, dominated
));
// While it might be tempting to use an algorithm that just looks at tex
// uses, not all texture results are guaranteed to be used on all paths. In
// the case where along some control flow path a texture result is never used,
// we might reuse that register for something else, creating a
// write-after-write hazard. So we have to manually look through all
// instructions looking for ones that reference the registers in question.
369 NVC0LegalizePostRA::findFirstUses(
370 Instruction
*texi
, std::list
<TexUse
> &uses
)
372 int minGPR
= texi
->def(0).rep()->reg
.data
.id
;
373 int maxGPR
= minGPR
+ texi
->def(0).rep()->reg
.size
/ 4 - 1;
375 unordered_set
<const BasicBlock
*> visited
;
376 findFirstUsesBB(minGPR
, maxGPR
, texi
->next
, texi
, uses
, visited
);
380 NVC0LegalizePostRA::findFirstUsesBB(
381 int minGPR
, int maxGPR
, Instruction
*start
,
382 const Instruction
*texi
, std::list
<TexUse
> &uses
,
383 unordered_set
<const BasicBlock
*> &visited
)
385 const BasicBlock
*bb
= start
->bb
;
387 // We don't process the whole bb the first time around. This is correct,
388 // however we might be in a loop and hit this BB again, and need to process
389 // the full thing. So only mark a bb as visited if we processed it from the
391 if (start
== bb
->getEntry()) {
392 if (visited
.find(bb
) != visited
.end())
397 for (Instruction
*insn
= start
; insn
!= bb
->getExit(); insn
= insn
->next
) {
401 for (int d
= 0; insn
->defExists(d
); ++d
) {
402 const Value
*def
= insn
->def(d
).rep();
403 if (insn
->def(d
).getFile() != FILE_GPR
||
404 def
->reg
.data
.id
+ def
->reg
.size
/ 4 - 1 < minGPR
||
405 def
->reg
.data
.id
> maxGPR
)
407 addTexUse(uses
, insn
, texi
);
411 for (int s
= 0; insn
->srcExists(s
); ++s
) {
412 const Value
*src
= insn
->src(s
).rep();
413 if (insn
->src(s
).getFile() != FILE_GPR
||
414 src
->reg
.data
.id
+ src
->reg
.size
/ 4 - 1 < minGPR
||
415 src
->reg
.data
.id
> maxGPR
)
417 addTexUse(uses
, insn
, texi
);
422 for (Graph::EdgeIterator ei
= bb
->cfg
.outgoing(); !ei
.end(); ei
.next()) {
423 findFirstUsesBB(minGPR
, maxGPR
, BasicBlock::get(ei
.getNode())->getEntry(),
424 texi
, uses
, visited
);
429 // This pass is a bit long and ugly and can probably be optimized.
431 // 1. obtain a list of TEXes and their outputs' first use(s)
432 // 2. calculate the barrier level of each first use (minimal number of TEXes,
433 // over all paths, between the TEX and the use in question)
434 // 3. for each barrier, if all paths from the source TEX to that barrier
435 // contain a barrier of lesser level, it can be culled
437 NVC0LegalizePostRA::insertTextureBarriers(Function
*fn
)
439 std::list
<TexUse
> *uses
;
440 std::vector
<Instruction
*> texes
;
441 std::vector
<int> bbFirstTex
;
442 std::vector
<int> bbFirstUse
;
443 std::vector
<int> texCounts
;
444 std::vector
<TexUse
> useVec
;
447 fn
->orderInstructions(insns
);
449 texCounts
.resize(fn
->allBBlocks
.getSize(), 0);
450 bbFirstTex
.resize(fn
->allBBlocks
.getSize(), insns
.getSize());
451 bbFirstUse
.resize(fn
->allBBlocks
.getSize(), insns
.getSize());
453 // tag BB CFG nodes by their id for later
454 for (ArrayList::Iterator i
= fn
->allBBlocks
.iterator(); !i
.end(); i
.next()) {
455 BasicBlock
*bb
= reinterpret_cast<BasicBlock
*>(i
.get());
457 bb
->cfg
.tag
= bb
->getId();
460 // gather the first uses for each TEX
461 for (int i
= 0; i
< insns
.getSize(); ++i
) {
462 Instruction
*tex
= reinterpret_cast<Instruction
*>(insns
.get(i
));
463 if (isTextureOp(tex
->op
)) {
464 texes
.push_back(tex
);
465 if (!texCounts
.at(tex
->bb
->getId()))
466 bbFirstTex
[tex
->bb
->getId()] = texes
.size() - 1;
467 texCounts
[tex
->bb
->getId()]++;
473 uses
= new std::list
<TexUse
>[texes
.size()];
476 for (size_t i
= 0; i
< texes
.size(); ++i
) {
477 findFirstUses(texes
[i
], uses
[i
]);
480 // determine the barrier level at each use
481 for (size_t i
= 0; i
< texes
.size(); ++i
) {
482 for (std::list
<TexUse
>::iterator u
= uses
[i
].begin(); u
!= uses
[i
].end();
484 BasicBlock
*tb
= texes
[i
]->bb
;
485 BasicBlock
*ub
= u
->insn
->bb
;
488 for (size_t j
= i
+ 1; j
< texes
.size() &&
489 texes
[j
]->bb
== tb
&& texes
[j
]->serial
< u
->insn
->serial
;
493 u
->level
= fn
->cfg
.findLightestPathWeight(&tb
->cfg
,
494 &ub
->cfg
, texCounts
);
496 WARN("Failed to find path TEX -> TEXBAR\n");
500 // this counted all TEXes in the origin block, correct that
501 u
->level
-= i
- bbFirstTex
.at(tb
->getId()) + 1 /* this TEX */;
502 // and did not count the TEXes in the destination block, add those
503 for (size_t j
= bbFirstTex
.at(ub
->getId()); j
< texes
.size() &&
504 texes
[j
]->bb
== ub
&& texes
[j
]->serial
< u
->insn
->serial
;
508 assert(u
->level
>= 0);
509 useVec
.push_back(*u
);
514 // insert the barriers
515 for (size_t i
= 0; i
< useVec
.size(); ++i
) {
516 Instruction
*prev
= useVec
[i
].insn
->prev
;
517 if (useVec
[i
].level
< 0)
519 if (prev
&& prev
->op
== OP_TEXBAR
) {
520 if (prev
->subOp
> useVec
[i
].level
)
521 prev
->subOp
= useVec
[i
].level
;
522 prev
->setSrc(prev
->srcCount(), useVec
[i
].tex
->getDef(0));
524 Instruction
*bar
= new_Instruction(func
, OP_TEXBAR
, TYPE_NONE
);
526 bar
->subOp
= useVec
[i
].level
;
527 // make use explicit to ease latency calculation
528 bar
->setSrc(bar
->srcCount(), useVec
[i
].tex
->getDef(0));
529 useVec
[i
].insn
->bb
->insertBefore(useVec
[i
].insn
, bar
);
533 if (fn
->getProgram()->optLevel
< 3)
536 std::vector
<Limits
> limitT
, limitB
, limitS
; // entry, exit, single
538 limitT
.resize(fn
->allBBlocks
.getSize(), Limits(0, 0));
539 limitB
.resize(fn
->allBBlocks
.getSize(), Limits(0, 0));
540 limitS
.resize(fn
->allBBlocks
.getSize());
542 // cull unneeded barriers (should do that earlier, but for simplicity)
543 IteratorRef bi
= fn
->cfg
.iteratorCFG();
544 // first calculate min/max outstanding TEXes for each BB
545 for (bi
->reset(); !bi
->end(); bi
->next()) {
546 Graph::Node
*n
= reinterpret_cast<Graph::Node
*>(bi
->get());
547 BasicBlock
*bb
= BasicBlock::get(n
);
549 int max
= std::numeric_limits
<int>::max();
550 for (Instruction
*i
= bb
->getFirst(); i
; i
= i
->next
) {
551 if (isTextureOp(i
->op
)) {
553 if (max
< std::numeric_limits
<int>::max())
556 if (i
->op
== OP_TEXBAR
) {
557 min
= MIN2(min
, i
->subOp
);
558 max
= MIN2(max
, i
->subOp
);
561 // limits when looking at an isolated block
562 limitS
[bb
->getId()].min
= min
;
563 limitS
[bb
->getId()].max
= max
;
565 // propagate the min/max values
566 for (unsigned int l
= 0; l
<= fn
->loopNestingBound
; ++l
) {
567 for (bi
->reset(); !bi
->end(); bi
->next()) {
568 Graph::Node
*n
= reinterpret_cast<Graph::Node
*>(bi
->get());
569 BasicBlock
*bb
= BasicBlock::get(n
);
570 const int bbId
= bb
->getId();
571 for (Graph::EdgeIterator ei
= n
->incident(); !ei
.end(); ei
.next()) {
572 BasicBlock
*in
= BasicBlock::get(ei
.getNode());
573 const int inId
= in
->getId();
574 limitT
[bbId
].min
= MAX2(limitT
[bbId
].min
, limitB
[inId
].min
);
575 limitT
[bbId
].max
= MAX2(limitT
[bbId
].max
, limitB
[inId
].max
);
577 // I just hope this is correct ...
578 if (limitS
[bbId
].max
== std::numeric_limits
<int>::max()) {
580 limitB
[bbId
].min
= limitT
[bbId
].min
+ limitS
[bbId
].min
;
581 limitB
[bbId
].max
= limitT
[bbId
].max
+ limitS
[bbId
].min
;
583 // block contained a barrier
584 limitB
[bbId
].min
= MIN2(limitS
[bbId
].max
,
585 limitT
[bbId
].min
+ limitS
[bbId
].min
);
586 limitB
[bbId
].max
= MIN2(limitS
[bbId
].max
,
587 limitT
[bbId
].max
+ limitS
[bbId
].min
);
591 // finally delete unnecessary barriers
592 for (bi
->reset(); !bi
->end(); bi
->next()) {
593 Graph::Node
*n
= reinterpret_cast<Graph::Node
*>(bi
->get());
594 BasicBlock
*bb
= BasicBlock::get(n
);
595 Instruction
*prev
= NULL
;
597 int max
= limitT
[bb
->getId()].max
;
598 for (Instruction
*i
= bb
->getFirst(); i
; i
= next
) {
600 if (i
->op
== OP_TEXBAR
) {
601 if (i
->subOp
>= max
) {
602 delete_Instruction(prog
, i
);
606 if (prev
&& prev
->op
== OP_TEXBAR
&& prev
->subOp
>= max
) {
607 delete_Instruction(prog
, prev
);
612 if (isTextureOp(i
->op
)) {
615 if (i
&& !i
->isNop())
623 NVC0LegalizePostRA::visit(Function
*fn
)
626 insertTextureBarriers(fn
);
628 rZero
= new_LValue(fn
, FILE_GPR
);
629 pOne
= new_LValue(fn
, FILE_PREDICATE
);
630 carry
= new_LValue(fn
, FILE_FLAGS
);
632 rZero
->reg
.data
.id
= (prog
->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET
) ? 255 : 63;
633 carry
->reg
.data
.id
= 0;
634 pOne
->reg
.data
.id
= 7;
640 NVC0LegalizePostRA::replaceZero(Instruction
*i
)
642 for (int s
= 0; i
->srcExists(s
); ++s
) {
643 if (s
== 2 && i
->op
== OP_SUCLAMP
)
645 ImmediateValue
*imm
= i
->getSrc(s
)->asImm();
647 if (i
->op
== OP_SELP
&& s
== 2) {
649 if (imm
->reg
.data
.u64
== 0)
650 i
->src(s
).mod
= i
->src(s
).mod
^ Modifier(NV50_IR_MOD_NOT
);
651 } else if (imm
->reg
.data
.u64
== 0) {
658 // replace CONT with BRA for single unconditional continue
660 NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock
*bb
)
662 if (bb
->cfg
.incidentCount() != 2 || bb
->getEntry()->op
!= OP_PRECONT
)
664 Graph::EdgeIterator ei
= bb
->cfg
.incident();
665 if (ei
.getType() != Graph::Edge::BACK
)
667 if (ei
.getType() != Graph::Edge::BACK
)
669 BasicBlock
*contBB
= BasicBlock::get(ei
.getNode());
671 if (!contBB
->getExit() || contBB
->getExit()->op
!= OP_CONT
||
672 contBB
->getExit()->getPredicate())
674 contBB
->getExit()->op
= OP_BRA
;
675 bb
->remove(bb
->getEntry()); // delete PRECONT
678 assert(ei
.end() || ei
.getType() != Graph::Edge::BACK
);
682 // replace branches to join blocks with join ops
684 NVC0LegalizePostRA::propagateJoin(BasicBlock
*bb
)
686 if (bb
->getEntry()->op
!= OP_JOIN
|| bb
->getEntry()->asFlow()->limit
)
688 for (Graph::EdgeIterator ei
= bb
->cfg
.incident(); !ei
.end(); ei
.next()) {
689 BasicBlock
*in
= BasicBlock::get(ei
.getNode());
690 Instruction
*exit
= in
->getExit();
692 in
->insertTail(new FlowInstruction(func
, OP_JOIN
, bb
));
693 // there should always be a terminator instruction
694 WARN("inserted missing terminator in BB:%i\n", in
->getId());
696 if (exit
->op
== OP_BRA
) {
698 exit
->asFlow()->limit
= 1; // must-not-propagate marker
701 bb
->remove(bb
->getEntry());
705 NVC0LegalizePostRA::visit(BasicBlock
*bb
)
707 Instruction
*i
, *next
;
709 // remove pseudo operations and non-fixed no-ops, split 64 bit operations
710 for (i
= bb
->getFirst(); i
; i
= next
) {
712 if (i
->op
== OP_EMIT
|| i
->op
== OP_RESTART
) {
713 if (!i
->getDef(0)->refCount())
715 if (i
->src(0).getFile() == FILE_IMMEDIATE
)
716 i
->setSrc(0, rZero
); // initial value must be 0
722 if (i
->op
== OP_BAR
&& i
->subOp
== NV50_IR_SUBOP_BAR_SYNC
&&
723 prog
->getType() != Program::TYPE_COMPUTE
) {
724 // It seems like barriers are never required for tessellation since
725 // the warp size is 32, and there are always at most 32 tcs threads.
728 if (i
->op
== OP_LOAD
&& i
->subOp
== NV50_IR_SUBOP_LDC_IS
) {
729 int offset
= i
->src(0).get()->reg
.data
.offset
;
730 if (abs(offset
) > 0x10000)
731 i
->src(0).get()->reg
.fileIndex
+= offset
>> 16;
732 i
->src(0).get()->reg
.data
.offset
= (int)(short)offset
;
734 // TODO: Move this to before register allocation for operations that
735 // need the $c register !
736 if (typeSizeof(i
->sType
) == 8 || typeSizeof(i
->dType
) == 8) {
738 hi
= BuildUtil::split64BitOpPostRA(func
, i
, rZero
, carry
);
743 if (i
->op
!= OP_MOV
&& i
->op
!= OP_PFETCH
)
750 if (!tryReplaceContWithBra(bb
))
756 NVC0LoweringPass::NVC0LoweringPass(Program
*prog
) : targ(prog
->getTarget())
758 bld
.setProgram(prog
);
762 NVC0LoweringPass::visit(Function
*fn
)
764 if (prog
->getType() == Program::TYPE_GEOMETRY
) {
765 assert(!strncmp(fn
->getName(), "MAIN", 4));
766 // TODO: when we generate actual functions pass this value along somehow
767 bld
.setPosition(BasicBlock::get(fn
->cfg
.getRoot()), false);
768 gpEmitAddress
= bld
.loadImm(NULL
, 0)->asLValue();
770 bld
.setPosition(BasicBlock::get(fn
->cfgExit
)->getExit(), false);
771 bld
.mkMovToReg(0, gpEmitAddress
);
778 NVC0LoweringPass::visit(BasicBlock
*bb
)
784 NVC0LoweringPass::loadTexHandle(Value
*ptr
, unsigned int slot
)
786 uint8_t b
= prog
->driver
->io
.auxCBSlot
;
787 uint32_t off
= prog
->driver
->io
.texBindBase
+ slot
* 4;
790 ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(), ptr
, bld
.mkImm(2));
793 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U32
, off
), ptr
);
796 // move array source to first slot, convert to u16, add indirections
798 NVC0LoweringPass::handleTEX(TexInstruction
*i
)
800 const int dim
= i
->tex
.target
.getDim() + i
->tex
.target
.isCube();
801 const int arg
= i
->tex
.target
.getArgCount();
802 const int lyr
= arg
- (i
->tex
.target
.isMS() ? 2 : 1);
803 const int chipset
= prog
->getTarget()->getChipset();
805 /* Only normalize in the non-explicit derivatives case. For explicit
806 * derivatives, this is handled in handleManualTXD.
808 if (i
->tex
.target
.isCube() && i
->dPdx
[0].get() == NULL
) {
811 for (c
= 0; c
< 3; ++c
)
812 src
[c
] = bld
.mkOp1v(OP_ABS
, TYPE_F32
, bld
.getSSA(), i
->getSrc(c
));
813 val
= bld
.getScratch();
814 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[0], src
[1]);
815 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[2], val
);
816 bld
.mkOp1(OP_RCP
, TYPE_F32
, val
, val
);
817 for (c
= 0; c
< 3; ++c
) {
818 i
->setSrc(c
, bld
.mkOp2v(OP_MUL
, TYPE_F32
, bld
.getSSA(),
823 // Arguments to the TEX instruction are a little insane. Even though the
824 // encoding is identical between SM20 and SM30, the arguments mean
825 // different things between Fermi and Kepler+. A lot of arguments are
826 // optional based on flags passed to the instruction. This summarizes the
836 // - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
837 // - other: 4 bits each, single reg
841 // array (+ offsets for txd in upper 16 bits)
846 // offsets (same as fermi, except txd which takes it with array)
863 if (chipset
>= NVISA_GK104_CHIPSET
) {
864 if (i
->tex
.rIndirectSrc
>= 0 || i
->tex
.sIndirectSrc
>= 0) {
865 // XXX this ignores tsc, and assumes a 1:1 mapping
866 assert(i
->tex
.rIndirectSrc
>= 0);
867 Value
*hnd
= loadTexHandle(i
->getIndirectR(), i
->tex
.r
);
870 i
->setIndirectR(hnd
);
871 i
->setIndirectS(NULL
);
872 } else if (i
->tex
.r
== i
->tex
.s
|| i
->op
== OP_TXF
) {
873 if (i
->tex
.r
== 0xffff)
874 i
->tex
.r
= prog
->driver
->io
.fbtexBindBase
/ 4;
876 i
->tex
.r
+= prog
->driver
->io
.texBindBase
/ 4;
877 i
->tex
.s
= 0; // only a single cX[] value possible here
879 Value
*hnd
= bld
.getScratch();
880 Value
*rHnd
= loadTexHandle(NULL
, i
->tex
.r
);
881 Value
*sHnd
= loadTexHandle(NULL
, i
->tex
.s
);
883 bld
.mkOp3(OP_INSBF
, TYPE_U32
, hnd
, rHnd
, bld
.mkImm(0x1400), sHnd
);
885 i
->tex
.r
= 0; // not used for indirect tex
887 i
->setIndirectR(hnd
);
889 if (i
->tex
.target
.isArray()) {
890 LValue
*layer
= new_LValue(func
, FILE_GPR
);
891 Value
*src
= i
->getSrc(lyr
);
892 const int sat
= (i
->op
== OP_TXF
) ? 1 : 0;
893 DataType sTy
= (i
->op
== OP_TXF
) ? TYPE_U32
: TYPE_F32
;
894 bld
.mkCvt(OP_CVT
, TYPE_U16
, layer
, sTy
, src
)->saturate
= sat
;
895 if (i
->op
!= OP_TXD
|| chipset
< NVISA_GM107_CHIPSET
) {
896 for (int s
= dim
; s
>= 1; --s
)
897 i
->setSrc(s
, i
->getSrc(s
- 1));
900 i
->setSrc(dim
, layer
);
903 // Move the indirect reference to the first place
904 if (i
->tex
.rIndirectSrc
>= 0 && (
905 i
->op
== OP_TXD
|| chipset
< NVISA_GM107_CHIPSET
)) {
906 Value
*hnd
= i
->getIndirectR();
908 i
->setIndirectR(NULL
);
909 i
->moveSources(0, 1);
911 i
->tex
.rIndirectSrc
= 0;
912 i
->tex
.sIndirectSrc
= -1;
914 // Move the indirect reference to right after the coords
915 else if (i
->tex
.rIndirectSrc
>= 0 && chipset
>= NVISA_GM107_CHIPSET
) {
916 Value
*hnd
= i
->getIndirectR();
918 i
->setIndirectR(NULL
);
919 i
->moveSources(arg
, 1);
921 i
->tex
.rIndirectSrc
= 0;
922 i
->tex
.sIndirectSrc
= -1;
925 // (nvc0) generate and move the tsc/tic/array source to the front
926 if (i
->tex
.target
.isArray() || i
->tex
.rIndirectSrc
>= 0 || i
->tex
.sIndirectSrc
>= 0) {
927 LValue
*src
= new_LValue(func
, FILE_GPR
); // 0xttxsaaaa
929 Value
*ticRel
= i
->getIndirectR();
930 Value
*tscRel
= i
->getIndirectS();
932 if (i
->tex
.r
== 0xffff) {
938 i
->setSrc(i
->tex
.rIndirectSrc
, NULL
);
940 ticRel
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getScratch(),
941 ticRel
, bld
.mkImm(i
->tex
.r
));
944 i
->setSrc(i
->tex
.sIndirectSrc
, NULL
);
946 tscRel
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getScratch(),
947 tscRel
, bld
.mkImm(i
->tex
.s
));
950 Value
*arrayIndex
= i
->tex
.target
.isArray() ? i
->getSrc(lyr
) : NULL
;
952 for (int s
= dim
; s
>= 1; --s
)
953 i
->setSrc(s
, i
->getSrc(s
- 1));
954 i
->setSrc(0, arrayIndex
);
956 i
->moveSources(0, 1);
960 int sat
= (i
->op
== OP_TXF
) ? 1 : 0;
961 DataType sTy
= (i
->op
== OP_TXF
) ? TYPE_U32
: TYPE_F32
;
962 bld
.mkCvt(OP_CVT
, TYPE_U16
, src
, sTy
, arrayIndex
)->saturate
= sat
;
968 bld
.mkOp3(OP_INSBF
, TYPE_U32
, src
, ticRel
, bld
.mkImm(0x0917), src
);
970 bld
.mkOp3(OP_INSBF
, TYPE_U32
, src
, tscRel
, bld
.mkImm(0x0710), src
);
975 // For nvc0, the sample id has to be in the second operand, as the offset
976 // does. Right now we don't know how to pass both in, and this case can't
977 // happen with OpenGL. On nve0, the sample id is part of the texture
978 // coordinate argument.
979 assert(chipset
>= NVISA_GK104_CHIPSET
||
980 !i
->tex
.useOffsets
|| !i
->tex
.target
.isMS());
982 // offset is between lod and dc
983 if (i
->tex
.useOffsets
) {
985 int s
= i
->srcCount(0xff, true);
986 if (i
->op
!= OP_TXD
|| chipset
< NVISA_GK104_CHIPSET
) {
987 if (i
->tex
.target
.isShadow())
989 if (i
->srcExists(s
)) // move potential predicate out of the way
990 i
->moveSources(s
, 1);
991 if (i
->tex
.useOffsets
== 4 && i
->srcExists(s
+ 1))
992 i
->moveSources(s
+ 1, 1);
994 if (i
->op
== OP_TXG
) {
995 // Either there is 1 offset, which goes into the 2 low bytes of the
996 // first source, or there are 4 offsets, which go into 2 sources (8
997 // values, 1 byte each).
998 Value
*offs
[2] = {NULL
, NULL
};
999 for (n
= 0; n
< i
->tex
.useOffsets
; n
++) {
1000 for (c
= 0; c
< 2; ++c
) {
1001 if ((n
% 2) == 0 && c
== 0)
1002 bld
.mkMov(offs
[n
/ 2] = bld
.getScratch(), i
->offset
[n
][c
].get());
1004 bld
.mkOp3(OP_INSBF
, TYPE_U32
,
1006 i
->offset
[n
][c
].get(),
1007 bld
.mkImm(0x800 | ((n
* 16 + c
* 8) % 32)),
1011 i
->setSrc(s
, offs
[0]);
1013 i
->setSrc(s
+ 1, offs
[1]);
1016 assert(i
->tex
.useOffsets
== 1);
1017 for (c
= 0; c
< 3; ++c
) {
1019 if (!i
->offset
[0][c
].getImmediate(val
))
1020 assert(!"non-immediate offset passed to non-TXG");
1021 imm
|= (val
.reg
.data
.u32
& 0xf) << (c
* 4);
1023 if (i
->op
== OP_TXD
&& chipset
>= NVISA_GK104_CHIPSET
) {
1024 // The offset goes into the upper 16 bits of the array index. So
1025 // create it if it's not already there, and INSBF it if it already
1027 s
= (i
->tex
.rIndirectSrc
>= 0) ? 1 : 0;
1028 if (chipset
>= NVISA_GM107_CHIPSET
)
1030 if (i
->tex
.target
.isArray()) {
1031 bld
.mkOp3(OP_INSBF
, TYPE_U32
, i
->getSrc(s
),
1032 bld
.loadImm(NULL
, imm
), bld
.mkImm(0xc10),
1035 i
->moveSources(s
, 1);
1036 i
->setSrc(s
, bld
.loadImm(NULL
, imm
<< 16));
1039 i
->setSrc(s
, bld
.loadImm(NULL
, imm
));
1044 if (chipset
>= NVISA_GK104_CHIPSET
) {
1046 // If TEX requires more than 4 sources, the 2nd register tuple must be
1047 // aligned to 4, even if it consists of just a single 4-byte register.
1049 // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
1051 int s
= i
->srcCount(0xff, true);
1052 if (s
> 4 && s
< 7) {
1053 if (i
->srcExists(s
)) // move potential predicate out of the way
1054 i
->moveSources(s
, 7 - s
);
1056 i
->setSrc(s
++, bld
.loadImm(NULL
, 0));
1064 NVC0LoweringPass::handleManualTXD(TexInstruction
*i
)
1066 static const uint8_t qOps
[4][2] =
1068 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l0
1069 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l1
1070 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l2
1071 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l3
1076 Value
*zero
= bld
.loadImm(bld
.getSSA(), 0);
1078 const int dim
= i
->tex
.target
.getDim() + i
->tex
.target
.isCube();
1080 // This function is invoked after handleTEX lowering, so we have to expect
1081 // the arguments in the order that the hw wants them. For Fermi, array and
1082 // indirect are both in the leading arg, while for Kepler, array and
1083 // indirect are separate (and both precede the coordinates). Maxwell is
1084 // handled in a separate function.
1086 if (targ
->getChipset() < NVISA_GK104_CHIPSET
)
1087 array
= i
->tex
.target
.isArray() || i
->tex
.rIndirectSrc
>= 0;
1089 array
= i
->tex
.target
.isArray() + (i
->tex
.rIndirectSrc
>= 0);
1091 i
->op
= OP_TEX
; // no need to clone dPdx/dPdy later
1093 for (c
= 0; c
< dim
; ++c
)
1094 crd
[c
] = bld
.getScratch();
1096 bld
.mkOp(OP_QUADON
, TYPE_NONE
, NULL
);
1097 for (l
= 0; l
< 4; ++l
) {
1098 Value
*src
[3], *val
;
1099 // mov coordinates from lane l to all lanes
1100 for (c
= 0; c
< dim
; ++c
)
1101 bld
.mkQuadop(0x00, crd
[c
], l
, i
->getSrc(c
+ array
), zero
);
1102 // add dPdx from lane l to lanes dx
1103 for (c
= 0; c
< dim
; ++c
)
1104 bld
.mkQuadop(qOps
[l
][0], crd
[c
], l
, i
->dPdx
[c
].get(), crd
[c
]);
1105 // add dPdy from lane l to lanes dy
1106 for (c
= 0; c
< dim
; ++c
)
1107 bld
.mkQuadop(qOps
[l
][1], crd
[c
], l
, i
->dPdy
[c
].get(), crd
[c
]);
1108 // normalize cube coordinates
1109 if (i
->tex
.target
.isCube()) {
1110 for (c
= 0; c
< 3; ++c
)
1111 src
[c
] = bld
.mkOp1v(OP_ABS
, TYPE_F32
, bld
.getSSA(), crd
[c
]);
1112 val
= bld
.getScratch();
1113 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[0], src
[1]);
1114 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[2], val
);
1115 bld
.mkOp1(OP_RCP
, TYPE_F32
, val
, val
);
1116 for (c
= 0; c
< 3; ++c
)
1117 src
[c
] = bld
.mkOp2v(OP_MUL
, TYPE_F32
, bld
.getSSA(), crd
[c
], val
);
1119 for (c
= 0; c
< dim
; ++c
)
1123 bld
.insert(tex
= cloneForward(func
, i
));
1124 for (c
= 0; c
< dim
; ++c
)
1125 tex
->setSrc(c
+ array
, src
[c
]);
1127 for (c
= 0; i
->defExists(c
); ++c
) {
1129 def
[c
][l
] = bld
.getSSA();
1130 mov
= bld
.mkMov(def
[c
][l
], tex
->getDef(c
));
1132 mov
->lanes
= 1 << l
;
1135 bld
.mkOp(OP_QUADPOP
, TYPE_NONE
, NULL
);
1137 for (c
= 0; i
->defExists(c
); ++c
) {
1138 Instruction
*u
= bld
.mkOp(OP_UNION
, TYPE_U32
, i
->getDef(c
));
1139 for (l
= 0; l
< 4; ++l
)
1140 u
->setSrc(l
, def
[c
][l
]);
// Lower OP_TXD (texture fetch with explicit derivatives).
// Counts how many argument slots the hw variant needs (Fermi vs Kepler pack
// offsets/indirect handles differently); when the limits are exceeded the op
// has been demoted to OP_TEX and is emulated via handleManualTXD().
// Otherwise dPdx/dPdy are appended as extra sources after the coordinates.
// NOTE(review): this extract skips several original lines (e.g. 1157,
// 1159-1160, 1165-1176) — the bodies of some guards are not visible here.
1148 NVC0LoweringPass::handleTXD(TexInstruction
*txd
)
1150 int dim
= txd
->tex
.target
.getDim() + txd
->tex
.target
.isCube();
1151 unsigned arg
= txd
->tex
.target
.getArgCount();
1152 unsigned expected_args
= arg
;
1153 const int chipset
= prog
->getTarget()->getChipset();
// Account for the extra payload slots each chipset family requires.
1155 if (chipset
>= NVISA_GK104_CHIPSET
) {
1156 if (!txd
->tex
.target
.isArray() && txd
->tex
.useOffsets
)
1158 if (txd
->tex
.rIndirectSrc
>= 0 || txd
->tex
.sIndirectSrc
>= 0)
1161 if (txd
->tex
.useOffsets
)
1163 if (!txd
->tex
.target
.isArray() && (
1164 txd
->tex
.rIndirectSrc
>= 0 || txd
->tex
.sIndirectSrc
>= 0))
// Too many arguments (or a shadow target) for the native TXD encoding.
1168 if (expected_args
> 4 ||
1170 txd
->tex
.target
.isShadow())
1174 while (txd
->srcExists(arg
))
1177 txd
->tex
.derivAll
= true;
// If the op was demoted to OP_TEX above, fall back to manual derivatives.
1178 if (txd
->op
== OP_TEX
)
1179 return handleManualTXD(txd
);
1181 assert(arg
== expected_args
);
// Append dPdx/dPdy pairs after the regular arguments and release the refs.
1182 for (int c
= 0; c
< dim
; ++c
) {
1183 txd
->setSrc(arg
+ c
* 2 + 0, txd
->dPdx
[c
]);
1184 txd
->setSrc(arg
+ c
* 2 + 1, txd
->dPdy
[c
]);
1185 txd
->dPdx
[c
].set(NULL
);
1186 txd
->dPdy
[c
].set(NULL
);
1189 // In this case we have fewer than 4 "real" arguments, which means that
1190 // handleTEX didn't apply any padding. However we have to make sure that
1191 // the second "group" of arguments still gets padded up to 4.
1192 if (chipset
>= NVISA_GK104_CHIPSET
) {
1193 int s
= arg
+ 2 * dim
;
1194 if (s
>= 4 && s
< 7) {
1195 if (txd
->srcExists(s
)) // move potential predicate out of the way
1196 txd
->moveSources(s
, 7 - s
);
1198 txd
->setSrc(s
++, bld
.loadImm(NULL
, 0));
// Lower OP_TXQ (texture size query).
// For bound textures on Kepler+ the slot is offset by the driver's
// texBindBase; for indirect access, Fermi packs handle bits into a register
// while Kepler+ loads a texture handle via loadTexHandle().
1206 NVC0LoweringPass::handleTXQ(TexInstruction
*txq
)
1208 const int chipset
= prog
->getTarget()->getChipset();
1209 if (chipset
>= NVISA_GK104_CHIPSET
&& txq
->tex
.rIndirectSrc
< 0)
1210 txq
->tex
.r
+= prog
->driver
->io
.texBindBase
/ 4;
// Bound (non-indirect) queries need no further rewriting.
1212 if (txq
->tex
.rIndirectSrc
< 0)
1215 Value
*ticRel
= txq
->getIndirectR();
// TXQ ignores the sampler; drop any sampler indirection.
1217 txq
->setIndirectS(NULL
);
1218 txq
->tex
.sIndirectSrc
= -1;
1222 if (chipset
< NVISA_GK104_CHIPSET
) {
// Fermi: build the packed descriptor word in a GPR.
1223 LValue
*src
= new_LValue(func
, FILE_GPR
); // 0xttxsaaaa
1225 txq
->setSrc(txq
->tex
.rIndirectSrc
, NULL
);
1227 ticRel
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getScratch(),
1228 ticRel
, bld
.mkImm(txq
->tex
.r
));
// Shift the TIC index into the tt field (bit 23).
1230 bld
.mkOp2(OP_SHL
, TYPE_U32
, src
, ticRel
, bld
.mkImm(0x17));
1232 txq
->moveSources(0, 1);
1233 txq
->setSrc(0, src
);
// Kepler+: load a bindless-style handle and pass it as source 0.
1235 Value
*hnd
= loadTexHandle(txq
->getIndirectR(), txq
->tex
.r
);
1239 txq
->setIndirectR(NULL
);
1240 txq
->moveSources(0, 1);
1241 txq
->setSrc(0, hnd
);
1242 txq
->tex
.rIndirectSrc
= 0;
// Lower OP_TXLQ (LOD query): adjust the result mask for the inverted output
// order, then convert the raw s16/u16 results to f32 scaled by 1/256, and
// swap the two defs when both components are requested.
1249 NVC0LoweringPass::handleTXLQ(TexInstruction
*i
)
1251 /* The outputs are inverted compared to what the TGSI instruction
1252 * expects. Take that into account in the mask.
1254 assert((i
->tex
.mask
& ~3) == 0);
1255 if (i
->tex
.mask
== 1)
1257 else if (i
->tex
.mask
== 2)
1260 bld
.setPosition(i
, true);
1262 /* The returned values are not quite what we want:
1263 * (a) convert from s16/u16 to f32
1264 * (b) multiply by 1/256
1266 for (int def
= 0; def
< 2; ++def
) {
1267 if (!i
->defExists(def
))
// Component 0 is signed; later components (or mask==2) are unsigned
// — the line switching `type` to TYPE_U16 is not visible in this extract.
1269 enum DataType type
= TYPE_S16
;
1270 if (i
->tex
.mask
== 2 || def
> 0)
1272 bld
.mkCvt(OP_CVT
, TYPE_F32
, i
->getDef(def
), type
, i
->getDef(def
));
1273 bld
.mkOp2(OP_MUL
, TYPE_F32
, i
->getDef(def
),
1274 i
->getDef(def
), bld
.loadImm(NULL
, 1.0f
/ 256));
// Both components requested: swap def 0 and def 1 via a temporary.
1276 if (i
->tex
.mask
== 3) {
1277 LValue
*t
= new_LValue(func
, FILE_GPR
);
1278 bld
.mkMov(t
, i
->getDef(0));
1279 bld
.mkMov(i
->getDef(0), i
->getDef(1));
1280 bld
.mkMov(i
->getDef(1), t
);
// Lower OP_BUFQ (buffer size query): replace source 0 with the 32-bit
// buffer length loaded from the driver's buffer-info table, then clear
// both indirect sources.
1286 NVC0LoweringPass::handleBUFQ(Instruction
*bufq
)
1289 bufq
->setSrc(0, loadBufLength32(bufq
->getIndirect(0, 1),
1290 bufq
->getSrc(0)->reg
.fileIndex
* 16));
1291 bufq
->setIndirect(0, 0, NULL
);
1292 bufq
->setIndirect(0, 1, NULL
);
// Kepler (NVE4): emulate a shared-memory atomic with a load-locked /
// store-unlocked retry loop built out of new basic blocks:
//   currBB -> tryLockBB -> {setAndUnlockBB | failLockBB} -> joinBB
// The locked load either grants the lock (CC_P on ld def 1, branch to
// setAndUnlockBB) or fails (fall through to failLockBB, which loops back).
1297 NVC0LoweringPass::handleSharedATOMNVE4(Instruction
*atom
)
1299 assert(atom
->src(0).getFile() == FILE_MEMORY_SHARED
);
1301 BasicBlock
*currBB
= atom
->bb
;
1302 BasicBlock
*tryLockBB
= atom
->bb
->splitBefore(atom
, false);
1303 BasicBlock
*joinBB
= atom
->bb
->splitAfter(atom
);
1304 BasicBlock
*setAndUnlockBB
= new BasicBlock(func
);
1305 BasicBlock
*failLockBB
= new BasicBlock(func
);
1307 bld
.setPosition(currBB
, true);
1308 assert(!currBB
->joinAt
);
1309 currBB
->joinAt
= bld
.mkFlow(OP_JOINAT
, joinBB
, CC_ALWAYS
, NULL
);
// "store succeeded" predicate, initialized to false (0 == 1).
1311 CmpInstruction
*pred
=
1312 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
1313 TYPE_U32
, bld
.mkImm(0), bld
.mkImm(1));
1315 bld
.mkFlow(OP_BRA
, tryLockBB
, CC_ALWAYS
, NULL
);
1316 currBB
->cfg
.attach(&tryLockBB
->cfg
, Graph::Edge::TREE
);
1318 bld
.setPosition(tryLockBB
, true);
// Locked load: def 0 is the old value, def 1 the lock-acquired predicate.
1321 bld
.mkLoad(TYPE_U32
, atom
->getDef(0), atom
->getSrc(0)->asSym(),
1322 atom
->getIndirect(0, 0));
1323 ld
->setDef(1, bld
.getSSA(1, FILE_PREDICATE
));
1324 ld
->subOp
= NV50_IR_SUBOP_LOAD_LOCKED
;
1326 bld
.mkFlow(OP_BRA
, setAndUnlockBB
, CC_P
, ld
->getDef(1));
1327 bld
.mkFlow(OP_BRA
, failLockBB
, CC_ALWAYS
, NULL
);
1328 tryLockBB
->cfg
.attach(&failLockBB
->cfg
, Graph::Edge::CROSS
);
1329 tryLockBB
->cfg
.attach(&setAndUnlockBB
->cfg
, Graph::Edge::TREE
);
1331 tryLockBB
->cfg
.detach(&joinBB
->cfg
);
1334 bld
.setPosition(setAndUnlockBB
, true);
// Compute the value to store back, depending on the atomic sub-op.
1336 if (atom
->subOp
== NV50_IR_SUBOP_ATOM_EXCH
) {
1337 // Read the old value, and write the new one.
1338 stVal
= atom
->getSrc(1);
1339 } else if (atom
->subOp
== NV50_IR_SUBOP_ATOM_CAS
) {
// CAS: store src(2) when old == src(1), otherwise store the old value back.
1340 CmpInstruction
*set
=
1341 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(),
1342 TYPE_U32
, ld
->getDef(0), atom
->getSrc(1));
1344 bld
.mkCmp(OP_SLCT
, CC_NE
, TYPE_U32
, (stVal
= bld
.getSSA()),
1345 TYPE_U32
, atom
->getSrc(2), ld
->getDef(0), set
->getDef(0));
// Arithmetic/logical sub-ops: map to a plain two-operand op.
// NOTE(review): the per-case `op = ...; break;` lines are not visible
// in this extract (original numbering skips them).
1349 switch (atom
->subOp
) {
1350 case NV50_IR_SUBOP_ATOM_ADD
:
1353 case NV50_IR_SUBOP_ATOM_AND
:
1356 case NV50_IR_SUBOP_ATOM_OR
:
1359 case NV50_IR_SUBOP_ATOM_XOR
:
1362 case NV50_IR_SUBOP_ATOM_MIN
:
1365 case NV50_IR_SUBOP_ATOM_MAX
:
1373 stVal
= bld
.mkOp2v(op
, atom
->dType
, bld
.getSSA(), ld
->getDef(0),
// Unlocked store; def 0 receives the "store succeeded" predicate.
1378 bld
.mkStore(OP_STORE
, TYPE_U32
, atom
->getSrc(0)->asSym(),
1379 atom
->getIndirect(0, 0), stVal
);
1380 st
->setDef(0, pred
->getDef(0));
1381 st
->subOp
= NV50_IR_SUBOP_STORE_UNLOCKED
;
1383 bld
.mkFlow(OP_BRA
, failLockBB
, CC_ALWAYS
, NULL
);
1384 setAndUnlockBB
->cfg
.attach(&failLockBB
->cfg
, Graph::Edge::TREE
);
1386 // Retry the lock+store sequence until the unlocked store succeeds.
1387 bld
.setPosition(failLockBB
, true);
1388 bld
.mkFlow(OP_BRA
, tryLockBB
, CC_NOT_P
, pred
->getDef(0));
1389 bld
.mkFlow(OP_BRA
, joinBB
, CC_ALWAYS
, NULL
);
1390 failLockBB
->cfg
.attach(&tryLockBB
->cfg
, Graph::Edge::BACK
);
1391 failLockBB
->cfg
.attach(&joinBB
->cfg
, Graph::Edge::TREE
);
1393 bld
.setPosition(joinBB
, false);
1394 bld
.mkFlow(OP_JOIN
, NULL
, CC_ALWAYS
, NULL
)->fixed
= 1;
// Fermi: emulate a shared-memory atomic with a single self-looping block
// (tryLockAndSetBB) that does a locked load, computes the new value
// predicated on the lock, does an unlocked store, and loops back until the
// lock was acquired.
1398 NVC0LoweringPass::handleSharedATOM(Instruction
*atom
)
1400 assert(atom
->src(0).getFile() == FILE_MEMORY_SHARED
);
1402 BasicBlock
*currBB
= atom
->bb
;
1403 BasicBlock
*tryLockAndSetBB
= atom
->bb
->splitBefore(atom
, false);
1404 BasicBlock
*joinBB
= atom
->bb
->splitAfter(atom
);
1406 bld
.setPosition(currBB
, true);
1407 assert(!currBB
->joinAt
);
1408 currBB
->joinAt
= bld
.mkFlow(OP_JOINAT
, joinBB
, CC_ALWAYS
, NULL
);
1410 bld
.mkFlow(OP_BRA
, tryLockAndSetBB
, CC_ALWAYS
, NULL
);
1411 currBB
->cfg
.attach(&tryLockAndSetBB
->cfg
, Graph::Edge::TREE
);
1413 bld
.setPosition(tryLockAndSetBB
, true);
// Locked load: def 0 = old value, def 1 = lock-acquired predicate.
1416 bld
.mkLoad(TYPE_U32
, atom
->getDef(0), atom
->getSrc(0)->asSym(),
1417 atom
->getIndirect(0, 0));
1418 ld
->setDef(1, bld
.getSSA(1, FILE_PREDICATE
));
1419 ld
->subOp
= NV50_IR_SUBOP_LOAD_LOCKED
;
// Compute the value to store, predicated on having the lock.
1422 if (atom
->subOp
== NV50_IR_SUBOP_ATOM_EXCH
) {
1423 // Read the old value, and write the new one.
1424 stVal
= atom
->getSrc(1);
1425 } else if (atom
->subOp
== NV50_IR_SUBOP_ATOM_CAS
) {
1426 CmpInstruction
*set
=
1427 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
1428 TYPE_U32
, ld
->getDef(0), atom
->getSrc(1));
1429 set
->setPredicate(CC_P
, ld
->getDef(1));
// SELP picks old value vs src(2); src 2 is negated so the select matches
// the CAS condition.
1432 bld
.mkOp3(OP_SELP
, TYPE_U32
, bld
.getSSA(), ld
->getDef(0),
1433 atom
->getSrc(2), set
->getDef(0));
1434 selp
->src(2).mod
= Modifier(NV50_IR_MOD_NOT
);
1435 selp
->setPredicate(CC_P
, ld
->getDef(1));
1437 stVal
= selp
->getDef(0);
// Arithmetic/logical sub-ops; the per-case `op = ...; break;` lines are
// not visible in this extract (original numbering skips them).
1441 switch (atom
->subOp
) {
1442 case NV50_IR_SUBOP_ATOM_ADD
:
1445 case NV50_IR_SUBOP_ATOM_AND
:
1448 case NV50_IR_SUBOP_ATOM_OR
:
1451 case NV50_IR_SUBOP_ATOM_XOR
:
1454 case NV50_IR_SUBOP_ATOM_MIN
:
1457 case NV50_IR_SUBOP_ATOM_MAX
:
1466 bld
.mkOp2(op
, atom
->dType
, bld
.getSSA(), ld
->getDef(0),
1468 i
->setPredicate(CC_P
, ld
->getDef(1));
1470 stVal
= i
->getDef(0);
// Unlocked store, only performed when the lock was held.
1474 bld
.mkStore(OP_STORE
, TYPE_U32
, atom
->getSrc(0)->asSym(),
1475 atom
->getIndirect(0, 0), stVal
);
1476 st
->setPredicate(CC_P
, ld
->getDef(1));
1477 st
->subOp
= NV50_IR_SUBOP_STORE_UNLOCKED
;
1479 // Loop until the lock is acquired.
1480 bld
.mkFlow(OP_BRA
, tryLockAndSetBB
, CC_NOT_P
, ld
->getDef(1));
1481 tryLockAndSetBB
->cfg
.attach(&tryLockAndSetBB
->cfg
, Graph::Edge::BACK
);
1482 tryLockAndSetBB
->cfg
.attach(&joinBB
->cfg
, Graph::Edge::CROSS
);
1483 bld
.mkFlow(OP_BRA
, joinBB
, CC_ALWAYS
, NULL
);
1487 bld
.setPosition(joinBB
, false);
1488 bld
.mkFlow(OP_JOIN
, NULL
, CC_ALWAYS
, NULL
)->fixed
= 1;
// Dispatch an OP_ATOM by the memory file of source 0:
// - shared memory: chipset-specific lock-loop emulation (Fermi/Kepler),
//   native ATOMS on Maxwell+;
// - buffer memory: rebase onto a 64-bit global address from the buffer-info
//   table and guard against out-of-bounds accesses with a predicate.
// NOTE(review): several lines are missing in this extract (e.g. the
// FILE_MEMORY_LOCAL case body and the branch structure around line 1540).
1492 NVC0LoweringPass::handleATOM(Instruction
*atom
)
1495 Value
*ptr
= atom
->getIndirect(0, 0), *ind
= atom
->getIndirect(0, 1), *base
;
1497 switch (atom
->src(0).getFile()) {
1498 case FILE_MEMORY_LOCAL
:
1501 case FILE_MEMORY_SHARED
:
1502 // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
1503 // operations on shared memory. For Maxwell, ATOMS is enough.
1504 if (targ
->getChipset() < NVISA_GK104_CHIPSET
)
1505 handleSharedATOM(atom
);
1506 else if (targ
->getChipset() < NVISA_GM107_CHIPSET
)
1507 handleSharedATOMNVE4(atom
);
// Buffer atomics: compute a 64-bit global base address.
1510 assert(atom
->src(0).getFile() == FILE_MEMORY_BUFFER
);
1511 base
= loadBufInfo64(ind
, atom
->getSrc(0)->reg
.fileIndex
* 16);
1512 assert(base
->reg
.size
== 8);
1514 base
= bld
.mkOp2v(OP_ADD
, TYPE_U64
, base
, base
, ptr
);
1515 assert(base
->reg
.size
== 8);
1516 atom
->setIndirect(0, 0, base
);
1517 atom
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
1519 // Harden against out-of-bounds accesses
1520 Value
*offset
= bld
.loadImm(NULL
, atom
->getSrc(0)->reg
.data
.offset
+ typeSizeof(atom
->sType
));
1521 Value
*length
= loadBufLength32(ind
, atom
->getSrc(0)->reg
.fileIndex
* 16);
1522 Value
*pred
= new_LValue(func
, FILE_PREDICATE
);
1524 bld
.mkOp2(OP_ADD
, TYPE_U32
, offset
, offset
, ptr
);
// pred = (end of access > buffer length): skip the atomic when true.
1525 bld
.mkCmp(OP_SET
, CC_GT
, TYPE_U32
, pred
, TYPE_U32
, offset
, length
);
1526 atom
->setPredicate(CC_NOT_P
, pred
);
1527 if (atom
->defExists(0)) {
// Out-of-bounds result is forced to 0 via a predicated mov + UNION.
1528 Value
*zero
, *dst
= atom
->getDef(0);
1529 atom
->setDef(0, bld
.getSSA());
1531 bld
.setPosition(atom
, true);
1532 bld
.mkMov((zero
= bld
.getSSA()), bld
.mkImm(0))
1533 ->setPredicate(CC_P
, pred
);
1534 bld
.mkOp2(OP_UNION
, TYPE_U32
, dst
, atom
->getDef(0), zero
);
1540 bld
.mkOp1v(OP_RDSV
, TYPE_U32
, bld
.getScratch(), bld
.mkSysVal(sv
, 0));
1542 atom
->setSrc(0, cloneShallow(func
, atom
->getSrc(0)));
1543 atom
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
1545 base
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, base
, base
, ptr
);
1546 atom
->setIndirect(0, 1, NULL
);
1547 atom
->setIndirect(0, 0, base
);
// Post-process ATOM CAS/EXCH: optionally invalidate caches with an OP_CCTL
// (needed before the atomic on pre-Maxwell, inheriting the atomic's
// predicate), and for CAS pack the compare+swap operands into one 64-bit
// register pair as the hw expects.
1553 NVC0LoweringPass::handleCasExch(Instruction
*cas
, bool needCctl
)
1555 if (targ
->getChipset() < NVISA_GM107_CHIPSET
) {
1556 if (cas
->src(0).getFile() == FILE_MEMORY_SHARED
) {
1557 // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
1562 if (cas
->subOp
!= NV50_IR_SUBOP_ATOM_CAS
&&
1563 cas
->subOp
!= NV50_IR_SUBOP_ATOM_EXCH
)
1565 bld
.setPosition(cas
, true);
// Cache-control invalidate on the same address as the atomic.
1568 Instruction
*cctl
= bld
.mkOp1(OP_CCTL
, TYPE_NONE
, NULL
, cas
->getSrc(0));
1569 cctl
->setIndirect(0, 0, cas
->getIndirect(0, 0));
1571 cctl
->subOp
= NV50_IR_SUBOP_CCTL_IV
;
1572 if (cas
->isPredicated())
1573 cctl
->setPredicate(cas
->cc
, cas
->getPredicate());
1576 if (cas
->subOp
== NV50_IR_SUBOP_ATOM_CAS
) {
1577 // CAS is crazy. Its 2nd source is a double reg, and the 3rd source
1578 // should be set to the high part of the double reg or bad things will
1579 // happen elsewhere in the universe.
1580 // Also, it sometimes returns the new value instead of the old one
1581 // under mysterious circumstances.
1582 Value
*dreg
= bld
.getSSA(8);
1583 bld
.setPosition(cas
, false);
1584 bld
.mkOp2(OP_MERGE
, TYPE_U64
, dreg
, cas
->getSrc(1), cas
->getSrc(2));
1585 cas
->setSrc(1, dreg
);
1586 cas
->setSrc(2, dreg
);
// Load a 32-bit word from the driver's auxiliary constant buffer at
// offset `off` (plus `base`; the offset-combining line is not visible in
// this extract), optionally indexed by `ptr`.
1593 NVC0LoweringPass::loadResInfo32(Value
*ptr
, uint32_t off
, uint16_t base
)
1595 uint8_t b
= prog
->driver
->io
.auxCBSlot
;
1599 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U32
, off
), ptr
);
// Load a 64-bit word from the auxiliary constant buffer; an indirect index
// is scaled by 16 (SHL 4) to address 16-byte info records.
1603 NVC0LoweringPass::loadResInfo64(Value
*ptr
, uint32_t off
, uint16_t base
)
1605 uint8_t b
= prog
->driver
->io
.auxCBSlot
;
1609 ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getScratch(), ptr
, bld
.mkImm(4));
1612 mkLoadv(TYPE_U64
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U64
, off
), ptr
);
// Load the 32-bit length field of a 16-byte resource record: same indexing
// as loadResInfo64, but reading the u32 at record offset +8.
1616 NVC0LoweringPass::loadResLength32(Value
*ptr
, uint32_t off
, uint16_t base
)
1618 uint8_t b
= prog
->driver
->io
.auxCBSlot
;
1622 ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getScratch(), ptr
, bld
.mkImm(4));
1625 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U64
, off
+ 8), ptr
);
// 64-bit buffer info (base address) from the bufInfo table.
1629 NVC0LoweringPass::loadBufInfo64(Value
*ptr
, uint32_t off
)
1631 return loadResInfo64(ptr
, off
, prog
->driver
->io
.bufInfoBase
);
// 32-bit buffer length from the bufInfo table.
1635 NVC0LoweringPass::loadBufLength32(Value
*ptr
, uint32_t off
)
1637 return loadResLength32(ptr
, off
, prog
->driver
->io
.bufInfoBase
);
// 64-bit UBO info (base address) from the uboInfo table.
1641 NVC0LoweringPass::loadUboInfo64(Value
*ptr
, uint32_t off
)
1643 return loadResInfo64(ptr
, off
, prog
->driver
->io
.uboInfoBase
);
// 32-bit UBO length from the uboInfo table.
1647 NVC0LoweringPass::loadUboLength32(Value
*ptr
, uint32_t off
)
1649 return loadResLength32(ptr
, off
, prog
->driver
->io
.uboInfoBase
);
// Load a 32-bit word from the multisample-info constant buffer
// (msInfoCBSlot), rebased by the driver's msInfoBase offset.
1653 NVC0LoweringPass::loadMsInfo32(Value
*ptr
, uint32_t off
)
1655 uint8_t b
= prog
->driver
->io
.msInfoCBSlot
;
1656 off
+= prog
->driver
->io
.msInfoBase
;
1658 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U32
, off
), ptr
);
1661 /* On nvc0, surface info is obtained via the surface binding points passed
1662 * to the SULD/SUST instructions.
1663 * On nve4, surface info is stored in c[] and is used by various special
1664 * instructions, e.g. for clamping coordinates or generating an address.
1665 * They couldn't just have added an equivalent to TIC now, couldn't they ?
1667 #define NVC0_SU_INFO_ADDR 0x00
1668 #define NVC0_SU_INFO_FMT 0x04
1669 #define NVC0_SU_INFO_DIM_X 0x08
1670 #define NVC0_SU_INFO_PITCH 0x0c
1671 #define NVC0_SU_INFO_DIM_Y 0x10
1672 #define NVC0_SU_INFO_ARRAY 0x14
1673 #define NVC0_SU_INFO_DIM_Z 0x18
1674 #define NVC0_SU_INFO_UNK1C 0x1c
1675 #define NVC0_SU_INFO_WIDTH 0x20
1676 #define NVC0_SU_INFO_HEIGHT 0x24
1677 #define NVC0_SU_INFO_DEPTH 0x28
1678 #define NVC0_SU_INFO_TARGET 0x2c
1679 #define NVC0_SU_INFO_BSIZE 0x30
1680 #define NVC0_SU_INFO_RAW_X 0x34
1681 #define NVC0_SU_INFO_MS_X 0x38
1682 #define NVC0_SU_INFO_MS_Y 0x3c
1684 #define NVC0_SU_INFO__STRIDE 0x40
1686 #define NVC0_SU_INFO_DIM(i) (0x08 + (i) * 8)
1687 #define NVC0_SU_INFO_SIZE(i) (0x20 + (i) * 4)
1688 #define NVC0_SU_INFO_MS(i) (0x38 + (i) * 4)
// Load a 32-bit surface-info word for surface `slot` at field offset `off`.
// With an indirect index, the slot is added, masked to 3 bits (max 8
// surfaces) and scaled by the 0x40-byte info stride (SHL 6).
1691 NVC0LoweringPass::loadSuInfo32(Value
*ptr
, int slot
, uint32_t off
)
1693 uint32_t base
= slot
* NVC0_SU_INFO__STRIDE
;
1696 ptr
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getSSA(), ptr
, bld
.mkImm(slot
));
1697 ptr
= bld
.mkOp2v(OP_AND
, TYPE_U32
, bld
.getSSA(), ptr
, bld
.mkImm(7));
1698 ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(), ptr
, bld
.mkImm(6));
1703 return loadResInfo32(ptr
, off
, prog
->driver
->io
.suInfoBase
);
// Pick the SUCLAMP sub-op variant for coordinate `c` of a surface access,
// based on the surface target (1D arrays use PL for the layer coordinate).
1706 static inline uint16_t getSuClampSubOp(const TexInstruction
*su
, int c
)
1708 switch (su
->tex
.target
.getEnum()) {
1709 case TEX_TARGET_BUFFER
: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
1710 case TEX_TARGET_RECT
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1711 case TEX_TARGET_1D
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1712 case TEX_TARGET_1D_ARRAY
: return (c
== 1) ?
1713 NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
1714 NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1715 case TEX_TARGET_2D
: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1716 case TEX_TARGET_2D_MS
: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1717 case TEX_TARGET_2D_ARRAY
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1718 case TEX_TARGET_2D_MS_ARRAY
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1719 case TEX_TARGET_3D
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1720 case TEX_TARGET_CUBE
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1721 case TEX_TARGET_CUBE_ARRAY
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
// Lower OP_SUQ (surface size query): for each requested component, read the
// size from the surface info table (1D arrays take their layer count from
// the Z size slot; cube depth is divided by 6 faces), then emit the sample
// count (2^(ms_x+ms_y)) for MS targets, or 1 otherwise.
1729 NVC0LoweringPass::handleSUQ(TexInstruction
*suq
)
1731 int mask
= suq
->tex
.mask
;
1732 int dim
= suq
->tex
.target
.getDim();
1733 int arg
= dim
+ (suq
->tex
.target
.isArray() || suq
->tex
.target
.isCube());
1734 Value
*ind
= suq
->getIndirectR();
1735 int slot
= suq
->tex
.r
;
1738 for (c
= 0, d
= 0; c
< 3; ++c
, mask
>>= 1) {
1739 if (c
>= arg
|| !(mask
& 1))
// 1D array: the layer count lives in the Z size slot.
1744 if (c
== 1 && suq
->tex
.target
== TEX_TARGET_1D_ARRAY
) {
1745 offset
= NVC0_SU_INFO_SIZE(2);
1747 offset
= NVC0_SU_INFO_SIZE(c
);
1749 bld
.mkMov(suq
->getDef(d
++), loadSuInfo32(ind
, slot
, offset
));
// Cube: the stored depth counts faces, report layers (depth / 6).
1750 if (c
== 2 && suq
->tex
.target
.isCube())
1751 bld
.mkOp2(OP_DIV
, TYPE_U32
, suq
->getDef(d
- 1), suq
->getDef(d
- 1),
1752 bld
.loadImm(NULL
, 6));
1756 if (suq
->tex
.target
.isMS()) {
// Sample count = 1 << (log2 samples in x + log2 samples in y).
1757 Value
*ms_x
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_MS(0));
1758 Value
*ms_y
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_MS(1));
1759 Value
*ms
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getScratch(), ms_x
, ms_y
);
1760 bld
.mkOp2(OP_SHL
, TYPE_U32
, suq
->getDef(d
++), bld
.loadImm(NULL
, 1), ms
);
1762 bld
.mkMov(suq
->getDef(d
++), bld
.loadImm(NULL
, 1));
// Rewrite multisampled surface coordinates into the underlying non-MS
// layout: scale x/y by the per-surface sample grid (SHL by ms_x/ms_y),
// then add the per-sample offsets looked up in the MS info table, and drop
// the sample-index argument.
1771 NVC0LoweringPass::adjustCoordinatesMS(TexInstruction
*tex
)
1773 const int arg
= tex
->tex
.target
.getArgCount();
1774 int slot
= tex
->tex
.r
;
// Retarget MS targets to their non-MS equivalents.
1776 if (tex
->tex
.target
== TEX_TARGET_2D_MS
)
1777 tex
->tex
.target
= TEX_TARGET_2D
;
1779 if (tex
->tex
.target
== TEX_TARGET_2D_MS_ARRAY
)
1780 tex
->tex
.target
= TEX_TARGET_2D_ARRAY
;
1784 Value
*x
= tex
->getSrc(0);
1785 Value
*y
= tex
->getSrc(1);
1786 Value
*s
= tex
->getSrc(arg
- 1);
1788 Value
*tx
= bld
.getSSA(), *ty
= bld
.getSSA(), *ts
= bld
.getSSA();
1789 Value
*ind
= tex
->getIndirectR();
1791 Value
*ms_x
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_MS(0));
1792 Value
*ms_y
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_MS(1));
1794 bld
.mkOp2(OP_SHL
, TYPE_U32
, tx
, x
, ms_x
);
1795 bld
.mkOp2(OP_SHL
, TYPE_U32
, ty
, y
, ms_y
);
// Sample index: mask to 3 bits, scale by 8 bytes to index (dx,dy) pairs.
1797 s
= bld
.mkOp2v(OP_AND
, TYPE_U32
, ts
, s
, bld
.loadImm(NULL
, 0x7));
1798 s
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, ts
, ts
, bld
.mkImm(3));
1800 Value
*dx
= loadMsInfo32(ts
, 0x0);
1801 Value
*dy
= loadMsInfo32(ts
, 0x4);
1803 bld
.mkOp2(OP_ADD
, TYPE_U32
, tx
, tx
, dx
);
1804 bld
.mkOp2(OP_ADD
, TYPE_U32
, ty
, ty
, dy
);
// Drop the now-consumed sample-index source.
1808 tex
->moveSources(arg
, -1);
1811 // Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
1812 // They're computed from the coordinates using the surface info in c[] space.
// NVE4 (Kepler) surface lowering: clamp the coordinates with SUCLAMP,
// compute a byte/pixel offset (SUBFM/MADSP), build the 64-bit generic
// address (SUEAU + PERMT), and install address / format / out-of-bounds
// predicate as the instruction's new sources.
// NOTE(review): this extract skips original lines in several places (local
// declarations around 1826-1831, branch lines near 1843/1893/1900-1911,
// 1929-1934, 1947-1954, 1976); some control flow is not visible here.
1814 NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction
*su
)
1817 const bool atom
= su
->op
== OP_SUREDB
|| su
->op
== OP_SUREDP
;
1819 su
->op
== OP_SULDB
|| su
->op
== OP_SUSTB
|| su
->op
== OP_SUREDB
;
1820 const int slot
= su
->tex
.r
;
1821 const int dim
= su
->tex
.target
.getDim();
1822 const int arg
= dim
+ (su
->tex
.target
.isArray() || su
->tex
.target
.isCube());
1824 Value
*zero
= bld
.mkImm(0);
1828 Value
*bf
, *eau
, *off
;
1830 Value
*ind
= su
->getIndirectR();
1832 off
= bld
.getScratch(4);
1833 bf
= bld
.getScratch(4);
1834 addr
= bld
.getSSA(8);
1835 pred
= bld
.getScratch(1, FILE_PREDICATE
);
1837 bld
.setPosition(su
, false);
1839 adjustCoordinatesMS(su
);
1841 // calculate clamped coordinates
1842 for (c
= 0; c
< arg
; ++c
) {
1845 if (c
== 1 && su
->tex
.target
== TEX_TARGET_1D_ARRAY
) {
1846 // The array index is stored in the Z component for 1D arrays.
1850 src
[c
] = bld
.getScratch();
1852 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_RAW_X
);
1854 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_DIM(dimc
));
1855 bld
.mkOp3(OP_SUCLAMP
, TYPE_S32
, src
[c
], su
->getSrc(c
), v
, zero
)
1856 ->subOp
= getSuClampSubOp(su
, dimc
);
1861 // set predicate output
1862 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1863 src
[0]->getInsn()->setFlagsDef(1, pred
);
1865 if (su
->tex
.target
.isArray() || su
->tex
.target
.isCube()) {
1866 p1
= bld
.getSSA(1, FILE_PREDICATE
);
1867 src
[dim
]->getInsn()->setFlagsDef(1, p1
);
1870 // calculate pixel offset
1872 if (su
->tex
.target
!= TEX_TARGET_BUFFER
)
1873 bld
.mkOp2(OP_AND
, TYPE_U32
, off
, src
[0], bld
.loadImm(NULL
, 0xffff));
1876 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_UNK1C
);
1877 bld
.mkOp3(OP_MADSP
, TYPE_U32
, off
, src
[2], v
, src
[1])
1878 ->subOp
= NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
1880 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_PITCH
);
1881 bld
.mkOp3(OP_MADSP
, TYPE_U32
, off
, off
, v
, src
[0])
1882 ->subOp
= NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
1885 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_PITCH
);
1886 bld
.mkOp3(OP_MADSP
, TYPE_U32
, off
, src
[1], v
, src
[0])
1887 ->subOp
= (su
->tex
.target
.isArray() || su
->tex
.target
.isCube()) ?
1888 NV50_IR_SUBOP_MADSP_SD
: NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
1891 // calculate effective address part 1
1892 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1896 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_FMT
);
1897 bld
.mkOp3(OP_VSHL
, TYPE_U32
, bf
, src
[0], v
, zero
)
1898 ->subOp
= NV50_IR_SUBOP_V1(7,6,8|2);
1912 if (!su
->tex
.target
.isArray() && !su
->tex
.target
.isCube()) {
1913 z
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_UNK1C
);
1914 subOp
= NV50_IR_SUBOP_SUBFM_3D
;
1918 subOp
= NV50_IR_SUBOP_SUBFM_3D
;
1922 insn
= bld
.mkOp3(OP_SUBFM
, TYPE_U32
, bf
, src
[0], y
, z
);
1923 insn
->subOp
= subOp
;
1924 insn
->setFlagsDef(1, pred
);
1928 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_ADDR
);
1930 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1933 eau
= bld
.mkOp3v(OP_SUEAU
, TYPE_U32
, bld
.getScratch(4), off
, bf
, v
);
1935 // add array layer offset
1936 if (su
->tex
.target
.isArray() || su
->tex
.target
.isCube()) {
1937 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_ARRAY
);
1939 bld
.mkOp3(OP_MADSP
, TYPE_U32
, eau
, src
[1], v
, eau
)
1940 ->subOp
= NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
1942 bld
.mkOp3(OP_MADSP
, TYPE_U32
, eau
, v
, src
[2], eau
)
1943 ->subOp
= NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
1944 // combine predicates
1946 bld
.mkOp2(OP_OR
, TYPE_U8
, pred
, pred
, p1
);
1951 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1955 // bf == g[] address & 0xff
1956 // eau == g[] address >> 8
1957 bld
.mkOp3(OP_PERMT
, TYPE_U32
, bf
, lo
, bld
.loadImm(NULL
, 0x6540), eau
);
1958 bld
.mkOp3(OP_PERMT
, TYPE_U32
, eau
, zero
, bld
.loadImm(NULL
, 0x0007), eau
);
1960 if (su
->op
== OP_SULDP
&& su
->tex
.target
== TEX_TARGET_BUFFER
) {
1961 // Convert from u32 to u8 address format, which is what the library code
1962 // doing SULDP currently uses.
1963 // XXX: can SUEAU do this ?
1964 // XXX: does it matter that we don't mask high bytes in bf ?
1966 bld
.mkOp2(OP_SHR
, TYPE_U32
, off
, bf
, bld
.mkImm(8));
1967 bld
.mkOp2(OP_ADD
, TYPE_U32
, eau
, eau
, off
);
1970 bld
.mkOp2(OP_MERGE
, TYPE_U64
, addr
, bf
, eau
);
1972 if (atom
&& su
->tex
.target
== TEX_TARGET_BUFFER
)
1973 bld
.mkOp2(OP_ADD
, TYPE_U64
, addr
, addr
, off
);
1975 // let's just set it 0 for raw access and hope it works
1977 bld
.mkImm(0) : loadSuInfo32(ind
, slot
, NVC0_SU_INFO_FMT
);
1979 // get rid of old coordinate sources, make space for fmt info and predicate
1980 su
->moveSources(arg
, 3 - arg
);
1981 // set 64 bit address and 32-bit format sources
1982 su
->setSrc(0, addr
);
1984 su
->setSrc(2, pred
);
1986 // prevent read fault when the image is not actually bound
1987 CmpInstruction
*pred1
=
1988 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
1989 TYPE_U32
, bld
.mkImm(0),
1990 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_ADDR
));
1992 if (su
->op
!= OP_SUSTP
&& su
->tex
.format
) {
1993 const TexInstruction::ImgFormatDesc
*format
= su
->tex
.format
;
1994 int blockwidth
= format
->bits
[0] + format
->bits
[1] +
1995 format
->bits
[2] + format
->bits
[3];
1997 // make sure that the format doesn't mismatch
1998 assert(format
->components
!= 0);
1999 bld
.mkCmp(OP_SET_OR
, CC_NE
, TYPE_U32
, pred1
->getDef(0),
2000 TYPE_U32
, bld
.loadImm(NULL
, blockwidth
/ 8),
2001 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_BSIZE
),
2004 su
->setPredicate(CC_NOT_P
, pred1
->getDef(0));
2006 // TODO: initialize def values to 0 when the surface operation is not
2007 // performed (not needed for stores). Also, fix the "address bounds test"
2008 // subtests from arb_shader_image_load_store-invalid for buffers, because it
2009 // seems like that the predicate is not correctly set by suclamp.
// Map an image-format component `c` to the IR source data type used when
// unpacking it (by format class and component bit width).
// NOTE(review): the enclosing switch header and the UINT/SINT case labels
// are not visible in this extract (original numbering skips them).
2013 getSrcType(const TexInstruction::ImgFormatDesc
*t
, int c
)
2016 case FLOAT
: return t
->bits
[c
] == 16 ? TYPE_F16
: TYPE_F32
;
2017 case UNORM
: return t
->bits
[c
] == 8 ? TYPE_U8
: TYPE_U16
;
2018 case SNORM
: return t
->bits
[c
] == 8 ? TYPE_S8
: TYPE_S16
;
2020 return (t
->bits
[c
] == 8 ? TYPE_U8
:
2021 (t
->bits
[c
] == 16 ? TYPE_U16
: TYPE_U32
));
2023 return (t
->bits
[c
] == 8 ? TYPE_S8
:
2024 (t
->bits
[c
] == 16 ? TYPE_S16
: TYPE_S32
));
// Map an image-format class to the IR destination type for unpacking.
// NOTE(review): the switch body is missing from this extract — only the
// unreachable-type assert is visible.
2030 getDestType(const ImgType type
) {
2041 assert(!"Impossible type");
// Rewrite a typed surface load (SULDP) into a raw load of the format's
// total bit width, then unpack/convert each component into the original
// defs: mov for 32-bit, CVT with lane sub-ops for 16/8-bit, EXTBF for odd
// widths; UNORM/SNORM are normalized and small floats expanded via f16.
2047 NVC0LoweringPass::convertSurfaceFormat(TexInstruction
*su
)
2049 const TexInstruction::ImgFormatDesc
*format
= su
->tex
.format
;
2050 int width
= format
->bits
[0] + format
->bits
[1] +
2051 format
->bits
[2] + format
->bits
[3];
2052 Value
*untypedDst
[4] = {};
2053 Value
*typedDst
[4] = {};
2055 // We must convert this to a generic load.
2058 su
->dType
= typeOfSize(width
/ 8);
2059 su
->sType
= TYPE_U8
;
// One untyped 32-bit def per dword of raw data (at least one).
2061 for (int i
= 0; i
< width
/ 32; i
++)
2062 untypedDst
[i
] = bld
.getSSA();
2064 untypedDst
[0] = bld
.getSSA();
2066 for (int i
= 0; i
< 4; i
++) {
2067 typedDst
[i
] = su
->getDef(i
);
2070 // Set the untyped dsts as the su's destinations
2071 for (int i
= 0; i
< 4; i
++)
2072 su
->setDef(i
, untypedDst
[i
]);
2074 bld
.setPosition(su
, true);
2076 // Unpack each component into the typed dsts
2078 for (int i
= 0; i
< 4; bits
+= format
->bits
[i
], i
++) {
// Missing components read as 0 (or 1.0/1 for alpha).
2081 if (i
>= format
->components
) {
2082 if (format
->type
== FLOAT
||
2083 format
->type
== UNORM
||
2084 format
->type
== SNORM
)
2085 bld
.loadImm(typedDst
[i
], i
== 3 ? 1.0f
: 0.0f
);
2087 bld
.loadImm(typedDst
[i
], i
== 3 ? 1 : 0);
2091 // Get just that component's data into the relevant place
2092 if (format
->bits
[i
] == 32)
2093 bld
.mkMov(typedDst
[i
], untypedDst
[i
]);
2094 else if (format
->bits
[i
] == 16)
2095 bld
.mkCvt(OP_CVT
, getDestType(format
->type
), typedDst
[i
],
2096 getSrcType(format
, i
), untypedDst
[i
/ 2])
2097 ->subOp
= (i
& 1) << (format
->type
== FLOAT
? 0 : 1);
2098 else if (format
->bits
[i
] == 8)
2099 bld
.mkCvt(OP_CVT
, getDestType(format
->type
), typedDst
[i
],
2100 getSrcType(format
, i
), untypedDst
[0])->subOp
= i
;
2102 bld
.mkOp2(OP_EXTBF
, TYPE_U32
, typedDst
[i
], untypedDst
[bits
/ 32],
2103 bld
.mkImm((bits
% 32) | (format
->bits
[i
] << 8)));
2104 if (format
->type
== UNORM
|| format
->type
== SNORM
)
2105 bld
.mkCvt(OP_CVT
, TYPE_F32
, typedDst
[i
], getSrcType(format
, i
), typedDst
[i
]);
2108 // Normalize / convert as necessary
2109 if (format
->type
== UNORM
)
2110 bld
.mkOp2(OP_MUL
, TYPE_F32
, typedDst
[i
], typedDst
[i
], bld
.loadImm(NULL
, 1.0f
/ ((1 << format
->bits
[i
]) - 1)));
2111 else if (format
->type
== SNORM
)
2112 bld
.mkOp2(OP_MUL
, TYPE_F32
, typedDst
[i
], typedDst
[i
], bld
.loadImm(NULL
, 1.0f
/ ((1 << (format
->bits
[i
] - 1)) - 1)));
2113 else if (format
->type
== FLOAT
&& format
->bits
[i
] < 16) {
// Shift sub-16-bit float into f16 position, then convert f16 -> f32.
2114 bld
.mkOp2(OP_SHL
, TYPE_U32
, typedDst
[i
], typedDst
[i
], bld
.loadImm(NULL
, 15 - format
->bits
[i
]));
2115 bld
.mkCvt(OP_CVT
, TYPE_F32
, typedDst
[i
], TYPE_F16
, typedDst
[i
]);
// Swap R and B components — presumably for BGRA-ordered formats; the
// guarding condition is not visible in this extract.
2120 std::swap(typedDst
[0], typedDst
[2]);
// NVE4 surface op entry point: lower the coordinates, convert typed loads,
// and rewrite SURED into a plain global OP_ATOM guarded by the combined
// out-of-bounds predicate (with a predicated 0 + UNION to give skipped
// atomics a defined result).
2125 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction
*su
)
2127 processSurfaceCoordsNVE4(su
);
2129 if (su
->op
== OP_SULDP
)
2130 convertSurfaceFormat(su
);
2132 if (su
->op
== OP_SUREDB
|| su
->op
== OP_SUREDP
) {
2133 assert(su
->getPredicate());
// Merge the instruction predicate with the coordinate-clamp predicate.
2135 bld
.mkOp2v(OP_OR
, TYPE_U8
, bld
.getScratch(1, FILE_PREDICATE
),
2136 su
->getPredicate(), su
->getSrc(2));
2138 Instruction
*red
= bld
.mkOp(OP_ATOM
, su
->dType
, bld
.getSSA());
2139 red
->subOp
= su
->subOp
;
2140 red
->setSrc(0, bld
.mkSymbol(FILE_MEMORY_GLOBAL
, 0, TYPE_U32
, 0));
2141 red
->setSrc(1, su
->getSrc(3));
2142 if (su
->subOp
== NV50_IR_SUBOP_ATOM_CAS
)
2143 red
->setSrc(2, su
->getSrc(4));
2144 red
->setIndirect(0, 0, su
->getSrc(0));
2146 // make sure to initialize dst value when the atomic operation is not
2148 Instruction
*mov
= bld
.mkMov(bld
.getSSA(), bld
.loadImm(NULL
, 0));
2150 assert(su
->cc
== CC_NOT_P
);
2151 red
->setPredicate(su
->cc
, pred
);
2152 mov
->setPredicate(CC_P
, pred
);
2154 bld
.mkOp2(OP_UNION
, TYPE_U32
, su
->getDef(0),
2155 red
->getDef(0), mov
->getDef(0));
// The original SURED is fully replaced by the OP_ATOM.
2157 delete_Instruction(bld
.getProgram(), su
);
2158 handleCasExch(red
, true);
2161 if (su
->op
== OP_SUSTB
|| su
->op
== OP_SUSTP
)
2162 su
->sType
= (su
->tex
.target
== TEX_TARGET_BUFFER
) ? TYPE_U32
: TYPE_U8
;
// NVC0 (Fermi) surface lowering: MS coordinate adjustment, indirect slot
// clamping, pixel-offset scaling by the block size, array layer scaling,
// and an out-of-bounds / format-mismatch predicate on the instruction.
2166 NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction
*su
)
2168 const int slot
= su
->tex
.r
;
2169 const int dim
= su
->tex
.target
.getDim();
2170 const int arg
= dim
+ (su
->tex
.target
.isArray() || su
->tex
.target
.isCube());
2172 Value
*zero
= bld
.mkImm(0);
2175 Value
*ind
= su
->getIndirectR();
2177 bld
.setPosition(su
, false);
2179 adjustCoordinatesMS(su
);
// Indirect access: fold the slot into the index and clamp to 3 bits.
2183 ptr
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getSSA(), ind
, bld
.mkImm(su
->tex
.r
));
2184 ptr
= bld
.mkOp2v(OP_AND
, TYPE_U32
, bld
.getSSA(), ptr
, bld
.mkImm(7));
2185 su
->setIndirectR(ptr
);
2188 // get surface coordinates
2189 for (c
= 0; c
< arg
; ++c
)
2190 src
[c
] = su
->getSrc(c
);
2194 // calculate pixel offset
2195 if (su
->op
== OP_SULDP
|| su
->op
== OP_SUREDP
) {
2196 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_BSIZE
);
2197 su
->setSrc(0, bld
.mkOp2v(OP_MUL
, TYPE_U32
, bld
.getSSA(), src
[0], v
));
2200 // add array layer offset
2201 if (su
->tex
.target
.isArray() || su
->tex
.target
.isCube()) {
2202 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_ARRAY
);
2204 su
->setSrc(2, bld
.mkOp2v(OP_MUL
, TYPE_U32
, bld
.getSSA(), src
[2], v
));
2207 // prevent read fault when the image is not actually bound
2208 CmpInstruction
*pred
=
2209 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
2210 TYPE_U32
, bld
.mkImm(0),
2211 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_ADDR
));
2212 if (su
->op
!= OP_SUSTP
&& su
->tex
.format
) {
2213 const TexInstruction::ImgFormatDesc
*format
= su
->tex
.format
;
2214 int blockwidth
= format
->bits
[0] + format
->bits
[1] +
2215 format
->bits
[2] + format
->bits
[3];
2217 assert(format
->components
!= 0);
2218 // make sure that the format doesn't mismatch when it's not FMT_NONE
2219 bld
.mkCmp(OP_SET_OR
, CC_NE
, TYPE_U32
, pred
->getDef(0),
2220 TYPE_U32
, bld
.loadImm(NULL
, blockwidth
/ 8),
2221 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_BSIZE
),
2224 su
->setPredicate(CC_NOT_P
, pred
->getDef(0));
// NVC0 surface op entry point: promote 1D arrays to 2D arrays (insert a
// zero Y coordinate), lower coordinates, convert typed loads, and rewrite
// SURED into an address-producing surface op followed by a global OP_ATOM,
// with a predicated 0 + UNION for the skipped (out-of-bounds) case.
2228 NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction
*su
)
2230 if (su
->tex
.target
== TEX_TARGET_1D_ARRAY
) {
2231 /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY
2232 * will simplify the lowering pass and the texture constraints. */
2233 su
->moveSources(1, 1);
2234 su
->setSrc(1, bld
.loadImm(NULL
, 0));
2235 su
->tex
.target
= TEX_TARGET_2D_ARRAY
;
2238 processSurfaceCoordsNVC0(su
);
2240 if (su
->op
== OP_SULDP
)
2241 convertSurfaceFormat(su
);
2243 if (su
->op
== OP_SUREDB
|| su
->op
== OP_SUREDP
) {
2244 const int dim
= su
->tex
.target
.getDim();
2245 const int arg
= dim
+ (su
->tex
.target
.isArray() || su
->tex
.target
.isCube());
2246 LValue
*addr
= bld
.getSSA(8);
2247 Value
*def
= su
->getDef(0);
2251 // Set the destination to the address
2252 su
->dType
= TYPE_U64
;
2253 su
->setDef(0, addr
);
2254 su
->setDef(1, su
->getPredicate());
2256 bld
.setPosition(su
, true);
2258 // Perform the atomic op
2259 Instruction
*red
= bld
.mkOp(OP_ATOM
, su
->sType
, bld
.getSSA());
2260 red
->subOp
= su
->subOp
;
2261 red
->setSrc(0, bld
.mkSymbol(FILE_MEMORY_GLOBAL
, 0, su
->sType
, 0));
2262 red
->setSrc(1, su
->getSrc(arg
));
2263 if (red
->subOp
== NV50_IR_SUBOP_ATOM_CAS
)
2264 red
->setSrc(2, su
->getSrc(arg
+ 1));
2265 red
->setIndirect(0, 0, addr
);
2267 // make sure to initialize dst value when the atomic operation is not
2269 Instruction
*mov
= bld
.mkMov(bld
.getSSA(), bld
.loadImm(NULL
, 0));
2271 assert(su
->cc
== CC_NOT_P
);
2272 red
->setPredicate(su
->cc
, su
->getPredicate());
2273 mov
->setPredicate(CC_P
, su
->getPredicate());
2275 bld
.mkOp2(OP_UNION
, TYPE_U32
, def
, red
->getDef(0), mov
->getDef(0));
2277 handleCasExch(red
, false);
2282 NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction
*su
)
2284 const int slot
= su
->tex
.r
;
2285 const int dim
= su
->tex
.target
.getDim();
2286 const int arg
= dim
+ (su
->tex
.target
.isArray() || su
->tex
.target
.isCube());
2287 Value
*ind
= su
->getIndirectR();
2290 bld
.setPosition(su
, false);
2292 // add texture handle
2298 pos
= (su
->subOp
== NV50_IR_SUBOP_ATOM_CAS
) ? 2 : 1;
2304 su
->setSrc(arg
+ pos
, loadTexHandle(ind
, slot
+ 32));
2306 // prevent read fault when the image is not actually bound
2307 CmpInstruction
*pred
=
2308 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
2309 TYPE_U32
, bld
.mkImm(0),
2310 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_ADDR
));
2311 if (su
->op
!= OP_SUSTP
&& su
->tex
.format
) {
2312 const TexInstruction::ImgFormatDesc
*format
= su
->tex
.format
;
2313 int blockwidth
= format
->bits
[0] + format
->bits
[1] +
2314 format
->bits
[2] + format
->bits
[3];
2316 assert(format
->components
!= 0);
2317 // make sure that the format doesn't mismatch when it's not FMT_NONE
2318 bld
.mkCmp(OP_SET_OR
, CC_NE
, TYPE_U32
, pred
->getDef(0),
2319 TYPE_U32
, bld
.loadImm(NULL
, blockwidth
/ 8),
2320 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_BSIZE
),
2323 su
->setPredicate(CC_NOT_P
, pred
->getDef(0));
2327 NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction
*su
)
2329 processSurfaceCoordsGM107(su
);
2331 if (su
->op
== OP_SULDP
)
2332 convertSurfaceFormat(su
);
2334 if (su
->op
== OP_SUREDP
) {
2335 Value
*def
= su
->getDef(0);
2338 su
->setDef(0, bld
.getSSA());
2340 bld
.setPosition(su
, true);
2342 // make sure to initialize dst value when the atomic operation is not
2344 Instruction
*mov
= bld
.mkMov(bld
.getSSA(), bld
.loadImm(NULL
, 0));
2346 assert(su
->cc
== CC_NOT_P
);
2347 mov
->setPredicate(CC_P
, su
->getPredicate());
2349 bld
.mkOp2(OP_UNION
, TYPE_U32
, def
, su
->getDef(0), mov
->getDef(0));
2354 NVC0LoweringPass::handleWRSV(Instruction
*i
)
2360 // must replace, $sreg are not writeable
2361 addr
= targ
->getSVAddress(FILE_SHADER_OUTPUT
, i
->getSrc(0)->asSym());
2364 sym
= bld
.mkSymbol(FILE_SHADER_OUTPUT
, 0, i
->sType
, addr
);
2366 st
= bld
.mkStore(OP_EXPORT
, i
->dType
, sym
, i
->getIndirect(0, 0),
2368 st
->perPatch
= i
->perPatch
;
2370 bld
.getBB()->remove(i
);
2375 NVC0LoweringPass::handleLDST(Instruction
*i
)
2377 if (i
->src(0).getFile() == FILE_SHADER_INPUT
) {
2378 if (prog
->getType() == Program::TYPE_COMPUTE
) {
2379 i
->getSrc(0)->reg
.file
= FILE_MEMORY_CONST
;
2380 i
->getSrc(0)->reg
.fileIndex
= 0;
2382 if (prog
->getType() == Program::TYPE_GEOMETRY
&&
2383 i
->src(0).isIndirect(0)) {
2384 // XXX: this assumes vec4 units
2385 Value
*ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(),
2386 i
->getIndirect(0, 0), bld
.mkImm(4));
2387 i
->setIndirect(0, 0, ptr
);
2391 assert(prog
->getType() != Program::TYPE_FRAGMENT
); // INTERP
2393 } else if (i
->src(0).getFile() == FILE_MEMORY_CONST
) {
2394 if (targ
->getChipset() >= NVISA_GK104_CHIPSET
&&
2395 prog
->getType() == Program::TYPE_COMPUTE
) {
2396 // The launch descriptor only allows to set up 8 CBs, but OpenGL
2397 // requires at least 12 UBOs. To bypass this limitation, we store the
2398 // addrs into the driver constbuf and we directly load from the global
2400 int8_t fileIndex
= i
->getSrc(0)->reg
.fileIndex
- 1;
2401 Value
*ind
= i
->getIndirect(0, 1);
2404 // Clamp the UBO index when an indirect access is used to avoid
2405 // loading information from the wrong place in the driver cb.
2406 ind
= bld
.mkOp2v(OP_MIN
, TYPE_U32
, ind
,
2407 bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getSSA(),
2408 ind
, bld
.loadImm(NULL
, fileIndex
)),
2409 bld
.loadImm(NULL
, 12));
2412 if (i
->src(0).isIndirect(1)) {
2413 Value
*offset
= bld
.loadImm(NULL
, i
->getSrc(0)->reg
.data
.offset
+ typeSizeof(i
->sType
));
2414 Value
*ptr
= loadUboInfo64(ind
, fileIndex
* 16);
2415 Value
*length
= loadUboLength32(ind
, fileIndex
* 16);
2416 Value
*pred
= new_LValue(func
, FILE_PREDICATE
);
2417 if (i
->src(0).isIndirect(0)) {
2418 bld
.mkOp2(OP_ADD
, TYPE_U64
, ptr
, ptr
, i
->getIndirect(0, 0));
2419 bld
.mkOp2(OP_ADD
, TYPE_U32
, offset
, offset
, i
->getIndirect(0, 0));
2421 i
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
2422 i
->setIndirect(0, 1, NULL
);
2423 i
->setIndirect(0, 0, ptr
);
2424 bld
.mkCmp(OP_SET
, CC_GT
, TYPE_U32
, pred
, TYPE_U32
, offset
, length
);
2425 i
->setPredicate(CC_NOT_P
, pred
);
2426 if (i
->defExists(0)) {
2427 bld
.mkMov(i
->getDef(0), bld
.mkImm(0));
2429 } else if (fileIndex
>= 0) {
2430 Value
*ptr
= loadUboInfo64(ind
, fileIndex
* 16);
2431 if (i
->src(0).isIndirect(0)) {
2432 bld
.mkOp2(OP_ADD
, TYPE_U64
, ptr
, ptr
, i
->getIndirect(0, 0));
2434 i
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
2435 i
->setIndirect(0, 1, NULL
);
2436 i
->setIndirect(0, 0, ptr
);
2438 } else if (i
->src(0).isIndirect(1)) {
2440 if (i
->src(0).isIndirect(0))
2441 ptr
= bld
.mkOp3v(OP_INSBF
, TYPE_U32
, bld
.getSSA(),
2442 i
->getIndirect(0, 1), bld
.mkImm(0x1010),
2443 i
->getIndirect(0, 0));
2445 ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(),
2446 i
->getIndirect(0, 1), bld
.mkImm(16));
2447 i
->setIndirect(0, 1, NULL
);
2448 i
->setIndirect(0, 0, ptr
);
2449 i
->subOp
= NV50_IR_SUBOP_LDC_IS
;
2451 } else if (i
->src(0).getFile() == FILE_SHADER_OUTPUT
) {
2452 assert(prog
->getType() == Program::TYPE_TESSELLATION_CONTROL
);
2454 } else if (i
->src(0).getFile() == FILE_MEMORY_BUFFER
) {
2455 Value
*ind
= i
->getIndirect(0, 1);
2456 Value
*ptr
= loadBufInfo64(ind
, i
->getSrc(0)->reg
.fileIndex
* 16);
2457 // XXX come up with a way not to do this for EVERY little access but
2458 // rather to batch these up somehow. Unfortunately we've lost the
2459 // information about the field width by the time we get here.
2460 Value
*offset
= bld
.loadImm(NULL
, i
->getSrc(0)->reg
.data
.offset
+ typeSizeof(i
->sType
));
2461 Value
*length
= loadBufLength32(ind
, i
->getSrc(0)->reg
.fileIndex
* 16);
2462 Value
*pred
= new_LValue(func
, FILE_PREDICATE
);
2463 if (i
->src(0).isIndirect(0)) {
2464 bld
.mkOp2(OP_ADD
, TYPE_U64
, ptr
, ptr
, i
->getIndirect(0, 0));
2465 bld
.mkOp2(OP_ADD
, TYPE_U32
, offset
, offset
, i
->getIndirect(0, 0));
2467 i
->setIndirect(0, 1, NULL
);
2468 i
->setIndirect(0, 0, ptr
);
2469 i
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
2470 bld
.mkCmp(OP_SET
, CC_GT
, TYPE_U32
, pred
, TYPE_U32
, offset
, length
);
2471 i
->setPredicate(CC_NOT_P
, pred
);
2472 if (i
->defExists(0)) {
2473 Value
*zero
, *dst
= i
->getDef(0);
2474 i
->setDef(0, bld
.getSSA());
2476 bld
.setPosition(i
, true);
2477 bld
.mkMov((zero
= bld
.getSSA()), bld
.mkImm(0))
2478 ->setPredicate(CC_P
, pred
);
2479 bld
.mkOp2(OP_UNION
, TYPE_U32
, dst
, i
->getDef(0), zero
);
2485 NVC0LoweringPass::readTessCoord(LValue
*dst
, int c
)
2487 Value
*laneid
= bld
.getSSA();
2490 bld
.mkOp1(OP_RDSV
, TYPE_U32
, laneid
, bld
.mkSysVal(SV_LANEID
, 0));
2501 if (prog
->driver
->prop
.tp
.domain
!= PIPE_PRIM_TRIANGLES
) {
2502 bld
.mkMov(dst
, bld
.loadImm(NULL
, 0));
2509 bld
.mkFetch(x
, TYPE_F32
, FILE_SHADER_OUTPUT
, 0x2f0, NULL
, laneid
);
2511 bld
.mkFetch(y
, TYPE_F32
, FILE_SHADER_OUTPUT
, 0x2f4, NULL
, laneid
);
2514 bld
.mkOp2(OP_ADD
, TYPE_F32
, dst
, x
, y
);
2515 bld
.mkOp2(OP_SUB
, TYPE_F32
, dst
, bld
.loadImm(NULL
, 1.0f
), dst
);
2520 NVC0LoweringPass::handleRDSV(Instruction
*i
)
2522 Symbol
*sym
= i
->getSrc(0)->asSym();
2523 const SVSemantic sv
= sym
->reg
.data
.sv
.sv
;
2526 uint32_t addr
= targ
->getSVAddress(FILE_SHADER_INPUT
, sym
);
2528 if (addr
>= 0x400) {
2530 if (sym
->reg
.data
.sv
.index
== 3) {
2531 // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
2533 i
->setSrc(0, bld
.mkImm((sv
== SV_NTID
|| sv
== SV_NCTAID
) ? 1 : 0));
2535 if (sv
== SV_VERTEX_COUNT
) {
2536 bld
.setPosition(i
, true);
2537 bld
.mkOp2(OP_EXTBF
, TYPE_U32
, i
->getDef(0), i
->getDef(0), bld
.mkImm(0x808));
2544 assert(prog
->getType() == Program::TYPE_FRAGMENT
);
2545 if (i
->srcExists(1)) {
2546 // Pass offset through to the interpolation logic
2547 ld
= bld
.mkInterp(NV50_IR_INTERP_LINEAR
| NV50_IR_INTERP_OFFSET
,
2548 i
->getDef(0), addr
, NULL
);
2549 ld
->setSrc(1, i
->getSrc(1));
2551 bld
.mkInterp(NV50_IR_INTERP_LINEAR
, i
->getDef(0), addr
, NULL
);
2556 Value
*face
= i
->getDef(0);
2557 bld
.mkInterp(NV50_IR_INTERP_FLAT
, face
, addr
, NULL
);
2558 if (i
->dType
== TYPE_F32
) {
2559 bld
.mkOp2(OP_OR
, TYPE_U32
, face
, face
, bld
.mkImm(0x00000001));
2560 bld
.mkOp1(OP_NEG
, TYPE_S32
, face
, face
);
2561 bld
.mkCvt(OP_CVT
, TYPE_F32
, face
, TYPE_S32
, face
);
2566 assert(prog
->getType() == Program::TYPE_TESSELLATION_EVAL
);
2567 readTessCoord(i
->getDef(0)->asLValue(), i
->getSrc(0)->reg
.data
.sv
.index
);
2572 assert(targ
->getChipset() >= NVISA_GK104_CHIPSET
); // mov $sreg otherwise
2573 if (sym
->reg
.data
.sv
.index
== 3) {
2575 i
->setSrc(0, bld
.mkImm(sv
== SV_GRIDID
? 0 : 1));
2580 addr
+= prog
->driver
->prop
.cp
.gridInfoBase
;
2581 bld
.mkLoad(TYPE_U32
, i
->getDef(0),
2582 bld
.mkSymbol(FILE_MEMORY_CONST
, prog
->driver
->io
.auxCBSlot
,
2583 TYPE_U32
, addr
), NULL
);
2585 case SV_SAMPLE_INDEX
:
2586 // TODO: Properly pass source as an address in the PIX address space
2587 // (which can be of the form [r0+offset]). But this is currently
2589 ld
= bld
.mkOp1(OP_PIXLD
, TYPE_U32
, i
->getDef(0), bld
.mkImm(0));
2590 ld
->subOp
= NV50_IR_SUBOP_PIXLD_SAMPLEID
;
2592 case SV_SAMPLE_POS
: {
2593 Value
*off
= new_LValue(func
, FILE_GPR
);
2594 ld
= bld
.mkOp1(OP_PIXLD
, TYPE_U32
, i
->getDef(0), bld
.mkImm(0));
2595 ld
->subOp
= NV50_IR_SUBOP_PIXLD_SAMPLEID
;
2596 bld
.mkOp2(OP_SHL
, TYPE_U32
, off
, i
->getDef(0), bld
.mkImm(3));
2597 bld
.mkLoad(TYPE_F32
,
2600 FILE_MEMORY_CONST
, prog
->driver
->io
.auxCBSlot
,
2601 TYPE_U32
, prog
->driver
->io
.sampleInfoBase
+
2602 4 * sym
->reg
.data
.sv
.index
),
2606 case SV_SAMPLE_MASK
: {
2607 ld
= bld
.mkOp1(OP_PIXLD
, TYPE_U32
, i
->getDef(0), bld
.mkImm(0));
2608 ld
->subOp
= NV50_IR_SUBOP_PIXLD_COVMASK
;
2609 Instruction
*sampleid
=
2610 bld
.mkOp1(OP_PIXLD
, TYPE_U32
, bld
.getSSA(), bld
.mkImm(0));
2611 sampleid
->subOp
= NV50_IR_SUBOP_PIXLD_SAMPLEID
;
2613 bld
.mkOp2v(OP_AND
, TYPE_U32
, bld
.getSSA(), ld
->getDef(0),
2614 bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(),
2615 bld
.loadImm(NULL
, 1), sampleid
->getDef(0)));
2616 if (prog
->driver
->prop
.fp
.persampleInvocation
) {
2617 bld
.mkMov(i
->getDef(0), masked
);
2619 bld
.mkOp3(OP_SELP
, TYPE_U32
, i
->getDef(0), ld
->getDef(0), masked
,
2626 case SV_BASEINSTANCE
:
2628 ld
= bld
.mkLoad(TYPE_U32
, i
->getDef(0),
2629 bld
.mkSymbol(FILE_MEMORY_CONST
,
2630 prog
->driver
->io
.auxCBSlot
,
2632 prog
->driver
->io
.drawInfoBase
+
2633 4 * (sv
- SV_BASEVERTEX
)),
2637 if (prog
->getType() == Program::TYPE_TESSELLATION_EVAL
&& !i
->perPatch
)
2638 vtx
= bld
.mkOp1v(OP_PFETCH
, TYPE_U32
, bld
.getSSA(), bld
.mkImm(0));
2639 if (prog
->getType() == Program::TYPE_FRAGMENT
) {
2640 bld
.mkInterp(NV50_IR_INTERP_FLAT
, i
->getDef(0), addr
, NULL
);
2642 ld
= bld
.mkFetch(i
->getDef(0), i
->dType
,
2643 FILE_SHADER_INPUT
, addr
, i
->getIndirect(0, 0), vtx
);
2644 ld
->perPatch
= i
->perPatch
;
2648 bld
.getBB()->remove(i
);
2653 NVC0LoweringPass::handleDIV(Instruction
*i
)
2655 if (!isFloatType(i
->dType
))
2657 bld
.setPosition(i
, false);
2658 Instruction
*rcp
= bld
.mkOp1(OP_RCP
, i
->dType
, bld
.getSSA(typeSizeof(i
->dType
)), i
->getSrc(1));
2660 i
->setSrc(1, rcp
->getDef(0));
2665 NVC0LoweringPass::handleMOD(Instruction
*i
)
2667 if (!isFloatType(i
->dType
))
2669 LValue
*value
= bld
.getScratch(typeSizeof(i
->dType
));
2670 bld
.mkOp1(OP_RCP
, i
->dType
, value
, i
->getSrc(1));
2671 bld
.mkOp2(OP_MUL
, i
->dType
, value
, i
->getSrc(0), value
);
2672 bld
.mkOp1(OP_TRUNC
, i
->dType
, value
, value
);
2673 bld
.mkOp2(OP_MUL
, i
->dType
, value
, i
->getSrc(1), value
);
2675 i
->setSrc(1, value
);
2680 NVC0LoweringPass::handleSQRT(Instruction
*i
)
2682 if (i
->dType
== TYPE_F64
) {
2683 Value
*pred
= bld
.getSSA(1, FILE_PREDICATE
);
2684 Value
*zero
= bld
.loadImm(NULL
, 0.0);
2685 Value
*dst
= bld
.getSSA(8);
2686 bld
.mkOp1(OP_RSQ
, i
->dType
, dst
, i
->getSrc(0));
2687 bld
.mkCmp(OP_SET
, CC_LE
, i
->dType
, pred
, i
->dType
, i
->getSrc(0), zero
);
2688 bld
.mkOp3(OP_SELP
, TYPE_U64
, dst
, zero
, dst
, pred
);
2691 // TODO: Handle this properly with a library function
2693 bld
.setPosition(i
, true);
2695 bld
.mkOp1(OP_RCP
, i
->dType
, i
->getDef(0), i
->getDef(0));
2702 NVC0LoweringPass::handlePOW(Instruction
*i
)
2704 LValue
*val
= bld
.getScratch();
2706 bld
.mkOp1(OP_LG2
, TYPE_F32
, val
, i
->getSrc(0));
2707 bld
.mkOp2(OP_MUL
, TYPE_F32
, val
, i
->getSrc(1), val
)->dnz
= 1;
2708 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, val
, val
);
2718 NVC0LoweringPass::handleEXPORT(Instruction
*i
)
2720 if (prog
->getType() == Program::TYPE_FRAGMENT
) {
2721 int id
= i
->getSrc(0)->reg
.data
.offset
/ 4;
2723 if (i
->src(0).isIndirect(0)) // TODO, ugly
2726 i
->subOp
= NV50_IR_SUBOP_MOV_FINAL
;
2727 i
->src(0).set(i
->src(1));
2729 i
->setDef(0, new_LValue(func
, FILE_GPR
));
2730 i
->getDef(0)->reg
.data
.id
= id
;
2732 prog
->maxGPR
= MAX2(prog
->maxGPR
, id
);
2734 if (prog
->getType() == Program::TYPE_GEOMETRY
) {
2735 i
->setIndirect(0, 1, gpEmitAddress
);
2741 NVC0LoweringPass::handleOUT(Instruction
*i
)
2743 Instruction
*prev
= i
->prev
;
2744 ImmediateValue stream
, prevStream
;
2746 // Only merge if the stream ids match. Also, note that the previous
2747 // instruction would have already been lowered, so we take arg1 from it.
2748 if (i
->op
== OP_RESTART
&& prev
&& prev
->op
== OP_EMIT
&&
2749 i
->src(0).getImmediate(stream
) &&
2750 prev
->src(1).getImmediate(prevStream
) &&
2751 stream
.reg
.data
.u32
== prevStream
.reg
.data
.u32
) {
2752 i
->prev
->subOp
= NV50_IR_SUBOP_EMIT_RESTART
;
2753 delete_Instruction(prog
, i
);
2755 assert(gpEmitAddress
);
2756 i
->setDef(0, gpEmitAddress
);
2757 i
->setSrc(1, i
->getSrc(0));
2758 i
->setSrc(0, gpEmitAddress
);
2763 // Generate a binary predicate if an instruction is predicated by
2764 // e.g. an f32 value.
2766 NVC0LoweringPass::checkPredicate(Instruction
*insn
)
2768 Value
*pred
= insn
->getPredicate();
2771 if (!pred
|| pred
->reg
.file
== FILE_PREDICATE
)
2773 pdst
= new_LValue(func
, FILE_PREDICATE
);
2775 // CAUTION: don't use pdst->getInsn, the definition might not be unique,
2776 // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
2778 bld
.mkCmp(OP_SET
, CC_NEU
, insn
->dType
, pdst
, insn
->dType
, bld
.mkImm(0), pred
);
2780 insn
->setPredicate(insn
->cc
, pdst
);
2784 // - add quadop dance for texturing
2785 // - put FP outputs in GPRs
2786 // - convert instruction sequences
2789 NVC0LoweringPass::visit(Instruction
*i
)
2792 bld
.setPosition(i
, false);
2794 if (i
->cc
!= CC_ALWAYS
)
2803 return handleTEX(i
->asTex());
2805 return handleTXD(i
->asTex());
2807 return handleTXLQ(i
->asTex());
2809 return handleTXQ(i
->asTex());
2811 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, i
->getDef(0), i
->getSrc(0));
2812 i
->setSrc(0, i
->getDef(0));
2815 return handlePOW(i
);
2817 return handleDIV(i
);
2819 return handleMOD(i
);
2821 return handleSQRT(i
);
2823 ret
= handleEXPORT(i
);
2827 return handleOUT(i
);
2829 return handleRDSV(i
);
2831 return handleWRSV(i
);
2838 const bool cctl
= i
->src(0).getFile() == FILE_MEMORY_BUFFER
;
2840 handleCasExch(i
, cctl
);
2849 if (targ
->getChipset() >= NVISA_GM107_CHIPSET
)
2850 handleSurfaceOpGM107(i
->asTex());
2851 else if (targ
->getChipset() >= NVISA_GK104_CHIPSET
)
2852 handleSurfaceOpNVE4(i
->asTex());
2854 handleSurfaceOpNVC0(i
->asTex());
2857 handleSUQ(i
->asTex());
2866 /* Kepler+ has a special opcode to compute a new base address to be used
2867 * for indirect loads.
2869 * Maxwell+ has an additional similar requirement for indirect
2870 * interpolation ops in frag shaders.
2872 bool doAfetch
= false;
2873 if (targ
->getChipset() >= NVISA_GK104_CHIPSET
&&
2875 (i
->op
== OP_VFETCH
|| i
->op
== OP_EXPORT
) &&
2876 i
->src(0).isIndirect(0)) {
2879 if (targ
->getChipset() >= NVISA_GM107_CHIPSET
&&
2880 (i
->op
== OP_LINTERP
|| i
->op
== OP_PINTERP
) &&
2881 i
->src(0).isIndirect(0)) {
2886 Value
*addr
= cloneShallow(func
, i
->getSrc(0));
2887 Instruction
*afetch
= bld
.mkOp1(OP_AFETCH
, TYPE_U32
, bld
.getSSA(),
2889 afetch
->setIndirect(0, 0, i
->getIndirect(0, 0));
2890 addr
->reg
.data
.offset
= 0;
2892 i
->setIndirect(0, 0, afetch
->getDef(0));
2899 TargetNVC0::runLegalizePass(Program
*prog
, CGStage stage
) const
2901 if (stage
== CG_STAGE_PRE_SSA
) {
2902 NVC0LoweringPass
pass(prog
);
2903 return pass
.run(prog
, false, true);
2905 if (stage
== CG_STAGE_POST_RA
) {
2906 NVC0LegalizePostRA
pass(prog
);
2907 return pass
.run(prog
, false, true);
2909 if (stage
== CG_STAGE_SSA
) {
2910 NVC0LegalizeSSA pass
;
2911 return pass
.run(prog
, false, true);
2916 } // namespace nv50_ir