2 * Copyright 2011 Christoph Bumiller
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_build_util.h"
26 #include "codegen/nv50_ir_target_nvc0.h"
27 #include "codegen/nv50_ir_lowering_nvc0.h"
39 #define QUADOP(q, r, s, t) \
40 ((QOP_##q << 6) | (QOP_##r << 4) | \
41 (QOP_##s << 2) | (QOP_##t << 0))
44 NVC0LegalizeSSA::handleDIV(Instruction
*i
)
46 FlowInstruction
*call
;
50 bld
.setPosition(i
, false);
51 def
[0] = bld
.mkMovToReg(0, i
->getSrc(0))->getDef(0);
52 def
[1] = bld
.mkMovToReg(1, i
->getSrc(1))->getDef(0);
54 case TYPE_U32
: builtin
= NVC0_BUILTIN_DIV_U32
; break;
55 case TYPE_S32
: builtin
= NVC0_BUILTIN_DIV_S32
; break;
59 call
= bld
.mkFlow(OP_CALL
, NULL
, CC_ALWAYS
, NULL
);
60 bld
.mkMov(i
->getDef(0), def
[(i
->op
== OP_DIV
) ? 0 : 1]);
61 bld
.mkClobber(FILE_GPR
, (i
->op
== OP_DIV
) ? 0xe : 0xd, 2);
62 bld
.mkClobber(FILE_PREDICATE
, (i
->dType
== TYPE_S32
) ? 0xf : 0x3, 0);
65 call
->absolute
= call
->builtin
= 1;
66 call
->target
.builtin
= builtin
;
67 delete_Instruction(prog
, i
);
71 NVC0LegalizeSSA::handleRCPRSQ(Instruction
*i
)
73 assert(i
->dType
== TYPE_F64
);
74 // There are instructions that will compute the high 32 bits of the 64-bit
75 // float. We will just stick 0 in the bottom 32 bits.
77 bld
.setPosition(i
, false);
79 // 1. Take the source and it up.
80 Value
*src
[2], *dst
[2], *def
= i
->getDef(0);
81 bld
.mkSplit(src
, 4, i
->getSrc(0));
83 // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
84 dst
[0] = bld
.loadImm(NULL
, 0);
85 dst
[1] = bld
.getSSA();
87 // 3. The new version of the instruction takes the high 32 bits of the
88 // source and outputs the high 32 bits of the destination.
92 i
->subOp
= NV50_IR_SUBOP_RCPRSQ_64H
;
94 // 4. Recombine the two dst pieces back into the original destination.
95 bld
.setPosition(i
, true);
96 bld
.mkOp2(OP_MERGE
, TYPE_U64
, def
, dst
[0], dst
[1]);
100 NVC0LegalizeSSA::handleFTZ(Instruction
*i
)
102 // Only want to flush float inputs
103 assert(i
->sType
== TYPE_F32
);
105 // If we're already flushing denorms (and NaN's) to zero, no need for this.
109 // Only certain classes of operations can flush
110 OpClass cls
= prog
->getTarget()->getOpClass(i
->op
);
111 if (cls
!= OPCLASS_ARITH
&& cls
!= OPCLASS_COMPARE
&&
112 cls
!= OPCLASS_CONVERT
)
119 NVC0LegalizeSSA::visit(Function
*fn
)
121 bld
.setProgram(fn
->getProgram());
126 NVC0LegalizeSSA::visit(BasicBlock
*bb
)
129 for (Instruction
*i
= bb
->getEntry(); i
; i
= next
) {
131 if (i
->sType
== TYPE_F32
) {
132 if (prog
->getType() != Program::TYPE_COMPUTE
)
143 if (i
->dType
== TYPE_F64
)
153 NVC0LegalizePostRA::NVC0LegalizePostRA(const Program
*prog
)
157 needTexBar(prog
->getTarget()->getChipset() >= 0xe0)
162 NVC0LegalizePostRA::insnDominatedBy(const Instruction
*later
,
163 const Instruction
*early
) const
165 if (early
->bb
== later
->bb
)
166 return early
->serial
< later
->serial
;
167 return later
->bb
->dominatedBy(early
->bb
);
171 NVC0LegalizePostRA::addTexUse(std::list
<TexUse
> &uses
,
172 Instruction
*usei
, const Instruction
*texi
)
175 bool dominated
= insnDominatedBy(usei
, texi
);
176 // Uses before the tex have to all be included. Just because an earlier
177 // instruction dominates another instruction doesn't mean that there's no
178 // way to get from the tex to the later instruction. For example you could
179 // have nested loops, with the tex in the inner loop, and uses before it in
180 // both loops - even though the outer loop's instruction would dominate the
181 // inner's, we still want a texbar before the inner loop's instruction.
183 // However we can still use the eliding logic between uses dominated by the
184 // tex instruction, as that is unambiguously correct.
186 for (std::list
<TexUse
>::iterator it
= uses
.begin(); it
!= uses
.end();) {
188 if (insnDominatedBy(usei
, it
->insn
)) {
192 if (insnDominatedBy(it
->insn
, usei
)) {
201 uses
.push_back(TexUse(usei
, texi
, dominated
));
204 // While it might be tempting to use the an algorithm that just looks at tex
205 // uses, not all texture results are guaranteed to be used on all paths. In
206 // the case where along some control flow path a texture result is never used,
207 // we might reuse that register for something else, creating a
208 // write-after-write hazard. So we have to manually look through all
209 // instructions looking for ones that reference the registers in question.
211 NVC0LegalizePostRA::findFirstUses(
212 Instruction
*texi
, std::list
<TexUse
> &uses
)
214 int minGPR
= texi
->def(0).rep()->reg
.data
.id
;
215 int maxGPR
= minGPR
+ texi
->def(0).rep()->reg
.size
/ 4 - 1;
217 unordered_set
<const BasicBlock
*> visited
;
218 findFirstUsesBB(minGPR
, maxGPR
, texi
->next
, texi
, uses
, visited
);
222 NVC0LegalizePostRA::findFirstUsesBB(
223 int minGPR
, int maxGPR
, Instruction
*start
,
224 const Instruction
*texi
, std::list
<TexUse
> &uses
,
225 unordered_set
<const BasicBlock
*> &visited
)
227 const BasicBlock
*bb
= start
->bb
;
229 // We don't process the whole bb the first time around. This is correct,
230 // however we might be in a loop and hit this BB again, and need to process
231 // the full thing. So only mark a bb as visited if we processed it from the
233 if (start
== bb
->getEntry()) {
234 if (visited
.find(bb
) != visited
.end())
239 for (Instruction
*insn
= start
; insn
!= bb
->getExit(); insn
= insn
->next
) {
243 for (int d
= 0; insn
->defExists(d
); ++d
) {
244 const Value
*def
= insn
->def(d
).rep();
245 if (insn
->def(d
).getFile() != FILE_GPR
||
246 def
->reg
.data
.id
+ def
->reg
.size
/ 4 - 1 < minGPR
||
247 def
->reg
.data
.id
> maxGPR
)
249 addTexUse(uses
, insn
, texi
);
253 for (int s
= 0; insn
->srcExists(s
); ++s
) {
254 const Value
*src
= insn
->src(s
).rep();
255 if (insn
->src(s
).getFile() != FILE_GPR
||
256 src
->reg
.data
.id
+ src
->reg
.size
/ 4 - 1 < minGPR
||
257 src
->reg
.data
.id
> maxGPR
)
259 addTexUse(uses
, insn
, texi
);
264 for (Graph::EdgeIterator ei
= bb
->cfg
.outgoing(); !ei
.end(); ei
.next()) {
265 findFirstUsesBB(minGPR
, maxGPR
, BasicBlock::get(ei
.getNode())->getEntry(),
266 texi
, uses
, visited
);
271 // This pass is a bit long and ugly and can probably be optimized.
273 // 1. obtain a list of TEXes and their outputs' first use(s)
274 // 2. calculate the barrier level of each first use (minimal number of TEXes,
275 // over all paths, between the TEX and the use in question)
276 // 3. for each barrier, if all paths from the source TEX to that barrier
277 // contain a barrier of lesser level, it can be culled
279 NVC0LegalizePostRA::insertTextureBarriers(Function
*fn
)
281 std::list
<TexUse
> *uses
;
282 std::vector
<Instruction
*> texes
;
283 std::vector
<int> bbFirstTex
;
284 std::vector
<int> bbFirstUse
;
285 std::vector
<int> texCounts
;
286 std::vector
<TexUse
> useVec
;
289 fn
->orderInstructions(insns
);
291 texCounts
.resize(fn
->allBBlocks
.getSize(), 0);
292 bbFirstTex
.resize(fn
->allBBlocks
.getSize(), insns
.getSize());
293 bbFirstUse
.resize(fn
->allBBlocks
.getSize(), insns
.getSize());
295 // tag BB CFG nodes by their id for later
296 for (ArrayList::Iterator i
= fn
->allBBlocks
.iterator(); !i
.end(); i
.next()) {
297 BasicBlock
*bb
= reinterpret_cast<BasicBlock
*>(i
.get());
299 bb
->cfg
.tag
= bb
->getId();
302 // gather the first uses for each TEX
303 for (int i
= 0; i
< insns
.getSize(); ++i
) {
304 Instruction
*tex
= reinterpret_cast<Instruction
*>(insns
.get(i
));
305 if (isTextureOp(tex
->op
)) {
306 texes
.push_back(tex
);
307 if (!texCounts
.at(tex
->bb
->getId()))
308 bbFirstTex
[tex
->bb
->getId()] = texes
.size() - 1;
309 texCounts
[tex
->bb
->getId()]++;
315 uses
= new std::list
<TexUse
>[texes
.size()];
318 for (size_t i
= 0; i
< texes
.size(); ++i
) {
319 findFirstUses(texes
[i
], uses
[i
]);
322 // determine the barrier level at each use
323 for (size_t i
= 0; i
< texes
.size(); ++i
) {
324 for (std::list
<TexUse
>::iterator u
= uses
[i
].begin(); u
!= uses
[i
].end();
326 BasicBlock
*tb
= texes
[i
]->bb
;
327 BasicBlock
*ub
= u
->insn
->bb
;
330 for (size_t j
= i
+ 1; j
< texes
.size() &&
331 texes
[j
]->bb
== tb
&& texes
[j
]->serial
< u
->insn
->serial
;
335 u
->level
= fn
->cfg
.findLightestPathWeight(&tb
->cfg
,
336 &ub
->cfg
, texCounts
);
338 WARN("Failed to find path TEX -> TEXBAR\n");
342 // this counted all TEXes in the origin block, correct that
343 u
->level
-= i
- bbFirstTex
.at(tb
->getId()) + 1 /* this TEX */;
344 // and did not count the TEXes in the destination block, add those
345 for (size_t j
= bbFirstTex
.at(ub
->getId()); j
< texes
.size() &&
346 texes
[j
]->bb
== ub
&& texes
[j
]->serial
< u
->insn
->serial
;
350 assert(u
->level
>= 0);
351 useVec
.push_back(*u
);
356 // insert the barriers
357 for (size_t i
= 0; i
< useVec
.size(); ++i
) {
358 Instruction
*prev
= useVec
[i
].insn
->prev
;
359 if (useVec
[i
].level
< 0)
361 if (prev
&& prev
->op
== OP_TEXBAR
) {
362 if (prev
->subOp
> useVec
[i
].level
)
363 prev
->subOp
= useVec
[i
].level
;
364 prev
->setSrc(prev
->srcCount(), useVec
[i
].tex
->getDef(0));
366 Instruction
*bar
= new_Instruction(func
, OP_TEXBAR
, TYPE_NONE
);
368 bar
->subOp
= useVec
[i
].level
;
369 // make use explicit to ease latency calculation
370 bar
->setSrc(bar
->srcCount(), useVec
[i
].tex
->getDef(0));
371 useVec
[i
].insn
->bb
->insertBefore(useVec
[i
].insn
, bar
);
375 if (fn
->getProgram()->optLevel
< 3)
378 std::vector
<Limits
> limitT
, limitB
, limitS
; // entry, exit, single
380 limitT
.resize(fn
->allBBlocks
.getSize(), Limits(0, 0));
381 limitB
.resize(fn
->allBBlocks
.getSize(), Limits(0, 0));
382 limitS
.resize(fn
->allBBlocks
.getSize());
384 // cull unneeded barriers (should do that earlier, but for simplicity)
385 IteratorRef bi
= fn
->cfg
.iteratorCFG();
386 // first calculate min/max outstanding TEXes for each BB
387 for (bi
->reset(); !bi
->end(); bi
->next()) {
388 Graph::Node
*n
= reinterpret_cast<Graph::Node
*>(bi
->get());
389 BasicBlock
*bb
= BasicBlock::get(n
);
391 int max
= std::numeric_limits
<int>::max();
392 for (Instruction
*i
= bb
->getFirst(); i
; i
= i
->next
) {
393 if (isTextureOp(i
->op
)) {
395 if (max
< std::numeric_limits
<int>::max())
398 if (i
->op
== OP_TEXBAR
) {
399 min
= MIN2(min
, i
->subOp
);
400 max
= MIN2(max
, i
->subOp
);
403 // limits when looking at an isolated block
404 limitS
[bb
->getId()].min
= min
;
405 limitS
[bb
->getId()].max
= max
;
407 // propagate the min/max values
408 for (unsigned int l
= 0; l
<= fn
->loopNestingBound
; ++l
) {
409 for (bi
->reset(); !bi
->end(); bi
->next()) {
410 Graph::Node
*n
= reinterpret_cast<Graph::Node
*>(bi
->get());
411 BasicBlock
*bb
= BasicBlock::get(n
);
412 const int bbId
= bb
->getId();
413 for (Graph::EdgeIterator ei
= n
->incident(); !ei
.end(); ei
.next()) {
414 BasicBlock
*in
= BasicBlock::get(ei
.getNode());
415 const int inId
= in
->getId();
416 limitT
[bbId
].min
= MAX2(limitT
[bbId
].min
, limitB
[inId
].min
);
417 limitT
[bbId
].max
= MAX2(limitT
[bbId
].max
, limitB
[inId
].max
);
419 // I just hope this is correct ...
420 if (limitS
[bbId
].max
== std::numeric_limits
<int>::max()) {
422 limitB
[bbId
].min
= limitT
[bbId
].min
+ limitS
[bbId
].min
;
423 limitB
[bbId
].max
= limitT
[bbId
].max
+ limitS
[bbId
].min
;
425 // block contained a barrier
426 limitB
[bbId
].min
= MIN2(limitS
[bbId
].max
,
427 limitT
[bbId
].min
+ limitS
[bbId
].min
);
428 limitB
[bbId
].max
= MIN2(limitS
[bbId
].max
,
429 limitT
[bbId
].max
+ limitS
[bbId
].min
);
433 // finally delete unnecessary barriers
434 for (bi
->reset(); !bi
->end(); bi
->next()) {
435 Graph::Node
*n
= reinterpret_cast<Graph::Node
*>(bi
->get());
436 BasicBlock
*bb
= BasicBlock::get(n
);
437 Instruction
*prev
= NULL
;
439 int max
= limitT
[bb
->getId()].max
;
440 for (Instruction
*i
= bb
->getFirst(); i
; i
= next
) {
442 if (i
->op
== OP_TEXBAR
) {
443 if (i
->subOp
>= max
) {
444 delete_Instruction(prog
, i
);
448 if (prev
&& prev
->op
== OP_TEXBAR
&& prev
->subOp
>= max
) {
449 delete_Instruction(prog
, prev
);
454 if (isTextureOp(i
->op
)) {
457 if (i
&& !i
->isNop())
465 NVC0LegalizePostRA::visit(Function
*fn
)
468 insertTextureBarriers(fn
);
470 rZero
= new_LValue(fn
, FILE_GPR
);
471 pOne
= new_LValue(fn
, FILE_PREDICATE
);
472 carry
= new_LValue(fn
, FILE_FLAGS
);
474 rZero
->reg
.data
.id
= (prog
->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET
) ? 255 : 63;
475 carry
->reg
.data
.id
= 0;
476 pOne
->reg
.data
.id
= 7;
482 NVC0LegalizePostRA::replaceZero(Instruction
*i
)
484 for (int s
= 0; i
->srcExists(s
); ++s
) {
485 if (s
== 2 && i
->op
== OP_SUCLAMP
)
487 ImmediateValue
*imm
= i
->getSrc(s
)->asImm();
489 if (i
->op
== OP_SELP
&& s
== 2) {
491 if (imm
->reg
.data
.u64
== 0)
492 i
->src(s
).mod
= i
->src(s
).mod
^ Modifier(NV50_IR_MOD_NOT
);
493 } else if (imm
->reg
.data
.u64
== 0) {
500 // replace CONT with BRA for single unconditional continue
502 NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock
*bb
)
504 if (bb
->cfg
.incidentCount() != 2 || bb
->getEntry()->op
!= OP_PRECONT
)
506 Graph::EdgeIterator ei
= bb
->cfg
.incident();
507 if (ei
.getType() != Graph::Edge::BACK
)
509 if (ei
.getType() != Graph::Edge::BACK
)
511 BasicBlock
*contBB
= BasicBlock::get(ei
.getNode());
513 if (!contBB
->getExit() || contBB
->getExit()->op
!= OP_CONT
||
514 contBB
->getExit()->getPredicate())
516 contBB
->getExit()->op
= OP_BRA
;
517 bb
->remove(bb
->getEntry()); // delete PRECONT
520 assert(ei
.end() || ei
.getType() != Graph::Edge::BACK
);
524 // replace branches to join blocks with join ops
526 NVC0LegalizePostRA::propagateJoin(BasicBlock
*bb
)
528 if (bb
->getEntry()->op
!= OP_JOIN
|| bb
->getEntry()->asFlow()->limit
)
530 for (Graph::EdgeIterator ei
= bb
->cfg
.incident(); !ei
.end(); ei
.next()) {
531 BasicBlock
*in
= BasicBlock::get(ei
.getNode());
532 Instruction
*exit
= in
->getExit();
534 in
->insertTail(new FlowInstruction(func
, OP_JOIN
, bb
));
535 // there should always be a terminator instruction
536 WARN("inserted missing terminator in BB:%i\n", in
->getId());
538 if (exit
->op
== OP_BRA
) {
540 exit
->asFlow()->limit
= 1; // must-not-propagate marker
543 bb
->remove(bb
->getEntry());
547 NVC0LegalizePostRA::visit(BasicBlock
*bb
)
549 Instruction
*i
, *next
;
551 // remove pseudo operations and non-fixed no-ops, split 64 bit operations
552 for (i
= bb
->getFirst(); i
; i
= next
) {
554 if (i
->op
== OP_EMIT
|| i
->op
== OP_RESTART
) {
555 if (!i
->getDef(0)->refCount())
557 if (i
->src(0).getFile() == FILE_IMMEDIATE
)
558 i
->setSrc(0, rZero
); // initial value must be 0
564 if (i
->op
== OP_BAR
&& i
->subOp
== NV50_IR_SUBOP_BAR_SYNC
&&
565 prog
->getType() != Program::TYPE_COMPUTE
) {
566 // It seems like barriers are never required for tessellation since
567 // the warp size is 32, and there are always at most 32 tcs threads.
570 if (i
->op
== OP_LOAD
&& i
->subOp
== NV50_IR_SUBOP_LDC_IS
) {
571 int offset
= i
->src(0).get()->reg
.data
.offset
;
572 if (abs(offset
) > 0x10000)
573 i
->src(0).get()->reg
.fileIndex
+= offset
>> 16;
574 i
->src(0).get()->reg
.data
.offset
= (int)(short)offset
;
576 // TODO: Move this to before register allocation for operations that
577 // need the $c register !
578 if (typeSizeof(i
->dType
) == 8) {
580 hi
= BuildUtil::split64BitOpPostRA(func
, i
, rZero
, carry
);
585 if (i
->op
!= OP_MOV
&& i
->op
!= OP_PFETCH
)
592 if (!tryReplaceContWithBra(bb
))
598 NVC0LoweringPass::NVC0LoweringPass(Program
*prog
) : targ(prog
->getTarget())
600 bld
.setProgram(prog
);
604 NVC0LoweringPass::visit(Function
*fn
)
606 if (prog
->getType() == Program::TYPE_GEOMETRY
) {
607 assert(!strncmp(fn
->getName(), "MAIN", 4));
608 // TODO: when we generate actual functions pass this value along somehow
609 bld
.setPosition(BasicBlock::get(fn
->cfg
.getRoot()), false);
610 gpEmitAddress
= bld
.loadImm(NULL
, 0)->asLValue();
612 bld
.setPosition(BasicBlock::get(fn
->cfgExit
)->getExit(), false);
613 bld
.mkMovToReg(0, gpEmitAddress
);
620 NVC0LoweringPass::visit(BasicBlock
*bb
)
626 NVC0LoweringPass::loadTexHandle(Value
*ptr
, unsigned int slot
)
628 uint8_t b
= prog
->driver
->io
.auxCBSlot
;
629 uint32_t off
= prog
->driver
->io
.texBindBase
+ slot
* 4;
632 ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(), ptr
, bld
.mkImm(2));
635 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U32
, off
), ptr
);
638 // move array source to first slot, convert to u16, add indirections
640 NVC0LoweringPass::handleTEX(TexInstruction
*i
)
642 const int dim
= i
->tex
.target
.getDim() + i
->tex
.target
.isCube();
643 const int arg
= i
->tex
.target
.getArgCount();
644 const int lyr
= arg
- (i
->tex
.target
.isMS() ? 2 : 1);
645 const int chipset
= prog
->getTarget()->getChipset();
647 /* Only normalize in the non-explicit derivatives case. For explicit
648 * derivatives, this is handled in handleManualTXD.
650 if (i
->tex
.target
.isCube() && i
->dPdx
[0].get() == NULL
) {
653 for (c
= 0; c
< 3; ++c
)
654 src
[c
] = bld
.mkOp1v(OP_ABS
, TYPE_F32
, bld
.getSSA(), i
->getSrc(c
));
655 val
= bld
.getScratch();
656 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[0], src
[1]);
657 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[2], val
);
658 bld
.mkOp1(OP_RCP
, TYPE_F32
, val
, val
);
659 for (c
= 0; c
< 3; ++c
) {
660 i
->setSrc(c
, bld
.mkOp2v(OP_MUL
, TYPE_F32
, bld
.getSSA(),
665 // Arguments to the TEX instruction are a little insane. Even though the
666 // encoding is identical between SM20 and SM30, the arguments mean
667 // different things between Fermi and Kepler+. A lot of arguments are
668 // optional based on flags passed to the instruction. This summarizes the
678 // - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
679 // - other: 4 bits each, single reg
683 // array (+ offsets for txd in upper 16 bits)
688 // offsets (same as fermi, except txd which takes it with array)
705 if (chipset
>= NVISA_GK104_CHIPSET
) {
706 if (i
->tex
.rIndirectSrc
>= 0 || i
->tex
.sIndirectSrc
>= 0) {
707 // XXX this ignores tsc, and assumes a 1:1 mapping
708 assert(i
->tex
.rIndirectSrc
>= 0);
709 Value
*hnd
= loadTexHandle(i
->getIndirectR(), i
->tex
.r
);
712 i
->setIndirectR(hnd
);
713 i
->setIndirectS(NULL
);
714 } else if (i
->tex
.r
== i
->tex
.s
|| i
->op
== OP_TXF
) {
715 i
->tex
.r
+= prog
->driver
->io
.texBindBase
/ 4;
716 i
->tex
.s
= 0; // only a single cX[] value possible here
718 Value
*hnd
= bld
.getScratch();
719 Value
*rHnd
= loadTexHandle(NULL
, i
->tex
.r
);
720 Value
*sHnd
= loadTexHandle(NULL
, i
->tex
.s
);
722 bld
.mkOp3(OP_INSBF
, TYPE_U32
, hnd
, rHnd
, bld
.mkImm(0x1400), sHnd
);
724 i
->tex
.r
= 0; // not used for indirect tex
726 i
->setIndirectR(hnd
);
728 if (i
->tex
.target
.isArray()) {
729 LValue
*layer
= new_LValue(func
, FILE_GPR
);
730 Value
*src
= i
->getSrc(lyr
);
731 const int sat
= (i
->op
== OP_TXF
) ? 1 : 0;
732 DataType sTy
= (i
->op
== OP_TXF
) ? TYPE_U32
: TYPE_F32
;
733 bld
.mkCvt(OP_CVT
, TYPE_U16
, layer
, sTy
, src
)->saturate
= sat
;
734 if (i
->op
!= OP_TXD
|| chipset
< NVISA_GM107_CHIPSET
) {
735 for (int s
= dim
; s
>= 1; --s
)
736 i
->setSrc(s
, i
->getSrc(s
- 1));
739 i
->setSrc(dim
, layer
);
742 // Move the indirect reference to the first place
743 if (i
->tex
.rIndirectSrc
>= 0 && (
744 i
->op
== OP_TXD
|| chipset
< NVISA_GM107_CHIPSET
)) {
745 Value
*hnd
= i
->getIndirectR();
747 i
->setIndirectR(NULL
);
748 i
->moveSources(0, 1);
750 i
->tex
.rIndirectSrc
= 0;
751 i
->tex
.sIndirectSrc
= -1;
753 // Move the indirect reference to right after the coords
754 else if (i
->tex
.rIndirectSrc
>= 0 && chipset
>= NVISA_GM107_CHIPSET
) {
755 Value
*hnd
= i
->getIndirectR();
757 i
->setIndirectR(NULL
);
758 i
->moveSources(arg
, 1);
760 i
->tex
.rIndirectSrc
= 0;
761 i
->tex
.sIndirectSrc
= -1;
764 // (nvc0) generate and move the tsc/tic/array source to the front
765 if (i
->tex
.target
.isArray() || i
->tex
.rIndirectSrc
>= 0 || i
->tex
.sIndirectSrc
>= 0) {
766 LValue
*src
= new_LValue(func
, FILE_GPR
); // 0xttxsaaaa
768 Value
*ticRel
= i
->getIndirectR();
769 Value
*tscRel
= i
->getIndirectS();
772 i
->setSrc(i
->tex
.rIndirectSrc
, NULL
);
774 ticRel
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getScratch(),
775 ticRel
, bld
.mkImm(i
->tex
.r
));
778 i
->setSrc(i
->tex
.sIndirectSrc
, NULL
);
780 tscRel
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getScratch(),
781 tscRel
, bld
.mkImm(i
->tex
.s
));
784 Value
*arrayIndex
= i
->tex
.target
.isArray() ? i
->getSrc(lyr
) : NULL
;
786 for (int s
= dim
; s
>= 1; --s
)
787 i
->setSrc(s
, i
->getSrc(s
- 1));
788 i
->setSrc(0, arrayIndex
);
790 i
->moveSources(0, 1);
794 int sat
= (i
->op
== OP_TXF
) ? 1 : 0;
795 DataType sTy
= (i
->op
== OP_TXF
) ? TYPE_U32
: TYPE_F32
;
796 bld
.mkCvt(OP_CVT
, TYPE_U16
, src
, sTy
, arrayIndex
)->saturate
= sat
;
802 bld
.mkOp3(OP_INSBF
, TYPE_U32
, src
, ticRel
, bld
.mkImm(0x0917), src
);
804 bld
.mkOp3(OP_INSBF
, TYPE_U32
, src
, tscRel
, bld
.mkImm(0x0710), src
);
809 // For nvc0, the sample id has to be in the second operand, as the offset
810 // does. Right now we don't know how to pass both in, and this case can't
811 // happen with OpenGL. On nve0, the sample id is part of the texture
812 // coordinate argument.
813 assert(chipset
>= NVISA_GK104_CHIPSET
||
814 !i
->tex
.useOffsets
|| !i
->tex
.target
.isMS());
816 // offset is between lod and dc
817 if (i
->tex
.useOffsets
) {
819 int s
= i
->srcCount(0xff, true);
820 if (i
->op
!= OP_TXD
|| chipset
< NVISA_GK104_CHIPSET
) {
821 if (i
->tex
.target
.isShadow())
823 if (i
->srcExists(s
)) // move potential predicate out of the way
824 i
->moveSources(s
, 1);
825 if (i
->tex
.useOffsets
== 4 && i
->srcExists(s
+ 1))
826 i
->moveSources(s
+ 1, 1);
828 if (i
->op
== OP_TXG
) {
829 // Either there is 1 offset, which goes into the 2 low bytes of the
830 // first source, or there are 4 offsets, which go into 2 sources (8
831 // values, 1 byte each).
832 Value
*offs
[2] = {NULL
, NULL
};
833 for (n
= 0; n
< i
->tex
.useOffsets
; n
++) {
834 for (c
= 0; c
< 2; ++c
) {
835 if ((n
% 2) == 0 && c
== 0)
836 bld
.mkMov(offs
[n
/ 2] = bld
.getScratch(), i
->offset
[n
][c
].get());
838 bld
.mkOp3(OP_INSBF
, TYPE_U32
,
840 i
->offset
[n
][c
].get(),
841 bld
.mkImm(0x800 | ((n
* 16 + c
* 8) % 32)),
845 i
->setSrc(s
, offs
[0]);
847 i
->setSrc(s
+ 1, offs
[1]);
850 assert(i
->tex
.useOffsets
== 1);
851 for (c
= 0; c
< 3; ++c
) {
853 if (!i
->offset
[0][c
].getImmediate(val
))
854 assert(!"non-immediate offset passed to non-TXG");
855 imm
|= (val
.reg
.data
.u32
& 0xf) << (c
* 4);
857 if (i
->op
== OP_TXD
&& chipset
>= NVISA_GK104_CHIPSET
) {
858 // The offset goes into the upper 16 bits of the array index. So
859 // create it if it's not already there, and INSBF it if it already
861 s
= (i
->tex
.rIndirectSrc
>= 0) ? 1 : 0;
862 if (chipset
>= NVISA_GM107_CHIPSET
)
864 if (i
->tex
.target
.isArray()) {
865 bld
.mkOp3(OP_INSBF
, TYPE_U32
, i
->getSrc(s
),
866 bld
.loadImm(NULL
, imm
), bld
.mkImm(0xc10),
869 i
->moveSources(s
, 1);
870 i
->setSrc(s
, bld
.loadImm(NULL
, imm
<< 16));
873 i
->setSrc(s
, bld
.loadImm(NULL
, imm
));
878 if (chipset
>= NVISA_GK104_CHIPSET
) {
880 // If TEX requires more than 4 sources, the 2nd register tuple must be
881 // aligned to 4, even if it consists of just a single 4-byte register.
883 // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
885 int s
= i
->srcCount(0xff, true);
886 if (s
> 4 && s
< 7) {
887 if (i
->srcExists(s
)) // move potential predicate out of the way
888 i
->moveSources(s
, 7 - s
);
890 i
->setSrc(s
++, bld
.loadImm(NULL
, 0));
898 NVC0LoweringPass::handleManualTXD(TexInstruction
*i
)
900 static const uint8_t qOps
[4][2] =
902 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l0
903 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l1
904 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l2
905 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l3
910 Value
*zero
= bld
.loadImm(bld
.getSSA(), 0);
912 const int dim
= i
->tex
.target
.getDim() + i
->tex
.target
.isCube();
914 // This function is invoked after handleTEX lowering, so we have to expect
915 // the arguments in the order that the hw wants them. For Fermi, array and
916 // indirect are both in the leading arg, while for Kepler, array and
917 // indirect are separate (and both precede the coordinates). Maxwell is
918 // handled in a separate function.
920 if (targ
->getChipset() < NVISA_GK104_CHIPSET
)
921 array
= i
->tex
.target
.isArray() || i
->tex
.rIndirectSrc
>= 0;
923 array
= i
->tex
.target
.isArray() + (i
->tex
.rIndirectSrc
>= 0);
925 i
->op
= OP_TEX
; // no need to clone dPdx/dPdy later
927 for (c
= 0; c
< dim
; ++c
)
928 crd
[c
] = bld
.getScratch();
930 bld
.mkOp(OP_QUADON
, TYPE_NONE
, NULL
);
931 for (l
= 0; l
< 4; ++l
) {
933 // mov coordinates from lane l to all lanes
934 for (c
= 0; c
< dim
; ++c
)
935 bld
.mkQuadop(0x00, crd
[c
], l
, i
->getSrc(c
+ array
), zero
);
936 // add dPdx from lane l to lanes dx
937 for (c
= 0; c
< dim
; ++c
)
938 bld
.mkQuadop(qOps
[l
][0], crd
[c
], l
, i
->dPdx
[c
].get(), crd
[c
]);
939 // add dPdy from lane l to lanes dy
940 for (c
= 0; c
< dim
; ++c
)
941 bld
.mkQuadop(qOps
[l
][1], crd
[c
], l
, i
->dPdy
[c
].get(), crd
[c
]);
942 // normalize cube coordinates
943 if (i
->tex
.target
.isCube()) {
944 for (c
= 0; c
< 3; ++c
)
945 src
[c
] = bld
.mkOp1v(OP_ABS
, TYPE_F32
, bld
.getSSA(), crd
[c
]);
946 val
= bld
.getScratch();
947 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[0], src
[1]);
948 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[2], val
);
949 bld
.mkOp1(OP_RCP
, TYPE_F32
, val
, val
);
950 for (c
= 0; c
< 3; ++c
)
951 src
[c
] = bld
.mkOp2v(OP_MUL
, TYPE_F32
, bld
.getSSA(), crd
[c
], val
);
953 for (c
= 0; c
< dim
; ++c
)
957 bld
.insert(tex
= cloneForward(func
, i
));
958 for (c
= 0; c
< dim
; ++c
)
959 tex
->setSrc(c
+ array
, src
[c
]);
961 for (c
= 0; i
->defExists(c
); ++c
) {
963 def
[c
][l
] = bld
.getSSA();
964 mov
= bld
.mkMov(def
[c
][l
], tex
->getDef(c
));
969 bld
.mkOp(OP_QUADPOP
, TYPE_NONE
, NULL
);
971 for (c
= 0; i
->defExists(c
); ++c
) {
972 Instruction
*u
= bld
.mkOp(OP_UNION
, TYPE_U32
, i
->getDef(c
));
973 for (l
= 0; l
< 4; ++l
)
974 u
->setSrc(l
, def
[c
][l
]);
982 NVC0LoweringPass::handleTXD(TexInstruction
*txd
)
984 int dim
= txd
->tex
.target
.getDim() + txd
->tex
.target
.isCube();
985 unsigned arg
= txd
->tex
.target
.getArgCount();
986 unsigned expected_args
= arg
;
987 const int chipset
= prog
->getTarget()->getChipset();
989 if (chipset
>= NVISA_GK104_CHIPSET
) {
990 if (!txd
->tex
.target
.isArray() && txd
->tex
.useOffsets
)
992 if (txd
->tex
.rIndirectSrc
>= 0 || txd
->tex
.sIndirectSrc
>= 0)
995 if (txd
->tex
.useOffsets
)
997 if (!txd
->tex
.target
.isArray() && (
998 txd
->tex
.rIndirectSrc
>= 0 || txd
->tex
.sIndirectSrc
>= 0))
1002 if (expected_args
> 4 ||
1004 txd
->tex
.target
.isShadow())
1008 while (txd
->srcExists(arg
))
1011 txd
->tex
.derivAll
= true;
1012 if (txd
->op
== OP_TEX
)
1013 return handleManualTXD(txd
);
1015 assert(arg
== expected_args
);
1016 for (int c
= 0; c
< dim
; ++c
) {
1017 txd
->setSrc(arg
+ c
* 2 + 0, txd
->dPdx
[c
]);
1018 txd
->setSrc(arg
+ c
* 2 + 1, txd
->dPdy
[c
]);
1019 txd
->dPdx
[c
].set(NULL
);
1020 txd
->dPdy
[c
].set(NULL
);
1023 // In this case we have fewer than 4 "real" arguments, which means that
1024 // handleTEX didn't apply any padding. However we have to make sure that
1025 // the second "group" of arguments still gets padded up to 4.
1026 if (chipset
>= NVISA_GK104_CHIPSET
) {
1027 int s
= arg
+ 2 * dim
;
1028 if (s
>= 4 && s
< 7) {
1029 if (txd
->srcExists(s
)) // move potential predicate out of the way
1030 txd
->moveSources(s
, 7 - s
);
1032 txd
->setSrc(s
++, bld
.loadImm(NULL
, 0));
1040 NVC0LoweringPass::handleTXQ(TexInstruction
*txq
)
1042 const int chipset
= prog
->getTarget()->getChipset();
1043 if (chipset
>= NVISA_GK104_CHIPSET
&& txq
->tex
.rIndirectSrc
< 0)
1044 txq
->tex
.r
+= prog
->driver
->io
.texBindBase
/ 4;
1046 if (txq
->tex
.rIndirectSrc
< 0)
1049 Value
*ticRel
= txq
->getIndirectR();
1051 txq
->setIndirectS(NULL
);
1052 txq
->tex
.sIndirectSrc
= -1;
1056 if (chipset
< NVISA_GK104_CHIPSET
) {
1057 LValue
*src
= new_LValue(func
, FILE_GPR
); // 0xttxsaaaa
1059 txq
->setSrc(txq
->tex
.rIndirectSrc
, NULL
);
1061 ticRel
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getScratch(),
1062 ticRel
, bld
.mkImm(txq
->tex
.r
));
1064 bld
.mkOp2(OP_SHL
, TYPE_U32
, src
, ticRel
, bld
.mkImm(0x17));
1066 txq
->moveSources(0, 1);
1067 txq
->setSrc(0, src
);
1069 Value
*hnd
= loadTexHandle(txq
->getIndirectR(), txq
->tex
.r
);
1073 txq
->setIndirectR(NULL
);
1074 txq
->moveSources(0, 1);
1075 txq
->setSrc(0, hnd
);
1076 txq
->tex
.rIndirectSrc
= 0;
1083 NVC0LoweringPass::handleTXLQ(TexInstruction
*i
)
1085 /* The outputs are inverted compared to what the TGSI instruction
1086 * expects. Take that into account in the mask.
1088 assert((i
->tex
.mask
& ~3) == 0);
1089 if (i
->tex
.mask
== 1)
1091 else if (i
->tex
.mask
== 2)
1094 bld
.setPosition(i
, true);
1096 /* The returned values are not quite what we want:
1097 * (a) convert from s16/u16 to f32
1098 * (b) multiply by 1/256
1100 for (int def
= 0; def
< 2; ++def
) {
1101 if (!i
->defExists(def
))
1103 enum DataType type
= TYPE_S16
;
1104 if (i
->tex
.mask
== 2 || def
> 0)
1106 bld
.mkCvt(OP_CVT
, TYPE_F32
, i
->getDef(def
), type
, i
->getDef(def
));
1107 bld
.mkOp2(OP_MUL
, TYPE_F32
, i
->getDef(def
),
1108 i
->getDef(def
), bld
.loadImm(NULL
, 1.0f
/ 256));
1110 if (i
->tex
.mask
== 3) {
1111 LValue
*t
= new_LValue(func
, FILE_GPR
);
1112 bld
.mkMov(t
, i
->getDef(0));
1113 bld
.mkMov(i
->getDef(0), i
->getDef(1));
1114 bld
.mkMov(i
->getDef(1), t
);
1120 NVC0LoweringPass::handleBUFQ(Instruction
*bufq
)
1123 bufq
->setSrc(0, loadBufLength32(bufq
->getIndirect(0, 1),
1124 bufq
->getSrc(0)->reg
.fileIndex
* 16));
1125 bufq
->setIndirect(0, 0, NULL
);
1126 bufq
->setIndirect(0, 1, NULL
);
1131 NVC0LoweringPass::handleSharedATOMNVE4(Instruction
*atom
)
1133 assert(atom
->src(0).getFile() == FILE_MEMORY_SHARED
);
1135 BasicBlock
*currBB
= atom
->bb
;
1136 BasicBlock
*tryLockBB
= atom
->bb
->splitBefore(atom
, false);
1137 BasicBlock
*joinBB
= atom
->bb
->splitAfter(atom
);
1138 BasicBlock
*setAndUnlockBB
= new BasicBlock(func
);
1139 BasicBlock
*failLockBB
= new BasicBlock(func
);
1141 bld
.setPosition(currBB
, true);
1142 assert(!currBB
->joinAt
);
1143 currBB
->joinAt
= bld
.mkFlow(OP_JOINAT
, joinBB
, CC_ALWAYS
, NULL
);
1145 CmpInstruction
*pred
=
1146 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
1147 TYPE_U32
, bld
.mkImm(0), bld
.mkImm(1));
1149 bld
.mkFlow(OP_BRA
, tryLockBB
, CC_ALWAYS
, NULL
);
1150 currBB
->cfg
.attach(&tryLockBB
->cfg
, Graph::Edge::TREE
);
1152 bld
.setPosition(tryLockBB
, true);
1155 bld
.mkLoad(TYPE_U32
, atom
->getDef(0), atom
->getSrc(0)->asSym(),
1156 atom
->getIndirect(0, 0));
1157 ld
->setDef(1, bld
.getSSA(1, FILE_PREDICATE
));
1158 ld
->subOp
= NV50_IR_SUBOP_LOAD_LOCKED
;
1160 bld
.mkFlow(OP_BRA
, setAndUnlockBB
, CC_P
, ld
->getDef(1));
1161 bld
.mkFlow(OP_BRA
, failLockBB
, CC_ALWAYS
, NULL
);
1162 tryLockBB
->cfg
.attach(&failLockBB
->cfg
, Graph::Edge::CROSS
);
1163 tryLockBB
->cfg
.attach(&setAndUnlockBB
->cfg
, Graph::Edge::TREE
);
1165 tryLockBB
->cfg
.detach(&joinBB
->cfg
);
1168 bld
.setPosition(setAndUnlockBB
, true);
1170 if (atom
->subOp
== NV50_IR_SUBOP_ATOM_EXCH
) {
1171 // Read the old value, and write the new one.
1172 stVal
= atom
->getSrc(1);
1173 } else if (atom
->subOp
== NV50_IR_SUBOP_ATOM_CAS
) {
1174 CmpInstruction
*set
=
1175 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(),
1176 TYPE_U32
, ld
->getDef(0), atom
->getSrc(1));
1178 bld
.mkCmp(OP_SLCT
, CC_NE
, TYPE_U32
, (stVal
= bld
.getSSA()),
1179 TYPE_U32
, atom
->getSrc(2), ld
->getDef(0), set
->getDef(0));
1183 switch (atom
->subOp
) {
1184 case NV50_IR_SUBOP_ATOM_ADD
:
1187 case NV50_IR_SUBOP_ATOM_AND
:
1190 case NV50_IR_SUBOP_ATOM_OR
:
1193 case NV50_IR_SUBOP_ATOM_XOR
:
1196 case NV50_IR_SUBOP_ATOM_MIN
:
1199 case NV50_IR_SUBOP_ATOM_MAX
:
1207 stVal
= bld
.mkOp2v(op
, atom
->dType
, bld
.getSSA(), ld
->getDef(0),
1212 bld
.mkStore(OP_STORE
, TYPE_U32
, atom
->getSrc(0)->asSym(),
1213 atom
->getIndirect(0, 0), stVal
);
1214 st
->setDef(0, pred
->getDef(0));
1215 st
->subOp
= NV50_IR_SUBOP_STORE_UNLOCKED
;
1217 bld
.mkFlow(OP_BRA
, failLockBB
, CC_ALWAYS
, NULL
);
1218 setAndUnlockBB
->cfg
.attach(&failLockBB
->cfg
, Graph::Edge::TREE
);
1220 // Lock until the store has not been performed.
1221 bld
.setPosition(failLockBB
, true);
1222 bld
.mkFlow(OP_BRA
, tryLockBB
, CC_NOT_P
, pred
->getDef(0));
1223 bld
.mkFlow(OP_BRA
, joinBB
, CC_ALWAYS
, NULL
);
1224 failLockBB
->cfg
.attach(&tryLockBB
->cfg
, Graph::Edge::BACK
);
1225 failLockBB
->cfg
.attach(&joinBB
->cfg
, Graph::Edge::TREE
);
1227 bld
.setPosition(joinBB
, false);
1228 bld
.mkFlow(OP_JOIN
, NULL
, CC_ALWAYS
, NULL
)->fixed
= 1;
1232 NVC0LoweringPass::handleSharedATOM(Instruction
*atom
)
1234 assert(atom
->src(0).getFile() == FILE_MEMORY_SHARED
);
1236 BasicBlock
*currBB
= atom
->bb
;
1237 BasicBlock
*tryLockAndSetBB
= atom
->bb
->splitBefore(atom
, false);
1238 BasicBlock
*joinBB
= atom
->bb
->splitAfter(atom
);
1240 bld
.setPosition(currBB
, true);
1241 assert(!currBB
->joinAt
);
1242 currBB
->joinAt
= bld
.mkFlow(OP_JOINAT
, joinBB
, CC_ALWAYS
, NULL
);
1244 bld
.mkFlow(OP_BRA
, tryLockAndSetBB
, CC_ALWAYS
, NULL
);
1245 currBB
->cfg
.attach(&tryLockAndSetBB
->cfg
, Graph::Edge::TREE
);
1247 bld
.setPosition(tryLockAndSetBB
, true);
1250 bld
.mkLoad(TYPE_U32
, atom
->getDef(0), atom
->getSrc(0)->asSym(),
1251 atom
->getIndirect(0, 0));
1252 ld
->setDef(1, bld
.getSSA(1, FILE_PREDICATE
));
1253 ld
->subOp
= NV50_IR_SUBOP_LOAD_LOCKED
;
1256 if (atom
->subOp
== NV50_IR_SUBOP_ATOM_EXCH
) {
1257 // Read the old value, and write the new one.
1258 stVal
= atom
->getSrc(1);
1259 } else if (atom
->subOp
== NV50_IR_SUBOP_ATOM_CAS
) {
1260 CmpInstruction
*set
=
1261 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
1262 TYPE_U32
, ld
->getDef(0), atom
->getSrc(1));
1263 set
->setPredicate(CC_P
, ld
->getDef(1));
1266 bld
.mkOp3(OP_SELP
, TYPE_U32
, bld
.getSSA(), ld
->getDef(0),
1267 atom
->getSrc(2), set
->getDef(0));
1268 selp
->src(2).mod
= Modifier(NV50_IR_MOD_NOT
);
1269 selp
->setPredicate(CC_P
, ld
->getDef(1));
1271 stVal
= selp
->getDef(0);
1275 switch (atom
->subOp
) {
1276 case NV50_IR_SUBOP_ATOM_ADD
:
1279 case NV50_IR_SUBOP_ATOM_AND
:
1282 case NV50_IR_SUBOP_ATOM_OR
:
1285 case NV50_IR_SUBOP_ATOM_XOR
:
1288 case NV50_IR_SUBOP_ATOM_MIN
:
1291 case NV50_IR_SUBOP_ATOM_MAX
:
1300 bld
.mkOp2(op
, atom
->dType
, bld
.getSSA(), ld
->getDef(0),
1302 i
->setPredicate(CC_P
, ld
->getDef(1));
1304 stVal
= i
->getDef(0);
1308 bld
.mkStore(OP_STORE
, TYPE_U32
, atom
->getSrc(0)->asSym(),
1309 atom
->getIndirect(0, 0), stVal
);
1310 st
->setPredicate(CC_P
, ld
->getDef(1));
1311 st
->subOp
= NV50_IR_SUBOP_STORE_UNLOCKED
;
1313 // Loop until the lock is acquired.
1314 bld
.mkFlow(OP_BRA
, tryLockAndSetBB
, CC_NOT_P
, ld
->getDef(1));
1315 tryLockAndSetBB
->cfg
.attach(&tryLockAndSetBB
->cfg
, Graph::Edge::BACK
);
1316 tryLockAndSetBB
->cfg
.attach(&joinBB
->cfg
, Graph::Edge::CROSS
);
1317 bld
.mkFlow(OP_BRA
, joinBB
, CC_ALWAYS
, NULL
);
1321 bld
.setPosition(joinBB
, false);
1322 bld
.mkFlow(OP_JOIN
, NULL
, CC_ALWAYS
, NULL
)->fixed
= 1;
1326 NVC0LoweringPass::handleATOM(Instruction
*atom
)
1329 Value
*ptr
= atom
->getIndirect(0, 0), *ind
= atom
->getIndirect(0, 1), *base
;
1331 switch (atom
->src(0).getFile()) {
1332 case FILE_MEMORY_LOCAL
:
1335 case FILE_MEMORY_SHARED
:
1336 // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
1337 // operations on shared memory. For Maxwell, ATOMS is enough.
1338 if (targ
->getChipset() < NVISA_GK104_CHIPSET
)
1339 handleSharedATOM(atom
);
1340 else if (targ
->getChipset() < NVISA_GM107_CHIPSET
)
1341 handleSharedATOMNVE4(atom
);
1344 assert(atom
->src(0).getFile() == FILE_MEMORY_BUFFER
);
1345 base
= loadBufInfo64(ind
, atom
->getSrc(0)->reg
.fileIndex
* 16);
1346 assert(base
->reg
.size
== 8);
1348 base
= bld
.mkOp2v(OP_ADD
, TYPE_U64
, base
, base
, ptr
);
1349 assert(base
->reg
.size
== 8);
1350 atom
->setIndirect(0, 0, base
);
1351 atom
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
1353 // Harden against out-of-bounds accesses
1354 Value
*offset
= bld
.loadImm(NULL
, atom
->getSrc(0)->reg
.data
.offset
+ typeSizeof(atom
->sType
));
1355 Value
*length
= loadBufLength32(ind
, atom
->getSrc(0)->reg
.fileIndex
* 16);
1356 Value
*pred
= new_LValue(func
, FILE_PREDICATE
);
1358 bld
.mkOp2(OP_ADD
, TYPE_U32
, offset
, offset
, ptr
);
1359 bld
.mkCmp(OP_SET
, CC_GT
, TYPE_U32
, pred
, TYPE_U32
, offset
, length
);
1360 atom
->setPredicate(CC_NOT_P
, pred
);
1361 if (atom
->defExists(0)) {
1362 Value
*zero
, *dst
= atom
->getDef(0);
1363 atom
->setDef(0, bld
.getSSA());
1365 bld
.setPosition(atom
, true);
1366 bld
.mkMov((zero
= bld
.getSSA()), bld
.mkImm(0))
1367 ->setPredicate(CC_P
, pred
);
1368 bld
.mkOp2(OP_UNION
, TYPE_U32
, dst
, atom
->getDef(0), zero
);
1374 bld
.mkOp1v(OP_RDSV
, TYPE_U32
, bld
.getScratch(), bld
.mkSysVal(sv
, 0));
1376 atom
->setSrc(0, cloneShallow(func
, atom
->getSrc(0)));
1377 atom
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
1379 base
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, base
, base
, ptr
);
1380 atom
->setIndirect(0, 1, NULL
);
1381 atom
->setIndirect(0, 0, base
);
1387 NVC0LoweringPass::handleCasExch(Instruction
*cas
, bool needCctl
)
1389 if (targ
->getChipset() < NVISA_GM107_CHIPSET
) {
1390 if (cas
->src(0).getFile() == FILE_MEMORY_SHARED
) {
1391 // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
1396 if (cas
->subOp
!= NV50_IR_SUBOP_ATOM_CAS
&&
1397 cas
->subOp
!= NV50_IR_SUBOP_ATOM_EXCH
)
1399 bld
.setPosition(cas
, true);
1402 Instruction
*cctl
= bld
.mkOp1(OP_CCTL
, TYPE_NONE
, NULL
, cas
->getSrc(0));
1403 cctl
->setIndirect(0, 0, cas
->getIndirect(0, 0));
1405 cctl
->subOp
= NV50_IR_SUBOP_CCTL_IV
;
1406 if (cas
->isPredicated())
1407 cctl
->setPredicate(cas
->cc
, cas
->getPredicate());
1410 if (cas
->subOp
== NV50_IR_SUBOP_ATOM_CAS
) {
1411 // CAS is crazy. It's 2nd source is a double reg, and the 3rd source
1412 // should be set to the high part of the double reg or bad things will
1413 // happen elsewhere in the universe.
1414 // Also, it sometimes returns the new value instead of the old one
1415 // under mysterious circumstances.
1416 Value
*dreg
= bld
.getSSA(8);
1417 bld
.setPosition(cas
, false);
1418 bld
.mkOp2(OP_MERGE
, TYPE_U64
, dreg
, cas
->getSrc(1), cas
->getSrc(2));
1419 cas
->setSrc(1, dreg
);
1420 cas
->setSrc(2, dreg
);
1427 NVC0LoweringPass::loadResInfo32(Value
*ptr
, uint32_t off
, uint16_t base
)
1429 uint8_t b
= prog
->driver
->io
.auxCBSlot
;
1433 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U32
, off
), ptr
);
1437 NVC0LoweringPass::loadResInfo64(Value
*ptr
, uint32_t off
, uint16_t base
)
1439 uint8_t b
= prog
->driver
->io
.auxCBSlot
;
1443 ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getScratch(), ptr
, bld
.mkImm(4));
1446 mkLoadv(TYPE_U64
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U64
, off
), ptr
);
1450 NVC0LoweringPass::loadResLength32(Value
*ptr
, uint32_t off
, uint16_t base
)
1452 uint8_t b
= prog
->driver
->io
.auxCBSlot
;
1456 ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getScratch(), ptr
, bld
.mkImm(4));
1459 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U64
, off
+ 8), ptr
);
1463 NVC0LoweringPass::loadBufInfo64(Value
*ptr
, uint32_t off
)
1465 return loadResInfo64(ptr
, off
, prog
->driver
->io
.bufInfoBase
);
1469 NVC0LoweringPass::loadBufLength32(Value
*ptr
, uint32_t off
)
1471 return loadResLength32(ptr
, off
, prog
->driver
->io
.bufInfoBase
);
1475 NVC0LoweringPass::loadUboInfo64(Value
*ptr
, uint32_t off
)
1477 return loadResInfo64(ptr
, off
, prog
->driver
->io
.uboInfoBase
);
1481 NVC0LoweringPass::loadUboLength32(Value
*ptr
, uint32_t off
)
1483 return loadResLength32(ptr
, off
, prog
->driver
->io
.uboInfoBase
);
1487 NVC0LoweringPass::loadMsInfo32(Value
*ptr
, uint32_t off
)
1489 uint8_t b
= prog
->driver
->io
.msInfoCBSlot
;
1490 off
+= prog
->driver
->io
.msInfoBase
;
1492 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U32
, off
), ptr
);
1495 /* On nvc0, surface info is obtained via the surface binding points passed
1496 * to the SULD/SUST instructions.
1497 * On nve4, surface info is stored in c[] and is used by various special
1498 * instructions, e.g. for clamping coordinates or generating an address.
1499 * They couldn't just have added an equivalent to TIC now, couldn't they ?
1501 #define NVC0_SU_INFO_ADDR 0x00
1502 #define NVC0_SU_INFO_FMT 0x04
1503 #define NVC0_SU_INFO_DIM_X 0x08
1504 #define NVC0_SU_INFO_PITCH 0x0c
1505 #define NVC0_SU_INFO_DIM_Y 0x10
1506 #define NVC0_SU_INFO_ARRAY 0x14
1507 #define NVC0_SU_INFO_DIM_Z 0x18
1508 #define NVC0_SU_INFO_UNK1C 0x1c
1509 #define NVC0_SU_INFO_WIDTH 0x20
1510 #define NVC0_SU_INFO_HEIGHT 0x24
1511 #define NVC0_SU_INFO_DEPTH 0x28
1512 #define NVC0_SU_INFO_TARGET 0x2c
1513 #define NVC0_SU_INFO_BSIZE 0x30
1514 #define NVC0_SU_INFO_RAW_X 0x34
1515 #define NVC0_SU_INFO_MS_X 0x38
1516 #define NVC0_SU_INFO_MS_Y 0x3c
1518 #define NVC0_SU_INFO__STRIDE 0x40
1520 #define NVC0_SU_INFO_DIM(i) (0x08 + (i) * 8)
1521 #define NVC0_SU_INFO_SIZE(i) (0x20 + (i) * 4)
1522 #define NVC0_SU_INFO_MS(i) (0x38 + (i) * 4)
1525 NVC0LoweringPass::loadSuInfo32(Value
*ptr
, int slot
, uint32_t off
)
1527 uint32_t base
= slot
* NVC0_SU_INFO__STRIDE
;
1530 ptr
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getSSA(), ptr
, bld
.mkImm(slot
));
1531 ptr
= bld
.mkOp2v(OP_AND
, TYPE_U32
, bld
.getSSA(), ptr
, bld
.mkImm(7));
1532 ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(), ptr
, bld
.mkImm(6));
1537 return loadResInfo32(ptr
, off
, prog
->driver
->io
.suInfoBase
);
1540 static inline uint16_t getSuClampSubOp(const TexInstruction
*su
, int c
)
1542 switch (su
->tex
.target
.getEnum()) {
1543 case TEX_TARGET_BUFFER
: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
1544 case TEX_TARGET_RECT
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1545 case TEX_TARGET_1D
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1546 case TEX_TARGET_1D_ARRAY
: return (c
== 1) ?
1547 NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
1548 NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1549 case TEX_TARGET_2D
: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1550 case TEX_TARGET_2D_MS
: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1551 case TEX_TARGET_2D_ARRAY
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1552 case TEX_TARGET_2D_MS_ARRAY
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1553 case TEX_TARGET_3D
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1554 case TEX_TARGET_CUBE
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1555 case TEX_TARGET_CUBE_ARRAY
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1563 NVC0LoweringPass::handleSUQ(TexInstruction
*suq
)
1565 int mask
= suq
->tex
.mask
;
1566 int dim
= suq
->tex
.target
.getDim();
1567 int arg
= dim
+ (suq
->tex
.target
.isArray() || suq
->tex
.target
.isCube());
1568 Value
*ind
= suq
->getIndirectR();
1569 int slot
= suq
->tex
.r
;
1572 for (c
= 0, d
= 0; c
< 3; ++c
, mask
>>= 1) {
1573 if (c
>= arg
|| !(mask
& 1))
1578 if (c
== 1 && suq
->tex
.target
== TEX_TARGET_1D_ARRAY
) {
1579 offset
= NVC0_SU_INFO_SIZE(2);
1581 offset
= NVC0_SU_INFO_SIZE(c
);
1583 bld
.mkMov(suq
->getDef(d
++), loadSuInfo32(ind
, slot
, offset
));
1584 if (c
== 2 && suq
->tex
.target
.isCube())
1585 bld
.mkOp2(OP_DIV
, TYPE_U32
, suq
->getDef(d
- 1), suq
->getDef(d
- 1),
1586 bld
.loadImm(NULL
, 6));
1590 if (suq
->tex
.target
.isMS()) {
1591 Value
*ms_x
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_MS(0));
1592 Value
*ms_y
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_MS(1));
1593 Value
*ms
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getScratch(), ms_x
, ms_y
);
1594 bld
.mkOp2(OP_SHL
, TYPE_U32
, suq
->getDef(d
++), bld
.loadImm(NULL
, 1), ms
);
1596 bld
.mkMov(suq
->getDef(d
++), bld
.loadImm(NULL
, 1));
1605 NVC0LoweringPass::adjustCoordinatesMS(TexInstruction
*tex
)
1607 const int arg
= tex
->tex
.target
.getArgCount();
1608 int slot
= tex
->tex
.r
;
1610 if (tex
->tex
.target
== TEX_TARGET_2D_MS
)
1611 tex
->tex
.target
= TEX_TARGET_2D
;
1613 if (tex
->tex
.target
== TEX_TARGET_2D_MS_ARRAY
)
1614 tex
->tex
.target
= TEX_TARGET_2D_ARRAY
;
1618 Value
*x
= tex
->getSrc(0);
1619 Value
*y
= tex
->getSrc(1);
1620 Value
*s
= tex
->getSrc(arg
- 1);
1622 Value
*tx
= bld
.getSSA(), *ty
= bld
.getSSA(), *ts
= bld
.getSSA();
1623 Value
*ind
= tex
->getIndirectR();
1625 Value
*ms_x
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_MS(0));
1626 Value
*ms_y
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_MS(1));
1628 bld
.mkOp2(OP_SHL
, TYPE_U32
, tx
, x
, ms_x
);
1629 bld
.mkOp2(OP_SHL
, TYPE_U32
, ty
, y
, ms_y
);
1631 s
= bld
.mkOp2v(OP_AND
, TYPE_U32
, ts
, s
, bld
.loadImm(NULL
, 0x7));
1632 s
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, ts
, ts
, bld
.mkImm(3));
1634 Value
*dx
= loadMsInfo32(ts
, 0x0);
1635 Value
*dy
= loadMsInfo32(ts
, 0x4);
1637 bld
.mkOp2(OP_ADD
, TYPE_U32
, tx
, tx
, dx
);
1638 bld
.mkOp2(OP_ADD
, TYPE_U32
, ty
, ty
, dy
);
1642 tex
->moveSources(arg
, -1);
1645 // Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
1646 // They're computed from the coordinates using the surface info in c[] space.
1648 NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction
*su
)
1651 const bool atom
= su
->op
== OP_SUREDB
|| su
->op
== OP_SUREDP
;
1653 su
->op
== OP_SULDB
|| su
->op
== OP_SUSTB
|| su
->op
== OP_SUREDB
;
1654 const int slot
= su
->tex
.r
;
1655 const int dim
= su
->tex
.target
.getDim();
1656 const int arg
= dim
+ (su
->tex
.target
.isArray() || su
->tex
.target
.isCube());
1658 Value
*zero
= bld
.mkImm(0);
1662 Value
*bf
, *eau
, *off
;
1664 Value
*ind
= su
->getIndirectR();
1666 off
= bld
.getScratch(4);
1667 bf
= bld
.getScratch(4);
1668 addr
= bld
.getSSA(8);
1669 pred
= bld
.getScratch(1, FILE_PREDICATE
);
1671 bld
.setPosition(su
, false);
1673 adjustCoordinatesMS(su
);
1675 // calculate clamped coordinates
1676 for (c
= 0; c
< arg
; ++c
) {
1679 if (c
== 1 && su
->tex
.target
== TEX_TARGET_1D_ARRAY
) {
1680 // The array index is stored in the Z component for 1D arrays.
1684 src
[c
] = bld
.getScratch();
1686 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_RAW_X
);
1688 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_DIM(dimc
));
1689 bld
.mkOp3(OP_SUCLAMP
, TYPE_S32
, src
[c
], su
->getSrc(c
), v
, zero
)
1690 ->subOp
= getSuClampSubOp(su
, dimc
);
1695 // set predicate output
1696 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1697 src
[0]->getInsn()->setFlagsDef(1, pred
);
1699 if (su
->tex
.target
.isArray() || su
->tex
.target
.isCube()) {
1700 p1
= bld
.getSSA(1, FILE_PREDICATE
);
1701 src
[dim
]->getInsn()->setFlagsDef(1, p1
);
1704 // calculate pixel offset
1706 if (su
->tex
.target
!= TEX_TARGET_BUFFER
)
1707 bld
.mkOp2(OP_AND
, TYPE_U32
, off
, src
[0], bld
.loadImm(NULL
, 0xffff));
1710 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_UNK1C
);
1711 bld
.mkOp3(OP_MADSP
, TYPE_U32
, off
, src
[2], v
, src
[1])
1712 ->subOp
= NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
1714 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_PITCH
);
1715 bld
.mkOp3(OP_MADSP
, TYPE_U32
, off
, off
, v
, src
[0])
1716 ->subOp
= NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
1719 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_PITCH
);
1720 bld
.mkOp3(OP_MADSP
, TYPE_U32
, off
, src
[1], v
, src
[0])
1721 ->subOp
= (su
->tex
.target
.isArray() || su
->tex
.target
.isCube()) ?
1722 NV50_IR_SUBOP_MADSP_SD
: NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
1725 // calculate effective address part 1
1726 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1730 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_FMT
);
1731 bld
.mkOp3(OP_VSHL
, TYPE_U32
, bf
, src
[0], v
, zero
)
1732 ->subOp
= NV50_IR_SUBOP_V1(7,6,8|2);
1746 if (!su
->tex
.target
.isArray() && !su
->tex
.target
.isCube()) {
1747 z
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_UNK1C
);
1748 subOp
= NV50_IR_SUBOP_SUBFM_3D
;
1752 subOp
= NV50_IR_SUBOP_SUBFM_3D
;
1756 insn
= bld
.mkOp3(OP_SUBFM
, TYPE_U32
, bf
, src
[0], y
, z
);
1757 insn
->subOp
= subOp
;
1758 insn
->setFlagsDef(1, pred
);
1762 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_ADDR
);
1764 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1767 eau
= bld
.mkOp3v(OP_SUEAU
, TYPE_U32
, bld
.getScratch(4), off
, bf
, v
);
1769 // add array layer offset
1770 if (su
->tex
.target
.isArray() || su
->tex
.target
.isCube()) {
1771 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_ARRAY
);
1773 bld
.mkOp3(OP_MADSP
, TYPE_U32
, eau
, src
[1], v
, eau
)
1774 ->subOp
= NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
1776 bld
.mkOp3(OP_MADSP
, TYPE_U32
, eau
, v
, src
[2], eau
)
1777 ->subOp
= NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
1778 // combine predicates
1780 bld
.mkOp2(OP_OR
, TYPE_U8
, pred
, pred
, p1
);
1785 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1789 // bf == g[] address & 0xff
1790 // eau == g[] address >> 8
1791 bld
.mkOp3(OP_PERMT
, TYPE_U32
, bf
, lo
, bld
.loadImm(NULL
, 0x6540), eau
);
1792 bld
.mkOp3(OP_PERMT
, TYPE_U32
, eau
, zero
, bld
.loadImm(NULL
, 0x0007), eau
);
1794 if (su
->op
== OP_SULDP
&& su
->tex
.target
== TEX_TARGET_BUFFER
) {
1795 // Convert from u32 to u8 address format, which is what the library code
1796 // doing SULDP currently uses.
1797 // XXX: can SUEAU do this ?
1798 // XXX: does it matter that we don't mask high bytes in bf ?
1800 bld
.mkOp2(OP_SHR
, TYPE_U32
, off
, bf
, bld
.mkImm(8));
1801 bld
.mkOp2(OP_ADD
, TYPE_U32
, eau
, eau
, off
);
1804 bld
.mkOp2(OP_MERGE
, TYPE_U64
, addr
, bf
, eau
);
1806 if (atom
&& su
->tex
.target
== TEX_TARGET_BUFFER
)
1807 bld
.mkOp2(OP_ADD
, TYPE_U64
, addr
, addr
, off
);
1809 // let's just set it 0 for raw access and hope it works
1811 bld
.mkImm(0) : loadSuInfo32(ind
, slot
, NVC0_SU_INFO_FMT
);
1813 // get rid of old coordinate sources, make space for fmt info and predicate
1814 su
->moveSources(arg
, 3 - arg
);
1815 // set 64 bit address and 32-bit format sources
1816 su
->setSrc(0, addr
);
1818 su
->setSrc(2, pred
);
1820 // prevent read fault when the image is not actually bound
1821 CmpInstruction
*pred1
=
1822 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
1823 TYPE_U32
, bld
.mkImm(0),
1824 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_ADDR
));
1826 if (su
->op
!= OP_SUSTP
&& su
->tex
.format
) {
1827 const TexInstruction::ImgFormatDesc
*format
= su
->tex
.format
;
1828 int blockwidth
= format
->bits
[0] + format
->bits
[1] +
1829 format
->bits
[2] + format
->bits
[3];
1831 // make sure that the format doesn't mismatch
1832 assert(format
->components
!= 0);
1833 bld
.mkCmp(OP_SET_OR
, CC_NE
, TYPE_U32
, pred1
->getDef(0),
1834 TYPE_U32
, bld
.loadImm(NULL
, blockwidth
/ 8),
1835 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_BSIZE
),
1838 su
->setPredicate(CC_NOT_P
, pred1
->getDef(0));
1840 // TODO: initialize def values to 0 when the surface operation is not
1841 // performed (not needed for stores). Also, fix the "address bounds test"
1842 // subtests from arb_shader_image_load_store-invalid for buffers, because it
1843 // seems like that the predicate is not correctly set by suclamp.
1847 getSrcType(const TexInstruction::ImgFormatDesc
*t
, int c
)
1850 case FLOAT
: return t
->bits
[c
] == 16 ? TYPE_F16
: TYPE_F32
;
1851 case UNORM
: return t
->bits
[c
] == 8 ? TYPE_U8
: TYPE_U16
;
1852 case SNORM
: return t
->bits
[c
] == 8 ? TYPE_S8
: TYPE_S16
;
1854 return (t
->bits
[c
] == 8 ? TYPE_U8
:
1855 (t
->bits
[c
] == 16 ? TYPE_U16
: TYPE_U32
));
1857 return (t
->bits
[c
] == 8 ? TYPE_S8
:
1858 (t
->bits
[c
] == 16 ? TYPE_S16
: TYPE_S32
));
1864 getDestType(const ImgType type
) {
1875 assert(!"Impossible type");
1881 NVC0LoweringPass::convertSurfaceFormat(TexInstruction
*su
)
1883 const TexInstruction::ImgFormatDesc
*format
= su
->tex
.format
;
1884 int width
= format
->bits
[0] + format
->bits
[1] +
1885 format
->bits
[2] + format
->bits
[3];
1886 Value
*untypedDst
[4] = {};
1887 Value
*typedDst
[4] = {};
1889 // We must convert this to a generic load.
1892 su
->dType
= typeOfSize(width
/ 8);
1893 su
->sType
= TYPE_U8
;
1895 for (int i
= 0; i
< width
/ 32; i
++)
1896 untypedDst
[i
] = bld
.getSSA();
1898 untypedDst
[0] = bld
.getSSA();
1900 for (int i
= 0; i
< 4; i
++) {
1901 typedDst
[i
] = su
->getDef(i
);
1904 // Set the untyped dsts as the su's destinations
1905 for (int i
= 0; i
< 4; i
++)
1906 su
->setDef(i
, untypedDst
[i
]);
1908 bld
.setPosition(su
, true);
1910 // Unpack each component into the typed dsts
1912 for (int i
= 0; i
< 4; bits
+= format
->bits
[i
], i
++) {
1915 if (i
>= format
->components
) {
1916 if (format
->type
== FLOAT
||
1917 format
->type
== UNORM
||
1918 format
->type
== SNORM
)
1919 bld
.loadImm(typedDst
[i
], i
== 3 ? 1.0f
: 0.0f
);
1921 bld
.loadImm(typedDst
[i
], i
== 3 ? 1 : 0);
1925 // Get just that component's data into the relevant place
1926 if (format
->bits
[i
] == 32)
1927 bld
.mkMov(typedDst
[i
], untypedDst
[i
]);
1928 else if (format
->bits
[i
] == 16)
1929 bld
.mkCvt(OP_CVT
, getDestType(format
->type
), typedDst
[i
],
1930 getSrcType(format
, i
), untypedDst
[i
/ 2])
1931 ->subOp
= (i
& 1) << (format
->type
== FLOAT
? 0 : 1);
1932 else if (format
->bits
[i
] == 8)
1933 bld
.mkCvt(OP_CVT
, getDestType(format
->type
), typedDst
[i
],
1934 getSrcType(format
, i
), untypedDst
[0])->subOp
= i
;
1936 bld
.mkOp2(OP_EXTBF
, TYPE_U32
, typedDst
[i
], untypedDst
[bits
/ 32],
1937 bld
.mkImm((bits
% 32) | (format
->bits
[i
] << 8)));
1938 if (format
->type
== UNORM
|| format
->type
== SNORM
)
1939 bld
.mkCvt(OP_CVT
, TYPE_F32
, typedDst
[i
], getSrcType(format
, i
), typedDst
[i
]);
1942 // Normalize / convert as necessary
1943 if (format
->type
== UNORM
)
1944 bld
.mkOp2(OP_MUL
, TYPE_F32
, typedDst
[i
], typedDst
[i
], bld
.loadImm(NULL
, 1.0f
/ ((1 << format
->bits
[i
]) - 1)));
1945 else if (format
->type
== SNORM
)
1946 bld
.mkOp2(OP_MUL
, TYPE_F32
, typedDst
[i
], typedDst
[i
], bld
.loadImm(NULL
, 1.0f
/ ((1 << (format
->bits
[i
] - 1)) - 1)));
1947 else if (format
->type
== FLOAT
&& format
->bits
[i
] < 16) {
1948 bld
.mkOp2(OP_SHL
, TYPE_U32
, typedDst
[i
], typedDst
[i
], bld
.loadImm(NULL
, 15 - format
->bits
[i
]));
1949 bld
.mkCvt(OP_CVT
, TYPE_F32
, typedDst
[i
], TYPE_F16
, typedDst
[i
]);
1954 std::swap(typedDst
[0], typedDst
[2]);
1959 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction
*su
)
1961 processSurfaceCoordsNVE4(su
);
1963 if (su
->op
== OP_SULDP
)
1964 convertSurfaceFormat(su
);
1966 if (su
->op
== OP_SUREDB
|| su
->op
== OP_SUREDP
) {
1967 Value
*pred
= su
->getSrc(2);
1968 CondCode cc
= CC_NOT_P
;
1969 if (su
->getPredicate()) {
1970 pred
= bld
.getScratch(1, FILE_PREDICATE
);
1972 if (cc
== CC_NOT_P
) {
1973 bld
.mkOp2(OP_OR
, TYPE_U8
, pred
, su
->getPredicate(), su
->getSrc(2));
1975 bld
.mkOp2(OP_AND
, TYPE_U8
, pred
, su
->getPredicate(), su
->getSrc(2));
1976 pred
->getInsn()->src(1).mod
= Modifier(NV50_IR_MOD_NOT
);
1979 Instruction
*red
= bld
.mkOp(OP_ATOM
, su
->dType
, bld
.getSSA());
1980 red
->subOp
= su
->subOp
;
1981 red
->setSrc(0, bld
.mkSymbol(FILE_MEMORY_GLOBAL
, 0, TYPE_U32
, 0));
1982 red
->setSrc(1, su
->getSrc(3));
1983 if (su
->subOp
== NV50_IR_SUBOP_ATOM_CAS
)
1984 red
->setSrc(2, su
->getSrc(4));
1985 red
->setIndirect(0, 0, su
->getSrc(0));
1987 // make sure to initialize dst value when the atomic operation is not
1989 Instruction
*mov
= bld
.mkMov(bld
.getSSA(), bld
.loadImm(NULL
, 0));
1991 assert(cc
== CC_NOT_P
);
1992 red
->setPredicate(cc
, pred
);
1993 mov
->setPredicate(CC_P
, pred
);
1995 bld
.mkOp2(OP_UNION
, TYPE_U32
, su
->getDef(0),
1996 red
->getDef(0), mov
->getDef(0));
1998 delete_Instruction(bld
.getProgram(), su
);
1999 handleCasExch(red
, true);
2002 if (su
->op
== OP_SUSTB
|| su
->op
== OP_SUSTP
)
2003 su
->sType
= (su
->tex
.target
== TEX_TARGET_BUFFER
) ? TYPE_U32
: TYPE_U8
;
2007 NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction
*su
)
2009 const int slot
= su
->tex
.r
;
2010 const int dim
= su
->tex
.target
.getDim();
2011 const int arg
= dim
+ (su
->tex
.target
.isArray() || su
->tex
.target
.isCube());
2013 Value
*zero
= bld
.mkImm(0);
2016 Value
*ind
= su
->getIndirectR();
2018 bld
.setPosition(su
, false);
2020 adjustCoordinatesMS(su
);
2024 ptr
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getSSA(), ind
, bld
.mkImm(su
->tex
.r
));
2025 ptr
= bld
.mkOp2v(OP_AND
, TYPE_U32
, bld
.getSSA(), ptr
, bld
.mkImm(7));
2026 su
->setIndirectR(ptr
);
2029 // get surface coordinates
2030 for (c
= 0; c
< arg
; ++c
)
2031 src
[c
] = su
->getSrc(c
);
2035 // calculate pixel offset
2036 if (su
->op
== OP_SULDP
|| su
->op
== OP_SUREDP
) {
2037 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_BSIZE
);
2038 su
->setSrc(0, bld
.mkOp2v(OP_MUL
, TYPE_U32
, bld
.getSSA(), src
[0], v
));
2041 // add array layer offset
2042 if (su
->tex
.target
.isArray() || su
->tex
.target
.isCube()) {
2043 v
= loadSuInfo32(ind
, slot
, NVC0_SU_INFO_ARRAY
);
2045 su
->setSrc(2, bld
.mkOp2v(OP_MUL
, TYPE_U32
, bld
.getSSA(), src
[2], v
));
2048 // prevent read fault when the image is not actually bound
2049 CmpInstruction
*pred
=
2050 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
2051 TYPE_U32
, bld
.mkImm(0),
2052 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_ADDR
));
2053 if (su
->op
!= OP_SUSTP
&& su
->tex
.format
) {
2054 const TexInstruction::ImgFormatDesc
*format
= su
->tex
.format
;
2055 int blockwidth
= format
->bits
[0] + format
->bits
[1] +
2056 format
->bits
[2] + format
->bits
[3];
2058 assert(format
->components
!= 0);
2059 // make sure that the format doesn't mismatch when it's not FMT_NONE
2060 bld
.mkCmp(OP_SET_OR
, CC_NE
, TYPE_U32
, pred
->getDef(0),
2061 TYPE_U32
, bld
.loadImm(NULL
, blockwidth
/ 8),
2062 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_BSIZE
),
2065 su
->setPredicate(CC_NOT_P
, pred
->getDef(0));
2069 NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction
*su
)
2071 if (su
->tex
.target
== TEX_TARGET_1D_ARRAY
) {
2072 /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY
2073 * will simplify the lowering pass and the texture constraints. */
2074 su
->moveSources(1, 1);
2075 su
->setSrc(1, bld
.loadImm(NULL
, 0));
2076 su
->tex
.target
= TEX_TARGET_2D_ARRAY
;
2079 processSurfaceCoordsNVC0(su
);
2081 if (su
->op
== OP_SULDP
)
2082 convertSurfaceFormat(su
);
2084 if (su
->op
== OP_SUREDB
|| su
->op
== OP_SUREDP
) {
2085 const int dim
= su
->tex
.target
.getDim();
2086 const int arg
= dim
+ (su
->tex
.target
.isArray() || su
->tex
.target
.isCube());
2087 LValue
*addr
= bld
.getSSA(8);
2088 Value
*def
= su
->getDef(0);
2092 // Set the destination to the address
2093 su
->dType
= TYPE_U64
;
2094 su
->setDef(0, addr
);
2095 su
->setDef(1, su
->getPredicate());
2097 bld
.setPosition(su
, true);
2099 // Perform the atomic op
2100 Instruction
*red
= bld
.mkOp(OP_ATOM
, su
->sType
, bld
.getSSA());
2101 red
->subOp
= su
->subOp
;
2102 red
->setSrc(0, bld
.mkSymbol(FILE_MEMORY_GLOBAL
, 0, su
->sType
, 0));
2103 red
->setSrc(1, su
->getSrc(arg
));
2104 if (red
->subOp
== NV50_IR_SUBOP_ATOM_CAS
)
2105 red
->setSrc(2, su
->getSrc(arg
+ 1));
2106 red
->setIndirect(0, 0, addr
);
2108 // make sure to initialize dst value when the atomic operation is not
2110 Instruction
*mov
= bld
.mkMov(bld
.getSSA(), bld
.loadImm(NULL
, 0));
2112 assert(su
->cc
== CC_NOT_P
);
2113 red
->setPredicate(su
->cc
, su
->getPredicate());
2114 mov
->setPredicate(CC_P
, su
->getPredicate());
2116 bld
.mkOp2(OP_UNION
, TYPE_U32
, def
, red
->getDef(0), mov
->getDef(0));
2118 handleCasExch(red
, false);
2123 NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction
*su
)
2125 const int slot
= su
->tex
.r
;
2126 const int dim
= su
->tex
.target
.getDim();
2127 const int arg
= dim
+ (su
->tex
.target
.isArray() || su
->tex
.target
.isCube());
2128 Value
*ind
= su
->getIndirectR();
2131 bld
.setPosition(su
, false);
2133 // add texture handle
2139 pos
= (su
->subOp
== NV50_IR_SUBOP_ATOM_CAS
) ? 2 : 1;
2145 su
->setSrc(arg
+ pos
, loadTexHandle(ind
, slot
+ 32));
2147 // prevent read fault when the image is not actually bound
2148 CmpInstruction
*pred
=
2149 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
2150 TYPE_U32
, bld
.mkImm(0),
2151 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_ADDR
));
2152 if (su
->op
!= OP_SUSTP
&& su
->tex
.format
) {
2153 const TexInstruction::ImgFormatDesc
*format
= su
->tex
.format
;
2154 int blockwidth
= format
->bits
[0] + format
->bits
[1] +
2155 format
->bits
[2] + format
->bits
[3];
2157 assert(format
->components
!= 0);
2158 // make sure that the format doesn't mismatch when it's not FMT_NONE
2159 bld
.mkCmp(OP_SET_OR
, CC_NE
, TYPE_U32
, pred
->getDef(0),
2160 TYPE_U32
, bld
.loadImm(NULL
, blockwidth
/ 8),
2161 loadSuInfo32(ind
, slot
, NVC0_SU_INFO_BSIZE
),
2164 su
->setPredicate(CC_NOT_P
, pred
->getDef(0));
2168 NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction
*su
)
2170 processSurfaceCoordsGM107(su
);
2172 if (su
->op
== OP_SULDP
)
2173 convertSurfaceFormat(su
);
2175 if (su
->op
== OP_SUREDP
) {
2176 Value
*def
= su
->getDef(0);
2179 su
->setDef(0, bld
.getSSA());
2181 bld
.setPosition(su
, true);
2183 // make sure to initialize dst value when the atomic operation is not
2185 Instruction
*mov
= bld
.mkMov(bld
.getSSA(), bld
.loadImm(NULL
, 0));
2187 assert(su
->cc
== CC_NOT_P
);
2188 mov
->setPredicate(CC_P
, su
->getPredicate());
2190 bld
.mkOp2(OP_UNION
, TYPE_U32
, def
, su
->getDef(0), mov
->getDef(0));
2195 NVC0LoweringPass::handleWRSV(Instruction
*i
)
2201 // must replace, $sreg are not writeable
2202 addr
= targ
->getSVAddress(FILE_SHADER_OUTPUT
, i
->getSrc(0)->asSym());
2205 sym
= bld
.mkSymbol(FILE_SHADER_OUTPUT
, 0, i
->sType
, addr
);
2207 st
= bld
.mkStore(OP_EXPORT
, i
->dType
, sym
, i
->getIndirect(0, 0),
2209 st
->perPatch
= i
->perPatch
;
2211 bld
.getBB()->remove(i
);
// Lower loads/stores according to the source memory file:
//  - shader inputs (compute reads them from const memory; geometry scales
//    indirect addresses by the vec4 unit size),
//  - constant buffers (on Kepler+ compute, UBOs beyond the 8 launch-
//    descriptor slots are reached via addresses stored in the driver
//    constbuf, with bounds checking for indirect accesses),
//  - shader outputs (only legal in tessellation control shaders),
//  - SSBO-style buffers (always bounds-checked via driver-constbuf info).
// NOTE(review): this excerpt elides many original lines (else branches,
// closing braces, returns) — comments below describe only what is visible.
2216 NVC0LoweringPass::handleLDST(Instruction
*i
)
2218 if (i
->src(0).getFile() == FILE_SHADER_INPUT
) {
2219 if (prog
->getType() == Program::TYPE_COMPUTE
) {
// Compute shaders read their "inputs" out of constant buffer 0.
2220 i
->getSrc(0)->reg
.file
= FILE_MEMORY_CONST
;
2221 i
->getSrc(0)->reg
.fileIndex
= 0;
2223 if (prog
->getType() == Program::TYPE_GEOMETRY
&&
2224 i
->src(0).isIndirect(0)) {
2225 // XXX: this assumes vec4 units
// Scale the indirect input index by 16 (vec4 stride) via SHL 4.
2226 Value
*ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(),
2227 i
->getIndirect(0, 0), bld
.mkImm(4));
2228 i
->setIndirect(0, 0, ptr
);
2232 assert(prog
->getType() != Program::TYPE_FRAGMENT
); // INTERP
2234 } else if (i
->src(0).getFile() == FILE_MEMORY_CONST
) {
2235 if (targ
->getChipset() >= NVISA_GK104_CHIPSET
&&
2236 prog
->getType() == Program::TYPE_COMPUTE
) {
2237 // The launch descriptor only allows to set up 8 CBs, but OpenGL
2238 // requires at least 12 UBOs. To bypass this limitation, we store the
2239 // addrs into the driver constbuf and we directly load from the global
// fileIndex is shifted down by one: driver cb is index 0, user UBOs follow.
2241 int8_t fileIndex
= i
->getSrc(0)->reg
.fileIndex
- 1;
2242 Value
*ind
= i
->getIndirect(0, 1);
2245 // Clamp the UBO index when an indirect access is used to avoid
2246 // loading information from the wrong place in the driver cb.
// ind = min(ind + fileIndex, 12)
2247 ind
= bld
.mkOp2v(OP_MIN
, TYPE_U32
, ind
,
2248 bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getSSA(),
2249 ind
, bld
.loadImm(NULL
, fileIndex
)),
2250 bld
.loadImm(NULL
, 12));
// Indirectly-indexed UBO: fetch base address + length from the driver
// constbuf and redirect the access to global memory with a bounds check.
2253 if (i
->src(0).isIndirect(1)) {
// End offset of this access (reg offset + access size), used for the
// out-of-bounds comparison below.
2254 Value
*offset
= bld
.loadImm(NULL
, i
->getSrc(0)->reg
.data
.offset
+ typeSizeof(i
->sType
));
2255 Value
*ptr
= loadUboInfo64(ind
, fileIndex
* 16);
2256 Value
*length
= loadUboLength32(ind
, fileIndex
* 16);
2257 Value
*pred
= new_LValue(func
, FILE_PREDICATE
);
2258 if (i
->src(0).isIndirect(0)) {
// Fold the byte-offset indirection into both the 64-bit pointer and
// the 32-bit bounds-check offset.
2259 bld
.mkOp2(OP_ADD
, TYPE_U64
, ptr
, ptr
, i
->getIndirect(0, 0));
2260 bld
.mkOp2(OP_ADD
, TYPE_U32
, offset
, offset
, i
->getIndirect(0, 0));
// Retarget the access at global memory through the computed pointer.
2262 i
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
2263 i
->setIndirect(0, 1, NULL
);
2264 i
->setIndirect(0, 0, ptr
);
// pred = (offset > length): access is out of bounds.
2265 bld
.mkCmp(OP_SET
, CC_GT
, TYPE_U32
, pred
, TYPE_U32
, offset
, length
);
// Only execute the access when in bounds.
2266 i
->setPredicate(CC_NOT_P
, pred
);
2267 if (i
->defExists(0)) {
// Loads still need a defined result when skipped — zero it.
// NOTE(review): the merge of this zero with the load result appears
// to be on lines elided from this excerpt.
2268 bld
.mkMov(i
->getDef(0), bld
.mkImm(0));
// Directly-indexed user UBO (fileIndex >= 0): same global-memory
// redirection, but no bounds check is emitted on this path.
2270 } else if (fileIndex
>= 0) {
2271 Value
*ptr
= loadUboInfo64(ind
, fileIndex
* 16);
2272 if (i
->src(0).isIndirect(0)) {
2273 bld
.mkOp2(OP_ADD
, TYPE_U64
, ptr
, ptr
, i
->getIndirect(0, 0));
2275 i
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
2276 i
->setIndirect(0, 1, NULL
);
2277 i
->setIndirect(0, 0, ptr
);
// Pre-Kepler (or non-compute) indirect constbuf access: encode the
// buffer index in the high 16 bits of the address operand and mark the
// load with the LDC_IS subop.
2279 } else if (i
->src(0).isIndirect(1)) {
2281 if (i
->src(0).isIndirect(0))
// Combine fileIndex (high half) with the byte offset (low half).
2282 ptr
= bld
.mkOp3v(OP_INSBF
, TYPE_U32
, bld
.getSSA(),
2283 i
->getIndirect(0, 1), bld
.mkImm(0x1010),
2284 i
->getIndirect(0, 0));
2286 ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(),
2287 i
->getIndirect(0, 1), bld
.mkImm(16));
2288 i
->setIndirect(0, 1, NULL
);
2289 i
->setIndirect(0, 0, ptr
);
2290 i
->subOp
= NV50_IR_SUBOP_LDC_IS
;
2292 } else if (i
->src(0).getFile() == FILE_SHADER_OUTPUT
) {
// Reading back shader outputs is only done by TCS.
2293 assert(prog
->getType() == Program::TYPE_TESSELLATION_CONTROL
);
2295 } else if (i
->src(0).getFile() == FILE_MEMORY_BUFFER
) {
// SSBO-style buffer access: always goes through driver-constbuf info
// (base pointer + length) with a bounds check, like the UBO path above.
2296 Value
*ind
= i
->getIndirect(0, 1);
2297 Value
*ptr
= loadBufInfo64(ind
, i
->getSrc(0)->reg
.fileIndex
* 16);
2298 // XXX come up with a way not to do this for EVERY little access but
2299 // rather to batch these up somehow. Unfortunately we've lost the
2300 // information about the field width by the time we get here.
2301 Value
*offset
= bld
.loadImm(NULL
, i
->getSrc(0)->reg
.data
.offset
+ typeSizeof(i
->sType
));
2302 Value
*length
= loadBufLength32(ind
, i
->getSrc(0)->reg
.fileIndex
* 16);
2303 Value
*pred
= new_LValue(func
, FILE_PREDICATE
);
2304 if (i
->src(0).isIndirect(0)) {
2305 bld
.mkOp2(OP_ADD
, TYPE_U64
, ptr
, ptr
, i
->getIndirect(0, 0));
2306 bld
.mkOp2(OP_ADD
, TYPE_U32
, offset
, offset
, i
->getIndirect(0, 0));
2308 i
->setIndirect(0, 1, NULL
);
2309 i
->setIndirect(0, 0, ptr
);
2310 i
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
// Out-of-bounds predicate, guarding the access as in the UBO path.
2311 bld
.mkCmp(OP_SET
, CC_GT
, TYPE_U32
, pred
, TYPE_U32
, offset
, length
);
2312 i
->setPredicate(CC_NOT_P
, pred
);
2313 if (i
->defExists(0)) {
// For a (possibly skipped) load: move the real def aside, then UNION
// it with a predicated zero so the result is always defined.
2314 Value
*zero
, *dst
= i
->getDef(0);
2315 i
->setDef(0, bld
.getSSA());
2317 bld
.setPosition(i
, true);
2318 bld
.mkMov((zero
= bld
.getSSA()), bld
.mkImm(0))
2319 ->setPredicate(CC_P
, pred
);
2320 bld
.mkOp2(OP_UNION
, TYPE_U32
, dst
, i
->getDef(0), zero
);
// Read one component (c) of the tessellation coordinate into dst.
// The coordinate components are fetched from fixed shader-output slots
// (0x2f0 / 0x2f4) indexed by the invocation's lane id.
// NOTE(review): this excerpt elides the original lines that select x/y
// based on c (and the declarations of x and y) — the visible tail computes
// the third barycentric component for triangle domains as 1 - (x + y),
// and returns 0 for non-triangle domains.
2326 NVC0LoweringPass::readTessCoord(LValue
*dst
, int c
)
2328 Value
*laneid
= bld
.getSSA();
// Fetch the lane id; it selects which invocation's coords to read.
2331 bld
.mkOp1(OP_RDSV
, TYPE_U32
, laneid
, bld
.mkSysVal(SV_LANEID
, 0));
// Non-triangle domains have no third barycentric coordinate: return 0.
2342 if (prog
->driver
->prop
.tp
.domain
!= PIPE_PRIM_TRIANGLES
) {
2343 bld
.mkMov(dst
, bld
.loadImm(NULL
, 0));
// Fetch the first two coordinate components from their output slots.
2350 bld
.mkFetch(x
, TYPE_F32
, FILE_SHADER_OUTPUT
, 0x2f0, NULL
, laneid
);
2352 bld
.mkFetch(y
, TYPE_F32
, FILE_SHADER_OUTPUT
, 0x2f4, NULL
, laneid
);
// dst = 1.0f - (x + y): the third barycentric coordinate.
2355 bld
.mkOp2(OP_ADD
, TYPE_F32
, dst
, x
, y
);
2356 bld
.mkOp2(OP_SUB
, TYPE_F32
, dst
, bld
.loadImm(NULL
, 1.0f
), dst
);
// Lower a system-value read (OP_RDSV) whose SV has no direct $sreg
// mapping. Depending on the SV: replace with an immediate, an
// interpolation, a PIXLD, a driver-constbuf load, a tess-coord read, or a
// plain shader-input fetch.
// NOTE(review): this excerpt elides several switch/case labels, returns
// and closing braces — each commented section below corresponds to one SV
// case in the full file.
2361 NVC0LoweringPass::handleRDSV(Instruction
*i
)
2363 Symbol
*sym
= i
->getSrc(0)->asSym();
2364 const SVSemantic sv
= sym
->reg
.data
.sv
.sv
;
// Address of this SV in the shader-input file, per the target.
2367 uint32_t addr
= targ
->getSVAddress(FILE_SHADER_INPUT
, sym
);
// addr >= 0x400 means there is no real input slot for this SV.
2369 if (addr
>= 0x400) {
2371 if (sym
->reg
.data
.sv
.index
== 3) {
2372 // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
// The 4th component is a constant: 1 for the "count" SVs, else 0.
2374 i
->setSrc(0, bld
.mkImm((sv
== SV_NTID
|| sv
== SV_NCTAID
) ? 1 : 0));
2376 if (sv
== SV_VERTEX_COUNT
) {
// The raw value packs the count; extract bits [8..15] (EXTBF 0x808).
2377 bld
.setPosition(i
, true);
2378 bld
.mkOp2(OP_EXTBF
, TYPE_U32
, i
->getDef(0), i
->getDef(0), bld
.mkImm(0x808));
// Position-like SV in a fragment shader: lower to an interpolation,
// forwarding an explicit offset source when present.
2385 assert(prog
->getType() == Program::TYPE_FRAGMENT
);
2386 if (i
->srcExists(1)) {
2387 // Pass offset through to the interpolation logic
2388 ld
= bld
.mkInterp(NV50_IR_INTERP_LINEAR
| NV50_IR_INTERP_OFFSET
,
2389 i
->getDef(0), addr
, NULL
);
2390 ld
->setSrc(1, i
->getSrc(1));
2392 bld
.mkInterp(NV50_IR_INTERP_LINEAR
, i
->getDef(0), addr
, NULL
);
// Face SV: flat-interpolate the raw face bit, then (for an f32 result)
// map it to +/-1.0: OR in bit 0, negate as s32, convert to f32.
2397 Value
*face
= i
->getDef(0);
2398 bld
.mkInterp(NV50_IR_INTERP_FLAT
, face
, addr
, NULL
);
2399 if (i
->dType
== TYPE_F32
) {
2400 bld
.mkOp2(OP_OR
, TYPE_U32
, face
, face
, bld
.mkImm(0x00000001));
2401 bld
.mkOp1(OP_NEG
, TYPE_S32
, face
, face
);
2402 bld
.mkCvt(OP_CVT
, TYPE_F32
, face
, TYPE_S32
, face
);
// Tess coord: only valid in TES; delegate to readTessCoord per component.
2407 assert(prog
->getType() == Program::TYPE_TESSELLATION_EVAL
);
2408 readTessCoord(i
->getDef(0)->asLValue(), i
->getSrc(0)->reg
.data
.sv
.index
);
// Grid-info SVs: loaded from the driver's aux constant buffer.
2413 assert(targ
->getChipset() >= NVISA_GK104_CHIPSET
); // mov $sreg otherwise
2414 if (sym
->reg
.data
.sv
.index
== 3) {
// 4th component is a constant here too (0 for GRIDID, 1 otherwise).
2416 i
->setSrc(0, bld
.mkImm(sv
== SV_GRIDID
? 0 : 1));
2421 addr
+= prog
->driver
->prop
.cp
.gridInfoBase
;
2422 bld
.mkLoad(TYPE_U32
, i
->getDef(0),
2423 bld
.mkSymbol(FILE_MEMORY_CONST
, prog
->driver
->io
.auxCBSlot
,
2424 TYPE_U32
, addr
), NULL
);
2426 case SV_SAMPLE_INDEX
:
2427 // TODO: Properly pass source as an address in the PIX address space
2428 // (which can be of the form [r0+offset]). But this is currently
// Sample index comes from a PIXLD with the SAMPLEID subop.
2430 ld
= bld
.mkOp1(OP_PIXLD
, TYPE_U32
, i
->getDef(0), bld
.mkImm(0));
2431 ld
->subOp
= NV50_IR_SUBOP_PIXLD_SAMPLEID
;
2433 case SV_SAMPLE_POS
: {
// Sample position: read the sample id, scale it to an 8-byte table
// offset (SHL 3), and load the position from the aux constbuf's
// sample-info table.
2434 Value
*off
= new_LValue(func
, FILE_GPR
);
2435 ld
= bld
.mkOp1(OP_PIXLD
, TYPE_U32
, i
->getDef(0), bld
.mkImm(0));
2436 ld
->subOp
= NV50_IR_SUBOP_PIXLD_SAMPLEID
;
2437 bld
.mkOp2(OP_SHL
, TYPE_U32
, off
, i
->getDef(0), bld
.mkImm(3));
2438 bld
.mkLoad(TYPE_F32
,
2441 FILE_MEMORY_CONST
, prog
->driver
->io
.auxCBSlot
,
2442 TYPE_U32
, prog
->driver
->io
.sampleInfoBase
+
2443 4 * sym
->reg
.data
.sv
.index
),
2447 case SV_SAMPLE_MASK
: {
// Coverage mask via PIXLD/COVMASK, then mask it down to this
// invocation's own sample bit (1 << sampleid).
2448 ld
= bld
.mkOp1(OP_PIXLD
, TYPE_U32
, i
->getDef(0), bld
.mkImm(0));
2449 ld
->subOp
= NV50_IR_SUBOP_PIXLD_COVMASK
;
2450 Instruction
*sampleid
=
2451 bld
.mkOp1(OP_PIXLD
, TYPE_U32
, bld
.getSSA(), bld
.mkImm(0));
2452 sampleid
->subOp
= NV50_IR_SUBOP_PIXLD_SAMPLEID
;
2454 bld
.mkOp2v(OP_AND
, TYPE_U32
, bld
.getSSA(), ld
->getDef(0),
2455 bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(),
2456 bld
.loadImm(NULL
, 1), sampleid
->getDef(0)));
// Per-sample shading: the masked value is the answer; otherwise select
// between full coverage and the masked value at runtime.
2457 if (prog
->driver
->prop
.fp
.persampleInvocation
) {
2458 bld
.mkMov(i
->getDef(0), masked
);
2460 bld
.mkOp3(OP_SELP
, TYPE_U32
, i
->getDef(0), ld
->getDef(0), masked
,
2467 case SV_BASEINSTANCE
:
// Draw-parameter SVs: loaded from the aux constbuf's draw-info area,
// indexed by the SV's distance from SV_BASEVERTEX.
2469 ld
= bld
.mkLoad(TYPE_U32
, i
->getDef(0),
2470 bld
.mkSymbol(FILE_MEMORY_CONST
,
2471 prog
->driver
->io
.auxCBSlot
,
2473 prog
->driver
->io
.drawInfoBase
+
2474 4 * (sv
- SV_BASEVERTEX
)),
// Default: a plain shader-input fetch. Non-per-patch TES reads need a
// PFETCH-computed vertex address.
2478 if (prog
->getType() == Program::TYPE_TESSELLATION_EVAL
&& !i
->perPatch
)
2479 vtx
= bld
.mkOp1v(OP_PFETCH
, TYPE_U32
, bld
.getSSA(), bld
.mkImm(0));
2480 ld
= bld
.mkFetch(i
->getDef(0), i
->dType
,
2481 FILE_SHADER_INPUT
, addr
, i
->getIndirect(0, 0), vtx
);
2482 ld
->perPatch
= i
->perPatch
;
// The original RDSV has been fully replaced — remove it.
2485 bld
.getBB()->remove(i
);
// Lower floating-point division: replace src(1) by its reciprocal so the
// division becomes a multiply (the hardware has RCP but no float DIV).
// Integer division is handled elsewhere and left untouched here.
// NOTE(review): the excerpt elides the early return for non-float types
// and the opcode change of i — confirm against the full file.
2490 NVC0LoweringPass::handleDIV(Instruction
*i
)
2492 if (!isFloatType(i
->dType
))
// Insert the RCP before the division.
2494 bld
.setPosition(i
, false);
// rcp = 1 / src(1), sized to the instruction's destination type.
2495 Instruction
*rcp
= bld
.mkOp1(OP_RCP
, i
->dType
, bld
.getSSA(typeSizeof(i
->dType
)), i
->getSrc(1));
// Substitute the reciprocal as the second operand.
2497 i
->setSrc(1, rcp
->getDef(0));
// Lower floating-point modulo: compute
//   value = trunc(src0 * (1/src1)) * src1
// and substitute it as src(1), turning the MOD into the final subtraction
// src0 - trunc(src0/src1)*src1.
// NOTE(review): the excerpt elides the early return for non-float types
// and the opcode change of i — confirm against the full file.
2502 NVC0LoweringPass::handleMOD(Instruction
*i
)
2504 if (!isFloatType(i
->dType
))
// Scratch register to accumulate the intermediate result.
2506 LValue
*value
= bld
.getScratch(typeSizeof(i
->dType
));
// value = 1 / src1
2507 bld
.mkOp1(OP_RCP
, i
->dType
, value
, i
->getSrc(1));
// value = src0 * (1/src1)  (the quotient)
2508 bld
.mkOp2(OP_MUL
, i
->dType
, value
, i
->getSrc(0), value
);
// value = trunc(quotient)
2509 bld
.mkOp1(OP_TRUNC
, i
->dType
, value
, value
);
// value = src1 * trunc(quotient)
2510 bld
.mkOp2(OP_MUL
, i
->dType
, value
, i
->getSrc(1), value
);
// Replace src(1) with the computed multiple of src1.
2512 i
->setSrc(1, value
);
// Lower square root. For F64: sqrt(x) is computed via RSQ with a
// predicate-selected zero result for x <= 0 (RSQ would produce inf/NaN).
// For F32 (the fall-through path): multiply the RSQ result back by the
// input after an RCP-based refinement.
// NOTE(review): the excerpt elides several lines (the else branch head,
// the final multiply and returns) — confirm against the full file.
2517 NVC0LoweringPass::handleSQRT(Instruction
*i
)
2519 if (i
->dType
== TYPE_F64
) {
2520 Value
*pred
= bld
.getSSA(1, FILE_PREDICATE
);
2521 Value
*zero
= bld
.loadImm(NULL
, 0.0);
// 64-bit temporary for the RSQ result.
2522 Value
*dst
= bld
.getSSA(8);
2523 bld
.mkOp1(OP_RSQ
, i
->dType
, dst
, i
->getSrc(0));
// pred = (src0 <= 0): RSQ result is unusable in that case.
2524 bld
.mkCmp(OP_SET
, CC_LE
, i
->dType
, pred
, i
->dType
, i
->getSrc(0), zero
);
// Select 0 when the input was non-positive, else the RSQ result.
2525 bld
.mkOp3(OP_SELP
, TYPE_U64
, dst
, zero
, dst
, pred
);
2528 // TODO: Handle this properly with a library function
// F32 path: refine after the instruction with a reciprocal.
2530 bld
.setPosition(i
, true);
2532 bld
.mkOp1(OP_RCP
, i
->dType
, i
->getDef(0), i
->getDef(0));
// Lower pow(x, y) to exp2(y * log2(x)) using the hardware LG2 / PREEX2
// pipeline; the MUL is marked dnz (denormals flushed to zero) as required
// for the special-function inputs.
// NOTE(review): the excerpt elides the final EX2 / opcode rewrite of i —
// confirm against the full file.
2539 NVC0LoweringPass::handlePOW(Instruction
*i
)
2541 LValue
*val
= bld
.getScratch();
// val = log2(x)
2543 bld
.mkOp1(OP_LG2
, TYPE_F32
, val
, i
->getSrc(0));
// val = y * log2(x), with denormal flush on the multiply.
2544 bld
.mkOp2(OP_MUL
, TYPE_F32
, val
, i
->getSrc(1), val
)->dnz
= 1;
// Pre-scale for the EX2 special-function unit.
2545 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, val
, val
);
// Lower OP_EXPORT. Fragment shaders write their outputs to specific GPRs:
// the export becomes a final MOV into a register whose id is derived from
// the output offset, and prog->maxGPR is raised accordingly. Geometry
// shaders route exports through the emit-address pointer (indirect 1).
// NOTE(review): the excerpt elides some lines (an early return on the
// indirect case, returns) — confirm against the full file.
2555 NVC0LoweringPass::handleEXPORT(Instruction
*i
)
2557 if (prog
->getType() == Program::TYPE_FRAGMENT
) {
// Output register id: byte offset / 4 (one GPR per 32-bit component).
2558 int id
= i
->getSrc(0)->reg
.data
.offset
/ 4;
2560 if (i
->src(0).isIndirect(0)) // TODO, ugly
// Turn the export into a final MOV of the value into the output GPR.
2563 i
->subOp
= NV50_IR_SUBOP_MOV_FINAL
;
2564 i
->src(0).set(i
->src(1));
2566 i
->setDef(0, new_LValue(func
, FILE_GPR
));
2567 i
->getDef(0)->reg
.data
.id
= id
;
// Make sure the register allocator reserves the output GPR range.
2569 prog
->maxGPR
= MAX2(prog
->maxGPR
, id
);
2571 if (prog
->getType() == Program::TYPE_GEOMETRY
) {
// GS exports go through the current emit address.
2572 i
->setIndirect(0, 1, gpEmitAddress
);
// Lower geometry-shader OP_EMIT / OP_RESTART. An OP_RESTART immediately
// following an OP_EMIT on the same stream is folded into the previous
// instruction as EMIT_RESTART and deleted; otherwise the instruction is
// rewritten to update the emit-address pointer (def 0 / src 0), with the
// stream id moved to src 1.
2578 NVC0LoweringPass::handleOUT(Instruction
*i
)
2580 Instruction
*prev
= i
->prev
;
2581 ImmediateValue stream
, prevStream
;
2583 // Only merge if the stream ids match. Also, note that the previous
2584 // instruction would have already been lowered, so we take arg1 from it.
2585 if (i
->op
== OP_RESTART
&& prev
&& prev
->op
== OP_EMIT
&&
2586 i
->src(0).getImmediate(stream
) &&
2587 prev
->src(1).getImmediate(prevStream
) &&
2588 stream
.reg
.data
.u32
== prevStream
.reg
.data
.u32
) {
// Fold the restart into the preceding emit and drop this instruction.
2589 i
->prev
->subOp
= NV50_IR_SUBOP_EMIT_RESTART
;
2590 delete_Instruction(prog
, i
);
// Standalone case: thread the emit address through the instruction.
2592 assert(gpEmitAddress
);
2593 i
->setDef(0, gpEmitAddress
);
// Move the stream id to src 1; src 0 becomes the emit address.
2594 i
->setSrc(1, i
->getSrc(0));
2595 i
->setSrc(0, gpEmitAddress
);
2600 // Generate a binary predicate if an instruction is predicated by
2601 // e.g. an f32 value.
// If insn's predicate is not already in the predicate file, materialize a
// boolean predicate via SET CC_NEU against 0 and re-predicate insn on it.
// No-op when there is no predicate or it already lives in FILE_PREDICATE.
2603 NVC0LoweringPass::checkPredicate(Instruction
*insn
)
2605 Value
*pred
= insn
->getPredicate();
// Nothing to do for unpredicated insns or real predicate registers.
2608 if (!pred
|| pred
->reg
.file
== FILE_PREDICATE
)
2610 pdst
= new_LValue(func
, FILE_PREDICATE
);
2612 // CAUTION: don't use pdst->getInsn, the definition might not be unique,
2613 // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
// pdst = (0 != pred), typed after the instruction's dType.
2615 bld
.mkCmp(OP_SET
, CC_NEU
, insn
->dType
, pdst
, insn
->dType
, bld
.mkImm(0), pred
);
// Keep the original condition code, swap in the boolean predicate.
2617 insn
->setPredicate(insn
->cc
, pdst
);
2621 // - add quadop dance for texturing
2622 // - put FP outputs in GPRs
2623 // - convert instruction sequences
// Main lowering dispatch: routes each instruction to the matching handler
// by opcode, then (on Kepler+/Maxwell+) inserts an AFETCH to compute the
// base address for indirect attribute accesses.
// NOTE(review): the switch statement's case labels and several returns
// are elided from this excerpt — each `return handleX(...)` below sits
// under its own opcode case in the full file.
2626 NVC0LoweringPass::visit(Instruction
*i
)
2629 bld
.setPosition(i
, false);
// Non-trivial predicates may need to be converted to FILE_PREDICATE.
2631 if (i
->cc
!= CC_ALWAYS
)
// Texture-family ops.
2640 return handleTEX(i
->asTex());
2642 return handleTXD(i
->asTex());
2644 return handleTXLQ(i
->asTex());
2646 return handleTXQ(i
->asTex());
// EX2: insert the PREEX2 pre-scale and feed its result into the EX2.
2648 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, i
->getDef(0), i
->getSrc(0));
2649 i
->setSrc(0, i
->getDef(0));
// Arithmetic lowering helpers.
2652 return handlePOW(i
);
2654 return handleDIV(i
);
2656 return handleMOD(i
);
2658 return handleSQRT(i
);
2660 ret
= handleEXPORT(i
);
// GS stream output and system values.
2664 return handleOUT(i
);
2666 return handleRDSV(i
);
2668 return handleWRSV(i
);
// Atomics: compare-and-swap/exchange need cache-control handling when
// they target a buffer.
2675 const bool cctl
= i
->src(0).getFile() == FILE_MEMORY_BUFFER
;
2677 handleCasExch(i
, cctl
);
// Surface ops: chipset-specific lowering paths.
2686 if (targ
->getChipset() >= NVISA_GM107_CHIPSET
)
2687 handleSurfaceOpGM107(i
->asTex());
2688 else if (targ
->getChipset() >= NVISA_GK104_CHIPSET
)
2689 handleSurfaceOpNVE4(i
->asTex());
2691 handleSurfaceOpNVC0(i
->asTex());
2694 handleSUQ(i
->asTex());
2703 /* Kepler+ has a special opcode to compute a new base address to be used
2704 * for indirect loads.
2706 * Maxwell+ has an additional similar requirement for indirect
2707 * interpolation ops in frag shaders.
2709 bool doAfetch
= false;
// Kepler+: indirect VFETCH/EXPORT need an AFETCH-computed base address.
2710 if (targ
->getChipset() >= NVISA_GK104_CHIPSET
&&
2712 (i
->op
== OP_VFETCH
|| i
->op
== OP_EXPORT
) &&
2713 i
->src(0).isIndirect(0)) {
// Maxwell+: likewise for indirect LINTERP/PINTERP in fragment shaders.
2716 if (targ
->getChipset() >= NVISA_GM107_CHIPSET
&&
2717 (i
->op
== OP_LINTERP
|| i
->op
== OP_PINTERP
) &&
2718 i
->src(0).isIndirect(0)) {
// Clone the attribute symbol, AFETCH the new base, reset the static
// offset, and redirect the instruction's indirection through AFETCH.
2723 Value
*addr
= cloneShallow(func
, i
->getSrc(0));
2724 Instruction
*afetch
= bld
.mkOp1(OP_AFETCH
, TYPE_U32
, bld
.getSSA(),
2726 afetch
->setIndirect(0, 0, i
->getIndirect(0, 0));
2727 addr
->reg
.data
.offset
= 0;
2729 i
->setIndirect(0, 0, afetch
->getDef(0));
// Run the legalization pass appropriate to the code-generation stage:
// pre-SSA -> NVC0LoweringPass, SSA -> NVC0LegalizeSSA,
// post-RA -> NVC0LegalizePostRA. Returns the pass's run() result.
// NOTE(review): the fall-through return for unknown stages is elided from
// this excerpt.
2736 TargetNVC0::runLegalizePass(Program
*prog
, CGStage stage
) const
2738 if (stage
== CG_STAGE_PRE_SSA
) {
2739 NVC0LoweringPass
pass(prog
);
2740 return pass
.run(prog
, false, true);
2742 if (stage
== CG_STAGE_POST_RA
) {
2743 NVC0LegalizePostRA
pass(prog
);
2744 return pass
.run(prog
, false, true);
2746 if (stage
== CG_STAGE_SSA
) {
2747 NVC0LegalizeSSA pass
;
2748 return pass
.run(prog
, false, true);
2753 } // namespace nv50_ir