2 * Copyright 2011 Christoph Bumiller
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_build_util.h"
26 #include "codegen/nv50_ir_target_nvc0.h"
27 #include "codegen/nv50_ir_lowering_nvc0.h"
39 #define QUADOP(q, r, s, t) \
40 ((QOP_##q << 6) | (QOP_##r << 4) | \
41 (QOP_##s << 2) | (QOP_##t << 0))
44 NVC0LegalizeSSA::handleDIV(Instruction
*i
)
46 FlowInstruction
*call
;
50 bld
.setPosition(i
, false);
51 def
[0] = bld
.mkMovToReg(0, i
->getSrc(0))->getDef(0);
52 def
[1] = bld
.mkMovToReg(1, i
->getSrc(1))->getDef(0);
54 case TYPE_U32
: builtin
= NVC0_BUILTIN_DIV_U32
; break;
55 case TYPE_S32
: builtin
= NVC0_BUILTIN_DIV_S32
; break;
59 call
= bld
.mkFlow(OP_CALL
, NULL
, CC_ALWAYS
, NULL
);
60 bld
.mkMov(i
->getDef(0), def
[(i
->op
== OP_DIV
) ? 0 : 1]);
61 bld
.mkClobber(FILE_GPR
, (i
->op
== OP_DIV
) ? 0xe : 0xd, 2);
62 bld
.mkClobber(FILE_PREDICATE
, (i
->dType
== TYPE_S32
) ? 0xf : 0x3, 0);
65 call
->absolute
= call
->builtin
= 1;
66 call
->target
.builtin
= builtin
;
67 delete_Instruction(prog
, i
);
71 NVC0LegalizeSSA::handleRCPRSQ(Instruction
*i
)
73 assert(i
->dType
== TYPE_F64
);
74 // There are instructions that will compute the high 32 bits of the 64-bit
75 // float. We will just stick 0 in the bottom 32 bits.
77 bld
.setPosition(i
, false);
79 // 1. Take the source and split it up.
80 Value
*src
[2], *dst
[2], *def
= i
->getDef(0);
81 bld
.mkSplit(src
, 4, i
->getSrc(0));
83 // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
84 dst
[0] = bld
.loadImm(NULL
, 0);
85 dst
[1] = bld
.getSSA();
87 // 3. The new version of the instruction takes the high 32 bits of the
88 // source and outputs the high 32 bits of the destination.
92 i
->subOp
= NV50_IR_SUBOP_RCPRSQ_64H
;
94 // 4. Recombine the two dst pieces back into the original destination.
95 bld
.setPosition(i
, true);
96 bld
.mkOp2(OP_MERGE
, TYPE_U64
, def
, dst
[0], dst
[1]);
100 NVC0LegalizeSSA::handleFTZ(Instruction
*i
)
102 // Only want to flush float inputs
103 assert(i
->sType
== TYPE_F32
);
105 // If we're already flushing denorms (and NaN's) to zero, no need for this.
109 // Only certain classes of operations can flush
110 OpClass cls
= prog
->getTarget()->getOpClass(i
->op
);
111 if (cls
!= OPCLASS_ARITH
&& cls
!= OPCLASS_COMPARE
&&
112 cls
!= OPCLASS_CONVERT
)
119 NVC0LegalizeSSA::visit(Function
*fn
)
121 bld
.setProgram(fn
->getProgram());
126 NVC0LegalizeSSA::visit(BasicBlock
*bb
)
129 for (Instruction
*i
= bb
->getEntry(); i
; i
= next
) {
131 if (i
->sType
== TYPE_F32
) {
132 if (prog
->getType() != Program::TYPE_COMPUTE
)
143 if (i
->dType
== TYPE_F64
)
153 NVC0LegalizePostRA::NVC0LegalizePostRA(const Program
*prog
)
156 needTexBar(prog
->getTarget()->getChipset() >= 0xe0)
161 NVC0LegalizePostRA::insnDominatedBy(const Instruction
*later
,
162 const Instruction
*early
) const
164 if (early
->bb
== later
->bb
)
165 return early
->serial
< later
->serial
;
166 return later
->bb
->dominatedBy(early
->bb
);
170 NVC0LegalizePostRA::addTexUse(std::list
<TexUse
> &uses
,
171 Instruction
*usei
, const Instruction
*texi
)
174 for (std::list
<TexUse
>::iterator it
= uses
.begin();
176 if (insnDominatedBy(usei
, it
->insn
)) {
180 if (insnDominatedBy(it
->insn
, usei
))
186 uses
.push_back(TexUse(usei
, texi
));
189 // While it might be tempting to use an algorithm that just looks at tex
190 // uses, not all texture results are guaranteed to be used on all paths. In
191 // the case where along some control flow path a texture result is never used,
192 // we might reuse that register for something else, creating a
193 // write-after-write hazard. So we have to manually look through all
194 // instructions looking for ones that reference the registers in question.
196 NVC0LegalizePostRA::findFirstUses(
197 Instruction
*texi
, std::list
<TexUse
> &uses
)
199 int minGPR
= texi
->def(0).rep()->reg
.data
.id
;
200 int maxGPR
= minGPR
+ texi
->def(0).rep()->reg
.size
/ 4 - 1;
202 unordered_set
<const BasicBlock
*> visited
;
203 findFirstUsesBB(minGPR
, maxGPR
, texi
->next
, texi
, uses
, visited
);
207 NVC0LegalizePostRA::findFirstUsesBB(
208 int minGPR
, int maxGPR
, Instruction
*start
,
209 const Instruction
*texi
, std::list
<TexUse
> &uses
,
210 unordered_set
<const BasicBlock
*> &visited
)
212 const BasicBlock
*bb
= start
->bb
;
214 // We don't process the whole bb the first time around. This is correct,
215 // however we might be in a loop and hit this BB again, and need to process
216 // the full thing. So only mark a bb as visited if we processed it from the beginning.
218 if (start
== bb
->getEntry()) {
219 if (visited
.find(bb
) != visited
.end())
224 for (Instruction
*insn
= start
; insn
!= bb
->getExit(); insn
= insn
->next
) {
228 for (int d
= 0; insn
->defExists(d
); ++d
) {
229 if (insn
->def(d
).getFile() != FILE_GPR
||
230 insn
->def(d
).rep()->reg
.data
.id
< minGPR
||
231 insn
->def(d
).rep()->reg
.data
.id
> maxGPR
)
233 addTexUse(uses
, insn
, texi
);
237 for (int s
= 0; insn
->srcExists(s
); ++s
) {
238 if (insn
->src(s
).getFile() != FILE_GPR
||
239 insn
->src(s
).rep()->reg
.data
.id
< minGPR
||
240 insn
->src(s
).rep()->reg
.data
.id
> maxGPR
)
242 addTexUse(uses
, insn
, texi
);
247 for (Graph::EdgeIterator ei
= bb
->cfg
.outgoing(); !ei
.end(); ei
.next()) {
248 findFirstUsesBB(minGPR
, maxGPR
, BasicBlock::get(ei
.getNode())->getEntry(),
249 texi
, uses
, visited
);
254 // This pass is a bit long and ugly and can probably be optimized.
256 // 1. obtain a list of TEXes and their outputs' first use(s)
257 // 2. calculate the barrier level of each first use (minimal number of TEXes,
258 // over all paths, between the TEX and the use in question)
259 // 3. for each barrier, if all paths from the source TEX to that barrier
260 // contain a barrier of lesser level, it can be culled
262 NVC0LegalizePostRA::insertTextureBarriers(Function
*fn
)
264 std::list
<TexUse
> *uses
;
265 std::vector
<Instruction
*> texes
;
266 std::vector
<int> bbFirstTex
;
267 std::vector
<int> bbFirstUse
;
268 std::vector
<int> texCounts
;
269 std::vector
<TexUse
> useVec
;
272 fn
->orderInstructions(insns
);
274 texCounts
.resize(fn
->allBBlocks
.getSize(), 0);
275 bbFirstTex
.resize(fn
->allBBlocks
.getSize(), insns
.getSize());
276 bbFirstUse
.resize(fn
->allBBlocks
.getSize(), insns
.getSize());
278 // tag BB CFG nodes by their id for later
279 for (ArrayList::Iterator i
= fn
->allBBlocks
.iterator(); !i
.end(); i
.next()) {
280 BasicBlock
*bb
= reinterpret_cast<BasicBlock
*>(i
.get());
282 bb
->cfg
.tag
= bb
->getId();
285 // gather the first uses for each TEX
286 for (int i
= 0; i
< insns
.getSize(); ++i
) {
287 Instruction
*tex
= reinterpret_cast<Instruction
*>(insns
.get(i
));
288 if (isTextureOp(tex
->op
)) {
289 texes
.push_back(tex
);
290 if (!texCounts
.at(tex
->bb
->getId()))
291 bbFirstTex
[tex
->bb
->getId()] = texes
.size() - 1;
292 texCounts
[tex
->bb
->getId()]++;
298 uses
= new std::list
<TexUse
>[texes
.size()];
301 for (size_t i
= 0; i
< texes
.size(); ++i
) {
302 findFirstUses(texes
[i
], uses
[i
]);
305 // determine the barrier level at each use
306 for (size_t i
= 0; i
< texes
.size(); ++i
) {
307 for (std::list
<TexUse
>::iterator u
= uses
[i
].begin(); u
!= uses
[i
].end();
309 BasicBlock
*tb
= texes
[i
]->bb
;
310 BasicBlock
*ub
= u
->insn
->bb
;
313 for (size_t j
= i
+ 1; j
< texes
.size() &&
314 texes
[j
]->bb
== tb
&& texes
[j
]->serial
< u
->insn
->serial
;
318 u
->level
= fn
->cfg
.findLightestPathWeight(&tb
->cfg
,
319 &ub
->cfg
, texCounts
);
321 WARN("Failed to find path TEX -> TEXBAR\n");
325 // this counted all TEXes in the origin block, correct that
326 u
->level
-= i
- bbFirstTex
.at(tb
->getId()) + 1 /* this TEX */;
327 // and did not count the TEXes in the destination block, add those
328 for (size_t j
= bbFirstTex
.at(ub
->getId()); j
< texes
.size() &&
329 texes
[j
]->bb
== ub
&& texes
[j
]->serial
< u
->insn
->serial
;
333 assert(u
->level
>= 0);
334 useVec
.push_back(*u
);
339 // insert the barriers
340 for (size_t i
= 0; i
< useVec
.size(); ++i
) {
341 Instruction
*prev
= useVec
[i
].insn
->prev
;
342 if (useVec
[i
].level
< 0)
344 if (prev
&& prev
->op
== OP_TEXBAR
) {
345 if (prev
->subOp
> useVec
[i
].level
)
346 prev
->subOp
= useVec
[i
].level
;
347 prev
->setSrc(prev
->srcCount(), useVec
[i
].tex
->getDef(0));
349 Instruction
*bar
= new_Instruction(func
, OP_TEXBAR
, TYPE_NONE
);
351 bar
->subOp
= useVec
[i
].level
;
352 // make use explicit to ease latency calculation
353 bar
->setSrc(bar
->srcCount(), useVec
[i
].tex
->getDef(0));
354 useVec
[i
].insn
->bb
->insertBefore(useVec
[i
].insn
, bar
);
358 if (fn
->getProgram()->optLevel
< 3)
361 std::vector
<Limits
> limitT
, limitB
, limitS
; // entry, exit, single
363 limitT
.resize(fn
->allBBlocks
.getSize(), Limits(0, 0));
364 limitB
.resize(fn
->allBBlocks
.getSize(), Limits(0, 0));
365 limitS
.resize(fn
->allBBlocks
.getSize());
367 // cull unneeded barriers (should do that earlier, but for simplicity)
368 IteratorRef bi
= fn
->cfg
.iteratorCFG();
369 // first calculate min/max outstanding TEXes for each BB
370 for (bi
->reset(); !bi
->end(); bi
->next()) {
371 Graph::Node
*n
= reinterpret_cast<Graph::Node
*>(bi
->get());
372 BasicBlock
*bb
= BasicBlock::get(n
);
374 int max
= std::numeric_limits
<int>::max();
375 for (Instruction
*i
= bb
->getFirst(); i
; i
= i
->next
) {
376 if (isTextureOp(i
->op
)) {
378 if (max
< std::numeric_limits
<int>::max())
381 if (i
->op
== OP_TEXBAR
) {
382 min
= MIN2(min
, i
->subOp
);
383 max
= MIN2(max
, i
->subOp
);
386 // limits when looking at an isolated block
387 limitS
[bb
->getId()].min
= min
;
388 limitS
[bb
->getId()].max
= max
;
390 // propagate the min/max values
391 for (unsigned int l
= 0; l
<= fn
->loopNestingBound
; ++l
) {
392 for (bi
->reset(); !bi
->end(); bi
->next()) {
393 Graph::Node
*n
= reinterpret_cast<Graph::Node
*>(bi
->get());
394 BasicBlock
*bb
= BasicBlock::get(n
);
395 const int bbId
= bb
->getId();
396 for (Graph::EdgeIterator ei
= n
->incident(); !ei
.end(); ei
.next()) {
397 BasicBlock
*in
= BasicBlock::get(ei
.getNode());
398 const int inId
= in
->getId();
399 limitT
[bbId
].min
= MAX2(limitT
[bbId
].min
, limitB
[inId
].min
);
400 limitT
[bbId
].max
= MAX2(limitT
[bbId
].max
, limitB
[inId
].max
);
402 // I just hope this is correct ...
403 if (limitS
[bbId
].max
== std::numeric_limits
<int>::max()) {
405 limitB
[bbId
].min
= limitT
[bbId
].min
+ limitS
[bbId
].min
;
406 limitB
[bbId
].max
= limitT
[bbId
].max
+ limitS
[bbId
].min
;
408 // block contained a barrier
409 limitB
[bbId
].min
= MIN2(limitS
[bbId
].max
,
410 limitT
[bbId
].min
+ limitS
[bbId
].min
);
411 limitB
[bbId
].max
= MIN2(limitS
[bbId
].max
,
412 limitT
[bbId
].max
+ limitS
[bbId
].min
);
416 // finally delete unnecessary barriers
417 for (bi
->reset(); !bi
->end(); bi
->next()) {
418 Graph::Node
*n
= reinterpret_cast<Graph::Node
*>(bi
->get());
419 BasicBlock
*bb
= BasicBlock::get(n
);
420 Instruction
*prev
= NULL
;
422 int max
= limitT
[bb
->getId()].max
;
423 for (Instruction
*i
= bb
->getFirst(); i
; i
= next
) {
425 if (i
->op
== OP_TEXBAR
) {
426 if (i
->subOp
>= max
) {
427 delete_Instruction(prog
, i
);
431 if (prev
&& prev
->op
== OP_TEXBAR
&& prev
->subOp
>= max
) {
432 delete_Instruction(prog
, prev
);
437 if (isTextureOp(i
->op
)) {
440 if (i
&& !i
->isNop())
448 NVC0LegalizePostRA::visit(Function
*fn
)
451 insertTextureBarriers(fn
);
453 rZero
= new_LValue(fn
, FILE_GPR
);
454 carry
= new_LValue(fn
, FILE_FLAGS
);
456 rZero
->reg
.data
.id
= prog
->getTarget()->getFileSize(FILE_GPR
);
457 carry
->reg
.data
.id
= 0;
463 NVC0LegalizePostRA::replaceZero(Instruction
*i
)
465 for (int s
= 0; i
->srcExists(s
); ++s
) {
466 if (s
== 2 && i
->op
== OP_SUCLAMP
)
468 ImmediateValue
*imm
= i
->getSrc(s
)->asImm();
469 if (imm
&& imm
->reg
.data
.u64
== 0)
474 // replace CONT with BRA for single unconditional continue
476 NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock
*bb
)
478 if (bb
->cfg
.incidentCount() != 2 || bb
->getEntry()->op
!= OP_PRECONT
)
480 Graph::EdgeIterator ei
= bb
->cfg
.incident();
481 if (ei
.getType() != Graph::Edge::BACK
)
483 if (ei
.getType() != Graph::Edge::BACK
)
485 BasicBlock
*contBB
= BasicBlock::get(ei
.getNode());
487 if (!contBB
->getExit() || contBB
->getExit()->op
!= OP_CONT
||
488 contBB
->getExit()->getPredicate())
490 contBB
->getExit()->op
= OP_BRA
;
491 bb
->remove(bb
->getEntry()); // delete PRECONT
494 assert(ei
.end() || ei
.getType() != Graph::Edge::BACK
);
498 // replace branches to join blocks with join ops
500 NVC0LegalizePostRA::propagateJoin(BasicBlock
*bb
)
502 if (bb
->getEntry()->op
!= OP_JOIN
|| bb
->getEntry()->asFlow()->limit
)
504 for (Graph::EdgeIterator ei
= bb
->cfg
.incident(); !ei
.end(); ei
.next()) {
505 BasicBlock
*in
= BasicBlock::get(ei
.getNode());
506 Instruction
*exit
= in
->getExit();
508 in
->insertTail(new FlowInstruction(func
, OP_JOIN
, bb
));
509 // there should always be a terminator instruction
510 WARN("inserted missing terminator in BB:%i\n", in
->getId());
512 if (exit
->op
== OP_BRA
) {
514 exit
->asFlow()->limit
= 1; // must-not-propagate marker
517 bb
->remove(bb
->getEntry());
521 NVC0LegalizePostRA::visit(BasicBlock
*bb
)
523 Instruction
*i
, *next
;
525 // remove pseudo operations and non-fixed no-ops, split 64 bit operations
526 for (i
= bb
->getFirst(); i
; i
= next
) {
528 if (i
->op
== OP_EMIT
|| i
->op
== OP_RESTART
) {
529 if (!i
->getDef(0)->refCount())
531 if (i
->src(0).getFile() == FILE_IMMEDIATE
)
532 i
->setSrc(0, rZero
); // initial value must be 0
538 if (i
->op
== OP_BAR
&& i
->subOp
== NV50_IR_SUBOP_BAR_SYNC
&&
539 prog
->getType() != Program::TYPE_COMPUTE
) {
540 // It seems like barriers are never required for tessellation since
541 // the warp size is 32, and there are always at most 32 tcs threads.
544 if (i
->op
== OP_LOAD
&& i
->subOp
== NV50_IR_SUBOP_LDC_IS
) {
545 int offset
= i
->src(0).get()->reg
.data
.offset
;
546 if (abs(offset
) > 0x10000)
547 i
->src(0).get()->reg
.fileIndex
+= offset
>> 16;
548 i
->src(0).get()->reg
.data
.offset
= (int)(short)offset
;
550 // TODO: Move this to before register allocation for operations that
551 // need the $c register !
552 if (typeSizeof(i
->dType
) == 8) {
554 hi
= BuildUtil::split64BitOpPostRA(func
, i
, rZero
, carry
);
559 if (i
->op
!= OP_MOV
&& i
->op
!= OP_PFETCH
)
566 if (!tryReplaceContWithBra(bb
))
572 NVC0LoweringPass::NVC0LoweringPass(Program
*prog
) : targ(prog
->getTarget())
574 bld
.setProgram(prog
);
579 NVC0LoweringPass::visit(Function
*fn
)
581 if (prog
->getType() == Program::TYPE_GEOMETRY
) {
582 assert(!strncmp(fn
->getName(), "MAIN", 4));
583 // TODO: when we generate actual functions pass this value along somehow
584 bld
.setPosition(BasicBlock::get(fn
->cfg
.getRoot()), false);
585 gpEmitAddress
= bld
.loadImm(NULL
, 0)->asLValue();
587 bld
.setPosition(BasicBlock::get(fn
->cfgExit
)->getExit(), false);
588 bld
.mkMovToReg(0, gpEmitAddress
);
595 NVC0LoweringPass::visit(BasicBlock
*bb
)
601 NVC0LoweringPass::loadTexHandle(Value
*ptr
, unsigned int slot
)
603 uint8_t b
= prog
->driver
->io
.auxCBSlot
;
604 uint32_t off
= prog
->driver
->io
.texBindBase
+ slot
* 4;
606 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U32
, off
), ptr
);
609 // move array source to first slot, convert to u16, add indirections
611 NVC0LoweringPass::handleTEX(TexInstruction
*i
)
613 const int dim
= i
->tex
.target
.getDim() + i
->tex
.target
.isCube();
614 const int arg
= i
->tex
.target
.getArgCount();
615 const int lyr
= arg
- (i
->tex
.target
.isMS() ? 2 : 1);
616 const int chipset
= prog
->getTarget()->getChipset();
618 /* Only normalize in the non-explicit derivatives case. For explicit
619 * derivatives, this is handled in handleManualTXD.
621 if (i
->tex
.target
.isCube() && i
->dPdx
[0].get() == NULL
) {
624 for (c
= 0; c
< 3; ++c
)
625 src
[c
] = bld
.mkOp1v(OP_ABS
, TYPE_F32
, bld
.getSSA(), i
->getSrc(c
));
626 val
= bld
.getScratch();
627 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[0], src
[1]);
628 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[2], val
);
629 bld
.mkOp1(OP_RCP
, TYPE_F32
, val
, val
);
630 for (c
= 0; c
< 3; ++c
) {
631 i
->setSrc(c
, bld
.mkOp2v(OP_MUL
, TYPE_F32
, bld
.getSSA(),
636 // Arguments to the TEX instruction are a little insane. Even though the
637 // encoding is identical between SM20 and SM30, the arguments mean
638 // different things between Fermi and Kepler+. A lot of arguments are
639 // optional based on flags passed to the instruction. This summarizes the
649 // - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
650 // - other: 4 bits each, single reg
654 // array (+ offsets for txd in upper 16 bits)
659 // offsets (same as fermi, except txd which takes it with array)
676 if (chipset
>= NVISA_GK104_CHIPSET
) {
677 if (i
->tex
.rIndirectSrc
>= 0 || i
->tex
.sIndirectSrc
>= 0) {
678 // XXX this ignores tsc, and assumes a 1:1 mapping
679 assert(i
->tex
.rIndirectSrc
>= 0);
680 Value
*hnd
= loadTexHandle(
681 bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(),
682 i
->getIndirectR(), bld
.mkImm(2)),
686 i
->setIndirectR(hnd
);
687 i
->setIndirectS(NULL
);
688 } else if (i
->tex
.r
== i
->tex
.s
|| i
->op
== OP_TXF
) {
689 i
->tex
.r
+= prog
->driver
->io
.texBindBase
/ 4;
690 i
->tex
.s
= 0; // only a single cX[] value possible here
692 Value
*hnd
= bld
.getScratch();
693 Value
*rHnd
= loadTexHandle(NULL
, i
->tex
.r
);
694 Value
*sHnd
= loadTexHandle(NULL
, i
->tex
.s
);
696 bld
.mkOp3(OP_INSBF
, TYPE_U32
, hnd
, rHnd
, bld
.mkImm(0x1400), sHnd
);
698 i
->tex
.r
= 0; // not used for indirect tex
700 i
->setIndirectR(hnd
);
702 if (i
->tex
.target
.isArray()) {
703 LValue
*layer
= new_LValue(func
, FILE_GPR
);
704 Value
*src
= i
->getSrc(lyr
);
705 const int sat
= (i
->op
== OP_TXF
) ? 1 : 0;
706 DataType sTy
= (i
->op
== OP_TXF
) ? TYPE_U32
: TYPE_F32
;
707 bld
.mkCvt(OP_CVT
, TYPE_U16
, layer
, sTy
, src
)->saturate
= sat
;
708 if (i
->op
!= OP_TXD
|| chipset
< NVISA_GM107_CHIPSET
) {
709 for (int s
= dim
; s
>= 1; --s
)
710 i
->setSrc(s
, i
->getSrc(s
- 1));
713 i
->setSrc(dim
, layer
);
716 // Move the indirect reference to the first place
717 if (i
->tex
.rIndirectSrc
>= 0 && (
718 i
->op
== OP_TXD
|| chipset
< NVISA_GM107_CHIPSET
)) {
719 Value
*hnd
= i
->getIndirectR();
721 i
->setIndirectR(NULL
);
722 i
->moveSources(0, 1);
724 i
->tex
.rIndirectSrc
= 0;
725 i
->tex
.sIndirectSrc
= -1;
728 // (nvc0) generate and move the tsc/tic/array source to the front
729 if (i
->tex
.target
.isArray() || i
->tex
.rIndirectSrc
>= 0 || i
->tex
.sIndirectSrc
>= 0) {
730 LValue
*src
= new_LValue(func
, FILE_GPR
); // 0xttxsaaaa
732 Value
*ticRel
= i
->getIndirectR();
733 Value
*tscRel
= i
->getIndirectS();
736 i
->setSrc(i
->tex
.rIndirectSrc
, NULL
);
738 ticRel
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getScratch(),
739 ticRel
, bld
.mkImm(i
->tex
.r
));
742 i
->setSrc(i
->tex
.sIndirectSrc
, NULL
);
744 tscRel
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getScratch(),
745 tscRel
, bld
.mkImm(i
->tex
.s
));
748 Value
*arrayIndex
= i
->tex
.target
.isArray() ? i
->getSrc(lyr
) : NULL
;
750 for (int s
= dim
; s
>= 1; --s
)
751 i
->setSrc(s
, i
->getSrc(s
- 1));
752 i
->setSrc(0, arrayIndex
);
754 i
->moveSources(0, 1);
758 int sat
= (i
->op
== OP_TXF
) ? 1 : 0;
759 DataType sTy
= (i
->op
== OP_TXF
) ? TYPE_U32
: TYPE_F32
;
760 bld
.mkCvt(OP_CVT
, TYPE_U16
, src
, sTy
, arrayIndex
)->saturate
= sat
;
766 bld
.mkOp3(OP_INSBF
, TYPE_U32
, src
, ticRel
, bld
.mkImm(0x0917), src
);
768 bld
.mkOp3(OP_INSBF
, TYPE_U32
, src
, tscRel
, bld
.mkImm(0x0710), src
);
773 // For nvc0, the sample id has to be in the second operand, as the offset
774 // does. Right now we don't know how to pass both in, and this case can't
775 // happen with OpenGL. On nve0, the sample id is part of the texture
776 // coordinate argument.
777 assert(chipset
>= NVISA_GK104_CHIPSET
||
778 !i
->tex
.useOffsets
|| !i
->tex
.target
.isMS());
780 // offset is between lod and dc
781 if (i
->tex
.useOffsets
) {
783 int s
= i
->srcCount(0xff, true);
784 if (i
->op
!= OP_TXD
|| chipset
< NVISA_GK104_CHIPSET
) {
785 if (i
->tex
.target
.isShadow())
787 if (i
->srcExists(s
)) // move potential predicate out of the way
788 i
->moveSources(s
, 1);
789 if (i
->tex
.useOffsets
== 4 && i
->srcExists(s
+ 1))
790 i
->moveSources(s
+ 1, 1);
792 if (i
->op
== OP_TXG
) {
793 // Either there is 1 offset, which goes into the 2 low bytes of the
794 // first source, or there are 4 offsets, which go into 2 sources (8
795 // values, 1 byte each).
796 Value
*offs
[2] = {NULL
, NULL
};
797 for (n
= 0; n
< i
->tex
.useOffsets
; n
++) {
798 for (c
= 0; c
< 2; ++c
) {
799 if ((n
% 2) == 0 && c
== 0)
800 offs
[n
/ 2] = i
->offset
[n
][c
].get();
802 bld
.mkOp3(OP_INSBF
, TYPE_U32
,
804 i
->offset
[n
][c
].get(),
805 bld
.mkImm(0x800 | ((n
* 16 + c
* 8) % 32)),
809 i
->setSrc(s
, offs
[0]);
811 i
->setSrc(s
+ 1, offs
[1]);
814 assert(i
->tex
.useOffsets
== 1);
815 for (c
= 0; c
< 3; ++c
) {
817 if (!i
->offset
[0][c
].getImmediate(val
))
818 assert(!"non-immediate offset passed to non-TXG");
819 imm
|= (val
.reg
.data
.u32
& 0xf) << (c
* 4);
821 if (i
->op
== OP_TXD
&& chipset
>= NVISA_GK104_CHIPSET
) {
822 // The offset goes into the upper 16 bits of the array index. So
823 // create it if it's not already there, and INSBF it if it already is.
825 s
= (i
->tex
.rIndirectSrc
>= 0) ? 1 : 0;
826 if (chipset
>= NVISA_GM107_CHIPSET
)
828 if (i
->tex
.target
.isArray()) {
829 bld
.mkOp3(OP_INSBF
, TYPE_U32
, i
->getSrc(s
),
830 bld
.loadImm(NULL
, imm
), bld
.mkImm(0xc10),
833 i
->moveSources(s
, 1);
834 i
->setSrc(s
, bld
.loadImm(NULL
, imm
<< 16));
837 i
->setSrc(s
, bld
.loadImm(NULL
, imm
));
842 if (chipset
>= NVISA_GK104_CHIPSET
) {
844 // If TEX requires more than 4 sources, the 2nd register tuple must be
845 // aligned to 4, even if it consists of just a single 4-byte register.
847 // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
849 int s
= i
->srcCount(0xff, true);
850 if (s
> 4 && s
< 7) {
851 if (i
->srcExists(s
)) // move potential predicate out of the way
852 i
->moveSources(s
, 7 - s
);
854 i
->setSrc(s
++, bld
.loadImm(NULL
, 0));
862 NVC0LoweringPass::handleManualTXD(TexInstruction
*i
)
864 static const uint8_t qOps
[4][2] =
866 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l0
867 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l1
868 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l2
869 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l3
874 Value
*zero
= bld
.loadImm(bld
.getSSA(), 0);
876 const int dim
= i
->tex
.target
.getDim() + i
->tex
.target
.isCube();
877 const int array
= i
->tex
.target
.isArray();
879 i
->op
= OP_TEX
; // no need to clone dPdx/dPdy later
881 for (c
= 0; c
< dim
; ++c
)
882 crd
[c
] = bld
.getScratch();
884 bld
.mkOp(OP_QUADON
, TYPE_NONE
, NULL
);
885 for (l
= 0; l
< 4; ++l
) {
887 // mov coordinates from lane l to all lanes
888 for (c
= 0; c
< dim
; ++c
)
889 bld
.mkQuadop(0x00, crd
[c
], l
, i
->getSrc(c
+ array
), zero
);
890 // add dPdx from lane l to lanes dx
891 for (c
= 0; c
< dim
; ++c
)
892 bld
.mkQuadop(qOps
[l
][0], crd
[c
], l
, i
->dPdx
[c
].get(), crd
[c
]);
893 // add dPdy from lane l to lanes dy
894 for (c
= 0; c
< dim
; ++c
)
895 bld
.mkQuadop(qOps
[l
][1], crd
[c
], l
, i
->dPdy
[c
].get(), crd
[c
]);
896 // normalize cube coordinates
897 if (i
->tex
.target
.isCube()) {
898 for (c
= 0; c
< 3; ++c
)
899 src
[c
] = bld
.mkOp1v(OP_ABS
, TYPE_F32
, bld
.getSSA(), crd
[c
]);
900 val
= bld
.getScratch();
901 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[0], src
[1]);
902 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[2], val
);
903 bld
.mkOp1(OP_RCP
, TYPE_F32
, val
, val
);
904 for (c
= 0; c
< 3; ++c
)
905 src
[c
] = bld
.mkOp2v(OP_MUL
, TYPE_F32
, bld
.getSSA(), crd
[c
], val
);
907 for (c
= 0; c
< dim
; ++c
)
911 bld
.insert(tex
= cloneForward(func
, i
));
912 for (c
= 0; c
< dim
; ++c
)
913 tex
->setSrc(c
+ array
, src
[c
]);
915 for (c
= 0; i
->defExists(c
); ++c
) {
917 def
[c
][l
] = bld
.getSSA();
918 mov
= bld
.mkMov(def
[c
][l
], tex
->getDef(c
));
923 bld
.mkOp(OP_QUADPOP
, TYPE_NONE
, NULL
);
925 for (c
= 0; i
->defExists(c
); ++c
) {
926 Instruction
*u
= bld
.mkOp(OP_UNION
, TYPE_U32
, i
->getDef(c
));
927 for (l
= 0; l
< 4; ++l
)
928 u
->setSrc(l
, def
[c
][l
]);
936 NVC0LoweringPass::handleTXD(TexInstruction
*txd
)
938 int dim
= txd
->tex
.target
.getDim() + txd
->tex
.target
.isCube();
939 unsigned arg
= txd
->tex
.target
.getArgCount();
940 unsigned expected_args
= arg
;
941 const int chipset
= prog
->getTarget()->getChipset();
943 if (chipset
>= NVISA_GK104_CHIPSET
) {
944 if (!txd
->tex
.target
.isArray() && txd
->tex
.useOffsets
)
946 if (txd
->tex
.rIndirectSrc
>= 0 || txd
->tex
.sIndirectSrc
>= 0)
949 if (txd
->tex
.useOffsets
)
951 if (!txd
->tex
.target
.isArray() && (
952 txd
->tex
.rIndirectSrc
>= 0 || txd
->tex
.sIndirectSrc
>= 0))
956 if (expected_args
> 4 ||
958 txd
->tex
.target
.isShadow())
962 while (txd
->srcExists(arg
))
965 txd
->tex
.derivAll
= true;
966 if (txd
->op
== OP_TEX
)
967 return handleManualTXD(txd
);
969 assert(arg
== expected_args
);
970 for (int c
= 0; c
< dim
; ++c
) {
971 txd
->setSrc(arg
+ c
* 2 + 0, txd
->dPdx
[c
]);
972 txd
->setSrc(arg
+ c
* 2 + 1, txd
->dPdy
[c
]);
973 txd
->dPdx
[c
].set(NULL
);
974 txd
->dPdy
[c
].set(NULL
);
980 NVC0LoweringPass::handleTXQ(TexInstruction
*txq
)
982 const int chipset
= prog
->getTarget()->getChipset();
983 if (chipset
>= NVISA_GK104_CHIPSET
&& txq
->tex
.rIndirectSrc
< 0)
984 txq
->tex
.r
+= prog
->driver
->io
.texBindBase
/ 4;
986 if (txq
->tex
.rIndirectSrc
< 0)
989 Value
*ticRel
= txq
->getIndirectR();
991 txq
->setIndirectS(NULL
);
992 txq
->tex
.sIndirectSrc
= -1;
996 if (chipset
< NVISA_GK104_CHIPSET
) {
997 LValue
*src
= new_LValue(func
, FILE_GPR
); // 0xttxsaaaa
999 txq
->setSrc(txq
->tex
.rIndirectSrc
, NULL
);
1001 ticRel
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getScratch(),
1002 ticRel
, bld
.mkImm(txq
->tex
.r
));
1004 bld
.mkOp2(OP_SHL
, TYPE_U32
, src
, ticRel
, bld
.mkImm(0x17));
1006 txq
->moveSources(0, 1);
1007 txq
->setSrc(0, src
);
1009 Value
*hnd
= loadTexHandle(
1010 bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(),
1011 txq
->getIndirectR(), bld
.mkImm(2)),
1016 txq
->setIndirectR(NULL
);
1017 txq
->moveSources(0, 1);
1018 txq
->setSrc(0, hnd
);
1019 txq
->tex
.rIndirectSrc
= 0;
1026 NVC0LoweringPass::handleTXLQ(TexInstruction
*i
)
1028 /* The outputs are inverted compared to what the TGSI instruction
1029 * expects. Take that into account in the mask.
1031 assert((i
->tex
.mask
& ~3) == 0);
1032 if (i
->tex
.mask
== 1)
1034 else if (i
->tex
.mask
== 2)
1037 bld
.setPosition(i
, true);
1039 /* The returned values are not quite what we want:
1040 * (a) convert from s16/u16 to f32
1041 * (b) multiply by 1/256
1043 for (int def
= 0; def
< 2; ++def
) {
1044 if (!i
->defExists(def
))
1046 enum DataType type
= TYPE_S16
;
1047 if (i
->tex
.mask
== 2 || def
> 0)
1049 bld
.mkCvt(OP_CVT
, TYPE_F32
, i
->getDef(def
), type
, i
->getDef(def
));
1050 bld
.mkOp2(OP_MUL
, TYPE_F32
, i
->getDef(def
),
1051 i
->getDef(def
), bld
.loadImm(NULL
, 1.0f
/ 256));
1053 if (i
->tex
.mask
== 3) {
1054 LValue
*t
= new_LValue(func
, FILE_GPR
);
1055 bld
.mkMov(t
, i
->getDef(0));
1056 bld
.mkMov(i
->getDef(0), i
->getDef(1));
1057 bld
.mkMov(i
->getDef(1), t
);
1063 NVC0LoweringPass::handleSUQ(Instruction
*suq
)
1066 suq
->setSrc(0, loadResLength32(suq
->getIndirect(0, 1),
1067 suq
->getSrc(0)->reg
.fileIndex
* 16));
1068 suq
->setIndirect(0, 0, NULL
);
1069 suq
->setIndirect(0, 1, NULL
);
1074 NVC0LoweringPass::handleSharedATOM(Instruction
*atom
)
1076 assert(atom
->src(0).getFile() == FILE_MEMORY_SHARED
);
1078 BasicBlock
*currBB
= atom
->bb
;
1079 BasicBlock
*tryLockAndSetBB
= atom
->bb
->splitBefore(atom
, false);
1080 BasicBlock
*joinBB
= atom
->bb
->splitAfter(atom
);
1082 bld
.setPosition(currBB
, true);
1083 assert(!currBB
->joinAt
);
1084 currBB
->joinAt
= bld
.mkFlow(OP_JOINAT
, joinBB
, CC_ALWAYS
, NULL
);
1086 bld
.mkFlow(OP_BRA
, tryLockAndSetBB
, CC_ALWAYS
, NULL
);
1087 currBB
->cfg
.attach(&tryLockAndSetBB
->cfg
, Graph::Edge::TREE
);
1089 bld
.setPosition(tryLockAndSetBB
, true);
1092 bld
.mkLoad(TYPE_U32
, atom
->getDef(0),
1093 bld
.mkSymbol(FILE_MEMORY_SHARED
, 0, TYPE_U32
, 0), NULL
);
1094 ld
->setDef(1, bld
.getSSA(1, FILE_PREDICATE
));
1095 ld
->subOp
= NV50_IR_SUBOP_LOAD_LOCKED
;
1098 if (atom
->subOp
== NV50_IR_SUBOP_ATOM_EXCH
) {
1099 // Read the old value, and write the new one.
1100 stVal
= atom
->getSrc(1);
1101 } else if (atom
->subOp
== NV50_IR_SUBOP_ATOM_CAS
) {
1102 CmpInstruction
*set
=
1103 bld
.mkCmp(OP_SET
, CC_EQ
, TYPE_U32
, bld
.getSSA(1, FILE_PREDICATE
),
1104 TYPE_U32
, ld
->getDef(0), atom
->getSrc(1));
1105 set
->setPredicate(CC_P
, ld
->getDef(1));
1108 bld
.mkOp3(OP_SELP
, TYPE_U32
, bld
.getSSA(), ld
->getDef(0),
1109 atom
->getSrc(2), set
->getDef(0));
1110 selp
->src(2).mod
= Modifier(NV50_IR_MOD_NOT
);
1111 selp
->setPredicate(CC_P
, ld
->getDef(1));
1113 stVal
= selp
->getDef(0);
1117 switch (atom
->subOp
) {
1118 case NV50_IR_SUBOP_ATOM_ADD
:
1121 case NV50_IR_SUBOP_ATOM_AND
:
1124 case NV50_IR_SUBOP_ATOM_OR
:
1127 case NV50_IR_SUBOP_ATOM_XOR
:
1130 case NV50_IR_SUBOP_ATOM_MIN
:
1133 case NV50_IR_SUBOP_ATOM_MAX
:
1142 bld
.mkOp2(op
, atom
->dType
, bld
.getSSA(), ld
->getDef(0),
1144 i
->setPredicate(CC_P
, ld
->getDef(1));
1146 stVal
= i
->getDef(0);
1150 bld
.mkStore(OP_STORE
, TYPE_U32
,
1151 bld
.mkSymbol(FILE_MEMORY_SHARED
, 0, TYPE_U32
, 0),
1153 st
->setPredicate(CC_P
, ld
->getDef(1));
1154 st
->subOp
= NV50_IR_SUBOP_STORE_UNLOCKED
;
1156 // Loop until the lock is acquired.
1157 bld
.mkFlow(OP_BRA
, tryLockAndSetBB
, CC_NOT_P
, ld
->getDef(1));
1158 tryLockAndSetBB
->cfg
.attach(&tryLockAndSetBB
->cfg
, Graph::Edge::BACK
);
1159 tryLockAndSetBB
->cfg
.attach(&joinBB
->cfg
, Graph::Edge::CROSS
);
1160 bld
.mkFlow(OP_BRA
, joinBB
, CC_ALWAYS
, NULL
);
1164 bld
.setPosition(joinBB
, false);
1165 bld
.mkFlow(OP_JOIN
, NULL
, CC_ALWAYS
, NULL
)->fixed
= 1;
// Lowers the address operand of an atomic instruction, dispatching on the
// memory file of source 0:
//  - FILE_MEMORY_SHARED is handed over to handleSharedATOM().
//  - FILE_MEMORY_GLOBAL (asserted) gets a 64-bit base from loadResInfo64();
//    ptr is added to it and the result becomes indirect 0 of source 0.
//  - The trailing OP_RDSV sequence builds a 32-bit base, retargets a shallow
//    clone of source 0 to FILE_MEMORY_GLOBAL and clears indirect 1.
//    NOTE(review): presumably this is the FILE_MEMORY_LOCAL path - confirm.
1169 NVC0LoweringPass::handleATOM(Instruction
*atom
)
// ptr / ind: the two indirect values currently attached to source 0.
1172 Value
*ptr
= atom
->getIndirect(0, 0), *ind
= atom
->getIndirect(0, 1), *base
;
1174 switch (atom
->src(0).getFile()) {
1175 case FILE_MEMORY_LOCAL
:
1178 case FILE_MEMORY_SHARED
:
1179 handleSharedATOM(atom
);
1182 assert(atom
->src(0).getFile() == FILE_MEMORY_GLOBAL
);
// The resource info offset passed to loadResInfo64() is fileIndex * 16.
1183 base
= loadResInfo64(ind
, atom
->getSrc(0)->reg
.fileIndex
* 16);
1184 assert(base
->reg
.size
== 8);
1186 base
= bld
.mkOp2v(OP_ADD
, TYPE_U64
, base
, base
, ptr
);
1187 assert(base
->reg
.size
== 8);
1188 atom
->setIndirect(0, 0, base
);
1192 bld
.mkOp1v(OP_RDSV
, TYPE_U32
, bld
.getScratch(), bld
.mkSysVal(sv
, 0));
// Clone the symbol before changing its file so that only this instruction's
// source 0 is modified.
1194 atom
->setSrc(0, cloneShallow(func
, atom
->getSrc(0)));
1195 atom
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
1197 base
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, base
, base
, ptr
);
1198 atom
->setIndirect(0, 1, NULL
);
1199 atom
->setIndirect(0, 0, base
);
// Prepares an ATOM_CAS / ATOM_EXCH instruction:
//  - shared-memory atomics are not processed here (see handleSharedATOM());
//  - sub-operations other than CAS and EXCH are filtered out;
//  - an OP_CCTL with subOp CCTL_IV is built on the same address, inheriting
//    the predicate of cas (NOTE(review): presumably only when needCctl is
//    set - confirm);
//  - for CAS, sources 1 and 2 are merged into one 64-bit value which is then
//    used for both source 1 and source 2.
1205 NVC0LoweringPass::handleCasExch(Instruction
*cas
, bool needCctl
)
1207 if (cas
->src(0).getFile() == FILE_MEMORY_SHARED
) {
1208 // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
1212 if (cas
->subOp
!= NV50_IR_SUBOP_ATOM_CAS
&&
1213 cas
->subOp
!= NV50_IR_SUBOP_ATOM_EXCH
)
// Insert the following instructions after cas.
1215 bld
.setPosition(cas
, true);
1218 Instruction
*cctl
= bld
.mkOp1(OP_CCTL
, TYPE_NONE
, NULL
, cas
->getSrc(0));
1219 cctl
->setIndirect(0, 0, cas
->getIndirect(0, 0));
1221 cctl
->subOp
= NV50_IR_SUBOP_CCTL_IV
;
1222 if (cas
->isPredicated())
1223 cctl
->setPredicate(cas
->cc
, cas
->getPredicate());
1226 if (cas
->subOp
== NV50_IR_SUBOP_ATOM_CAS
) {
1227 // CAS is crazy. Its 2nd source is a double reg, and the 3rd source
1228 // should be set to the high part of the double reg or bad things will
1229 // happen elsewhere in the universe.
1230 // Also, it sometimes returns the new value instead of the old one
1231 // under mysterious circumstances.
1232 Value
*dreg
= bld
.getSSA(8);
// The merge has to be placed before cas, since cas consumes dreg.
1233 bld
.setPosition(cas
, false);
1234 bld
.mkOp2(OP_MERGE
, TYPE_U64
, dreg
, cas
->getSrc(1), cas
->getSrc(2));
1235 cas
->setSrc(1, dreg
);
1236 cas
->setSrc(2, dreg
);
// Builds a TYPE_U32 load (mkLoadv) from constant buffer slot io.auxCBSlot at
// offset io.suInfoBase + off; ptr is forwarded unchanged to mkLoadv().
1243 NVC0LoweringPass::loadResInfo32(Value
*ptr
, uint32_t off
)
1245 uint8_t b
= prog
->driver
->io
.auxCBSlot
;
1246 off
+= prog
->driver
->io
.suInfoBase
;
1248 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U32
, off
), ptr
);
// 64-bit counterpart of loadResInfo32(): builds a TYPE_U64 load from constant
// buffer slot io.auxCBSlot at offset io.suInfoBase + off. ptr is shifted left
// by 4 (i.e. multiplied by 16) before being forwarded to mkLoadv().
1252 NVC0LoweringPass::loadResInfo64(Value
*ptr
, uint32_t off
)
1254 uint8_t b
= prog
->driver
->io
.auxCBSlot
;
1255 off
+= prog
->driver
->io
.suInfoBase
;
// Scale the index: ptr <<= 4.
1258 ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getScratch(), ptr
, bld
.mkImm(4));
1261 mkLoadv(TYPE_U64
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U64
, off
), ptr
);
// Builds a TYPE_U32 load of the word located 8 bytes past the entry read by
// loadResInfo64(): constant buffer slot io.auxCBSlot, offset
// io.suInfoBase + off + 8. ptr is shifted left by 4 before use.
// NOTE(review): the symbol is created with TYPE_U64 while the load itself is
// TYPE_U32 - confirm that this mismatch is intentional.
1265 NVC0LoweringPass::loadResLength32(Value
*ptr
, uint32_t off
)
1267 uint8_t b
= prog
->driver
->io
.auxCBSlot
;
1268 off
+= prog
->driver
->io
.suInfoBase
;
// Scale the index: ptr <<= 4.
1271 ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getScratch(), ptr
, bld
.mkImm(4));
1274 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U64
, off
+ 8), ptr
);
// Builds a TYPE_U32 load from constant buffer slot io.msInfoCBSlot at offset
// io.msInfoBase + off; ptr is forwarded unchanged to mkLoadv().
1278 NVC0LoweringPass::loadMsInfo32(Value
*ptr
, uint32_t off
)
1280 uint8_t b
= prog
->driver
->io
.msInfoCBSlot
;
1281 off
+= prog
->driver
->io
.msInfoBase
;
1283 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U32
, off
), ptr
);
1286 /* On nvc0, surface info is obtained via the surface binding points passed
1287 * to the SULD/SUST instructions.
1288 * On nve4, surface info is stored in c[] and is used by various special
1289 * instructions, e.g. for clamping coordinates or generating an address.
1290 * They couldn't just have added an equivalent to TIC now, could they ?
// Offsets of the individual 32-bit fields inside one surface info entry.
// They are added to (surface index * NVE4_SU_INFO__STRIDE) and passed to
// loadResInfo32().
1292 #define NVE4_SU_INFO_ADDR 0x00
1293 #define NVE4_SU_INFO_FMT 0x04
1294 #define NVE4_SU_INFO_DIM_X 0x08
1295 #define NVE4_SU_INFO_PITCH 0x0c
1296 #define NVE4_SU_INFO_DIM_Y 0x10
1297 #define NVE4_SU_INFO_ARRAY 0x14
1298 #define NVE4_SU_INFO_DIM_Z 0x18
1299 #define NVE4_SU_INFO_UNK1C 0x1c
1300 #define NVE4_SU_INFO_WIDTH 0x20
1301 #define NVE4_SU_INFO_HEIGHT 0x24
1302 #define NVE4_SU_INFO_DEPTH 0x28
1303 #define NVE4_SU_INFO_TARGET 0x2c
1304 #define NVE4_SU_INFO_CALL 0x30
1305 #define NVE4_SU_INFO_RAW_X 0x34
1306 #define NVE4_SU_INFO_MS_X 0x38
1307 #define NVE4_SU_INFO_MS_Y 0x3c
// Size of one surface info entry.
1309 #define NVE4_SU_INFO__STRIDE 0x40
// Indexed accessors: DIM(0..2) -> DIM_X/DIM_Y/DIM_Z (8 bytes apart),
// SIZE(0..2) -> WIDTH/HEIGHT/DEPTH, MS(0..1) -> MS_X/MS_Y.
1311 #define NVE4_SU_INFO_DIM(i) (0x08 + (i) * 8)
1312 #define NVE4_SU_INFO_SIZE(i) (0x20 + (i) * 4)
1313 #define NVE4_SU_INFO_MS(i) (0x38 + (i) * 4)
// Returns the SUCLAMP sub-operation to use for coordinate index c of the
// surface instruction su, selected by the texture target. The coordinate
// index only matters for TEX_TARGET_1D_ARRAY, where c == 1 selects the PL
// variant and every other index the SD variant.
1315 static inline uint16_t getSuClampSubOp(const TexInstruction
*su
, int c
)
1317 switch (su
->tex
.target
.getEnum()) {
1318 case TEX_TARGET_BUFFER
: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
1319 case TEX_TARGET_RECT
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1320 case TEX_TARGET_1D
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1321 case TEX_TARGET_1D_ARRAY
: return (c
== 1) ?
1322 NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
1323 NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1324 case TEX_TARGET_2D
: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1325 case TEX_TARGET_2D_MS
: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1326 case TEX_TARGET_2D_ARRAY
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1327 case TEX_TARGET_2D_MS_ARRAY
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1328 case TEX_TARGET_3D
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1329 case TEX_TARGET_CUBE
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1330 case TEX_TARGET_CUBE_ARRAY
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
// Converts the coordinates of a multisample surface access into coordinates
// of the equivalent non-multisample target:
//  - TEX_TARGET_2D_MS becomes TEX_TARGET_2D, TEX_TARGET_2D_MS_ARRAY becomes
//    TEX_TARGET_2D_ARRAY;
//  - x and y are shifted left by the surface's MS_X / MS_Y info values;
//  - the sample index (last argument) is masked with 0x7 and scaled by 8 to
//    address the ms info table, from which dx / dy are loaded and added.
// Finally the sources starting at index arg are moved down by one.
1338 NVC0LoweringPass::adjustCoordinatesMS(TexInstruction
*tex
)
1340 const uint16_t base
= tex
->tex
.r
* NVE4_SU_INFO__STRIDE
;
// Argument count is taken before the target is rewritten below.
1341 const int arg
= tex
->tex
.target
.getArgCount();
1343 if (tex
->tex
.target
== TEX_TARGET_2D_MS
)
1344 tex
->tex
.target
= TEX_TARGET_2D
;
1346 if (tex
->tex
.target
== TEX_TARGET_2D_MS_ARRAY
)
1347 tex
->tex
.target
= TEX_TARGET_2D_ARRAY
;
1351 Value
*x
= tex
->getSrc(0);
1352 Value
*y
= tex
->getSrc(1);
// The sample index is the last coordinate argument.
1353 Value
*s
= tex
->getSrc(arg
- 1);
1355 Value
*tx
= bld
.getSSA(), *ty
= bld
.getSSA(), *ts
= bld
.getSSA();
1357 Value
*ms_x
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_MS(0));
1358 Value
*ms_y
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_MS(1));
1360 bld
.mkOp2(OP_SHL
, TYPE_U32
, tx
, x
, ms_x
);
1361 bld
.mkOp2(OP_SHL
, TYPE_U32
, ty
, y
, ms_y
);
// ts = (s & 7) << 3: offset of the sample's entry, 8 bytes per sample.
1363 s
= bld
.mkOp2v(OP_AND
, TYPE_U32
, ts
, s
, bld
.loadImm(NULL
, 0x7));
1364 s
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, ts
, ts
, bld
.mkImm(3));
// Each entry holds dx at +0 and dy at +4.
1366 Value
*dx
= loadMsInfo32(ts
, 0x0);
1367 Value
*dy
= loadMsInfo32(ts
, 0x4);
1369 bld
.mkOp2(OP_ADD
, TYPE_U32
, tx
, tx
, dx
);
1370 bld
.mkOp2(OP_ADD
, TYPE_U32
, ty
, ty
, dy
);
1374 tex
->moveSources(arg
, -1);
1377 // Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
1378 // They're computed from the coordinates using the surface info in c[] space.
// Outline of the steps below:
//  1. adjust multisample coordinates (adjustCoordinatesMS),
//  2. clamp every coordinate with OP_SUCLAMP against the surface info,
//  3. compute the pixel offset (off) with OP_AND / OP_MADSP,
//  4. compute the bitfield part (bf) with OP_VSHL or OP_SUBFM,
//  5. compute the upper address part (eau) with OP_SUEAU, adding the array
//     layer offset for array targets,
//  6. merge bf and eau into the 64-bit addr and install the new sources.
1380 NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction
*su
)
// atom: the instruction is a surface reduction (SUREDB / SUREDP).
1383 const bool atom
= su
->op
== OP_SUREDB
|| su
->op
== OP_SUREDP
;
1385 su
->op
== OP_SULDB
|| su
->op
== OP_SUSTB
|| su
->op
== OP_SUREDB
;
1386 const int idx
= su
->tex
.r
;
1387 const int dim
= su
->tex
.target
.getDim();
// arg: number of coordinate sources (dimensions plus the array layer, if any).
1388 const int arg
= dim
+ (su
->tex
.target
.isArray() ? 1 : 0);
// base: offset of this surface's info entry.
1389 const uint16_t base
= idx
* NVE4_SU_INFO__STRIDE
;
1391 Value
*zero
= bld
.mkImm(0);
1395 Value
*bf
, *eau
, *off
;
1398 off
= bld
.getScratch(4);
1399 bf
= bld
.getScratch(4);
1400 addr
= bld
.getSSA(8);
1401 pred
= bld
.getScratch(1, FILE_PREDICATE
);
// Everything generated here is inserted before su.
1403 bld
.setPosition(su
, false);
1405 adjustCoordinatesMS(su
);
1407 // calculate clamped coordinates
1408 for (c
= 0; c
< arg
; ++c
) {
1409 src
[c
] = bld
.getScratch();
1411 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_RAW_X
);
1413 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_DIM(c
));
1414 bld
.mkOp3(OP_SUCLAMP
, TYPE_S32
, src
[c
], su
->getSrc(c
), v
, zero
)
1415 ->subOp
= getSuClampSubOp(su
, c
);
1420 // set predicate output
1421 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1422 src
[0]->getInsn()->setFlagsDef(1, pred
);
1424 if (su
->tex
.target
.isArray()) {
// For arrays, the clamp of the layer coordinate (src[dim]) produces p1.
1425 p1
= bld
.getSSA(1, FILE_PREDICATE
);
1426 src
[dim
]->getInsn()->setFlagsDef(1, p1
);
1429 // calculate pixel offset
1431 if (su
->tex
.target
!= TEX_TARGET_BUFFER
)
1432 bld
.mkOp2(OP_AND
, TYPE_U32
, off
, src
[0], bld
.loadImm(NULL
, 0xffff));
1435 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_UNK1C
);
1436 bld
.mkOp3(OP_MADSP
, TYPE_U32
, off
, src
[2], v
, src
[1])
1437 ->subOp
= NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
1439 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_PITCH
);
1440 bld
.mkOp3(OP_MADSP
, TYPE_U32
, off
, off
, v
, src
[0])
1441 ->subOp
= NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
1444 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_PITCH
);
1445 bld
.mkOp3(OP_MADSP
, TYPE_U32
, off
, src
[1], v
, src
[0])
1446 ->subOp
= su
->tex
.target
.isArray() ?
1447 NV50_IR_SUBOP_MADSP_SD
: NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
1450 // calculate effective address part 1
1451 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1455 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_FMT
);
1456 bld
.mkOp3(OP_VSHL
, TYPE_U32
, bf
, src
[0], v
, zero
)
1457 ->subOp
= NV50_IR_SUBOP_V1(7,6,8|2);
1471 if (!su
->tex
.target
.isArray()) {
1472 z
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_UNK1C
);
1473 subOp
= NV50_IR_SUBOP_SUBFM_3D
;
1477 subOp
= NV50_IR_SUBOP_SUBFM_3D
;
// SUBFM writes bf and also defines the out-of-bounds predicate.
1481 insn
= bld
.mkOp3(OP_SUBFM
, TYPE_U32
, bf
, src
[0], y
, z
);
1482 insn
->subOp
= subOp
;
1483 insn
->setFlagsDef(1, pred
);
1487 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_ADDR
);
1489 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1492 eau
= bld
.mkOp3v(OP_SUEAU
, TYPE_U32
, bld
.getScratch(4), off
, bf
, v
);
1494 // add array layer offset
1495 if (su
->tex
.target
.isArray()) {
1496 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_ARRAY
);
1498 bld
.mkOp3(OP_MADSP
, TYPE_U32
, eau
, src
[1], v
, eau
)
1499 ->subOp
= NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
1501 bld
.mkOp3(OP_MADSP
, TYPE_U32
, eau
, v
, src
[2], eau
)
1502 ->subOp
= NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
1503 // combine predicates
1505 bld
.mkOp2(OP_OR
, TYPE_U8
, pred
, pred
, p1
);
1510 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1514 // bf == g[] address & 0xff
1515 // eau == g[] address >> 8
1516 bld
.mkOp3(OP_PERMT
, TYPE_U32
, bf
, lo
, bld
.loadImm(NULL
, 0x6540), eau
);
1517 bld
.mkOp3(OP_PERMT
, TYPE_U32
, eau
, zero
, bld
.loadImm(NULL
, 0x0007), eau
);
1519 if (su
->op
== OP_SULDP
&& su
->tex
.target
== TEX_TARGET_BUFFER
) {
1520 // Convert from u32 to u8 address format, which is what the library code
1521 // doing SULDP currently uses.
1522 // XXX: can SUEAU do this ?
1523 // XXX: does it matter that we don't mask high bytes in bf ?
1525 bld
.mkOp2(OP_SHR
, TYPE_U32
, off
, bf
, bld
.mkImm(8));
1526 bld
.mkOp2(OP_ADD
, TYPE_U32
, eau
, eau
, off
);
// addr = 64-bit value built from bf (low word) and eau (high word).
1529 bld
.mkOp2(OP_MERGE
, TYPE_U64
, addr
, bf
, eau
);
1531 if (atom
&& su
->tex
.target
== TEX_TARGET_BUFFER
)
1532 bld
.mkOp2(OP_ADD
, TYPE_U64
, addr
, addr
, off
);
1534 // let's just set it 0 for raw access and hope it works
1536 bld
.mkImm(0) : loadResInfo32(NULL
, base
+ NVE4_SU_INFO_FMT
);
1538 // get rid of old coordinate sources, make space for fmt info and predicate
1539 su
->moveSources(arg
, 3 - arg
);
1540 // set 64 bit address and 32-bit format sources
1541 su
->setSrc(0, addr
);
1543 su
->setSrc(2, pred
);
// Lowers a surface instruction. processSurfaceCoordsNVE4() first replaces the
// coordinate sources with address (src 0), format (src 1) and predicate
// (src 2). Then:
//  - OP_SULDP is replaced by an OP_CALL whose target is read from the
//    surface's NVE4_SU_INFO_CALL field, with arguments and results passed in
//    fixed registers;
//  - OP_SUREDB / OP_SUREDP are replaced by a predicated OP_ATOM on global
//    memory, which is then passed to handleCasExch();
//  - the last statement sets sType depending on whether the target is a
//    buffer.
// NOTE(review): su is passed to delete_Instruction() in the SULDP and SURED
// paths while later statements still read su; check the surrounding control
// flow to make sure su is not used after deletion.
1547 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction
*su
)
1549 processSurfaceCoordsNVE4(su
);
1551 // Who do we hate more ? The person who decided that nvc0's SULD doesn't
1552 // have to support conversion or the person who decided that, in OpenCL,
1553 // you don't have to specify the format here like you do in OpenGL ?
1555 if (su
->op
== OP_SULDP
) {
1556 // We don't patch shaders. Ever.
1557 // You get an indirect call to our library blob here.
1558 // But at least it's uniform.
1559 FlowInstruction
*call
;
1562 uint16_t base
= su
->tex
.r
* NVE4_SU_INFO__STRIDE
+ NVE4_SU_INFO_CALL
;
// Pin the call's operands to fixed register ids: GPRs 0-3 (32-bit),
// predicates 0-2 and a 64-bit GPR with id 4.
1564 for (int i
= 0; i
< 4; ++i
)
1565 (r
[i
] = bld
.getScratch(4, FILE_GPR
))->reg
.data
.id
= i
;
1566 for (int i
= 0; i
< 3; ++i
)
1567 (p
[i
] = bld
.getScratch(1, FILE_PREDICATE
))->reg
.data
.id
= i
;
1568 (r
[4] = bld
.getScratch(8, FILE_GPR
))->reg
.data
.id
= 4;
// p[1] / p[2] flag the CACHE_CA / CACHE_CG cache modes; p[0], r[4] and r[2]
// receive su's predicate, address and format sources.
1570 bld
.mkMov(p
[1], bld
.mkImm((su
->cache
== CACHE_CA
) ? 1 : 0), TYPE_U8
);
1571 bld
.mkMov(p
[2], bld
.mkImm((su
->cache
== CACHE_CG
) ? 1 : 0), TYPE_U8
);
1572 bld
.mkMov(p
[0], su
->getSrc(2), TYPE_U8
);
1573 bld
.mkMov(r
[4], su
->getSrc(0), TYPE_U64
);
1574 bld
.mkMov(r
[2], su
->getSrc(1), TYPE_U32
);
// The call inherits the condition code and predicate of su.
1576 call
= bld
.mkFlow(OP_CALL
, NULL
, su
->cc
, su
->getPredicate());
1580 call
->setSrc(0, bld
.mkSymbol(FILE_MEMORY_CONST
,
1581 prog
->driver
->io
.auxCBSlot
, TYPE_U32
,
1582 prog
->driver
->io
.suInfoBase
+ base
));
1583 call
->setSrc(1, r
[2]);
1584 call
->setSrc(2, r
[4]);
1585 for (int i
= 0; i
< 3; ++i
)
1586 call
->setSrc(3 + i
, p
[i
]);
// The four result registers are copied into su's original destinations.
1587 for (int i
= 0; i
< 4; ++i
) {
1588 call
->setDef(i
, r
[i
]);
1589 bld
.mkMov(su
->getDef(i
), r
[i
]);
1591 call
->setDef(4, p
[1]);
1592 delete_Instruction(bld
.getProgram(), su
);
1595 if (su
->op
== OP_SUREDB
|| su
->op
== OP_SUREDP
) {
1596 // FIXME: for out of bounds access, destination value will be undefined !
// By default the atomic executes when the predicate source is false.
1597 Value
*pred
= su
->getSrc(2);
1598 CondCode cc
= CC_NOT_P
;
// If su is itself predicated, both predicates are combined into a new one.
1599 if (su
->getPredicate()) {
1600 pred
= bld
.getScratch(1, FILE_PREDICATE
);
1602 if (cc
== CC_NOT_P
) {
1603 bld
.mkOp2(OP_OR
, TYPE_U8
, pred
, su
->getPredicate(), su
->getSrc(2));
1605 bld
.mkOp2(OP_AND
, TYPE_U8
, pred
, su
->getPredicate(), su
->getSrc(2));
1606 pred
->getInsn()->src(1).mod
= Modifier(NV50_IR_MOD_NOT
);
1609 Instruction
*red
= bld
.mkOp(OP_ATOM
, su
->dType
, su
->getDef(0));
1610 red
->subOp
= su
->subOp
;
1612 gMemBase
= bld
.mkSymbol(FILE_MEMORY_GLOBAL
, 0, TYPE_U32
, 0);
1613 red
->setSrc(0, gMemBase
);
1614 red
->setSrc(1, su
->getSrc(3));
// CAS carries one additional data operand.
1615 if (su
->subOp
== NV50_IR_SUBOP_ATOM_CAS
)
1616 red
->setSrc(2, su
->getSrc(4));
// The 64-bit address computed for su becomes the indirect address.
1617 red
->setIndirect(0, 0, su
->getSrc(0));
1618 red
->setPredicate(cc
, pred
);
1619 delete_Instruction(bld
.getProgram(), su
);
1620 handleCasExch(red
, true);
1622 su
->sType
= (su
->tex
.target
== TEX_TARGET_BUFFER
) ? TYPE_U32
: TYPE_U8
;
// Replaces a system value write by an OP_EXPORT store to the shader output
// address that the target reports for that system value, then removes the
// original instruction from its basic block.
1627 NVC0LoweringPass::handleWRSV(Instruction
*i
)
1633 // must replace, $sreg are not writeable
1634 addr
= targ
->getSVAddress(FILE_SHADER_OUTPUT
, i
->getSrc(0)->asSym());
1637 sym
= bld
.mkSymbol(FILE_SHADER_OUTPUT
, 0, i
->sType
, addr
);
1639 st
= bld
.mkStore(OP_EXPORT
, i
->dType
, sym
, i
->getIndirect(0, 0),
// Keep the per-patch flag of the original write.
1641 st
->perPatch
= i
->perPatch
;
1643 bld
.getBB()->remove(i
);
// Reads a tessellation coordinate into dst. The lane id is read with OP_RDSV
// and used as the last argument of the fetches from FILE_SHADER_OUTPUT at
// 0x2f0 (x) and 0x2f4 (y). The final two instructions compute
// dst = 1.0 - (x + y).
// NOTE(review): how c selects between x, y and the derived value is decided
// by control flow around these statements - confirm against the full body.
1648 NVC0LoweringPass::readTessCoord(LValue
*dst
, int c
)
1650 Value
*laneid
= bld
.getSSA();
1653 bld
.mkOp1(OP_RDSV
, TYPE_U32
, laneid
, bld
.mkSysVal(SV_LANEID
, 0));
1668 bld
.mkFetch(x
, TYPE_F32
, FILE_SHADER_OUTPUT
, 0x2f0, NULL
, laneid
);
1670 bld
.mkFetch(y
, TYPE_F32
, FILE_SHADER_OUTPUT
, 0x2f4, NULL
, laneid
);
// dst = x + y, then dst = 1.0 - dst.
1673 bld
.mkOp2(OP_ADD
, TYPE_F32
, dst
, x
, y
);
1674 bld
.mkOp2(OP_SUB
, TYPE_F32
, dst
, bld
.loadImm(NULL
, 1.0f
), dst
);
// Lowers a system value read (OP_RDSV). The target supplies an address for
// the system value; depending on that address and on the semantic, the read
// is turned into an immediate, an interpolation, a pixel load (OP_PIXLD), a
// constant buffer load or a shader input fetch. The last statement removes
// the original instruction from its basic block.
1679 NVC0LoweringPass::handleRDSV(Instruction
*i
)
1681 Symbol
*sym
= i
->getSrc(0)->asSym();
1682 const SVSemantic sv
= sym
->reg
.data
.sv
.sv
;
1685 uint32_t addr
= targ
->getSVAddress(FILE_SHADER_INPUT
, sym
);
// NOTE(review): addresses >= 0x400 appear to denote values read from special
// registers (cf. the "mov $sreg" remark further down) - confirm.
1687 if (addr
>= 0x400) {
1689 if (sym
->reg
.data
.sv
.index
== 3) {
1690 // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
// The 4th component is replaced by a constant: 1 for the size-like values
// (NTID, NCTAID), 0 for everything else.
1692 i
->setSrc(0, bld
.mkImm((sv
== SV_NTID
|| sv
== SV_NCTAID
) ? 1 : 0));
1694 if (sv
== SV_VERTEX_COUNT
) {
// Post-process the value after i with a bitfield extract.
// NOTE(review): 0x808 presumably encodes offset 8 / width 8 - confirm.
1695 bld
.setPosition(i
, true);
1696 bld
.mkOp2(OP_EXTBF
, TYPE_U32
, i
->getDef(0), i
->getDef(0), bld
.mkImm(0x808));
1703 assert(prog
->getType() == Program::TYPE_FRAGMENT
);
1704 if (i
->srcExists(1)) {
1705 // Pass offset through to the interpolation logic
1706 ld
= bld
.mkInterp(NV50_IR_INTERP_LINEAR
| NV50_IR_INTERP_OFFSET
,
1707 i
->getDef(0), addr
, NULL
);
1708 ld
->setSrc(1, i
->getSrc(1));
1710 bld
.mkInterp(NV50_IR_INTERP_LINEAR
, i
->getDef(0), addr
, NULL
);
1715 Value
*face
= i
->getDef(0);
1716 bld
.mkInterp(NV50_IR_INTERP_FLAT
, face
, addr
, NULL
);
// For a float result: OR in bit 0, negate as a signed integer, then convert
// to F32.
1717 if (i
->dType
== TYPE_F32
) {
1718 bld
.mkOp2(OP_OR
, TYPE_U32
, face
, face
, bld
.mkImm(0x00000001));
1719 bld
.mkOp1(OP_NEG
, TYPE_S32
, face
, face
);
1720 bld
.mkCvt(OP_CVT
, TYPE_F32
, face
, TYPE_S32
, face
);
1725 assert(prog
->getType() == Program::TYPE_TESSELLATION_EVAL
);
1726 readTessCoord(i
->getDef(0)->asLValue(), i
->getSrc(0)->reg
.data
.sv
.index
);
1731 assert(targ
->getChipset() >= NVISA_GK104_CHIPSET
); // mov $sreg otherwise
1732 if (sym
->reg
.data
.sv
.index
== 3) {
1734 i
->setSrc(0, bld
.mkImm(sv
== SV_GRIDID
? 0 : 1));
// Otherwise the value is loaded from the grid info area of the aux constant
// buffer.
1737 addr
+= prog
->driver
->prop
.cp
.gridInfoBase
;
1738 bld
.mkLoad(TYPE_U32
, i
->getDef(0),
1739 bld
.mkSymbol(FILE_MEMORY_CONST
, prog
->driver
->io
.auxCBSlot
,
1740 TYPE_U32
, addr
), NULL
);
1742 case SV_SAMPLE_INDEX
:
1743 // TODO: Properly pass source as an address in the PIX address space
1744 // (which can be of the form [r0+offset]). But this is currently
1746 ld
= bld
.mkOp1(OP_PIXLD
, TYPE_U32
, i
->getDef(0), bld
.mkImm(0));
1747 ld
->subOp
= NV50_IR_SUBOP_PIXLD_SAMPLEID
;
1749 case SV_SAMPLE_POS
: {
// Read the sample id, scale it by 8 and use it to index the sample info
// table in the aux constant buffer (4 bytes per component).
1750 Value
*off
= new_LValue(func
, FILE_GPR
);
1751 ld
= bld
.mkOp1(OP_PIXLD
, TYPE_U32
, i
->getDef(0), bld
.mkImm(0));
1752 ld
->subOp
= NV50_IR_SUBOP_PIXLD_SAMPLEID
;
1753 bld
.mkOp2(OP_SHL
, TYPE_U32
, off
, i
->getDef(0), bld
.mkImm(3));
1754 bld
.mkLoad(TYPE_F32
,
1757 FILE_MEMORY_CONST
, prog
->driver
->io
.auxCBSlot
,
1758 TYPE_U32
, prog
->driver
->io
.sampleInfoBase
+
1759 4 * sym
->reg
.data
.sv
.index
),
1763 case SV_SAMPLE_MASK
:
1764 ld
= bld
.mkOp1(OP_PIXLD
, TYPE_U32
, i
->getDef(0), bld
.mkImm(0));
1765 ld
->subOp
= NV50_IR_SUBOP_PIXLD_COVMASK
;
1768 case SV_BASEINSTANCE
:
// Draw parameters live in the draw info area of the aux constant buffer,
// 4 bytes each, indexed relative to SV_BASEVERTEX.
1770 ld
= bld
.mkLoad(TYPE_U32
, i
->getDef(0),
1771 bld
.mkSymbol(FILE_MEMORY_CONST
,
1772 prog
->driver
->io
.auxCBSlot
,
1774 prog
->driver
->io
.drawInfoBase
+
1775 4 * (sv
- SV_BASEVERTEX
)),
// Generic case: fetch from the shader inputs. Non-per-patch reads in a
// tessellation evaluation program additionally get a vertex base produced
// by OP_PFETCH.
1779 if (prog
->getType() == Program::TYPE_TESSELLATION_EVAL
&& !i
->perPatch
)
1780 vtx
= bld
.mkOp1v(OP_PFETCH
, TYPE_U32
, bld
.getSSA(), bld
.mkImm(0));
1781 ld
= bld
.mkFetch(i
->getDef(0), i
->dType
,
1782 FILE_SHADER_INPUT
, addr
, i
->getIndirect(0, 0), vtx
);
1783 ld
->perPatch
= i
->perPatch
;
1786 bld
.getBB()->remove(i
);
// Lowers a floating-point division: the reciprocal (OP_RCP) of source 1 is
// computed before i and installed as the new source 1. Non-float types are
// filtered out by the isFloatType() check.
// NOTE(review): presumably the opcode of i is switched to a multiplication
// as part of this lowering - confirm.
1791 NVC0LoweringPass::handleDIV(Instruction
*i
)
1793 if (!isFloatType(i
->dType
))
1795 bld
.setPosition(i
, false);
// The reciprocal gets a fresh SSA value sized for the destination type.
1796 Instruction
*rcp
= bld
.mkOp1(OP_RCP
, i
->dType
, bld
.getSSA(typeSizeof(i
->dType
)), i
->getSrc(1));
1798 i
->setSrc(1, rcp
->getDef(0));
// Lowers a floating-point modulo. With a = source 0 and b = source 1, the
// sequence below computes value = b * trunc(a * rcp(b)) and installs it as
// the new source 1. Non-float types are filtered out by the isFloatType()
// check.
// NOTE(review): presumably i then becomes a subtraction (a - value) -
// confirm.
1803 NVC0LoweringPass::handleMOD(Instruction
*i
)
1805 if (!isFloatType(i
->dType
))
1807 LValue
*value
= bld
.getScratch(typeSizeof(i
->dType
));
// value = 1 / b
1808 bld
.mkOp1(OP_RCP
, i
->dType
, value
, i
->getSrc(1));
// value = a / b
1809 bld
.mkOp2(OP_MUL
, i
->dType
, value
, i
->getSrc(0), value
);
// value = trunc(a / b)
1810 bld
.mkOp1(OP_TRUNC
, i
->dType
, value
, value
);
// value = b * trunc(a / b)
1811 bld
.mkOp2(OP_MUL
, i
->dType
, value
, i
->getSrc(1), value
);
1813 i
->setSrc(1, value
);
// Lowers a square root.
//  - TYPE_F64: computes dst = rsq(x), sets a predicate for x <= 0 and
//    selects between zero and dst with OP_SELP (guarding non-positive
//    inputs).
//  - Other types: inserts an OP_RCP after i that takes the reciprocal of i's
//    result in place.
// NOTE(review): presumably i's opcode is changed to complete the
// sqrt(x) = x * rsq(x) resp. sqrt(x) = rcp(rsq(x)) identities - confirm.
1818 NVC0LoweringPass::handleSQRT(Instruction
*i
)
1820 if (i
->dType
== TYPE_F64
) {
1821 Value
*pred
= bld
.getSSA(1, FILE_PREDICATE
);
1822 Value
*zero
= bld
.loadImm(NULL
, 0.0);
1823 Value
*dst
= bld
.getSSA(8);
1824 bld
.mkOp1(OP_RSQ
, i
->dType
, dst
, i
->getSrc(0));
// pred = (x <= 0)
1825 bld
.mkCmp(OP_SET
, CC_LE
, i
->dType
, pred
, i
->dType
, i
->getSrc(0), zero
);
1826 bld
.mkOp3(OP_SELP
, TYPE_U64
, dst
, zero
, dst
, pred
);
1829 // TODO: Handle this properly with a library function
// Emit after i, because the RCP consumes the result of i.
1831 bld
.setPosition(i
, true);
1833 bld
.mkOp1(OP_RCP
, i
->dType
, i
->getDef(0), i
->getDef(0));
// Lowers pow(x, y) using val = preex2(y * lg2(x)) on a scratch value. The
// multiplication has its dnz flag set.
// NOTE(review): presumably i itself is then turned into the final EX2 of
// val - confirm.
1840 NVC0LoweringPass::handlePOW(Instruction
*i
)
1842 LValue
*val
= bld
.getScratch();
// val = log2(x)
1844 bld
.mkOp1(OP_LG2
, TYPE_F32
, val
, i
->getSrc(0));
// val = y * log2(x)
1845 bld
.mkOp2(OP_MUL
, TYPE_F32
, val
, i
->getSrc(1), val
)->dnz
= 1;
1846 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, val
, val
);
// Lowers an export.
//  - Fragment programs: the export is rewritten with subOp MOV_FINAL; the
//    exported value (source 1) becomes source 0 and the destination is a new
//    GPR whose id is the output offset divided by 4. prog->maxGPR is raised
//    to cover that id.
//  - Geometry programs: gpEmitAddress is attached as indirect 1 of source 0.
1856 NVC0LoweringPass::handleEXPORT(Instruction
*i
)
1858 if (prog
->getType() == Program::TYPE_FRAGMENT
) {
// Outputs are 4 bytes wide, so offset / 4 is the register index.
1859 int id
= i
->getSrc(0)->reg
.data
.offset
/ 4;
1861 if (i
->src(0).isIndirect(0)) // TODO, ugly
1864 i
->subOp
= NV50_IR_SUBOP_MOV_FINAL
;
1865 i
->src(0).set(i
->src(1));
1867 i
->setDef(0, new_LValue(func
, FILE_GPR
));
1868 i
->getDef(0)->reg
.data
.id
= id
;
1870 prog
->maxGPR
= MAX2(prog
->maxGPR
, id
);
1872 if (prog
->getType() == Program::TYPE_GEOMETRY
) {
1873 i
->setIndirect(0, 1, gpEmitAddress
);
// Lowers geometry shader output instructions.
// An OP_RESTART that directly follows an OP_EMIT on the same stream is folded
// into that EMIT (subOp EMIT_RESTART) and deleted. In the remaining case the
// instruction is rewritten to both read and define gpEmitAddress, with the
// original source 0 moved to source 1.
1879 NVC0LoweringPass::handleOUT(Instruction
*i
)
1881 Instruction
*prev
= i
->prev
;
1882 ImmediateValue stream
, prevStream
;
1884 // Only merge if the stream ids match. Also, note that the previous
1885 // instruction would have already been lowered, so we take arg1 from it.
1886 if (i
->op
== OP_RESTART
&& prev
&& prev
->op
== OP_EMIT
&&
1887 i
->src(0).getImmediate(stream
) &&
1888 prev
->src(1).getImmediate(prevStream
) &&
1889 stream
.reg
.data
.u32
== prevStream
.reg
.data
.u32
) {
1890 i
->prev
->subOp
= NV50_IR_SUBOP_EMIT_RESTART
;
1891 delete_Instruction(prog
, i
);
1893 assert(gpEmitAddress
);
1894 i
->setDef(0, gpEmitAddress
);
// Source 0 must be copied to slot 1 before slot 0 is overwritten.
1895 i
->setSrc(1, i
->getSrc(0));
1896 i
->setSrc(0, gpEmitAddress
);
1901 // Generate a binary predicate if an instruction is predicated by
1902 // e.g. an f32 value.
// Nothing is generated when insn has no predicate or when the predicate
// already lives in FILE_PREDICATE. Otherwise a new predicate is defined by
// comparing the old value against 0 (CC_NEU) and installed on insn with the
// original condition code.
1904 NVC0LoweringPass::checkPredicate(Instruction
*insn
)
1906 Value
*pred
= insn
->getPredicate();
1909 if (!pred
|| pred
->reg
.file
== FILE_PREDICATE
)
1911 pdst
= new_LValue(func
, FILE_PREDICATE
);
1913 // CAUTION: don't use pdst->getInsn, the definition might not be unique,
1914 // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
1916 bld
.mkCmp(OP_SET
, CC_NEU
, insn
->dType
, pdst
, insn
->dType
, bld
.mkImm(0), pred
);
1918 insn
->setPredicate(insn
->cc
, pdst
);
1922 // - add quadop dance for texturing
1923 // - put FP outputs in GPRs
1924 // - convert instruction sequences
// Per-instruction entry point of the lowering pass. The builder is positioned
// before i, then i is dispatched to the matching handle*() routine; loads,
// atomics and surface operations are lowered in the inline sections below.
1927 NVC0LoweringPass::visit(Instruction
*i
)
1930 bld
.setPosition(i
, false);
1932 if (i
->cc
!= CC_ALWAYS
)
1941 return handleTEX(i
->asTex());
1943 return handleTXD(i
->asTex());
1945 return handleTXLQ(i
->asTex());
1947 return handleTXQ(i
->asTex());
// A PREEX2 of the source is emitted first; i then consumes its result.
1949 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, i
->getDef(0), i
->getSrc(0));
1950 i
->setSrc(0, i
->getDef(0));
1953 return handlePOW(i
);
1955 return handleDIV(i
);
1957 return handleMOD(i
);
1959 return handleSQRT(i
);
1961 ret
= handleEXPORT(i
);
1965 return handleOUT(i
);
1967 return handleRDSV(i
);
1969 return handleWRSV(i
);
1972 if (i
->src(0).getFile() == FILE_SHADER_INPUT
) {
// Compute programs read their inputs from constant buffer 0.
1973 if (prog
->getType() == Program::TYPE_COMPUTE
) {
1974 i
->getSrc(0)->reg
.file
= FILE_MEMORY_CONST
;
1975 i
->getSrc(0)->reg
.fileIndex
= 0;
1977 if (prog
->getType() == Program::TYPE_GEOMETRY
&&
1978 i
->src(0).isIndirect(0)) {
1979 // XXX: this assumes vec4 units
// Scale the indirect index by 16 (<< 4).
1980 Value
*ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(),
1981 i
->getIndirect(0, 0), bld
.mkImm(4));
1982 i
->setIndirect(0, 0, ptr
);
1986 assert(prog
->getType() != Program::TYPE_FRAGMENT
); // INTERP
1988 } else if (i
->src(0).getFile() == FILE_MEMORY_CONST
) {
// An indirect constant buffer index (indirect 1) is folded into a single
// pointer: shifted to bit 16 and, if an offset indirect exists as well,
// combined with it through INSBF. The load is then marked LDC_IS.
1989 if (i
->src(0).isIndirect(1)) {
1991 if (i
->src(0).isIndirect(0))
1992 ptr
= bld
.mkOp3v(OP_INSBF
, TYPE_U32
, bld
.getSSA(),
1993 i
->getIndirect(0, 1), bld
.mkImm(0x1010),
1994 i
->getIndirect(0, 0));
1996 ptr
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(),
1997 i
->getIndirect(0, 1), bld
.mkImm(16));
1998 i
->setIndirect(0, 1, NULL
);
1999 i
->setIndirect(0, 0, ptr
);
2000 i
->subOp
= NV50_IR_SUBOP_LDC_IS
;
2002 } else if (i
->src(0).getFile() == FILE_SHADER_OUTPUT
) {
2003 assert(prog
->getType() == Program::TYPE_TESSELLATION_CONTROL
);
2005 } else if (i
->src(0).getFile() == FILE_MEMORY_GLOBAL
) {
// Global accesses are bounds-checked: the 64-bit base and the 32-bit length
// of the buffer are loaded from the resource info, and the access is
// predicated on (offset + access size) not exceeding the length.
2006 Value
*ind
= i
->getIndirect(0, 1);
2007 Value
*ptr
= loadResInfo64(ind
, i
->getSrc(0)->reg
.fileIndex
* 16);
2008 // XXX come up with a way not to do this for EVERY little access but
2009 // rather to batch these up somehow. Unfortunately we've lost the
2010 // information about the field width by the time we get here.
// offset is the end of the accessed range: constant offset plus access size.
2011 Value
*offset
= bld
.loadImm(NULL
, i
->getSrc(0)->reg
.data
.offset
+ typeSizeof(i
->sType
));
2012 Value
*length
= loadResLength32(ind
, i
->getSrc(0)->reg
.fileIndex
* 16);
2013 Value
*pred
= new_LValue(func
, FILE_PREDICATE
);
2014 if (i
->src(0).isIndirect(0)) {
2015 bld
.mkOp2(OP_ADD
, TYPE_U64
, ptr
, ptr
, i
->getIndirect(0, 0));
2016 bld
.mkOp2(OP_ADD
, TYPE_U32
, offset
, offset
, i
->getIndirect(0, 0));
2018 i
->setIndirect(0, 1, NULL
);
2019 i
->setIndirect(0, 0, ptr
);
// pred = (offset > length), i.e. out of bounds; i runs only when it is false.
2020 bld
.mkCmp(OP_SET
, CC_GT
, TYPE_U32
, pred
, TYPE_U32
, offset
, length
);
2021 i
->setPredicate(CC_NOT_P
, pred
);
2022 if (i
->defExists(0)) {
// Emit a move of 0 into the destination of i.
// NOTE(review): presumably predicated so that only out-of-bounds reads
// yield 0 - confirm.
2023 bld
.mkMov(i
->getDef(0), bld
.mkImm(0));
2029 const bool cctl
= i
->src(0).getFile() == FILE_MEMORY_GLOBAL
;
2031 handleCasExch(i
, cctl
);
2040 if (targ
->getChipset() >= NVISA_GK104_CHIPSET
)
2041 handleSurfaceOpNVE4(i
->asTex());
2050 /* Kepler+ has a special opcode to compute a new base address to be used
2051 * for indirect loads.
2053 if (targ
->getChipset() >= NVISA_GK104_CHIPSET
&& !i
->perPatch
&&
2054 (i
->op
== OP_VFETCH
|| i
->op
== OP_EXPORT
) && i
->src(0).isIndirect(0)) {
// AFETCH takes over the symbol (with its offset) and the indirect address;
// i then uses the AFETCH result as its only address, with offset 0.
2055 Instruction
*afetch
= bld
.mkOp1(OP_AFETCH
, TYPE_U32
, bld
.getSSA(),
2056 cloneShallow(func
, i
->getSrc(0)));
2057 afetch
->setIndirect(0, 0, i
->getIndirect(0, 0));
2058 i
->src(0).get()->reg
.data
.offset
= 0;
2059 i
->setIndirect(0, 0, afetch
->getDef(0));
// Runs the legalization pass that belongs to the given code generation stage:
//  - CG_STAGE_PRE_SSA  -> NVC0LoweringPass
//  - CG_STAGE_POST_RA  -> NVC0LegalizePostRA
//  - CG_STAGE_SSA      -> NVC0LegalizeSSA
// Each pass is invoked as run(prog, false, true) and its result is returned.
2066 TargetNVC0::runLegalizePass(Program
*prog
, CGStage stage
) const
2068 if (stage
== CG_STAGE_PRE_SSA
) {
2069 NVC0LoweringPass
pass(prog
);
2070 return pass
.run(prog
, false, true);
2072 if (stage
== CG_STAGE_POST_RA
) {
2073 NVC0LegalizePostRA
pass(prog
);
2074 return pass
.run(prog
, false, true);
2076 if (stage
== CG_STAGE_SSA
) {
2077 NVC0LegalizeSSA pass
;
2078 return pass
.run(prog
, false, true);
2083 } // namespace nv50_ir