2 * Copyright 2011 Christoph Bumiller
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_build_util.h"
26 #include "codegen/nv50_ir_target_nvc0.h"
38 #define QUADOP(q, r, s, t) \
39 ((QOP_##q << 6) | (QOP_##r << 4) | \
40 (QOP_##s << 2) | (QOP_##t << 0))
42 class NVC0LegalizeSSA
: public Pass
45 virtual bool visit(BasicBlock
*);
46 virtual bool visit(Function
*);
48 // we want to insert calls to the builtin library only after optimization
49 void handleDIV(Instruction
*); // integer division, modulus
50 void handleRCPRSQ(Instruction
*); // double precision float recip/rsqrt
57 NVC0LegalizeSSA::handleDIV(Instruction
*i
)
59 FlowInstruction
*call
;
63 bld
.setPosition(i
, false);
64 def
[0] = bld
.mkMovToReg(0, i
->getSrc(0))->getDef(0);
65 def
[1] = bld
.mkMovToReg(1, i
->getSrc(1))->getDef(0);
67 case TYPE_U32
: builtin
= NVC0_BUILTIN_DIV_U32
; break;
68 case TYPE_S32
: builtin
= NVC0_BUILTIN_DIV_S32
; break;
72 call
= bld
.mkFlow(OP_CALL
, NULL
, CC_ALWAYS
, NULL
);
73 bld
.mkMov(i
->getDef(0), def
[(i
->op
== OP_DIV
) ? 0 : 1]);
74 bld
.mkClobber(FILE_GPR
, (i
->op
== OP_DIV
) ? 0xe : 0xd, 2);
75 bld
.mkClobber(FILE_PREDICATE
, (i
->dType
== TYPE_S32
) ? 0xf : 0x3, 0);
78 call
->absolute
= call
->builtin
= 1;
79 call
->target
.builtin
= builtin
;
80 delete_Instruction(prog
, i
);
84 NVC0LegalizeSSA::handleRCPRSQ(Instruction
*i
)
90 NVC0LegalizeSSA::visit(Function
*fn
)
92 bld
.setProgram(fn
->getProgram());
97 NVC0LegalizeSSA::visit(BasicBlock
*bb
)
100 for (Instruction
*i
= bb
->getEntry(); i
; i
= next
) {
102 if (i
->dType
== TYPE_F32
)
111 if (i
->dType
== TYPE_F64
)
121 class NVC0LegalizePostRA
: public Pass
124 NVC0LegalizePostRA(const Program
*);
127 virtual bool visit(Function
*);
128 virtual bool visit(BasicBlock
*);
130 void replaceZero(Instruction
*);
131 bool tryReplaceContWithBra(BasicBlock
*);
132 void propagateJoin(BasicBlock
*);
136 TexUse(Instruction
*use
, const Instruction
*tex
)
137 : insn(use
), tex(tex
), level(-1) { }
139 const Instruction
*tex
; // or split / mov
145 Limits(int min
, int max
) : min(min
), max(max
) { }
148 bool insertTextureBarriers(Function
*);
149 inline bool insnDominatedBy(const Instruction
*, const Instruction
*) const;
150 void findFirstUses(const Instruction
*tex
, const Instruction
*def
,
152 void findOverwritingDefs(const Instruction
*tex
, Instruction
*insn
,
153 const BasicBlock
*term
,
155 void addTexUse(std::list
<TexUse
>&, Instruction
*, const Instruction
*);
156 const Instruction
*recurseDef(const Instruction
*);
161 const bool needTexBar
;
164 NVC0LegalizePostRA::NVC0LegalizePostRA(const Program
*prog
)
167 needTexBar(prog
->getTarget()->getChipset() >= 0xe0)
172 NVC0LegalizePostRA::insnDominatedBy(const Instruction
*later
,
173 const Instruction
*early
) const
175 if (early
->bb
== later
->bb
)
176 return early
->serial
< later
->serial
;
177 return later
->bb
->dominatedBy(early
->bb
);
181 NVC0LegalizePostRA::addTexUse(std::list
<TexUse
> &uses
,
182 Instruction
*usei
, const Instruction
*insn
)
185 for (std::list
<TexUse
>::iterator it
= uses
.begin();
187 if (insnDominatedBy(usei
, it
->insn
)) {
191 if (insnDominatedBy(it
->insn
, usei
))
197 uses
.push_back(TexUse(usei
, insn
));
201 NVC0LegalizePostRA::findOverwritingDefs(const Instruction
*texi
,
203 const BasicBlock
*term
,
204 std::list
<TexUse
> &uses
)
206 while (insn
->op
== OP_MOV
&& insn
->getDef(0)->equals(insn
->getSrc(0)))
207 insn
= insn
->getSrc(0)->getUniqueInsn();
209 if (!insn
|| !insn
->bb
->reachableBy(texi
->bb
, term
))
213 /* Values not connected to the tex's definition through any of these should
214 * not be conflicting.
221 for (int s
= 0; insn
->srcExists(s
); ++s
)
222 findOverwritingDefs(texi
, insn
->getSrc(s
)->getUniqueInsn(), term
,
226 // if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered ?
227 addTexUse(uses
, insn
, texi
);
233 NVC0LegalizePostRA::findFirstUses(const Instruction
*texi
,
234 const Instruction
*insn
,
235 std::list
<TexUse
> &uses
)
237 for (int d
= 0; insn
->defExists(d
); ++d
) {
238 Value
*v
= insn
->getDef(d
);
239 for (Value::UseIterator u
= v
->uses
.begin(); u
!= v
->uses
.end(); ++u
) {
240 Instruction
*usei
= (*u
)->getInsn();
242 if (usei
->op
== OP_PHI
|| usei
->op
== OP_UNION
) {
243 // need a barrier before WAW cases
244 for (int s
= 0; usei
->srcExists(s
); ++s
) {
245 Instruction
*defi
= usei
->getSrc(s
)->getUniqueInsn();
246 if (defi
&& &usei
->src(s
) != *u
)
247 findOverwritingDefs(texi
, defi
, usei
->bb
, uses
);
251 if (usei
->op
== OP_SPLIT
||
252 usei
->op
== OP_MERGE
||
253 usei
->op
== OP_PHI
||
254 usei
->op
== OP_UNION
) {
255 // these uses don't manifest in the machine code
256 findFirstUses(texi
, usei
, uses
);
258 if (usei
->op
== OP_MOV
&& usei
->getDef(0)->equals(usei
->getSrc(0)) &&
259 usei
->subOp
!= NV50_IR_SUBOP_MOV_FINAL
) {
260 findFirstUses(texi
, usei
, uses
);
262 addTexUse(uses
, usei
, insn
);
269 // This pass is a bit long and ugly and can probably be optimized.
271 // 1. obtain a list of TEXes and their outputs' first use(s)
272 // 2. calculate the barrier level of each first use (minimal number of TEXes,
273 // over all paths, between the TEX and the use in question)
274 // 3. for each barrier, if all paths from the source TEX to that barrier
275 // contain a barrier of lesser level, it can be culled
277 NVC0LegalizePostRA::insertTextureBarriers(Function
*fn
)
279 std::list
<TexUse
> *uses
;
280 std::vector
<Instruction
*> texes
;
281 std::vector
<int> bbFirstTex
;
282 std::vector
<int> bbFirstUse
;
283 std::vector
<int> texCounts
;
284 std::vector
<TexUse
> useVec
;
287 fn
->orderInstructions(insns
);
289 texCounts
.resize(fn
->allBBlocks
.getSize(), 0);
290 bbFirstTex
.resize(fn
->allBBlocks
.getSize(), insns
.getSize());
291 bbFirstUse
.resize(fn
->allBBlocks
.getSize(), insns
.getSize());
293 // tag BB CFG nodes by their id for later
294 for (ArrayList::Iterator i
= fn
->allBBlocks
.iterator(); !i
.end(); i
.next()) {
295 BasicBlock
*bb
= reinterpret_cast<BasicBlock
*>(i
.get());
297 bb
->cfg
.tag
= bb
->getId();
300 // gather the first uses for each TEX
301 for (int i
= 0; i
< insns
.getSize(); ++i
) {
302 Instruction
*tex
= reinterpret_cast<Instruction
*>(insns
.get(i
));
303 if (isTextureOp(tex
->op
)) {
304 texes
.push_back(tex
);
305 if (!texCounts
.at(tex
->bb
->getId()))
306 bbFirstTex
[tex
->bb
->getId()] = texes
.size() - 1;
307 texCounts
[tex
->bb
->getId()]++;
313 uses
= new std::list
<TexUse
>[texes
.size()];
316 for (size_t i
= 0; i
< texes
.size(); ++i
)
317 findFirstUses(texes
[i
], texes
[i
], uses
[i
]);
319 // determine the barrier level at each use
320 for (size_t i
= 0; i
< texes
.size(); ++i
) {
321 for (std::list
<TexUse
>::iterator u
= uses
[i
].begin(); u
!= uses
[i
].end();
323 BasicBlock
*tb
= texes
[i
]->bb
;
324 BasicBlock
*ub
= u
->insn
->bb
;
327 for (size_t j
= i
+ 1; j
< texes
.size() &&
328 texes
[j
]->bb
== tb
&& texes
[j
]->serial
< u
->insn
->serial
;
332 u
->level
= fn
->cfg
.findLightestPathWeight(&tb
->cfg
,
333 &ub
->cfg
, texCounts
);
335 WARN("Failed to find path TEX -> TEXBAR\n");
339 // this counted all TEXes in the origin block, correct that
340 u
->level
-= i
- bbFirstTex
.at(tb
->getId()) + 1 /* this TEX */;
341 // and did not count the TEXes in the destination block, add those
342 for (size_t j
= bbFirstTex
.at(ub
->getId()); j
< texes
.size() &&
343 texes
[j
]->bb
== ub
&& texes
[j
]->serial
< u
->insn
->serial
;
347 assert(u
->level
>= 0);
348 useVec
.push_back(*u
);
354 // insert the barriers
355 for (size_t i
= 0; i
< useVec
.size(); ++i
) {
356 Instruction
*prev
= useVec
[i
].insn
->prev
;
357 if (useVec
[i
].level
< 0)
359 if (prev
&& prev
->op
== OP_TEXBAR
) {
360 if (prev
->subOp
> useVec
[i
].level
)
361 prev
->subOp
= useVec
[i
].level
;
362 prev
->setSrc(prev
->srcCount(), useVec
[i
].tex
->getDef(0));
364 Instruction
*bar
= new_Instruction(func
, OP_TEXBAR
, TYPE_NONE
);
366 bar
->subOp
= useVec
[i
].level
;
367 // make use explicit to ease latency calculation
368 bar
->setSrc(bar
->srcCount(), useVec
[i
].tex
->getDef(0));
369 useVec
[i
].insn
->bb
->insertBefore(useVec
[i
].insn
, bar
);
373 if (fn
->getProgram()->optLevel
< 3) {
379 std::vector
<Limits
> limitT
, limitB
, limitS
; // entry, exit, single
381 limitT
.resize(fn
->allBBlocks
.getSize(), Limits(0, 0));
382 limitB
.resize(fn
->allBBlocks
.getSize(), Limits(0, 0));
383 limitS
.resize(fn
->allBBlocks
.getSize());
385 // cull unneeded barriers (should do that earlier, but for simplicity)
386 IteratorRef bi
= fn
->cfg
.iteratorCFG();
387 // first calculate min/max outstanding TEXes for each BB
388 for (bi
->reset(); !bi
->end(); bi
->next()) {
389 Graph::Node
*n
= reinterpret_cast<Graph::Node
*>(bi
->get());
390 BasicBlock
*bb
= BasicBlock::get(n
);
392 int max
= std::numeric_limits
<int>::max();
393 for (Instruction
*i
= bb
->getFirst(); i
; i
= i
->next
) {
394 if (isTextureOp(i
->op
)) {
396 if (max
< std::numeric_limits
<int>::max())
399 if (i
->op
== OP_TEXBAR
) {
400 min
= MIN2(min
, i
->subOp
);
401 max
= MIN2(max
, i
->subOp
);
404 // limits when looking at an isolated block
405 limitS
[bb
->getId()].min
= min
;
406 limitS
[bb
->getId()].max
= max
;
408 // propagate the min/max values
409 for (unsigned int l
= 0; l
<= fn
->loopNestingBound
; ++l
) {
410 for (bi
->reset(); !bi
->end(); bi
->next()) {
411 Graph::Node
*n
= reinterpret_cast<Graph::Node
*>(bi
->get());
412 BasicBlock
*bb
= BasicBlock::get(n
);
413 const int bbId
= bb
->getId();
414 for (Graph::EdgeIterator ei
= n
->incident(); !ei
.end(); ei
.next()) {
415 BasicBlock
*in
= BasicBlock::get(ei
.getNode());
416 const int inId
= in
->getId();
417 limitT
[bbId
].min
= MAX2(limitT
[bbId
].min
, limitB
[inId
].min
);
418 limitT
[bbId
].max
= MAX2(limitT
[bbId
].max
, limitB
[inId
].max
);
420 // I just hope this is correct ...
421 if (limitS
[bbId
].max
== std::numeric_limits
<int>::max()) {
423 limitB
[bbId
].min
= limitT
[bbId
].min
+ limitS
[bbId
].min
;
424 limitB
[bbId
].max
= limitT
[bbId
].max
+ limitS
[bbId
].min
;
426 // block contained a barrier
427 limitB
[bbId
].min
= MIN2(limitS
[bbId
].max
,
428 limitT
[bbId
].min
+ limitS
[bbId
].min
);
429 limitB
[bbId
].max
= MIN2(limitS
[bbId
].max
,
430 limitT
[bbId
].max
+ limitS
[bbId
].min
);
434 // finally delete unnecessary barriers
435 for (bi
->reset(); !bi
->end(); bi
->next()) {
436 Graph::Node
*n
= reinterpret_cast<Graph::Node
*>(bi
->get());
437 BasicBlock
*bb
= BasicBlock::get(n
);
438 Instruction
*prev
= NULL
;
440 int max
= limitT
[bb
->getId()].max
;
441 for (Instruction
*i
= bb
->getFirst(); i
; i
= next
) {
443 if (i
->op
== OP_TEXBAR
) {
444 if (i
->subOp
>= max
) {
445 delete_Instruction(prog
, i
);
449 if (prev
&& prev
->op
== OP_TEXBAR
&& prev
->subOp
>= max
) {
450 delete_Instruction(prog
, prev
);
455 if (isTextureOp(i
->op
)) {
458 if (i
&& !i
->isNop())
468 NVC0LegalizePostRA::visit(Function
*fn
)
471 insertTextureBarriers(fn
);
473 rZero
= new_LValue(fn
, FILE_GPR
);
474 carry
= new_LValue(fn
, FILE_FLAGS
);
476 rZero
->reg
.data
.id
= prog
->getTarget()->getFileSize(FILE_GPR
);
477 carry
->reg
.data
.id
= 0;
483 NVC0LegalizePostRA::replaceZero(Instruction
*i
)
485 for (int s
= 0; i
->srcExists(s
); ++s
) {
486 if (s
== 2 && i
->op
== OP_SUCLAMP
)
488 ImmediateValue
*imm
= i
->getSrc(s
)->asImm();
489 if (imm
&& imm
->reg
.data
.u64
== 0)
494 // replace CONT with BRA for single unconditional continue
496 NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock
*bb
)
498 if (bb
->cfg
.incidentCount() != 2 || bb
->getEntry()->op
!= OP_PRECONT
)
500 Graph::EdgeIterator ei
= bb
->cfg
.incident();
501 if (ei
.getType() != Graph::Edge::BACK
)
503 if (ei
.getType() != Graph::Edge::BACK
)
505 BasicBlock
*contBB
= BasicBlock::get(ei
.getNode());
507 if (!contBB
->getExit() || contBB
->getExit()->op
!= OP_CONT
||
508 contBB
->getExit()->getPredicate())
510 contBB
->getExit()->op
= OP_BRA
;
511 bb
->remove(bb
->getEntry()); // delete PRECONT
514 assert(ei
.end() || ei
.getType() != Graph::Edge::BACK
);
518 // replace branches to join blocks with join ops
520 NVC0LegalizePostRA::propagateJoin(BasicBlock
*bb
)
522 if (bb
->getEntry()->op
!= OP_JOIN
|| bb
->getEntry()->asFlow()->limit
)
524 for (Graph::EdgeIterator ei
= bb
->cfg
.incident(); !ei
.end(); ei
.next()) {
525 BasicBlock
*in
= BasicBlock::get(ei
.getNode());
526 Instruction
*exit
= in
->getExit();
528 in
->insertTail(new FlowInstruction(func
, OP_JOIN
, bb
));
529 // there should always be a terminator instruction
530 WARN("inserted missing terminator in BB:%i\n", in
->getId());
532 if (exit
->op
== OP_BRA
) {
534 exit
->asFlow()->limit
= 1; // must-not-propagate marker
537 bb
->remove(bb
->getEntry());
541 NVC0LegalizePostRA::visit(BasicBlock
*bb
)
543 Instruction
*i
, *next
;
545 // remove pseudo operations and non-fixed no-ops, split 64 bit operations
546 for (i
= bb
->getFirst(); i
; i
= next
) {
548 if (i
->op
== OP_EMIT
|| i
->op
== OP_RESTART
) {
549 if (!i
->getDef(0)->refCount())
551 if (i
->src(0).getFile() == FILE_IMMEDIATE
)
552 i
->setSrc(0, rZero
); // initial value must be 0
557 // TODO: Move this to before register allocation for operations that
558 // need the $c register !
559 if (typeSizeof(i
->dType
) == 8) {
561 hi
= BuildUtil::split64BitOpPostRA(func
, i
, rZero
, carry
);
566 if (i
->op
!= OP_MOV
&& i
->op
!= OP_PFETCH
)
573 if (!tryReplaceContWithBra(bb
))
579 class NVC0LoweringPass
: public Pass
582 NVC0LoweringPass(Program
*);
585 virtual bool visit(Function
*);
586 virtual bool visit(BasicBlock
*);
587 virtual bool visit(Instruction
*);
589 bool handleRDSV(Instruction
*);
590 bool handleWRSV(Instruction
*);
591 bool handleEXPORT(Instruction
*);
592 bool handleOUT(Instruction
*);
593 bool handleDIV(Instruction
*);
594 bool handleMOD(Instruction
*);
595 bool handleSQRT(Instruction
*);
596 bool handlePOW(Instruction
*);
597 bool handleTEX(TexInstruction
*);
598 bool handleTXD(TexInstruction
*);
599 bool handleTXQ(TexInstruction
*);
600 bool handleManualTXD(TexInstruction
*);
601 bool handleATOM(Instruction
*);
602 bool handleCasExch(Instruction
*, bool needCctl
);
603 void handleSurfaceOpNVE4(TexInstruction
*);
605 void checkPredicate(Instruction
*);
607 void readTessCoord(LValue
*dst
, int c
);
609 Value
*loadResInfo32(Value
*ptr
, uint32_t off
);
610 Value
*loadMsInfo32(Value
*ptr
, uint32_t off
);
611 Value
*loadTexHandle(Value
*ptr
, unsigned int slot
);
613 void adjustCoordinatesMS(TexInstruction
*);
614 void processSurfaceCoordsNVE4(TexInstruction
*);
617 const Target
*const targ
;
622 LValue
*gpEmitAddress
;
625 NVC0LoweringPass::NVC0LoweringPass(Program
*prog
) : targ(prog
->getTarget())
627 bld
.setProgram(prog
);
632 NVC0LoweringPass::visit(Function
*fn
)
634 if (prog
->getType() == Program::TYPE_GEOMETRY
) {
635 assert(!strncmp(fn
->getName(), "MAIN", 4));
636 // TODO: when we generate actual functions pass this value along somehow
637 bld
.setPosition(BasicBlock::get(fn
->cfg
.getRoot()), false);
638 gpEmitAddress
= bld
.loadImm(NULL
, 0)->asLValue();
640 bld
.setPosition(BasicBlock::get(fn
->cfgExit
)->getExit(), false);
641 bld
.mkMovToReg(0, gpEmitAddress
);
648 NVC0LoweringPass::visit(BasicBlock
*bb
)
654 NVC0LoweringPass::loadTexHandle(Value
*ptr
, unsigned int slot
)
656 uint8_t b
= prog
->driver
->io
.resInfoCBSlot
;
657 uint32_t off
= prog
->driver
->io
.texBindBase
+ slot
* 4;
659 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U32
, off
), ptr
);
662 // move array source to first slot, convert to u16, add indirections
664 NVC0LoweringPass::handleTEX(TexInstruction
*i
)
666 const int dim
= i
->tex
.target
.getDim() + i
->tex
.target
.isCube();
667 const int arg
= i
->tex
.target
.getArgCount();
668 const int lyr
= arg
- (i
->tex
.target
.isMS() ? 2 : 1);
670 if (prog
->getTarget()->getChipset() >= NVISA_GK104_CHIPSET
) {
671 if (i
->tex
.rIndirectSrc
>= 0 || i
->tex
.sIndirectSrc
>= 0) {
672 WARN("indirect TEX not implemented\n");
674 if (i
->tex
.r
== i
->tex
.s
) {
675 i
->tex
.r
+= prog
->driver
->io
.texBindBase
/ 4;
676 i
->tex
.s
= 0; // only a single cX[] value possible here
678 Value
*hnd
= bld
.getScratch();
679 Value
*rHnd
= loadTexHandle(NULL
, i
->tex
.r
);
680 Value
*sHnd
= loadTexHandle(NULL
, i
->tex
.s
);
682 bld
.mkOp3(OP_INSBF
, TYPE_U32
, hnd
, rHnd
, bld
.mkImm(0x1400), sHnd
);
684 i
->tex
.r
= 0; // not used for indirect tex
686 i
->setIndirectR(hnd
);
688 if (i
->tex
.target
.isArray()) {
689 LValue
*layer
= new_LValue(func
, FILE_GPR
);
690 Value
*src
= i
->getSrc(lyr
);
691 const int sat
= (i
->op
== OP_TXF
) ? 1 : 0;
692 DataType sTy
= (i
->op
== OP_TXF
) ? TYPE_U32
: TYPE_F32
;
693 bld
.mkCvt(OP_CVT
, TYPE_U16
, layer
, sTy
, src
)->saturate
= sat
;
694 for (int s
= dim
; s
>= 1; --s
)
695 i
->setSrc(s
, i
->getSrc(s
- 1));
699 // (nvc0) generate and move the tsc/tic/array source to the front
700 if (dim
!= arg
|| i
->tex
.rIndirectSrc
>= 0 || i
->tex
.sIndirectSrc
>= 0) {
701 LValue
*src
= new_LValue(func
, FILE_GPR
); // 0xttxsaaaa
703 Value
*arrayIndex
= i
->tex
.target
.isArray() ? i
->getSrc(lyr
) : NULL
;
704 for (int s
= dim
; s
>= 1; --s
)
705 i
->setSrc(s
, i
->getSrc(s
- 1));
706 i
->setSrc(0, arrayIndex
);
708 Value
*ticRel
= i
->getIndirectR();
709 Value
*tscRel
= i
->getIndirectS();
712 int sat
= (i
->op
== OP_TXF
) ? 1 : 0;
713 DataType sTy
= (i
->op
== OP_TXF
) ? TYPE_U32
: TYPE_F32
;
714 bld
.mkCvt(OP_CVT
, TYPE_U16
, src
, sTy
, arrayIndex
)->saturate
= sat
;
720 i
->setSrc(i
->tex
.rIndirectSrc
, NULL
);
721 bld
.mkOp3(OP_INSBF
, TYPE_U32
, src
, ticRel
, bld
.mkImm(0x0917), src
);
724 i
->setSrc(i
->tex
.sIndirectSrc
, NULL
);
725 bld
.mkOp3(OP_INSBF
, TYPE_U32
, src
, tscRel
, bld
.mkImm(0x0710), src
);
731 // offset is last source (lod 1st, dc 2nd)
732 if (i
->tex
.useOffsets
) {
735 int s
= i
->srcCount(0xff, true);
736 if (i
->srcExists(s
)) // move potential predicate out of the way
737 i
->moveSources(s
, 1);
738 for (n
= 0; n
< i
->tex
.useOffsets
; ++n
)
739 for (c
= 0; c
< 3; ++c
)
740 value
|= (i
->tex
.offset
[n
][c
] & 0xf) << (n
* 12 + c
* 4);
741 i
->setSrc(s
, bld
.loadImm(NULL
, value
));
744 if (prog
->getTarget()->getChipset() >= NVISA_GK104_CHIPSET
) {
746 // If TEX requires more than 4 sources, the 2nd register tuple must be
747 // aligned to 4, even if it consists of just a single 4-byte register.
749 // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
751 int s
= i
->srcCount(0xff, true);
752 if (s
> 4 && s
< 7) {
753 if (i
->srcExists(s
)) // move potential predicate out of the way
754 i
->moveSources(s
, 7 - s
);
756 i
->setSrc(s
++, bld
.loadImm(NULL
, 0));
764 NVC0LoweringPass::handleManualTXD(TexInstruction
*i
)
766 static const uint8_t qOps
[4][2] =
768 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l0
769 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l1
770 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l2
771 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l3
776 Value
*zero
= bld
.loadImm(bld
.getSSA(), 0);
778 const int dim
= i
->tex
.target
.getDim();
780 i
->op
= OP_TEX
; // no need to clone dPdx/dPdy later
782 for (c
= 0; c
< dim
; ++c
)
783 crd
[c
] = bld
.getScratch();
785 bld
.mkOp(OP_QUADON
, TYPE_NONE
, NULL
);
786 for (l
= 0; l
< 4; ++l
) {
787 // mov coordinates from lane l to all lanes
788 for (c
= 0; c
< dim
; ++c
)
789 bld
.mkQuadop(0x00, crd
[c
], l
, i
->getSrc(c
), zero
);
790 // add dPdx from lane l to lanes dx
791 for (c
= 0; c
< dim
; ++c
)
792 bld
.mkQuadop(qOps
[l
][0], crd
[c
], l
, i
->dPdx
[c
].get(), crd
[c
]);
793 // add dPdy from lane l to lanes dy
794 for (c
= 0; c
< dim
; ++c
)
795 bld
.mkQuadop(qOps
[l
][1], crd
[c
], l
, i
->dPdy
[c
].get(), crd
[c
]);
797 bld
.insert(tex
= cloneForward(func
, i
));
798 for (c
= 0; c
< dim
; ++c
)
799 tex
->setSrc(c
, crd
[c
]);
801 for (c
= 0; i
->defExists(c
); ++c
) {
803 def
[c
][l
] = bld
.getSSA();
804 mov
= bld
.mkMov(def
[c
][l
], tex
->getDef(c
));
809 bld
.mkOp(OP_QUADPOP
, TYPE_NONE
, NULL
);
811 for (c
= 0; i
->defExists(c
); ++c
) {
812 Instruction
*u
= bld
.mkOp(OP_UNION
, TYPE_U32
, i
->getDef(c
));
813 for (l
= 0; l
< 4; ++l
)
814 u
->setSrc(l
, def
[c
][l
]);
822 NVC0LoweringPass::handleTXD(TexInstruction
*txd
)
824 int dim
= txd
->tex
.target
.getDim();
825 int arg
= txd
->tex
.target
.getArgCount();
828 while (txd
->srcExists(arg
))
831 txd
->tex
.derivAll
= true;
833 txd
->tex
.target
.isCube() ||
835 txd
->tex
.target
.isShadow())
836 return handleManualTXD(txd
);
838 for (int c
= 0; c
< dim
; ++c
) {
839 txd
->setSrc(arg
+ c
* 2 + 0, txd
->dPdx
[c
]);
840 txd
->setSrc(arg
+ c
* 2 + 1, txd
->dPdy
[c
]);
841 txd
->dPdx
[c
].set(NULL
);
842 txd
->dPdy
[c
].set(NULL
);
848 NVC0LoweringPass::handleTXQ(TexInstruction
*txq
)
850 // TODO: indirect resource/sampler index
855 NVC0LoweringPass::handleATOM(Instruction
*atom
)
859 switch (atom
->src(0).getFile()) {
860 case FILE_MEMORY_LOCAL
:
863 case FILE_MEMORY_SHARED
:
867 assert(atom
->src(0).getFile() == FILE_MEMORY_GLOBAL
);
871 bld
.mkOp1v(OP_RDSV
, TYPE_U32
, bld
.getScratch(), bld
.mkSysVal(sv
, 0));
872 Value
*ptr
= atom
->getIndirect(0, 0);
874 atom
->setSrc(0, cloneShallow(func
, atom
->getSrc(0)));
875 atom
->getSrc(0)->reg
.file
= FILE_MEMORY_GLOBAL
;
877 base
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, base
, base
, ptr
);
878 atom
->setIndirect(0, 0, base
);
884 NVC0LoweringPass::handleCasExch(Instruction
*cas
, bool needCctl
)
886 if (cas
->subOp
!= NV50_IR_SUBOP_ATOM_CAS
&&
887 cas
->subOp
!= NV50_IR_SUBOP_ATOM_EXCH
)
889 bld
.setPosition(cas
, true);
892 Instruction
*cctl
= bld
.mkOp1(OP_CCTL
, TYPE_NONE
, NULL
, cas
->getSrc(0));
893 cctl
->setIndirect(0, 0, cas
->getIndirect(0, 0));
895 cctl
->subOp
= NV50_IR_SUBOP_CCTL_IV
;
896 if (cas
->isPredicated())
897 cctl
->setPredicate(cas
->cc
, cas
->getPredicate());
900 if (cas
->defExists(0) && cas
->subOp
== NV50_IR_SUBOP_ATOM_CAS
) {
901 // CAS is crazy. Its 2nd source is a double reg, and the 3rd source
902 // should be set to the high part of the double reg or bad things will
903 // happen elsewhere in the universe.
904 // Also, it sometimes returns the new value instead of the old one
905 // under mysterious circumstances.
906 Value
*dreg
= bld
.getSSA(8);
907 bld
.setPosition(cas
, false);
908 bld
.mkOp2(OP_MERGE
, TYPE_U64
, dreg
, cas
->getSrc(1), cas
->getSrc(2));
909 cas
->setSrc(1, dreg
);
916 NVC0LoweringPass::loadResInfo32(Value
*ptr
, uint32_t off
)
918 uint8_t b
= prog
->driver
->io
.resInfoCBSlot
;
919 off
+= prog
->driver
->io
.suInfoBase
;
921 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U32
, off
), ptr
);
925 NVC0LoweringPass::loadMsInfo32(Value
*ptr
, uint32_t off
)
927 uint8_t b
= prog
->driver
->io
.msInfoCBSlot
;
928 off
+= prog
->driver
->io
.msInfoBase
;
930 mkLoadv(TYPE_U32
, bld
.mkSymbol(FILE_MEMORY_CONST
, b
, TYPE_U32
, off
), ptr
);
933 /* On nvc0, surface info is obtained via the surface binding points passed
934 * to the SULD/SUST instructions.
935 * On nve4, surface info is stored in c[] and is used by various special
936 * instructions, e.g. for clamping coordinates or generating an address.
937 * They couldn't just have added an equivalent to TIC now, could they ?
939 #define NVE4_SU_INFO_ADDR 0x00
940 #define NVE4_SU_INFO_FMT 0x04
941 #define NVE4_SU_INFO_DIM_X 0x08
942 #define NVE4_SU_INFO_PITCH 0x0c
943 #define NVE4_SU_INFO_DIM_Y 0x10
944 #define NVE4_SU_INFO_ARRAY 0x14
945 #define NVE4_SU_INFO_DIM_Z 0x18
946 #define NVE4_SU_INFO_UNK1C 0x1c
947 #define NVE4_SU_INFO_WIDTH 0x20
948 #define NVE4_SU_INFO_HEIGHT 0x24
949 #define NVE4_SU_INFO_DEPTH 0x28
950 #define NVE4_SU_INFO_TARGET 0x2c
951 #define NVE4_SU_INFO_CALL 0x30
952 #define NVE4_SU_INFO_RAW_X 0x34
953 #define NVE4_SU_INFO_MS_X 0x38
954 #define NVE4_SU_INFO_MS_Y 0x3c
956 #define NVE4_SU_INFO__STRIDE 0x40
958 #define NVE4_SU_INFO_DIM(i) (0x08 + (i) * 8)
959 #define NVE4_SU_INFO_SIZE(i) (0x20 + (i) * 4)
960 #define NVE4_SU_INFO_MS(i) (0x38 + (i) * 4)
962 static inline uint16_t getSuClampSubOp(const TexInstruction
*su
, int c
)
964 switch (su
->tex
.target
.getEnum()) {
965 case TEX_TARGET_BUFFER
: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
966 case TEX_TARGET_RECT
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
967 case TEX_TARGET_1D
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
968 case TEX_TARGET_1D_ARRAY
: return (c
== 1) ?
969 NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
970 NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
971 case TEX_TARGET_2D
: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
972 case TEX_TARGET_2D_MS
: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
973 case TEX_TARGET_2D_ARRAY
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
974 case TEX_TARGET_2D_MS_ARRAY
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
975 case TEX_TARGET_3D
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
976 case TEX_TARGET_CUBE
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
977 case TEX_TARGET_CUBE_ARRAY
: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
985 NVC0LoweringPass::adjustCoordinatesMS(TexInstruction
*tex
)
987 const uint16_t base
= tex
->tex
.r
* NVE4_SU_INFO__STRIDE
;
988 const int arg
= tex
->tex
.target
.getArgCount();
990 if (tex
->tex
.target
== TEX_TARGET_2D_MS
)
991 tex
->tex
.target
= TEX_TARGET_2D
;
993 if (tex
->tex
.target
== TEX_TARGET_2D_MS_ARRAY
)
994 tex
->tex
.target
= TEX_TARGET_2D_ARRAY
;
998 Value
*x
= tex
->getSrc(0);
999 Value
*y
= tex
->getSrc(1);
1000 Value
*s
= tex
->getSrc(arg
- 1);
1002 Value
*tx
= bld
.getSSA(), *ty
= bld
.getSSA(), *ts
= bld
.getSSA();
1004 Value
*ms_x
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_MS(0));
1005 Value
*ms_y
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_MS(1));
1007 bld
.mkOp2(OP_SHL
, TYPE_U32
, tx
, x
, ms_x
);
1008 bld
.mkOp2(OP_SHL
, TYPE_U32
, ty
, y
, ms_y
);
1010 s
= bld
.mkOp2v(OP_AND
, TYPE_U32
, ts
, s
, bld
.loadImm(NULL
, 0x7));
1011 s
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, ts
, ts
, bld
.mkImm(3));
1013 Value
*dx
= loadMsInfo32(ts
, 0x0);
1014 Value
*dy
= loadMsInfo32(ts
, 0x4);
1016 bld
.mkOp2(OP_ADD
, TYPE_U32
, tx
, tx
, dx
);
1017 bld
.mkOp2(OP_ADD
, TYPE_U32
, ty
, ty
, dy
);
1021 tex
->moveSources(arg
, -1);
1024 // Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
1025 // They're computed from the coordinates using the surface info in c[] space.
1027 NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction
*su
)
1030 const bool atom
= su
->op
== OP_SUREDB
|| su
->op
== OP_SUREDP
;
1032 su
->op
== OP_SULDB
|| su
->op
== OP_SUSTB
|| su
->op
== OP_SUREDB
;
1033 const int idx
= su
->tex
.r
;
1034 const int dim
= su
->tex
.target
.getDim();
1035 const int arg
= dim
+ (su
->tex
.target
.isArray() ? 1 : 0);
1036 const uint16_t base
= idx
* NVE4_SU_INFO__STRIDE
;
1038 Value
*zero
= bld
.mkImm(0);
1042 Value
*bf
, *eau
, *off
;
1045 off
= bld
.getScratch(4);
1046 bf
= bld
.getScratch(4);
1047 addr
= bld
.getSSA(8);
1048 pred
= bld
.getScratch(1, FILE_PREDICATE
);
1050 bld
.setPosition(su
, false);
1052 adjustCoordinatesMS(su
);
1054 // calculate clamped coordinates
1055 for (c
= 0; c
< arg
; ++c
) {
1056 src
[c
] = bld
.getScratch();
1058 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_RAW_X
);
1060 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_DIM(c
));
1061 bld
.mkOp3(OP_SUCLAMP
, TYPE_S32
, src
[c
], su
->getSrc(c
), v
, zero
)
1062 ->subOp
= getSuClampSubOp(su
, c
);
1067 // set predicate output
1068 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1069 src
[0]->getInsn()->setFlagsDef(1, pred
);
1071 if (su
->tex
.target
.isArray()) {
1072 p1
= bld
.getSSA(1, FILE_PREDICATE
);
1073 src
[dim
]->getInsn()->setFlagsDef(1, p1
);
1076 // calculate pixel offset
1078 if (su
->tex
.target
!= TEX_TARGET_BUFFER
)
1079 bld
.mkOp2(OP_AND
, TYPE_U32
, off
, src
[0], bld
.loadImm(NULL
, 0xffff));
1082 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_UNK1C
);
1083 bld
.mkOp3(OP_MADSP
, TYPE_U32
, off
, src
[2], v
, src
[1])
1084 ->subOp
= NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
1086 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_PITCH
);
1087 bld
.mkOp3(OP_MADSP
, TYPE_U32
, off
, off
, v
, src
[0])
1088 ->subOp
= NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
1091 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_PITCH
);
1092 bld
.mkOp3(OP_MADSP
, TYPE_U32
, off
, src
[1], v
, src
[0])
1093 ->subOp
= su
->tex
.target
.isArray() ?
1094 NV50_IR_SUBOP_MADSP_SD
: NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
1097 // calculate effective address part 1
1098 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1102 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_FMT
);
1103 bld
.mkOp3(OP_VSHL
, TYPE_U32
, bf
, src
[0], v
, zero
)
1104 ->subOp
= NV50_IR_SUBOP_V1(7,6,8|2);
1118 if (!su
->tex
.target
.isArray()) {
1119 z
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_UNK1C
);
1120 subOp
= NV50_IR_SUBOP_SUBFM_3D
;
1124 subOp
= NV50_IR_SUBOP_SUBFM_3D
;
1128 insn
= bld
.mkOp3(OP_SUBFM
, TYPE_U32
, bf
, src
[0], y
, z
);
1129 insn
->subOp
= subOp
;
1130 insn
->setFlagsDef(1, pred
);
1134 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_ADDR
);
1136 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1139 eau
= bld
.mkOp3v(OP_SUEAU
, TYPE_U32
, bld
.getScratch(4), off
, bf
, v
);
1141 // add array layer offset
1142 if (su
->tex
.target
.isArray()) {
1143 v
= loadResInfo32(NULL
, base
+ NVE4_SU_INFO_ARRAY
);
1145 bld
.mkOp3(OP_MADSP
, TYPE_U32
, eau
, src
[1], v
, eau
)
1146 ->subOp
= NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
1148 bld
.mkOp3(OP_MADSP
, TYPE_U32
, eau
, v
, src
[2], eau
)
1149 ->subOp
= NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
1150 // combine predicates
1152 bld
.mkOp2(OP_OR
, TYPE_U8
, pred
, pred
, p1
);
1157 if (su
->tex
.target
== TEX_TARGET_BUFFER
) {
1161 // bf == g[] address & 0xff
1162 // eau == g[] address >> 8
1163 bld
.mkOp3(OP_PERMT
, TYPE_U32
, bf
, lo
, bld
.loadImm(NULL
, 0x6540), eau
);
1164 bld
.mkOp3(OP_PERMT
, TYPE_U32
, eau
, zero
, bld
.loadImm(NULL
, 0x0007), eau
);
1166 if (su
->op
== OP_SULDP
&& su
->tex
.target
== TEX_TARGET_BUFFER
) {
1167 // Convert from u32 to u8 address format, which is what the library code
1168 // doing SULDP currently uses.
1169 // XXX: can SUEAU do this ?
1170 // XXX: does it matter that we don't mask high bytes in bf ?
1172 bld
.mkOp2(OP_SHR
, TYPE_U32
, off
, bf
, bld
.mkImm(8));
1173 bld
.mkOp2(OP_ADD
, TYPE_U32
, eau
, eau
, off
);
1176 bld
.mkOp2(OP_MERGE
, TYPE_U64
, addr
, bf
, eau
);
1178 if (atom
&& su
->tex
.target
== TEX_TARGET_BUFFER
)
1179 bld
.mkOp2(OP_ADD
, TYPE_U64
, addr
, addr
, off
);
1181 // let's just set it to 0 for raw access and hope it works
1183 bld
.mkImm(0) : loadResInfo32(NULL
, base
+ NVE4_SU_INFO_FMT
);
1185 // get rid of old coordinate sources, make space for fmt info and predicate
1186 su
->moveSources(arg
, 3 - arg
);
1187 // set 64 bit address and 32-bit format sources
1188 su
->setSrc(0, addr
);
1190 su
->setSrc(2, pred
);
// Lowers one surface instruction on the NVE4 path (visit() only calls this
// when targ->getChipset() >= NVISA_GK104_CHIPSET).
//  - OP_SULDP is replaced by an OP_CALL whose target is read from the
//    resource-info constant buffer.
//  - OP_SUREDB / OP_SUREDP are replaced by a predicated OP_ATOM on global
//    memory.
// NOTE(review): this listing skips some original lines (see the gaps in the
// embedded line numbers), so the declarations of r[], p[] and gMemBase and
// part of the control flow (else / return) are not visible here.
1194 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction
*su
)
// Process the coordinate sources first. The code below then reads src 0 as a
// 64-bit value, src 1 as a 32-bit value and src 2 as a predicate.
1196 processSurfaceCoordsNVE4(su
);
1198 // Who do we hate more ? The person who decided that nvc0's SULD doesn't
1199 // have to support conversion or the person who decided that, in OpenCL,
1200 // you don't have to specify the format here like you do in OpenGL ?
1202 if (su
->op
== OP_SULDP
) {
1203 // We don't patch shaders. Ever.
1204 // You get an indirect call to our library blob here.
1205 // But at least it's uniform.
1206 FlowInstruction
*call
;
// Offset of the call entry: su->tex.r scaled by NVE4_SU_INFO__STRIDE, plus
// NVE4_SU_INFO_CALL.
1209 uint16_t base
= su
->tex
.r
* NVE4_SU_INFO__STRIDE
+ NVE4_SU_INFO_CALL
;
// Give the call operands fixed register ids: 4-byte GPRs 0..3, predicates
// 0..2 and one 8-byte GPR with id 4.
1211 for (int i
= 0; i
< 4; ++i
)
1212 (r
[i
] = bld
.getScratch(4, FILE_GPR
))->reg
.data
.id
= i
;
1213 for (int i
= 0; i
< 3; ++i
)
1214 (p
[i
] = bld
.getScratch(1, FILE_PREDICATE
))->reg
.data
.id
= i
;
1215 (r
[4] = bld
.getScratch(8, FILE_GPR
))->reg
.data
.id
= 4;
// p[1] / p[2] are set to 1 when the cache mode is CACHE_CA / CACHE_CG.
1217 bld
.mkMov(p
[1], bld
.mkImm((su
->cache
== CACHE_CA
) ? 1 : 0), TYPE_U8
);
1218 bld
.mkMov(p
[2], bld
.mkImm((su
->cache
== CACHE_CG
) ? 1 : 0), TYPE_U8
);
// p[0] <- src 2, r[4] <- src 0 (64 bit), r[2] <- src 1 (32 bit).
1219 bld
.mkMov(p
[0], su
->getSrc(2), TYPE_U8
);
1220 bld
.mkMov(r
[4], su
->getSrc(0), TYPE_U64
);
1221 bld
.mkMov(r
[2], su
->getSrc(1), TYPE_U32
);
// The call keeps the condition code and predicate of su.
1223 call
= bld
.mkFlow(OP_CALL
, NULL
, su
->cc
, su
->getPredicate());
// Source 0 is a constant-buffer symbol (slot resInfoCBSlot) at
// suInfoBase + base; sources 1..5 are r[2], r[4] and p[0..2].
1227 call
->setSrc(0, bld
.mkSymbol(FILE_MEMORY_CONST
,
1228 prog
->driver
->io
.resInfoCBSlot
, TYPE_U32
,
1229 prog
->driver
->io
.suInfoBase
+ base
));
1230 call
->setSrc(1, r
[2]);
1231 call
->setSrc(2, r
[4]);
1232 for (int i
= 0; i
< 3; ++i
)
1233 call
->setSrc(3 + i
, p
[i
]);
// r[0..3] are defs of the call; each one is copied into the matching def
// of su.
1234 for (int i
= 0; i
< 4; ++i
) {
1235 call
->setDef(i
, r
[i
]);
1236 bld
.mkMov(su
->getDef(i
), r
[i
]);
// p[1] is registered as a fifth def of the call, then su is deleted.
1238 call
->setDef(4, p
[1]);
1239 delete_Instruction(bld
.getProgram(), su
);
// Surface reductions are rewritten as OP_ATOM.
1242 if (su
->op
== OP_SUREDB
|| su
->op
== OP_SUREDP
) {
1243 // FIXME: for out of bounds access, destination value will be undefined !
// Default predication: src 2 with CC_NOT_P.
1244 Value
*pred
= su
->getSrc(2);
1245 CondCode cc
= CC_NOT_P
;
// If su has its own predicate, merge it with src 2 into a new predicate
// register: OR when cc is CC_NOT_P, otherwise AND with source 1 of the
// combining instruction negated.
// NOTE(review): the line between 1247 and 1249 is not visible; presumably
// cc is updated from su there - confirm.
1246 if (su
->getPredicate()) {
1247 pred
= bld
.getScratch(1, FILE_PREDICATE
);
1249 if (cc
== CC_NOT_P
) {
1250 bld
.mkOp2(OP_OR
, TYPE_U8
, pred
, su
->getPredicate(), su
->getSrc(2));
1252 bld
.mkOp2(OP_AND
, TYPE_U8
, pred
, su
->getPredicate(), su
->getSrc(2));
1253 pred
->getInsn()->src(1).mod
= Modifier(NV50_IR_MOD_NOT
);
// The atomic takes over su's data type, def 0 and subOp.
1256 Instruction
*red
= bld
.mkOp(OP_ATOM
, su
->dType
, su
->getDef(0));
1257 red
->subOp
= su
->subOp
;
// Source 0 is a global-memory symbol at offset 0, indirectly addressed by
// su's src 0. Source 1 is su's src 3; for compare-and-swap su's src 4 is
// added as source 2.
1259 gMemBase
= bld
.mkSymbol(FILE_MEMORY_GLOBAL
, 0, TYPE_U32
, 0);
1260 red
->setSrc(0, gMemBase
);
1261 red
->setSrc(1, su
->getSrc(3));
1262 if (su
->subOp
== NV50_IR_SUBOP_ATOM_CAS
)
1263 red
->setSrc(2, su
->getSrc(4));
1264 red
->setIndirect(0, 0, su
->getSrc(0));
1265 red
->setPredicate(cc
, pred
);
// su is deleted; the new instruction is handed to handleCasExch() with its
// second argument set to true.
1266 delete_Instruction(bld
.getProgram(), su
);
1267 handleCasExch(red
, true);
// Source type becomes TYPE_U32 for buffer targets and TYPE_U8 otherwise.
// NOTE(review): which of the paths above reach this statement depends on
// lines that are not visible in this listing.
1269 su
->sType
= (su
->tex
.target
== TEX_TARGET_BUFFER
) ? TYPE_U32
: TYPE_U8
;
// Replaces a write to a system value by an OP_EXPORT store to the shader
// output address that the target reports for that system value, then removes
// the original instruction from its basic block.
// NOTE(review): the local declarations and any early exit between original
// lines 1274 and 1280 are not visible in this listing.
1274 NVC0LoweringPass::handleWRSV(Instruction
*i
)
1280 // must replace, $sreg are not writeable
// Output address of the system value named by source 0.
1281 addr
= targ
->getSVAddress(FILE_SHADER_OUTPUT
, i
->getSrc(0)->asSym());
// Shader-output symbol at that address, typed with the source type of i.
1284 sym
= bld
.mkSymbol(FILE_SHADER_OUTPUT
, 0, i
->sType
, addr
);
// The store reuses the data type and the indirect address (0, 0) of i.
1286 st
= bld
.mkStore(OP_EXPORT
, i
->dType
, sym
, i
->getIndirect(0, 0),
// NOTE(review): the continuation of the mkStore() call above (its remaining
// argument) is missing from this listing.
// The per-patch flag is carried over to the store.
1288 st
->perPatch
= i
->perPatch
;
1290 bld
.getBB()->remove(i
);
// Emits code that produces a tessellation coordinate in dst.
// Visible operations: read SV_LANEID, fetch two F32 values from
// FILE_SHADER_OUTPUT at 0x2f0 (x) and 0x2f4 (y) using the lane id, and compute
// dst = 1.0f - (x + y).
// NOTE(review): the lines that use the component index c to choose between
// these operations (and the declarations of x and y) are not visible in this
// listing - confirm which statements run for which c.
1295 NVC0LoweringPass::readTessCoord(LValue
*dst
, int c
)
1297 Value
*laneid
= bld
.getSSA();
// laneid <- system value SV_LANEID.
1300 bld
.mkOp1(OP_RDSV
, TYPE_U32
, laneid
, bld
.mkSysVal(SV_LANEID
, 0));
// x is fetched from offset 0x2f0, y from offset 0x2f4; laneid is passed as the
// last mkFetch() argument in both cases.
1315 bld
.mkFetch(x
, TYPE_F32
, FILE_SHADER_OUTPUT
, 0x2f0, NULL
, laneid
);
1317 bld
.mkFetch(y
, TYPE_F32
, FILE_SHADER_OUTPUT
, 0x2f4, NULL
, laneid
);
// dst = x + y, then dst = 1.0f - dst.
1320 bld
.mkOp2(OP_ADD
, TYPE_F32
, dst
, x
, y
);
1321 bld
.mkOp2(OP_SUB
, TYPE_F32
, dst
, bld
.loadImm(NULL
, 1.0f
), dst
);
// Lowers a read of a system value (source 0 is the system-value symbol).
// Depending on the system value the read becomes an immediate, an
// interpolation, a tessellation-coordinate computation, a constant-buffer load
// or a shader-input fetch; the original instruction is removed from its basic
// block on the last visible line.
// NOTE(review): the lines that select between the paths below (and several
// early exits) are missing from this listing, so the mapping from system value
// to path cannot be read off here.
1326 NVC0LoweringPass::handleRDSV(Instruction
*i
)
1328 Symbol
*sym
= i
->getSrc(0)->asSym();
1329 const SVSemantic sv
= sym
->reg
.data
.sv
.sv
;
// Address the target assigns to this system value as a shader input.
1332 uint32_t addr
= targ
->getSVAddress(FILE_SHADER_INPUT
, sym
);
// Addresses >= 0x400: only the handling of component index 3 is visible. Its
// source is replaced by the immediate 1 for SV_NTID / SV_NCTAID and 0
// otherwise.
1334 if (addr
>= 0x400) {
1336 if (sym
->reg
.data
.sv
.index
== 3) {
1337 // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
1339 i
->setSrc(0, bld
.mkImm((sv
== SV_NTID
|| sv
== SV_NCTAID
) ? 1 : 0));
// Fragment programs only: linear interpolation of addr into def 0.
1346 assert(prog
->getType() == Program::TYPE_FRAGMENT
);
1347 bld
.mkInterp(NV50_IR_INTERP_LINEAR
, i
->getDef(0), addr
, NULL
);
// Flat interpolation into def 0. For an F32 destination the value is reduced
// to its sign bit and XORed with 0xbf800000 (the bit pattern of -1.0f), which
// yields the bit pattern of +1.0f when the sign bit was set and -1.0f when it
// was clear.
1351 Value
*face
= i
->getDef(0);
1352 bld
.mkInterp(NV50_IR_INTERP_FLAT
, face
, addr
, NULL
);
1353 if (i
->dType
== TYPE_F32
) {
1354 bld
.mkOp2(OP_AND
, TYPE_U32
, face
, face
, bld
.mkImm(0x80000000));
1355 bld
.mkOp2(OP_XOR
, TYPE_U32
, face
, face
, bld
.mkImm(0xbf800000));
// Tessellation evaluation programs only: def 0 is computed by readTessCoord()
// for the component index of the system value.
1360 assert(prog
->getType() == Program::TYPE_TESSELLATION_EVAL
);
1361 readTessCoord(i
->getDef(0)->asLValue(), i
->getSrc(0)->reg
.data
.sv
.index
);
// GK104 and newer only. Component index 3 becomes an immediate (0 for
// SV_GRIDID, 1 otherwise).
1366 assert(targ
->getChipset() >= NVISA_GK104_CHIPSET
); // mov $sreg otherwise
1367 if (sym
->reg
.data
.sv
.index
== 3) {
1369 i
->setSrc(0, bld
.mkImm(sv
== SV_GRIDID
? 0 : 1));
// The value is loaded as U32 from constant memory (file index 0) at
// gridInfoBase + addr.
// NOTE(review): whether this load is skipped for component index 3 depends on
// lines not visible here.
1372 addr
+= prog
->driver
->prop
.cp
.gridInfoBase
;
1373 bld
.mkLoad(TYPE_U32
, i
->getDef(0),
1374 bld
.mkSymbol(FILE_MEMORY_CONST
, 0, TYPE_U32
, addr
), NULL
);
// Fetch from FILE_SHADER_INPUT at addr, keeping the indirect address and the
// per-patch flag of i. In tessellation evaluation programs an OP_PFETCH
// of immediate 0 supplies the last mkFetch() argument (vtx).
1377 if (prog
->getType() == Program::TYPE_TESSELLATION_EVAL
)
1378 vtx
= bld
.mkOp1v(OP_PFETCH
, TYPE_U32
, bld
.getSSA(), bld
.mkImm(0));
1379 ld
= bld
.mkFetch(i
->getDef(0), i
->dType
,
1380 FILE_SHADER_INPUT
, addr
, i
->getIndirect(0, 0), vtx
);
1381 ld
->perPatch
= i
->perPatch
;
// The system-value read itself is removed from its basic block.
1384 bld
.getBB()->remove(i
);
// Lowering of a division in this pass. The visible code builds OP_RCP of
// source 1 in the instruction's data type and installs the result as the new
// source 1.
// NOTE(review): the statement guarded by the !isFloatType() check and the line
// between 1394 and 1396 are not visible; presumably non-float types are left
// untouched and the opcode is changed to a multiply - confirm.
1389 NVC0LoweringPass::handleDIV(Instruction
*i
)
1391 if (!isFloatType(i
->dType
))
// New instructions are inserted relative to i (second argument false).
1393 bld
.setPosition(i
, false);
// rcp = 1 / src1, written to a fresh SSA value.
1394 Instruction
*rcp
= bld
.mkOp1(OP_RCP
, i
->dType
, bld
.getSSA(), i
->getSrc(1));
1396 i
->setSrc(1, rcp
->getDef(0));
// Lowering of an F32 modulus. The visible code computes
//    value = src1 * trunc(src0 * rcp(src1))
// in a scratch register and installs it as the new source 1.
// NOTE(review): the statement guarded by the dType check and the line between
// 1409 and 1411 are not visible; presumably other types are left untouched and
// the opcode becomes a subtraction (src0 - value) - confirm.
1401 NVC0LoweringPass::handleMOD(Instruction
*i
)
1403 if (i
->dType
!= TYPE_F32
)
1405 LValue
*value
= bld
.getScratch();
// value = 1 / src1
1406 bld
.mkOp1(OP_RCP
, TYPE_F32
, value
, i
->getSrc(1));
// value = src0 * value
1407 bld
.mkOp2(OP_MUL
, TYPE_F32
, value
, i
->getSrc(0), value
);
// value = trunc(value)
1408 bld
.mkOp1(OP_TRUNC
, TYPE_F32
, value
, value
);
// value = src1 * value
1409 bld
.mkOp2(OP_MUL
, TYPE_F32
, value
, i
->getSrc(1), value
);
1411 i
->setSrc(1, value
);
// Lowering of a square root. The visible code builds an F32 OP_RSQ of source 0
// and installs its result as source 1 of the instruction.
// NOTE(review): the line between 1419 and 1421 is not visible; presumably the
// opcode is changed to a multiply so that the result is
// src0 * rsq(src0) - confirm.
1416 NVC0LoweringPass::handleSQRT(Instruction
*i
)
// rsq = reciprocal square root of src0, written to a fresh SSA value.
1418 Instruction
*rsq
= bld
.mkOp1(OP_RSQ
, TYPE_F32
,
1419 bld
.getSSA(), i
->getSrc(0));
1421 i
->setSrc(1, rsq
->getDef(0));
// Lowering of a power operation. The visible code computes, in one scratch
// register, val = OP_PREEX2(src1 * OP_LG2(src0)), with the dnz flag set on the
// multiply.
// NOTE(review): the lines after 1433 that rewrite the instruction itself are
// not visible in this listing - confirm how val is consumed.
1427 NVC0LoweringPass::handlePOW(Instruction
*i
)
1429 LValue
*val
= bld
.getScratch();
// val = lg2(src0)
1431 bld
.mkOp1(OP_LG2
, TYPE_F32
, val
, i
->getSrc(0));
// val = src1 * val, with dnz = 1 on the multiply
1432 bld
.mkOp2(OP_MUL
, TYPE_F32
, val
, i
->getSrc(1), val
)->dnz
= 1;
// val = preex2(val)
1433 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, val
, val
);
// Lowering of an export.
//  - Fragment programs: the exported value is placed in a GPR whose id is the
//    offset of source 0 divided by 4; prog->maxGPR is raised to cover that id.
//  - Geometry programs: gpEmitAddress is set as indirect (0, 1) of the export.
// NOTE(review): the statement guarded by the isIndirect() check and the line
// between 1449 and 1451 (presumably the opcode change that goes with
// NV50_IR_SUBOP_MOV_FINAL) are not visible in this listing - confirm.
1443 NVC0LoweringPass::handleEXPORT(Instruction
*i
)
1445 if (prog
->getType() == Program::TYPE_FRAGMENT
) {
// Register id derived from the offset of source 0.
1446 int id
= i
->getSrc(0)->reg
.data
.offset
/ 4;
1448 if (i
->src(0).isIndirect(0)) // TODO, ugly
// Source 1 becomes source 0 and the instruction is tagged with the MOV_FINAL
// subOp.
1451 i
->subOp
= NV50_IR_SUBOP_MOV_FINAL
;
1452 i
->src(0).set(i
->src(1));
// The destination is a new GPR value with the id computed above.
1454 i
->setDef(0, new_LValue(func
, FILE_GPR
));
1455 i
->getDef(0)->reg
.data
.id
= id
;
1457 prog
->maxGPR
= MAX2(prog
->maxGPR
, id
);
1459 if (prog
->getType() == Program::TYPE_GEOMETRY
) {
1460 i
->setIndirect(0, 1, gpEmitAddress
);
// Lowering of the instruction kinds visit() routes here.
//  - An OP_RESTART directly preceded by an OP_EMIT is folded into that emit
//    (subOp NV50_IR_SUBOP_EMIT_RESTART) and deleted.
//  - Otherwise gpEmitAddress becomes both def 0 and source 0 of the
//    instruction; an existing source 0 is moved to source 1 first.
// NOTE(review): the else / return lines separating the two paths are not
// visible in this listing.
1466 NVC0LoweringPass::handleOUT(Instruction
*i
)
1468 if (i
->op
== OP_RESTART
&& i
->prev
&& i
->prev
->op
== OP_EMIT
) {
1469 i
->prev
->subOp
= NV50_IR_SUBOP_EMIT_RESTART
;
1470 delete_Instruction(prog
, i
);
// gpEmitAddress must have been set before this path is used.
1472 assert(gpEmitAddress
);
1473 i
->setDef(0, gpEmitAddress
);
// Keep the original source 0 (if any) as source 1.
1474 if (i
->srcExists(0))
1475 i
->setSrc(1, i
->getSrc(0));
1476 i
->setSrc(0, gpEmitAddress
);
1481 // Generate a binary predicate if an instruction is predicated by
1482 // e.g. an f32 value.
// The visible code compares the current predicate value against immediate 0
// with OP_SET / CC_NEU in the instruction's data type, writes the result to a
// new FILE_PREDICATE value, and installs it as the predicate while keeping
// insn->cc.
// NOTE(review): the statement guarded by the first check is not visible;
// presumably nothing is done when there is no predicate or it already lives in
// FILE_PREDICATE - confirm.
1484 NVC0LoweringPass::checkPredicate(Instruction
*insn
)
1486 Value
*pred
= insn
->getPredicate();
1489 if (!pred
|| pred
->reg
.file
== FILE_PREDICATE
)
// Destination for the generated binary predicate.
1491 pdst
= new_LValue(func
, FILE_PREDICATE
);
1493 // CAUTION: don't use pdst->getInsn, the definition might not be unique,
1494 // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
1496 bld
.mkCmp(OP_SET
, CC_NEU
, insn
->dType
, pdst
, insn
->dType
, bld
.mkImm(0), pred
);
1498 insn
->setPredicate(insn
->cc
, pdst
);
1502 // - add quadop dance for texturing
1503 // - put FP outputs in GPRs
1504 // - convert instruction sequences
// Per-instruction entry point of the lowering pass: positions the builder at
// the instruction and dispatches to the matching handle*() helper.
// NOTE(review): the lines that select each path (and the statement guarded by
// the cc check at line 1511) are missing from this listing, so the opcode that
// leads to each call below cannot be read off here.
1507 NVC0LoweringPass::visit(Instruction
*i
)
1509 bld
.setPosition(i
, false);
1511 if (i
->cc
!= CC_ALWAYS
)
// Texture instructions are handed over as TexInstruction.
1520 return handleTEX(i
->asTex());
1522 return handleTXD(i
->asTex());
1524 return handleTXQ(i
->asTex());
// An OP_PREEX2 of source 0 is written to def 0, and def 0 then replaces
// source 0 of the instruction.
1526 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, i
->getDef(0), i
->getSrc(0));
1527 i
->setSrc(0, i
->getDef(0));
1530 return handlePOW(i
);
1532 return handleDIV(i
);
1534 return handleMOD(i
);
1536 return handleSQRT(i
);
1538 return handleEXPORT(i
);
1541 return handleOUT(i
);
1543 return handleRDSV(i
);
1545 return handleWRSV(i
);
// Reads from FILE_SHADER_INPUT: in compute programs the source symbol is
// redirected to FILE_MEMORY_CONST with file index 0. The assert below checks
// that the program is not a fragment program.
1547 if (i
->src(0).getFile() == FILE_SHADER_INPUT
) {
1548 if (prog
->getType() == Program::TYPE_COMPUTE
) {
1549 i
->getSrc(0)->reg
.file
= FILE_MEMORY_CONST
;
1550 i
->getSrc(0)->reg
.fileIndex
= 0;
1553 assert(prog
->getType() != Program::TYPE_FRAGMENT
); // INTERP
// handleCasExch() is told whether source 0 lives in global memory.
1559 const bool cctl
= i
->src(0).getFile() == FILE_MEMORY_GLOBAL
;
1561 handleCasExch(i
, cctl
);
// Surface operations are lowered by handleSurfaceOpNVE4() on GK104 and newer.
1570 if (targ
->getChipset() >= NVISA_GK104_CHIPSET
)
1571 handleSurfaceOpNVE4(i
->asTex());
// Runs the legalization pass that belongs to the given code generation stage:
//   CG_STAGE_PRE_SSA -> NVC0LoweringPass
//   CG_STAGE_POST_RA -> NVC0LegalizePostRA
//   CG_STAGE_SSA     -> NVC0LegalizeSSA
// Each pass is run with run(prog, false, true) and its result is returned.
// NOTE(review): the return value for any other stage is on a line that is not
// visible in this listing.
1580 TargetNVC0::runLegalizePass(Program
*prog
, CGStage stage
) const
1582 if (stage
== CG_STAGE_PRE_SSA
) {
1583 NVC0LoweringPass
pass(prog
);
1584 return pass
.run(prog
, false, true);
1586 if (stage
== CG_STAGE_POST_RA
) {
1587 NVC0LegalizePostRA
pass(prog
);
1588 return pass
.run(prog
, false, true);
1590 if (stage
== CG_STAGE_SSA
) {
1591 NVC0LegalizeSSA pass
;
1592 return pass
.run(prog
, false, true);
1597 } // namespace nv50_ir