2 * Copyright 2011 Christoph Bumiller
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_build_util.h"
26 #include "codegen/nv50_ir_target_nv50.h"
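// nv50 doesn't support 32 bit integer multiplication
//
// Split each operand into 16-bit halves (a = ah:al, b = bh:bl) and combine
// the four partial products:
//
//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
// -------------------
//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
// ah*bh 00 00                 (           carry1) << 16 + ( carry2)
//       al*bl
//    ah*bl 00
//
// carry1 is the carry out of the cross-term sum (al * bh + ah * bl): each
// cross term can be as large as 0xffff * 0xffff = 0xfffe0001, and
// fffe0001 + fffe0001 does not fit in 32 bits. carry2 is the carry out of the
// LO32 addition.
//
// Note that this sort of splitting doesn't work for signed values, so we
// compute the sign on those manually and then perform an unsigned multiply.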
44 expandIntegerMUL(BuildUtil
*bld
, Instruction
*mul
)
46 const bool highResult
= mul
->subOp
== NV50_IR_SUBOP_MUL_HIGH
;
48 DataType fTy
; // full type
50 case TYPE_S32
: fTy
= TYPE_U32
; break;
51 case TYPE_S64
: fTy
= TYPE_U64
; break;
52 default: fTy
= mul
->sType
; break;
55 DataType hTy
; // half type
57 case TYPE_U32
: hTy
= TYPE_U16
; break;
58 case TYPE_U64
: hTy
= TYPE_U32
; break;
62 unsigned int fullSize
= typeSizeof(fTy
);
63 unsigned int halfSize
= typeSizeof(hTy
);
67 bld
->setPosition(mul
, true);
72 for (int j
= 0; j
< 4; ++j
)
73 t
[j
] = bld
->getSSA(fullSize
);
75 s
[0] = mul
->getSrc(0);
76 s
[1] = mul
->getSrc(1);
78 if (isSignedType(mul
->sType
) && highResult
) {
79 s
[0] = bld
->getSSA(fullSize
);
80 s
[1] = bld
->getSSA(fullSize
);
81 bld
->mkOp1(OP_ABS
, mul
->sType
, s
[0], mul
->getSrc(0));
82 bld
->mkOp1(OP_ABS
, mul
->sType
, s
[1], mul
->getSrc(1));
85 // split sources into halves
86 i
[0] = bld
->mkSplit(a
, halfSize
, s
[0]);
87 i
[1] = bld
->mkSplit(b
, halfSize
, s
[1]);
89 i
[2] = bld
->mkOp2(OP_MUL
, fTy
, t
[0], a
[0], b
[1]);
90 i
[3] = bld
->mkOp3(OP_MAD
, fTy
, t
[1], a
[1], b
[0], t
[0]);
91 i
[7] = bld
->mkOp2(OP_SHL
, fTy
, t
[2], t
[1], bld
->mkImm(halfSize
* 8));
92 i
[4] = bld
->mkOp3(OP_MAD
, fTy
, t
[3], a
[0], b
[0], t
[2]);
97 Value
*imm
= bld
->loadImm(NULL
, 1 << (halfSize
* 8));
98 c
[0] = bld
->getSSA(1, FILE_FLAGS
);
99 c
[1] = bld
->getSSA(1, FILE_FLAGS
);
100 for (int j
= 0; j
< 5; ++j
)
101 r
[j
] = bld
->getSSA(fullSize
);
103 i
[8] = bld
->mkOp2(OP_SHR
, fTy
, r
[0], t
[1], bld
->mkImm(halfSize
* 8));
104 i
[6] = bld
->mkOp2(OP_ADD
, fTy
, r
[1], r
[0], imm
);
105 bld
->mkMov(r
[3], r
[0])->setPredicate(CC_NC
, c
[0]);
106 bld
->mkOp2(OP_UNION
, TYPE_U32
, r
[2], r
[1], r
[3]);
107 i
[5] = bld
->mkOp3(OP_MAD
, fTy
, r
[4], a
[1], b
[1], r
[2]);
109 // set carry defs / sources
110 i
[3]->setFlagsDef(1, c
[0]);
111 // actual result required in negative case, but ignored for
112 // unsigned. for some reason the compiler ends up dropping the whole
113 // instruction if the destination is unused but the flags are.
114 if (isSignedType(mul
->sType
))
115 i
[4]->setFlagsDef(1, c
[1]);
117 i
[4]->setFlagsDef(0, c
[1]);
118 i
[6]->setPredicate(CC_C
, c
[0]);
119 i
[5]->setFlagsSrc(3, c
[1]);
121 if (isSignedType(mul
->sType
)) {
124 Value
*one
= bld
->getSSA(fullSize
);
125 bld
->loadImm(one
, 1);
126 for (int j
= 0; j
< 7; j
++)
127 rr
[j
] = bld
->getSSA(fullSize
);
129 // NOTE: this logic uses predicates because splitting basic blocks is
130 // ~impossible during the SSA phase. The RA relies on a correlation
131 // between edge order and phi node sources.
133 // Set the sign of the result based on the inputs
134 bld
->mkOp2(OP_XOR
, fTy
, NULL
, mul
->getSrc(0), mul
->getSrc(1))
135 ->setFlagsDef(0, (cc
[0] = bld
->getSSA(1, FILE_FLAGS
)));
137 // 1s complement of 64-bit value
138 bld
->mkOp1(OP_NOT
, fTy
, rr
[0], r
[4])
139 ->setPredicate(CC_S
, cc
[0]);
140 bld
->mkOp1(OP_NOT
, fTy
, rr
[1], t
[3])
141 ->setPredicate(CC_S
, cc
[0]);
143 // add to low 32-bits, keep track of the carry
144 Instruction
*n
= bld
->mkOp2(OP_ADD
, fTy
, NULL
, rr
[1], one
);
145 n
->setPredicate(CC_S
, cc
[0]);
146 n
->setFlagsDef(0, (cc
[1] = bld
->getSSA(1, FILE_FLAGS
)));
148 // If there was a carry, add 1 to the upper 32 bits
149 // XXX: These get executed even if they shouldn't be
150 bld
->mkOp2(OP_ADD
, fTy
, rr
[2], rr
[0], one
)
151 ->setPredicate(CC_C
, cc
[1]);
152 bld
->mkMov(rr
[3], rr
[0])
153 ->setPredicate(CC_NC
, cc
[1]);
154 bld
->mkOp2(OP_UNION
, fTy
, rr
[4], rr
[2], rr
[3]);
156 // Merge the results from the negative and non-negative paths
157 bld
->mkMov(rr
[5], rr
[4])
158 ->setPredicate(CC_S
, cc
[0]);
159 bld
->mkMov(rr
[6], r
[4])
160 ->setPredicate(CC_NS
, cc
[0]);
161 bld
->mkOp2(OP_UNION
, mul
->sType
, mul
->getDef(0), rr
[5], rr
[6]);
163 bld
->mkMov(mul
->getDef(0), r
[4]);
166 bld
->mkMov(mul
->getDef(0), t
[3]);
168 delete_Instruction(bld
->getProgram(), mul
);
170 for (int j
= 2; j
<= (highResult
? 5 : 4); ++j
)
// Build a quad-op selector byte from four QOP_* names, placed at 2-bit
// spacing: q is shifted to bit 6, r to bit 4, s to bit 2 and t to bit 0.
// Arguments are the suffixes of QOP_* constants, e.g.
// QUADOP(SUBR, SUBR, SUBR, SUBR).
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))
187 class NV50LegalizePostRA
: public Pass
190 virtual bool visit(Function
*);
191 virtual bool visit(BasicBlock
*);
193 void handlePRERET(FlowInstruction
*);
194 void replaceZero(Instruction
*);
200 NV50LegalizePostRA::visit(Function
*fn
)
202 Program
*prog
= fn
->getProgram();
204 r63
= new_LValue(fn
, FILE_GPR
);
205 // GPR units on nv50 are in half-regs
206 if (prog
->maxGPR
< 126)
207 r63
->reg
.data
.id
= 63;
209 r63
->reg
.data
.id
= 127;
211 // this is actually per-program, but we can do it all on visiting main()
212 std::list
<Instruction
*> *outWrites
=
213 reinterpret_cast<std::list
<Instruction
*> *>(prog
->targetPriv
);
216 for (std::list
<Instruction
*>::iterator it
= outWrites
->begin();
217 it
!= outWrites
->end(); ++it
)
218 (*it
)->getSrc(1)->defs
.front()->getInsn()->setDef(0, (*it
)->getSrc(0));
219 // instructions will be deleted on exit
227 NV50LegalizePostRA::replaceZero(Instruction
*i
)
229 for (int s
= 0; i
->srcExists(s
); ++s
) {
230 ImmediateValue
*imm
= i
->getSrc(s
)->asImm();
231 if (imm
&& imm
->reg
.data
.u64
== 0)
236 // Emulate PRERET: jump to the target and call to the origin from there
238 // WARNING: atm only works if BBs are affected by at most a single PRERET
247 // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
250 // bra BB:3 + n1 (skip the call)
251 // call BB:0 + n2 (skip bra at beginning of BB:0)
254 NV50LegalizePostRA::handlePRERET(FlowInstruction
*pre
)
256 BasicBlock
*bbE
= pre
->bb
;
257 BasicBlock
*bbT
= pre
->target
.bb
;
259 pre
->subOp
= NV50_IR_SUBOP_EMU_PRERET
+ 0;
261 bbE
->insertHead(pre
);
263 Instruction
*skip
= new_FlowInstruction(func
, OP_PRERET
, bbT
);
264 Instruction
*call
= new_FlowInstruction(func
, OP_PRERET
, bbE
);
266 bbT
->insertHead(call
);
267 bbT
->insertHead(skip
);
269 // NOTE: maybe split blocks to prevent the instructions from moving ?
271 skip
->subOp
= NV50_IR_SUBOP_EMU_PRERET
+ 1;
272 call
->subOp
= NV50_IR_SUBOP_EMU_PRERET
+ 2;
276 NV50LegalizePostRA::visit(BasicBlock
*bb
)
278 Instruction
*i
, *next
;
280 // remove pseudo operations and non-fixed no-ops, split 64 bit operations
281 for (i
= bb
->getFirst(); i
; i
= next
) {
286 if (i
->op
== OP_PRERET
&& prog
->getTarget()->getChipset() < 0xa0) {
287 handlePRERET(i
->asFlow());
289 // TODO: We will want to do this before register allocation,
290 // since have to use a $c register for the carry flag.
291 if (typeSizeof(i
->dType
) == 8) {
292 Instruction
*hi
= BuildUtil::split64BitOpPostRA(func
, i
, r63
, NULL
);
297 if (i
->op
!= OP_PFETCH
&& i
->op
!= OP_BAR
&&
298 (!i
->defExists(0) || i
->def(0).getFile() != FILE_ADDRESS
))
308 class NV50LegalizeSSA
: public Pass
311 NV50LegalizeSSA(Program
*);
313 virtual bool visit(BasicBlock
*bb
);
316 void propagateWriteToOutput(Instruction
*);
317 void handleDIV(Instruction
*);
318 void handleMOD(Instruction
*);
319 void handleMUL(Instruction
*);
320 void handleAddrDef(Instruction
*);
322 inline bool isARL(const Instruction
*) const;
326 std::list
<Instruction
*> *outWrites
;
329 NV50LegalizeSSA::NV50LegalizeSSA(Program
*prog
)
331 bld
.setProgram(prog
);
333 if (prog
->optLevel
>= 2 &&
334 (prog
->getType() == Program::TYPE_GEOMETRY
||
335 prog
->getType() == Program::TYPE_VERTEX
))
337 reinterpret_cast<std::list
<Instruction
*> *>(prog
->targetPriv
);
343 NV50LegalizeSSA::propagateWriteToOutput(Instruction
*st
)
345 if (st
->src(0).isIndirect(0) || st
->getSrc(1)->refCount() != 1)
348 // check def instruction can store
349 Instruction
*di
= st
->getSrc(1)->defs
.front()->getInsn();
351 // TODO: move exports (if beneficial) in common opt pass
352 if (di
->isPseudo() || isTextureOp(di
->op
) || di
->defCount(0xff, true) > 1)
355 for (int s
= 0; di
->srcExists(s
); ++s
)
356 if (di
->src(s
).getFile() == FILE_IMMEDIATE
)
359 if (prog
->getType() == Program::TYPE_GEOMETRY
) {
360 // Only propagate output writes in geometry shaders when we can be sure
361 // that we are propagating to the same output vertex.
362 if (di
->bb
!= st
->bb
)
365 for (i
= di
; i
!= st
; i
= i
->next
) {
366 if (i
->op
== OP_EMIT
|| i
->op
== OP_RESTART
)
369 assert(i
); // st after di
372 // We cannot set defs to non-lvalues before register allocation, so
373 // save & remove (to save registers) the exports and replace later.
374 outWrites
->push_back(st
);
379 NV50LegalizeSSA::isARL(const Instruction
*i
) const
383 if (i
->op
!= OP_SHL
|| i
->src(0).getFile() != FILE_GPR
)
385 if (!i
->src(1).getImmediate(imm
))
387 return imm
.isInteger(0);
391 NV50LegalizeSSA::handleAddrDef(Instruction
*i
)
395 i
->getDef(0)->reg
.size
= 2; // $aX are only 16 bit
397 // PFETCH can always write to $a
398 if (i
->op
== OP_PFETCH
)
400 // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
401 if (i
->srcExists(1) && i
->src(1).getFile() == FILE_IMMEDIATE
) {
402 if (i
->op
== OP_SHL
&& i
->src(0).getFile() == FILE_GPR
)
404 if (i
->op
== OP_ADD
&& i
->src(0).getFile() == FILE_ADDRESS
)
408 // turn $a sources into $r sources (can't operate on $a)
409 for (int s
= 0; i
->srcExists(s
); ++s
) {
410 Value
*a
= i
->getSrc(s
);
412 if (a
->reg
.file
== FILE_ADDRESS
) {
413 if (a
->getInsn() && isARL(a
->getInsn())) {
414 i
->setSrc(s
, a
->getInsn()->getSrc(0));
416 bld
.setPosition(i
, false);
423 if (i
->op
== OP_SHL
&& i
->src(1).getFile() == FILE_IMMEDIATE
)
426 // turn result back into $a
427 bld
.setPosition(i
, true);
428 arl
= bld
.mkOp2(OP_SHL
, TYPE_U32
, i
->getDef(0), bld
.getSSA(), bld
.mkImm(0));
429 i
->setDef(0, arl
->getSrc(0));
433 NV50LegalizeSSA::handleMUL(Instruction
*mul
)
435 if (isFloatType(mul
->sType
) || typeSizeof(mul
->sType
) <= 2)
437 Value
*def
= mul
->getDef(0);
438 Value
*pred
= mul
->getPredicate();
439 CondCode cc
= mul
->cc
;
441 mul
->setPredicate(CC_ALWAYS
, NULL
);
443 if (mul
->op
== OP_MAD
) {
444 Instruction
*add
= mul
;
445 bld
.setPosition(add
, false);
446 Value
*res
= cloneShallow(func
, mul
->getDef(0));
447 mul
= bld
.mkOp2(OP_MUL
, add
->sType
, res
, add
->getSrc(0), add
->getSrc(1));
449 add
->setSrc(0, mul
->getDef(0));
450 add
->setSrc(1, add
->getSrc(2));
451 for (int s
= 2; add
->srcExists(s
); ++s
)
452 add
->setSrc(s
, NULL
);
453 mul
->subOp
= add
->subOp
;
456 expandIntegerMUL(&bld
, mul
);
458 def
->getInsn()->setPredicate(cc
, pred
);
461 // Use f32 division: first compute an approximate result, use it to reduce
462 // the dividend, which should then be representable as f32, divide the reduced
463 // dividend, and add the quotients.
465 NV50LegalizeSSA::handleDIV(Instruction
*div
)
467 const DataType ty
= div
->sType
;
469 if (ty
!= TYPE_U32
&& ty
!= TYPE_S32
)
472 Value
*q
, *q0
, *qf
, *aR
, *aRf
, *qRf
, *qR
, *t
, *s
, *m
, *cond
;
474 bld
.setPosition(div
, false);
476 Value
*a
, *af
= bld
.getSSA();
477 Value
*b
, *bf
= bld
.getSSA();
479 bld
.mkCvt(OP_CVT
, TYPE_F32
, af
, ty
, div
->getSrc(0));
480 bld
.mkCvt(OP_CVT
, TYPE_F32
, bf
, ty
, div
->getSrc(1));
482 if (isSignedType(ty
)) {
483 af
->getInsn()->src(0).mod
= Modifier(NV50_IR_MOD_ABS
);
484 bf
->getInsn()->src(0).mod
= Modifier(NV50_IR_MOD_ABS
);
487 bld
.mkOp1(OP_ABS
, ty
, a
, div
->getSrc(0));
488 bld
.mkOp1(OP_ABS
, ty
, b
, div
->getSrc(1));
494 bf
= bld
.mkOp1v(OP_RCP
, TYPE_F32
, bld
.getSSA(), bf
);
495 bf
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getSSA(), bf
, bld
.mkImm(-2));
497 bld
.mkOp2(OP_MUL
, TYPE_F32
, (qf
= bld
.getSSA()), af
, bf
)->rnd
= ROUND_Z
;
498 bld
.mkCvt(OP_CVT
, ty
, (q0
= bld
.getSSA()), TYPE_F32
, qf
)->rnd
= ROUND_Z
;
500 // get error of 1st result
501 expandIntegerMUL(&bld
,
502 bld
.mkOp2(OP_MUL
, TYPE_U32
, (t
= bld
.getSSA()), q0
, b
));
503 bld
.mkOp2(OP_SUB
, TYPE_U32
, (aRf
= bld
.getSSA()), a
, t
);
505 bld
.mkCvt(OP_CVT
, TYPE_F32
, (aR
= bld
.getSSA()), TYPE_U32
, aRf
);
507 bld
.mkOp2(OP_MUL
, TYPE_F32
, (qRf
= bld
.getSSA()), aR
, bf
)->rnd
= ROUND_Z
;
508 bld
.mkCvt(OP_CVT
, TYPE_U32
, (qR
= bld
.getSSA()), TYPE_F32
, qRf
)
510 bld
.mkOp2(OP_ADD
, ty
, (q
= bld
.getSSA()), q0
, qR
); // add quotients
512 // correction: if modulus >= divisor, add 1
513 expandIntegerMUL(&bld
,
514 bld
.mkOp2(OP_MUL
, TYPE_U32
, (t
= bld
.getSSA()), q
, b
));
515 bld
.mkOp2(OP_SUB
, TYPE_U32
, (m
= bld
.getSSA()), a
, t
);
516 bld
.mkCmp(OP_SET
, CC_GE
, TYPE_U32
, (s
= bld
.getSSA()), TYPE_U32
, m
, b
);
517 if (!isSignedType(ty
)) {
523 bld
.mkOp2(OP_SUB
, TYPE_U32
, (q
= bld
.getSSA()), t
, s
);
527 bld
.mkOp2(OP_XOR
, TYPE_U32
, NULL
, div
->getSrc(0), div
->getSrc(1))
528 ->setFlagsDef(0, (cond
= bld
.getSSA(1, FILE_FLAGS
)));
529 bld
.mkOp1(OP_NEG
, ty
, s
, q
)->setPredicate(CC_S
, cond
);
530 bld
.mkOp1(OP_MOV
, ty
, t
, q
)->setPredicate(CC_NS
, cond
);
539 NV50LegalizeSSA::handleMOD(Instruction
*mod
)
541 if (mod
->dType
!= TYPE_U32
&& mod
->dType
!= TYPE_S32
)
543 bld
.setPosition(mod
, false);
545 Value
*q
= bld
.getSSA();
546 Value
*m
= bld
.getSSA();
548 bld
.mkOp2(OP_DIV
, mod
->dType
, q
, mod
->getSrc(0), mod
->getSrc(1));
549 handleDIV(q
->getInsn());
551 bld
.setPosition(mod
, false);
552 expandIntegerMUL(&bld
, bld
.mkOp2(OP_MUL
, TYPE_U32
, m
, q
, mod
->getSrc(1)));
559 NV50LegalizeSSA::visit(BasicBlock
*bb
)
561 Instruction
*insn
, *next
;
562 // skipping PHIs (don't pass them to handleAddrDef) !
563 for (insn
= bb
->getEntry(); insn
; insn
= next
) {
566 if (insn
->defExists(0) && insn
->getDef(0)->reg
.file
== FILE_ADDRESS
)
572 propagateWriteToOutput(insn
);
591 class NV50LoweringPreSSA
: public Pass
594 NV50LoweringPreSSA(Program
*);
597 virtual bool visit(Instruction
*);
598 virtual bool visit(Function
*);
600 bool handleRDSV(Instruction
*);
601 bool handleWRSV(Instruction
*);
603 bool handlePFETCH(Instruction
*);
604 bool handleEXPORT(Instruction
*);
605 bool handleLOAD(Instruction
*);
607 bool handleDIV(Instruction
*);
608 bool handleSQRT(Instruction
*);
609 bool handlePOW(Instruction
*);
611 bool handleSET(Instruction
*);
612 bool handleSLCT(CmpInstruction
*);
613 bool handleSELP(Instruction
*);
615 bool handleTEX(TexInstruction
*);
616 bool handleTXB(TexInstruction
*); // I really
617 bool handleTXL(TexInstruction
*); // hate
618 bool handleTXD(TexInstruction
*); // these 3
619 bool handleTXLQ(TexInstruction
*);
620 bool handleTXQ(TexInstruction
*);
622 bool handleCALL(Instruction
*);
623 bool handlePRECONT(Instruction
*);
624 bool handleCONT(Instruction
*);
626 void checkPredicate(Instruction
*);
627 void loadTexMsInfo(uint32_t off
, Value
**ms
, Value
**ms_x
, Value
**ms_y
);
628 void loadMsInfo(Value
*ms
, Value
*s
, Value
**dx
, Value
**dy
);
631 const Target
*const targ
;
638 NV50LoweringPreSSA::NV50LoweringPreSSA(Program
*prog
) :
639 targ(prog
->getTarget()), tid(NULL
)
641 bld
.setProgram(prog
);
645 NV50LoweringPreSSA::visit(Function
*f
)
647 BasicBlock
*root
= BasicBlock::get(func
->cfg
.getRoot());
649 if (prog
->getType() == Program::TYPE_COMPUTE
) {
650 // Add implicit "thread id" argument in $r0 to the function
651 Value
*arg
= new_LValue(func
, FILE_GPR
);
652 arg
->reg
.data
.id
= 0;
653 f
->ins
.push_back(arg
);
655 bld
.setPosition(root
, false);
656 tid
= bld
.mkMov(bld
.getScratch(), arg
, TYPE_U32
)->getDef(0);
662 void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off
, Value
**ms
,
663 Value
**ms_x
, Value
**ms_y
) {
664 // This loads the texture-indexed ms setting from the constant buffer
665 Value
*tmp
= new_LValue(func
, FILE_GPR
);
666 uint8_t b
= prog
->driver
->io
.resInfoCBSlot
;
667 off
+= prog
->driver
->io
.suInfoBase
;
668 if (prog
->getType() > Program::TYPE_VERTEX
)
670 if (prog
->getType() > Program::TYPE_GEOMETRY
)
672 *ms_x
= bld
.mkLoadv(TYPE_U32
, bld
.mkSymbol(
673 FILE_MEMORY_CONST
, b
, TYPE_U32
, off
+ 0), NULL
);
674 *ms_y
= bld
.mkLoadv(TYPE_U32
, bld
.mkSymbol(
675 FILE_MEMORY_CONST
, b
, TYPE_U32
, off
+ 4), NULL
);
676 *ms
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, tmp
, *ms_x
, *ms_y
);
679 void NV50LoweringPreSSA::loadMsInfo(Value
*ms
, Value
*s
, Value
**dx
, Value
**dy
) {
680 // Given a MS level, and a sample id, compute the delta x/y
681 uint8_t b
= prog
->driver
->io
.msInfoCBSlot
;
682 Value
*off
= new_LValue(func
, FILE_ADDRESS
), *t
= new_LValue(func
, FILE_GPR
);
684 // The required information is at mslevel * 16 * 4 + sample * 8
685 // = (mslevel * 8 + sample) * 8
689 bld
.mkOp2v(OP_ADD
, TYPE_U32
, t
,
690 bld
.mkOp2v(OP_SHL
, TYPE_U32
, t
, ms
, bld
.mkImm(3)),
693 *dx
= bld
.mkLoadv(TYPE_U32
, bld
.mkSymbol(
694 FILE_MEMORY_CONST
, b
, TYPE_U32
,
695 prog
->driver
->io
.msInfoBase
), off
);
696 *dy
= bld
.mkLoadv(TYPE_U32
, bld
.mkSymbol(
697 FILE_MEMORY_CONST
, b
, TYPE_U32
,
698 prog
->driver
->io
.msInfoBase
+ 4), off
);
702 NV50LoweringPreSSA::handleTEX(TexInstruction
*i
)
704 const int arg
= i
->tex
.target
.getArgCount();
705 const int dref
= arg
;
706 const int lod
= i
->tex
.target
.isShadow() ? (arg
+ 1) : arg
;
708 // handle MS, which means looking up the MS params for this texture, and
709 // adjusting the input coordinates to point at the right sample.
710 if (i
->tex
.target
.isMS()) {
711 Value
*x
= i
->getSrc(0);
712 Value
*y
= i
->getSrc(1);
713 Value
*s
= i
->getSrc(arg
- 1);
714 Value
*tx
= new_LValue(func
, FILE_GPR
), *ty
= new_LValue(func
, FILE_GPR
),
715 *ms
, *ms_x
, *ms_y
, *dx
, *dy
;
717 i
->tex
.target
.clearMS();
719 loadTexMsInfo(i
->tex
.r
* 4 * 2, &ms
, &ms_x
, &ms_y
);
720 loadMsInfo(ms
, s
, &dx
, &dy
);
722 bld
.mkOp2(OP_SHL
, TYPE_U32
, tx
, x
, ms_x
);
723 bld
.mkOp2(OP_SHL
, TYPE_U32
, ty
, y
, ms_y
);
724 bld
.mkOp2(OP_ADD
, TYPE_U32
, tx
, tx
, dx
);
725 bld
.mkOp2(OP_ADD
, TYPE_U32
, ty
, ty
, dy
);
728 i
->setSrc(arg
- 1, bld
.loadImm(NULL
, 0));
731 // dref comes before bias/lod
732 if (i
->tex
.target
.isShadow())
733 if (i
->op
== OP_TXB
|| i
->op
== OP_TXL
)
734 i
->swapSources(dref
, lod
);
736 if (i
->tex
.target
.isArray()) {
737 if (i
->op
!= OP_TXF
) {
738 // array index must be converted to u32, but it's already an integer
740 Value
*layer
= i
->getSrc(arg
- 1);
741 LValue
*src
= new_LValue(func
, FILE_GPR
);
742 bld
.mkCvt(OP_CVT
, TYPE_U32
, src
, TYPE_F32
, layer
);
743 bld
.mkOp2(OP_MIN
, TYPE_U32
, src
, src
, bld
.loadImm(NULL
, 511));
744 i
->setSrc(arg
- 1, src
);
746 if (i
->tex
.target
.isCube() && i
->srcCount() > 4) {
747 std::vector
<Value
*> acube
, a2d
;
751 for (c
= 0; c
< 4; ++c
)
752 acube
[c
] = i
->getSrc(c
);
754 for (c
= 0; c
< 3; ++c
)
755 a2d
[c
] = new_LValue(func
, FILE_GPR
);
758 bld
.mkTex(OP_TEXPREP
, TEX_TARGET_CUBE_ARRAY
, i
->tex
.r
, i
->tex
.s
,
759 a2d
, acube
)->asTex()->tex
.mask
= 0x7;
761 for (c
= 0; c
< 3; ++c
)
762 i
->setSrc(c
, a2d
[c
]);
763 for (; i
->srcExists(c
+ 1); ++c
)
764 i
->setSrc(c
, i
->getSrc(c
+ 1));
768 i
->tex
.target
= i
->tex
.target
.isShadow() ?
769 TEX_TARGET_2D_ARRAY_SHADOW
: TEX_TARGET_2D_ARRAY
;
773 // texel offsets are 3 immediate fields in the instruction,
774 // nv50 cannot do textureGatherOffsets
775 assert(i
->tex
.useOffsets
<= 1);
776 if (i
->tex
.useOffsets
) {
777 for (int c
= 0; c
< 3; ++c
) {
779 if (!i
->offset
[0][c
].getImmediate(val
))
780 assert(!"non-immediate offset");
781 i
->tex
.offset
[c
] = val
.reg
.data
.u32
;
782 i
->offset
[0][c
].set(NULL
);
789 // Bias must be equal for all threads of a quad or lod calculation will fail.
791 // The lanes of a quad are grouped by the bit in the condition register they
792 // have set, which is selected by differing bias values.
793 // Move the input values for TEX into a new register set for each group and
794 // execute TEX only for a specific group.
795 // We always need to use 4 new registers for the inputs/outputs because the
796 // implicitly calculated derivatives must be correct.
798 // TODO: move to SSA phase so we can easily determine whether bias is constant
800 NV50LoweringPreSSA::handleTXB(TexInstruction
*i
)
802 const CondCode cc
[4] = { CC_EQU
, CC_S
, CC_C
, CC_O
};
805 // We can't actually apply bias *and* do a compare for a cube
806 // texture. Since the compare has to be done before the filtering, just
807 // drop the bias on the floor.
808 if (i
->tex
.target
== TEX_TARGET_CUBE_SHADOW
) {
810 i
->setSrc(3, i
->getSrc(4));
816 Value
*bias
= i
->getSrc(i
->tex
.target
.getArgCount());
817 if (bias
->isUniform())
820 Instruction
*cond
= bld
.mkOp1(OP_UNION
, TYPE_U32
, bld
.getScratch(),
821 bld
.loadImm(NULL
, 1));
822 bld
.setPosition(cond
, false);
824 for (l
= 1; l
< 4; ++l
) {
825 const uint8_t qop
= QUADOP(SUBR
, SUBR
, SUBR
, SUBR
);
826 Value
*bit
= bld
.getSSA();
827 Value
*pred
= bld
.getScratch(1, FILE_FLAGS
);
828 Value
*imm
= bld
.loadImm(NULL
, (1 << l
));
829 bld
.mkQuadop(qop
, pred
, l
, bias
, bias
)->flagsDef
= 0;
830 bld
.mkMov(bit
, imm
)->setPredicate(CC_EQ
, pred
);
831 cond
->setSrc(l
, bit
);
833 Value
*flags
= bld
.getScratch(1, FILE_FLAGS
);
834 bld
.setPosition(cond
, true);
835 bld
.mkCvt(OP_CVT
, TYPE_U8
, flags
, TYPE_U32
, cond
->getDef(0))->flagsDef
= 0;
838 for (l
= 0; l
< 4; ++l
) {
839 (tex
[l
] = cloneForward(func
, i
))->setPredicate(cc
[l
], flags
);
844 for (d
= 0; i
->defExists(d
); ++d
)
845 res
[0][d
] = tex
[0]->getDef(d
);
846 for (l
= 1; l
< 4; ++l
) {
847 for (d
= 0; tex
[l
]->defExists(d
); ++d
) {
848 res
[l
][d
] = cloneShallow(func
, res
[0][d
]);
849 bld
.mkMov(res
[l
][d
], tex
[l
]->getDef(d
))->setPredicate(cc
[l
], flags
);
853 for (d
= 0; i
->defExists(d
); ++d
) {
854 Instruction
*dst
= bld
.mkOp(OP_UNION
, TYPE_U32
, i
->getDef(d
));
855 for (l
= 0; l
< 4; ++l
)
856 dst
->setSrc(l
, res
[l
][d
]);
858 delete_Instruction(prog
, i
);
862 // LOD must be equal for all threads of a quad.
863 // Unlike with TXB, here we can just diverge since there's no LOD calculation
864 // that would require all 4 threads' sources to be set up properly.
866 NV50LoweringPreSSA::handleTXL(TexInstruction
*i
)
869 Value
*lod
= i
->getSrc(i
->tex
.target
.getArgCount());
870 if (lod
->isUniform())
873 BasicBlock
*currBB
= i
->bb
;
874 BasicBlock
*texiBB
= i
->bb
->splitBefore(i
, false);
875 BasicBlock
*joinBB
= i
->bb
->splitAfter(i
);
877 bld
.setPosition(currBB
, true);
878 assert(!currBB
->joinAt
);
879 currBB
->joinAt
= bld
.mkFlow(OP_JOINAT
, joinBB
, CC_ALWAYS
, NULL
);
881 for (int l
= 0; l
<= 3; ++l
) {
882 const uint8_t qop
= QUADOP(SUBR
, SUBR
, SUBR
, SUBR
);
883 Value
*pred
= bld
.getScratch(1, FILE_FLAGS
);
884 bld
.setPosition(currBB
, true);
885 bld
.mkQuadop(qop
, pred
, l
, lod
, lod
)->flagsDef
= 0;
886 bld
.mkFlow(OP_BRA
, texiBB
, CC_EQ
, pred
)->fixed
= 1;
887 currBB
->cfg
.attach(&texiBB
->cfg
, Graph::Edge::FORWARD
);
889 BasicBlock
*laneBB
= new BasicBlock(func
);
890 currBB
->cfg
.attach(&laneBB
->cfg
, Graph::Edge::TREE
);
894 bld
.setPosition(joinBB
, false);
895 bld
.mkFlow(OP_JOIN
, NULL
, CC_ALWAYS
, NULL
)->fixed
= 1;
900 NV50LoweringPreSSA::handleTXD(TexInstruction
*i
)
902 static const uint8_t qOps
[4][2] =
904 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l0
905 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l1
906 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l2
907 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l3
912 Value
*zero
= bld
.loadImm(bld
.getSSA(), 0);
914 const int dim
= i
->tex
.target
.getDim();
917 i
->op
= OP_TEX
; // no need to clone dPdx/dPdy later
919 for (c
= 0; c
< dim
; ++c
)
920 crd
[c
] = bld
.getScratch();
922 bld
.mkOp(OP_QUADON
, TYPE_NONE
, NULL
);
923 for (l
= 0; l
< 4; ++l
) {
924 // mov coordinates from lane l to all lanes
925 for (c
= 0; c
< dim
; ++c
)
926 bld
.mkQuadop(0x00, crd
[c
], l
, i
->getSrc(c
), zero
);
927 // add dPdx from lane l to lanes dx
928 for (c
= 0; c
< dim
; ++c
)
929 bld
.mkQuadop(qOps
[l
][0], crd
[c
], l
, i
->dPdx
[c
].get(), crd
[c
]);
930 // add dPdy from lane l to lanes dy
931 for (c
= 0; c
< dim
; ++c
)
932 bld
.mkQuadop(qOps
[l
][1], crd
[c
], l
, i
->dPdy
[c
].get(), crd
[c
]);
934 bld
.insert(tex
= cloneForward(func
, i
));
935 for (c
= 0; c
< dim
; ++c
)
936 tex
->setSrc(c
, crd
[c
]);
938 for (c
= 0; i
->defExists(c
); ++c
) {
940 def
[c
][l
] = bld
.getSSA();
941 mov
= bld
.mkMov(def
[c
][l
], tex
->getDef(c
));
946 bld
.mkOp(OP_QUADPOP
, TYPE_NONE
, NULL
);
948 for (c
= 0; i
->defExists(c
); ++c
) {
949 Instruction
*u
= bld
.mkOp(OP_UNION
, TYPE_U32
, i
->getDef(c
));
950 for (l
= 0; l
< 4; ++l
)
951 u
->setSrc(l
, def
[c
][l
]);
959 NV50LoweringPreSSA::handleTXLQ(TexInstruction
*i
)
962 bld
.setPosition(i
, true);
964 /* The returned values are not quite what we want:
965 * (a) convert from s32 to f32
966 * (b) multiply by 1/256
968 for (int def
= 0; def
< 2; ++def
) {
969 if (!i
->defExists(def
))
971 bld
.mkCvt(OP_CVT
, TYPE_F32
, i
->getDef(def
), TYPE_S32
, i
->getDef(def
));
972 bld
.mkOp2(OP_MUL
, TYPE_F32
, i
->getDef(def
),
973 i
->getDef(def
), bld
.loadImm(NULL
, 1.0f
/ 256));
979 NV50LoweringPreSSA::handleTXQ(TexInstruction
*i
)
981 Value
*ms
, *ms_x
, *ms_y
;
982 if (i
->tex
.query
== TXQ_DIMS
)
984 assert(i
->tex
.query
== TXQ_TYPE
);
985 assert(i
->tex
.mask
== 4);
987 loadTexMsInfo(i
->tex
.r
* 4 * 2, &ms
, &ms_x
, &ms_y
);
988 bld
.mkOp2(OP_SHL
, TYPE_U32
, i
->getDef(0), bld
.loadImm(NULL
, 1), ms
);
996 NV50LoweringPreSSA::handleSET(Instruction
*i
)
998 if (i
->dType
== TYPE_F32
) {
999 bld
.setPosition(i
, true);
1000 i
->dType
= TYPE_U32
;
1001 bld
.mkOp1(OP_ABS
, TYPE_S32
, i
->getDef(0), i
->getDef(0));
1002 bld
.mkCvt(OP_CVT
, TYPE_F32
, i
->getDef(0), TYPE_S32
, i
->getDef(0));
1008 NV50LoweringPreSSA::handleSLCT(CmpInstruction
*i
)
1010 Value
*src0
= bld
.getSSA();
1011 Value
*src1
= bld
.getSSA();
1012 Value
*pred
= bld
.getScratch(1, FILE_FLAGS
);
1014 Value
*v0
= i
->getSrc(0);
1015 Value
*v1
= i
->getSrc(1);
1016 // XXX: these probably shouldn't be immediates in the first place ...
1018 v0
= bld
.mkMov(bld
.getSSA(), v0
)->getDef(0);
1020 v1
= bld
.mkMov(bld
.getSSA(), v1
)->getDef(0);
1022 bld
.setPosition(i
, true);
1023 bld
.mkMov(src0
, v0
)->setPredicate(CC_NE
, pred
);
1024 bld
.mkMov(src1
, v1
)->setPredicate(CC_EQ
, pred
);
1025 bld
.mkOp2(OP_UNION
, i
->dType
, i
->getDef(0), src0
, src1
);
1027 bld
.setPosition(i
, false);
1029 i
->setFlagsDef(0, pred
);
1031 i
->setSrc(0, i
->getSrc(2));
1033 i
->setSrc(1, bld
.loadImm(NULL
, 0));
1039 NV50LoweringPreSSA::handleSELP(Instruction
*i
)
1041 Value
*src0
= bld
.getSSA();
1042 Value
*src1
= bld
.getSSA();
1044 Value
*v0
= i
->getSrc(0);
1045 Value
*v1
= i
->getSrc(1);
1047 v0
= bld
.mkMov(bld
.getSSA(), v0
)->getDef(0);
1049 v1
= bld
.mkMov(bld
.getSSA(), v1
)->getDef(0);
1051 bld
.mkMov(src0
, v0
)->setPredicate(CC_NE
, i
->getSrc(2));
1052 bld
.mkMov(src1
, v1
)->setPredicate(CC_EQ
, i
->getSrc(2));
1053 bld
.mkOp2(OP_UNION
, i
->dType
, i
->getDef(0), src0
, src1
);
1054 delete_Instruction(prog
, i
);
1059 NV50LoweringPreSSA::handleWRSV(Instruction
*i
)
1061 Symbol
*sym
= i
->getSrc(0)->asSym();
1063 // these are all shader outputs, $sreg are not writeable
1064 uint32_t addr
= targ
->getSVAddress(FILE_SHADER_OUTPUT
, sym
);
1067 sym
= bld
.mkSymbol(FILE_SHADER_OUTPUT
, 0, i
->sType
, addr
);
1069 bld
.mkStore(OP_EXPORT
, i
->dType
, sym
, i
->getIndirect(0, 0), i
->getSrc(1));
1071 bld
.getBB()->remove(i
);
1076 NV50LoweringPreSSA::handleCALL(Instruction
*i
)
1078 if (prog
->getType() == Program::TYPE_COMPUTE
) {
1079 // Add implicit "thread id" argument in $r0 to the function
1080 i
->setSrc(i
->srcCount(), tid
);
1086 NV50LoweringPreSSA::handlePRECONT(Instruction
*i
)
1088 delete_Instruction(prog
, i
);
1093 NV50LoweringPreSSA::handleCONT(Instruction
*i
)
1100 NV50LoweringPreSSA::handleRDSV(Instruction
*i
)
1102 Symbol
*sym
= i
->getSrc(0)->asSym();
1103 uint32_t addr
= targ
->getSVAddress(FILE_SHADER_INPUT
, sym
);
1104 Value
*def
= i
->getDef(0);
1105 SVSemantic sv
= sym
->reg
.data
.sv
.sv
;
1106 int idx
= sym
->reg
.data
.sv
.index
;
1108 if (addr
>= 0x400) // mov $sreg
1113 assert(prog
->getType() == Program::TYPE_FRAGMENT
);
1114 bld
.mkInterp(NV50_IR_INTERP_LINEAR
, i
->getDef(0), addr
, NULL
);
1117 bld
.mkInterp(NV50_IR_INTERP_FLAT
, def
, addr
, NULL
);
1118 if (i
->dType
== TYPE_F32
) {
1119 bld
.mkOp2(OP_OR
, TYPE_U32
, def
, def
, bld
.mkImm(0x00000001));
1120 bld
.mkOp1(OP_NEG
, TYPE_S32
, def
, def
);
1121 bld
.mkCvt(OP_CVT
, TYPE_F32
, def
, TYPE_S32
, def
);
1127 if ((sv
== SV_NCTAID
&& idx
>= 2) ||
1128 (sv
== SV_NTID
&& idx
>= 3)) {
1129 bld
.mkMov(def
, bld
.mkImm(1));
1130 } else if (sv
== SV_CTAID
&& idx
>= 2) {
1131 bld
.mkMov(def
, bld
.mkImm(0));
1133 Value
*x
= bld
.getSSA(2);
1134 bld
.mkOp1(OP_LOAD
, TYPE_U16
, x
,
1135 bld
.mkSymbol(FILE_MEMORY_SHARED
, 0, TYPE_U16
, addr
));
1136 bld
.mkCvt(OP_CVT
, TYPE_U32
, def
, TYPE_U16
, x
);
1141 bld
.mkOp2(OP_AND
, TYPE_U32
, def
, tid
, bld
.mkImm(0x0000ffff));
1142 } else if (idx
== 1) {
1143 bld
.mkOp2(OP_AND
, TYPE_U32
, def
, tid
, bld
.mkImm(0x03ff0000));
1144 bld
.mkOp2(OP_SHR
, TYPE_U32
, def
, def
, bld
.mkImm(16));
1145 } else if (idx
== 2) {
1146 bld
.mkOp2(OP_SHR
, TYPE_U32
, def
, tid
, bld
.mkImm(26));
1148 bld
.mkMov(def
, bld
.mkImm(0));
1151 case SV_SAMPLE_POS
: {
1152 Value
*off
= new_LValue(func
, FILE_ADDRESS
);
1153 bld
.mkOp1(OP_RDSV
, TYPE_U32
, def
, bld
.mkSysVal(SV_SAMPLE_INDEX
, 0));
1154 bld
.mkOp2(OP_SHL
, TYPE_U32
, off
, def
, bld
.mkImm(3));
1155 bld
.mkLoad(TYPE_F32
,
1158 FILE_MEMORY_CONST
, prog
->driver
->io
.resInfoCBSlot
,
1159 TYPE_U32
, prog
->driver
->io
.sampleInfoBase
+ 4 * idx
),
1164 bld
.mkFetch(i
->getDef(0), i
->dType
,
1165 FILE_SHADER_INPUT
, addr
, i
->getIndirect(0, 0), NULL
);
1168 bld
.getBB()->remove(i
);
1173 NV50LoweringPreSSA::handleDIV(Instruction
*i
)
1175 if (!isFloatType(i
->dType
))
1177 bld
.setPosition(i
, false);
1178 Instruction
*rcp
= bld
.mkOp1(OP_RCP
, i
->dType
, bld
.getSSA(), i
->getSrc(1));
1180 i
->setSrc(1, rcp
->getDef(0));
1185 NV50LoweringPreSSA::handleSQRT(Instruction
*i
)
1187 Instruction
*rsq
= bld
.mkOp1(OP_RSQ
, TYPE_F32
,
1188 bld
.getSSA(), i
->getSrc(0));
1190 i
->setSrc(1, rsq
->getDef(0));
// Lowers a POW instruction. The visible code builds, in one scratch value:
//   val = OP_LG2(source 0)
//   val = source 1 * val      (with the dnz flag set on the multiply)
//   val = OP_PREEX2(val)
// NOTE(review): how `i` itself is rewritten to consume `val` is not visible in
// this excerpt -- presumably it becomes the final exponentiation; confirm.
1196 NV50LoweringPreSSA::handlePOW(Instruction
*i
)
1198 LValue
*val
= bld
.getScratch();
1200 bld
.mkOp1(OP_LG2
, TYPE_F32
, val
, i
->getSrc(0));
// The multiply is the only instruction here that gets dnz = 1.
1201 bld
.mkOp2(OP_MUL
, TYPE_F32
, val
, i
->getSrc(1), val
)->dnz
= 1;
1202 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, val
, val
);
// Lowers an EXPORT instruction. Only fragment programs are handled by the code
// visible here: a directly addressed export is retargeted at a GPR whose
// register id is derived from the byte offset of source 0.
// NOTE(review): what happens for indirectly addressed exports (beyond the TODO)
// and the remaining rewrites of `i` are not visible in this excerpt.
1212 NV50LoweringPreSSA::handleEXPORT(Instruction
*i
)
1214 if (prog
->getType() == Program::TYPE_FRAGMENT
) {
1215 if (i
->getIndirect(0, 0)) {
1216 // TODO: redirect to l[] here, load to GPRs at exit
// Register id = byte offset of the source 0 symbol divided by 4.
1219 int id
= i
->getSrc(0)->reg
.data
.offset
/ 4; // in 32 bit reg units
// Mark with the MOV_FINAL sub-operation and move the exported value
// (source 1) into the source 0 slot.
1222 i
->subOp
= NV50_IR_SUBOP_MOV_FINAL
;
1223 i
->src(0).set(i
->src(1));
// Give the instruction a new GPR destination carrying the computed id.
1225 i
->setDef(0, new_LValue(func
, FILE_GPR
));
1226 i
->getDef(0)->reg
.data
.id
= id
;
// Keep prog->maxGPR at least as large as the register id just used.
1228 prog
->maxGPR
= MAX2(prog
->maxGPR
, id
);
1234 // Handle indirect addressing in geometry shaders:
1236 // ld $r0 a[$a1][$a2+k] ->
1237 // ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
//
// Only loads whose source 0 reports isIndirect(1) are rewritten; such loads
// are asserted to occur in geometry programs. At the end of the rewrite the
// dimension-1 indirect is cleared and `addr` is installed as the dimension-0
// indirect. When a dimension-0 indirect is also present, `addr` is first
// replaced by a newly computed address value (see the inline comments below).
// NOTE(review): the declarations of `a` and `b` and the last operand of the
// OP_MAD are not visible in this excerpt.
1240 NV50LoweringPreSSA::handleLOAD(Instruction
*i
)
1242 ValueRef src
= i
->src(0);
1244 if (src
.isIndirect(1)) {
1245 assert(prog
->getType() == Program::TYPE_GEOMETRY
);
1246 Value
*addr
= i
->getIndirect(0, 1);
1248 if (src
.isIndirect(0)) {
1249 // base address is in an address register, so move to a GPR
1250 Value
*base
= bld
.getScratch();
1251 bld
.mkMov(base
, addr
);
// vstride is read from the SV_VERTEX_STRIDE system value; attrib is the
// dimension-0 indirect shifted left by 2.
1253 Symbol
*sv
= bld
.mkSysVal(SV_VERTEX_STRIDE
, 0);
1254 Value
*vstride
= bld
.mkOp1v(OP_RDSV
, TYPE_U32
, bld
.getSSA(), sv
);
1255 Value
*attrib
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(),
1256 i
->getIndirect(0, 0), bld
.mkImm(2));
1258 // Calculate final address: addr = base + attr*vstride; use 16-bit
1259 // multiplication since 32-bit would be lowered to multiple
1260 // instructions, and we only need the low 16 bits of the result
// Both 32-bit values are split in two; element [0] of each split feeds the
// U16 multiply-add.
1262 bld
.mkSplit(a
, 2, attrib
);
1263 bld
.mkSplit(b
, 2, vstride
);
1264 Value
*sum
= bld
.mkOp3v(OP_MAD
, TYPE_U16
, bld
.getSSA(), a
[0], b
[0],
1267 // move address from GPR into an address register
1268 addr
= bld
.getSSA(2, FILE_ADDRESS
);
1269 bld
.mkMov(addr
, sum
);
// Drop the dimension-1 indirect and use `addr` as the only indirect.
1272 i
->setIndirect(0, 1, NULL
);
1273 i
->setIndirect(0, 0, addr
);
// Lowers a PFETCH instruction; asserted to occur only in geometry programs.
// Source 0 is taken as an immediate (asImm()) whose u32 value is asserted to
// be at most 127. When a source 1 exists, it is shifted left by 2 into a
// FILE_ADDRESS value, a new OP_PFETCH using the immediate and that pointer is
// emitted into a scratch value, and source 1 of `i` is replaced by immediate 0.
// NOTE(review): the remaining rewrites of `i` (its opcode and source 0) are
// not visible in this excerpt -- presumably `i` ends up consuming `val`.
1280 NV50LoweringPreSSA::handlePFETCH(Instruction
*i
)
1282 assert(prog
->getType() == Program::TYPE_GEOMETRY
);
1284 // NOTE: cannot use getImmediate here, not in SSA form yet, move to
1285 // later phase if that assertion ever triggers:
1287 ImmediateValue
*imm
= i
->getSrc(0)->asImm();
1290 assert(imm
->reg
.data
.u32
<= 127); // TODO: use address reg if that happens
1292 if (i
->srcExists(1)) {
1293 // indirect addressing of vertex in primitive space
1295 LValue
*val
= bld
.getScratch();
1296 Value
*ptr
= bld
.getSSA(2, FILE_ADDRESS
);
// ptr = source 1 << 2; val = PFETCH(imm, ptr)
1297 bld
.mkOp2v(OP_SHL
, TYPE_U32
, ptr
, i
->getSrc(1), bld
.mkImm(2));
1298 bld
.mkOp2v(OP_PFETCH
, TYPE_U32
, val
, imm
, ptr
);
1300 // NOTE: PFETCH directly to an $aX only works with direct addressing
1303 i
->setSrc(1, bld
.mkImm(0));
1309 // Set flags according to predicate and make the instruction read $cX.
//
// The predicate of `insn` is fetched and its register file is compared with
// FILE_FLAGS and FILE_PREDICATE. The code that follows creates a FILE_FLAGS
// value, writes it with an OP_SET / CC_NEU comparison of immediate 0 against
// the predicate (both operand and result typed as insn->dType), and installs
// that flags value as the predicate while keeping the existing insn->cc.
// NOTE(review): the start of the file test and the statement it guards are
// not visible in this excerpt -- presumably an early return when the
// predicate is absent or already in one of those files; confirm.
1311 NV50LoweringPreSSA::checkPredicate(Instruction
*insn
)
1313 Value
*pred
= insn
->getPredicate();
1316 // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
1318 pred
->reg
.file
== FILE_FLAGS
|| pred
->reg
.file
== FILE_PREDICATE
)
1321 cdst
= bld
.getSSA(1, FILE_FLAGS
);
// cdst = (0 != pred)
1323 bld
.mkCmp(OP_SET
, CC_NEU
, insn
->dType
, cdst
, insn
->dType
, bld
.loadImm(NULL
, 0), pred
);
1325 insn
->setPredicate(insn
->cc
, cdst
);
1329 // - add quadop dance for texturing
1330 // - put FP outputs in GPRs
1331 // - convert instruction sequences
//
// Per-instruction entry point of the pass. The builder is positioned at `i`,
// a test on i->cc against CC_ALWAYS follows, and then -- per the returns
// visible here -- the instruction is forwarded to the matching handle*
// routine. Texture handlers receive i->asTex(); handleSLCT receives
// i->asCmp(); the others receive `i` itself.
// NOTE(review): the statement guarded by the cc test and the case labels that
// select each handler are not visible in this excerpt.
1334 NV50LoweringPreSSA::visit(Instruction
*i
)
1336 bld
.setPosition(i
, false);
1338 if (i
->cc
!= CC_ALWAYS
)
1345 return handleTEX(i
->asTex());
1347 return handleTXB(i
->asTex());
1349 return handleTXL(i
->asTex());
1351 return handleTXD(i
->asTex());
1353 return handleTXLQ(i
->asTex());
1355 return handleTXQ(i
->asTex());
// Inline lowering: OP_PREEX2 of source 0 is written to the instruction's own
// destination, which is then fed back in as source 0.
// NOTE(review): presumably the EX2 case -- the label is not visible; confirm.
1357 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, i
->getDef(0), i
->getSrc(0));
1358 i
->setSrc(0, i
->getDef(0));
1361 return handleSET(i
);
1363 return handleSLCT(i
->asCmp());
1365 return handleSELP(i
);
1367 return handlePOW(i
);
1369 return handleDIV(i
);
1371 return handleSQRT(i
);
1373 return handleEXPORT(i
);
1375 return handleLOAD(i
);
1377 return handleRDSV(i
);
1379 return handleWRSV(i
);
1381 return handleCALL(i
);
1383 return handlePRECONT(i
);
1385 return handleCONT(i
);
1387 return handlePFETCH(i
);
// Runs the legalization pass that belongs to the given code generation stage:
//   CG_STAGE_PRE_SSA -> NV50LoweringPreSSA
//   CG_STAGE_SSA     -> NV50LegalizeSSA
//   CG_STAGE_POST_RA -> NV50LegalizePostRA
// Each pass is invoked as pass.run(prog, false, true) and its result is stored
// in `ret`.
// prog->targetPriv is used as a std::list<Instruction *>: it is allocated in
// the SSA stage if not already set, and deleted after the POST_RA pass.
// NOTE(review): no reset of prog->targetPriv after the delete is visible here;
// if there is none, the pointer dangles and a second legalize run on the same
// program would reuse or re-delete it -- confirm. The null test before
// `delete` is redundant, since deleting a null pointer is a no-op.
// NOTE(review): the declaration of `ret`, the links between the stage tests and
// the return statement are not visible in this excerpt.
1395 TargetNV50::runLegalizePass(Program
*prog
, CGStage stage
) const
1399 if (stage
== CG_STAGE_PRE_SSA
) {
1400 NV50LoweringPreSSA
pass(prog
);
1401 ret
= pass
.run(prog
, false, true);
1403 if (stage
== CG_STAGE_SSA
) {
// Create the per-program instruction list on first use.
1404 if (!prog
->targetPriv
)
1405 prog
->targetPriv
= new std::list
<Instruction
*>();
1406 NV50LegalizeSSA
pass(prog
);
1407 ret
= pass
.run(prog
, false, true);
1409 if (stage
== CG_STAGE_POST_RA
) {
1410 NV50LegalizePostRA pass
;
1411 ret
= pass
.run(prog
, false, true);
// Release the list allocated during the SSA stage.
1412 if (prog
->targetPriv
)
1413 delete reinterpret_cast<std::list
<Instruction
*> *>(prog
->targetPriv
);
1418 } // namespace nv50_ir