/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_build_util.h"
26 #include "codegen/nv50_ir_target_nv50.h"
// nv50 doesn't support 32 bit integer multiplication
//
//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
//       -------------------
//    al*bh 00            HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
// ah*bh 00 00                  (           carry1) << 16 + ( carry2)
//
// fffe0001 + fffe0001
//
// Note that this sort of splitting doesn't work for signed values, so we
// compute the sign on those manually and then perform an unsigned multiply.
44 expandIntegerMUL(BuildUtil
*bld
, Instruction
*mul
)
46 const bool highResult
= mul
->subOp
== NV50_IR_SUBOP_MUL_HIGH
;
48 bool src1imm
= mul
->src(1).getImmediate(src1
);
50 DataType fTy
; // full type
52 case TYPE_S32
: fTy
= TYPE_U32
; break;
53 case TYPE_S64
: fTy
= TYPE_U64
; break;
54 default: fTy
= mul
->sType
; break;
57 DataType hTy
; // half type
59 case TYPE_U32
: hTy
= TYPE_U16
; break;
60 case TYPE_U64
: hTy
= TYPE_U32
; break;
64 unsigned int fullSize
= typeSizeof(fTy
);
65 unsigned int halfSize
= typeSizeof(hTy
);
69 bld
->setPosition(mul
, true);
74 for (int j
= 0; j
< 4; ++j
)
75 t
[j
] = bld
->getSSA(fullSize
);
77 if (isSignedType(mul
->sType
) && highResult
) {
78 s
[0] = bld
->getSSA(fullSize
);
79 s
[1] = bld
->getSSA(fullSize
);
80 bld
->mkOp1(OP_ABS
, mul
->sType
, s
[0], mul
->getSrc(0));
81 bld
->mkOp1(OP_ABS
, mul
->sType
, s
[1], mul
->getSrc(1));
82 src1
.reg
.data
.s32
= abs(src1
.reg
.data
.s32
);
84 s
[0] = mul
->getSrc(0);
85 s
[1] = mul
->getSrc(1);
88 // split sources into halves
89 i
[0] = bld
->mkSplit(a
, halfSize
, s
[0]);
90 i
[1] = bld
->mkSplit(b
, halfSize
, s
[1]);
92 if (src1imm
&& (src1
.reg
.data
.u32
& 0xffff0000) == 0) {
93 i
[2] = i
[3] = bld
->mkOp2(OP_MUL
, fTy
, t
[1], a
[1],
94 bld
->mkImm(src1
.reg
.data
.u32
& 0xffff));
96 i
[2] = bld
->mkOp2(OP_MUL
, fTy
, t
[0], a
[0],
97 src1imm
? bld
->mkImm(src1
.reg
.data
.u32
>> 16) : b
[1]);
98 if (src1imm
&& (src1
.reg
.data
.u32
& 0x0000ffff) == 0) {
102 i
[3] = bld
->mkOp3(OP_MAD
, fTy
, t
[1], a
[1], b
[0], t
[0]);
105 i
[7] = bld
->mkOp2(OP_SHL
, fTy
, t
[2], t
[1], bld
->mkImm(halfSize
* 8));
106 if (src1imm
&& (src1
.reg
.data
.u32
& 0x0000ffff) == 0) {
110 i
[4] = bld
->mkOp3(OP_MAD
, fTy
, t
[3], a
[0], b
[0], t
[2]);
116 Value
*imm
= bld
->loadImm(NULL
, 1 << (halfSize
* 8));
117 c
[0] = bld
->getSSA(1, FILE_FLAGS
);
118 c
[1] = bld
->getSSA(1, FILE_FLAGS
);
119 for (int j
= 0; j
< 5; ++j
)
120 r
[j
] = bld
->getSSA(fullSize
);
122 i
[8] = bld
->mkOp2(OP_SHR
, fTy
, r
[0], t
[1], bld
->mkImm(halfSize
* 8));
123 i
[6] = bld
->mkOp2(OP_ADD
, fTy
, r
[1], r
[0], imm
);
124 bld
->mkMov(r
[3], r
[0])->setPredicate(CC_NC
, c
[0]);
125 bld
->mkOp2(OP_UNION
, TYPE_U32
, r
[2], r
[1], r
[3]);
126 i
[5] = bld
->mkOp3(OP_MAD
, fTy
, r
[4], a
[1], b
[1], r
[2]);
128 // set carry defs / sources
129 i
[3]->setFlagsDef(1, c
[0]);
130 // actual result required in negative case, but ignored for
131 // unsigned. for some reason the compiler ends up dropping the whole
132 // instruction if the destination is unused but the flags are.
133 if (isSignedType(mul
->sType
))
134 i
[4]->setFlagsDef(1, c
[1]);
136 i
[4]->setFlagsDef(0, c
[1]);
137 i
[6]->setPredicate(CC_C
, c
[0]);
138 i
[5]->setFlagsSrc(3, c
[1]);
140 if (isSignedType(mul
->sType
)) {
143 Value
*one
= bld
->getSSA(fullSize
);
144 bld
->loadImm(one
, 1);
145 for (int j
= 0; j
< 7; j
++)
146 rr
[j
] = bld
->getSSA(fullSize
);
148 // NOTE: this logic uses predicates because splitting basic blocks is
149 // ~impossible during the SSA phase. The RA relies on a correlation
150 // between edge order and phi node sources.
152 // Set the sign of the result based on the inputs
153 bld
->mkOp2(OP_XOR
, fTy
, NULL
, mul
->getSrc(0), mul
->getSrc(1))
154 ->setFlagsDef(0, (cc
[0] = bld
->getSSA(1, FILE_FLAGS
)));
156 // 1s complement of 64-bit value
157 bld
->mkOp1(OP_NOT
, fTy
, rr
[0], r
[4])
158 ->setPredicate(CC_S
, cc
[0]);
159 bld
->mkOp1(OP_NOT
, fTy
, rr
[1], t
[3])
160 ->setPredicate(CC_S
, cc
[0]);
162 // add to low 32-bits, keep track of the carry
163 Instruction
*n
= bld
->mkOp2(OP_ADD
, fTy
, NULL
, rr
[1], one
);
164 n
->setPredicate(CC_S
, cc
[0]);
165 n
->setFlagsDef(0, (cc
[1] = bld
->getSSA(1, FILE_FLAGS
)));
167 // If there was a carry, add 1 to the upper 32 bits
168 // XXX: These get executed even if they shouldn't be
169 bld
->mkOp2(OP_ADD
, fTy
, rr
[2], rr
[0], one
)
170 ->setPredicate(CC_C
, cc
[1]);
171 bld
->mkMov(rr
[3], rr
[0])
172 ->setPredicate(CC_NC
, cc
[1]);
173 bld
->mkOp2(OP_UNION
, fTy
, rr
[4], rr
[2], rr
[3]);
175 // Merge the results from the negative and non-negative paths
176 bld
->mkMov(rr
[5], rr
[4])
177 ->setPredicate(CC_S
, cc
[0]);
178 bld
->mkMov(rr
[6], r
[4])
179 ->setPredicate(CC_NS
, cc
[0]);
180 bld
->mkOp2(OP_UNION
, mul
->sType
, mul
->getDef(0), rr
[5], rr
[6]);
182 bld
->mkMov(mul
->getDef(0), r
[4]);
185 bld
->mkMov(mul
->getDef(0), t
[3]);
187 delete_Instruction(bld
->getProgram(), mul
);
189 for (int j
= 2; j
<= (highResult
? 5 : 4); ++j
)
// Pack four 2-bit quad-op lane codes into a single byte; the first
// argument occupies the top two bits, the last the bottom two.
#define QUADOP(q, r, s, t)                      \
   ((QOP_##q << 6) | (QOP_##r << 4) |           \
    (QOP_##s << 2) | (QOP_##t << 0))
206 class NV50LegalizePostRA
: public Pass
209 virtual bool visit(Function
*);
210 virtual bool visit(BasicBlock
*);
212 void handlePRERET(FlowInstruction
*);
213 void replaceZero(Instruction
*);
219 NV50LegalizePostRA::visit(Function
*fn
)
221 Program
*prog
= fn
->getProgram();
223 r63
= new_LValue(fn
, FILE_GPR
);
224 // GPR units on nv50 are in half-regs
225 if (prog
->maxGPR
< 126)
226 r63
->reg
.data
.id
= 63;
228 r63
->reg
.data
.id
= 127;
230 // this is actually per-program, but we can do it all on visiting main()
231 std::list
<Instruction
*> *outWrites
=
232 reinterpret_cast<std::list
<Instruction
*> *>(prog
->targetPriv
);
235 for (std::list
<Instruction
*>::iterator it
= outWrites
->begin();
236 it
!= outWrites
->end(); ++it
)
237 (*it
)->getSrc(1)->defs
.front()->getInsn()->setDef(0, (*it
)->getSrc(0));
238 // instructions will be deleted on exit
246 NV50LegalizePostRA::replaceZero(Instruction
*i
)
248 for (int s
= 0; i
->srcExists(s
); ++s
) {
249 ImmediateValue
*imm
= i
->getSrc(s
)->asImm();
250 if (imm
&& imm
->reg
.data
.u64
== 0)
// Emulate PRERET: jump to the target and call to the origin from there
//
// WARNING: atm only works if BBs are affected by at most a single PRERET
//
//   bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
//   bra BB:3 + n1 (skip the call)
//   call BB:0 + n2 (skip bra at beginning of BB:0)
273 NV50LegalizePostRA::handlePRERET(FlowInstruction
*pre
)
275 BasicBlock
*bbE
= pre
->bb
;
276 BasicBlock
*bbT
= pre
->target
.bb
;
278 pre
->subOp
= NV50_IR_SUBOP_EMU_PRERET
+ 0;
280 bbE
->insertHead(pre
);
282 Instruction
*skip
= new_FlowInstruction(func
, OP_PRERET
, bbT
);
283 Instruction
*call
= new_FlowInstruction(func
, OP_PRERET
, bbE
);
285 bbT
->insertHead(call
);
286 bbT
->insertHead(skip
);
288 // NOTE: maybe split blocks to prevent the instructions from moving ?
290 skip
->subOp
= NV50_IR_SUBOP_EMU_PRERET
+ 1;
291 call
->subOp
= NV50_IR_SUBOP_EMU_PRERET
+ 2;
295 NV50LegalizePostRA::visit(BasicBlock
*bb
)
297 Instruction
*i
, *next
;
299 // remove pseudo operations and non-fixed no-ops, split 64 bit operations
300 for (i
= bb
->getFirst(); i
; i
= next
) {
305 if (i
->op
== OP_PRERET
&& prog
->getTarget()->getChipset() < 0xa0) {
306 handlePRERET(i
->asFlow());
308 // TODO: We will want to do this before register allocation,
309 // since have to use a $c register for the carry flag.
310 if (typeSizeof(i
->dType
) == 8) {
311 Instruction
*hi
= BuildUtil::split64BitOpPostRA(func
, i
, r63
, NULL
);
316 if (i
->op
!= OP_PFETCH
&& i
->op
!= OP_BAR
&&
317 (!i
->defExists(0) || i
->def(0).getFile() != FILE_ADDRESS
))
327 class NV50LegalizeSSA
: public Pass
330 NV50LegalizeSSA(Program
*);
332 virtual bool visit(BasicBlock
*bb
);
335 void propagateWriteToOutput(Instruction
*);
336 void handleDIV(Instruction
*);
337 void handleMOD(Instruction
*);
338 void handleMUL(Instruction
*);
339 void handleAddrDef(Instruction
*);
341 inline bool isARL(const Instruction
*) const;
345 std::list
<Instruction
*> *outWrites
;
348 NV50LegalizeSSA::NV50LegalizeSSA(Program
*prog
)
350 bld
.setProgram(prog
);
352 if (prog
->optLevel
>= 2 &&
353 (prog
->getType() == Program::TYPE_GEOMETRY
||
354 prog
->getType() == Program::TYPE_VERTEX
))
356 reinterpret_cast<std::list
<Instruction
*> *>(prog
->targetPriv
);
362 NV50LegalizeSSA::propagateWriteToOutput(Instruction
*st
)
364 if (st
->src(0).isIndirect(0) || st
->getSrc(1)->refCount() != 1)
367 // check def instruction can store
368 Instruction
*di
= st
->getSrc(1)->defs
.front()->getInsn();
370 // TODO: move exports (if beneficial) in common opt pass
371 if (di
->isPseudo() || isTextureOp(di
->op
) || di
->defCount(0xff, true) > 1)
374 for (int s
= 0; di
->srcExists(s
); ++s
)
375 if (di
->src(s
).getFile() == FILE_IMMEDIATE
||
376 di
->src(s
).getFile() == FILE_MEMORY_LOCAL
)
379 if (prog
->getType() == Program::TYPE_GEOMETRY
) {
380 // Only propagate output writes in geometry shaders when we can be sure
381 // that we are propagating to the same output vertex.
382 if (di
->bb
!= st
->bb
)
385 for (i
= di
; i
!= st
; i
= i
->next
) {
386 if (i
->op
== OP_EMIT
|| i
->op
== OP_RESTART
)
389 assert(i
); // st after di
392 // We cannot set defs to non-lvalues before register allocation, so
393 // save & remove (to save registers) the exports and replace later.
394 outWrites
->push_back(st
);
399 NV50LegalizeSSA::isARL(const Instruction
*i
) const
403 if (i
->op
!= OP_SHL
|| i
->src(0).getFile() != FILE_GPR
)
405 if (!i
->src(1).getImmediate(imm
))
407 return imm
.isInteger(0);
411 NV50LegalizeSSA::handleAddrDef(Instruction
*i
)
415 i
->getDef(0)->reg
.size
= 2; // $aX are only 16 bit
417 // PFETCH can always write to $a
418 if (i
->op
== OP_PFETCH
)
420 // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
421 if (i
->srcExists(1) && i
->src(1).getFile() == FILE_IMMEDIATE
) {
422 if (i
->op
== OP_SHL
&& i
->src(0).getFile() == FILE_GPR
)
424 if (i
->op
== OP_ADD
&& i
->src(0).getFile() == FILE_ADDRESS
)
428 // turn $a sources into $r sources (can't operate on $a)
429 for (int s
= 0; i
->srcExists(s
); ++s
) {
430 Value
*a
= i
->getSrc(s
);
432 if (a
->reg
.file
== FILE_ADDRESS
) {
433 if (a
->getInsn() && isARL(a
->getInsn())) {
434 i
->setSrc(s
, a
->getInsn()->getSrc(0));
436 bld
.setPosition(i
, false);
443 if (i
->op
== OP_SHL
&& i
->src(1).getFile() == FILE_IMMEDIATE
)
446 // turn result back into $a
447 bld
.setPosition(i
, true);
448 arl
= bld
.mkOp2(OP_SHL
, TYPE_U32
, i
->getDef(0), bld
.getSSA(), bld
.mkImm(0));
449 i
->setDef(0, arl
->getSrc(0));
453 NV50LegalizeSSA::handleMUL(Instruction
*mul
)
455 if (isFloatType(mul
->sType
) || typeSizeof(mul
->sType
) <= 2)
457 Value
*def
= mul
->getDef(0);
458 Value
*pred
= mul
->getPredicate();
459 CondCode cc
= mul
->cc
;
461 mul
->setPredicate(CC_ALWAYS
, NULL
);
463 if (mul
->op
== OP_MAD
) {
464 Instruction
*add
= mul
;
465 bld
.setPosition(add
, false);
466 Value
*res
= cloneShallow(func
, mul
->getDef(0));
467 mul
= bld
.mkOp2(OP_MUL
, add
->sType
, res
, add
->getSrc(0), add
->getSrc(1));
469 add
->setSrc(0, mul
->getDef(0));
470 add
->setSrc(1, add
->getSrc(2));
471 for (int s
= 2; add
->srcExists(s
); ++s
)
472 add
->setSrc(s
, NULL
);
473 mul
->subOp
= add
->subOp
;
476 expandIntegerMUL(&bld
, mul
);
478 def
->getInsn()->setPredicate(cc
, pred
);
// Use f32 division: first compute an approximate result, use it to reduce
// the dividend, which should then be representable as f32, divide the reduced
// dividend, and add the quotients.
485 NV50LegalizeSSA::handleDIV(Instruction
*div
)
487 const DataType ty
= div
->sType
;
489 if (ty
!= TYPE_U32
&& ty
!= TYPE_S32
)
492 Value
*q
, *q0
, *qf
, *aR
, *aRf
, *qRf
, *qR
, *t
, *s
, *m
, *cond
;
494 bld
.setPosition(div
, false);
496 Value
*a
, *af
= bld
.getSSA();
497 Value
*b
, *bf
= bld
.getSSA();
499 bld
.mkCvt(OP_CVT
, TYPE_F32
, af
, ty
, div
->getSrc(0));
500 bld
.mkCvt(OP_CVT
, TYPE_F32
, bf
, ty
, div
->getSrc(1));
502 if (isSignedType(ty
)) {
503 af
->getInsn()->src(0).mod
= Modifier(NV50_IR_MOD_ABS
);
504 bf
->getInsn()->src(0).mod
= Modifier(NV50_IR_MOD_ABS
);
507 bld
.mkOp1(OP_ABS
, ty
, a
, div
->getSrc(0));
508 bld
.mkOp1(OP_ABS
, ty
, b
, div
->getSrc(1));
514 bf
= bld
.mkOp1v(OP_RCP
, TYPE_F32
, bld
.getSSA(), bf
);
515 bf
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getSSA(), bf
, bld
.mkImm(-2));
517 bld
.mkOp2(OP_MUL
, TYPE_F32
, (qf
= bld
.getSSA()), af
, bf
)->rnd
= ROUND_Z
;
518 bld
.mkCvt(OP_CVT
, ty
, (q0
= bld
.getSSA()), TYPE_F32
, qf
)->rnd
= ROUND_Z
;
520 // get error of 1st result
521 expandIntegerMUL(&bld
,
522 bld
.mkOp2(OP_MUL
, TYPE_U32
, (t
= bld
.getSSA()), q0
, b
));
523 bld
.mkOp2(OP_SUB
, TYPE_U32
, (aRf
= bld
.getSSA()), a
, t
);
525 bld
.mkCvt(OP_CVT
, TYPE_F32
, (aR
= bld
.getSSA()), TYPE_U32
, aRf
);
527 bld
.mkOp2(OP_MUL
, TYPE_F32
, (qRf
= bld
.getSSA()), aR
, bf
)->rnd
= ROUND_Z
;
528 bld
.mkCvt(OP_CVT
, TYPE_U32
, (qR
= bld
.getSSA()), TYPE_F32
, qRf
)
530 bld
.mkOp2(OP_ADD
, ty
, (q
= bld
.getSSA()), q0
, qR
); // add quotients
532 // correction: if modulus >= divisor, add 1
533 expandIntegerMUL(&bld
,
534 bld
.mkOp2(OP_MUL
, TYPE_U32
, (t
= bld
.getSSA()), q
, b
));
535 bld
.mkOp2(OP_SUB
, TYPE_U32
, (m
= bld
.getSSA()), a
, t
);
536 bld
.mkCmp(OP_SET
, CC_GE
, TYPE_U32
, (s
= bld
.getSSA()), TYPE_U32
, m
, b
);
537 if (!isSignedType(ty
)) {
543 bld
.mkOp2(OP_SUB
, TYPE_U32
, (q
= bld
.getSSA()), t
, s
);
547 bld
.mkOp2(OP_XOR
, TYPE_U32
, NULL
, div
->getSrc(0), div
->getSrc(1))
548 ->setFlagsDef(0, (cond
= bld
.getSSA(1, FILE_FLAGS
)));
549 bld
.mkOp1(OP_NEG
, ty
, s
, q
)->setPredicate(CC_S
, cond
);
550 bld
.mkOp1(OP_MOV
, ty
, t
, q
)->setPredicate(CC_NS
, cond
);
559 NV50LegalizeSSA::handleMOD(Instruction
*mod
)
561 if (mod
->dType
!= TYPE_U32
&& mod
->dType
!= TYPE_S32
)
563 bld
.setPosition(mod
, false);
565 Value
*q
= bld
.getSSA();
566 Value
*m
= bld
.getSSA();
568 bld
.mkOp2(OP_DIV
, mod
->dType
, q
, mod
->getSrc(0), mod
->getSrc(1));
569 handleDIV(q
->getInsn());
571 bld
.setPosition(mod
, false);
572 expandIntegerMUL(&bld
, bld
.mkOp2(OP_MUL
, TYPE_U32
, m
, q
, mod
->getSrc(1)));
579 NV50LegalizeSSA::visit(BasicBlock
*bb
)
581 Instruction
*insn
, *next
;
582 // skipping PHIs (don't pass them to handleAddrDef) !
583 for (insn
= bb
->getEntry(); insn
; insn
= next
) {
586 if (insn
->defExists(0) && insn
->getDef(0)->reg
.file
== FILE_ADDRESS
)
592 propagateWriteToOutput(insn
);
611 class NV50LoweringPreSSA
: public Pass
614 NV50LoweringPreSSA(Program
*);
617 virtual bool visit(Instruction
*);
618 virtual bool visit(Function
*);
620 bool handleRDSV(Instruction
*);
621 bool handleWRSV(Instruction
*);
623 bool handlePFETCH(Instruction
*);
624 bool handleEXPORT(Instruction
*);
625 bool handleLOAD(Instruction
*);
627 bool handleDIV(Instruction
*);
628 bool handleSQRT(Instruction
*);
629 bool handlePOW(Instruction
*);
631 bool handleSET(Instruction
*);
632 bool handleSLCT(CmpInstruction
*);
633 bool handleSELP(Instruction
*);
635 bool handleTEX(TexInstruction
*);
636 bool handleTXB(TexInstruction
*); // I really
637 bool handleTXL(TexInstruction
*); // hate
638 bool handleTXD(TexInstruction
*); // these 3
639 bool handleTXLQ(TexInstruction
*);
640 bool handleTXQ(TexInstruction
*);
642 bool handleCALL(Instruction
*);
643 bool handlePRECONT(Instruction
*);
644 bool handleCONT(Instruction
*);
646 void checkPredicate(Instruction
*);
647 void loadTexMsInfo(uint32_t off
, Value
**ms
, Value
**ms_x
, Value
**ms_y
);
648 void loadMsInfo(Value
*ms
, Value
*s
, Value
**dx
, Value
**dy
);
651 const Target
*const targ
;
658 NV50LoweringPreSSA::NV50LoweringPreSSA(Program
*prog
) :
659 targ(prog
->getTarget()), tid(NULL
)
661 bld
.setProgram(prog
);
665 NV50LoweringPreSSA::visit(Function
*f
)
667 BasicBlock
*root
= BasicBlock::get(func
->cfg
.getRoot());
669 if (prog
->getType() == Program::TYPE_COMPUTE
) {
670 // Add implicit "thread id" argument in $r0 to the function
671 Value
*arg
= new_LValue(func
, FILE_GPR
);
672 arg
->reg
.data
.id
= 0;
673 f
->ins
.push_back(arg
);
675 bld
.setPosition(root
, false);
676 tid
= bld
.mkMov(bld
.getScratch(), arg
, TYPE_U32
)->getDef(0);
682 void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off
, Value
**ms
,
683 Value
**ms_x
, Value
**ms_y
) {
684 // This loads the texture-indexed ms setting from the constant buffer
685 Value
*tmp
= new_LValue(func
, FILE_GPR
);
686 uint8_t b
= prog
->driver
->io
.auxCBSlot
;
687 off
+= prog
->driver
->io
.suInfoBase
;
688 if (prog
->getType() > Program::TYPE_VERTEX
)
690 if (prog
->getType() > Program::TYPE_GEOMETRY
)
692 *ms_x
= bld
.mkLoadv(TYPE_U32
, bld
.mkSymbol(
693 FILE_MEMORY_CONST
, b
, TYPE_U32
, off
+ 0), NULL
);
694 *ms_y
= bld
.mkLoadv(TYPE_U32
, bld
.mkSymbol(
695 FILE_MEMORY_CONST
, b
, TYPE_U32
, off
+ 4), NULL
);
696 *ms
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, tmp
, *ms_x
, *ms_y
);
699 void NV50LoweringPreSSA::loadMsInfo(Value
*ms
, Value
*s
, Value
**dx
, Value
**dy
) {
700 // Given a MS level, and a sample id, compute the delta x/y
701 uint8_t b
= prog
->driver
->io
.msInfoCBSlot
;
702 Value
*off
= new_LValue(func
, FILE_ADDRESS
), *t
= new_LValue(func
, FILE_GPR
);
704 // The required information is at mslevel * 16 * 4 + sample * 8
705 // = (mslevel * 8 + sample) * 8
709 bld
.mkOp2v(OP_ADD
, TYPE_U32
, t
,
710 bld
.mkOp2v(OP_SHL
, TYPE_U32
, t
, ms
, bld
.mkImm(3)),
713 *dx
= bld
.mkLoadv(TYPE_U32
, bld
.mkSymbol(
714 FILE_MEMORY_CONST
, b
, TYPE_U32
,
715 prog
->driver
->io
.msInfoBase
), off
);
716 *dy
= bld
.mkLoadv(TYPE_U32
, bld
.mkSymbol(
717 FILE_MEMORY_CONST
, b
, TYPE_U32
,
718 prog
->driver
->io
.msInfoBase
+ 4), off
);
722 NV50LoweringPreSSA::handleTEX(TexInstruction
*i
)
724 const int arg
= i
->tex
.target
.getArgCount();
725 const int dref
= arg
;
726 const int lod
= i
->tex
.target
.isShadow() ? (arg
+ 1) : arg
;
728 /* Only normalize in the non-explicit derivatives case.
730 if (i
->tex
.target
.isCube() && i
->op
!= OP_TXD
) {
733 for (c
= 0; c
< 3; ++c
)
734 src
[c
] = bld
.mkOp1v(OP_ABS
, TYPE_F32
, bld
.getSSA(), i
->getSrc(c
));
735 val
= bld
.getScratch();
736 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[0], src
[1]);
737 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[2], val
);
738 bld
.mkOp1(OP_RCP
, TYPE_F32
, val
, val
);
739 for (c
= 0; c
< 3; ++c
) {
740 i
->setSrc(c
, bld
.mkOp2v(OP_MUL
, TYPE_F32
, bld
.getSSA(),
745 // handle MS, which means looking up the MS params for this texture, and
746 // adjusting the input coordinates to point at the right sample.
747 if (i
->tex
.target
.isMS()) {
748 Value
*x
= i
->getSrc(0);
749 Value
*y
= i
->getSrc(1);
750 Value
*s
= i
->getSrc(arg
- 1);
751 Value
*tx
= new_LValue(func
, FILE_GPR
), *ty
= new_LValue(func
, FILE_GPR
),
752 *ms
, *ms_x
, *ms_y
, *dx
, *dy
;
754 i
->tex
.target
.clearMS();
756 loadTexMsInfo(i
->tex
.r
* 4 * 2, &ms
, &ms_x
, &ms_y
);
757 loadMsInfo(ms
, s
, &dx
, &dy
);
759 bld
.mkOp2(OP_SHL
, TYPE_U32
, tx
, x
, ms_x
);
760 bld
.mkOp2(OP_SHL
, TYPE_U32
, ty
, y
, ms_y
);
761 bld
.mkOp2(OP_ADD
, TYPE_U32
, tx
, tx
, dx
);
762 bld
.mkOp2(OP_ADD
, TYPE_U32
, ty
, ty
, dy
);
765 i
->setSrc(arg
- 1, bld
.loadImm(NULL
, 0));
768 // dref comes before bias/lod
769 if (i
->tex
.target
.isShadow())
770 if (i
->op
== OP_TXB
|| i
->op
== OP_TXL
)
771 i
->swapSources(dref
, lod
);
773 if (i
->tex
.target
.isArray()) {
774 if (i
->op
!= OP_TXF
) {
775 // array index must be converted to u32, but it's already an integer
777 Value
*layer
= i
->getSrc(arg
- 1);
778 LValue
*src
= new_LValue(func
, FILE_GPR
);
779 bld
.mkCvt(OP_CVT
, TYPE_U32
, src
, TYPE_F32
, layer
);
780 bld
.mkOp2(OP_MIN
, TYPE_U32
, src
, src
, bld
.loadImm(NULL
, 511));
781 i
->setSrc(arg
- 1, src
);
783 if (i
->tex
.target
.isCube() && i
->srcCount() > 4) {
784 std::vector
<Value
*> acube
, a2d
;
788 for (c
= 0; c
< 4; ++c
)
789 acube
[c
] = i
->getSrc(c
);
791 for (c
= 0; c
< 3; ++c
)
792 a2d
[c
] = new_LValue(func
, FILE_GPR
);
795 bld
.mkTex(OP_TEXPREP
, TEX_TARGET_CUBE_ARRAY
, i
->tex
.r
, i
->tex
.s
,
796 a2d
, acube
)->asTex()->tex
.mask
= 0x7;
798 for (c
= 0; c
< 3; ++c
)
799 i
->setSrc(c
, a2d
[c
]);
800 for (; i
->srcExists(c
+ 1); ++c
)
801 i
->setSrc(c
, i
->getSrc(c
+ 1));
805 i
->tex
.target
= i
->tex
.target
.isShadow() ?
806 TEX_TARGET_2D_ARRAY_SHADOW
: TEX_TARGET_2D_ARRAY
;
810 // texel offsets are 3 immediate fields in the instruction,
811 // nv50 cannot do textureGatherOffsets
812 assert(i
->tex
.useOffsets
<= 1);
813 if (i
->tex
.useOffsets
) {
814 for (int c
= 0; c
< 3; ++c
) {
816 if (!i
->offset
[0][c
].getImmediate(val
))
817 assert(!"non-immediate offset");
818 i
->tex
.offset
[c
] = val
.reg
.data
.u32
;
819 i
->offset
[0][c
].set(NULL
);
// Bias must be equal for all threads of a quad or lod calculation will fail.
//
// The lanes of a quad are grouped by the bit in the condition register they
// have set, which is selected by differing bias values.
// Move the input values for TEX into a new register set for each group and
// execute TEX only for a specific group.
// We always need to use 4 new registers for the inputs/outputs because the
// implicitly calculated derivatives must be correct.
//
// TODO: move to SSA phase so we can easily determine whether bias is constant
837 NV50LoweringPreSSA::handleTXB(TexInstruction
*i
)
839 const CondCode cc
[4] = { CC_EQU
, CC_S
, CC_C
, CC_O
};
842 // We can't actually apply bias *and* do a compare for a cube
843 // texture. Since the compare has to be done before the filtering, just
844 // drop the bias on the floor.
845 if (i
->tex
.target
== TEX_TARGET_CUBE_SHADOW
) {
847 i
->setSrc(3, i
->getSrc(4));
853 Value
*bias
= i
->getSrc(i
->tex
.target
.getArgCount());
854 if (bias
->isUniform())
857 Instruction
*cond
= bld
.mkOp1(OP_UNION
, TYPE_U32
, bld
.getScratch(),
858 bld
.loadImm(NULL
, 1));
859 bld
.setPosition(cond
, false);
861 for (l
= 1; l
< 4; ++l
) {
862 const uint8_t qop
= QUADOP(SUBR
, SUBR
, SUBR
, SUBR
);
863 Value
*bit
= bld
.getSSA();
864 Value
*pred
= bld
.getScratch(1, FILE_FLAGS
);
865 Value
*imm
= bld
.loadImm(NULL
, (1 << l
));
866 bld
.mkQuadop(qop
, pred
, l
, bias
, bias
)->flagsDef
= 0;
867 bld
.mkMov(bit
, imm
)->setPredicate(CC_EQ
, pred
);
868 cond
->setSrc(l
, bit
);
870 Value
*flags
= bld
.getScratch(1, FILE_FLAGS
);
871 bld
.setPosition(cond
, true);
872 bld
.mkCvt(OP_CVT
, TYPE_U8
, flags
, TYPE_U32
, cond
->getDef(0))->flagsDef
= 0;
875 for (l
= 0; l
< 4; ++l
) {
876 (tex
[l
] = cloneForward(func
, i
))->setPredicate(cc
[l
], flags
);
881 for (d
= 0; i
->defExists(d
); ++d
)
882 res
[0][d
] = tex
[0]->getDef(d
);
883 for (l
= 1; l
< 4; ++l
) {
884 for (d
= 0; tex
[l
]->defExists(d
); ++d
) {
885 res
[l
][d
] = cloneShallow(func
, res
[0][d
]);
886 bld
.mkMov(res
[l
][d
], tex
[l
]->getDef(d
))->setPredicate(cc
[l
], flags
);
890 for (d
= 0; i
->defExists(d
); ++d
) {
891 Instruction
*dst
= bld
.mkOp(OP_UNION
, TYPE_U32
, i
->getDef(d
));
892 for (l
= 0; l
< 4; ++l
)
893 dst
->setSrc(l
, res
[l
][d
]);
895 delete_Instruction(prog
, i
);
// LOD must be equal for all threads of a quad.
// Unlike with TXB, here we can just diverge since there's no LOD calculation
// that would require all 4 threads' sources to be set up properly.
903 NV50LoweringPreSSA::handleTXL(TexInstruction
*i
)
906 Value
*lod
= i
->getSrc(i
->tex
.target
.getArgCount());
907 if (lod
->isUniform())
910 BasicBlock
*currBB
= i
->bb
;
911 BasicBlock
*texiBB
= i
->bb
->splitBefore(i
, false);
912 BasicBlock
*joinBB
= i
->bb
->splitAfter(i
);
914 bld
.setPosition(currBB
, true);
915 assert(!currBB
->joinAt
);
916 currBB
->joinAt
= bld
.mkFlow(OP_JOINAT
, joinBB
, CC_ALWAYS
, NULL
);
918 for (int l
= 0; l
<= 3; ++l
) {
919 const uint8_t qop
= QUADOP(SUBR
, SUBR
, SUBR
, SUBR
);
920 Value
*pred
= bld
.getScratch(1, FILE_FLAGS
);
921 bld
.setPosition(currBB
, true);
922 bld
.mkQuadop(qop
, pred
, l
, lod
, lod
)->flagsDef
= 0;
923 bld
.mkFlow(OP_BRA
, texiBB
, CC_EQ
, pred
)->fixed
= 1;
924 currBB
->cfg
.attach(&texiBB
->cfg
, Graph::Edge::FORWARD
);
926 BasicBlock
*laneBB
= new BasicBlock(func
);
927 currBB
->cfg
.attach(&laneBB
->cfg
, Graph::Edge::TREE
);
931 bld
.setPosition(joinBB
, false);
932 bld
.mkFlow(OP_JOIN
, NULL
, CC_ALWAYS
, NULL
)->fixed
= 1;
937 NV50LoweringPreSSA::handleTXD(TexInstruction
*i
)
939 static const uint8_t qOps
[4][2] =
941 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l0
942 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l1
943 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l2
944 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l3
949 Value
*zero
= bld
.loadImm(bld
.getSSA(), 0);
951 const int dim
= i
->tex
.target
.getDim() + i
->tex
.target
.isCube();
954 i
->op
= OP_TEX
; // no need to clone dPdx/dPdy later
955 i
->tex
.derivAll
= true;
957 for (c
= 0; c
< dim
; ++c
)
958 crd
[c
] = bld
.getScratch();
960 bld
.mkOp(OP_QUADON
, TYPE_NONE
, NULL
);
961 for (l
= 0; l
< 4; ++l
) {
963 // mov coordinates from lane l to all lanes
964 for (c
= 0; c
< dim
; ++c
)
965 bld
.mkQuadop(0x00, crd
[c
], l
, i
->getSrc(c
), zero
);
966 // add dPdx from lane l to lanes dx
967 for (c
= 0; c
< dim
; ++c
)
968 bld
.mkQuadop(qOps
[l
][0], crd
[c
], l
, i
->dPdx
[c
].get(), crd
[c
]);
969 // add dPdy from lane l to lanes dy
970 for (c
= 0; c
< dim
; ++c
)
971 bld
.mkQuadop(qOps
[l
][1], crd
[c
], l
, i
->dPdy
[c
].get(), crd
[c
]);
972 // normalize cube coordinates if necessary
973 if (i
->tex
.target
.isCube()) {
974 for (c
= 0; c
< 3; ++c
)
975 src
[c
] = bld
.mkOp1v(OP_ABS
, TYPE_F32
, bld
.getSSA(), crd
[c
]);
976 val
= bld
.getScratch();
977 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[0], src
[1]);
978 bld
.mkOp2(OP_MAX
, TYPE_F32
, val
, src
[2], val
);
979 bld
.mkOp1(OP_RCP
, TYPE_F32
, val
, val
);
980 for (c
= 0; c
< 3; ++c
)
981 src
[c
] = bld
.mkOp2v(OP_MUL
, TYPE_F32
, bld
.getSSA(), crd
[c
], val
);
983 for (c
= 0; c
< dim
; ++c
)
987 bld
.insert(tex
= cloneForward(func
, i
));
988 for (c
= 0; c
< dim
; ++c
)
989 tex
->setSrc(c
, src
[c
]);
991 for (c
= 0; i
->defExists(c
); ++c
) {
993 def
[c
][l
] = bld
.getSSA();
994 mov
= bld
.mkMov(def
[c
][l
], tex
->getDef(c
));
999 bld
.mkOp(OP_QUADPOP
, TYPE_NONE
, NULL
);
1001 for (c
= 0; i
->defExists(c
); ++c
) {
1002 Instruction
*u
= bld
.mkOp(OP_UNION
, TYPE_U32
, i
->getDef(c
));
1003 for (l
= 0; l
< 4; ++l
)
1004 u
->setSrc(l
, def
[c
][l
]);
1012 NV50LoweringPreSSA::handleTXLQ(TexInstruction
*i
)
1015 bld
.setPosition(i
, true);
1017 /* The returned values are not quite what we want:
1018 * (a) convert from s32 to f32
1019 * (b) multiply by 1/256
1021 for (int def
= 0; def
< 2; ++def
) {
1022 if (!i
->defExists(def
))
1024 bld
.mkCvt(OP_CVT
, TYPE_F32
, i
->getDef(def
), TYPE_S32
, i
->getDef(def
));
1025 bld
.mkOp2(OP_MUL
, TYPE_F32
, i
->getDef(def
),
1026 i
->getDef(def
), bld
.loadImm(NULL
, 1.0f
/ 256));
1032 NV50LoweringPreSSA::handleTXQ(TexInstruction
*i
)
1034 Value
*ms
, *ms_x
, *ms_y
;
1035 if (i
->tex
.query
== TXQ_DIMS
)
1037 assert(i
->tex
.query
== TXQ_TYPE
);
1038 assert(i
->tex
.mask
== 4);
1040 loadTexMsInfo(i
->tex
.r
* 4 * 2, &ms
, &ms_x
, &ms_y
);
1041 bld
.mkOp2(OP_SHL
, TYPE_U32
, i
->getDef(0), bld
.loadImm(NULL
, 1), ms
);
1049 NV50LoweringPreSSA::handleSET(Instruction
*i
)
1051 if (i
->dType
== TYPE_F32
) {
1052 bld
.setPosition(i
, true);
1053 i
->dType
= TYPE_U32
;
1054 bld
.mkOp1(OP_ABS
, TYPE_S32
, i
->getDef(0), i
->getDef(0));
1055 bld
.mkCvt(OP_CVT
, TYPE_F32
, i
->getDef(0), TYPE_S32
, i
->getDef(0));
1061 NV50LoweringPreSSA::handleSLCT(CmpInstruction
*i
)
1063 Value
*src0
= bld
.getSSA();
1064 Value
*src1
= bld
.getSSA();
1065 Value
*pred
= bld
.getScratch(1, FILE_FLAGS
);
1067 Value
*v0
= i
->getSrc(0);
1068 Value
*v1
= i
->getSrc(1);
1069 // XXX: these probably shouldn't be immediates in the first place ...
1071 v0
= bld
.mkMov(bld
.getSSA(), v0
)->getDef(0);
1073 v1
= bld
.mkMov(bld
.getSSA(), v1
)->getDef(0);
1075 bld
.setPosition(i
, true);
1076 bld
.mkMov(src0
, v0
)->setPredicate(CC_NE
, pred
);
1077 bld
.mkMov(src1
, v1
)->setPredicate(CC_EQ
, pred
);
1078 bld
.mkOp2(OP_UNION
, i
->dType
, i
->getDef(0), src0
, src1
);
1080 bld
.setPosition(i
, false);
1082 i
->setFlagsDef(0, pred
);
1084 i
->setSrc(0, i
->getSrc(2));
1086 i
->setSrc(1, bld
.loadImm(NULL
, 0));
1092 NV50LoweringPreSSA::handleSELP(Instruction
*i
)
1094 Value
*src0
= bld
.getSSA();
1095 Value
*src1
= bld
.getSSA();
1097 Value
*v0
= i
->getSrc(0);
1098 Value
*v1
= i
->getSrc(1);
1100 v0
= bld
.mkMov(bld
.getSSA(), v0
)->getDef(0);
1102 v1
= bld
.mkMov(bld
.getSSA(), v1
)->getDef(0);
1104 bld
.mkMov(src0
, v0
)->setPredicate(CC_NE
, i
->getSrc(2));
1105 bld
.mkMov(src1
, v1
)->setPredicate(CC_EQ
, i
->getSrc(2));
1106 bld
.mkOp2(OP_UNION
, i
->dType
, i
->getDef(0), src0
, src1
);
1107 delete_Instruction(prog
, i
);
1112 NV50LoweringPreSSA::handleWRSV(Instruction
*i
)
1114 Symbol
*sym
= i
->getSrc(0)->asSym();
1116 // these are all shader outputs, $sreg are not writeable
1117 uint32_t addr
= targ
->getSVAddress(FILE_SHADER_OUTPUT
, sym
);
1120 sym
= bld
.mkSymbol(FILE_SHADER_OUTPUT
, 0, i
->sType
, addr
);
1122 bld
.mkStore(OP_EXPORT
, i
->dType
, sym
, i
->getIndirect(0, 0), i
->getSrc(1));
1124 bld
.getBB()->remove(i
);
1129 NV50LoweringPreSSA::handleCALL(Instruction
*i
)
1131 if (prog
->getType() == Program::TYPE_COMPUTE
) {
1132 // Add implicit "thread id" argument in $r0 to the function
1133 i
->setSrc(i
->srcCount(), tid
);
1139 NV50LoweringPreSSA::handlePRECONT(Instruction
*i
)
1141 delete_Instruction(prog
, i
);
1146 NV50LoweringPreSSA::handleCONT(Instruction
*i
)
1153 NV50LoweringPreSSA::handleRDSV(Instruction
*i
)
1155 Symbol
*sym
= i
->getSrc(0)->asSym();
1156 uint32_t addr
= targ
->getSVAddress(FILE_SHADER_INPUT
, sym
);
1157 Value
*def
= i
->getDef(0);
1158 SVSemantic sv
= sym
->reg
.data
.sv
.sv
;
1159 int idx
= sym
->reg
.data
.sv
.index
;
1161 if (addr
>= 0x400) // mov $sreg
1166 assert(prog
->getType() == Program::TYPE_FRAGMENT
);
1167 bld
.mkInterp(NV50_IR_INTERP_LINEAR
, i
->getDef(0), addr
, NULL
);
1170 bld
.mkInterp(NV50_IR_INTERP_FLAT
, def
, addr
, NULL
);
1171 if (i
->dType
== TYPE_F32
) {
1172 bld
.mkOp2(OP_OR
, TYPE_U32
, def
, def
, bld
.mkImm(0x00000001));
1173 bld
.mkOp1(OP_NEG
, TYPE_S32
, def
, def
);
1174 bld
.mkCvt(OP_CVT
, TYPE_F32
, def
, TYPE_S32
, def
);
1180 if ((sv
== SV_NCTAID
&& idx
>= 2) ||
1181 (sv
== SV_NTID
&& idx
>= 3)) {
1182 bld
.mkMov(def
, bld
.mkImm(1));
1183 } else if (sv
== SV_CTAID
&& idx
>= 2) {
1184 bld
.mkMov(def
, bld
.mkImm(0));
1186 Value
*x
= bld
.getSSA(2);
1187 bld
.mkOp1(OP_LOAD
, TYPE_U16
, x
,
1188 bld
.mkSymbol(FILE_MEMORY_SHARED
, 0, TYPE_U16
, addr
));
1189 bld
.mkCvt(OP_CVT
, TYPE_U32
, def
, TYPE_U16
, x
);
1194 bld
.mkOp2(OP_AND
, TYPE_U32
, def
, tid
, bld
.mkImm(0x0000ffff));
1195 } else if (idx
== 1) {
1196 bld
.mkOp2(OP_AND
, TYPE_U32
, def
, tid
, bld
.mkImm(0x03ff0000));
1197 bld
.mkOp2(OP_SHR
, TYPE_U32
, def
, def
, bld
.mkImm(16));
1198 } else if (idx
== 2) {
1199 bld
.mkOp2(OP_SHR
, TYPE_U32
, def
, tid
, bld
.mkImm(26));
1201 bld
.mkMov(def
, bld
.mkImm(0));
1204 case SV_SAMPLE_POS
: {
1205 Value
*off
= new_LValue(func
, FILE_ADDRESS
);
1206 bld
.mkOp1(OP_RDSV
, TYPE_U32
, def
, bld
.mkSysVal(SV_SAMPLE_INDEX
, 0));
1207 bld
.mkOp2(OP_SHL
, TYPE_U32
, off
, def
, bld
.mkImm(3));
1208 bld
.mkLoad(TYPE_F32
,
1211 FILE_MEMORY_CONST
, prog
->driver
->io
.auxCBSlot
,
1212 TYPE_U32
, prog
->driver
->io
.sampleInfoBase
+ 4 * idx
),
1217 bld
.mkFetch(i
->getDef(0), i
->dType
,
1218 FILE_SHADER_INPUT
, addr
, i
->getIndirect(0, 0), NULL
);
1221 bld
.getBB()->remove(i
);
1226 NV50LoweringPreSSA::handleDIV(Instruction
*i
)
1228 if (!isFloatType(i
->dType
))
1230 bld
.setPosition(i
, false);
1231 Instruction
*rcp
= bld
.mkOp1(OP_RCP
, i
->dType
, bld
.getSSA(), i
->getSrc(1));
1233 i
->setSrc(1, rcp
->getDef(0));
1238 NV50LoweringPreSSA::handleSQRT(Instruction
*i
)
1240 bld
.setPosition(i
, true);
1242 bld
.mkOp1(OP_RCP
, i
->dType
, i
->getDef(0), i
->getDef(0));
1248 NV50LoweringPreSSA::handlePOW(Instruction
*i
)
1250 LValue
*val
= bld
.getScratch();
1252 bld
.mkOp1(OP_LG2
, TYPE_F32
, val
, i
->getSrc(0));
1253 bld
.mkOp2(OP_MUL
, TYPE_F32
, val
, i
->getSrc(1), val
)->dnz
= 1;
1254 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, val
, val
);
1264 NV50LoweringPreSSA::handleEXPORT(Instruction
*i
)
1266 if (prog
->getType() == Program::TYPE_FRAGMENT
) {
1267 if (i
->getIndirect(0, 0)) {
1268 // TODO: redirect to l[] here, load to GPRs at exit
1271 int id
= i
->getSrc(0)->reg
.data
.offset
/ 4; // in 32 bit reg units
1274 i
->subOp
= NV50_IR_SUBOP_MOV_FINAL
;
1275 i
->src(0).set(i
->src(1));
1277 i
->setDef(0, new_LValue(func
, FILE_GPR
));
1278 i
->getDef(0)->reg
.data
.id
= id
;
1280 prog
->maxGPR
= MAX2(prog
->maxGPR
, id
* 2);
1286 // Handle indirect addressing in geometry shaders:
1288 // ld $r0 a[$a1][$a2+k] ->
1289 // ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
1292 NV50LoweringPreSSA::handleLOAD(Instruction
*i
)
1294 ValueRef src
= i
->src(0);
1296 if (src
.isIndirect(1)) {
1297 assert(prog
->getType() == Program::TYPE_GEOMETRY
);
1298 Value
*addr
= i
->getIndirect(0, 1);
1300 if (src
.isIndirect(0)) {
1301 // base address is in an address register, so move to a GPR
1302 Value
*base
= bld
.getScratch();
1303 bld
.mkMov(base
, addr
);
1305 Symbol
*sv
= bld
.mkSysVal(SV_VERTEX_STRIDE
, 0);
1306 Value
*vstride
= bld
.mkOp1v(OP_RDSV
, TYPE_U32
, bld
.getSSA(), sv
);
1307 Value
*attrib
= bld
.mkOp2v(OP_SHL
, TYPE_U32
, bld
.getSSA(),
1308 i
->getIndirect(0, 0), bld
.mkImm(2));
1310 // Calculate final address: addr = base + attr*vstride; use 16-bit
1311 // multiplication since 32-bit would be lowered to multiple
1312 // instructions, and we only need the low 16 bits of the result
1314 bld
.mkSplit(a
, 2, attrib
);
1315 bld
.mkSplit(b
, 2, vstride
);
1316 Value
*sum
= bld
.mkOp3v(OP_MAD
, TYPE_U16
, bld
.getSSA(), a
[0], b
[0],
1319 // move address from GPR into an address register
1320 addr
= bld
.getSSA(2, FILE_ADDRESS
);
1321 bld
.mkMov(addr
, sum
);
1324 i
->setIndirect(0, 1, NULL
);
1325 i
->setIndirect(0, 0, addr
);
1332 NV50LoweringPreSSA::handlePFETCH(Instruction
*i
)
1334 assert(prog
->getType() == Program::TYPE_GEOMETRY
);
1336 // NOTE: cannot use getImmediate here, not in SSA form yet, move to
1337 // later phase if that assertion ever triggers:
1339 ImmediateValue
*imm
= i
->getSrc(0)->asImm();
1342 assert(imm
->reg
.data
.u32
<= 127); // TODO: use address reg if that happens
1344 if (i
->srcExists(1)) {
1345 // indirect addressing of vertex in primitive space
1347 LValue
*val
= bld
.getScratch();
1348 Value
*ptr
= bld
.getSSA(2, FILE_ADDRESS
);
1349 bld
.mkOp2v(OP_SHL
, TYPE_U32
, ptr
, i
->getSrc(1), bld
.mkImm(2));
1350 bld
.mkOp2v(OP_PFETCH
, TYPE_U32
, val
, imm
, ptr
);
1352 // NOTE: PFETCH directly to an $aX only works with direct addressing
1355 i
->setSrc(1, bld
.mkImm(0));
1361 // Set flags according to predicate and make the instruction read $cX.
1363 NV50LoweringPreSSA::checkPredicate(Instruction
*insn
)
1365 Value
*pred
= insn
->getPredicate();
1368 // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
1370 pred
->reg
.file
== FILE_FLAGS
|| pred
->reg
.file
== FILE_PREDICATE
)
1373 cdst
= bld
.getSSA(1, FILE_FLAGS
);
1375 bld
.mkCmp(OP_SET
, CC_NEU
, insn
->dType
, cdst
, insn
->dType
, bld
.loadImm(NULL
, 0), pred
);
1377 insn
->setPredicate(insn
->cc
, cdst
);
1381 // - add quadop dance for texturing
1382 // - put FP outputs in GPRs
1383 // - convert instruction sequences
1386 NV50LoweringPreSSA::visit(Instruction
*i
)
1388 bld
.setPosition(i
, false);
1390 if (i
->cc
!= CC_ALWAYS
)
1397 return handleTEX(i
->asTex());
1399 return handleTXB(i
->asTex());
1401 return handleTXL(i
->asTex());
1403 return handleTXD(i
->asTex());
1405 return handleTXLQ(i
->asTex());
1407 return handleTXQ(i
->asTex());
1409 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, i
->getDef(0), i
->getSrc(0));
1410 i
->setSrc(0, i
->getDef(0));
1413 return handleSET(i
);
1415 return handleSLCT(i
->asCmp());
1417 return handleSELP(i
);
1419 return handlePOW(i
);
1421 return handleDIV(i
);
1423 return handleSQRT(i
);
1425 return handleEXPORT(i
);
1427 return handleLOAD(i
);
1429 return handleRDSV(i
);
1431 return handleWRSV(i
);
1433 return handleCALL(i
);
1435 return handlePRECONT(i
);
1437 return handleCONT(i
);
1439 return handlePFETCH(i
);
1447 TargetNV50::runLegalizePass(Program
*prog
, CGStage stage
) const
1451 if (stage
== CG_STAGE_PRE_SSA
) {
1452 NV50LoweringPreSSA
pass(prog
);
1453 ret
= pass
.run(prog
, false, true);
1455 if (stage
== CG_STAGE_SSA
) {
1456 if (!prog
->targetPriv
)
1457 prog
->targetPriv
= new std::list
<Instruction
*>();
1458 NV50LegalizeSSA
pass(prog
);
1459 ret
= pass
.run(prog
, false, true);
1461 if (stage
== CG_STAGE_POST_RA
) {
1462 NV50LegalizePostRA pass
;
1463 ret
= pass
.run(prog
, false, true);
1464 if (prog
->targetPriv
)
1465 delete reinterpret_cast<std::list
<Instruction
*> *>(prog
->targetPriv
);
1470 } // namespace nv50_ir