2 * Copyright 2011 Christoph Bumiller
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_build_util.h"
26 #include "codegen/nv50_ir_target_nv50.h"
30 // nv50 doesn't support 32 bit integer multiplication; it is built from 16 bit halves
32 // ah al * bh bl = LO32: ((al * bh + ah * bl) << 16) + (al * bl)
33 // -------------------
34 // al*bh 00 HI32: ((al * bh + ah * bl) >> 16) + (ah * bh) +
35 // ah*bh 00 00 (carry1 << 16) + carry2
39 // fffe0001 + fffe0001 (two maximal cross products, ffff * ffff each, overflow 32 bits: carry1)
// Expand an integer MUL into a sequence that works on half-width operands.
//
// bld: builder used to emit the replacement; its position is set from mul.
// mul: the multiply to expand. mul->sType is the full type and selects the
//      half type (S32->S16, U32->U16, U64->U32, S64->S32). mul is deleted
//      once the replacement has been emitted.
//
// Both sources are split into halves (a[], b[]) and the partial products are
// combined with MUL/MAD/SHL into t[3]. highResult is true when mul->subOp is
// NV50_IR_SUBOP_MUL_HIGH.
// NOTE(review): declarations, the switch header and the branching on
// highResult are not visible in this excerpt; the SHR/ADD/UNION/carry sequence
// presumably runs only for highResult and the final mov only otherwise -
// confirm against the full file.
41 expandIntegerMUL(BuildUtil
*bld
, Instruction
*mul
)
43 const bool highResult
= mul
->subOp
== NV50_IR_SUBOP_MUL_HIGH
;
45 DataType fTy
= mul
->sType
; // full type
48 case TYPE_S32
: hTy
= TYPE_S16
; break;
49 case TYPE_U32
: hTy
= TYPE_U16
; break;
50 case TYPE_U64
: hTy
= TYPE_U32
; break;
51 case TYPE_S64
: hTy
= TYPE_S32
; break;
55 unsigned int fullSize
= typeSizeof(fTy
);
56 unsigned int halfSize
= typeSizeof(hTy
);
60 bld
->setPosition(mul
, true);
// temporaries for the partial results, each of the full operand size
65 for (int j
= 0; j
< 4; ++j
)
66 t
[j
] = bld
->getSSA(fullSize
);
68 // split sources into halves
69 i
[0] = bld
->mkSplit(a
, halfSize
, mul
->getSrc(0));
70 i
[1] = bld
->mkSplit(b
, halfSize
, mul
->getSrc(1));
// t[0] = a[0] * b[1]; t[1] = a[1] * b[0] + t[0];
// t[2] = t[1] << (half width in bits); t[3] = a[0] * b[0] + t[2]
72 i
[2] = bld
->mkOp2(OP_MUL
, fTy
, t
[0], a
[0], b
[1]);
73 i
[3] = bld
->mkOp3(OP_MAD
, fTy
, t
[1], a
[1], b
[0], t
[0]);
74 i
[7] = bld
->mkOp2(OP_SHL
, fTy
, t
[2], t
[1], bld
->mkImm(halfSize
* 8));
75 i
[4] = bld
->mkOp3(OP_MAD
, fTy
, t
[3], a
[0], b
[0], t
[2]);
// imm is 1 shifted left by the half width; it is the value added by i[6]
79 Value
*imm
= bld
->loadImm(NULL
, 1 << (halfSize
* 8));
80 c
[0] = bld
->getSSA(1, FILE_FLAGS
);
81 c
[1] = bld
->getSSA(1, FILE_FLAGS
);
82 for (int j
= 0; j
< 3; ++j
)
83 r
[j
] = bld
->getSSA(fullSize
);
// r[0] = t[1] >> half width; r[1] = r[0] + imm (predicated on carry c[0]);
// r[2] = union(r[1], r[0]); def = a[1] * b[1] + r[2]
85 i
[8] = bld
->mkOp2(OP_SHR
, fTy
, r
[0], t
[1], bld
->mkImm(halfSize
* 8));
86 i
[6] = bld
->mkOp2(OP_ADD
, fTy
, r
[1], r
[0], imm
);
87 bld
->mkOp2(OP_UNION
, TYPE_U32
, r
[2], r
[1], r
[0]);
88 i
[5] = bld
->mkOp3(OP_MAD
, fTy
, mul
->getDef(0), a
[1], b
[1], r
[2]);
90 // set carry defs / sources
91 i
[3]->setFlagsDef(1, c
[0]);
92 i
[4]->setFlagsDef(0, c
[1]); // actual result not required, just the carry
93 i
[6]->setPredicate(CC_C
, c
[0]);
94 i
[5]->setFlagsSrc(3, c
[1]);
96 bld
->mkMov(mul
->getDef(0), t
[3]);
98 delete_Instruction(bld
->getProgram(), mul
);
// post-process i[2]..i[4], plus i[5] when the high result was requested
// NOTE(review): the loop body is not visible in this excerpt
100 for (int j
= 2; j
<= (highResult
? 5 : 4); ++j
)
// Combine four QOP_* codes (named by their suffix) into a single value:
// q is shifted left by 6, r by 4, s by 2 and t by 0.
// NOTE(review): this assumes every QOP_* code fits in two bits - confirm
// against the QOP_* definitions.
113 #define QUADOP(q, r, s, t) \
114 ((QOP_##q << 6) | (QOP_##r << 4) | \
115 (QOP_##s << 2) | (QOP_##t << 0))
// Legalization pass that runLegalizePass() runs for CG_STAGE_POST_RA.
// Declares visitors for functions and basic blocks plus the helpers
// handlePRERET() and replaceZero().
// NOTE(review): access specifiers and data members (e.g. r63, used by the
// member functions) are not visible in this excerpt.
117 class NV50LegalizePostRA
: public Pass
120 virtual bool visit(Function
*);
121 virtual bool visit(BasicBlock
*);
123 void handlePRERET(FlowInstruction
*);
124 void replaceZero(Instruction
*);
// Per-function setup for the post-RA pass.
//
// fn: function being visited.
//
// Creates r63, a GPR LValue whose register id is forced to 63, then walks the
// instruction list stored in prog->targetPriv: for every entry, the
// instruction defining the entry's source 1 is redirected to write the entry's
// source 0 directly.
// NOTE(review): any null check on the list and the statements after the loop
// are not visible in this excerpt.
130 NV50LegalizePostRA::visit(Function
*fn
)
132 Program
*prog
= fn
->getProgram();
134 r63
= new_LValue(fn
, FILE_GPR
);
135 r63
->reg
.data
.id
= 63;
137 // this is actually per-program, but we can do it all on visiting main()
138 std::list
<Instruction
*> *outWrites
=
139 reinterpret_cast<std::list
<Instruction
*> *>(prog
->targetPriv
);
142 for (std::list
<Instruction
*>::iterator it
= outWrites
->begin();
143 it
!= outWrites
->end(); ++it
)
144 (*it
)->getSrc(1)->defs
.front()->getInsn()->setDef(0, (*it
)->getSrc(0));
145 // instructions will be deleted on exit
// Walk every existing source of i and look for immediates whose 64 bit
// payload (reg.data.u64) is 0.
// NOTE(review): the statement executed for such a source is not visible in
// this excerpt; presumably the source is replaced by r63 - confirm.
153 NV50LegalizePostRA::replaceZero(Instruction
*i
)
155 for (int s
= 0; i
->srcExists(s
); ++s
) {
156 ImmediateValue
*imm
= i
->getSrc(s
)->asImm();
157 if (imm
&& imm
->reg
.data
.u64
== 0)
162 // Emulate PRERET: jump to the target and call to the origin from there
164 // WARNING: at the moment this only works if each BB is affected by at most one PRERET
173 // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
176 // bra BB:3 + n1 (skip the call)
177 // call BB:0 + n2 (skip bra at beginning of BB:0)
// Emulate a PRERET with three flow instructions.
//
// pre: the PRERET being emulated; bbE is the block containing it and bbT is
//      its target block.
//
// pre gets sub-op NV50_IR_SUBOP_EMU_PRERET + 0 and is inserted at the head of
// bbE. Two new OP_PRERET flow instructions are inserted at the head of bbT:
// 'skip' (targets bbT, sub-op + 1) ends up first, followed by 'call' (targets
// bbE, sub-op + 2), because 'call' is inserted before 'skip'.
180 NV50LegalizePostRA::handlePRERET(FlowInstruction
*pre
)
182 BasicBlock
*bbE
= pre
->bb
;
183 BasicBlock
*bbT
= pre
->target
.bb
;
185 pre
->subOp
= NV50_IR_SUBOP_EMU_PRERET
+ 0;
187 bbE
->insertHead(pre
);
189 Instruction
*skip
= new_FlowInstruction(func
, OP_PRERET
, bbT
);
190 Instruction
*call
= new_FlowInstruction(func
, OP_PRERET
, bbE
);
192 bbT
->insertHead(call
);
193 bbT
->insertHead(skip
);
195 // NOTE: maybe split blocks to prevent the instructions from moving ?
197 skip
->subOp
= NV50_IR_SUBOP_EMU_PRERET
+ 1;
198 call
->subOp
= NV50_IR_SUBOP_EMU_PRERET
+ 2;
// Per-block post-RA legalization.
//
// bb: block whose instructions are walked, starting at bb->getFirst() and
//     advancing through 'next'.
//
// Visible handling: an OP_PRERET on chipsets below 0xa0 goes to
// handlePRERET(); instructions whose dType is 8 bytes wide are split with
// BuildUtil::split64BitOpPostRA(func, i, r63, NULL); a final test selects
// instructions that are neither OP_MOV nor OP_PFETCH and whose def 0 (if any)
// is not in FILE_ADDRESS.
// NOTE(review): where 'next' is assigned, the removal of pseudo ops/no-ops and
// the action taken by the final test are not visible in this excerpt.
202 NV50LegalizePostRA::visit(BasicBlock
*bb
)
204 Instruction
*i
, *next
;
206 // remove pseudo operations and non-fixed no-ops, split 64 bit operations
207 for (i
= bb
->getFirst(); i
; i
= next
) {
212 if (i
->op
== OP_PRERET
&& prog
->getTarget()->getChipset() < 0xa0) {
213 handlePRERET(i
->asFlow());
215 // TODO: We will want to do this before register allocation,
216 // since we have to use a $c register for the carry flag.
217 if (typeSizeof(i
->dType
) == 8) {
218 Instruction
*hi
= BuildUtil::split64BitOpPostRA(func
, i
, r63
, NULL
);
223 if (i
->op
!= OP_MOV
&& i
->op
!= OP_PFETCH
&&
225 (!i
->defExists(0) || i
->def(0).getFile() != FILE_ADDRESS
))
// Legalization pass that runLegalizePass() runs for CG_STAGE_SSA.
// Declares a per-block visitor and helpers for output-write propagation,
// integer DIV/MOD/MUL lowering and address register definitions.
// outWrites points to a list of instructions that propagateWriteToOutput()
// appends to.
// NOTE(review): access specifiers and other members (e.g. the builder 'bld'
// used by the member functions) are not visible in this excerpt.
235 class NV50LegalizeSSA
: public Pass
238 NV50LegalizeSSA(Program
*);
240 virtual bool visit(BasicBlock
*bb
);
243 void propagateWriteToOutput(Instruction
*);
244 void handleDIV(Instruction
*);
245 void handleMOD(Instruction
*);
246 void handleMUL(Instruction
*);
247 void handleAddrDef(Instruction
*);
249 inline bool isARL(const Instruction
*) const;
253 std::list
<Instruction
*> *outWrites
;
// Bind the builder to prog. For optimization level 2 or higher on geometry or
// vertex programs, prog->targetPriv is reinterpreted as the instruction list.
// NOTE(review): the assignment target of the cast (presumably outWrites) and
// the alternative branch are not visible in this excerpt.
256 NV50LegalizeSSA::NV50LegalizeSSA(Program
*prog
)
258 bld
.setProgram(prog
);
260 if (prog
->optLevel
>= 2 &&
261 (prog
->getType() == Program::TYPE_GEOMETRY
||
262 prog
->getType() == Program::TYPE_VERTEX
))
264 reinterpret_cast<std::list
<Instruction
*> *>(prog
->targetPriv
);
// Record the store st in outWrites so that the instruction defining its value
// can be made to write the output directly later on.
//
// Checks visible here: source 0 of st is tested for being indirect and source
// 1 for having a reference count other than 1; the defining instruction di is
// tested for being a pseudo op, a texture op, having more than one def, and
// for having any immediate source.
// NOTE(review): the outcome of each check (presumably an early return) and
// the statements after the push_back are not visible in this excerpt.
270 NV50LegalizeSSA::propagateWriteToOutput(Instruction
*st
)
272 if (st
->src(0).isIndirect(0) || st
->getSrc(1)->refCount() != 1)
275 // check def instruction can store
276 Instruction
*di
= st
->getSrc(1)->defs
.front()->getInsn();
278 // TODO: move exports (if beneficial) in common opt pass
279 if (di
->isPseudo() || isTextureOp(di
->op
) || di
->defCount(0xff, true) > 1)
281 for (int s
= 0; di
->srcExists(s
); ++s
)
282 if (di
->src(s
).getFile() == FILE_IMMEDIATE
)
285 // We cannot set defs to non-lvalues before register allocation, so
286 // save & remove (to save registers) the exports and replace later.
287 outWrites
->push_back(st
);
// Tell whether i has the form SHL(GPR, 0): op must be OP_SHL, source 0 must be
// in FILE_GPR, source 1 must be an immediate, and the result is whether that
// immediate is the integer 0.
// NOTE(review): the return values of the two early tests are not visible in
// this excerpt (presumably false).
292 NV50LegalizeSSA::isARL(const Instruction
*i
) const
296 if (i
->op
!= OP_SHL
|| i
->src(0).getFile() != FILE_GPR
)
298 if (!i
->src(1).getImmediate(imm
))
300 return imm
.isInteger(0);
// Legalize an instruction that defines an address register value.
//
// i: instruction whose def 0 is in FILE_ADDRESS (see the caller's test).
//
// The def is shrunk to 2 bytes. The forms SHL(GPR, IMM) and ADD(ADDR, IMM)
// are recognised first. For other instructions, FILE_ADDRESS sources are
// turned into GPR sources; when the source is defined by an instruction for
// which isARL() holds, that instruction's source 0 is reused. Finally a new
// SHL by immediate 0 is emitted after i that writes i's original def, and i is
// redirected to define the new SHL's source 0.
// NOTE(review): the early returns for the valid forms and the non-isARL
// conversion path are not visible in this excerpt.
304 NV50LegalizeSSA::handleAddrDef(Instruction
*i
)
308 i
->getDef(0)->reg
.size
= 2; // $aX are only 16 bit
310 // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
311 if (i
->srcExists(1) && i
->src(1).getFile() == FILE_IMMEDIATE
) {
312 if (i
->op
== OP_SHL
&& i
->src(0).getFile() == FILE_GPR
)
314 if (i
->op
== OP_ADD
&& i
->src(0).getFile() == FILE_ADDRESS
)
318 // turn $a sources into $r sources (can't operate on $a)
319 for (int s
= 0; i
->srcExists(s
); ++s
) {
320 Value
*a
= i
->getSrc(s
);
322 if (a
->reg
.file
== FILE_ADDRESS
) {
323 if (a
->getInsn() && isARL(a
->getInsn())) {
324 i
->setSrc(s
, a
->getInsn()->getSrc(0));
326 bld
.setPosition(i
, false);
333 if (i
->op
== OP_SHL
&& i
->src(1).getFile() == FILE_IMMEDIATE
)
336 // turn result back into $a
337 bld
.setPosition(i
, true);
338 arl
= bld
.mkOp2(OP_SHL
, TYPE_U32
, i
->getDef(0), bld
.getSSA(), bld
.mkImm(0));
339 i
->setDef(0, arl
->getSrc(0));
// Lower an integer MUL or MAD through expandIntegerMUL().
//
// mul: instruction to lower. Float source types and source types of at most
//      2 bytes are tested first.
//
// The predicate and condition code of mul are saved and the predicate is
// cleared. For OP_MAD a separate OP_MUL of sources 0 and 1 is emitted (with a
// shallow clone of the def as its result and the MAD's subOp); the original
// instruction then reads that result as source 0, its former source 2 as
// source 1, and its sources from index 2 on are cleared. After the expansion
// the saved predicate is applied to the instruction now defining the original
// def.
// NOTE(review): the result of the initial type test (presumably an early
// return) and the op change of the former MAD are not visible in this excerpt.
343 NV50LegalizeSSA::handleMUL(Instruction
*mul
)
345 if (isFloatType(mul
->sType
) || typeSizeof(mul
->sType
) <= 2)
347 Value
*def
= mul
->getDef(0);
348 Value
*pred
= mul
->getPredicate();
349 CondCode cc
= mul
->cc
;
351 mul
->setPredicate(CC_ALWAYS
, NULL
);
353 if (mul
->op
== OP_MAD
) {
354 Instruction
*add
= mul
;
355 bld
.setPosition(add
, false);
356 Value
*res
= cloneShallow(func
, mul
->getDef(0));
357 mul
= bld
.mkOp2(OP_MUL
, add
->sType
, res
, add
->getSrc(0), add
->getSrc(1));
359 add
->setSrc(0, mul
->getDef(0));
360 add
->setSrc(1, add
->getSrc(2));
361 for (int s
= 2; add
->srcExists(s
); ++s
)
362 add
->setSrc(s
, NULL
);
363 mul
->subOp
= add
->subOp
;
366 expandIntegerMUL(&bld
, mul
);
368 def
->getInsn()->setPredicate(cc
, pred
);
371 // Use f32 division: first compute an approximate result, use it to reduce
372 // the dividend, which should then be representable as f32, divide the reduced
373 // dividend, and add the quotients.
// Lower a 32 bit integer division using f32 arithmetic.
//
// div: instruction to lower; its sType is tested against TYPE_U32/TYPE_S32.
//
// Steps visible here: both sources are converted to f32 (with ABS modifiers
// and separate ABS results a/b for signed types); bf becomes RCP(bf) with -2
// added as a U32; q0 = cvt(af * bf) rounded towards zero; the remainder
// a - q0 * b is converted to f32, multiplied by bf and converted back (qR);
// q = q0 + qR; then m = a - q * b is compared against b (CC_GE) into s.
// For signed types the sign is fixed up with an XOR of the original sources
// setting a flags value, a NEG predicated on CC_S and a MOV on CC_NS.
// NOTE(review): the early return for other types, the unsigned a/b
// assignment, and the final rewrite of div itself are not visible in this
// excerpt.
375 NV50LegalizeSSA::handleDIV(Instruction
*div
)
377 const DataType ty
= div
->sType
;
379 if (ty
!= TYPE_U32
&& ty
!= TYPE_S32
)
382 Value
*q
, *q0
, *qf
, *aR
, *aRf
, *qRf
, *qR
, *t
, *s
, *m
, *cond
;
384 bld
.setPosition(div
, false);
386 Value
*a
, *af
= bld
.getSSA();
387 Value
*b
, *bf
= bld
.getSSA();
389 bld
.mkCvt(OP_CVT
, TYPE_F32
, af
, ty
, div
->getSrc(0));
390 bld
.mkCvt(OP_CVT
, TYPE_F32
, bf
, ty
, div
->getSrc(1));
392 if (isSignedType(ty
)) {
393 af
->getInsn()->src(0).mod
= Modifier(NV50_IR_MOD_ABS
);
394 bf
->getInsn()->src(0).mod
= Modifier(NV50_IR_MOD_ABS
);
397 bld
.mkOp1(OP_ABS
, ty
, a
, div
->getSrc(0));
398 bld
.mkOp1(OP_ABS
, ty
, b
, div
->getSrc(1));
// reciprocal of the divisor, then an integer add of -2 on its bit pattern
404 bf
= bld
.mkOp1v(OP_RCP
, TYPE_F32
, bld
.getSSA(), bf
);
405 bf
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getSSA(), bf
, bld
.mkImm(-2));
407 bld
.mkOp2(OP_MUL
, TYPE_F32
, (qf
= bld
.getSSA()), af
, bf
)->rnd
= ROUND_Z
;
408 bld
.mkCvt(OP_CVT
, ty
, (q0
= bld
.getSSA()), TYPE_F32
, qf
)->rnd
= ROUND_Z
;
410 // get error of 1st result
411 expandIntegerMUL(&bld
,
412 bld
.mkOp2(OP_MUL
, TYPE_U32
, (t
= bld
.getSSA()), q0
, b
));
413 bld
.mkOp2(OP_SUB
, TYPE_U32
, (aRf
= bld
.getSSA()), a
, t
);
415 bld
.mkCvt(OP_CVT
, TYPE_F32
, (aR
= bld
.getSSA()), TYPE_U32
, aRf
);
417 bld
.mkOp2(OP_MUL
, TYPE_F32
, (qRf
= bld
.getSSA()), aR
, bf
)->rnd
= ROUND_Z
;
418 bld
.mkCvt(OP_CVT
, TYPE_U32
, (qR
= bld
.getSSA()), TYPE_F32
, qRf
)
420 bld
.mkOp2(OP_ADD
, ty
, (q
= bld
.getSSA()), q0
, qR
); // add quotients
422 // correction: if modulus >= divisor, add 1
423 expandIntegerMUL(&bld
,
424 bld
.mkOp2(OP_MUL
, TYPE_U32
, (t
= bld
.getSSA()), q
, b
));
425 bld
.mkOp2(OP_SUB
, TYPE_U32
, (m
= bld
.getSSA()), a
, t
);
426 bld
.mkCmp(OP_SET
, CC_GE
, TYPE_U32
, (s
= bld
.getSSA()), TYPE_U32
, m
, b
);
427 if (!isSignedType(ty
)) {
433 bld
.mkOp2(OP_SUB
, TYPE_U32
, (q
= bld
.getSSA()), t
, s
);
// the XOR result itself is discarded (NULL def); only its flags are kept
437 bld
.mkOp2(OP_XOR
, TYPE_U32
, NULL
, div
->getSrc(0), div
->getSrc(1))
438 ->setFlagsDef(0, (cond
= bld
.getSSA(1, FILE_FLAGS
)));
439 bld
.mkOp1(OP_NEG
, ty
, s
, q
)->setPredicate(CC_S
, cond
);
440 bld
.mkOp1(OP_MOV
, ty
, t
, q
)->setPredicate(CC_NS
, cond
);
// Lower a 32 bit integer modulo.
//
// mod: instruction to lower; its dType is tested against TYPE_U32/TYPE_S32.
//
// A quotient q = DIV(src0, src1) is emitted and immediately lowered with
// handleDIV(); then m = q * src1 is emitted and expanded through
// expandIntegerMUL().
// NOTE(review): the early return for other types and the final rewrite of mod
// (presumably src0 - m) are not visible in this excerpt.
449 NV50LegalizeSSA::handleMOD(Instruction
*mod
)
451 if (mod
->dType
!= TYPE_U32
&& mod
->dType
!= TYPE_S32
)
453 bld
.setPosition(mod
, false);
455 Value
*q
= bld
.getSSA();
456 Value
*m
= bld
.getSSA();
458 bld
.mkOp2(OP_DIV
, mod
->dType
, q
, mod
->getSrc(0), mod
->getSrc(1));
459 handleDIV(q
->getInsn());
// handleDIV() moved the builder position, so set it again
461 bld
.setPosition(mod
, false);
462 expandIntegerMUL(&bld
, bld
.mkOp2(OP_MUL
, TYPE_U32
, m
, q
, mod
->getSrc(1)));
// Per-block SSA legalization: walks the instructions of bb starting at
// bb->getEntry() (so PHIs are skipped, per the comment below) and advancing
// through 'next'. Calls propagateWriteToOutput() and tests whether def 0 is
// in FILE_ADDRESS.
// NOTE(review): the dispatch on the instruction's op and the action taken for
// address defs (presumably handleAddrDef) are not visible in this excerpt.
469 NV50LegalizeSSA::visit(BasicBlock
*bb
)
471 Instruction
*insn
, *next
;
472 // skipping PHIs (don't pass them to handleAddrDef) !
473 for (insn
= bb
->getEntry(); insn
; insn
= next
) {
479 propagateWriteToOutput(insn
);
495 if (insn
->defExists(0) && insn
->getDef(0)->reg
.file
== FILE_ADDRESS
)
// Lowering pass that runLegalizePass() runs for CG_STAGE_PRE_SSA.
// Declares visitors for instructions and functions, one handler per lowered
// operation, and checkPredicate(). targ is the (constant) target taken from
// the program in the constructor.
// NOTE(review): access specifiers and other members (e.g. 'bld' and 'tid',
// used by the member functions) are not visible in this excerpt.
501 class NV50LoweringPreSSA
: public Pass
504 NV50LoweringPreSSA(Program
*);
507 virtual bool visit(Instruction
*);
508 virtual bool visit(Function
*);
510 bool handleRDSV(Instruction
*);
511 bool handleWRSV(Instruction
*);
513 bool handleEXPORT(Instruction
*);
515 bool handleDIV(Instruction
*);
516 bool handleSQRT(Instruction
*);
517 bool handlePOW(Instruction
*);
519 bool handleSET(Instruction
*);
520 bool handleSLCT(CmpInstruction
*);
521 bool handleSELP(Instruction
*);
523 bool handleTEX(TexInstruction
*);
524 bool handleTXB(TexInstruction
*); // I really
525 bool handleTXL(TexInstruction
*); // hate
526 bool handleTXD(TexInstruction
*); // these 3
528 bool handleCALL(Instruction
*);
529 bool handlePRECONT(Instruction
*);
530 bool handleCONT(Instruction
*);
532 void checkPredicate(Instruction
*);
535 const Target
*const targ
;
// Store the program's target in targ, start with no thread id value
// (tid = NULL) and bind the builder to prog.
542 NV50LoweringPreSSA::NV50LoweringPreSSA(Program
*prog
) :
543 targ(prog
->getTarget()), tid(NULL
)
545 bld
.setProgram(prog
);
// Per-function setup for the pre-SSA lowering.
//
// f: function being visited.
//
// For compute programs a GPR LValue with register id 0 is appended to f->ins
// as an implicit argument, and a U32 mov of it into a scratch value is emitted
// in the root block; the mov's def is remembered in tid.
// NOTE(review): the return statement is not visible in this excerpt.
549 NV50LoweringPreSSA::visit(Function
*f
)
551 BasicBlock
*root
= BasicBlock::get(func
->cfg
.getRoot());
553 if (prog
->getType() == Program::TYPE_COMPUTE
) {
554 // Add implicit "thread id" argument in $r0 to the function
555 Value
*arg
= new_LValue(func
, FILE_GPR
);
556 arg
->reg
.data
.id
= 0;
557 f
->ins
.push_back(arg
);
559 bld
.setPosition(root
, false);
560 tid
= bld
.mkMov(bld
.getScratch(), arg
, TYPE_U32
)->getDef(0);
// Prepare the sources of a texture instruction.
//
// i: texture instruction; arg is the target's argument count, dref the index
//    of the depth reference and lod the index of the bias/lod source.
//
// Visible steps:
//  - for shadow targets with OP_TXB/OP_TXL, sources dref and lod are swapped;
//  - for array targets (except OP_TXF) the layer source (index arg - 1) is
//    converted from F32 to U32 and limited to 511 with OP_MIN;
//  - for cube array targets the first four sources go through OP_TEXPREP
//    (TEX_TARGET_CUBE_ARRAY, mask 0x7) producing three new coordinates, the
//    remaining sources are shifted down by one and the target becomes
//    TEX_TARGET_2D_ARRAY or its shadow variant;
//  - at most one set of texel offsets is asserted.
// NOTE(review): the sizing of the acube/a2d vectors and the return statement
// are not visible in this excerpt.
567 NV50LoweringPreSSA::handleTEX(TexInstruction
*i
)
569 const int arg
= i
->tex
.target
.getArgCount();
570 const int dref
= arg
;
571 const int lod
= i
->tex
.target
.isShadow() ? (arg
+ 1) : arg
;
573 // dref comes before bias/lod
574 if (i
->tex
.target
.isShadow())
575 if (i
->op
== OP_TXB
|| i
->op
== OP_TXL
)
576 i
->swapSources(dref
, lod
);
578 if (i
->tex
.target
.isArray()) {
579 if (i
->op
!= OP_TXF
) {
580 // array index must be converted to u32 (for TXF it's already an integer)
582 Value
*layer
= i
->getSrc(arg
- 1);
583 LValue
*src
= new_LValue(func
, FILE_GPR
);
584 bld
.mkCvt(OP_CVT
, TYPE_U32
, src
, TYPE_F32
, layer
);
585 bld
.mkOp2(OP_MIN
, TYPE_U32
, src
, src
, bld
.loadImm(NULL
, 511));
586 i
->setSrc(arg
- 1, src
);
588 if (i
->tex
.target
.isCube()) {
589 std::vector
<Value
*> acube
, a2d
;
593 for (c
= 0; c
< 4; ++c
)
594 acube
[c
] = i
->getSrc(c
);
596 for (c
= 0; c
< 3; ++c
)
597 a2d
[c
] = new_LValue(func
, FILE_GPR
);
600 bld
.mkTex(OP_TEXPREP
, TEX_TARGET_CUBE_ARRAY
, i
->tex
.r
, i
->tex
.s
,
601 a2d
, acube
)->asTex()->tex
.mask
= 0x7;
603 for (c
= 0; c
< 3; ++c
)
604 i
->setSrc(c
, a2d
[c
]);
// c is 3 here: move every following source down by one slot
606 for (; i
->srcExists(c
+ 1); ++c
)
607 i
->setSrc(c
, i
->getSrc(c
+ 1));
609 i
->tex
.target
= i
->tex
.target
.isShadow() ?
610 TEX_TARGET_2D_ARRAY_SHADOW
: TEX_TARGET_2D_ARRAY
;
614 // texel offsets are 3 immediate fields in the instruction,
615 // nv50 cannot do textureGatherOffsets
616 assert(i
->tex
.useOffsets
<= 1);
621 // Bias must be equal for all threads of a quad or lod calculation will fail.
623 // The lanes of a quad are grouped by the bit in the condition register they
624 // have set, which is selected by differing bias values.
625 // Move the input values for TEX into a new register set for each group and
626 // execute TEX only for a specific group.
627 // We always need to use 4 new registers for the inputs/outputs because the
628 // implicitly calculated derivatives must be correct.
630 // TODO: move to SSA phase so we can easily determine whether bias is constant
// Lower a TXB whose bias may differ between the lanes of a quad.
//
// i: texture instruction; the bias is the source at index getArgCount().
//
// Visible steps:
//  - bias->isUniform() is tested first;
//  - 'cond' is an OP_UNION whose source 0 is the immediate 1 and whose sources
//    1..3 are 'bit' values (1 << l), each moved under CC_EQ of a flags value
//    produced by an all-SUBR quadop of the bias against lane l;
//  - cond's result is converted U32 -> U8 into 'flags';
//  - i is cloned four times, clone l predicated on cc[l] with 'flags';
//  - results of clones 1..3 are copied (predicated) into shallow clones of
//    clone 0's defs, and each original def is rebuilt as an OP_UNION of the
//    four per-lane results;
//  - i is deleted.
// NOTE(review): the uniform-bias shortcut, the insertion of the clones and
// the return statement are not visible in this excerpt.
632 NV50LoweringPreSSA::handleTXB(TexInstruction
*i
)
634 const CondCode cc
[4] = { CC_EQU
, CC_S
, CC_C
, CC_O
};
638 Value
*bias
= i
->getSrc(i
->tex
.target
.getArgCount());
639 if (bias
->isUniform())
642 Instruction
*cond
= bld
.mkOp1(OP_UNION
, TYPE_U32
, bld
.getScratch(),
643 bld
.loadImm(NULL
, 1));
644 bld
.setPosition(cond
, false);
646 for (l
= 1; l
< 4; ++l
) {
647 const uint8_t qop
= QUADOP(SUBR
, SUBR
, SUBR
, SUBR
);
648 Value
*bit
= bld
.getSSA();
649 Value
*pred
= bld
.getScratch(1, FILE_FLAGS
);
650 Value
*imm
= bld
.loadImm(NULL
, (1 << l
));
651 bld
.mkQuadop(qop
, pred
, l
, bias
, bias
)->flagsDef
= 0;
652 bld
.mkMov(bit
, imm
)->setPredicate(CC_EQ
, pred
);
653 cond
->setSrc(l
, bit
);
655 Value
*flags
= bld
.getScratch(1, FILE_FLAGS
);
656 bld
.setPosition(cond
, true);
657 bld
.mkCvt(OP_CVT
, TYPE_U8
, flags
, TYPE_U32
, cond
->getDef(0));
660 for (l
= 0; l
< 4; ++l
) {
661 (tex
[l
] = cloneForward(func
, i
))->setPredicate(cc
[l
], flags
);
666 for (d
= 0; i
->defExists(d
); ++d
)
667 res
[0][d
] = tex
[0]->getDef(d
);
668 for (l
= 1; l
< 4; ++l
) {
669 for (d
= 0; tex
[l
]->defExists(d
); ++d
) {
670 res
[l
][d
] = cloneShallow(func
, res
[0][d
]);
671 bld
.mkMov(res
[l
][d
], tex
[l
]->getDef(d
))->setPredicate(cc
[l
], flags
);
675 for (d
= 0; i
->defExists(d
); ++d
) {
676 Instruction
*dst
= bld
.mkOp(OP_UNION
, TYPE_U32
, i
->getDef(d
));
677 for (l
= 0; l
< 4; ++l
)
678 dst
->setSrc(l
, res
[l
][d
]);
680 delete_Instruction(prog
, i
);
684 // LOD must be equal for all threads of a quad.
685 // Unlike with TXB, here we can just diverge since there's no LOD calculation
686 // that would require all 4 threads' sources to be set up properly.
// Lower a TXL whose lod may differ between the lanes of a quad by branching.
//
// i: texture instruction; the lod is the source at index getArgCount().
//
// Visible steps:
//  - lod->isUniform() is tested first;
//  - the block is split before i (texiBB) and after i (joinBB);
//  - an OP_JOINAT targeting joinBB is emitted in currBB and stored in
//    currBB->joinAt;
//  - for each lane l in 0..3 an all-SUBR quadop of the lod against lane l
//    defines a flags value, followed by a fixed OP_BRA to texiBB on CC_EQ;
//    currBB gets a FORWARD edge to texiBB and a TREE edge to a freshly
//    allocated laneBB;
//  - an OP_JOIN is emitted in joinBB.
// NOTE(review): the uniform-lod shortcut, how laneBB replaces currBB between
// iterations, and the return statement are not visible in this excerpt.
688 NV50LoweringPreSSA::handleTXL(TexInstruction
*i
)
691 Value
*lod
= i
->getSrc(i
->tex
.target
.getArgCount());
692 if (lod
->isUniform())
695 BasicBlock
*currBB
= i
->bb
;
696 BasicBlock
*texiBB
= i
->bb
->splitBefore(i
, false);
697 BasicBlock
*joinBB
= i
->bb
->splitAfter(i
);
699 bld
.setPosition(currBB
, true);
700 currBB
->joinAt
= bld
.mkFlow(OP_JOINAT
, joinBB
, CC_ALWAYS
, NULL
);
702 for (int l
= 0; l
<= 3; ++l
) {
703 const uint8_t qop
= QUADOP(SUBR
, SUBR
, SUBR
, SUBR
);
704 Value
*pred
= bld
.getScratch(1, FILE_FLAGS
);
705 bld
.setPosition(currBB
, true);
706 bld
.mkQuadop(qop
, pred
, l
, lod
, lod
)->flagsDef
= 0;
707 bld
.mkFlow(OP_BRA
, texiBB
, CC_EQ
, pred
)->fixed
= 1;
708 currBB
->cfg
.attach(&texiBB
->cfg
, Graph::Edge::FORWARD
);
710 BasicBlock
*laneBB
= new BasicBlock(func
);
711 currBB
->cfg
.attach(&laneBB
->cfg
, Graph::Edge::TREE
);
715 bld
.setPosition(joinBB
, false);
716 bld
.mkOp(OP_JOIN
, TYPE_NONE
, NULL
);
// Lower TXD (texture fetch with explicit derivatives) to four plain TEX
// operations, one per quad lane.
//
// i: texture instruction; its op is changed to OP_TEX up front.
//
// qOps[l][0] / qOps[l][1] hold the quad operations used to apply dPdx / dPdy
// for lane l. Between OP_QUADON and OP_QUADPOP, for each lane l:
//  - the coordinates of lane l are broadcast into crd[] (quadop 0x00 with a
//    zero second operand),
//  - dPdx and dPdy of lane l are applied to crd[] with the lane's qOps entries,
//  - a clone of i reading crd[] is inserted and each of its defs is copied to
//    def[c][l].
// Afterwards each original def is rebuilt as an OP_UNION of the four lanes.
// NOTE(review): any predication applied to 'mov', the removal of i and the
// return statement are not visible in this excerpt.
721 NV50LoweringPreSSA::handleTXD(TexInstruction
*i
)
723 static const uint8_t qOps
[4][2] =
725 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l0
726 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l1
727 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l2
728 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l3
733 Value
*zero
= bld
.loadImm(bld
.getSSA(), 0);
735 const int dim
= i
->tex
.target
.getDim();
738 i
->op
= OP_TEX
; // no need to clone dPdx/dPdy later
740 for (c
= 0; c
< dim
; ++c
)
741 crd
[c
] = bld
.getScratch();
743 bld
.mkOp(OP_QUADON
, TYPE_NONE
, NULL
);
744 for (l
= 0; l
< 4; ++l
) {
745 // mov coordinates from lane l to all lanes
746 for (c
= 0; c
< dim
; ++c
)
747 bld
.mkQuadop(0x00, crd
[c
], l
, i
->getSrc(c
), zero
);
748 // add dPdx from lane l to lanes dx
749 for (c
= 0; c
< dim
; ++c
)
750 bld
.mkQuadop(qOps
[l
][0], crd
[c
], l
, i
->dPdx
[c
].get(), crd
[c
]);
751 // add dPdy from lane l to lanes dy
752 for (c
= 0; c
< dim
; ++c
)
753 bld
.mkQuadop(qOps
[l
][1], crd
[c
], l
, i
->dPdy
[c
].get(), crd
[c
]);
755 bld
.insert(tex
= cloneForward(func
, i
));
756 for (c
= 0; c
< dim
; ++c
)
757 tex
->setSrc(c
, crd
[c
]);
759 for (c
= 0; i
->defExists(c
); ++c
) {
761 def
[c
][l
] = bld
.getSSA();
762 mov
= bld
.mkMov(def
[c
][l
], tex
->getDef(c
));
767 bld
.mkOp(OP_QUADPOP
, TYPE_NONE
, NULL
);
769 for (c
= 0; i
->defExists(c
); ++c
) {
770 Instruction
*u
= bld
.mkOp(OP_UNION
, TYPE_U32
, i
->getDef(c
));
771 for (l
= 0; l
< 4; ++l
)
772 u
->setSrc(l
, def
[c
][l
]);
// Post-process a SET whose destination type is F32: after i, an integer ABS
// (TYPE_S32) followed by a conversion S32 -> F32 is applied to def 0 in place.
// NOTE(review): a change of i->dType and the return statement are not visible
// in this excerpt; presumably the integer result of the set is turned into a
// float this way - confirm.
780 NV50LoweringPreSSA::handleSET(Instruction
*i
)
782 if (i
->dType
== TYPE_F32
) {
783 bld
.setPosition(i
, true);
785 bld
.mkOp1(OP_ABS
, TYPE_S32
, i
->getDef(0), i
->getDef(0));
786 bld
.mkCvt(OP_CVT
, TYPE_F32
, i
->getDef(0), TYPE_S32
, i
->getDef(0));
// Lower SLCT to a compare that defines a flags value plus two predicated movs
// merged by OP_UNION.
//
// i: compare instruction with the two candidate values in sources 0 and 1
//    and the value compared in source 2.
//
// After i: src0 <- v0 under CC_NE of pred, src1 <- v1 under CC_EQ of pred,
// and the original def becomes OP_UNION(src0, src1). i itself is changed to
// define pred as its flags def, comparing its former source 2 against the
// immediate 0.
// NOTE(review): the conditions guarding the two extra movs of v0/v1 (the XXX
// comment suggests immediates), any change of i's op/def and the return
// statement are not visible in this excerpt.
792 NV50LoweringPreSSA::handleSLCT(CmpInstruction
*i
)
794 Value
*src0
= bld
.getSSA();
795 Value
*src1
= bld
.getSSA();
796 Value
*pred
= bld
.getScratch(1, FILE_FLAGS
);
798 Value
*v0
= i
->getSrc(0);
799 Value
*v1
= i
->getSrc(1);
800 // XXX: these probably shouldn't be immediates in the first place ...
802 v0
= bld
.mkMov(bld
.getSSA(), v0
)->getDef(0);
804 v1
= bld
.mkMov(bld
.getSSA(), v1
)->getDef(0);
806 bld
.setPosition(i
, true);
807 bld
.mkMov(src0
, v0
)->setPredicate(CC_NE
, pred
);
808 bld
.mkMov(src1
, v1
)->setPredicate(CC_EQ
, pred
);
809 bld
.mkOp2(OP_UNION
, i
->dType
, i
->getDef(0), src0
, src1
);
811 bld
.setPosition(i
, false);
813 i
->setFlagsDef(0, pred
);
815 i
->setSrc(0, i
->getSrc(2));
817 i
->setSrc(1, bld
.loadImm(NULL
, 0));
// Lower SELP to two predicated movs merged by OP_UNION.
//
// i: instruction with the two candidate values in sources 0 and 1 and the
//    predicate in source 2.
//
// src0 <- v0 under CC_NE of source 2, src1 <- v1 under CC_EQ of source 2; the
// original def becomes OP_UNION(src0, src1) and i is deleted.
// NOTE(review): the conditions guarding the two extra movs of v0/v1 and the
// return statement are not visible in this excerpt.
823 NV50LoweringPreSSA::handleSELP(Instruction
*i
)
825 Value
*src0
= bld
.getSSA();
826 Value
*src1
= bld
.getSSA();
828 Value
*v0
= i
->getSrc(0);
829 Value
*v1
= i
->getSrc(1);
831 v0
= bld
.mkMov(bld
.getSSA(), v0
)->getDef(0);
833 v1
= bld
.mkMov(bld
.getSSA(), v1
)->getDef(0);
835 bld
.mkMov(src0
, v0
)->setPredicate(CC_NE
, i
->getSrc(2));
836 bld
.mkMov(src1
, v1
)->setPredicate(CC_EQ
, i
->getSrc(2));
837 bld
.mkOp2(OP_UNION
, i
->dType
, i
->getDef(0), src0
, src1
);
838 delete_Instruction(prog
, i
);
// Lower a system value write to an export.
//
// i: instruction whose source 0 is the system value symbol and whose source 1
//    is the value to write.
//
// The output address is obtained from targ->getSVAddress(FILE_SHADER_OUTPUT,
// sym); a new FILE_SHADER_OUTPUT symbol at that address is created and an
// OP_EXPORT store of source 1 (keeping i's indirect address) is emitted. i is
// then removed from the builder's current block.
// NOTE(review): any validity check on addr and the return statement are not
// visible in this excerpt.
843 NV50LoweringPreSSA::handleWRSV(Instruction
*i
)
845 Symbol
*sym
= i
->getSrc(0)->asSym();
847 // these are all shader outputs, $sreg are not writeable
848 uint32_t addr
= targ
->getSVAddress(FILE_SHADER_OUTPUT
, sym
);
851 sym
= bld
.mkSymbol(FILE_SHADER_OUTPUT
, 0, i
->sType
, addr
);
853 bld
.mkStore(OP_EXPORT
, i
->dType
, sym
, i
->getIndirect(0, 0), i
->getSrc(1));
855 bld
.getBB()->remove(i
);
// For compute programs, append the saved thread id value (tid) as an extra
// source of the call, after its existing sources.
// NOTE(review): the return statement is not visible in this excerpt.
860 NV50LoweringPreSSA::handleCALL(Instruction
*i
)
862 if (prog
->getType() == Program::TYPE_COMPUTE
) {
863 // Add implicit "thread id" argument in $r0 to the function
864 i
->setSrc(i
->srcCount(), tid
);
// Delete the PRECONT instruction i from the program.
// NOTE(review): the return statement is not visible in this excerpt.
870 NV50LoweringPreSSA::handlePRECONT(Instruction
*i
)
872 delete_Instruction(prog
, i
);
// Handler for the instructions that visit(Instruction *) forwards to
// handleCONT().
// NOTE(review): the body is not visible in this excerpt.
877 NV50LoweringPreSSA::handleCONT(Instruction
*i
)
// Lower a system value read.
//
// i: instruction whose source 0 is the system value symbol; sv and idx are
//    the symbol's semantic and index, addr its FILE_SHADER_INPUT address from
//    targ->getSVAddress().
//
// Lowerings visible here:
//  - addresses >= 0x400 are tested first (per the comment: mov from $sreg);
//  - a linear interpolation from addr (asserted to be a fragment program);
//  - a flat interpolation; for F32 results the value is then ANDed with
//    0x80000000 and XORed with 0xbf800000;
//  - SV_NCTAID with idx >= 2 or SV_NTID with idx >= 3 read as 1, SV_CTAID with
//    idx >= 2 reads as 0, otherwise a U16 load from FILE_MEMORY_SHARED at addr
//    widened to U32;
//  - components extracted from tid: tid & 0xffff, (tid & 0x03ff0000) >> 16 for
//    idx 1, tid >> 26 for idx 2, else 0;
//  - a generic fetch from FILE_SHADER_INPUT at addr.
// i is finally removed from the builder's current block.
// NOTE(review): the switch on sv that selects between these lowerings and the
// return statement are not visible in this excerpt.
884 NV50LoweringPreSSA::handleRDSV(Instruction
*i
)
886 Symbol
*sym
= i
->getSrc(0)->asSym();
887 uint32_t addr
= targ
->getSVAddress(FILE_SHADER_INPUT
, sym
);
888 Value
*def
= i
->getDef(0);
889 SVSemantic sv
= sym
->reg
.data
.sv
.sv
;
890 int idx
= sym
->reg
.data
.sv
.index
;
892 if (addr
>= 0x400) // mov $sreg
897 assert(prog
->getType() == Program::TYPE_FRAGMENT
);
898 bld
.mkInterp(NV50_IR_INTERP_LINEAR
, i
->getDef(0), addr
, NULL
);
901 bld
.mkInterp(NV50_IR_INTERP_FLAT
, def
, addr
, NULL
);
902 if (i
->dType
== TYPE_F32
) {
903 bld
.mkOp2(OP_AND
, TYPE_U32
, def
, def
, bld
.mkImm(0x80000000));
904 bld
.mkOp2(OP_XOR
, TYPE_U32
, def
, def
, bld
.mkImm(0xbf800000));
910 if ((sv
== SV_NCTAID
&& idx
>= 2) ||
911 (sv
== SV_NTID
&& idx
>= 3)) {
912 bld
.mkMov(def
, bld
.mkImm(1));
913 } else if (sv
== SV_CTAID
&& idx
>= 2) {
914 bld
.mkMov(def
, bld
.mkImm(0));
916 Value
*x
= bld
.getSSA(2);
917 bld
.mkOp1(OP_LOAD
, TYPE_U16
, x
,
918 bld
.mkSymbol(FILE_MEMORY_SHARED
, 0, TYPE_U16
, addr
));
919 bld
.mkCvt(OP_CVT
, TYPE_U32
, def
, TYPE_U16
, x
);
924 bld
.mkOp2(OP_AND
, TYPE_U32
, def
, tid
, bld
.mkImm(0x0000ffff));
925 } else if (idx
== 1) {
926 bld
.mkOp2(OP_AND
, TYPE_U32
, def
, tid
, bld
.mkImm(0x03ff0000));
927 bld
.mkOp2(OP_SHR
, TYPE_U32
, def
, def
, bld
.mkImm(16));
928 } else if (idx
== 2) {
929 bld
.mkOp2(OP_SHR
, TYPE_U32
, def
, tid
, bld
.mkImm(26));
931 bld
.mkMov(def
, bld
.mkImm(0));
935 bld
.mkFetch(i
->getDef(0), i
->dType
,
936 FILE_SHADER_INPUT
, addr
, i
->getIndirect(0, 0), NULL
);
939 bld
.getBB()->remove(i
);
// Rewrite a division to use the reciprocal of its divisor: an OP_RCP of
// source 1 is emitted and its result becomes the new source 1 of i.
// Non-float destination types are tested first.
// NOTE(review): the outcome of the type test (presumably an early return),
// the change of i's op (presumably to a multiply) and the return statement
// are not visible in this excerpt.
944 NV50LoweringPreSSA::handleDIV(Instruction
*i
)
946 if (!isFloatType(i
->dType
))
948 bld
.setPosition(i
, false);
949 Instruction
*rcp
= bld
.mkOp1(OP_RCP
, i
->dType
, bld
.getSSA(), i
->getSrc(1));
951 i
->setSrc(1, rcp
->getDef(0));
// Rewrite a square root using the reciprocal square root: an F32 OP_RSQ of
// source 0 is emitted and its result is set as source 1 of i.
// NOTE(review): the change of i's op (presumably to a multiply, giving
// x * rsq(x)) and the return statement are not visible in this excerpt.
956 NV50LoweringPreSSA::handleSQRT(Instruction
*i
)
958 Instruction
*rsq
= bld
.mkOp1(OP_RSQ
, TYPE_F32
,
959 bld
.getSSA(), i
->getSrc(0));
961 i
->setSrc(1, rsq
->getDef(0));
// Lower pow(x, y) through logarithm and exponent: val = LG2(src0), then
// val = src1 * val (with the dnz flag set on the multiply), then
// val = PREEX2(val), all in F32 on one scratch value.
// NOTE(review): the final rewrite of i (presumably into an EX2 of val) and
// the return statement are not visible in this excerpt.
967 NV50LoweringPreSSA::handlePOW(Instruction
*i
)
969 LValue
*val
= bld
.getScratch();
971 bld
.mkOp1(OP_LG2
, TYPE_F32
, val
, i
->getSrc(0));
972 bld
.mkOp2(OP_MUL
, TYPE_F32
, val
, i
->getSrc(1), val
)->dnz
= 1;
973 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, val
, val
);
// For fragment programs, turn an export into a write to a fixed GPR.
//
// i: export instruction; source 0 is the output symbol, source 1 the value.
//
// The register id is the symbol's byte offset divided by 4. i gets sub-op
// NV50_IR_SUBOP_MOV_FINAL, its source 0 is replaced by its source 1, and its
// def 0 becomes a new GPR LValue with that id; prog->maxGPR is raised to the
// id if needed.
// NOTE(review): the handling of indirect exports, the change of i's op, the
// clearing of source 1 and the return statement are not visible in this
// excerpt.
983 NV50LoweringPreSSA::handleEXPORT(Instruction
*i
)
985 if (prog
->getType() == Program::TYPE_FRAGMENT
) {
986 if (i
->getIndirect(0, 0)) {
987 // TODO: redirect to l[] here, load to GPRs at exit
990 int id
= i
->getSrc(0)->reg
.data
.offset
/ 4; // in 32 bit reg units
993 i
->subOp
= NV50_IR_SUBOP_MOV_FINAL
;
994 i
->src(0).set(i
->src(1));
996 i
->setDef(0, new_LValue(func
, FILE_GPR
));
997 i
->getDef(0)->reg
.data
.id
= id
;
999 prog
->maxGPR
= MAX2(prog
->maxGPR
, id
);
1005 // Set flags according to predicate and make the instruction read $cX.
// Make sure the predicate of insn is a flags value.
//
// insn: instruction whose predicate is inspected.
//
// A missing predicate or one already in FILE_FLAGS is tested first. Otherwise
// a new flags value cdst is defined by an OP_SET with CC_NEU comparing the
// immediate 0 with the old predicate (using insn->dType for both types), and
// insn is re-predicated on cdst with its existing condition code.
// NOTE(review): the outcome of the first test (presumably an early return)
// is not visible in this excerpt.
1007 NV50LoweringPreSSA::checkPredicate(Instruction
*insn
)
1009 Value
*pred
= insn
->getPredicate();
1012 if (!pred
|| pred
->reg
.file
== FILE_FLAGS
)
1014 cdst
= bld
.getSSA(1, FILE_FLAGS
);
1016 bld
.mkCmp(OP_SET
, CC_NEU
, insn
->dType
, cdst
, insn
->dType
, bld
.loadImm(NULL
, 0), pred
);
1018 insn
->setPredicate(insn
->cc
, cdst
);
1022 // - add quadop dance for texturing
1023 // - put FP outputs in GPRs
1024 // - convert instruction sequences
// Per-instruction dispatcher of the pre-SSA lowering.
//
// i: instruction to lower. The builder is positioned at i, the condition code
//    is tested against CC_ALWAYS, and the instruction is forwarded to the
//    matching handle*() function, whose result is returned. One case emits an
//    OP_PREEX2 of source 0 into def 0 and then makes i read its own def as
//    source 0.
// NOTE(review): the switch on i->op, its case labels and the action taken for
// cc != CC_ALWAYS (presumably checkPredicate) are not visible in this excerpt.
1027 NV50LoweringPreSSA::visit(Instruction
*i
)
1029 bld
.setPosition(i
, false);
1031 if (i
->cc
!= CC_ALWAYS
)
1038 return handleTEX(i
->asTex());
1040 return handleTXB(i
->asTex());
1042 return handleTXL(i
->asTex());
1044 return handleTXD(i
->asTex());
1046 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, i
->getDef(0), i
->getSrc(0));
1047 i
->setSrc(0, i
->getDef(0));
1050 return handleSET(i
);
1052 return handleSLCT(i
->asCmp());
1054 return handleSELP(i
);
1056 return handlePOW(i
);
1058 return handleDIV(i
);
1060 return handleSQRT(i
);
1062 return handleEXPORT(i
);
1064 return handleRDSV(i
);
1066 return handleWRSV(i
);
1068 return handleCALL(i
);
1070 return handlePRECONT(i
);
1072 return handleCONT(i
);
// Run the legalization/lowering pass that belongs to a code generation stage.
//
// prog:  program to process.
// stage: CG_STAGE_PRE_SSA runs NV50LoweringPreSSA; CG_STAGE_SSA allocates the
//        instruction list in prog->targetPriv if it is not set yet and runs
//        NV50LegalizeSSA; CG_STAGE_POST_RA runs NV50LegalizePostRA and then
//        deletes the list in prog->targetPriv if one exists.
//
// The result of the pass's run() call is stored in ret.
// NOTE(review): the declaration, initial value and return of ret are not
// visible in this excerpt.
1080 TargetNV50::runLegalizePass(Program
*prog
, CGStage stage
) const
1084 if (stage
== CG_STAGE_PRE_SSA
) {
1085 NV50LoweringPreSSA
pass(prog
);
1086 ret
= pass
.run(prog
, false, true);
1088 if (stage
== CG_STAGE_SSA
) {
1089 if (!prog
->targetPriv
)
1090 prog
->targetPriv
= new std::list
<Instruction
*>();
1091 NV50LegalizeSSA
pass(prog
);
1092 ret
= pass
.run(prog
, false, true);
1094 if (stage
== CG_STAGE_POST_RA
) {
1095 NV50LegalizePostRA pass
;
1096 ret
= pass
.run(prog
, false, true);
1097 if (prog
->targetPriv
)
1098 delete reinterpret_cast<std::list
<Instruction
*> *>(prog
->targetPriv
);
1103 } // namespace nv50_ir