2 * Copyright 2011 Christoph Bumiller
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 #include "nv50/codegen/nv50_ir.h"
24 #include "nv50/codegen/nv50_ir_build_util.h"
26 #include "nv50_ir_target_nv50.h"
30 // nv50 doesn't support 32 bit integer multiplication
// The product is therefore assembled from half-width partial products,
// with a = ah:al and b = bh:bl (shown below for 32 bit operands):
32 // ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
33 // -------------------
34 // al*bh 00 HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
35 // ah*bh 00 00 ( carry1) << 16 + ( carry2)
// carry1 is the carry out of the cross-term sum (al * bh + ah * bl),
// carry2 is the carry out of the LO32 addition.
39 // fffe0001 + fffe0001
// (0xffff * 0xffff = 0xfffe0001, so adding two cross terms can carry.)
//
// Expand the integer multiplication `mul` into operations on half-width
// values, emitted through `bld`. `mul` itself is deleted at the end.
// NOTE(review): several lines of this function (return type, switch header,
// some declarations, braces, return statements) are not visible in this
// extract; the comments below describe only the visible statements.
41 expandIntegerMUL(BuildUtil
*bld
, Instruction
*mul
)
// true when the high half of the product was requested via the subOp
43 const bool highResult
= mul
->subOp
== NV50_IR_SUBOP_MUL_HIGH
;
45 DataType fTy
= mul
->sType
; // full type
// pick the half-width type hTy matching the full type
48 case TYPE_S32
: hTy
= TYPE_S16
; break;
49 case TYPE_U32
: hTy
= TYPE_U16
; break;
50 case TYPE_U64
: hTy
= TYPE_U32
; break;
51 case TYPE_S64
: hTy
= TYPE_S32
; break;
55 unsigned int fullSize
= typeSizeof(fTy
);
56 unsigned int halfSize
= typeSizeof(hTy
);
// a[0]/a[1] and b[0]/b[1] receive the two halves of source 0 and source 1
60 Value
*a
[2] = { bld
->getSSA(halfSize
), bld
->getSSA(halfSize
) };
61 Value
*b
[2] = { bld
->getSSA(halfSize
), bld
->getSSA(halfSize
) };
// full-width temporaries for the partial sums
64 for (int j
= 0; j
< 4; ++j
)
65 t
[j
] = bld
->getSSA(fullSize
);
// split each source: def 0 goes to [0], def 1 goes to [1]
67 (i
[0] = bld
->mkOp1(OP_SPLIT
, fTy
, a
[0], mul
->getSrc(0)))->setDef(1, a
[1]);
68 (i
[1] = bld
->mkOp1(OP_SPLIT
, fTy
, b
[0], mul
->getSrc(1)))->setDef(1, b
[1]);
// t0 = a0 * b1
// t1 = a1 * b0 + t0          (sum of the cross terms)
// t2 = t1 << (halfSize * 8)
// t3 = a0 * b0 + t2          (low result)
70 i
[2] = bld
->mkOp2(OP_MUL
, fTy
, t
[0], a
[0], b
[1]);
71 i
[3] = bld
->mkOp3(OP_MAD
, fTy
, t
[1], a
[1], b
[0], t
[0]);
72 i
[7] = bld
->mkOp2(OP_SHL
, fTy
, t
[2], t
[1], bld
->mkImm(halfSize
* 8));
73 i
[4] = bld
->mkOp3(OP_MAD
, fTy
, t
[3], a
[0], b
[0], t
[2]);
// High-result computation.
// NOTE(review): presumably guarded by `if (highResult)`; the branch itself
// is not visible in this extract.
77 Value
*imm
= bld
->loadImm(NULL
, 1 << (halfSize
* 8));
// c[0], c[1]: flag values carrying carry1 and carry2
78 c
[0] = bld
->getSSA(1, FILE_FLAGS
);
79 c
[1] = bld
->getSSA(1, FILE_FLAGS
);
80 for (int j
= 0; j
< 3; ++j
)
81 r
[j
] = bld
->getSSA(fullSize
);
// r0 = t1 >> (halfSize * 8)
// r1 = r0 + (1 << (halfSize * 8))   (predicated on carry1, see below)
// r2 = union(r1, r0)
// def = a1 * b1 + r2                (reads carry2, see below)
83 i
[8] = bld
->mkOp2(OP_SHR
, fTy
, r
[0], t
[1], bld
->mkImm(halfSize
* 8));
84 i
[6] = bld
->mkOp2(OP_ADD
, fTy
, r
[1], r
[0], imm
);
85 bld
->mkOp2(OP_UNION
, TYPE_U32
, r
[2], r
[1], r
[0]);
86 i
[5] = bld
->mkOp3(OP_MAD
, fTy
, mul
->getDef(0), a
[1], b
[1], r
[2]);
88 // set carry defs / sources
89 i
[3]->setFlagsDef(1, c
[0]);
90 i
[4]->setFlagsDef(0, c
[1]); // actual result not required, just the carry
91 i
[6]->setPredicate(CC_C
, c
[0]);
92 i
[5]->setFlagsSrc(3, c
[1]);
// low-result case: the final value is t3
94 bld
->mkMov(mul
->getDef(0), t
[3]);
// the original multiplication is no longer needed
96 delete_Instruction(bld
->getProgram(), mul
);
// walk i[2]..i[5] (high result) or i[2]..i[4]; loop body not visible here
98 for (int j
= 2; j
<= (highResult
? 5 : 4); ++j
)
// Combine four QOP_<name> codes into one value: q at bit 0, r at bit 2,
// s at bit 4 and t at bit 6.
109 #define QUADOP(q, r, s, t) \
110 ((QOP_##q << 0) | (QOP_##r << 2) | \
111 (QOP_##s << 4) | (QOP_##t << 6))
// Pass derived from Pass; declares Function and BasicBlock visitors plus
// helpers for PRERET handling, zero replacement and 64 bit op splitting.
// NOTE(review): access specifiers and data members are not visible in this
// extract.
113 class NV50LegalizePostRA
: public Pass
116 virtual bool visit(Function
*);
117 virtual bool visit(BasicBlock
*);
119 void handlePRERET(FlowInstruction
*);
120 void replaceZero(Instruction
*);
121 void split64BitOp(Instruction
*);
// Per-function setup: creates the r63 value and rewrites the recorded
// output writes.
// NOTE(review): parts of this function are not visible in this extract.
127 NV50LegalizePostRA::visit(Function
*fn
)
129 Program
*prog
= fn
->getProgram();
// r63: a GPR value with register id 63
131 r63
= new_LValue(fn
, FILE_GPR
);
132 r63
->reg
.data
.id
= 63;
134 // this is actually per-program, but we can do it all on visiting main()
// the list of recorded stores is kept in prog->targetPriv
135 std::list
<Instruction
*> *outWrites
=
136 reinterpret_cast<std::list
<Instruction
*> *>(prog
->targetPriv
);
// for every recorded store: make the instruction defining the stored value
// (source 1) write directly to the store's source 0
139 for (std::list
<Instruction
*>::iterator it
= outWrites
->begin();
140 it
!= outWrites
->end(); ++it
)
141 (*it
)->getSrc(1)->defs
.front()->getInsn()->setDef(0, (*it
)->getSrc(0));
142 // instructions will be deleted on exit
// Scan all sources of `i` for immediates whose 64 bit payload is 0.
// NOTE(review): the action taken for such a source is not visible in this
// extract (presumably it is replaced by r63 - confirm).
150 NV50LegalizePostRA::replaceZero(Instruction
*i
)
152 for (int s
= 0; i
->srcExists(s
); ++s
) {
// asImm() result is checked for NULL before use
153 ImmediateValue
*imm
= i
->getSrc(s
)->asImm();
154 if (imm
&& imm
->reg
.data
.u64
== 0)
// Handle an instruction with a 64 bit type: checks for TYPE_F64 and a set
// of opcodes, then retypes the instruction to U32 and inserts a forward
// clone of it right after itself.
// NOTE(review): the bodies of the two `if` statements are not visible in
// this extract (presumably early returns for natively supported ops).
160 NV50LegalizePostRA::split64BitOp(Instruction
*i
)
162 if (i
->dType
== TYPE_F64
) {
165 if (i
->op
== OP_ADD
|| i
->op
== OP_MUL
|| i
->op
== OP_FMA
||
166 i
->op
== OP_CVT
|| i
->op
== OP_MIN
|| i
->op
== OP_MAX
||
// both destination and source type become 32 bit unsigned
169 i
->dType
= i
->sType
= TYPE_U32
;
// second 32 bit operation: a clone inserted directly after i
171 i
->bb
->insertAfter(i
, cloneForward(func
, i
));
175 // Emulate PRERET: jump to the target and call to the origin from there
177 // WARNING: atm only works if BBs are affected by at most a single PRERET
186 // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
189 // bra BB:3 + n1 (skip the call)
190 // call BB:0 + n2 (skip bra at beginning of BB:0)
// NOTE(review): the example listing these three lines belong to is only
// partially visible in this extract.
//
// `pre` is the PRERET flow instruction being emulated. The three emitted
// instructions are distinguished by subOp NV50_IR_SUBOP_EMU_PRERET + 0/1/2.
193 NV50LegalizePostRA::handlePRERET(FlowInstruction
*pre
)
// bbE: block containing the PRERET, bbT: its target block
195 BasicBlock
*bbE
= pre
->bb
;
196 BasicBlock
*bbT
= pre
->target
.bb
;
198 pre
->subOp
= NV50_IR_SUBOP_EMU_PRERET
+ 0;
// place `pre` at the head of its block
200 bbE
->insertHead(pre
);
// two more OP_PRERET flow instructions: `skip` targets bbT, `call` targets bbE
202 Instruction
*skip
= new_FlowInstruction(func
, OP_PRERET
, bbT
);
203 Instruction
*call
= new_FlowInstruction(func
, OP_PRERET
, bbE
);
// inserting `call` first and `skip` second at the head puts skip before call
205 bbT
->insertHead(call
);
206 bbT
->insertHead(skip
);
208 // NOTE: maybe split blocks to prevent the instructions from moving ?
210 skip
->subOp
= NV50_IR_SUBOP_EMU_PRERET
+ 1;
211 call
->subOp
= NV50_IR_SUBOP_EMU_PRERET
+ 2;
// Walk all instructions of `bb` and legalize them after register
// allocation.
// NOTE(review): most statement bodies of this function are not visible in
// this extract; only the conditions are.
215 NV50LegalizePostRA::visit(BasicBlock
*bb
)
217 Instruction
*i
, *next
;
219 // remove pseudo operations and non-fixed no-ops, split 64 bit operations
// `next` is used as the loop successor; its assignment is not visible here
220 for (i
= bb
->getFirst(); i
; i
= next
) {
// PRERET is emulated only on chipsets below 0xa0
225 if (i
->op
== OP_PRERET
&& prog
->getTarget()->getChipset() < 0xa0) {
226 handlePRERET(i
->asFlow());
// excludes MOV, PFETCH and instructions whose def 0 is in FILE_ADDRESS
228 if (i
->op
!= OP_MOV
&& i
->op
!= OP_PFETCH
&&
229 (!i
->defExists(0) || i
->def(0).getFile() != FILE_ADDRESS
))
// destination type is 8 bytes wide
231 if (typeSizeof(i
->dType
) == 8)
// Pass derived from Pass; declares a BasicBlock visitor and helpers for
// output-write propagation, DIV/MOD/MUL expansion and address register
// definitions.
// NOTE(review): access specifiers and some members are not visible in this
// extract.
241 class NV50LegalizeSSA
: public Pass
244 NV50LegalizeSSA(Program
*);
246 virtual bool visit(BasicBlock
*bb
);
249 void propagateWriteToOutput(Instruction
*);
250 void handleDIV(Instruction
*);
251 void handleMOD(Instruction
*);
252 void handleMUL(Instruction
*);
253 void handleAddrDef(Instruction
*);
255 inline bool isARL(const Instruction
*) const;
// list of store instructions recorded by propagateWriteToOutput()
259 std::list
<Instruction
*> *outWrites
;
// Bind the builder to `prog`. For geometry and vertex programs at
// optimization level 2 or higher, prog->targetPriv is interpreted as a
// list of instructions.
// NOTE(review): the assignment target of the cast (presumably outWrites)
// and the alternative branch are not visible in this extract.
262 NV50LegalizeSSA::NV50LegalizeSSA(Program
*prog
)
264 bld
.setProgram(prog
);
266 if (prog
->optLevel
>= 2 &&
267 (prog
->getType() == Program::TYPE_GEOMETRY
||
268 prog
->getType() == Program::TYPE_VERTEX
))
270 reinterpret_cast<std::list
<Instruction
*> *>(prog
->targetPriv
);
// Record the store `st` in outWrites when the instruction defining its
// stored value could write to the output directly.
// NOTE(review): the statements executed when a check fails (presumably
// early returns) and anything after the push_back are not visible in this
// extract.
276 NV50LegalizeSSA::propagateWriteToOutput(Instruction
*st
)
// checks for an indirect destination and for a stored value whose
// reference count is not exactly 1
278 if (st
->src(0).isIndirect(0) || st
->getSrc(1)->refCount() != 1)
281 // check def instruction can store
282 Instruction
*di
= st
->getSrc(1)->defs
.front()->getInsn();
284 // TODO: move exports (if beneficial) in common opt pass
// checks for pseudo instructions, texture ops and multiple definitions
285 if (di
->isPseudo() || isTextureOp(di
->op
) || di
->defCount(0xff, true) > 1)
// checks every source of the defining instruction for immediates
287 for (int s
= 0; di
->srcExists(s
); ++s
)
288 if (di
->src(s
).getFile() == FILE_IMMEDIATE
)
291 // We cannot set defs to non-lvalues before register allocation, so
292 // save & remove (to save registers) the exports and replace later.
293 outWrites
->push_back(st
);
// Test whether `i` is a SHL of a GPR source by an immediate equal to
// integer 0.
// NOTE(review): the declaration of `imm` and the statements taken when a
// check fails (presumably `return false`) are not visible in this extract.
298 NV50LegalizeSSA::isARL(const Instruction
*i
) const
302 if (i
->op
!= OP_SHL
|| i
->src(0).getFile() != FILE_GPR
)
// source 1 must resolve to an immediate
304 if (!i
->src(1).getImmediate(imm
))
306 return imm
.isInteger(0);
// Legalize an instruction whose result is an address register value:
// forces the def size to 2 bytes, checks for the directly encodable forms,
// rewrites address sources to GPR sources and finally emits a SHL by 0
// that defines the original address value.
// NOTE(review): several statements (early returns, the else branch of the
// isARL test, the declaration of `arl`) are not visible in this extract.
310 NV50LegalizeSSA::handleAddrDef(Instruction
*i
)
314 i
->getDef(0)->reg
.size
= 2; // $aX are only 16 bit
316 // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
317 if (i
->srcExists(1) && i
->src(1).getFile() == FILE_IMMEDIATE
) {
318 if (i
->op
== OP_SHL
&& i
->src(0).getFile() == FILE_GPR
)
320 if (i
->op
== OP_ADD
&& i
->src(0).getFile() == FILE_ADDRESS
)
324 // turn $a sources into $r sources (can't operate on $a)
325 for (int s
= 0; i
->srcExists(s
); ++s
) {
326 Value
*a
= i
->getSrc(s
);
328 if (a
->reg
.file
== FILE_ADDRESS
) {
// if the address value comes from an instruction accepted by isARL(),
// use that instruction's source 0 directly
329 if (a
->getInsn() && isARL(a
->getInsn())) {
330 i
->setSrc(s
, a
->getInsn()->getSrc(0));
332 bld
.setPosition(i
, false);
339 if (i
->op
== OP_SHL
&& i
->src(1).getFile() == FILE_IMMEDIATE
)
342 // turn result back into $a
343 bld
.setPosition(i
, true);
// new SHL by 0 defines the original def of i from a fresh SSA value,
// and i is redirected to define that fresh value instead
344 arl
= bld
.mkOp2(OP_SHL
, TYPE_U32
, i
->getDef(0), bld
.getSSA(), bld
.mkImm(0));
345 i
->setDef(0, arl
->getSrc(0));
// Expand an integer MUL or MAD through expandIntegerMUL(). The predicate
// is removed before expansion and re-applied to the instruction that
// defines the result afterwards.
// NOTE(review): the statement following the first check (presumably an
// early return) and closing braces are not visible in this extract.
349 NV50LegalizeSSA::handleMUL(Instruction
*mul
)
// checks for float types and for source types of at most 2 bytes
351 if (isFloatType(mul
->sType
) || typeSizeof(mul
->sType
) <= 2)
// remember result value, predicate and condition code
353 Value
*def
= mul
->getDef(0);
354 Value
*pred
= mul
->getPredicate();
355 CondCode cc
= mul
->cc
;
357 mul
->setPredicate(CC_ALWAYS
, NULL
);
// MAD: emit a separate MUL before it and turn the original into the add
359 if (mul
->op
== OP_MAD
) {
360 Instruction
*add
= mul
;
361 bld
.setPosition(add
, false);
362 Value
*res
= cloneShallow(func
, mul
->getDef(0));
363 mul
= bld
.mkOp2(OP_MUL
, add
->sType
, res
, add
->getSrc(0), add
->getSrc(1));
// add now takes (product, former source 2); remaining sources are cleared
365 add
->setSrc(0, mul
->getDef(0));
366 add
->setSrc(1, add
->getSrc(2));
367 for (int s
= 2; add
->srcExists(s
); ++s
)
368 add
->setSrc(s
, NULL
);
369 mul
->subOp
= add
->subOp
;
372 expandIntegerMUL(&bld
, mul
);
// restore the predicate on whatever instruction now defines the result
374 def
->getInsn()->setPredicate(cc
, pred
);
377 // Use f32 division: first compute an approximate result, use it to reduce
378 // the dividend, which should then be representable as f32, divide the reduced
379 // dividend, and add the quotients.
// Applies to sources of type U32 and S32.
// NOTE(review): several statements (early return, else branches, the final
// rewrite of `div`) are not visible in this extract; comments below cover
// the visible statements only.
381 NV50LegalizeSSA::handleDIV(Instruction
*div
)
383 const DataType ty
= div
->sType
;
385 if (ty
!= TYPE_U32
&& ty
!= TYPE_S32
)
388 Value
*q
, *q0
, *qf
, *aR
, *aRf
, *qRf
, *qR
, *t
, *s
, *m
, *cond
;
390 bld
.setPosition(div
, false);
// af, bf: f32 versions of the operands (a, b are assigned later)
392 Value
*a
, *af
= bld
.getSSA();
393 Value
*b
, *bf
= bld
.getSSA();
395 bld
.mkCvt(OP_CVT
, TYPE_F32
, af
, ty
, div
->getSrc(0));
396 bld
.mkCvt(OP_CVT
, TYPE_F32
, bf
, ty
, div
->getSrc(1));
// signed: convert absolute values and compute integer |a|, |b|
398 if (isSignedType(ty
)) {
399 af
->getInsn()->src(0).mod
= Modifier(NV50_IR_MOD_ABS
);
400 bf
->getInsn()->src(0).mod
= Modifier(NV50_IR_MOD_ABS
);
403 bld
.mkOp1(OP_ABS
, ty
, a
, div
->getSrc(0));
404 bld
.mkOp1(OP_ABS
, ty
, b
, div
->getSrc(1));
// bf = 1 / bf, then an integer (U32) add of -2 on the result
// NOTE(review): presumably this lowers the reciprocal slightly so the
// quotient estimate does not overshoot - confirm.
410 bf
= bld
.mkOp1v(OP_RCP
, TYPE_F32
, bld
.getSSA(), bf
);
411 bf
= bld
.mkOp2v(OP_ADD
, TYPE_U32
, bld
.getSSA(), bf
, bld
.mkImm(-2));
// first quotient estimate: qf = af * bf, q0 = (ty)qf, both with ROUND_Z
413 bld
.mkOp2(OP_MUL
, TYPE_F32
, (qf
= bld
.getSSA()), af
, bf
)->rnd
= ROUND_Z
;
414 bld
.mkCvt(OP_CVT
, ty
, (q0
= bld
.getSSA()), TYPE_F32
, qf
)->rnd
= ROUND_Z
;
416 // get error of 1st result
// t = q0 * b (expanded), aRf = a - t
417 expandIntegerMUL(&bld
,
418 bld
.mkOp2(OP_MUL
, TYPE_U32
, (t
= bld
.getSSA()), q0
, b
));
419 bld
.mkOp2(OP_SUB
, TYPE_U32
, (aRf
= bld
.getSSA()), a
, t
);
// second estimate from the reduced dividend: aR = (f32)aRf, qRf = aR * bf
421 bld
.mkCvt(OP_CVT
, TYPE_F32
, (aR
= bld
.getSSA()), TYPE_U32
, aRf
);
423 bld
.mkOp2(OP_MUL
, TYPE_F32
, (qRf
= bld
.getSSA()), aR
, bf
)->rnd
= ROUND_Z
;
424 bld
.mkCvt(OP_CVT
, TYPE_U32
, (qR
= bld
.getSSA()), TYPE_F32
, qRf
)
426 bld
.mkOp2(OP_ADD
, ty
, (q
= bld
.getSSA()), q0
, qR
); // add quotients
428 // correction: if modulus >= divisor, add 1
// t = q * b (expanded), m = a - t, s = (m >= b)
429 expandIntegerMUL(&bld
,
430 bld
.mkOp2(OP_MUL
, TYPE_U32
, (t
= bld
.getSSA()), q
, b
));
431 bld
.mkOp2(OP_SUB
, TYPE_U32
, (m
= bld
.getSSA()), a
, t
);
432 bld
.mkCmp(OP_SET
, CC_GE
, TYPE_U32
, (s
= bld
.getSSA()), m
, b
);
433 if (!isSignedType(ty
)) {
439 bld
.mkOp2(OP_SUB
, TYPE_U32
, (q
= bld
.getSSA()), t
, s
);
// flags from XOR of the original operands drive the NEG (CC_S) and the
// plain MOV (CC_NS) below
443 bld
.mkOp2(OP_XOR
, TYPE_U32
, NULL
, div
->getSrc(0), div
->getSrc(1))
444 ->setFlagsDef(0, (cond
= bld
.getSSA(1, FILE_FLAGS
)));
445 bld
.mkOp1(OP_NEG
, ty
, s
, q
)->setPredicate(CC_S
, cond
);
446 bld
.mkOp1(OP_MOV
, ty
, t
, q
)->setPredicate(CC_NS
, cond
);
// Lower an integer MOD (U32/S32 destination type): emits a DIV of the same
// operands, lowers it with handleDIV(), then multiplies the quotient by
// the divisor.
// NOTE(review): the early return and the final rewrite of `mod`
// (presumably into a subtraction of m) are not visible in this extract.
455 NV50LegalizeSSA::handleMOD(Instruction
*mod
)
457 if (mod
->dType
!= TYPE_U32
&& mod
->dType
!= TYPE_S32
)
459 bld
.setPosition(mod
, false);
// q: quotient, m: quotient * divisor
461 Value
*q
= bld
.getSSA();
462 Value
*m
= bld
.getSSA();
464 bld
.mkOp2(OP_DIV
, mod
->dType
, q
, mod
->getSrc(0), mod
->getSrc(1));
465 handleDIV(q
->getInsn());
// reset the insertion point before `mod` after handleDIV moved it
467 bld
.setPosition(mod
, false);
468 expandIntegerMUL(&bld
, bld
.mkOp2(OP_MUL
, TYPE_U32
, m
, q
, mod
->getSrc(1)));
// Walk the instructions of `bb` starting at its entry and dispatch to the
// legalization helpers.
// NOTE(review): the opcode dispatch and the statement guarded by the final
// `if` (presumably a call to handleAddrDef) are not visible in this extract.
475 NV50LegalizeSSA::visit(BasicBlock
*bb
)
477 Instruction
*insn
, *next
;
478 // skipping PHIs (don't pass them to handleAddrDef) !
479 for (insn
= bb
->getEntry(); insn
; insn
= next
) {
485 propagateWriteToOutput(insn
);
// instruction defines a value in the address register file
501 if (insn
->defExists(0) && insn
->getDef(0)->reg
.file
== FILE_ADDRESS
)
// Pass derived from Pass; declares Instruction and Function visitors and
// one handler per lowered operation.
// NOTE(review): access specifiers and some members are not visible in this
// extract.
507 class NV50LoweringPreSSA
: public Pass
510 NV50LoweringPreSSA(Program
*);
513 virtual bool visit(Instruction
*);
514 virtual bool visit(Function
*);
// system value reads / writes
516 bool handleRDSV(Instruction
*);
517 bool handleWRSV(Instruction
*);
519 bool handleEXPORT(Instruction
*);
// arithmetic
521 bool handleMUL(Instruction
*);
522 bool handleDIV(Instruction
*);
523 bool handleSQRT(Instruction
*);
524 bool handlePOW(Instruction
*);
// comparison / selection
526 bool handleSET(Instruction
*);
527 bool handleSLCT(CmpInstruction
*);
528 bool handleSELP(Instruction
*);
// texturing
530 bool handleTEX(TexInstruction
*);
531 bool handleTXB(TexInstruction
*); // I really
532 bool handleTXL(TexInstruction
*); // hate
533 bool handleTXD(TexInstruction
*); // these 3
// control flow
535 bool handleCALL(Instruction
*);
536 bool handlePRECONT(Instruction
*);
537 bool handleCONT(Instruction
*);
539 void checkPredicate(Instruction
*);
// target description, initialized from prog->getTarget() in the constructor
542 const Target
*const targ
;
// Store the program's target, start with no thread id value, and bind the
// builder to `prog`.
549 NV50LoweringPreSSA::NV50LoweringPreSSA(Program
*prog
) :
550 targ(prog
->getTarget()), tid(NULL
)
552 bld
.setProgram(prog
);
// Per-function setup. For compute programs an extra input value bound to
// GPR 0 is appended to the function inputs and copied into `tid` at the
// start of the root block.
// NOTE(review): the return statement and closing braces are not visible in
// this extract.
556 NV50LoweringPreSSA::visit(Function
*f
)
558 BasicBlock
*root
= BasicBlock::get(func
->cfg
.getRoot());
560 if (prog
->getType() == Program::TYPE_COMPUTE
) {
561 // Add implicit "thread id" argument in $r0 to the function
562 Value
*arg
= new_LValue(func
, FILE_GPR
);
563 arg
->reg
.data
.id
= 0;
564 f
->ins
.push_back(arg
);
// copy the argument into a scratch value at the head of the root block
566 bld
.setPosition(root
, false);
567 tid
= bld
.mkMov(bld
.getScratch(), arg
, TYPE_U32
)->getDef(0);
573 // move array source to first slot, convert to u16, add indirections
// Reorders and converts the sources of texture instruction `i`.
// NOTE(review): several statements (declarations of x and y, closing
// braces, return) are not visible in this extract.
575 NV50LoweringPreSSA::handleTEX(TexInstruction
*i
)
// arg: argument count of the target; dref sits at index arg, and lod at
// arg + 1 for shadow targets, otherwise at arg
577 const int arg
= i
->tex
.target
.getArgCount();
578 const int dref
= arg
;
579 const int lod
= i
->tex
.target
.isShadow() ? (arg
+ 1) : arg
;
581 // dref comes before bias/lod
582 if (i
->tex
.target
.isShadow())
583 if (i
->op
== OP_TXB
|| i
->op
== OP_TXL
)
584 i
->swapSources(dref
, lod
);
586 // array index must be converted to u32
// NOTE(review): the conversion below targets TYPE_U16, which matches the
// function header comment but not the "u32" above - confirm which is meant.
587 if (i
->tex
.target
.isArray()) {
588 Value
*layer
= i
->getSrc(arg
- 1);
589 LValue
*src
= new_LValue(func
, FILE_GPR
);
590 bld
.mkCvt(OP_CVT
, TYPE_U16
, src
, TYPE_F32
, layer
);
591 i
->setSrc(arg
- 1, src
);
// cube targets are retargeted to 2D array
593 if (i
->tex
.target
.isCube()) {
594 // Value *face = layer;
596 x
= new_LValue(func
, FILE_GPR
);
597 y
= new_LValue(func
, FILE_GPR
);
598 layer
= new_LValue(func
, FILE_GPR
);
600 i
->tex
.target
= TEX_TARGET_2D_ARRAY
;
602 // TODO: use TEXPREP to convert x,y,z,face -> x,y,layer
// copies of sources 0, 1 and 3
603 bld
.mkMov(x
, i
->getSrc(0));
604 bld
.mkMov(y
, i
->getSrc(1));
605 bld
.mkMov(layer
, i
->getSrc(3));
// source 4 moves down into slot 3
610 i
->setSrc(3, i
->getSrc(4));
615 // texel offsets are 3 immediate fields in the instruction,
616 // nv50 cannot do textureGatherOffsets
617 assert(i
->tex
.useOffsets
<= 1);
622 // Bias must be equal for all threads of a quad or lod calculation will fail.
624 // The lanes of a quad are grouped by the bit in the condition register they
625 // have set, which is selected by differing bias values.
626 // Move the input values for TEX into a new register set for each group and
627 // execute TEX only for a specific group.
628 // We always need to use 4 new registers for the inputs/outputs because the
629 // implicitly calculated derivatives must be correct.
631 // TODO: move to SSA phase so we can easily determine whether bias is constant
// NOTE(review): declarations of l, d, tex[] and res[][], the statement taken
// for a uniform bias, and the remaining body of the clone loop are not
// visible in this extract.
633 NV50LoweringPreSSA::handleTXB(TexInstruction
*i
)
// one condition code per lane group; cc[l] predicates clone l below
635 const CondCode cc
[4] = { CC_EQU
, CC_S
, CC_C
, CC_O
};
// the bias is the source following the target's regular arguments
639 Value
*bias
= i
->getSrc(i
->tex
.target
.getArgCount());
640 if (bias
->isUniform())
// cond: UNION whose source 0 is the immediate 1; sources 1..3 are the
// per-lane bits added in the loop below
643 Instruction
*cond
= bld
.mkOp1(OP_UNION
, TYPE_U32
, bld
.getScratch(),
644 bld
.loadImm(NULL
, 1));
645 bld
.setPosition(cond
, false);
647 for (l
= 1; l
< 4; ++l
) {
648 const uint8_t qop
= QUADOP(SUBR
, SUBR
, SUBR
, SUBR
);
649 Value
*bit
= bld
.getSSA();
650 Value
*pred
= bld
.getScratch(1, FILE_FLAGS
);
651 Value
*imm
= bld
.loadImm(NULL
, (1 << l
));
// quadop on bias for lane l writes pred; bit = (1 << l) when pred is EQ
652 bld
.mkQuadop(qop
, pred
, l
, bias
, bias
)->flagsDef
= 0;
653 bld
.mkMov(bit
, imm
)->setPredicate(CC_EQ
, pred
);
654 cond
->setSrc(l
, bit
);
// convert the combined bits into a flags value, placed after cond
656 Value
*flags
= bld
.getScratch(1, FILE_FLAGS
);
657 bld
.setPosition(cond
, true);
658 bld
.mkCvt(OP_CVT
, TYPE_U8
, flags
, TYPE_U32
, cond
->getDef(0));
// four clones of the texture instruction, each predicated on cc[l]
661 for (l
= 0; l
< 4; ++l
) {
662 (tex
[l
] = cloneForward(func
, i
))->setPredicate(cc
[l
], flags
);
// res[0][d] are the defs of clone 0; res[l][d] for l >= 1 are predicated
// copies of the defs of clone l
667 for (d
= 0; i
->defExists(d
); ++d
)
668 res
[0][d
] = tex
[0]->getDef(d
);
669 for (l
= 1; l
< 4; ++l
) {
670 for (d
= 0; tex
[l
]->defExists(d
); ++d
) {
671 res
[l
][d
] = cloneShallow(func
, res
[0][d
]);
672 bld
.mkMov(res
[l
][d
], tex
[l
]->getDef(d
))->setPredicate(cc
[l
], flags
);
// each original def becomes the UNION of the four per-group results
676 for (d
= 0; i
->defExists(d
); ++d
) {
677 Instruction
*dst
= bld
.mkOp(OP_UNION
, TYPE_U32
, i
->getDef(d
));
678 for (l
= 0; l
< 4; ++l
)
679 dst
->setSrc(l
, res
[l
][d
]);
681 delete_Instruction(prog
, i
);
685 // LOD must be equal for all threads of a quad.
686 // Unlike with TXB, here we can just diverge since there's no LOD calculation
687 // that would require all 4 threads' sources to be set up properly.
// NOTE(review): the statement taken for a uniform lod and the end of the
// loop body (after laneBB is attached) are not visible in this extract.
689 NV50LoweringPreSSA::handleTXL(TexInstruction
*i
)
// the lod is the source following the target's regular arguments
692 Value
*lod
= i
->getSrc(i
->tex
.target
.getArgCount());
693 if (lod
->isUniform())
// isolate the texture instruction in its own block (texiBB), with joinBB
// holding everything after it
696 BasicBlock
*currBB
= i
->bb
;
697 BasicBlock
*texiBB
= i
->bb
->splitBefore(i
, false);
698 BasicBlock
*joinBB
= i
->bb
->splitAfter(i
);
700 currBB
->joinAt
= bld
.mkFlow(OP_JOINAT
, joinBB
, CC_ALWAYS
, NULL
);
// per lane: quadop on lod writes pred, then a fixed branch to texiBB on EQ
702 for (int l
= 0; l
<= 3; ++l
) {
703 const uint8_t qop
= QUADOP(SUBR
, SUBR
, SUBR
, SUBR
);
704 Value
*pred
= bld
.getScratch(1, FILE_FLAGS
);
705 bld
.setPosition(currBB
, true);
706 bld
.mkQuadop(qop
, pred
, l
, lod
, lod
)->flagsDef
= 0;
707 bld
.mkFlow(OP_BRA
, texiBB
, CC_EQ
, pred
)->fixed
= 1;
708 currBB
->cfg
.attach(&texiBB
->cfg
, Graph::Edge::FORWARD
);
// new block attached to currBB by a tree edge
710 BasicBlock
*laneBB
= new BasicBlock(func
);
711 currBB
->cfg
.attach(&laneBB
->cfg
, Graph::Edge::TREE
);
// JOIN at the head of joinBB
715 bld
.setPosition(joinBB
, false);
716 bld
.mkOp(OP_JOIN
, TYPE_NONE
, NULL
);
// Lower a texture instruction with explicit derivatives: for each of the
// four lanes, builds coordinates from that lane's values and derivatives
// with quadops, runs a clone of the (now plain TEX) instruction and copies
// its results; the per-lane results are finally merged with UNION.
// NOTE(review): declarations of l, c, crd[], def[][], tex and mov, and some
// statements inside the result loop are not visible in this extract.
721 NV50LoweringPreSSA::handleTXD(TexInstruction
*i
)
// quadop codes per lane: [l][0] is used with dPdx, [l][1] with dPdy
723 static const uint8_t qOps
[4][2] =
725 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l0
726 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(MOV2
, MOV2
, ADD
, ADD
) }, // l1
727 { QUADOP(MOV2
, ADD
, MOV2
, ADD
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l2
728 { QUADOP(SUBR
, MOV2
, SUBR
, MOV2
), QUADOP(SUBR
, SUBR
, MOV2
, MOV2
) }, // l3
733 Value
*zero
= bld
.loadImm(bld
.getSSA(), 0);
735 const int dim
= i
->tex
.target
.getDim();
738 i
->op
= OP_TEX
; // no need to clone dPdx/dPdy later
// one scratch coordinate per dimension
740 for (c
= 0; c
< dim
; ++c
)
741 crd
[c
] = bld
.getScratch();
743 bld
.mkOp(OP_QUADON
, TYPE_NONE
, NULL
);
744 for (l
= 0; l
< 4; ++l
) {
745 // mov coordinates from lane l to all lanes
746 for (c
= 0; c
< dim
; ++c
)
747 bld
.mkQuadop(0x00, crd
[c
], l
, i
->getSrc(c
), zero
);
748 // add dPdx from lane l to lanes dx
749 for (c
= 0; c
< dim
; ++c
)
750 bld
.mkQuadop(qOps
[l
][0], crd
[c
], l
, i
->dPdx
[c
].get(), crd
[c
]);
751 // add dPdy from lane l to lanes dy
752 for (c
= 0; c
< dim
; ++c
)
753 bld
.mkQuadop(qOps
[l
][1], crd
[c
], l
, i
->dPdy
[c
].get(), crd
[c
]);
// clone of the texture instruction reading the rebuilt coordinates
755 bld
.insert(tex
= cloneForward(func
, i
));
756 for (c
= 0; c
< dim
; ++c
)
757 tex
->setSrc(c
, crd
[c
]);
// copy each result of the clone into def[c][l]
759 for (c
= 0; i
->defExists(c
); ++c
) {
761 def
[c
][l
] = bld
.getSSA();
762 mov
= bld
.mkMov(def
[c
][l
], tex
->getDef(c
));
767 bld
.mkOp(OP_QUADPOP
, TYPE_NONE
, NULL
);
// each original def becomes the UNION of its four per-lane copies
769 for (c
= 0; i
->defExists(c
); ++c
) {
770 Instruction
*u
= bld
.mkOp(OP_UNION
, TYPE_U32
, i
->getDef(c
));
771 for (l
= 0; l
< 4; ++l
)
772 u
->setSrc(l
, def
[c
][l
]);
// For a SET with f32 destination type, append two instructions after `i`
// that rewrite its result in place: an S32 absolute value followed by an
// S32 -> F32 conversion.
// NOTE(review): one statement between setPosition and the ABS, and the
// return, are not visible in this extract.
780 NV50LoweringPreSSA::handleSET(Instruction
*i
)
782 if (i
->dType
== TYPE_F32
) {
783 bld
.setPosition(i
, true);
785 bld
.mkOp1(OP_ABS
, TYPE_S32
, i
->getDef(0), i
->getDef(0));
786 bld
.mkCvt(OP_CVT
, TYPE_F32
, i
->getDef(0), TYPE_S32
, i
->getDef(0));
// Lower SLCT: the instruction `i` is turned into a comparison of its
// former source 2 against immediate 0 that writes a flags value, and the
// selection itself is done by two predicated moves merged with UNION.
// NOTE(review): the conditions guarding the two immediate-to-register moves
// and the remaining rewrites of `i` are not visible in this extract.
792 NV50LoweringPreSSA::handleSLCT(CmpInstruction
*i
)
794 Value
*src0
= bld
.getSSA();
795 Value
*src1
= bld
.getSSA();
796 Value
*pred
= bld
.getScratch(1, FILE_FLAGS
);
798 Value
*v0
= i
->getSrc(0);
799 Value
*v1
= i
->getSrc(1);
800 // XXX: these probably shouldn't be immediates in the first place ...
802 v0
= bld
.mkMov(bld
.getSSA(), v0
)->getDef(0);
804 v1
= bld
.mkMov(bld
.getSSA(), v1
)->getDef(0);
// after i: src0 = v0 if pred is NE, src1 = v1 if pred is EQ, def = union
806 bld
.setPosition(i
, true);
807 bld
.mkMov(src0
, v0
)->setPredicate(CC_NE
, pred
);
808 bld
.mkMov(src1
, v1
)->setPredicate(CC_EQ
, pred
);
809 bld
.mkOp2(OP_UNION
, i
->dType
, i
->getDef(0), src0
, src1
);
// i itself now produces pred from (former source 2, 0)
811 bld
.setPosition(i
, false);
813 i
->setFlagsDef(0, pred
);
815 i
->setSrc(0, i
->getSrc(2));
817 i
->setSrc(1, bld
.loadImm(NULL
, 0));
// Lower SELP: two moves predicated on source 2 (NE selects source 0, EQ
// selects source 1) are merged with UNION into the original def, and `i`
// is deleted.
// NOTE(review): the conditions guarding the two unpredicated moves and the
// return are not visible in this extract.
823 NV50LoweringPreSSA::handleSELP(Instruction
*i
)
825 Value
*src0
= bld
.getSSA();
826 Value
*src1
= bld
.getSSA();
828 Value
*v0
= i
->getSrc(0);
829 Value
*v1
= i
->getSrc(1);
831 v0
= bld
.mkMov(bld
.getSSA(), v0
)->getDef(0);
833 v1
= bld
.mkMov(bld
.getSSA(), v1
)->getDef(0);
835 bld
.mkMov(src0
, v0
)->setPredicate(CC_NE
, i
->getSrc(2));
836 bld
.mkMov(src1
, v1
)->setPredicate(CC_EQ
, i
->getSrc(2));
837 bld
.mkOp2(OP_UNION
, i
->dType
, i
->getDef(0), src0
, src1
);
838 delete_Instruction(prog
, i
);
// Lower a system value write into an EXPORT store to the shader output
// address the target assigns to that system value, then remove `i` from
// its block.
// NOTE(review): a check on `addr` and the return are not visible in this
// extract.
843 NV50LoweringPreSSA::handleWRSV(Instruction
*i
)
845 Symbol
*sym
= i
->getSrc(0)->asSym();
847 // these are all shader outputs, $sreg are not writeable
848 uint32_t addr
= targ
->getSVAddress(FILE_SHADER_OUTPUT
, sym
);
// replace the system value symbol by a shader output symbol at addr
851 sym
= bld
.mkSymbol(FILE_SHADER_OUTPUT
, 0, i
->sType
, addr
);
853 bld
.mkStore(OP_EXPORT
, i
->dType
, sym
, i
->getIndirect(0, 0), i
->getSrc(1));
855 bld
.getBB()->remove(i
);
// For compute programs, append `tid` as an additional source of the call.
// NOTE(review): the return is not visible in this extract.
860 NV50LoweringPreSSA::handleCALL(Instruction
*i
)
862 if (prog
->getType() == Program::TYPE_COMPUTE
) {
863 // Add implicit "thread id" argument in $r0 to the function
864 i
->setSrc(i
->srcCount(), tid
);
// PRECONT is simply deleted from the program.
870 NV50LoweringPreSSA::handlePRECONT(Instruction
*i
)
872 delete_Instruction(prog
, i
);
// Handler for CONT instructions.
// NOTE(review): the body is not visible in this extract.
877 NV50LoweringPreSSA::handleCONT(Instruction
*i
)
// Lower a system value read. The replacement depends on the system value
// and its index; afterwards `i` is removed from its block.
// NOTE(review): the switch over `sv`, its case labels and several branch
// conditions are not visible in this extract, so the comments below do not
// say which system value each visible sequence belongs to.
884 NV50LoweringPreSSA::handleRDSV(Instruction
*i
)
886 Symbol
*sym
= i
->getSrc(0)->asSym();
887 uint32_t addr
= targ
->getSVAddress(FILE_SHADER_INPUT
, sym
);
888 Value
*def
= i
->getDef(0);
889 SVSemantic sv
= sym
->reg
.data
.sv
.sv
;
890 int idx
= sym
->reg
.data
.sv
.index
;
892 if (addr
>= 0x400) // mov $sreg
// linear interpolation of the input at addr, fragment programs only
897 assert(prog
->getType() == Program::TYPE_FRAGMENT
);
898 bld
.mkInterp(NV50_IR_INTERP_LINEAR
, i
->getDef(0), addr
, NULL
);
// flat input; for f32 results keep only bit 31, then XOR with 0xbf800000
// (0xbf800000 is the bit pattern of -1.0f)
901 bld
.mkInterp(NV50_IR_INTERP_FLAT
, def
, addr
, NULL
);
902 if (i
->dType
== TYPE_F32
) {
903 bld
.mkOp2(OP_AND
, TYPE_U32
, def
, def
, bld
.mkImm(0x80000000));
904 bld
.mkOp2(OP_XOR
, TYPE_U32
, def
, def
, bld
.mkImm(0xbf800000));
// out-of-range indices yield constants: 1 for NCTAID/NTID, 0 for CTAID
910 if ((sv
== SV_NCTAID
&& idx
>= 2) ||
911 (sv
== SV_NTID
&& idx
>= 3)) {
912 bld
.mkMov(def
, bld
.mkImm(1));
913 } else if (sv
== SV_CTAID
&& idx
>= 2) {
914 bld
.mkMov(def
, bld
.mkImm(0));
// otherwise: 16 bit load from shared memory at addr, widened to u32
916 Value
*x
= bld
.getSSA(2);
917 bld
.mkOp1(OP_LOAD
, TYPE_U16
, x
,
918 bld
.mkSymbol(FILE_MEMORY_SHARED
, 0, TYPE_U16
, addr
));
919 bld
.mkCvt(OP_CVT
, TYPE_U32
, def
, TYPE_U16
, x
);
// components extracted from tid: bits 0-15, bits 16-25, bits 26 and up;
// any other index yields 0
924 bld
.mkOp2(OP_AND
, TYPE_U32
, def
, tid
, bld
.mkImm(0x0000ffff));
925 } else if (idx
== 1) {
926 bld
.mkOp2(OP_AND
, TYPE_U32
, def
, tid
, bld
.mkImm(0x03ff0000));
927 bld
.mkOp2(OP_SHR
, TYPE_U32
, def
, def
, bld
.mkImm(16));
928 } else if (idx
== 2) {
929 bld
.mkOp2(OP_SHR
, TYPE_U32
, def
, tid
, bld
.mkImm(26));
931 bld
.mkMov(def
, bld
.mkImm(0));
// plain fetch from the shader input at addr
935 bld
.mkFetch(i
->getDef(0), i
->dType
,
936 FILE_SHADER_INPUT
, addr
, i
->getIndirect(0, 0), NULL
);
939 bld
.getBB()->remove(i
);
// Integer multiplications with a source type wider than 2 bytes are
// expanded with expandIntegerMUL(), whose result is returned.
// NOTE(review): the return for the remaining cases is not visible in this
// extract.
944 NV50LoweringPreSSA::handleMUL(Instruction
*i
)
946 if (!isFloatType(i
->dType
) && typeSizeof(i
->sType
) > 2)
947 return expandIntegerMUL(&bld
, i
);
// Float division: an RCP of source 1 is emitted before `i` and replaces
// source 1.
// NOTE(review): the statement taken for non-float types (presumably an
// early return) and the opcode change of `i` (presumably to a multiply) are
// not visible in this extract.
952 NV50LoweringPreSSA::handleDIV(Instruction
*i
)
954 if (!isFloatType(i
->dType
))
956 bld
.setPosition(i
, false);
957 Instruction
*rcp
= bld
.mkOp1(OP_RCP
, i
->dType
, bld
.getSSA(), i
->getSrc(1));
959 i
->setSrc(1, rcp
->getDef(0));
// Emit an f32 RSQ of source 0 and set it as source 1 of `i`.
// NOTE(review): the opcode change of `i` (presumably to a multiply, giving
// x * rsq(x)) and the return are not visible in this extract.
964 NV50LoweringPreSSA::handleSQRT(Instruction
*i
)
966 Instruction
*rsq
= bld
.mkOp1(OP_RSQ
, TYPE_F32
,
967 bld
.getSSA(), i
->getSrc(0));
969 i
->setSrc(1, rsq
->getDef(0));
// Lower POW through val = lg2(src0); val = src1 * val (with dnz set);
// val = preex2(val).
// NOTE(review): the final rewrite of `i` (presumably into an EX2 of val) and
// the return are not visible in this extract.
975 NV50LoweringPreSSA::handlePOW(Instruction
*i
)
977 LValue
*val
= bld
.getScratch();
979 bld
.mkOp1(OP_LG2
, TYPE_F32
, val
, i
->getSrc(0));
980 bld
.mkOp2(OP_MUL
, TYPE_F32
, val
, i
->getSrc(1), val
)->dnz
= 1;
981 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, val
, val
);
// For fragment programs, turn the export into an instruction writing a GPR
// whose id is the output offset divided by 4; the program's maxGPR is
// raised accordingly.
// NOTE(review): the handling of indirect exports, the opcode change of `i`
// and the return are not visible in this extract.
991 NV50LoweringPreSSA::handleEXPORT(Instruction
*i
)
993 if (prog
->getType() == Program::TYPE_FRAGMENT
) {
994 if (i
->getIndirect(0, 0)) {
995 // TODO: redirect to l[] here, load to GPRs at exit
998 int id
= i
->getSrc(0)->reg
.data
.offset
/ 4; // in 32 bit reg units
// the exported value (source 1) becomes source 0
1001 i
->src(0).set(i
->src(1));
// destination: a new GPR value pinned to register `id`
1003 i
->setDef(0, new_LValue(func
, FILE_GPR
));
1004 i
->getDef(0)->reg
.data
.id
= id
;
1006 prog
->maxGPR
= MAX2(prog
->maxGPR
, id
);
1012 // Set flags according to predicate and make the instruction read $cX.
// When the predicate of `insn` exists and is not already a flags value, a
// SET (0 != pred, unsigned) writes a new flags value that replaces it; the
// condition code of `insn` is kept.
// NOTE(review): the declaration of cdst and the statement taken when the
// check succeeds (presumably an early return) are not visible in this
// extract.
1014 NV50LoweringPreSSA::checkPredicate(Instruction
*insn
)
1016 Value
*pred
= insn
->getPredicate();
1019 if (!pred
|| pred
->reg
.file
== FILE_FLAGS
)
1021 cdst
= bld
.getSSA(1, FILE_FLAGS
);
1023 bld
.mkCmp(OP_SET
, CC_NEU
, TYPE_U32
, cdst
, bld
.loadImm(NULL
, 0), pred
);
1025 insn
->setPredicate(insn
->cc
, cdst
);
// NOTE(review): the heading of the following list is not visible in this
// extract.
1029 // - add quadop dance for texturing
1030 // - put FP outputs in GPRs
1031 // - convert instruction sequences
// Position the builder relative to `i` and dispatch to the matching
// handler.
// NOTE(review): the conditions choosing between the three setPosition calls,
// the statement guarded by the cc check, the switch and all case labels are
// not visible in this extract.
1034 NV50LoweringPreSSA::visit(Instruction
*i
)
// insertion point: after the previous instruction, before the next one, or
// at the tail of the block
1037 bld
.setPosition(i
->prev
, true);
1040 bld
.setPosition(i
->next
, false);
1042 bld
.setPosition(i
->bb
, true);
1044 if (i
->cc
!= CC_ALWAYS
)
// texture handlers
1051 return handleTEX(i
->asTex());
1053 return handleTXB(i
->asTex());
1055 return handleTXL(i
->asTex());
1057 return handleTXD(i
->asTex());
// inline case: PREEX2 of source 0 into the def, which then feeds `i`
1059 bld
.mkOp1(OP_PREEX2
, TYPE_F32
, i
->getDef(0), i
->getSrc(0));
1060 i
->setSrc(0, i
->getDef(0));
// remaining handlers
1063 return handleSET(i
);
1065 return handleSLCT(i
->asCmp());
1067 return handleSELP(i
);
1069 return handlePOW(i
);
1071 return handleMUL(i
);
1073 return handleDIV(i
);
1075 return handleSQRT(i
);
1077 return handleEXPORT(i
);
1079 return handleRDSV(i
);
1081 return handleWRSV(i
);
1083 return handleCALL(i
);
1085 return handlePRECONT(i
);
1087 return handleCONT(i
);
// Run the legalization pass that belongs to `stage` on `prog`:
//   CG_STAGE_PRE_SSA -> NV50LoweringPreSSA
//   CG_STAGE_SSA     -> NV50LegalizeSSA (allocates the instruction list in
//                       prog->targetPriv if it is not set yet)
//   CG_STAGE_POST_RA -> NV50LegalizePostRA, then frees that list
// NOTE(review): the declaration of `ret`, the return and the closing braces
// are not visible in this extract. No visible statement resets
// prog->targetPriv after the delete - confirm it is not used again.
1095 TargetNV50::runLegalizePass(Program
*prog
, CGStage stage
) const
1099 if (stage
== CG_STAGE_PRE_SSA
) {
1100 NV50LoweringPreSSA
pass(prog
);
1101 ret
= pass
.run(prog
, false, true);
1103 if (stage
== CG_STAGE_SSA
) {
1104 if (!prog
->targetPriv
)
1105 prog
->targetPriv
= new std::list
<Instruction
*>();
1106 NV50LegalizeSSA
pass(prog
);
1107 ret
= pass
.run(prog
, false, true);
1109 if (stage
== CG_STAGE_POST_RA
) {
1110 NV50LegalizePostRA pass
;
1111 ret
= pass
.run(prog
, false, true);
1112 if (prog
->targetPriv
)
1113 delete reinterpret_cast<std::list
<Instruction
*> *>(prog
->targetPriv
);
1118 } // namespace nv50_ir