nv50/ir: import nv50 target
[mesa.git] / src / gallium / drivers / nv50 / codegen / nv50_ir_lowering_nv50.cpp
1 /*
2 * Copyright 2011 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "nv50/codegen/nv50_ir.h"
24 #include "nv50/codegen/nv50_ir_build_util.h"
25
26 #include "nv50_ir_target_nv50.h"
27
28 namespace nv50_ir {
29
30 // nv50 doesn't support 32 bit integer multiplication
31 //
32 // ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
33 // -------------------
34 // al*bh 00 HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
35 // ah*bh 00 00 ( carry1) << 16 + ( carry2)
36 // al*bl
37 // ah*bl 00
38 //
39 // fffe0001 + fffe0001
40 static bool
41 expandIntegerMUL(BuildUtil *bld, Instruction *mul)
42 {
43 const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
44
45 DataType fTy = mul->sType; // full type
46 DataType hTy;
47 switch (fTy) {
48 case TYPE_S32: hTy = TYPE_S16; break;
49 case TYPE_U32: hTy = TYPE_U16; break;
50 case TYPE_U64: hTy = TYPE_U32; break;
51 case TYPE_S64: hTy = TYPE_S32; break;
52 default:
53 return false;
54 }
55 unsigned int fullSize = typeSizeof(fTy);
56 unsigned int halfSize = typeSizeof(hTy);
57
58 Instruction *i[9];
59
60 Value *a[2] = { bld->getSSA(halfSize), bld->getSSA(halfSize) };
61 Value *b[2] = { bld->getSSA(halfSize), bld->getSSA(halfSize) };
62 Value *c[2];
63 Value *t[4];
64 for (int j = 0; j < 4; ++j)
65 t[j] = bld->getSSA(fullSize);
66
67 (i[0] = bld->mkOp1(OP_SPLIT, fTy, a[0], mul->getSrc(0)))->setDef(1, a[1]);
68 (i[1] = bld->mkOp1(OP_SPLIT, fTy, b[0], mul->getSrc(1)))->setDef(1, b[1]);
69
70 i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
71 i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
72 i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
73 i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
74
75 if (highResult) {
76 Value *r[3];
77 Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
78 c[0] = bld->getSSA(1, FILE_FLAGS);
79 c[1] = bld->getSSA(1, FILE_FLAGS);
80 for (int j = 0; j < 3; ++j)
81 r[j] = bld->getSSA(fullSize);
82
83 i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
84 i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
85 bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
86 i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
87
88 // set carry defs / sources
89 i[3]->setFlagsDef(1, c[0]);
90 i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
91 i[6]->setPredicate(CC_C, c[0]);
92 i[5]->setFlagsSrc(3, c[1]);
93 } else {
94 bld->mkMov(mul->getDef(0), t[3]);
95 }
96 delete_Instruction(bld->getProgram(), mul);
97
98 for (int j = 2; j <= (highResult ? 5 : 4); ++j)
99 i[j]->sType = hTy;
100
101 return true;
102 }
103
// Two-bit operation selectors that QUADOP() packs into one byte.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Pack four QOP_* selectors (given without the QOP_ prefix) into a byte:
// q occupies bits 0-1, r bits 2-3, s bits 4-5 and t bits 6-7.
#define QUADOP(q, r, s, t)     \
   (((QOP_##q) << 0) |         \
    ((QOP_##r) << 2) |         \
    ((QOP_##s) << 4) |         \
    ((QOP_##t) << 6))
112
113 class NV50LegalizePostRA : public Pass
114 {
115 private:
116 virtual bool visit(Function *);
117 virtual bool visit(BasicBlock *);
118
119 void handlePRERET(FlowInstruction *);
120 void replaceZero(Instruction *);
121 void split64BitOp(Instruction *);
122
123 LValue *r63;
124 };
125
126 bool
127 NV50LegalizePostRA::visit(Function *fn)
128 {
129 Program *prog = fn->getProgram();
130
131 r63 = new_LValue(fn, FILE_GPR);
132 r63->reg.data.id = 63;
133
134 // this is actually per-program, but we can do it all on visiting main()
135 std::list<Instruction *> *outWrites =
136 reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
137
138 if (outWrites) {
139 for (std::list<Instruction *>::iterator it = outWrites->begin();
140 it != outWrites->end(); ++it)
141 (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
142 // instructions will be deleted on exit
143 outWrites->clear();
144 }
145
146 return true;
147 }
148
149 void
150 NV50LegalizePostRA::replaceZero(Instruction *i)
151 {
152 for (int s = 0; i->srcExists(s); ++s) {
153 ImmediateValue *imm = i->getSrc(s)->asImm();
154 if (imm && imm->reg.data.u64 == 0)
155 i->setSrc(s, r63);
156 }
157 }
158
159 void
160 NV50LegalizePostRA::split64BitOp(Instruction *i)
161 {
162 if (i->dType == TYPE_F64) {
163 if (i->op == OP_MAD)
164 i->op = OP_FMA;
165 if (i->op == OP_ADD || i->op == OP_MUL || i->op == OP_FMA ||
166 i->op == OP_CVT || i->op == OP_MIN || i->op == OP_MAX ||
167 i->op == OP_SET)
168 return;
169 i->dType = i->sType = TYPE_U32;
170
171 i->bb->insertAfter(i, cloneForward(func, i));
172 }
173 }
174
175 // Emulate PRERET: jump to the target and call to the origin from there
176 //
177 // WARNING: atm only works if BBs are affected by at most a single PRERET
178 //
179 // BB:0
180 // preret BB:3
181 // (...)
182 // BB:3
183 // (...)
184 // --->
185 // BB:0
186 // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
187 // (...)
188 // BB:3
189 // bra BB:3 + n1 (skip the call)
190 // call BB:0 + n2 (skip bra at beginning of BB:0)
191 // (...)
192 void
193 NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
194 {
195 BasicBlock *bbE = pre->bb;
196 BasicBlock *bbT = pre->target.bb;
197
198 pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
199 bbE->remove(pre);
200 bbE->insertHead(pre);
201
202 Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
203 Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
204
205 bbT->insertHead(call);
206 bbT->insertHead(skip);
207
208 // NOTE: maybe split blocks to prevent the instructions from moving ?
209
210 skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
211 call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
212 }
213
214 bool
215 NV50LegalizePostRA::visit(BasicBlock *bb)
216 {
217 Instruction *i, *next;
218
219 // remove pseudo operations and non-fixed no-ops, split 64 bit operations
220 for (i = bb->getFirst(); i; i = next) {
221 next = i->next;
222 if (i->isNop()) {
223 bb->remove(i);
224 } else
225 if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
226 handlePRERET(i->asFlow());
227 } else {
228 if (i->op != OP_MOV && i->op != OP_PFETCH &&
229 (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
230 replaceZero(i);
231 if (typeSizeof(i->dType) == 8)
232 split64BitOp(i);
233 }
234 }
235 if (!bb->getEntry())
236 return true;
237
238 return true;
239 }
240
241 class NV50LegalizeSSA : public Pass
242 {
243 public:
244 NV50LegalizeSSA(Program *);
245
246 virtual bool visit(BasicBlock *bb);
247
248 private:
249 void propagateWriteToOutput(Instruction *);
250 void handleDIV(Instruction *);
251 void handleMOD(Instruction *);
252 void handleMUL(Instruction *);
253 void handleAddrDef(Instruction *);
254
255 inline bool isARL(const Instruction *) const;
256
257 BuildUtil bld;
258
259 std::list<Instruction *> *outWrites;
260 };
261
262 NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
263 {
264 bld.setProgram(prog);
265
266 if (prog->optLevel >= 2 &&
267 (prog->getType() == Program::TYPE_GEOMETRY ||
268 prog->getType() == Program::TYPE_VERTEX))
269 outWrites =
270 reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
271 else
272 outWrites = NULL;
273 }
274
275 void
276 NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
277 {
278 if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
279 return;
280
281 // check def instruction can store
282 Instruction *di = st->getSrc(1)->defs.front()->getInsn();
283
284 // TODO: move exports (if beneficial) in common opt pass
285 if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
286 return;
287 for (int s = 0; di->srcExists(s); ++s)
288 if (di->src(s).getFile() == FILE_IMMEDIATE)
289 return;
290
291 // We cannot set defs to non-lvalues before register allocation, so
292 // save & remove (to save registers) the exports and replace later.
293 outWrites->push_back(st);
294 st->bb->remove(st);
295 }
296
297 bool
298 NV50LegalizeSSA::isARL(const Instruction *i) const
299 {
300 ImmediateValue imm;
301
302 if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
303 return false;
304 if (!i->src(1).getImmediate(imm))
305 return false;
306 return imm.isInteger(0);
307 }
308
309 void
310 NV50LegalizeSSA::handleAddrDef(Instruction *i)
311 {
312 Instruction *arl;
313
314 i->getDef(0)->reg.size = 2; // $aX are only 16 bit
315
316 // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
317 if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
318 if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
319 return;
320 if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
321 return;
322 }
323
324 // turn $a sources into $r sources (can't operate on $a)
325 for (int s = 0; i->srcExists(s); ++s) {
326 Value *a = i->getSrc(s);
327 Value *r;
328 if (a->reg.file == FILE_ADDRESS) {
329 if (a->getInsn() && isARL(a->getInsn())) {
330 i->setSrc(s, a->getInsn()->getSrc(0));
331 } else {
332 bld.setPosition(i, false);
333 r = bld.getSSA();
334 bld.mkMov(r, a);
335 i->setSrc(s, r);
336 }
337 }
338 }
339 if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
340 return;
341
342 // turn result back into $a
343 bld.setPosition(i, true);
344 arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
345 i->setDef(0, arl->getSrc(0));
346 }
347
348 void
349 NV50LegalizeSSA::handleMUL(Instruction *mul)
350 {
351 if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
352 return;
353 Value *def = mul->getDef(0);
354 Value *pred = mul->getPredicate();
355 CondCode cc = mul->cc;
356 if (pred)
357 mul->setPredicate(CC_ALWAYS, NULL);
358
359 if (mul->op == OP_MAD) {
360 Instruction *add = mul;
361 bld.setPosition(add, false);
362 Value *res = cloneShallow(func, mul->getDef(0));
363 mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
364 add->op = OP_ADD;
365 add->setSrc(0, mul->getDef(0));
366 add->setSrc(1, add->getSrc(2));
367 for (int s = 2; add->srcExists(s); ++s)
368 add->setSrc(s, NULL);
369 mul->subOp = add->subOp;
370 add->subOp = 0;
371 }
372 expandIntegerMUL(&bld, mul);
373 if (pred)
374 def->getInsn()->setPredicate(cc, pred);
375 }
376
377 // Use f32 division: first compute an approximate result, use it to reduce
378 // the dividend, which should then be representable as f32, divide the reduced
379 // dividend, and add the quotients.
380 void
381 NV50LegalizeSSA::handleDIV(Instruction *div)
382 {
383 const DataType ty = div->sType;
384
385 if (ty != TYPE_U32 && ty != TYPE_S32)
386 return;
387
388 Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
389
390 bld.setPosition(div, false);
391
392 Value *a, *af = bld.getSSA();
393 Value *b, *bf = bld.getSSA();
394
395 bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
396 bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
397
398 if (isSignedType(ty)) {
399 af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
400 bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
401 a = bld.getSSA();
402 b = bld.getSSA();
403 bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
404 bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
405 } else {
406 a = div->getSrc(0);
407 b = div->getSrc(1);
408 }
409
410 bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
411 bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
412
413 bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
414 bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
415
416 // get error of 1st result
417 expandIntegerMUL(&bld,
418 bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
419 bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
420
421 bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
422
423 bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
424 bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
425 ->rnd = ROUND_Z;
426 bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
427
428 // correction: if modulus >= divisor, add 1
429 expandIntegerMUL(&bld,
430 bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
431 bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
432 bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b);
433 if (!isSignedType(ty)) {
434 div->op = OP_SUB;
435 div->setSrc(0, q);
436 div->setSrc(1, s);
437 } else {
438 t = q;
439 bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
440 s = bld.getSSA();
441 t = bld.getSSA();
442 // fix the sign
443 bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
444 ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
445 bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
446 bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
447
448 div->op = OP_UNION;
449 div->setSrc(0, s);
450 div->setSrc(1, t);
451 }
452 }
453
454 void
455 NV50LegalizeSSA::handleMOD(Instruction *mod)
456 {
457 if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
458 return;
459 bld.setPosition(mod, false);
460
461 Value *q = bld.getSSA();
462 Value *m = bld.getSSA();
463
464 bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
465 handleDIV(q->getInsn());
466
467 bld.setPosition(mod, false);
468 expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
469
470 mod->op = OP_SUB;
471 mod->setSrc(1, m);
472 }
473
474 bool
475 NV50LegalizeSSA::visit(BasicBlock *bb)
476 {
477 Instruction *insn, *next;
478 // skipping PHIs (don't pass them to handleAddrDef) !
479 for (insn = bb->getEntry(); insn; insn = next) {
480 next = insn->next;
481
482 switch (insn->op) {
483 case OP_EXPORT:
484 if (outWrites)
485 propagateWriteToOutput(insn);
486 break;
487 case OP_DIV:
488 handleDIV(insn);
489 break;
490 case OP_MOD:
491 handleMOD(insn);
492 break;
493 case OP_MAD:
494 case OP_MUL:
495 handleMUL(insn);
496 break;
497 default:
498 break;
499 }
500
501 if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
502 handleAddrDef(insn);
503 }
504 return true;
505 }
506
507 class NV50LoweringPreSSA : public Pass
508 {
509 public:
510 NV50LoweringPreSSA(Program *);
511
512 private:
513 virtual bool visit(Instruction *);
514 virtual bool visit(Function *);
515
516 bool handleRDSV(Instruction *);
517 bool handleWRSV(Instruction *);
518
519 bool handleEXPORT(Instruction *);
520
521 bool handleMUL(Instruction *);
522 bool handleDIV(Instruction *);
523 bool handleSQRT(Instruction *);
524 bool handlePOW(Instruction *);
525
526 bool handleSET(Instruction *);
527 bool handleSLCT(CmpInstruction *);
528 bool handleSELP(Instruction *);
529
530 bool handleTEX(TexInstruction *);
531 bool handleTXB(TexInstruction *); // I really
532 bool handleTXL(TexInstruction *); // hate
533 bool handleTXD(TexInstruction *); // these 3
534
535 bool handleCALL(Instruction *);
536 bool handlePRECONT(Instruction *);
537 bool handleCONT(Instruction *);
538
539 void checkPredicate(Instruction *);
540
541 private:
542 const Target *const targ;
543
544 BuildUtil bld;
545
546 Value *tid;
547 };
548
549 NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
550 targ(prog->getTarget()), tid(NULL)
551 {
552 bld.setProgram(prog);
553 }
554
555 bool
556 NV50LoweringPreSSA::visit(Function *f)
557 {
558 BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
559
560 if (prog->getType() == Program::TYPE_COMPUTE) {
561 // Add implicit "thread id" argument in $r0 to the function
562 Value *arg = new_LValue(func, FILE_GPR);
563 arg->reg.data.id = 0;
564 f->ins.push_back(arg);
565
566 bld.setPosition(root, false);
567 tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
568 }
569
570 return true;
571 }
572
573 // move array source to first slot, convert to u16, add indirections
574 bool
575 NV50LoweringPreSSA::handleTEX(TexInstruction *i)
576 {
577 const int arg = i->tex.target.getArgCount();
578 const int dref = arg;
579 const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
580
581 // dref comes before bias/lod
582 if (i->tex.target.isShadow())
583 if (i->op == OP_TXB || i->op == OP_TXL)
584 i->swapSources(dref, lod);
585
586 // array index must be converted to u32
587 if (i->tex.target.isArray()) {
588 Value *layer = i->getSrc(arg - 1);
589 LValue *src = new_LValue(func, FILE_GPR);
590 bld.mkCvt(OP_CVT, TYPE_U16, src, TYPE_F32, layer);
591 i->setSrc(arg - 1, src);
592
593 if (i->tex.target.isCube()) {
594 // Value *face = layer;
595 Value *x, *y;
596 x = new_LValue(func, FILE_GPR);
597 y = new_LValue(func, FILE_GPR);
598 layer = new_LValue(func, FILE_GPR);
599
600 i->tex.target = TEX_TARGET_2D_ARRAY;
601
602 // TODO: use TEXPREP to convert x,y,z,face -> x,y,layer
603 bld.mkMov(x, i->getSrc(0));
604 bld.mkMov(y, i->getSrc(1));
605 bld.mkMov(layer, i->getSrc(3));
606
607 i->setSrc(0, x);
608 i->setSrc(1, y);
609 i->setSrc(2, layer);
610 i->setSrc(3, i->getSrc(4));
611 i->setSrc(4, NULL);
612 }
613 }
614
615 // texel offsets are 3 immediate fields in the instruction,
616 // nv50 cannot do textureGatherOffsets
617 assert(i->tex.useOffsets <= 1);
618
619 return true;
620 }
621
622 // Bias must be equal for all threads of a quad or lod calculation will fail.
623 //
624 // The lanes of a quad are grouped by the bit in the condition register they
625 // have set, which is selected by differing bias values.
626 // Move the input values for TEX into a new register set for each group and
627 // execute TEX only for a specific group.
628 // We always need to use 4 new registers for the inputs/outputs because the
629 // implicitly calculated derivatives must be correct.
630 //
631 // TODO: move to SSA phase so we can easily determine whether bias is constant
632 bool
633 NV50LoweringPreSSA::handleTXB(TexInstruction *i)
634 {
635 const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
636 int l, d;
637
638 handleTEX(i);
639 Value *bias = i->getSrc(i->tex.target.getArgCount());
640 if (bias->isUniform())
641 return true;
642
643 Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
644 bld.loadImm(NULL, 1));
645 bld.setPosition(cond, false);
646
647 for (l = 1; l < 4; ++l) {
648 const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
649 Value *bit = bld.getSSA();
650 Value *pred = bld.getScratch(1, FILE_FLAGS);
651 Value *imm = bld.loadImm(NULL, (1 << l));
652 bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
653 bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
654 cond->setSrc(l, bit);
655 }
656 Value *flags = bld.getScratch(1, FILE_FLAGS);
657 bld.setPosition(cond, true);
658 bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
659
660 Instruction *tex[4];
661 for (l = 0; l < 4; ++l) {
662 (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
663 bld.insert(tex[l]);
664 }
665
666 Value *res[4][4];
667 for (d = 0; i->defExists(d); ++d)
668 res[0][d] = tex[0]->getDef(d);
669 for (l = 1; l < 4; ++l) {
670 for (d = 0; tex[l]->defExists(d); ++d) {
671 res[l][d] = cloneShallow(func, res[0][d]);
672 bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
673 }
674 }
675
676 for (d = 0; i->defExists(d); ++d) {
677 Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
678 for (l = 0; l < 4; ++l)
679 dst->setSrc(l, res[l][d]);
680 }
681 delete_Instruction(prog, i);
682 return true;
683 }
684
685 // LOD must be equal for all threads of a quad.
686 // Unlike with TXB, here we can just diverge since there's no LOD calculation
687 // that would require all 4 threads' sources to be set up properly.
688 bool
689 NV50LoweringPreSSA::handleTXL(TexInstruction *i)
690 {
691 handleTEX(i);
692 Value *lod = i->getSrc(i->tex.target.getArgCount());
693 if (lod->isUniform())
694 return true;
695
696 BasicBlock *currBB = i->bb;
697 BasicBlock *texiBB = i->bb->splitBefore(i, false);
698 BasicBlock *joinBB = i->bb->splitAfter(i);
699
700 currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
701
702 for (int l = 0; l <= 3; ++l) {
703 const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
704 Value *pred = bld.getScratch(1, FILE_FLAGS);
705 bld.setPosition(currBB, true);
706 bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
707 bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
708 currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
709 if (l <= 2) {
710 BasicBlock *laneBB = new BasicBlock(func);
711 currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
712 currBB = laneBB;
713 }
714 }
715 bld.setPosition(joinBB, false);
716 bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
717 return true;
718 }
719
720 bool
721 NV50LoweringPreSSA::handleTXD(TexInstruction *i)
722 {
723 static const uint8_t qOps[4][2] =
724 {
725 { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0
726 { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1
727 { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
728 { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
729 };
730 Value *def[4][4];
731 Value *crd[3];
732 Instruction *tex;
733 Value *zero = bld.loadImm(bld.getSSA(), 0);
734 int l, c;
735 const int dim = i->tex.target.getDim();
736
737 handleTEX(i);
738 i->op = OP_TEX; // no need to clone dPdx/dPdy later
739
740 for (c = 0; c < dim; ++c)
741 crd[c] = bld.getScratch();
742
743 bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
744 for (l = 0; l < 4; ++l) {
745 // mov coordinates from lane l to all lanes
746 for (c = 0; c < dim; ++c)
747 bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
748 // add dPdx from lane l to lanes dx
749 for (c = 0; c < dim; ++c)
750 bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
751 // add dPdy from lane l to lanes dy
752 for (c = 0; c < dim; ++c)
753 bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
754 // texture
755 bld.insert(tex = cloneForward(func, i));
756 for (c = 0; c < dim; ++c)
757 tex->setSrc(c, crd[c]);
758 // save results
759 for (c = 0; i->defExists(c); ++c) {
760 Instruction *mov;
761 def[c][l] = bld.getSSA();
762 mov = bld.mkMov(def[c][l], tex->getDef(c));
763 mov->fixed = 1;
764 mov->lanes = 1 << l;
765 }
766 }
767 bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
768
769 for (c = 0; i->defExists(c); ++c) {
770 Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
771 for (l = 0; l < 4; ++l)
772 u->setSrc(l, def[c][l]);
773 }
774
775 i->bb->remove(i);
776 return true;
777 }
778
779 bool
780 NV50LoweringPreSSA::handleSET(Instruction *i)
781 {
782 if (i->dType == TYPE_F32) {
783 bld.setPosition(i, true);
784 i->dType = TYPE_U32;
785 bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
786 bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
787 }
788 return true;
789 }
790
791 bool
792 NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
793 {
794 Value *src0 = bld.getSSA();
795 Value *src1 = bld.getSSA();
796 Value *pred = bld.getScratch(1, FILE_FLAGS);
797
798 Value *v0 = i->getSrc(0);
799 Value *v1 = i->getSrc(1);
800 // XXX: these probably shouldn't be immediates in the first place ...
801 if (v0->asImm())
802 v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
803 if (v1->asImm())
804 v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
805
806 bld.setPosition(i, true);
807 bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
808 bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
809 bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
810
811 bld.setPosition(i, false);
812 i->op = OP_SET;
813 i->setFlagsDef(0, pred);
814 i->dType = TYPE_U8;
815 i->setSrc(0, i->getSrc(2));
816 i->setSrc(2, NULL);
817 i->setSrc(1, bld.loadImm(NULL, 0));
818
819 return true;
820 }
821
822 bool
823 NV50LoweringPreSSA::handleSELP(Instruction *i)
824 {
825 Value *src0 = bld.getSSA();
826 Value *src1 = bld.getSSA();
827
828 Value *v0 = i->getSrc(0);
829 Value *v1 = i->getSrc(1);
830 if (v0->asImm())
831 v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
832 if (v1->asImm())
833 v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
834
835 bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
836 bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
837 bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
838 delete_Instruction(prog, i);
839 return true;
840 }
841
842 bool
843 NV50LoweringPreSSA::handleWRSV(Instruction *i)
844 {
845 Symbol *sym = i->getSrc(0)->asSym();
846
847 // these are all shader outputs, $sreg are not writeable
848 uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
849 if (addr >= 0x400)
850 return false;
851 sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
852
853 bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
854
855 bld.getBB()->remove(i);
856 return true;
857 }
858
859 bool
860 NV50LoweringPreSSA::handleCALL(Instruction *i)
861 {
862 if (prog->getType() == Program::TYPE_COMPUTE) {
863 // Add implicit "thread id" argument in $r0 to the function
864 i->setSrc(i->srcCount(), tid);
865 }
866 return true;
867 }
868
869 bool
870 NV50LoweringPreSSA::handlePRECONT(Instruction *i)
871 {
872 delete_Instruction(prog, i);
873 return true;
874 }
875
876 bool
877 NV50LoweringPreSSA::handleCONT(Instruction *i)
878 {
879 i->op = OP_BRA;
880 return true;
881 }
882
883 bool
884 NV50LoweringPreSSA::handleRDSV(Instruction *i)
885 {
886 Symbol *sym = i->getSrc(0)->asSym();
887 uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
888 Value *def = i->getDef(0);
889 SVSemantic sv = sym->reg.data.sv.sv;
890 int idx = sym->reg.data.sv.index;
891
892 if (addr >= 0x400) // mov $sreg
893 return true;
894
895 switch (sv) {
896 case SV_POSITION:
897 assert(prog->getType() == Program::TYPE_FRAGMENT);
898 bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
899 break;
900 case SV_FACE:
901 bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
902 if (i->dType == TYPE_F32) {
903 bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
904 bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
905 }
906 break;
907 case SV_NCTAID:
908 case SV_CTAID:
909 case SV_NTID:
910 if ((sv == SV_NCTAID && idx >= 2) ||
911 (sv == SV_NTID && idx >= 3)) {
912 bld.mkMov(def, bld.mkImm(1));
913 } else if (sv == SV_CTAID && idx >= 2) {
914 bld.mkMov(def, bld.mkImm(0));
915 } else {
916 Value *x = bld.getSSA(2);
917 bld.mkOp1(OP_LOAD, TYPE_U16, x,
918 bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
919 bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
920 }
921 break;
922 case SV_TID:
923 if (idx == 0) {
924 bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
925 } else if (idx == 1) {
926 bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
927 bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
928 } else if (idx == 2) {
929 bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
930 } else {
931 bld.mkMov(def, bld.mkImm(0));
932 }
933 break;
934 default:
935 bld.mkFetch(i->getDef(0), i->dType,
936 FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
937 break;
938 }
939 bld.getBB()->remove(i);
940 return true;
941 }
942
943 bool
944 NV50LoweringPreSSA::handleMUL(Instruction *i)
945 {
946 if (!isFloatType(i->dType) && typeSizeof(i->sType) > 2)
947 return expandIntegerMUL(&bld, i);
948 return true;
949 }
950
951 bool
952 NV50LoweringPreSSA::handleDIV(Instruction *i)
953 {
954 if (!isFloatType(i->dType))
955 return true;
956 bld.setPosition(i, false);
957 Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
958 i->op = OP_MUL;
959 i->setSrc(1, rcp->getDef(0));
960 return true;
961 }
962
963 bool
964 NV50LoweringPreSSA::handleSQRT(Instruction *i)
965 {
966 Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
967 bld.getSSA(), i->getSrc(0));
968 i->op = OP_MUL;
969 i->setSrc(1, rsq->getDef(0));
970
971 return true;
972 }
973
974 bool
975 NV50LoweringPreSSA::handlePOW(Instruction *i)
976 {
977 LValue *val = bld.getScratch();
978
979 bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
980 bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
981 bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
982
983 i->op = OP_EX2;
984 i->setSrc(0, val);
985 i->setSrc(1, NULL);
986
987 return true;
988 }
989
990 bool
991 NV50LoweringPreSSA::handleEXPORT(Instruction *i)
992 {
993 if (prog->getType() == Program::TYPE_FRAGMENT) {
994 if (i->getIndirect(0, 0)) {
995 // TODO: redirect to l[] here, load to GPRs at exit
996 return false;
997 } else {
998 int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
999
1000 i->op = OP_MOV;
1001 i->src(0).set(i->src(1));
1002 i->setSrc(1, NULL);
1003 i->setDef(0, new_LValue(func, FILE_GPR));
1004 i->getDef(0)->reg.data.id = id;
1005
1006 prog->maxGPR = MAX2(prog->maxGPR, id);
1007 }
1008 }
1009 return true;
1010 }
1011
1012 // Set flags according to predicate and make the instruction read $cX.
1013 void
1014 NV50LoweringPreSSA::checkPredicate(Instruction *insn)
1015 {
1016 Value *pred = insn->getPredicate();
1017 Value *cdst;
1018
1019 if (!pred || pred->reg.file == FILE_FLAGS)
1020 return;
1021 cdst = bld.getSSA(1, FILE_FLAGS);
1022
1023 bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, cdst, bld.loadImm(NULL, 0), pred);
1024
1025 insn->setPredicate(insn->cc, cdst);
1026 }
1027
1028 //
1029 // - add quadop dance for texturing
1030 // - put FP outputs in GPRs
1031 // - convert instruction sequences
1032 //
1033 bool
1034 NV50LoweringPreSSA::visit(Instruction *i)
1035 {
1036 if (i->prev)
1037 bld.setPosition(i->prev, true);
1038 else
1039 if (i->next)
1040 bld.setPosition(i->next, false);
1041 else
1042 bld.setPosition(i->bb, true);
1043
1044 if (i->cc != CC_ALWAYS)
1045 checkPredicate(i);
1046
1047 switch (i->op) {
1048 case OP_TEX:
1049 case OP_TXF:
1050 case OP_TXG:
1051 return handleTEX(i->asTex());
1052 case OP_TXB:
1053 return handleTXB(i->asTex());
1054 case OP_TXL:
1055 return handleTXL(i->asTex());
1056 case OP_TXD:
1057 return handleTXD(i->asTex());
1058 case OP_EX2:
1059 bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
1060 i->setSrc(0, i->getDef(0));
1061 break;
1062 case OP_SET:
1063 return handleSET(i);
1064 case OP_SLCT:
1065 return handleSLCT(i->asCmp());
1066 case OP_SELP:
1067 return handleSELP(i);
1068 case OP_POW:
1069 return handlePOW(i);
1070 case OP_MUL:
1071 return handleMUL(i);
1072 case OP_DIV:
1073 return handleDIV(i);
1074 case OP_SQRT:
1075 return handleSQRT(i);
1076 case OP_EXPORT:
1077 return handleEXPORT(i);
1078 case OP_RDSV:
1079 return handleRDSV(i);
1080 case OP_WRSV:
1081 return handleWRSV(i);
1082 case OP_CALL:
1083 return handleCALL(i);
1084 case OP_PRECONT:
1085 return handlePRECONT(i);
1086 case OP_CONT:
1087 return handleCONT(i);
1088 default:
1089 break;
1090 }
1091 return true;
1092 }
1093
1094 bool
1095 TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
1096 {
1097 bool ret = false;
1098
1099 if (stage == CG_STAGE_PRE_SSA) {
1100 NV50LoweringPreSSA pass(prog);
1101 ret = pass.run(prog, false, true);
1102 } else
1103 if (stage == CG_STAGE_SSA) {
1104 if (!prog->targetPriv)
1105 prog->targetPriv = new std::list<Instruction *>();
1106 NV50LegalizeSSA pass(prog);
1107 ret = pass.run(prog, false, true);
1108 } else
1109 if (stage == CG_STAGE_POST_RA) {
1110 NV50LegalizePostRA pass;
1111 ret = pass.run(prog, false, true);
1112 if (prog->targetPriv)
1113 delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
1114 }
1115 return ret;
1116 }
1117
1118 } // namespace nv50_ir