nv50/ir: fix cutoff for using r63 vs r127 when replacing zero
[mesa.git] / src / gallium / drivers / nouveau / codegen / nv50_ir_lowering_nv50.cpp
1 /*
2 * Copyright 2011 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_build_util.h"
25
26 #include "codegen/nv50_ir_target_nv50.h"
27
28 namespace nv50_ir {
29
30 // nv50 doesn't support 32 bit integer multiplication
31 //
32 // ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
33 // -------------------
34 // al*bh 00 HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
35 // ah*bh 00 00 ( carry1) << 16 + ( carry2)
36 // al*bl
37 // ah*bl 00
38 //
39 // fffe0001 + fffe0001
40 //
41 // Note that this sort of splitting doesn't work for signed values, so we
42 // compute the sign on those manually and then perform an unsigned multiply.
43 static bool
44 expandIntegerMUL(BuildUtil *bld, Instruction *mul)
45 {
46 const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
47
48 DataType fTy; // full type
49 switch (mul->sType) {
50 case TYPE_S32: fTy = TYPE_U32; break;
51 case TYPE_S64: fTy = TYPE_U64; break;
52 default: fTy = mul->sType; break;
53 }
54
55 DataType hTy; // half type
56 switch (fTy) {
57 case TYPE_U32: hTy = TYPE_U16; break;
58 case TYPE_U64: hTy = TYPE_U32; break;
59 default:
60 return false;
61 }
62 unsigned int fullSize = typeSizeof(fTy);
63 unsigned int halfSize = typeSizeof(hTy);
64
65 Instruction *i[9];
66
67 bld->setPosition(mul, true);
68
69 Value *s[2];
70 Value *a[2], *b[2];
71 Value *t[4];
72 for (int j = 0; j < 4; ++j)
73 t[j] = bld->getSSA(fullSize);
74
75 s[0] = mul->getSrc(0);
76 s[1] = mul->getSrc(1);
77
78 if (isSignedType(mul->sType) && highResult) {
79 s[0] = bld->getSSA(fullSize);
80 s[1] = bld->getSSA(fullSize);
81 bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
82 bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
83 }
84
85 // split sources into halves
86 i[0] = bld->mkSplit(a, halfSize, s[0]);
87 i[1] = bld->mkSplit(b, halfSize, s[1]);
88
89 i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
90 i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
91 i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
92 i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
93
94 if (highResult) {
95 Value *c[2];
96 Value *r[5];
97 Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
98 c[0] = bld->getSSA(1, FILE_FLAGS);
99 c[1] = bld->getSSA(1, FILE_FLAGS);
100 for (int j = 0; j < 5; ++j)
101 r[j] = bld->getSSA(fullSize);
102
103 i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
104 i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
105 bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
106 bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
107 i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);
108
109 // set carry defs / sources
110 i[3]->setFlagsDef(1, c[0]);
111 // actual result required in negative case, but ignored for
112 // unsigned. for some reason the compiler ends up dropping the whole
113 // instruction if the destination is unused but the flags are.
114 if (isSignedType(mul->sType))
115 i[4]->setFlagsDef(1, c[1]);
116 else
117 i[4]->setFlagsDef(0, c[1]);
118 i[6]->setPredicate(CC_C, c[0]);
119 i[5]->setFlagsSrc(3, c[1]);
120
121 if (isSignedType(mul->sType)) {
122 Value *cc[2];
123 Value *rr[7];
124 Value *one = bld->getSSA(fullSize);
125 bld->loadImm(one, 1);
126 for (int j = 0; j < 7; j++)
127 rr[j] = bld->getSSA(fullSize);
128
129 // NOTE: this logic uses predicates because splitting basic blocks is
130 // ~impossible during the SSA phase. The RA relies on a correlation
131 // between edge order and phi node sources.
132
133 // Set the sign of the result based on the inputs
134 bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
135 ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));
136
137 // 1s complement of 64-bit value
138 bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
139 ->setPredicate(CC_S, cc[0]);
140 bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
141 ->setPredicate(CC_S, cc[0]);
142
143 // add to low 32-bits, keep track of the carry
144 Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
145 n->setPredicate(CC_S, cc[0]);
146 n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));
147
148 // If there was a carry, add 1 to the upper 32 bits
149 // XXX: These get executed even if they shouldn't be
150 bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
151 ->setPredicate(CC_C, cc[1]);
152 bld->mkMov(rr[3], rr[0])
153 ->setPredicate(CC_NC, cc[1]);
154 bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);
155
156 // Merge the results from the negative and non-negative paths
157 bld->mkMov(rr[5], rr[4])
158 ->setPredicate(CC_S, cc[0]);
159 bld->mkMov(rr[6], r[4])
160 ->setPredicate(CC_NS, cc[0]);
161 bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
162 } else {
163 bld->mkMov(mul->getDef(0), r[4]);
164 }
165 } else {
166 bld->mkMov(mul->getDef(0), t[3]);
167 }
168 delete_Instruction(bld->getProgram(), mul);
169
170 for (int j = 2; j <= (highResult ? 5 : 4); ++j)
171 if (i[j])
172 i[j]->sType = hTy;
173
174 return true;
175 }
176
// 2-bit operation selectors used to build a quad operation byte.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Packs four selectors (given by their QOP_ suffix) into one byte, two bits
// each, most significant pair first. Argument order:
//             UL UR LL LR
#define QUADOP(q, r, s, t)                    \
   (((QOP_##q) << 6) | ((QOP_##r) << 4) |     \
    ((QOP_##s) << 2) | ((QOP_##t) << 0))
186
187 class NV50LegalizePostRA : public Pass
188 {
189 private:
190 virtual bool visit(Function *);
191 virtual bool visit(BasicBlock *);
192
193 void handlePRERET(FlowInstruction *);
194 void replaceZero(Instruction *);
195
196 LValue *r63;
197 };
198
199 bool
200 NV50LegalizePostRA::visit(Function *fn)
201 {
202 Program *prog = fn->getProgram();
203
204 r63 = new_LValue(fn, FILE_GPR);
205 // GPR units on nv50 are in half-regs
206 if (prog->maxGPR < 126)
207 r63->reg.data.id = 63;
208 else
209 r63->reg.data.id = 127;
210
211 // this is actually per-program, but we can do it all on visiting main()
212 std::list<Instruction *> *outWrites =
213 reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
214
215 if (outWrites) {
216 for (std::list<Instruction *>::iterator it = outWrites->begin();
217 it != outWrites->end(); ++it)
218 (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
219 // instructions will be deleted on exit
220 outWrites->clear();
221 }
222
223 return true;
224 }
225
226 void
227 NV50LegalizePostRA::replaceZero(Instruction *i)
228 {
229 for (int s = 0; i->srcExists(s); ++s) {
230 ImmediateValue *imm = i->getSrc(s)->asImm();
231 if (imm && imm->reg.data.u64 == 0)
232 i->setSrc(s, r63);
233 }
234 }
235
236 // Emulate PRERET: jump to the target and call to the origin from there
237 //
238 // WARNING: atm only works if BBs are affected by at most a single PRERET
239 //
240 // BB:0
241 // preret BB:3
242 // (...)
243 // BB:3
244 // (...)
245 // --->
246 // BB:0
247 // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
248 // (...)
249 // BB:3
250 // bra BB:3 + n1 (skip the call)
251 // call BB:0 + n2 (skip bra at beginning of BB:0)
252 // (...)
253 void
254 NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
255 {
256 BasicBlock *bbE = pre->bb;
257 BasicBlock *bbT = pre->target.bb;
258
259 pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
260 bbE->remove(pre);
261 bbE->insertHead(pre);
262
263 Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
264 Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
265
266 bbT->insertHead(call);
267 bbT->insertHead(skip);
268
269 // NOTE: maybe split blocks to prevent the instructions from moving ?
270
271 skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
272 call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
273 }
274
275 bool
276 NV50LegalizePostRA::visit(BasicBlock *bb)
277 {
278 Instruction *i, *next;
279
280 // remove pseudo operations and non-fixed no-ops, split 64 bit operations
281 for (i = bb->getFirst(); i; i = next) {
282 next = i->next;
283 if (i->isNop()) {
284 bb->remove(i);
285 } else
286 if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
287 handlePRERET(i->asFlow());
288 } else {
289 // TODO: We will want to do this before register allocation,
290 // since have to use a $c register for the carry flag.
291 if (typeSizeof(i->dType) == 8) {
292 Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
293 if (hi)
294 next = hi;
295 }
296
297 if (i->op != OP_PFETCH && i->op != OP_BAR &&
298 (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
299 replaceZero(i);
300 }
301 }
302 if (!bb->getEntry())
303 return true;
304
305 return true;
306 }
307
308 class NV50LegalizeSSA : public Pass
309 {
310 public:
311 NV50LegalizeSSA(Program *);
312
313 virtual bool visit(BasicBlock *bb);
314
315 private:
316 void propagateWriteToOutput(Instruction *);
317 void handleDIV(Instruction *);
318 void handleMOD(Instruction *);
319 void handleMUL(Instruction *);
320 void handleAddrDef(Instruction *);
321
322 inline bool isARL(const Instruction *) const;
323
324 BuildUtil bld;
325
326 std::list<Instruction *> *outWrites;
327 };
328
329 NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
330 {
331 bld.setProgram(prog);
332
333 if (prog->optLevel >= 2 &&
334 (prog->getType() == Program::TYPE_GEOMETRY ||
335 prog->getType() == Program::TYPE_VERTEX))
336 outWrites =
337 reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
338 else
339 outWrites = NULL;
340 }
341
342 void
343 NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
344 {
345 if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
346 return;
347
348 // check def instruction can store
349 Instruction *di = st->getSrc(1)->defs.front()->getInsn();
350
351 // TODO: move exports (if beneficial) in common opt pass
352 if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
353 return;
354
355 for (int s = 0; di->srcExists(s); ++s)
356 if (di->src(s).getFile() == FILE_IMMEDIATE)
357 return;
358
359 if (prog->getType() == Program::TYPE_GEOMETRY) {
360 // Only propagate output writes in geometry shaders when we can be sure
361 // that we are propagating to the same output vertex.
362 if (di->bb != st->bb)
363 return;
364 Instruction *i;
365 for (i = di; i != st; i = i->next) {
366 if (i->op == OP_EMIT || i->op == OP_RESTART)
367 return;
368 }
369 assert(i); // st after di
370 }
371
372 // We cannot set defs to non-lvalues before register allocation, so
373 // save & remove (to save registers) the exports and replace later.
374 outWrites->push_back(st);
375 st->bb->remove(st);
376 }
377
378 bool
379 NV50LegalizeSSA::isARL(const Instruction *i) const
380 {
381 ImmediateValue imm;
382
383 if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
384 return false;
385 if (!i->src(1).getImmediate(imm))
386 return false;
387 return imm.isInteger(0);
388 }
389
390 void
391 NV50LegalizeSSA::handleAddrDef(Instruction *i)
392 {
393 Instruction *arl;
394
395 i->getDef(0)->reg.size = 2; // $aX are only 16 bit
396
397 // PFETCH can always write to $a
398 if (i->op == OP_PFETCH)
399 return;
400 // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
401 if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
402 if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
403 return;
404 if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
405 return;
406 }
407
408 // turn $a sources into $r sources (can't operate on $a)
409 for (int s = 0; i->srcExists(s); ++s) {
410 Value *a = i->getSrc(s);
411 Value *r;
412 if (a->reg.file == FILE_ADDRESS) {
413 if (a->getInsn() && isARL(a->getInsn())) {
414 i->setSrc(s, a->getInsn()->getSrc(0));
415 } else {
416 bld.setPosition(i, false);
417 r = bld.getSSA();
418 bld.mkMov(r, a);
419 i->setSrc(s, r);
420 }
421 }
422 }
423 if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
424 return;
425
426 // turn result back into $a
427 bld.setPosition(i, true);
428 arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
429 i->setDef(0, arl->getSrc(0));
430 }
431
432 void
433 NV50LegalizeSSA::handleMUL(Instruction *mul)
434 {
435 if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
436 return;
437 Value *def = mul->getDef(0);
438 Value *pred = mul->getPredicate();
439 CondCode cc = mul->cc;
440 if (pred)
441 mul->setPredicate(CC_ALWAYS, NULL);
442
443 if (mul->op == OP_MAD) {
444 Instruction *add = mul;
445 bld.setPosition(add, false);
446 Value *res = cloneShallow(func, mul->getDef(0));
447 mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
448 add->op = OP_ADD;
449 add->setSrc(0, mul->getDef(0));
450 add->setSrc(1, add->getSrc(2));
451 for (int s = 2; add->srcExists(s); ++s)
452 add->setSrc(s, NULL);
453 mul->subOp = add->subOp;
454 add->subOp = 0;
455 }
456 expandIntegerMUL(&bld, mul);
457 if (pred)
458 def->getInsn()->setPredicate(cc, pred);
459 }
460
461 // Use f32 division: first compute an approximate result, use it to reduce
462 // the dividend, which should then be representable as f32, divide the reduced
463 // dividend, and add the quotients.
464 void
465 NV50LegalizeSSA::handleDIV(Instruction *div)
466 {
467 const DataType ty = div->sType;
468
469 if (ty != TYPE_U32 && ty != TYPE_S32)
470 return;
471
472 Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
473
474 bld.setPosition(div, false);
475
476 Value *a, *af = bld.getSSA();
477 Value *b, *bf = bld.getSSA();
478
479 bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
480 bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
481
482 if (isSignedType(ty)) {
483 af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
484 bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
485 a = bld.getSSA();
486 b = bld.getSSA();
487 bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
488 bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
489 } else {
490 a = div->getSrc(0);
491 b = div->getSrc(1);
492 }
493
494 bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
495 bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
496
497 bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
498 bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
499
500 // get error of 1st result
501 expandIntegerMUL(&bld,
502 bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
503 bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
504
505 bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
506
507 bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
508 bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
509 ->rnd = ROUND_Z;
510 bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
511
512 // correction: if modulus >= divisor, add 1
513 expandIntegerMUL(&bld,
514 bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
515 bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
516 bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
517 if (!isSignedType(ty)) {
518 div->op = OP_SUB;
519 div->setSrc(0, q);
520 div->setSrc(1, s);
521 } else {
522 t = q;
523 bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
524 s = bld.getSSA();
525 t = bld.getSSA();
526 // fix the sign
527 bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
528 ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
529 bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
530 bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
531
532 div->op = OP_UNION;
533 div->setSrc(0, s);
534 div->setSrc(1, t);
535 }
536 }
537
538 void
539 NV50LegalizeSSA::handleMOD(Instruction *mod)
540 {
541 if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
542 return;
543 bld.setPosition(mod, false);
544
545 Value *q = bld.getSSA();
546 Value *m = bld.getSSA();
547
548 bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
549 handleDIV(q->getInsn());
550
551 bld.setPosition(mod, false);
552 expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
553
554 mod->op = OP_SUB;
555 mod->setSrc(1, m);
556 }
557
558 bool
559 NV50LegalizeSSA::visit(BasicBlock *bb)
560 {
561 Instruction *insn, *next;
562 // skipping PHIs (don't pass them to handleAddrDef) !
563 for (insn = bb->getEntry(); insn; insn = next) {
564 next = insn->next;
565
566 if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
567 handleAddrDef(insn);
568
569 switch (insn->op) {
570 case OP_EXPORT:
571 if (outWrites)
572 propagateWriteToOutput(insn);
573 break;
574 case OP_DIV:
575 handleDIV(insn);
576 break;
577 case OP_MOD:
578 handleMOD(insn);
579 break;
580 case OP_MAD:
581 case OP_MUL:
582 handleMUL(insn);
583 break;
584 default:
585 break;
586 }
587 }
588 return true;
589 }
590
591 class NV50LoweringPreSSA : public Pass
592 {
593 public:
594 NV50LoweringPreSSA(Program *);
595
596 private:
597 virtual bool visit(Instruction *);
598 virtual bool visit(Function *);
599
600 bool handleRDSV(Instruction *);
601 bool handleWRSV(Instruction *);
602
603 bool handlePFETCH(Instruction *);
604 bool handleEXPORT(Instruction *);
605 bool handleLOAD(Instruction *);
606
607 bool handleDIV(Instruction *);
608 bool handleSQRT(Instruction *);
609 bool handlePOW(Instruction *);
610
611 bool handleSET(Instruction *);
612 bool handleSLCT(CmpInstruction *);
613 bool handleSELP(Instruction *);
614
615 bool handleTEX(TexInstruction *);
616 bool handleTXB(TexInstruction *); // I really
617 bool handleTXL(TexInstruction *); // hate
618 bool handleTXD(TexInstruction *); // these 3
619 bool handleTXLQ(TexInstruction *);
620 bool handleTXQ(TexInstruction *);
621
622 bool handleCALL(Instruction *);
623 bool handlePRECONT(Instruction *);
624 bool handleCONT(Instruction *);
625
626 void checkPredicate(Instruction *);
627 void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
628 void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);
629
630 private:
631 const Target *const targ;
632
633 BuildUtil bld;
634
635 Value *tid;
636 };
637
638 NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
639 targ(prog->getTarget()), tid(NULL)
640 {
641 bld.setProgram(prog);
642 }
643
644 bool
645 NV50LoweringPreSSA::visit(Function *f)
646 {
647 BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
648
649 if (prog->getType() == Program::TYPE_COMPUTE) {
650 // Add implicit "thread id" argument in $r0 to the function
651 Value *arg = new_LValue(func, FILE_GPR);
652 arg->reg.data.id = 0;
653 f->ins.push_back(arg);
654
655 bld.setPosition(root, false);
656 tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
657 }
658
659 return true;
660 }
661
662 void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
663 Value **ms_x, Value **ms_y) {
664 // This loads the texture-indexed ms setting from the constant buffer
665 Value *tmp = new_LValue(func, FILE_GPR);
666 uint8_t b = prog->driver->io.resInfoCBSlot;
667 off += prog->driver->io.suInfoBase;
668 if (prog->getType() > Program::TYPE_VERTEX)
669 off += 16 * 2 * 4;
670 if (prog->getType() > Program::TYPE_GEOMETRY)
671 off += 16 * 2 * 4;
672 *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
673 FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
674 *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
675 FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
676 *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);
677 }
678
679 void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) {
680 // Given a MS level, and a sample id, compute the delta x/y
681 uint8_t b = prog->driver->io.msInfoCBSlot;
682 Value *off = new_LValue(func, FILE_ADDRESS), *t = new_LValue(func, FILE_GPR);
683
684 // The required information is at mslevel * 16 * 4 + sample * 8
685 // = (mslevel * 8 + sample) * 8
686 bld.mkOp2(OP_SHL,
687 TYPE_U32,
688 off,
689 bld.mkOp2v(OP_ADD, TYPE_U32, t,
690 bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)),
691 s),
692 bld.mkImm(3));
693 *dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
694 FILE_MEMORY_CONST, b, TYPE_U32,
695 prog->driver->io.msInfoBase), off);
696 *dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
697 FILE_MEMORY_CONST, b, TYPE_U32,
698 prog->driver->io.msInfoBase + 4), off);
699 }
700
701 bool
702 NV50LoweringPreSSA::handleTEX(TexInstruction *i)
703 {
704 const int arg = i->tex.target.getArgCount();
705 const int dref = arg;
706 const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
707
708 // handle MS, which means looking up the MS params for this texture, and
709 // adjusting the input coordinates to point at the right sample.
710 if (i->tex.target.isMS()) {
711 Value *x = i->getSrc(0);
712 Value *y = i->getSrc(1);
713 Value *s = i->getSrc(arg - 1);
714 Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),
715 *ms, *ms_x, *ms_y, *dx, *dy;
716
717 i->tex.target.clearMS();
718
719 loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
720 loadMsInfo(ms, s, &dx, &dy);
721
722 bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
723 bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
724 bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
725 bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
726 i->setSrc(0, tx);
727 i->setSrc(1, ty);
728 i->setSrc(arg - 1, bld.loadImm(NULL, 0));
729 }
730
731 // dref comes before bias/lod
732 if (i->tex.target.isShadow())
733 if (i->op == OP_TXB || i->op == OP_TXL)
734 i->swapSources(dref, lod);
735
736 if (i->tex.target.isArray()) {
737 if (i->op != OP_TXF) {
738 // array index must be converted to u32, but it's already an integer
739 // for TXF
740 Value *layer = i->getSrc(arg - 1);
741 LValue *src = new_LValue(func, FILE_GPR);
742 bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
743 bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
744 i->setSrc(arg - 1, src);
745 }
746 if (i->tex.target.isCube() && i->srcCount() > 4) {
747 std::vector<Value *> acube, a2d;
748 int c;
749
750 acube.resize(4);
751 for (c = 0; c < 4; ++c)
752 acube[c] = i->getSrc(c);
753 a2d.resize(4);
754 for (c = 0; c < 3; ++c)
755 a2d[c] = new_LValue(func, FILE_GPR);
756 a2d[3] = NULL;
757
758 bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
759 a2d, acube)->asTex()->tex.mask = 0x7;
760
761 for (c = 0; c < 3; ++c)
762 i->setSrc(c, a2d[c]);
763 for (; i->srcExists(c + 1); ++c)
764 i->setSrc(c, i->getSrc(c + 1));
765 i->setSrc(c, NULL);
766 assert(c <= 4);
767
768 i->tex.target = i->tex.target.isShadow() ?
769 TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
770 }
771 }
772
773 // texel offsets are 3 immediate fields in the instruction,
774 // nv50 cannot do textureGatherOffsets
775 assert(i->tex.useOffsets <= 1);
776 if (i->tex.useOffsets) {
777 for (int c = 0; c < 3; ++c) {
778 ImmediateValue val;
779 if (!i->offset[0][c].getImmediate(val))
780 assert(!"non-immediate offset");
781 i->tex.offset[c] = val.reg.data.u32;
782 i->offset[0][c].set(NULL);
783 }
784 }
785
786 return true;
787 }
788
789 // Bias must be equal for all threads of a quad or lod calculation will fail.
790 //
791 // The lanes of a quad are grouped by the bit in the condition register they
792 // have set, which is selected by differing bias values.
793 // Move the input values for TEX into a new register set for each group and
794 // execute TEX only for a specific group.
795 // We always need to use 4 new registers for the inputs/outputs because the
796 // implicitly calculated derivatives must be correct.
797 //
798 // TODO: move to SSA phase so we can easily determine whether bias is constant
799 bool
800 NV50LoweringPreSSA::handleTXB(TexInstruction *i)
801 {
802 const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
803 int l, d;
804
805 // We can't actually apply bias *and* do a compare for a cube
806 // texture. Since the compare has to be done before the filtering, just
807 // drop the bias on the floor.
808 if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {
809 i->op = OP_TEX;
810 i->setSrc(3, i->getSrc(4));
811 i->setSrc(4, NULL);
812 return handleTEX(i);
813 }
814
815 handleTEX(i);
816 Value *bias = i->getSrc(i->tex.target.getArgCount());
817 if (bias->isUniform())
818 return true;
819
820 Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
821 bld.loadImm(NULL, 1));
822 bld.setPosition(cond, false);
823
824 for (l = 1; l < 4; ++l) {
825 const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
826 Value *bit = bld.getSSA();
827 Value *pred = bld.getScratch(1, FILE_FLAGS);
828 Value *imm = bld.loadImm(NULL, (1 << l));
829 bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
830 bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
831 cond->setSrc(l, bit);
832 }
833 Value *flags = bld.getScratch(1, FILE_FLAGS);
834 bld.setPosition(cond, true);
835 bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;
836
837 Instruction *tex[4];
838 for (l = 0; l < 4; ++l) {
839 (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
840 bld.insert(tex[l]);
841 }
842
843 Value *res[4][4];
844 for (d = 0; i->defExists(d); ++d)
845 res[0][d] = tex[0]->getDef(d);
846 for (l = 1; l < 4; ++l) {
847 for (d = 0; tex[l]->defExists(d); ++d) {
848 res[l][d] = cloneShallow(func, res[0][d]);
849 bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
850 }
851 }
852
853 for (d = 0; i->defExists(d); ++d) {
854 Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
855 for (l = 0; l < 4; ++l)
856 dst->setSrc(l, res[l][d]);
857 }
858 delete_Instruction(prog, i);
859 return true;
860 }
861
862 // LOD must be equal for all threads of a quad.
863 // Unlike with TXB, here we can just diverge since there's no LOD calculation
864 // that would require all 4 threads' sources to be set up properly.
865 bool
866 NV50LoweringPreSSA::handleTXL(TexInstruction *i)
867 {
868 handleTEX(i);
869 Value *lod = i->getSrc(i->tex.target.getArgCount());
870 if (lod->isUniform())
871 return true;
872
873 BasicBlock *currBB = i->bb;
874 BasicBlock *texiBB = i->bb->splitBefore(i, false);
875 BasicBlock *joinBB = i->bb->splitAfter(i);
876
877 bld.setPosition(currBB, true);
878 assert(!currBB->joinAt);
879 currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
880
881 for (int l = 0; l <= 3; ++l) {
882 const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
883 Value *pred = bld.getScratch(1, FILE_FLAGS);
884 bld.setPosition(currBB, true);
885 bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
886 bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
887 currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
888 if (l <= 2) {
889 BasicBlock *laneBB = new BasicBlock(func);
890 currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
891 currBB = laneBB;
892 }
893 }
894 bld.setPosition(joinBB, false);
895 bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
896 return true;
897 }
898
899 bool
900 NV50LoweringPreSSA::handleTXD(TexInstruction *i)
901 {
902 static const uint8_t qOps[4][2] =
903 {
904 { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0
905 { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1
906 { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
907 { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
908 };
909 Value *def[4][4];
910 Value *crd[3];
911 Instruction *tex;
912 Value *zero = bld.loadImm(bld.getSSA(), 0);
913 int l, c;
914 const int dim = i->tex.target.getDim();
915
916 handleTEX(i);
917 i->op = OP_TEX; // no need to clone dPdx/dPdy later
918
919 for (c = 0; c < dim; ++c)
920 crd[c] = bld.getScratch();
921
922 bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
923 for (l = 0; l < 4; ++l) {
924 // mov coordinates from lane l to all lanes
925 for (c = 0; c < dim; ++c)
926 bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
927 // add dPdx from lane l to lanes dx
928 for (c = 0; c < dim; ++c)
929 bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
930 // add dPdy from lane l to lanes dy
931 for (c = 0; c < dim; ++c)
932 bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
933 // texture
934 bld.insert(tex = cloneForward(func, i));
935 for (c = 0; c < dim; ++c)
936 tex->setSrc(c, crd[c]);
937 // save results
938 for (c = 0; i->defExists(c); ++c) {
939 Instruction *mov;
940 def[c][l] = bld.getSSA();
941 mov = bld.mkMov(def[c][l], tex->getDef(c));
942 mov->fixed = 1;
943 mov->lanes = 1 << l;
944 }
945 }
946 bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
947
948 for (c = 0; i->defExists(c); ++c) {
949 Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
950 for (l = 0; l < 4; ++l)
951 u->setSrc(l, def[c][l]);
952 }
953
954 i->bb->remove(i);
955 return true;
956 }
957
958 bool
959 NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
960 {
961 handleTEX(i);
962 bld.setPosition(i, true);
963
964 /* The returned values are not quite what we want:
965 * (a) convert from s32 to f32
966 * (b) multiply by 1/256
967 */
968 for (int def = 0; def < 2; ++def) {
969 if (!i->defExists(def))
970 continue;
971 bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
972 bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
973 i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
974 }
975 return true;
976 }
977
978 bool
979 NV50LoweringPreSSA::handleTXQ(TexInstruction *i)
980 {
981 Value *ms, *ms_x, *ms_y;
982 if (i->tex.query == TXQ_DIMS)
983 return true;
984 assert(i->tex.query == TXQ_TYPE);
985 assert(i->tex.mask == 4);
986
987 loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
988 bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms);
989 i->bb->remove(i);
990
991 return true;
992 }
993
994
995 bool
996 NV50LoweringPreSSA::handleSET(Instruction *i)
997 {
998 if (i->dType == TYPE_F32) {
999 bld.setPosition(i, true);
1000 i->dType = TYPE_U32;
1001 bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
1002 bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
1003 }
1004 return true;
1005 }
1006
1007 bool
1008 NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
1009 {
1010 Value *src0 = bld.getSSA();
1011 Value *src1 = bld.getSSA();
1012 Value *pred = bld.getScratch(1, FILE_FLAGS);
1013
1014 Value *v0 = i->getSrc(0);
1015 Value *v1 = i->getSrc(1);
1016 // XXX: these probably shouldn't be immediates in the first place ...
1017 if (v0->asImm())
1018 v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1019 if (v1->asImm())
1020 v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1021
1022 bld.setPosition(i, true);
1023 bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
1024 bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
1025 bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1026
1027 bld.setPosition(i, false);
1028 i->op = OP_SET;
1029 i->setFlagsDef(0, pred);
1030 i->dType = TYPE_U8;
1031 i->setSrc(0, i->getSrc(2));
1032 i->setSrc(2, NULL);
1033 i->setSrc(1, bld.loadImm(NULL, 0));
1034
1035 return true;
1036 }
1037
1038 bool
1039 NV50LoweringPreSSA::handleSELP(Instruction *i)
1040 {
1041 Value *src0 = bld.getSSA();
1042 Value *src1 = bld.getSSA();
1043
1044 Value *v0 = i->getSrc(0);
1045 Value *v1 = i->getSrc(1);
1046 if (v0->asImm())
1047 v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1048 if (v1->asImm())
1049 v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1050
1051 bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
1052 bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
1053 bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1054 delete_Instruction(prog, i);
1055 return true;
1056 }
1057
1058 bool
1059 NV50LoweringPreSSA::handleWRSV(Instruction *i)
1060 {
1061 Symbol *sym = i->getSrc(0)->asSym();
1062
1063 // these are all shader outputs, $sreg are not writeable
1064 uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
1065 if (addr >= 0x400)
1066 return false;
1067 sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
1068
1069 bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
1070
1071 bld.getBB()->remove(i);
1072 return true;
1073 }
1074
1075 bool
1076 NV50LoweringPreSSA::handleCALL(Instruction *i)
1077 {
1078 if (prog->getType() == Program::TYPE_COMPUTE) {
1079 // Add implicit "thread id" argument in $r0 to the function
1080 i->setSrc(i->srcCount(), tid);
1081 }
1082 return true;
1083 }
1084
1085 bool
1086 NV50LoweringPreSSA::handlePRECONT(Instruction *i)
1087 {
1088 delete_Instruction(prog, i);
1089 return true;
1090 }
1091
1092 bool
1093 NV50LoweringPreSSA::handleCONT(Instruction *i)
1094 {
1095 i->op = OP_BRA;
1096 return true;
1097 }
1098
1099 bool
1100 NV50LoweringPreSSA::handleRDSV(Instruction *i)
1101 {
1102 Symbol *sym = i->getSrc(0)->asSym();
1103 uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
1104 Value *def = i->getDef(0);
1105 SVSemantic sv = sym->reg.data.sv.sv;
1106 int idx = sym->reg.data.sv.index;
1107
1108 if (addr >= 0x400) // mov $sreg
1109 return true;
1110
1111 switch (sv) {
1112 case SV_POSITION:
1113 assert(prog->getType() == Program::TYPE_FRAGMENT);
1114 bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
1115 break;
1116 case SV_FACE:
1117 bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
1118 if (i->dType == TYPE_F32) {
1119 bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));
1120 bld.mkOp1(OP_NEG, TYPE_S32, def, def);
1121 bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);
1122 }
1123 break;
1124 case SV_NCTAID:
1125 case SV_CTAID:
1126 case SV_NTID:
1127 if ((sv == SV_NCTAID && idx >= 2) ||
1128 (sv == SV_NTID && idx >= 3)) {
1129 bld.mkMov(def, bld.mkImm(1));
1130 } else if (sv == SV_CTAID && idx >= 2) {
1131 bld.mkMov(def, bld.mkImm(0));
1132 } else {
1133 Value *x = bld.getSSA(2);
1134 bld.mkOp1(OP_LOAD, TYPE_U16, x,
1135 bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
1136 bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
1137 }
1138 break;
1139 case SV_TID:
1140 if (idx == 0) {
1141 bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
1142 } else if (idx == 1) {
1143 bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
1144 bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
1145 } else if (idx == 2) {
1146 bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
1147 } else {
1148 bld.mkMov(def, bld.mkImm(0));
1149 }
1150 break;
1151 case SV_SAMPLE_POS: {
1152 Value *off = new_LValue(func, FILE_ADDRESS);
1153 bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
1154 bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
1155 bld.mkLoad(TYPE_F32,
1156 def,
1157 bld.mkSymbol(
1158 FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
1159 TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
1160 off);
1161 break;
1162 }
1163 default:
1164 bld.mkFetch(i->getDef(0), i->dType,
1165 FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
1166 break;
1167 }
1168 bld.getBB()->remove(i);
1169 return true;
1170 }
1171
1172 bool
1173 NV50LoweringPreSSA::handleDIV(Instruction *i)
1174 {
1175 if (!isFloatType(i->dType))
1176 return true;
1177 bld.setPosition(i, false);
1178 Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
1179 i->op = OP_MUL;
1180 i->setSrc(1, rcp->getDef(0));
1181 return true;
1182 }
1183
1184 bool
1185 NV50LoweringPreSSA::handleSQRT(Instruction *i)
1186 {
1187 Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
1188 bld.getSSA(), i->getSrc(0));
1189 i->op = OP_MUL;
1190 i->setSrc(1, rsq->getDef(0));
1191
1192 return true;
1193 }
1194
1195 bool
1196 NV50LoweringPreSSA::handlePOW(Instruction *i)
1197 {
1198 LValue *val = bld.getScratch();
1199
1200 bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
1201 bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
1202 bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
1203
1204 i->op = OP_EX2;
1205 i->setSrc(0, val);
1206 i->setSrc(1, NULL);
1207
1208 return true;
1209 }
1210
1211 bool
1212 NV50LoweringPreSSA::handleEXPORT(Instruction *i)
1213 {
1214 if (prog->getType() == Program::TYPE_FRAGMENT) {
1215 if (i->getIndirect(0, 0)) {
1216 // TODO: redirect to l[] here, load to GPRs at exit
1217 return false;
1218 } else {
1219 int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
1220
1221 i->op = OP_MOV;
1222 i->subOp = NV50_IR_SUBOP_MOV_FINAL;
1223 i->src(0).set(i->src(1));
1224 i->setSrc(1, NULL);
1225 i->setDef(0, new_LValue(func, FILE_GPR));
1226 i->getDef(0)->reg.data.id = id;
1227
1228 prog->maxGPR = MAX2(prog->maxGPR, id);
1229 }
1230 }
1231 return true;
1232 }
1233
1234 // Handle indirect addressing in geometry shaders:
1235 //
1236 // ld $r0 a[$a1][$a2+k] ->
1237 // ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
1238 //
1239 bool
1240 NV50LoweringPreSSA::handleLOAD(Instruction *i)
1241 {
1242 ValueRef src = i->src(0);
1243
1244 if (src.isIndirect(1)) {
1245 assert(prog->getType() == Program::TYPE_GEOMETRY);
1246 Value *addr = i->getIndirect(0, 1);
1247
1248 if (src.isIndirect(0)) {
1249 // base address is in an address register, so move to a GPR
1250 Value *base = bld.getScratch();
1251 bld.mkMov(base, addr);
1252
1253 Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
1254 Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
1255 Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1256 i->getIndirect(0, 0), bld.mkImm(2));
1257
1258 // Calculate final address: addr = base + attr*vstride; use 16-bit
1259 // multiplication since 32-bit would be lowered to multiple
1260 // instructions, and we only need the low 16 bits of the result
1261 Value *a[2], *b[2];
1262 bld.mkSplit(a, 2, attrib);
1263 bld.mkSplit(b, 2, vstride);
1264 Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
1265 base);
1266
1267 // move address from GPR into an address register
1268 addr = bld.getSSA(2, FILE_ADDRESS);
1269 bld.mkMov(addr, sum);
1270 }
1271
1272 i->setIndirect(0, 1, NULL);
1273 i->setIndirect(0, 0, addr);
1274 }
1275
1276 return true;
1277 }
1278
1279 bool
1280 NV50LoweringPreSSA::handlePFETCH(Instruction *i)
1281 {
1282 assert(prog->getType() == Program::TYPE_GEOMETRY);
1283
1284 // NOTE: cannot use getImmediate here, not in SSA form yet, move to
1285 // later phase if that assertion ever triggers:
1286
1287 ImmediateValue *imm = i->getSrc(0)->asImm();
1288 assert(imm);
1289
1290 assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens
1291
1292 if (i->srcExists(1)) {
1293 // indirect addressing of vertex in primitive space
1294
1295 LValue *val = bld.getScratch();
1296 Value *ptr = bld.getSSA(2, FILE_ADDRESS);
1297 bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
1298 bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);
1299
1300 // NOTE: PFETCH directly to an $aX only works with direct addressing
1301 i->op = OP_SHL;
1302 i->setSrc(0, val);
1303 i->setSrc(1, bld.mkImm(0));
1304 }
1305
1306 return true;
1307 }
1308
1309 // Set flags according to predicate and make the instruction read $cX.
1310 void
1311 NV50LoweringPreSSA::checkPredicate(Instruction *insn)
1312 {
1313 Value *pred = insn->getPredicate();
1314 Value *cdst;
1315
1316 // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
1317 if (!pred ||
1318 pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
1319 return;
1320
1321 cdst = bld.getSSA(1, FILE_FLAGS);
1322
1323 bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
1324
1325 insn->setPredicate(insn->cc, cdst);
1326 }
1327
1328 //
1329 // - add quadop dance for texturing
1330 // - put FP outputs in GPRs
1331 // - convert instruction sequences
1332 //
1333 bool
1334 NV50LoweringPreSSA::visit(Instruction *i)
1335 {
1336 bld.setPosition(i, false);
1337
1338 if (i->cc != CC_ALWAYS)
1339 checkPredicate(i);
1340
1341 switch (i->op) {
1342 case OP_TEX:
1343 case OP_TXF:
1344 case OP_TXG:
1345 return handleTEX(i->asTex());
1346 case OP_TXB:
1347 return handleTXB(i->asTex());
1348 case OP_TXL:
1349 return handleTXL(i->asTex());
1350 case OP_TXD:
1351 return handleTXD(i->asTex());
1352 case OP_TXLQ:
1353 return handleTXLQ(i->asTex());
1354 case OP_TXQ:
1355 return handleTXQ(i->asTex());
1356 case OP_EX2:
1357 bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
1358 i->setSrc(0, i->getDef(0));
1359 break;
1360 case OP_SET:
1361 return handleSET(i);
1362 case OP_SLCT:
1363 return handleSLCT(i->asCmp());
1364 case OP_SELP:
1365 return handleSELP(i);
1366 case OP_POW:
1367 return handlePOW(i);
1368 case OP_DIV:
1369 return handleDIV(i);
1370 case OP_SQRT:
1371 return handleSQRT(i);
1372 case OP_EXPORT:
1373 return handleEXPORT(i);
1374 case OP_LOAD:
1375 return handleLOAD(i);
1376 case OP_RDSV:
1377 return handleRDSV(i);
1378 case OP_WRSV:
1379 return handleWRSV(i);
1380 case OP_CALL:
1381 return handleCALL(i);
1382 case OP_PRECONT:
1383 return handlePRECONT(i);
1384 case OP_CONT:
1385 return handleCONT(i);
1386 case OP_PFETCH:
1387 return handlePFETCH(i);
1388 default:
1389 break;
1390 }
1391 return true;
1392 }
1393
1394 bool
1395 TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
1396 {
1397 bool ret = false;
1398
1399 if (stage == CG_STAGE_PRE_SSA) {
1400 NV50LoweringPreSSA pass(prog);
1401 ret = pass.run(prog, false, true);
1402 } else
1403 if (stage == CG_STAGE_SSA) {
1404 if (!prog->targetPriv)
1405 prog->targetPriv = new std::list<Instruction *>();
1406 NV50LegalizeSSA pass(prog);
1407 ret = pass.run(prog, false, true);
1408 } else
1409 if (stage == CG_STAGE_POST_RA) {
1410 NV50LegalizePostRA pass;
1411 ret = pass.run(prog, false, true);
1412 if (prog->targetPriv)
1413 delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
1414 }
1415 return ret;
1416 }
1417
1418 } // namespace nv50_ir