nv50/ir: implement splitting of 64 bit ops after RA
[mesa.git] / src / gallium / drivers / nvc0 / codegen / nv50_ir_lowering_nvc0.cpp
1 /*
2 * Copyright 2011 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "nv50/codegen/nv50_ir.h"
24 #include "nv50/codegen/nv50_ir_build_util.h"
25
26 #include "nv50_ir_target_nvc0.h"
27
28 #include <limits>
29
30 namespace nv50_ir {
31
32 #define QOP_ADD 0
33 #define QOP_SUBR 1
34 #define QOP_SUB 2
35 #define QOP_MOV2 3
36
37 // UL UR LL LR
38 #define QUADOP(q, r, s, t) \
39 ((QOP_##q << 6) | (QOP_##r << 4) | \
40 (QOP_##s << 2) | (QOP_##t << 0))
41
// Lowers constructs that have no direct hardware equivalent while the
// program is still in SSA form (i.e. before register allocation).
class NVC0LegalizeSSA : public Pass
{
private:
   virtual bool visit(BasicBlock *);
   virtual bool visit(Function *);

   // we want to insert calls to the builtin library only after optimization
   void handleDIV(Instruction *);    // integer division, modulus
   void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt

private:
   BuildUtil bld;
};
55
56 void
57 NVC0LegalizeSSA::handleDIV(Instruction *i)
58 {
59 FlowInstruction *call;
60 int builtin;
61 Value *def[2];
62
63 bld.setPosition(i, false);
64 def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
65 def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
66 switch (i->dType) {
67 case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
68 case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
69 default:
70 return;
71 }
72 call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
73 bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
74 bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
75 bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);
76
77 call->fixed = 1;
78 call->absolute = call->builtin = 1;
79 call->target.builtin = builtin;
80 delete_Instruction(prog, i);
81 }
82
// Lower double precision OP_RCP/OP_RSQ; not implemented yet.
void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
   // TODO
}
88
// Bind the builder to the program once per function; the actual rewriting
// happens in the per-BB visit below.
bool
NVC0LegalizeSSA::visit(Function *fn)
{
   bld.setProgram(fn->getProgram());
   return true;
}
95
96 bool
97 NVC0LegalizeSSA::visit(BasicBlock *bb)
98 {
99 Instruction *next;
100 for (Instruction *i = bb->getEntry(); i; i = next) {
101 next = i->next;
102 if (i->dType == TYPE_F32)
103 continue;
104 switch (i->op) {
105 case OP_DIV:
106 case OP_MOD:
107 handleDIV(i);
108 break;
109 case OP_RCP:
110 case OP_RSQ:
111 if (i->dType == TYPE_F64)
112 handleRCPRSQ(i);
113 break;
114 default:
115 break;
116 }
117 }
118 return true;
119 }
120
// Fixups that must run after register allocation: texture barrier
// insertion (chipset >= 0xe0), replacing zero immediates with the zero
// register, splitting 64 bit operations, and CONT/JOIN cleanup.
class NVC0LegalizePostRA : public Pass
{
public:
   NVC0LegalizePostRA(const Program *);

private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   void replaceZero(Instruction *);
   bool tryReplaceContWithBra(BasicBlock *);
   void propagateJoin(BasicBlock *);

   // a first use of a TEX result, as relevant for barrier placement
   struct TexUse
   {
      TexUse(Instruction *use, const Instruction *tex)
         : insn(use), tex(tex), level(-1) { }
      Instruction *insn;
      const Instruction *tex; // or split / mov
      int level; // barrier level; -1 = not yet computed
   };
   // min/max number of outstanding TEXes at a program point
   struct Limits
   {
      Limits() { }
      Limits(int min, int max) : min(min), max(max) { }
      int min, max;
   };
   bool insertTextureBarriers(Function *);
   inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
   void findFirstUses(const Instruction *tex, const Instruction *def,
                      std::list<TexUse>&);
   void findOverwritingDefs(const Instruction *tex, Instruction *insn,
                            const BasicBlock *term,
                            std::list<TexUse>&);
   void addTexUse(std::list<TexUse>&, Instruction *, const Instruction *);
   const Instruction *recurseDef(const Instruction *);

private:
   LValue *rZero; // substituted for zero immediates (see replaceZero)
   LValue *carry; // $c flags register, used when splitting 64 bit ops
   const bool needTexBar; // true on chipsets (>= 0xe0) requiring TEXBAR
};
163
// Texture barriers are only required on chipsets >= 0xe0.
NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
   : needTexBar(prog->getTarget()->getChipset() >= 0xe0)
{
}
168
169 bool
170 NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
171 const Instruction *early) const
172 {
173 if (early->bb == later->bb)
174 return early->serial < later->serial;
175 return later->bb->dominatedBy(early->bb);
176 }
177
178 void
179 NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
180 Instruction *usei, const Instruction *insn)
181 {
182 bool add = true;
183 for (std::list<TexUse>::iterator it = uses.begin();
184 it != uses.end();) {
185 if (insnDominatedBy(usei, it->insn)) {
186 add = false;
187 break;
188 }
189 if (insnDominatedBy(it->insn, usei))
190 it = uses.erase(it);
191 else
192 ++it;
193 }
194 if (add)
195 uses.push_back(TexUse(usei, insn));
196 }
197
198 void
199 NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
200 Instruction *insn,
201 const BasicBlock *term,
202 std::list<TexUse> &uses)
203 {
204 while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
205 insn = insn->getSrc(0)->getUniqueInsn();
206
207 if (!insn || !insn->bb->reachableBy(texi->bb, term))
208 return;
209
210 switch (insn->op) {
211 /* Values not connected to the tex's definition through any of these should
212 * not be conflicting.
213 */
214 case OP_SPLIT:
215 case OP_MERGE:
216 case OP_PHI:
217 case OP_UNION:
218 /* recurse again */
219 for (int s = 0; insn->srcExists(s); ++s)
220 findOverwritingDefs(texi, insn->getSrc(s)->getUniqueInsn(), term,
221 uses);
222 break;
223 default:
224 // if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered ?
225 addTexUse(uses, insn, texi);
226 break;
227 }
228 }
229
// Collect the first "real" instructions consuming the results of 'texi',
// following the values through pseudo ops (SPLIT/MERGE/PHI/UNION and
// register-preserving MOVs) that don't manifest in the machine code.
void
NVC0LegalizePostRA::findFirstUses(const Instruction *texi,
                                  const Instruction *insn,
                                  std::list<TexUse> &uses)
{
   for (int d = 0; insn->defExists(d); ++d) {
      Value *v = insn->getDef(d);
      for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
         Instruction *usei = (*u)->getInsn();

         if (usei->op == OP_PHI || usei->op == OP_UNION) {
            // need a barrier before WAW cases
            for (int s = 0; usei->srcExists(s); ++s) {
               // check the *other* sources: their defs may overwrite the
               // registers the TEX writes
               Instruction *defi = usei->getSrc(s)->getUniqueInsn();
               if (defi && &usei->src(s) != *u)
                  findOverwritingDefs(texi, defi, usei->bb, uses);
            }
         }

         if (usei->op == OP_SPLIT ||
             usei->op == OP_MERGE ||
             usei->op == OP_PHI ||
             usei->op == OP_UNION) {
            // these uses don't manifest in the machine code
            findFirstUses(texi, usei, uses);
         } else
         if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
             usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
            // a same-register move is a no-op too; keep following
            findFirstUses(texi, usei, uses);
         } else {
            addTexUse(uses, usei, insn);
         }
      }
   }
}
265
// Texture barriers:
// This pass is a bit long and ugly and can probably be optimized.
//
// 1. obtain a list of TEXes and their outputs' first use(s)
// 2. calculate the barrier level of each first use (minimal number of TEXes,
//    over all paths, between the TEX and the use in question)
// 3. for each barrier, if all paths from the source TEX to that barrier
//    contain a barrier of lesser level, it can be culled
bool
NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
{
   std::list<TexUse> *uses;
   std::vector<Instruction *> texes;
   std::vector<int> bbFirstTex;
   std::vector<int> bbFirstUse;
   std::vector<int> texCounts;
   std::vector<TexUse> useVec;
   ArrayList insns;

   fn->orderInstructions(insns); // assigns the serial numbers used below

   texCounts.resize(fn->allBBlocks.getSize(), 0);
   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());

   // tag BB CFG nodes by their id for later
   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
      if (bb)
         bb->cfg.tag = bb->getId();
   }

   // gather the first uses for each TEX
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
      if (isTextureOp(tex->op)) {
         texes.push_back(tex);
         // remember the index (into texes) of the first TEX of each block
         if (!texCounts.at(tex->bb->getId()))
            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
         texCounts[tex->bb->getId()]++;
      }
   }
   insns.clear();
   if (texes.empty())
      return false;
   uses = new std::list<TexUse>[texes.size()];
   if (!uses)
      return false;
   for (size_t i = 0; i < texes.size(); ++i)
      findFirstUses(texes[i], texes[i], uses[i]);

   // determine the barrier level at each use
   for (size_t i = 0; i < texes.size(); ++i) {
      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
           ++u) {
         BasicBlock *tb = texes[i]->bb;
         BasicBlock *ub = u->insn->bb;
         if (tb == ub) {
            // same block: count the TEXes issued between TEX and use
            u->level = 0;
            for (size_t j = i + 1; j < texes.size() &&
                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         } else {
            // different blocks: cheapest path through the CFG, weighted by
            // per-block TEX counts
            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
                                                      &ub->cfg, texCounts);
            if (u->level < 0) {
               WARN("Failed to find path TEX -> TEXBAR\n");
               u->level = 0;
               continue;
            }
            // this counted all TEXes in the origin block, correct that
            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
            // and did not count the TEXes in the destination block, add those
            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         }
         assert(u->level >= 0);
         useVec.push_back(*u);
      }
   }
   delete[] uses;
   uses = NULL;

   // insert the barriers
   for (size_t i = 0; i < useVec.size(); ++i) {
      Instruction *prev = useVec[i].insn->prev;
      if (useVec[i].level < 0)
         continue;
      if (prev && prev->op == OP_TEXBAR) {
         // merge with an immediately preceding barrier, keep the lower level
         if (prev->subOp > useVec[i].level)
            prev->subOp = useVec[i].level;
         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
      } else {
         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
         bar->fixed = 1;
         bar->subOp = useVec[i].level;
         // make use explicit to ease latency calculation
         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
      }
   }

   // the culling below is only done at high optimization levels
   if (fn->getProgram()->optLevel < 3) {
      if (uses)
         delete[] uses;
      return true;
   }

   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single

   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitS.resize(fn->allBBlocks.getSize());

   // cull unneeded barriers (should do that earlier, but for simplicity)
   IteratorRef bi = fn->cfg.iteratorCFG();
   // first calculate min/max outstanding TEXes for each BB
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      int min = 0;
      int max = std::numeric_limits<int>::max();
      for (Instruction *i = bb->getFirst(); i; i = i->next) {
         if (isTextureOp(i->op)) {
            min++;
            if (max < std::numeric_limits<int>::max())
               max++;
         } else
         if (i->op == OP_TEXBAR) {
            // a barrier caps the number of outstanding TEXes at its level
            min = MIN2(min, i->subOp);
            max = MIN2(max, i->subOp);
         }
      }
      // limits when looking at an isolated block
      limitS[bb->getId()].min = min;
      limitS[bb->getId()].max = max;
   }
   // propagate the min/max values
   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
      for (bi->reset(); !bi->end(); bi->next()) {
         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
         BasicBlock *bb = BasicBlock::get(n);
         const int bbId = bb->getId();
         // entry limits are the max over the predecessors' exit limits
         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
            BasicBlock *in = BasicBlock::get(ei.getNode());
            const int inId = in->getId();
            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
         }
         // I just hope this is correct ...
         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
            // no barrier
            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
         } else {
            // block contained a barrier
            limitB[bbId].min = MIN2(limitS[bbId].max,
                                    limitT[bbId].min + limitS[bbId].min);
            limitB[bbId].max = MIN2(limitS[bbId].max,
                                    limitT[bbId].max + limitS[bbId].min);
         }
      }
   }
   // finally delete unnecessary barriers
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      Instruction *prev = NULL;
      Instruction *next;
      int max = limitT[bb->getId()].max;
      for (Instruction *i = bb->getFirst(); i; i = next) {
         next = i->next;
         if (i->op == OP_TEXBAR) {
            if (i->subOp >= max) {
               // barrier can't lower the outstanding count -> redundant
               delete_Instruction(prog, i);
            } else {
               max = i->subOp;
               // a directly preceding barrier of a higher level is subsumed
               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
                  delete_Instruction(prog, prev);
                  prev = NULL;
               }
            }
         } else
         if (isTextureOp(i->op)) {
            max++;
         }
         if (!i->isNop())
            prev = i;
      }
   }
   if (uses)
      delete[] uses;
   return true;
}
463
bool
NVC0LegalizePostRA::visit(Function *fn)
{
   if (needTexBar)
      insertTextureBarriers(fn);

   // Set up the fixed values used by the per-BB pass: rZero gets the id one
   // past the allocatable GPR range (presumably the hardware zero register —
   // TODO confirm against the emitter), carry is flags register 0 ($c0).
   rZero = new_LValue(fn, FILE_GPR);
   carry = new_LValue(fn, FILE_FLAGS);

   rZero->reg.data.id = prog->getTarget()->getFileSize(FILE_GPR);
   carry->reg.data.id = 0;

   return true;
}
478
479 void
480 NVC0LegalizePostRA::replaceZero(Instruction *i)
481 {
482 for (int s = 0; i->srcExists(s); ++s) {
483 if (s == 2 && i->op == OP_SUCLAMP)
484 continue;
485 ImmediateValue *imm = i->getSrc(s)->asImm();
486 if (imm && imm->reg.data.u64 == 0)
487 i->setSrc(s, rZero);
488 }
489 }
490
// replace CONT with BRA for single unconditional continue
bool
NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
{
   // only applies to a PRECONT block with exactly two incoming edges
   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
      return false;
   // locate the back edge among the two incident edges
   Graph::EdgeIterator ei = bb->cfg.incident();
   if (ei.getType() != Graph::Edge::BACK)
      ei.next();
   if (ei.getType() != Graph::Edge::BACK)
      return false;
   BasicBlock *contBB = BasicBlock::get(ei.getNode());

   // only an unpredicated CONT terminating that block can become a BRA
   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
       contBB->getExit()->getPredicate())
      return false;
   contBB->getExit()->op = OP_BRA;
   bb->remove(bb->getEntry()); // delete PRECONT

   // sanity: there must not be a second back edge
   ei.next();
   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
   return true;
}
514
// replace branches to join blocks with join ops
void
NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
{
   // the block must start with a JOIN that is still allowed to propagate
   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
      return;
   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      BasicBlock *in = BasicBlock::get(ei.getNode());
      Instruction *exit = in->getExit();
      if (!exit) {
         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
         // there should always be a terminator instruction
         WARN("inserted missing terminator in BB:%i\n", in->getId());
      } else
      if (exit->op == OP_BRA) {
         // turn the branch itself into the join
         exit->op = OP_JOIN;
         exit->asFlow()->limit = 1; // must-not-propagate marker
      }
   }
   // the JOIN now lives in the predecessors' exits
   bb->remove(bb->getEntry());
}
536
bool
NVC0LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->op == OP_EMIT || i->op == OP_RESTART) {
         // drop unused defs; the emit counter source must start at zero
         if (!i->getDef(0)->refCount())
            i->setDef(0, NULL);
         if (i->src(0).getFile() == FILE_IMMEDIATE)
            i->setSrc(0, rZero); // initial value must be 0
      } else
      if (i->isNop()) {
         bb->remove(i);
      } else {
         // TODO: Move this to before register allocation for operations that
         // need the $c register !
         if (typeSizeof(i->dType) == 8) {
            Instruction *hi;
            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
            if (hi)
               next = hi; // revisit the newly created high-word instruction
         }

         if (i->op != OP_MOV && i->op != OP_PFETCH)
            replaceZero(i);
      }
   }
   // nothing left to inspect in an emptied block
   if (!bb->getEntry())
      return true;

   // flow cleanup: continue->branch conversion, else join propagation
   if (!tryReplaceContWithBra(bb))
      propagateJoin(bb);

   return true;
}
575
// Main nvc0/nve4 lowering pass: translates generic IR operations (texture
// ops, surface ops, atomics, system-value accesses, exports, ...) into
// hardware-supported forms.
class NVC0LoweringPass : public Pass
{
public:
   NVC0LoweringPass(Program *);

private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);
   virtual bool visit(Instruction *);

   bool handleRDSV(Instruction *);
   bool handleWRSV(Instruction *);
   bool handleEXPORT(Instruction *);
   bool handleOUT(Instruction *);
   bool handleDIV(Instruction *);
   bool handleMOD(Instruction *);
   bool handleSQRT(Instruction *);
   bool handlePOW(Instruction *);
   bool handleTEX(TexInstruction *);
   bool handleTXD(TexInstruction *);
   bool handleTXQ(TexInstruction *);
   bool handleManualTXD(TexInstruction *);
   bool handleATOM(Instruction *);
   void handleSurfaceOpNVE4(TexInstruction *);

   void checkPredicate(Instruction *);

   // read one tessellation coordinate component into dst
   void readTessCoord(LValue *dst, int c);

   // 32 bit loads from the driver's surface / multisample info areas in c[]
   Value *loadResInfo32(Value *ptr, uint32_t off);
   Value *loadMsInfo32(Value *ptr, uint32_t off);

   void adjustCoordinatesMS(TexInstruction *);
   void processSurfaceCoordsNVE4(TexInstruction *);

private:
   const Target *const targ;

   BuildUtil bld;

   Symbol *gMemBase;
   LValue *gpEmitAddress; // emit address for geometry programs, set up in
                          // visit(Function *)
};
619
// Bind the builder to the program; gMemBase starts out unset.
NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
{
   bld.setProgram(prog);
   gMemBase = NULL;
}
625
// For geometry programs, initialize the emit address at function entry and
// move it to register 0 at the exit.
bool
NVC0LoweringPass::visit(Function *fn)
{
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      assert(!strncmp(fn->getName(), "MAIN", 4));
      // TODO: when we generate actual functions pass this value along somehow
      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
      if (fn->cfgExit) {
         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
         bld.mkMovToReg(0, gpEmitAddress);
      }
   }
   return true;
}
641
// No per-block work; lowering is done in visit(Instruction *).
bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}
647
// move array source to first slot, convert to u16, add indirections
bool
NVC0LoweringPass::handleTEX(TexInstruction *i)
{
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int arg = i->tex.target.getArgCount();

   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET) {
      // GK104+ path
      if (i->tex.r == i->tex.s) {
         i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s = 0; // only a single cX[] value possible here
      } else {
         // TODO: extract handles and use register to select TIC/TSC entries
      }
      if (i->tex.target.isArray()) {
         // move the layer index into source slot 0, converted to u16
         LValue *layer = new_LValue(func, FILE_GPR);
         Value *src = i->getSrc(arg - 1);
         // TXF takes an integer layer index, the others a float one
         const int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
         // shift coordinates up by one slot to make room in slot 0
         for (int s = dim; s >= 1; --s)
            i->setSrc(s, i->getSrc(s - 1));
         i->setSrc(0, layer);
      }
      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
         // indirect resource/sampler: load values from c15[] and combine
         // them into a single register source
         Value *tmp[2];
         Symbol *bind;
         Value *rRel = i->getIndirectR();
         Value *sRel = i->getIndirectS();
         Value *shCnt = bld.loadImm(NULL, 2);

         if (rRel) {
            tmp[0] = bld.getScratch();
            bind = bld.mkSymbol(FILE_MEMORY_CONST, 15, TYPE_U32, i->tex.r * 4);
            bld.mkOp2(OP_SHL, TYPE_U32, tmp[0], rRel, shCnt);
            tmp[1] = bld.mkLoadv(TYPE_U32, bind, tmp[0]);
            // keep only the low 24 bits of the loaded value
            bld.mkOp2(OP_AND, TYPE_U32, tmp[0], tmp[1],
                      bld.loadImm(tmp[0], 0x00ffffffu));
            rRel = tmp[0];
            i->setSrc(i->tex.rIndirectSrc, NULL);
         }
         if (sRel) {
            tmp[0] = bld.getScratch();
            bind = bld.mkSymbol(FILE_MEMORY_CONST, 15, TYPE_U32, i->tex.s * 4);
            bld.mkOp2(OP_SHL, TYPE_U32, tmp[0], sRel, shCnt);
            tmp[1] = bld.mkLoadv(TYPE_U32, bind, tmp[0]);
            // keep only the high 8 bits of the loaded value
            bld.mkOp2(OP_AND, TYPE_U32, tmp[0], tmp[1],
                      bld.loadImm(tmp[0], 0xff000000u));
            sRel = tmp[0];
            i->setSrc(i->tex.sIndirectSrc, NULL);
         }
         bld.mkOp2(OP_OR, TYPE_U32, rRel, rRel, sRel);

         // place the combined value into the lowest indirect source slot
         int min = i->tex.rIndirectSrc;
         if (min < 0 || min > i->tex.sIndirectSrc)
            min = i->tex.sIndirectSrc;
         for (int s = min; s >= 1; --s)
            i->setSrc(s, i->getSrc(s - 1));
         i->setSrc(0, rRel);
      }
   } else
   // (nvc0) generate and move the tsc/tic/array source to the front
   if (dim != arg || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(arg - 1) : NULL;
      for (int s = dim; s >= 1; --s)
         i->setSrc(s, i->getSrc(s - 1));
      i->setSrc(0, arrayIndex);

      Value *ticRel = i->getIndirectR();
      Value *tscRel = i->getIndirectS();

      if (arrayIndex) {
         // TXF takes an integer layer index, the others a float one
         int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
      } else {
         bld.loadImm(src, 0);
      }

      // insert the indirect TIC/TSC indices into 'src' via bitfield insert;
      // the immediates appear to encode (width << 8) | offset per the
      // 0xttxsaaaa layout above — TODO confirm against the INSBF emitter
      if (ticRel) {
         i->setSrc(i->tex.rIndirectSrc, NULL);
         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
      }
      if (tscRel) {
         i->setSrc(i->tex.sIndirectSrc, NULL);
         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
      }

      i->setSrc(0, src);
   }

   // offset is last source (lod 1st, dc 2nd)
   if (i->tex.useOffsets) {
      // pack up to 3 offsets x 3 components, 4 bits each, into an immediate
      uint32_t value = 0;
      int n, c;
      int s = i->srcCount(0xff);
      for (n = 0; n < i->tex.useOffsets; ++n)
         for (c = 0; c < 3; ++c)
            value |= (i->tex.offset[n][c] & 0xf) << (n * 12 + c * 4);
      i->setSrc(s, bld.loadImm(NULL, value));
   }

   return true;
}
754
// Expand TXD into per-lane TEX operations: for each of the 4 quad lanes,
// broadcast that lane's coordinates to the whole quad, apply its dPdx/dPdy
// via quadops, issue a TEX, and keep the result only in that lane; finally
// union the four per-lane results into the original defs.
bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   // quadop subop pairs (dPdx-op, dPdy-op) selecting how the derivative is
   // combined into each lane, for source lane l = 0..3
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4]; // def[component][lane]
   Value *crd[3];    // per-iteration coordinate temporaries
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim();

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, crd[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l; // only lane l retains this iteration's value
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   // each output component is the union of its four per-lane values
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}
812
813 bool
814 NVC0LoweringPass::handleTXD(TexInstruction *txd)
815 {
816 int dim = txd->tex.target.getDim();
817 int arg = txd->tex.target.getArgCount();
818
819 handleTEX(txd);
820 while (txd->srcExists(arg))
821 ++arg;
822
823 txd->tex.derivAll = true;
824 if (dim > 2 ||
825 txd->tex.target.isCube() ||
826 arg > 4 ||
827 txd->tex.target.isShadow())
828 return handleManualTXD(txd);
829
830 for (int c = 0; c < dim; ++c) {
831 txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
832 txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
833 txd->dPdx[c].set(NULL);
834 txd->dPdy[c].set(NULL);
835 }
836 return true;
837 }
838
// Nothing to lower for a direct TXQ yet.
bool
NVC0LoweringPass::handleTXQ(TexInstruction *txq)
{
   // TODO: indirect resource/sampler index
   return true;
}
845
// Convert local/shared-memory atomics to global ones by adding the base of
// the respective memory window (read via SV_LBASE / SV_SBASE) to the
// address; global atomics pass through unchanged.
bool
NVC0LoweringPass::handleATOM(Instruction *atom)
{
   SVSemantic sv;

   switch (atom->src(0).getFile()) {
   case FILE_MEMORY_LOCAL:
      sv = SV_LBASE;
      break;
   case FILE_MEMORY_SHARED:
      sv = SV_SBASE;
      break;
   default:
      assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
      return true;
   }
   // window base address from the system value
   Value *base =
      bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
   Value *ptr = atom->getIndirect(0, 0);

   // retarget the memory symbol at global memory; fold the original
   // indirect pointer (if any) into the new base
   atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
   atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
   if (ptr)
      base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
   atom->setIndirect(0, 0, base);

   return true;
}
874
875 inline Value *
876 NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
877 {
878 uint8_t b = prog->driver->io.resInfoCBSlot;
879 off += prog->driver->io.suInfoBase;
880 return bld.
881 mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
882 }
883
884 inline Value *
885 NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
886 {
887 uint8_t b = prog->driver->io.msInfoCBSlot;
888 off += prog->driver->io.msInfoBase;
889 return bld.
890 mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
891 }
892
893 /* On nvc0, surface info is obtained via the surface binding points passed
894 * to the SULD/SUST instructions.
895 * On nve4, surface info is stored in c[] and is used by various special
896 * instructions, e.g. for clamping coordinates or generating an address.
897 * They couldn't just have added an equivalent to TIC now, couldn't they ?
898 */
899 #define NVE4_SU_INFO_ADDR 0x00
900 #define NVE4_SU_INFO_FMT 0x04
901 #define NVE4_SU_INFO_DIM_X 0x08
902 #define NVE4_SU_INFO_PITCH 0x0c
903 #define NVE4_SU_INFO_DIM_Y 0x10
904 #define NVE4_SU_INFO_ARRAY 0x14
905 #define NVE4_SU_INFO_DIM_Z 0x18
906 #define NVE4_SU_INFO_UNK1C 0x1c
907 #define NVE4_SU_INFO_WIDTH 0x20
908 #define NVE4_SU_INFO_HEIGHT 0x24
909 #define NVE4_SU_INFO_DEPTH 0x28
910 #define NVE4_SU_INFO_TARGET 0x2c
911 #define NVE4_SU_INFO_CALL 0x30
912 #define NVE4_SU_INFO_RAW_X 0x34
913 #define NVE4_SU_INFO_MS_X 0x38
914 #define NVE4_SU_INFO_MS_Y 0x3c
915
916 #define NVE4_SU_INFO__STRIDE 0x40
917
918 #define NVE4_SU_INFO_DIM(i) (0x08 + (i) * 8)
919 #define NVE4_SU_INFO_SIZE(i) (0x20 + (i) * 4)
920 #define NVE4_SU_INFO_MS(i) (0x38 + (i) * 4)
921
922 static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
923 {
924 switch (su->tex.target.getEnum()) {
925 case TEX_TARGET_BUFFER: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
926 case TEX_TARGET_RECT: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
927 case TEX_TARGET_1D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
928 case TEX_TARGET_1D_ARRAY: return (c == 1) ?
929 NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
930 NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
931 case TEX_TARGET_2D: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
932 case TEX_TARGET_2D_MS: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
933 case TEX_TARGET_2D_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
934 case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
935 case TEX_TARGET_3D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
936 case TEX_TARGET_CUBE: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
937 case TEX_TARGET_CUBE_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
938 default:
939 assert(0);
940 return 0;
941 }
942 }
943
// Lower multisampled texture targets to their non-MS equivalents: scale
// the x/y coordinates by the per-surface shift amounts and add the
// per-sample offsets from the MS info table, then drop the sample index.
void
NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
{
   const uint16_t base = tex->tex.r * NVE4_SU_INFO__STRIDE;
   const int arg = tex->tex.target.getArgCount();

   // only the two MS targets need adjustment; demote them in place
   if (tex->tex.target == TEX_TARGET_2D_MS)
      tex->tex.target = TEX_TARGET_2D;
   else
   if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
      tex->tex.target = TEX_TARGET_2D_ARRAY;
   else
      return;

   Value *x = tex->getSrc(0);
   Value *y = tex->getSrc(1);
   Value *s = tex->getSrc(arg - 1); // sample index

   Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();

   // per-surface shift amounts (presumably log2 of the sample grid size —
   // TODO confirm against the state setup code)
   Value *ms_x = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(0));
   Value *ms_y = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(1));

   bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
   bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);

   // MS info table index: (s & 7) * 8 bytes
   s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
   s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));

   // per-sample x/y offsets
   Value *dx = loadMsInfo32(ts, 0x0);
   Value *dy = loadMsInfo32(ts, 0x4);

   bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
   bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);

   tex->setSrc(0, tx);
   tex->setSrc(1, ty);
   tex->moveSources(arg, -1); // remove the sample-index source slot
}
983
984 // Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
985 // They're computed from the coordinates using the surface info in c[] space.
// Computes the clamped coordinates, the pixel/byte offsets and the final
// 64-bit address for an NVE4 surface access, then rewrites the surface
// instruction's sources to (64-bit address, format info, bound-check
// predicate). The per-surface dimensions/pitch/address are read from the
// driver's resource info records in c[] space.
void
NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
{
   Instruction *insn;
   const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
   const bool raw =
      su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
   const int idx = su->tex.r;                                // resource index
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() ? 1 : 0); // coord count
   const uint16_t base = idx * NVE4_SU_INFO__STRIDE;  // this surface's record
   int c;
   Value *zero = bld.mkImm(0);
   Value *p1 = NULL;              // out-of-bounds flag of the layer coordinate
   Value *v;
   Value *src[3];                 // clamped coordinates
   Value *bf, *eau, *off;         // byte offset / effective-address parts
   Value *addr, *pred;

   off = bld.getScratch(4);
   bf = bld.getScratch(4);
   addr = bld.getSSA(8);          // 64 bit
   pred = bld.getScratch(1, FILE_PREDICATE);

   bld.setPosition(su, false);

   adjustCoordinatesMS(su);

   // calculate clamped coordinates
   for (c = 0; c < arg; ++c) {
      src[c] = bld.getScratch();
      if (c == 0 && raw)
         // raw accesses clamp x against the byte-scaled width
         v = loadResInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
      else
         v = loadResInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
         ->subOp = getSuClampSubOp(su, c);
   }
   // unused coordinates are zero
   for (; c < 3; ++c)
      src[c] = zero;

   // set predicate output
   if (su->tex.target == TEX_TARGET_BUFFER) {
      src[0]->getInsn()->setFlagsDef(1, pred);
   } else
   if (su->tex.target.isArray()) {
      // the layer coordinate gets a separate flag, merged into pred below
      p1 = bld.getSSA(1, FILE_PREDICATE);
      src[dim]->getInsn()->setFlagsDef(1, p1);
   }

   // calculate pixel offset
   if (dim == 1) {
      if (su->tex.target != TEX_TARGET_BUFFER)
         bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
   } else
   if (dim == 3) {
      v = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
         ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l

      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
         ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
   } else {
      assert(dim == 2);
      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
         ->subOp = su->tex.target.isArray() ?
         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
   }

   // calculate effective address part 1
   if (su->tex.target == TEX_TARGET_BUFFER) {
      if (raw) {
         bf = src[0];
      } else {
         // shift the element index by the amount stored in the format info
         v = loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
         bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
            ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
      }
   } else {
      Value *y = src[1];
      Value *z = src[2];
      uint16_t subOp = 0;

      switch (dim) {
      case 1:
         y = zero;
         z = zero;
         break;
      case 2:
         z = off;
         if (!su->tex.target.isArray()) {
            // 2D (non-array) is addressed like a 3D surface here
            z = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
            subOp = NV50_IR_SUBOP_SUBFM_3D;
         }
         break;
      default:
         subOp = NV50_IR_SUBOP_SUBFM_3D;
         assert(dim == 3);
         break;
      }
      insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
      insn->subOp = subOp;
      insn->setFlagsDef(1, pred);
   }

   // part 2
   v = loadResInfo32(NULL, base + NVE4_SU_INFO_ADDR);

   if (su->tex.target == TEX_TARGET_BUFFER) {
      eau = v;
   } else {
      eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
   }
   // add array layer offset
   if (su->tex.target.isArray()) {
      v = loadResInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
      if (dim == 1)
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
            ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
      else
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
            ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
      // combine predicates
      assert(p1);
      bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
   }

   if (atom) {
      // atomics want the raw g[] address split differently:
      Value *lo = bf;
      if (su->tex.target == TEX_TARGET_BUFFER) {
         lo = zero;
         bld.mkMov(off, bf);
      }
      // bf == g[] address & 0xff
      // eau == g[] address >> 8
      bld.mkOp3(OP_PERMT, TYPE_U32, bf, lo, bld.loadImm(NULL, 0x6540), eau);
      bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
   } else
   if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
      // Convert from u32 to u8 address format, which is what the library code
      // doing SULDP currently uses.
      // XXX: can SUEAU do this ?
      // XXX: does it matter that we don't mask high bytes in bf ?
      // Grrr.
      bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
      bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
   }

   bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);

   if (atom && su->tex.target == TEX_TARGET_BUFFER)
      bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);

   // let's just set it 0 for raw access and hope it works
   v = raw ?
      bld.mkImm(0) : loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);

   // get rid of old coordinate sources, make space for fmt info and predicate
   su->moveSources(arg, 3 - arg);
   // set 64 bit address and 32-bit format sources
   su->setSrc(0, addr);
   su->setSrc(1, v);
   su->setSrc(2, pred);
}
1152
1153 void
1154 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
1155 {
1156 processSurfaceCoordsNVE4(su);
1157
1158 // Who do we hate more ? The person who decided that nvc0's SULD doesn't
1159 // have to support conversion or the person who decided that, in OpenCL,
1160 // you don't have to specify the format here like you do in OpenGL ?
1161
1162 if (su->op == OP_SULDP) {
1163 // We don't patch shaders. Ever.
1164 // You get an indirect call to our library blob here.
1165 // But at least it's uniform.
1166 FlowInstruction *call;
1167 LValue *p[3];
1168 LValue *r[5];
1169 uint16_t base = su->tex.r * NVE4_SU_INFO__STRIDE + NVE4_SU_INFO_CALL;
1170
1171 for (int i = 0; i < 4; ++i)
1172 (r[i] = bld.getScratch(4, FILE_GPR))->reg.data.id = i;
1173 for (int i = 0; i < 3; ++i)
1174 (p[i] = bld.getScratch(1, FILE_PREDICATE))->reg.data.id = i;
1175 (r[4] = bld.getScratch(8, FILE_GPR))->reg.data.id = 4;
1176
1177 bld.mkMov(p[1], bld.mkImm((su->cache == CACHE_CA) ? 1 : 0), TYPE_U8);
1178 bld.mkMov(p[2], bld.mkImm((su->cache == CACHE_CG) ? 1 : 0), TYPE_U8);
1179 bld.mkMov(p[0], su->getSrc(2), TYPE_U8);
1180 bld.mkMov(r[4], su->getSrc(0), TYPE_U64);
1181 bld.mkMov(r[2], su->getSrc(1), TYPE_U32);
1182
1183 call = bld.mkFlow(OP_CALL, NULL, su->cc, su->getPredicate());
1184
1185 call->indirect = 1;
1186 call->absolute = 1;
1187 call->setSrc(0, bld.mkSymbol(FILE_MEMORY_CONST,
1188 prog->driver->io.resInfoCBSlot, TYPE_U32,
1189 prog->driver->io.suInfoBase + base));
1190 call->setSrc(1, r[2]);
1191 call->setSrc(2, r[4]);
1192 for (int i = 0; i < 3; ++i)
1193 call->setSrc(3 + i, p[i]);
1194 for (int i = 0; i < 4; ++i) {
1195 call->setDef(i, r[i]);
1196 bld.mkMov(su->getDef(i), r[i]);
1197 }
1198 call->setDef(4, p[1]);
1199 delete_Instruction(bld.getProgram(), su);
1200 }
1201
1202 if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
1203 Value *pred = su->getSrc(2);
1204 CondCode cc = CC_NOT_P;
1205 if (su->getPredicate()) {
1206 pred = bld.getScratch(1, FILE_PREDICATE);
1207 cc = su->cc;
1208 if (cc == CC_NOT_P) {
1209 bld.mkOp2(OP_OR, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
1210 } else {
1211 bld.mkOp2(OP_AND, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
1212 pred->getInsn()->src(1).mod = Modifier(NV50_IR_MOD_NOT);
1213 }
1214 }
1215 Instruction *red = bld.mkOp(OP_ATOM, su->dType, su->getDef(0));
1216 red->subOp = su->subOp;
1217 if (!gMemBase)
1218 gMemBase = bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0);
1219 red->setSrc(0, gMemBase);
1220 red->setSrc(1, su->getSrc(3));
1221 if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
1222 red->setSrc(2, su->getSrc(4));
1223 red->setIndirect(0, 0, su->getSrc(0));
1224 red->setPredicate(cc, pred);
1225 delete_Instruction(bld.getProgram(), su);
1226 } else {
1227 su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
1228 }
1229 }
1230
1231 bool
1232 NVC0LoweringPass::handleWRSV(Instruction *i)
1233 {
1234 Instruction *st;
1235 Symbol *sym;
1236 uint32_t addr;
1237
1238 // must replace, $sreg are not writeable
1239 addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
1240 if (addr >= 0x400)
1241 return false;
1242 sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
1243
1244 st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
1245 i->getSrc(1));
1246 st->perPatch = i->perPatch;
1247
1248 bld.getBB()->remove(i);
1249 return true;
1250 }
1251
1252 void
1253 NVC0LoweringPass::readTessCoord(LValue *dst, int c)
1254 {
1255 Value *laneid = bld.getSSA();
1256 Value *x, *y;
1257
1258 bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
1259
1260 if (c == 0) {
1261 x = dst;
1262 y = NULL;
1263 } else
1264 if (c == 1) {
1265 x = NULL;
1266 y = dst;
1267 } else {
1268 assert(c == 2);
1269 x = bld.getSSA();
1270 y = bld.getSSA();
1271 }
1272 if (x)
1273 bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
1274 if (y)
1275 bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
1276
1277 if (c == 2) {
1278 bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
1279 bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
1280 }
1281 }
1282
// Lower RDSV (read system value). System values without a real $sreg
// backing (address < 0x400) are replaced by interpolation, input fetches
// or small computations; actual $sreg reads are kept as-is.
bool
NVC0LoweringPass::handleRDSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();
   Value *vtx = NULL;
   Instruction *ld;
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);

   if (addr >= 0x400) // mov $sreg
      return true;

   switch (i->getSrc(0)->reg.data.sv.sv) {
   case SV_POSITION:
      assert(prog->getType() == Program::TYPE_FRAGMENT);
      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
      break;
   case SV_FACE:
   {
      Value *face = i->getDef(0);
      bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
      if (i->dType == TYPE_F32) {
         // map the face value to +/-1.0f: isolate the sign bit, then xor
         // with the bit pattern of -1.0f (0xbf800000)
         bld.mkOp2(OP_AND, TYPE_U32, face, face, bld.mkImm(0x80000000));
         bld.mkOp2(OP_XOR, TYPE_U32, face, face, bld.mkImm(0xbf800000));
      }
   }
      break;
   case SV_TESS_COORD:
      assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
      readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
      break;
   default:
      // generic input fetch; in tess eval shaders the fetch is indexed by
      // the PFETCH result -- presumably the vertex address, TODO confirm
      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL)
         vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
      ld = bld.mkFetch(i->getDef(0), i->dType,
                       FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
      ld->perPatch = i->perPatch;
      break;
   }
   bld.getBB()->remove(i);
   return true;
}
1324
1325 bool
1326 NVC0LoweringPass::handleDIV(Instruction *i)
1327 {
1328 if (!isFloatType(i->dType))
1329 return true;
1330 bld.setPosition(i, false);
1331 Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
1332 i->op = OP_MUL;
1333 i->setSrc(1, rcp->getDef(0));
1334 return true;
1335 }
1336
1337 bool
1338 NVC0LoweringPass::handleMOD(Instruction *i)
1339 {
1340 if (i->dType != TYPE_F32)
1341 return true;
1342 LValue *value = bld.getScratch();
1343 bld.mkOp1(OP_RCP, TYPE_F32, value, i->getSrc(1));
1344 bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(0), value);
1345 bld.mkOp1(OP_TRUNC, TYPE_F32, value, value);
1346 bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(1), value);
1347 i->op = OP_SUB;
1348 i->setSrc(1, value);
1349 return true;
1350 }
1351
1352 bool
1353 NVC0LoweringPass::handleSQRT(Instruction *i)
1354 {
1355 Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
1356 bld.getSSA(), i->getSrc(0));
1357 i->op = OP_MUL;
1358 i->setSrc(1, rsq->getDef(0));
1359
1360 return true;
1361 }
1362
1363 bool
1364 NVC0LoweringPass::handlePOW(Instruction *i)
1365 {
1366 LValue *val = bld.getScratch();
1367
1368 bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
1369 bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
1370 bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
1371
1372 i->op = OP_EX2;
1373 i->setSrc(0, val);
1374 i->setSrc(1, NULL);
1375
1376 return true;
1377 }
1378
1379 bool
1380 NVC0LoweringPass::handleEXPORT(Instruction *i)
1381 {
1382 if (prog->getType() == Program::TYPE_FRAGMENT) {
1383 int id = i->getSrc(0)->reg.data.offset / 4;
1384
1385 if (i->src(0).isIndirect(0)) // TODO, ugly
1386 return false;
1387 i->op = OP_MOV;
1388 i->subOp = NV50_IR_SUBOP_MOV_FINAL;
1389 i->src(0).set(i->src(1));
1390 i->setSrc(1, NULL);
1391 i->setDef(0, new_LValue(func, FILE_GPR));
1392 i->getDef(0)->reg.data.id = id;
1393
1394 prog->maxGPR = MAX2(prog->maxGPR, id);
1395 } else
1396 if (prog->getType() == Program::TYPE_GEOMETRY) {
1397 i->setIndirect(0, 1, gpEmitAddress);
1398 }
1399 return true;
1400 }
1401
1402 bool
1403 NVC0LoweringPass::handleOUT(Instruction *i)
1404 {
1405 if (i->op == OP_RESTART && i->prev && i->prev->op == OP_EMIT) {
1406 i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
1407 delete_Instruction(prog, i);
1408 } else {
1409 assert(gpEmitAddress);
1410 i->setDef(0, gpEmitAddress);
1411 if (i->srcExists(0))
1412 i->setSrc(1, i->getSrc(0));
1413 i->setSrc(0, gpEmitAddress);
1414 }
1415 return true;
1416 }
1417
1418 // Generate a binary predicate if an instruction is predicated by
1419 // e.g. an f32 value.
1420 void
1421 NVC0LoweringPass::checkPredicate(Instruction *insn)
1422 {
1423 Value *pred = insn->getPredicate();
1424 Value *pdst;
1425
1426 if (!pred || pred->reg.file == FILE_PREDICATE)
1427 return;
1428 pdst = new_LValue(func, FILE_PREDICATE);
1429
1430 // CAUTION: don't use pdst->getInsn, the definition might not be unique,
1431 // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
1432
1433 bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, pdst, bld.mkImm(0), pred);
1434
1435 insn->setPredicate(insn->cc, pdst);
1436 }
1437
//
// - add quadop dance for texturing
// - put FP outputs in GPRs
// - convert instruction sequences
//
bool
NVC0LoweringPass::visit(Instruction *i)
{
   // new instructions are inserted in front of the one being lowered
   bld.setPosition(i, false);

   // make sure the predicate lives in FILE_PREDICATE first
   if (i->cc != CC_ALWAYS)
      checkPredicate(i);

   switch (i->op) {
   case OP_TEX:
   case OP_TXB:
   case OP_TXL:
   case OP_TXF:
   case OP_TXG:
      return handleTEX(i->asTex());
   case OP_TXD:
      return handleTXD(i->asTex());
   case OP_TXQ:
      return handleTXQ(i->asTex());
   case OP_EX2:
      // EX2 needs its source preprocessed by PREEX2
      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
      i->setSrc(0, i->getDef(0));
      break;
   case OP_POW:
      return handlePOW(i);
   case OP_DIV:
      return handleDIV(i);
   case OP_MOD:
      return handleMOD(i);
   case OP_SQRT:
      return handleSQRT(i);
   case OP_EXPORT:
      return handleEXPORT(i);
   case OP_EMIT:
   case OP_RESTART:
      return handleOUT(i);
   case OP_RDSV:
      return handleRDSV(i);
   case OP_WRSV:
      return handleWRSV(i);
   case OP_LOAD:
      if (i->src(0).getFile() == FILE_SHADER_INPUT) {
         if (prog->getType() == Program::TYPE_COMPUTE) {
            // compute shader inputs are laid out in constant space
            i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
            i->getSrc(0)->reg.fileIndex = 0;
         } else {
            // in other stages, attribute loads become VFETCH
            i->op = OP_VFETCH;
            assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
         }
      }
      break;
   case OP_ATOM:
      handleATOM(i);
      break;
   case OP_SULDB:
   case OP_SULDP:
   case OP_SUSTB:
   case OP_SUSTP:
   case OP_SUREDB:
   case OP_SUREDP:
      // surface ops only lowered here for Kepler and up
      if (targ->getChipset() >= NVISA_GK104_CHIPSET)
         handleSurfaceOpNVE4(i->asTex());
      break;
   default:
      break;
   }
   return true;
}
1511
1512 bool
1513 TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
1514 {
1515 if (stage == CG_STAGE_PRE_SSA) {
1516 NVC0LoweringPass pass(prog);
1517 return pass.run(prog, false, true);
1518 } else
1519 if (stage == CG_STAGE_POST_RA) {
1520 NVC0LegalizePostRA pass(prog);
1521 return pass.run(prog, false, true);
1522 } else
1523 if (stage == CG_STAGE_SSA) {
1524 NVC0LegalizeSSA pass;
1525 return pass.run(prog, false, true);
1526 }
1527 return false;
1528 }
1529
1530 } // namespace nv50_ir