gm107/ir: add fp64 rcp
src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"

#include "codegen/nv50_ir_target_nvc0.h"
#include "codegen/nv50_ir_lowering_nvc0.h"

#include <limits>

namespace nv50_ir {

#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// UL UR LL LR
#define QUADOP(q, r, s, t)                      \
   ((QOP_##q << 6) | (QOP_##r << 4) |           \
    (QOP_##s << 2) | (QOP_##t << 0))
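// The four 2-bit fields select a per-lane quad op, e.g.
//   QUADOP(MOV2, ADD, MOV2, ADD) == 0xcc (ADD in the UR/LR lanes)
//   QUADOP(MOV2, MOV2, ADD, ADD) == 0xf0 (ADD in the LL/LR lanes)
// handleManualTXD below uses exactly these two encodings.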

void
NVC0LegalizeSSA::handleDIV(Instruction *i)
{
   FlowInstruction *call;
   int builtin;

   bld.setPosition(i, false);

   // Generate movs to the input regs for the call we want to generate
   for (int s = 0; i->srcExists(s); ++s) {
      Instruction *ld = i->getSrc(s)->getInsn();
      // check if we are moving an immediate, propagate it in that case
      if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV) ||
          !(ld->src(0).getFile() == FILE_IMMEDIATE))
         bld.mkMovToReg(s, i->getSrc(s));
      else {
         assert(ld->getSrc(0) != NULL);
         bld.mkMovToReg(s, ld->getSrc(0));
         // Clear the src, to make code elimination possible here before we
         // delete the instruction i later
         i->setSrc(s, NULL);
         if (ld->isDead())
            delete_Instruction(prog, ld);
      }
   }

   switch (i->dType) {
   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
   default:
      return;
   }
   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
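   // The built-in leaves the quotient in reg 0 and the remainder in reg 1,
   // hence OP_DIV reads back reg 0 and OP_MOD reg 1 below.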
   bld.mkMovFromReg(i->getDef(0), i->op == OP_DIV ? 0 : 1);
   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);
}

void
NVC0LegalizeSSA::handleRCPRSQLib(Instruction *i, Value *src[])
{
   FlowInstruction *call;
   Value *def[2];
   int builtin;

   def[0] = bld.mkMovToReg(0, src[0])->getDef(0);
   def[1] = bld.mkMovToReg(1, src[1])->getDef(0);

   if (i->op == OP_RCP)
      builtin = NVC0_BUILTIN_RCP_F64;
   else
      builtin = NVC0_BUILTIN_RSQ_F64;

   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
   def[0] = bld.getSSA();
   def[1] = bld.getSSA();
   bld.mkMovFromReg(def[0], 0);
   bld.mkMovFromReg(def[1], 1);
   bld.mkClobber(FILE_GPR, 0x3fc, 2);
   bld.mkClobber(FILE_PREDICATE, i->op == OP_RSQ ? 0x3 : 0x1, 0);
   bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), def[0], def[1]);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);

   prog->fp64 = true;
}

void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
   assert(i->dType == TYPE_F64);
   // There are instructions that will compute the high 32 bits of the 64-bit
   // float. We will just stick 0 in the bottom 32 bits.
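   //
   // Roughly, on chips that don't take the library path below,
   //    rcp f64 $d, $s
   // becomes
   //    split $slo, $shi, $s          (2 x 32 bit)
   //    rcp f32 $dhi, $shi            (subOp NV50_IR_SUBOP_RCPRSQ_64H)
   //    mov $dlo, 0
   //    merge u64 $d, $dlo, $dhi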

   bld.setPosition(i, false);

   // 1. Take the source and split it up.
   Value *src[2], *dst[2], *def = i->getDef(0);
   bld.mkSplit(src, 4, i->getSrc(0));

   int chip = prog->getTarget()->getChipset();
   if (chip >= NVISA_GK104_CHIPSET && (i->op == OP_RCP || chip < NVISA_GM107_CHIPSET)) {
      handleRCPRSQLib(i, src);
      return;
   }

   // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
   dst[0] = bld.loadImm(NULL, 0);
   dst[1] = bld.getSSA();

   // 3. The new version of the instruction takes the high 32 bits of the
   // source and outputs the high 32 bits of the destination.
   i->setSrc(0, src[1]);
   i->setDef(0, dst[1]);
   i->setType(TYPE_F32);
   i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;

   // 4. Recombine the two dst pieces back into the original destination.
   bld.setPosition(i, true);
   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
}

void
NVC0LegalizeSSA::handleFTZ(Instruction *i)
{
   // Only want to flush float inputs
   assert(i->sType == TYPE_F32);

   // If we're already flushing denorms (and NaN's) to zero, no need for this.
   if (i->dnz)
      return;

   // Only certain classes of operations can flush
   OpClass cls = prog->getTarget()->getOpClass(i->op);
   if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
       cls != OPCLASS_CONVERT)
      return;

   i->ftz = true;
}

void
NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i)
{
   if (i->tex.levelZero)
      return;

   ImmediateValue lod;

   // The LOD argument comes right after the coordinates (before depth bias,
   // offsets, etc).
   int arg = i->tex.target.getArgCount();

   // SM30+ stores the indirect handle as a separate arg, which comes before
   // the LOD.
   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET &&
       i->tex.rIndirectSrc >= 0)
      arg++;
   // SM20 stores the indirect handle combined with the array coordinate
   if (prog->getTarget()->getChipset() < NVISA_GK104_CHIPSET &&
       !i->tex.target.isArray() &&
       i->tex.rIndirectSrc >= 0)
      arg++;

   if (!i->src(arg).getImmediate(lod) || !lod.isInteger(0))
      return;

   if (i->op == OP_TXL)
      i->op = OP_TEX;
   i->tex.levelZero = true;
   i->moveSources(arg + 1, -1);
}

void
NVC0LegalizeSSA::handleShift(Instruction *lo)
{
   Value *shift = lo->getSrc(1);
   Value *dst64 = lo->getDef(0);
   Value *src[2], *dst[2];
   operation op = lo->op;

   bld.setPosition(lo, false);

   bld.mkSplit(src, 4, lo->getSrc(0));

   // SM30 and prior don't have the fancy new SHF.L/R ops. So the logic has to
   // be completely emulated. For SM35+, we can use the more direct SHF
   // operations.
   if (prog->getTarget()->getChipset() < NVISA_GK20A_CHIPSET) {
      // The strategy here is to handle shifts >= 32 and less than 32 as
      // separate parts.
      //
      // For SHL:
      // If the shift is <= 32, then
      //   (HI,LO) << x = (HI << x | (LO >> (32 - x)), LO << x)
      // If the shift is > 32, then
      //   (HI,LO) << x = (LO << (x - 32), 0)
      //
      // For SHR:
      // If the shift is <= 32, then
      //   (HI,LO) >> x = (HI >> x, (HI << (32 - x)) | LO >> x)
      // If the shift is > 32, then
      //   (HI,LO) >> x = (0, HI >> (x - 32))
      //
      // Note that on NVIDIA hardware, a shift >= 32 yields a 0 value, which
      // we can use to our advantage. Also note the structural similarities
      // between the right/left cases. The main difference is swapping hi/lo
      // on input and output.
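      //
      // Worked example (SHL, x = 40): the predicate (x <= 32) is false, so
      // LO << 40 yields 0 for the low word, and the high word comes from the
      // CC_NOT_P path as LO << (40 - 32) = LO << 8. For x = 32 the CC_P path
      // yields HI' = (HI << 32) | (LO >> 0) = LO and LO' = LO << 32 = 0, as
      // required.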

      Value *x32_minus_shift, *pred, *hi1, *hi2;
      DataType type = isSignedIntType(lo->dType) ? TYPE_S32 : TYPE_U32;
      operation antiop = op == OP_SHR ? OP_SHL : OP_SHR;
      if (op == OP_SHR)
         std::swap(src[0], src[1]);
      bld.mkOp2(OP_ADD, TYPE_U32, (x32_minus_shift = bld.getSSA()), shift, bld.mkImm(0x20))
         ->src(0).mod = Modifier(NV50_IR_MOD_NEG);
      bld.mkCmp(OP_SET, CC_LE, TYPE_U8, (pred = bld.getSSA(1, FILE_PREDICATE)),
                TYPE_U32, shift, bld.mkImm(32));
      // Compute HI (shift <= 32)
      bld.mkOp2(OP_OR, TYPE_U32, (hi1 = bld.getSSA()),
                bld.mkOp2v(op, TYPE_U32, bld.getSSA(), src[1], shift),
                bld.mkOp2v(antiop, TYPE_U32, bld.getSSA(), src[0], x32_minus_shift))
         ->setPredicate(CC_P, pred);
      // Compute LO (all shift values)
      bld.mkOp2(op, type, (dst[0] = bld.getSSA()), src[0], shift);
      // Compute HI (shift > 32)
      bld.mkOp2(op, type, (hi2 = bld.getSSA()), src[0],
                bld.mkOp1v(OP_NEG, TYPE_S32, bld.getSSA(), x32_minus_shift))
         ->setPredicate(CC_NOT_P, pred);
      bld.mkOp2(OP_UNION, TYPE_U32, (dst[1] = bld.getSSA()), hi1, hi2);
      if (op == OP_SHR)
         std::swap(dst[0], dst[1]);
      bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
      delete_Instruction(prog, lo);
      return;
   }

   Instruction *hi = new_Instruction(func, op, TYPE_U32);
   lo->bb->insertAfter(lo, hi);

   hi->sType = lo->sType;
   lo->dType = TYPE_U32;

   hi->setDef(0, (dst[1] = bld.getSSA()));
   if (lo->op == OP_SHR)
      hi->subOp |= NV50_IR_SUBOP_SHIFT_HIGH;
   lo->setDef(0, (dst[0] = bld.getSSA()));

   bld.setPosition(hi, true);

   if (lo->op == OP_SHL)
      std::swap(hi, lo);

   hi->setSrc(0, new_ImmediateValue(prog, 0u));
   hi->setSrc(1, shift);
   hi->setSrc(2, lo->op == OP_SHL ? src[0] : src[1]);

   lo->setSrc(0, src[0]);
   lo->setSrc(1, shift);
   lo->setSrc(2, src[1]);

   bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
}

void
NVC0LegalizeSSA::handleSET(CmpInstruction *cmp)
{
   DataType hTy = cmp->sType == TYPE_S64 ? TYPE_S32 : TYPE_U32;
   Value *carry;
   Value *src0[2], *src1[2];
   bld.setPosition(cmp, false);

   bld.mkSplit(src0, 4, cmp->getSrc(0));
   bld.mkSplit(src1, 4, cmp->getSrc(1));
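   // The low halves are subtracted purely for the borrow: the resulting
   // carry flag is fed to the high-half compare as an extra source, so the
   // 64-bit relation is decided by the high words plus the low-word borrow.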
   bld.mkOp2(OP_SUB, hTy, NULL, src0[0], src1[0])
      ->setFlagsDef(0, (carry = bld.getSSA(1, FILE_FLAGS)));
   cmp->setFlagsSrc(cmp->srcCount(), carry);
   cmp->setSrc(0, src0[1]);
   cmp->setSrc(1, src1[1]);
   cmp->sType = hTy;
}

bool
NVC0LegalizeSSA::visit(Function *fn)
{
   bld.setProgram(fn->getProgram());
   return true;
}

bool
NVC0LegalizeSSA::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (i->sType == TYPE_F32 && prog->getType() != Program::TYPE_COMPUTE)
         handleFTZ(i);

      switch (i->op) {
      case OP_DIV:
      case OP_MOD:
         if (i->sType != TYPE_F32)
            handleDIV(i);
         break;
      case OP_RCP:
      case OP_RSQ:
         if (i->dType == TYPE_F64)
            handleRCPRSQ(i);
         break;
      case OP_TXL:
      case OP_TXF:
         handleTEXLOD(i->asTex());
         break;
      case OP_SHR:
      case OP_SHL:
         if (typeSizeof(i->sType) == 8)
            handleShift(i);
         break;
      case OP_SET:
      case OP_SET_AND:
      case OP_SET_OR:
      case OP_SET_XOR:
         if (typeSizeof(i->sType) == 8 && i->sType != TYPE_F64)
            handleSET(i->asCmp());
         break;
      default:
         break;
      }
   }
   return true;
}

NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
   : rZero(NULL),
     carry(NULL),
     pOne(NULL),
     needTexBar(prog->getTarget()->getChipset() >= 0xe0 &&
                prog->getTarget()->getChipset() < 0x110)
{
}

bool
NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
                                    const Instruction *early) const
{
   if (early->bb == later->bb)
      return early->serial < later->serial;
   return later->bb->dominatedBy(early->bb);
}

void
NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
                              Instruction *usei, const Instruction *texi)
{
   bool add = true;
   bool dominated = insnDominatedBy(usei, texi);
   // Uses before the tex have to all be included. Just because an earlier
   // instruction dominates another instruction doesn't mean that there's no
   // way to get from the tex to the later instruction. For example you could
   // have nested loops, with the tex in the inner loop, and uses before it in
   // both loops - even though the outer loop's instruction would dominate the
   // inner's, we still want a texbar before the inner loop's instruction.
   //
   // However we can still use the eliding logic between uses dominated by the
   // tex instruction, as that is unambiguously correct.
   if (dominated) {
      for (std::list<TexUse>::iterator it = uses.begin(); it != uses.end();) {
         if (it->after) {
            if (insnDominatedBy(usei, it->insn)) {
               add = false;
               break;
            }
            if (insnDominatedBy(it->insn, usei)) {
               it = uses.erase(it);
               continue;
            }
         }
         ++it;
      }
   }
   if (add)
      uses.push_back(TexUse(usei, texi, dominated));
}

// While it might be tempting to use an algorithm that just looks at tex
// uses, not all texture results are guaranteed to be used on all paths. In
// the case where along some control flow path a texture result is never used,
// we might reuse that register for something else, creating a
// write-after-write hazard. So we have to manually look through all
// instructions looking for ones that reference the registers in question.
void
NVC0LegalizePostRA::findFirstUses(
   Instruction *texi, std::list<TexUse> &uses)
{
   int minGPR = texi->def(0).rep()->reg.data.id;
   int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;

   unordered_set<const BasicBlock *> visited;
   findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
}

void
NVC0LegalizePostRA::findFirstUsesBB(
   int minGPR, int maxGPR, Instruction *start,
   const Instruction *texi, std::list<TexUse> &uses,
   unordered_set<const BasicBlock *> &visited)
{
   const BasicBlock *bb = start->bb;

   // We don't process the whole bb the first time around. This is correct,
   // however we might be in a loop and hit this BB again, and need to process
   // the full thing. So only mark a bb as visited if we processed it from the
   // beginning.
   if (start == bb->getEntry()) {
      if (visited.find(bb) != visited.end())
         return;
      visited.insert(bb);
   }

   for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
      if (insn->isNop())
         continue;

      for (int d = 0; insn->defExists(d); ++d) {
         const Value *def = insn->def(d).rep();
         if (insn->def(d).getFile() != FILE_GPR ||
             def->reg.data.id + def->reg.size / 4 - 1 < minGPR ||
             def->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }

      for (int s = 0; insn->srcExists(s); ++s) {
         const Value *src = insn->src(s).rep();
         if (insn->src(s).getFile() != FILE_GPR ||
             src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
             src->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }
   }

   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
      findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
                      texi, uses, visited);
   }
}

// Texture barriers:
// This pass is a bit long and ugly and can probably be optimized.
//
// 1. obtain a list of TEXes and their outputs' first use(s)
// 2. calculate the barrier level of each first use (minimal number of TEXes,
//    over all paths, between the TEX and the use in question)
// 3. for each barrier, if all paths from the source TEX to that barrier
//    contain a barrier of lesser level, it can be culled
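//
// For example, in "tex0; tex1; use(tex0)" the use gets level 1 (tex1 may
// still be outstanding), so a "texbar 1" placed before it only forces the
// older tex0 to complete.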
bool
NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
{
   std::list<TexUse> *uses;
   std::vector<Instruction *> texes;
   std::vector<int> bbFirstTex;
   std::vector<int> bbFirstUse;
   std::vector<int> texCounts;
   std::vector<TexUse> useVec;
   ArrayList insns;

   fn->orderInstructions(insns);

   texCounts.resize(fn->allBBlocks.getSize(), 0);
   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());

   // tag BB CFG nodes by their id for later
   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
      if (bb)
         bb->cfg.tag = bb->getId();
   }

   // gather the first uses for each TEX
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
      if (isTextureOp(tex->op)) {
         texes.push_back(tex);
         if (!texCounts.at(tex->bb->getId()))
            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
         texCounts[tex->bb->getId()]++;
      }
   }
   insns.clear();
   if (texes.empty())
      return false;
   uses = new std::list<TexUse>[texes.size()];
   if (!uses)
      return false;
   for (size_t i = 0; i < texes.size(); ++i) {
      findFirstUses(texes[i], uses[i]);
   }

   // determine the barrier level at each use
   for (size_t i = 0; i < texes.size(); ++i) {
      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
           ++u) {
         BasicBlock *tb = texes[i]->bb;
         BasicBlock *ub = u->insn->bb;
         if (tb == ub) {
            u->level = 0;
            for (size_t j = i + 1; j < texes.size() &&
                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         } else {
            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
                                                      &ub->cfg, texCounts);
            if (u->level < 0) {
               WARN("Failed to find path TEX -> TEXBAR\n");
               u->level = 0;
               continue;
            }
            // this counted all TEXes in the origin block, correct that
            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
            // and did not count the TEXes in the destination block, add those
            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         }
         assert(u->level >= 0);
         useVec.push_back(*u);
      }
   }
   delete[] uses;

   // insert the barriers
   for (size_t i = 0; i < useVec.size(); ++i) {
      Instruction *prev = useVec[i].insn->prev;
      if (useVec[i].level < 0)
         continue;
      if (prev && prev->op == OP_TEXBAR) {
         if (prev->subOp > useVec[i].level)
            prev->subOp = useVec[i].level;
         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
      } else {
         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
         bar->fixed = 1;
         bar->subOp = useVec[i].level;
         // make use explicit to ease latency calculation
         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
      }
   }

   if (fn->getProgram()->optLevel < 3)
      return true;

   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single

   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitS.resize(fn->allBBlocks.getSize());

   // cull unneeded barriers (should do that earlier, but for simplicity)
   IteratorRef bi = fn->cfg.iteratorCFG();
   // first calculate min/max outstanding TEXes for each BB
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      int min = 0;
      int max = std::numeric_limits<int>::max();
      for (Instruction *i = bb->getFirst(); i; i = i->next) {
         if (isTextureOp(i->op)) {
            min++;
            if (max < std::numeric_limits<int>::max())
               max++;
         } else
         if (i->op == OP_TEXBAR) {
            min = MIN2(min, i->subOp);
            max = MIN2(max, i->subOp);
         }
      }
      // limits when looking at an isolated block
      limitS[bb->getId()].min = min;
      limitS[bb->getId()].max = max;
   }
   // propagate the min/max values
   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
      for (bi->reset(); !bi->end(); bi->next()) {
         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
         BasicBlock *bb = BasicBlock::get(n);
         const int bbId = bb->getId();
         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
            BasicBlock *in = BasicBlock::get(ei.getNode());
            const int inId = in->getId();
            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
         }
         // I just hope this is correct ...
         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
            // no barrier
            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
         } else {
            // block contained a barrier
            limitB[bbId].min = MIN2(limitS[bbId].max,
                                    limitT[bbId].min + limitS[bbId].min);
            limitB[bbId].max = MIN2(limitS[bbId].max,
                                    limitT[bbId].max + limitS[bbId].min);
         }
      }
   }
   // finally delete unnecessary barriers
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      Instruction *prev = NULL;
      Instruction *next;
      int max = limitT[bb->getId()].max;
      for (Instruction *i = bb->getFirst(); i; i = next) {
         next = i->next;
         if (i->op == OP_TEXBAR) {
            if (i->subOp >= max) {
               delete_Instruction(prog, i);
               i = NULL;
            } else {
               max = i->subOp;
               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
                  delete_Instruction(prog, prev);
                  prev = NULL;
               }
            }
         } else
         if (isTextureOp(i->op)) {
            max++;
         }
         if (i && !i->isNop())
            prev = i;
      }
   }
   return true;
}

bool
NVC0LegalizePostRA::visit(Function *fn)
{
   if (needTexBar)
      insertTextureBarriers(fn);

   rZero = new_LValue(fn, FILE_GPR);
   pOne = new_LValue(fn, FILE_PREDICATE);
   carry = new_LValue(fn, FILE_FLAGS);

   rZero->reg.data.id = (prog->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET) ? 255 : 63;
   carry->reg.data.id = 0;
   pOne->reg.data.id = 7;

   return true;
}

void
NVC0LegalizePostRA::replaceZero(Instruction *i)
{
   for (int s = 0; i->srcExists(s); ++s) {
      if (s == 2 && i->op == OP_SUCLAMP)
         continue;
      if (s == 1 && i->op == OP_SHLADD)
         continue;
      ImmediateValue *imm = i->getSrc(s)->asImm();
      if (imm) {
         if (i->op == OP_SELP && s == 2) {
            i->setSrc(s, pOne);
            if (imm->reg.data.u64 == 0)
               i->src(s).mod = i->src(s).mod ^ Modifier(NV50_IR_MOD_NOT);
         } else if (imm->reg.data.u64 == 0) {
            i->setSrc(s, rZero);
         }
      }
   }
}

// replace CONT with BRA for single unconditional continue
bool
NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
{
   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
      return false;
   Graph::EdgeIterator ei = bb->cfg.incident();
   if (ei.getType() != Graph::Edge::BACK)
      ei.next();
   if (ei.getType() != Graph::Edge::BACK)
      return false;
   BasicBlock *contBB = BasicBlock::get(ei.getNode());

   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
       contBB->getExit()->getPredicate())
      return false;
   contBB->getExit()->op = OP_BRA;
   bb->remove(bb->getEntry()); // delete PRECONT

   ei.next();
   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
   return true;
}

// replace branches to join blocks with join ops
void
NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
{
   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
      return;
   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      BasicBlock *in = BasicBlock::get(ei.getNode());
      Instruction *exit = in->getExit();
      if (!exit) {
         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
         // there should always be a terminator instruction
         WARN("inserted missing terminator in BB:%i\n", in->getId());
      } else
      if (exit->op == OP_BRA) {
         exit->op = OP_JOIN;
         exit->asFlow()->limit = 1; // must-not-propagate marker
      }
   }
   bb->remove(bb->getEntry());
}

// replaces instructions which would end up as f2f or i2i with faster
// alternatives:
//  - fabs(a)     -> fadd(0, abs a)
//  - fneg(a)     -> fadd(neg 0, neg a)
//  - ineg(a)     -> iadd(0, neg a)
//  - fneg(abs a) -> fadd(neg 0, neg abs a)
//  - sat(a)      -> sat add(0, a)
void
NVC0LegalizePostRA::replaceCvt(Instruction *cvt)
{
   if (!isFloatType(cvt->sType) && typeSizeof(cvt->sType) != 4)
      return;
   if (cvt->sType != cvt->dType)
      return;
   // we could make it work, but in this case we have optimizations disabled
   // and we don't really care either way.
   if (cvt->src(0).getFile() != FILE_GPR &&
       cvt->src(0).getFile() != FILE_MEMORY_CONST)
      return;

   Modifier mod0, mod1;

   switch (cvt->op) {
   case OP_ABS:
      if (cvt->src(0).mod)
         return;
      if (!isFloatType(cvt->sType))
         return;
      mod0 = 0;
      mod1 = NV50_IR_MOD_ABS;
      break;
   case OP_NEG:
      if (!isFloatType(cvt->sType) && cvt->src(0).mod)
         return;
      if (isFloatType(cvt->sType) &&
          (cvt->src(0).mod && cvt->src(0).mod != Modifier(NV50_IR_MOD_ABS)))
         return;

      mod0 = isFloatType(cvt->sType) ? NV50_IR_MOD_NEG : 0;
      mod1 = cvt->src(0).mod == Modifier(NV50_IR_MOD_ABS) ?
         NV50_IR_MOD_NEG_ABS : NV50_IR_MOD_NEG;
      break;
   case OP_SAT:
      if (!isFloatType(cvt->sType) && cvt->src(0).mod.abs())
         return;
      mod0 = 0;
      mod1 = cvt->src(0).mod;
      cvt->saturate = true;
      break;
   default:
      return;
   }

   cvt->op = OP_ADD;
   cvt->moveSources(0, 1);
   cvt->setSrc(0, rZero);
   cvt->src(0).mod = mod0;
   cvt->src(1).mod = mod1;
}

bool
NVC0LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->op == OP_EMIT || i->op == OP_RESTART) {
         if (!i->getDef(0)->refCount())
            i->setDef(0, NULL);
         if (i->src(0).getFile() == FILE_IMMEDIATE)
            i->setSrc(0, rZero); // initial value must be 0
         replaceZero(i);
      } else
      if (i->isNop()) {
         bb->remove(i);
      } else
      if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
          prog->getType() != Program::TYPE_COMPUTE) {
         // It seems like barriers are never required for tessellation since
         // the warp size is 32, and there are always at most 32 tcs threads.
         bb->remove(i);
      } else
      if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) {
         int offset = i->src(0).get()->reg.data.offset;
         if (abs(offset) >= 0x10000)
            i->src(0).get()->reg.fileIndex += offset >> 16;
         i->src(0).get()->reg.data.offset = (int)(short)offset;
      } else {
         // TODO: Move this to before register allocation for operations that
         // need the $c register !
         if (typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) {
            Instruction *hi;
            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
            if (hi)
               next = hi;
         }

         if (i->op == OP_SAT || i->op == OP_NEG || i->op == OP_ABS)
            replaceCvt(i);

         if (i->op != OP_MOV && i->op != OP_PFETCH)
            replaceZero(i);
      }
   }
   if (!bb->getEntry())
      return true;

   if (!tryReplaceContWithBra(bb))
      propagateJoin(bb);

   return true;
}

NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
{
   bld.setProgram(prog);
}

bool
NVC0LoweringPass::visit(Function *fn)
{
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      assert(!strncmp(fn->getName(), "MAIN", 4));
      // TODO: when we generate actual functions pass this value along somehow
      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
      if (fn->cfgExit) {
         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
         bld.mkMovToReg(0, gpEmitAddress);
      }
   }
   return true;
}

bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}

inline Value *
NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   uint32_t off = prog->driver->io.texBindBase + slot * 4;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(2));

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

// move array source to first slot, convert to u16, add indirections
bool
NVC0LoweringPass::handleTEX(TexInstruction *i)
{
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int arg = i->tex.target.getArgCount();
   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
   const int chipset = prog->getTarget()->getChipset();

   /* Only normalize in the non-explicit derivatives case. For explicit
    * derivatives, this is handled in handleManualTXD.
    */
   if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
      Value *src[3], *val;
      int c;
      for (c = 0; c < 3; ++c)
         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
      val = bld.getScratch();
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
      for (c = 0; c < 3; ++c) {
         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
                                 i->getSrc(c), val));
      }
   }

   // Arguments to the TEX instruction are a little insane. Even though the
   // encoding is identical between SM20 and SM30, the arguments mean
   // different things between Fermi and Kepler+. A lot of arguments are
   // optional based on flags passed to the instruction. This summarizes the
   // order of things.
   //
   // Fermi:
   //  array/indirect
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets:
   //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
   //    - other: 4 bits each, single reg
   //
   // Kepler+:
   //  indirect handle
   //  array (+ offsets for txd in upper 16 bits)
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets (same as fermi, except txd which takes it with array)
   //
   // Maxwell (tex):
   //  array
   //  coords
   //  indirect handle
   //  sample
   //  lod bias
   //  depth compare
   //  offsets
   //
   // Maxwell (txd):
   //  indirect handle
   //  coords
   //  array + offsets
   //  derivatives

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
         // XXX this ignores tsc, and assumes a 1:1 mapping
         assert(i->tex.rIndirectSrc >= 0);
         if (!i->tex.bindless) {
            Value *hnd = loadTexHandle(i->getIndirectR(), i->tex.r);
            i->tex.r = 0xff;
            i->tex.s = 0x1f;
            i->setIndirectR(hnd);
         }
         i->setIndirectS(NULL);
      } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
         if (i->tex.r == 0xffff)
            i->tex.r = prog->driver->io.fbtexBindBase / 4;
         else
            i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s = 0; // only a single cX[] value possible here
      } else {
         Value *hnd = bld.getScratch();
         Value *rHnd = loadTexHandle(NULL, i->tex.r);
         Value *sHnd = loadTexHandle(NULL, i->tex.s);

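         // The INSBF immediate appears to encode (width << 8) | offset, so
         // 0x1400 takes the low 20 bits (the TIC index) from the texture
         // handle and merges them into the sampler handle's word.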
         bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);

         i->tex.r = 0; // not used for indirect tex
         i->tex.s = 0;
         i->setIndirectR(hnd);
      }
      if (i->tex.target.isArray()) {
         LValue *layer = new_LValue(func, FILE_GPR);
         Value *src = i->getSrc(lyr);
         const int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
         if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
            for (int s = dim; s >= 1; --s)
               i->setSrc(s, i->getSrc(s - 1));
            i->setSrc(0, layer);
         } else {
            i->setSrc(dim, layer);
         }
      }
      // Move the indirect reference to the first place
      if (i->tex.rIndirectSrc >= 0 && (
             i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(0, 1);
         i->setSrc(0, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
      // Move the indirect reference to right after the coords
      else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(arg, 1);
         i->setSrc(arg, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
   } else
   // (nvc0) generate and move the tsc/tic/array source to the front
   if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      Value *ticRel = i->getIndirectR();
      Value *tscRel = i->getIndirectS();

      if (i->tex.r == 0xffff) {
         i->tex.r = 0x20;
         i->tex.s = 0x10;
      }

      if (ticRel) {
         i->setSrc(i->tex.rIndirectSrc, NULL);
         if (i->tex.r)
            ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                ticRel, bld.mkImm(i->tex.r));
      }
      if (tscRel) {
         i->setSrc(i->tex.sIndirectSrc, NULL);
         if (i->tex.s)
            tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                tscRel, bld.mkImm(i->tex.s));
      }

      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
      if (arrayIndex) {
         for (int s = dim; s >= 1; --s)
            i->setSrc(s, i->getSrc(s - 1));
         i->setSrc(0, arrayIndex);
      } else {
         i->moveSources(0, 1);
      }

      if (arrayIndex) {
         int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
      } else {
         bld.loadImm(src, 0);
      }

      if (ticRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
      if (tscRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);

      i->setSrc(0, src);
   }

   // For nvc0, the sample id has to be in the second operand, as the offset
   // does. Right now we don't know how to pass both in, and this case can't
   // happen with OpenGL. On nve0, the sample id is part of the texture
   // coordinate argument.
   assert(chipset >= NVISA_GK104_CHIPSET ||
          !i->tex.useOffsets || !i->tex.target.isMS());

   // offset is between lod and dc
   if (i->tex.useOffsets) {
      int n, c;
      int s = i->srcCount(0xff, true);
      if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
         if (i->tex.target.isShadow())
            s--;
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 1);
         if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
            i->moveSources(s + 1, 1);
      }
      if (i->op == OP_TXG) {
         // Either there is 1 offset, which goes into the 2 low bytes of the
         // first source, or there are 4 offsets, which go into 2 sources (8
         // values, 1 byte each).
         Value *offs[2] = {NULL, NULL};
         for (n = 0; n < i->tex.useOffsets; n++) {
            for (c = 0; c < 2; ++c) {
               if ((n % 2) == 0 && c == 0)
                  bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get());
               else
                  bld.mkOp3(OP_INSBF, TYPE_U32,
                            offs[n / 2],
                            i->offset[n][c].get(),
                            bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
                            offs[n / 2]);
            }
         }
         i->setSrc(s, offs[0]);
         if (offs[1])
            i->setSrc(s + 1, offs[1]);
      } else {
         unsigned imm = 0;
         assert(i->tex.useOffsets == 1);
         for (c = 0; c < 3; ++c) {
            ImmediateValue val;
            if (!i->offset[0][c].getImmediate(val))
               assert(!"non-immediate offset passed to non-TXG");
            imm |= (val.reg.data.u32 & 0xf) << (c * 4);
         }
         if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
            // The offset goes into the upper 16 bits of the array index. So
            // create it if it's not already there, and INSBF it if it already
            // is.
            s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
            if (chipset >= NVISA_GM107_CHIPSET)
               s += dim;
            if (i->tex.target.isArray()) {
               Value *offset = bld.getScratch();
               bld.mkOp3(OP_INSBF, TYPE_U32, offset,
                         bld.loadImm(NULL, imm), bld.mkImm(0xc10),
                         i->getSrc(s));
               i->setSrc(s, offset);
            } else {
               i->moveSources(s, 1);
               i->setSrc(s, bld.loadImm(NULL, imm << 16));
            }
         } else {
            i->setSrc(s, bld.loadImm(NULL, imm));
         }
      }
   }

   return true;
}

bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   // Always done from the l0 perspective. This is the way that NVIDIA's
   // driver does it, and doing it from the "current" lane's perspective
   // doesn't seem to always work for reasons that aren't altogether clear,
   // even in frag shaders.
   //
   // Note that we must move not only the coordinates into lane0, but also all
   // ancillary arguments, like array indices and depth compare as they may
   // differ between lanes. Offsets for TXD are supposed to be uniform, so we
   // leave them alone.
   static const uint8_t qOps[2] =
      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) };
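   // Per the QUADOP table at the top of the file, qOps[0] (0xcc) applies ADD
   // in the right-hand lanes to offset a coordinate by dPdx, and qOps[1]
   // (0xf0) does the same for dPdy in the bottom lanes; the MOV2 slots leave
   // the remaining lanes' values effectively unchanged.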

   Value *def[4][4];
   Value *crd[3], *arr[2], *shadow;
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();

   // This function is invoked after handleTEX lowering, so we have to expect
   // the arguments in the order that the hw wants them. For Fermi, array and
   // indirect are both in the leading arg, while for Kepler, array and
   // indirect are separate (and both precede the coordinates). Maxwell is
   // handled in a separate function.
   int array;
   if (targ->getChipset() < NVISA_GK104_CHIPSET)
      array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
   else
      array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();
   for (c = 0; c < array; ++c)
      arr[c] = bld.getScratch();
   shadow = bld.getScratch();

   for (l = 0; l < 4; ++l) {
      Value *src[3], *val;

      bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
      // we're using the texture result from lane 0 in all cases, so make sure
      // that lane 0 is pointing at the proper array index, indirect value,
      // and depth compare.
      if (l != 0) {
         for (c = 0; c < array; ++c)
            bld.mkQuadop(0x00, arr[c], l, i->getSrc(c), zero);
         if (i->tex.target.isShadow()) {
            // The next argument after coords is the depth compare
            bld.mkQuadop(0x00, shadow, l, i->getSrc(array + dim), zero);
         }
      }
      // mov position coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // normalize cube coordinates
      if (i->tex.target.isCube()) {
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
         val = bld.getScratch();
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
      } else {
         for (c = 0; c < dim; ++c)
            src[c] = crd[c];
      }
      // texture
      bld.insert(tex = cloneForward(func, i));
      if (l != 0) {
         for (c = 0; c < array; ++c)
            tex->setSrc(c, arr[c]);
         if (i->tex.target.isShadow())
            tex->setSrc(array + dim, shadow);
      }
      for (c = 0; c < dim; ++c)
         tex->setSrc(c + array, src[c]);
      // broadcast results from lane 0 to all lanes so that the moves *into*
      // the target lane pick up the proper value.
      if (l != 0)
         for (c = 0; i->defExists(c); ++c)
            bld.mkQuadop(0x00, tex->getDef(c), 0, tex->getDef(c), zero);
      bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }

   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}

bool
NVC0LoweringPass::handleTXD(TexInstruction *txd)
{
   int dim = txd->tex.target.getDim() + txd->tex.target.isCube();
   unsigned arg = txd->tex.target.getArgCount();
   unsigned expected_args = arg;
   const int chipset = prog->getTarget()->getChipset();

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (!txd->tex.target.isArray() && txd->tex.useOffsets)
         expected_args++;
      if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
         expected_args++;
   } else {
      if (txd->tex.useOffsets)
         expected_args++;
      if (!txd->tex.target.isArray() && (
             txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))
         expected_args++;
   }

   if (expected_args > 4 ||
       dim > 2 ||
       txd->tex.target.isShadow())
      txd->op = OP_TEX;

   handleTEX(txd);
   while (txd->srcExists(arg))
      ++arg;

   txd->tex.derivAll = true;
   if (txd->op == OP_TEX)
      return handleManualTXD(txd);

   assert(arg == expected_args);
   for (int c = 0; c < dim; ++c) {
      txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
      txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
      txd->dPdx[c].set(NULL);
      txd->dPdy[c].set(NULL);
   }

   // In this case we have fewer than 4 "real" arguments, which means that
   // handleTEX didn't apply any padding. However we have to make sure that
   // the second "group" of arguments still gets padded up to 4.
   if (chipset >= NVISA_GK104_CHIPSET) {
      int s = arg + 2 * dim;
      if (s >= 4 && s < 7) {
         if (txd->srcExists(s)) // move potential predicate out of the way
            txd->moveSources(s, 7 - s);
         while (s < 7)
            txd->setSrc(s++, bld.loadImm(NULL, 0));
      }
   }

   return true;
}

bool
NVC0LoweringPass::handleTXQ(TexInstruction *txq)
{
   const int chipset = prog->getTarget()->getChipset();
   if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)
      txq->tex.r += prog->driver->io.texBindBase / 4;

   if (txq->tex.rIndirectSrc < 0)
      return true;

   Value *ticRel = txq->getIndirectR();

   txq->setIndirectS(NULL);
   txq->tex.sIndirectSrc = -1;

   assert(ticRel);

   if (chipset < NVISA_GK104_CHIPSET) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      txq->setSrc(txq->tex.rIndirectSrc, NULL);
      if (txq->tex.r)
         ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                             ticRel, bld.mkImm(txq->tex.r));

      bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));

      txq->moveSources(0, 1);
      txq->setSrc(0, src);
   } else {
      Value *hnd = loadTexHandle(txq->getIndirectR(), txq->tex.r);
      txq->tex.r = 0xff;
      txq->tex.s = 0x1f;

      txq->setIndirectR(NULL);
      txq->moveSources(0, 1);
      txq->setSrc(0, hnd);
      txq->tex.rIndirectSrc = 0;
   }

   return true;
}

bool
NVC0LoweringPass::handleTXLQ(TexInstruction *i)
{
   /* The outputs are inverted compared to what the TGSI instruction
    * expects. Take that into account in the mask.
    */
   assert((i->tex.mask & ~3) == 0);
   if (i->tex.mask == 1)
      i->tex.mask = 2;
   else if (i->tex.mask == 2)
      i->tex.mask = 1;
   handleTEX(i);
   bld.setPosition(i, true);

   /* The returned values are not quite what we want:
    * (a) convert from s16/u16 to f32
    * (b) multiply by 1/256
    */
   for (int def = 0; def < 2; ++def) {
      if (!i->defExists(def))
         continue;
      enum DataType type = TYPE_S16;
      if (i->tex.mask == 2 || def > 0)
         type = TYPE_U16;
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));
      bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
                i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
   }
   if (i->tex.mask == 3) {
      LValue *t = new_LValue(func, FILE_GPR);
      bld.mkMov(t, i->getDef(0));
      bld.mkMov(i->getDef(0), i->getDef(1));
      bld.mkMov(i->getDef(1), t);
   }
   return true;
}

bool
NVC0LoweringPass::handleBUFQ(Instruction *bufq)
{
   bufq->op = OP_MOV;
   bufq->setSrc(0, loadBufLength32(bufq->getIndirect(0, 1),
                                   bufq->getSrc(0)->reg.fileIndex * 16));
   bufq->setIndirect(0, 0, NULL);
   bufq->setIndirect(0, 1, NULL);
   return true;
}

void
NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)
{
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);

   BasicBlock *currBB = atom->bb;
   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
   BasicBlock *joinBB = atom->bb->splitAfter(atom);
   BasicBlock *setAndUnlockBB = new BasicBlock(func);
   BasicBlock *failLockBB = new BasicBlock(func);
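
   // Rough shape of the CFG built below:
   //
   //   currBB -> tryLockBB --(locked)--> setAndUnlockBB
   //                ^  \                      /
   //                 \  `-(not locked)-> failLockBB --(stored)--> joinBB
   //                  \______(retry)_________/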

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   CmpInstruction *pred =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0), bld.mkImm(1));

   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);

   bld.setPosition(tryLockBB, true);

   Instruction *ld =
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
                 atom->getIndirect(0, 0));
   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;

   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));
   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);

   tryLockBB->cfg.detach(&joinBB->cfg);
   bld.remove(atom);

   bld.setPosition(setAndUnlockBB, true);
   Value *stVal;
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
      // Read the old value, and write the new one.
      stVal = atom->getSrc(1);
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      CmpInstruction *set =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));

      bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),
                TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));
   } else {
      operation op;

      switch (atom->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         op = OP_ADD;
         break;
      case NV50_IR_SUBOP_ATOM_AND:
         op = OP_AND;
         break;
      case NV50_IR_SUBOP_ATOM_OR:
         op = OP_OR;
         break;
      case NV50_IR_SUBOP_ATOM_XOR:
         op = OP_XOR;
         break;
      case NV50_IR_SUBOP_ATOM_MIN:
         op = OP_MIN;
         break;
      case NV50_IR_SUBOP_ATOM_MAX:
         op = OP_MAX;
         break;
      default:
         assert(0);
         return;
      }

      stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),
                         atom->getSrc(1));
   }

   Instruction *st =
      bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
                  atom->getIndirect(0, 0), stVal);
   st->setDef(0, pred->getDef(0));
   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;

   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);

   // Retry the lock until the store has actually been performed.
   bld.setPosition(failLockBB, true);
   bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);

   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
}

void
NVC0LoweringPass::handleSharedATOM(Instruction *atom)
{
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);

   BasicBlock *currBB = atom->bb;
   BasicBlock *tryLockAndSetBB = atom->bb->splitBefore(atom, false);
   BasicBlock *joinBB = atom->bb->splitAfter(atom);

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_ALWAYS, NULL);
   currBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::TREE);

   bld.setPosition(tryLockAndSetBB, true);

   Instruction *ld =
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
                 atom->getIndirect(0, 0));
   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;

   Value *stVal;
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
      // Read the old value, and write the new one.
      stVal = atom->getSrc(1);
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      CmpInstruction *set =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));
      set->setPredicate(CC_P, ld->getDef(1));

      Instruction *selp =
         bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), ld->getDef(0),
                   atom->getSrc(2), set->getDef(0));
      selp->src(2).mod = Modifier(NV50_IR_MOD_NOT);
      selp->setPredicate(CC_P, ld->getDef(1));

      stVal = selp->getDef(0);
   } else {
      operation op;

      switch (atom->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         op = OP_ADD;
         break;
      case NV50_IR_SUBOP_ATOM_AND:
         op = OP_AND;
         break;
      case NV50_IR_SUBOP_ATOM_OR:
         op = OP_OR;
         break;
      case NV50_IR_SUBOP_ATOM_XOR:
         op = OP_XOR;
         break;
      case NV50_IR_SUBOP_ATOM_MIN:
         op = OP_MIN;
         break;
      case NV50_IR_SUBOP_ATOM_MAX:
         op = OP_MAX;
         break;
      default:
         assert(0);
         return;
      }

      Instruction *i =
         bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
                   atom->getSrc(1));
      i->setPredicate(CC_P, ld->getDef(1));

      stVal = i->getDef(0);
   }

   Instruction *st =
      bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
                  atom->getIndirect(0, 0), stVal);
   st->setPredicate(CC_P, ld->getDef(1));
   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;

   // Loop until the lock is acquired.
   bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_NOT_P, ld->getDef(1));
   tryLockAndSetBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::BACK);
   tryLockAndSetBB->cfg.attach(&joinBB->cfg, Graph::Edge::CROSS);
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);

   bld.remove(atom);

   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
}

bool
NVC0LoweringPass::handleATOM(Instruction *atom)
{
   SVSemantic sv;
   Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base;

   switch (atom->src(0).getFile()) {
   case FILE_MEMORY_LOCAL:
      sv = SV_LBASE;
      break;
   case FILE_MEMORY_SHARED:
      // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
      // operations on shared memory. For Maxwell, ATOMS is enough.
      if (targ->getChipset() < NVISA_GK104_CHIPSET)
         handleSharedATOM(atom);
      else if (targ->getChipset() < NVISA_GM107_CHIPSET)
         handleSharedATOMNVE4(atom);
      return true;
   default:
      assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER);
      base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
      assert(base->reg.size == 8);
      if (ptr)
         base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
      assert(base->reg.size == 8);
      atom->setIndirect(0, 0, base);
      atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;

      // Harden against out-of-bounds accesses
      Value *offset = bld.loadImm(NULL, atom->getSrc(0)->reg.data.offset + typeSizeof(atom->sType));
      Value *length = loadBufLength32(ind, atom->getSrc(0)->reg.fileIndex * 16);
      Value *pred = new_LValue(func, FILE_PREDICATE);
      if (ptr)
         bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, ptr);
      bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
      atom->setPredicate(CC_NOT_P, pred);
      if (atom->defExists(0)) {
         Value *zero, *dst = atom->getDef(0);
         atom->setDef(0, bld.getSSA());

         bld.setPosition(atom, true);
         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
            ->setPredicate(CC_P, pred);
         bld.mkOp2(OP_UNION, TYPE_U32, dst, atom->getDef(0), zero);
      }

      return true;
   }
   base =
      bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));

   atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
   atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
   if (ptr)
      base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
   atom->setIndirect(0, 1, NULL);
   atom->setIndirect(0, 0, base);

   return true;
}

bool
NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
{
   if (targ->getChipset() < NVISA_GM107_CHIPSET) {
      if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
         // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
         return false;
      }
   }

   if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
       cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
      return false;
   bld.setPosition(cas, true);

   if (needCctl) {
      Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
      cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
      cctl->fixed = 1;
      cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
      if (cas->isPredicated())
         cctl->setPredicate(cas->cc, cas->getPredicate());
   }

   if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      // CAS is crazy. Its 2nd source is a double reg, and the 3rd source
      // should be set to the high part of the double reg or bad things will
      // happen elsewhere in the universe.
      // Also, it sometimes returns the new value instead of the old one
      // under mysterious circumstances.
      Value *dreg = bld.getSSA(8);
      bld.setPosition(cas, false);
      bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
      cas->setSrc(1, dreg);
      cas->setSrc(2, dreg);
   }

   return true;
}

inline Value *
NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   off += base;

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

inline Value *
NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   off += base;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));

   return bld.
      mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);
}

inline Value *
NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   off += base;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
}

inline Value *
NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
{
   return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
}

inline Value *
NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
{
   return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
}

inline Value *
NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
{
   return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
}

inline Value *
NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
{
   return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
}

inline Value *
NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
{
   uint8_t b = prog->driver->io.msInfoCBSlot;
   off += prog->driver->io.msInfoBase;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

inline Value *
NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless)
{
   uint32_t base = slot * NVC0_SU_INFO__STRIDE;
1804
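// Indirect access: add the slot bias, mask to the valid range (8 bound
// slots, or 512 bindless handles), and shift by 6 to index the 64-byte
// info record for that slot.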
1805 if (ptr) {
1806 ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));
1807 if (bindless)
1808 ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(511));
1809 else
1810 ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
1811 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6));
1812 base = 0;
1813 }
1814 off += base;
1815
1816 return loadResInfo32(ptr, off, bindless ? prog->driver->io.bindlessBase :
1817 prog->driver->io.suInfoBase);
1818 }
1819
1820 Value *
1821 NVC0LoweringPass::loadMsAdjInfo32(TexInstruction::Target target, uint32_t index, int slot, Value *ind, bool bindless)
1822 {
1823 if (!bindless || targ->getChipset() < NVISA_GM107_CHIPSET)
1824 return loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(index), bindless);
1825
1826 assert(bindless);
1827
1828 Value *samples = bld.getSSA();
1829 // This TXQ won't be lowered by this pass, since it's inserted before the current instruction.
1830 TexInstruction *tex = new_TexInstruction(func, OP_TXQ);
1831 tex->tex.target = target;
1832 tex->tex.query = TXQ_TYPE;
1833 tex->tex.mask = 0x4;
1834 tex->tex.r = 0xff;
1835 tex->tex.s = 0x1f;
1836 tex->tex.rIndirectSrc = 0;
1837 tex->setDef(0, samples);
1838 tex->setSrc(0, ind);
1839 tex->setSrc(1, bld.loadImm(NULL, 0));
1840 bld.insert(tex);
1841
1842 // Doesn't work with sample counts other than 1/2/4/8, but those aren't supported anyway.
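// MS(0)/MS(1) are the per-axis log2 sample spreads; samples -> (x, y) maps
// as 1 -> (0, 0), 2 -> (1, 0), 4 -> (1, 1), 8 -> (2, 1), which the two
// cases below recompute from the TXQ sample count.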
1843 switch (index) {
1844 case 0: {
1845 Value *tmp = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), samples, bld.mkImm(2));
1846 return bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(2));
1847 }
1848 case 1: {
1849 Value *tmp = bld.mkCmp(OP_SET, CC_GT, TYPE_U32, bld.getSSA(), TYPE_U32, samples, bld.mkImm(2))->getDef(0);
1850 return bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(1));
1851 }
1852 default: {
1853 assert(false);
1854 return NULL;
1855 }
1856 }
1857 }
1858
1859 static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
1860 {
1861 switch (su->tex.target.getEnum()) {
1862 case TEX_TARGET_BUFFER: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
1863 case TEX_TARGET_RECT: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1864 case TEX_TARGET_1D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1865 case TEX_TARGET_1D_ARRAY: return (c == 1) ?
1866 NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
1867 NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1868 case TEX_TARGET_2D: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1869 case TEX_TARGET_2D_MS: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1870 case TEX_TARGET_2D_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1871 case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1872 case TEX_TARGET_3D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1873 case TEX_TARGET_CUBE: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1874 case TEX_TARGET_CUBE_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1875 default:
1876 assert(0);
1877 return 0;
1878 }
1879 }
1880
1881 bool
1882 NVC0LoweringPass::handleSUQ(TexInstruction *suq)
1883 {
1884 int mask = suq->tex.mask;
1885 int dim = suq->tex.target.getDim();
1886 int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
1887 Value *ind = suq->getIndirectR();
1888 int slot = suq->tex.r;
1889 int c, d;
1890
1891 for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
1892 if (c >= arg || !(mask & 1))
1893 continue;
1894
1895 int offset;
1896
1897 if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
1898 offset = NVC0_SU_INFO_SIZE(2);
1899 } else {
1900 offset = NVC0_SU_INFO_SIZE(c);
1901 }
1902 bld.mkMov(suq->getDef(d++), loadSuInfo32(ind, slot, offset, suq->tex.bindless));
1903 if (c == 2 && suq->tex.target.isCube())
1904 bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
1905 bld.loadImm(NULL, 6));
1906 }
1907
1908 if (mask & 1) {
1909 if (suq->tex.target.isMS()) {
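// The MS info values are the per-axis log2 sample spreads, so the total
// sample count is 1 << (ms_x + ms_y).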
1910 Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), suq->tex.bindless);
1911 Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), suq->tex.bindless);
1912 Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
1913 bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
1914 } else {
1915 bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
1916 }
1917 }
1918
1919 bld.remove(suq);
1920 return true;
1921 }
1922
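// Rewrites an MS surface/texture access into a non-MS one: scale x/y by the
// per-axis log2 sample spreads, add the sample's offset from the MS info
// table, and drop the sample-index source.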
1923 void
1924 NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
1925 {
1926 const int arg = tex->tex.target.getArgCount();
1927 int slot = tex->tex.r;
1928
1929 if (tex->tex.target == TEX_TARGET_2D_MS)
1930 tex->tex.target = TEX_TARGET_2D;
1931 else
1932 if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
1933 tex->tex.target = TEX_TARGET_2D_ARRAY;
1934 else
1935 return;
1936
1937 Value *x = tex->getSrc(0);
1938 Value *y = tex->getSrc(1);
1939 Value *s = tex->getSrc(arg - 1);
1940
1941 Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
1942 Value *ind = tex->getIndirectR();
1943
1944 Value *ms_x = loadMsAdjInfo32(tex->tex.target, 0, slot, ind, tex->tex.bindless);
1945 Value *ms_y = loadMsAdjInfo32(tex->tex.target, 1, slot, ind, tex->tex.bindless);
1946
1947 bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
1948 bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
1949
1950 s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
1951 s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));
1952
1953 Value *dx = loadMsInfo32(ts, 0x0);
1954 Value *dy = loadMsInfo32(ts, 0x4);
1955
1956 bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
1957 bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
1958
1959 tex->setSrc(0, tx);
1960 tex->setSrc(1, ty);
1961 tex->moveSources(arg, -1);
1962 }
1963
1964 // Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
1965 // They're computed from the coordinates using the surface info in c[] space.
1966 void
1967 NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
1968 {
1969 Instruction *insn;
1970 const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
1971 const bool raw =
1972 su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
1973 const int slot = su->tex.r;
1974 const int dim = su->tex.target.getDim();
1975 const bool array = su->tex.target.isArray() || su->tex.target.isCube();
1976 const int arg = dim + array;
1977 int c;
1978 Value *zero = bld.mkImm(0);
1979 Value *p1 = NULL;
1980 Value *v;
1981 Value *src[3];
1982 Value *bf, *eau, *off;
1983 Value *addr, *pred;
1984 Value *ind = su->getIndirectR();
1985 Value *y, *z;
1986
1987 off = bld.getScratch(4);
1988 bf = bld.getScratch(4);
1989 addr = bld.getSSA(8);
1990 pred = bld.getScratch(1, FILE_PREDICATE);
1991
1992 bld.setPosition(su, false);
1993
1994 adjustCoordinatesMS(su);
1995
1996 // calculate clamped coordinates
1997 for (c = 0; c < arg; ++c) {
1998 int dimc = c;
1999
2000 if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) {
2001 // The array index is stored in the Z component for 1D arrays.
2002 dimc = 2;
2003 }
2004
2005 src[c] = bld.getScratch();
2006 if (c == 0 && raw)
2007 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X, su->tex.bindless);
2008 else
2009 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc), su->tex.bindless);
2010 bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
2011 ->subOp = getSuClampSubOp(su, dimc);
2012 }
2013 for (; c < 3; ++c)
2014 src[c] = zero;
2015
2016 if (dim == 2 && !array) {
2017 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
2018 src[2] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),
2019 v, bld.loadImm(NULL, 16));
2020
2021 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(2), su->tex.bindless);
2022 bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[2], src[2], v, zero)
2023 ->subOp = NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
2024 }
2025
2026 // set predicate output
2027 if (su->tex.target == TEX_TARGET_BUFFER) {
2028 src[0]->getInsn()->setFlagsDef(1, pred);
2029 } else
2030 if (array) {
2031 p1 = bld.getSSA(1, FILE_PREDICATE);
2032 src[dim]->getInsn()->setFlagsDef(1, p1);
2033 }
2034
2035 // calculate pixel offset
2036 if (dim == 1) {
2037 y = z = zero;
2038 if (su->tex.target != TEX_TARGET_BUFFER)
2039 bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
2040 } else {
2041 y = src[1];
2042 z = src[2];
2043
2044 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
2045 bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
2046 ->subOp = NV50_IR_SUBOP_MADSP(4,4,8); // u16l u16l u16l
2047
2048 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
2049 bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
2050 ->subOp = array ?
2051 NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
2052 }
2053
2054 // calculate effective address part 1
2055 if (su->tex.target == TEX_TARGET_BUFFER) {
2056 if (raw) {
2057 bf = src[0];
2058 } else {
2059 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);
2060 bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
2061 ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
2062 }
2063 } else {
2064 uint16_t subOp = 0;
2065
2066 switch (dim) {
2067 case 1:
2068 break;
2069 case 2:
2070 if (array) {
2071 z = off;
2072 } else {
2073 subOp = NV50_IR_SUBOP_SUBFM_3D;
2074 }
2075 break;
2076 default:
2077 subOp = NV50_IR_SUBOP_SUBFM_3D;
2078 assert(dim == 3);
2079 break;
2080 }
2081 insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
2082 insn->subOp = subOp;
2083 insn->setFlagsDef(1, pred);
2084 }
2085
2086 // part 2
2087 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless);
2088
2089 if (su->tex.target == TEX_TARGET_BUFFER) {
2090 eau = v;
2091 } else {
2092 eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
2093 }
2094 // add array layer offset
2095 if (array) {
2096 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
2097 if (dim == 1)
2098 bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
2099 ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
2100 else
2101 bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
2102 ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
2103 // combine predicates
2104 assert(p1);
2105 bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
2106 }
2107
2108 if (atom) {
2109 Value *lo = bf;
2110 if (su->tex.target == TEX_TARGET_BUFFER) {
2111 lo = zero;
2112 bld.mkMov(off, bf);
2113 }
2114 // bf == g[] address & 0xff
2115 // eau == g[] address >> 8
2116 bld.mkOp3(OP_PERMT, TYPE_U32, bf, lo, bld.loadImm(NULL, 0x6540), eau);
2117 bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
2118 } else
2119 if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
2120 // Convert from the u32 to the u8 address format, which is what the
2121 // library code implementing SULDP currently uses.
2122 // XXX: can SUEAU do this?
2123 // XXX: does it matter that we don't mask the high bytes in bf?
2124 // Grrr.
2125 bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
2126 bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
2127 }
2128
2129 bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);
2130
2131 if (atom && su->tex.target == TEX_TARGET_BUFFER)
2132 bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);
2133
2134 // let's just set it to 0 for raw access and hope it works
2135 v = raw ?
2136 bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);
2137
2138 // get rid of old coordinate sources, make space for fmt info and predicate
2139 su->moveSources(arg, 3 - arg);
2140 // set 64-bit address and 32-bit format sources
2141 su->setSrc(0, addr);
2142 su->setSrc(1, v);
2143 su->setSrc(2, pred);
2144 su->setIndirectR(NULL);
2145
2146 // prevent read fault when the image is not actually bound
2147 CmpInstruction *pred1 =
2148 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2149 TYPE_U32, bld.mkImm(0),
2150 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2151
2152 if (su->op != OP_SUSTP && su->tex.format) {
2153 const TexInstruction::ImgFormatDesc *format = su->tex.format;
2154 int blockwidth = format->bits[0] + format->bits[1] +
2155 format->bits[2] + format->bits[3];
2156
2157 // make sure that the format doesn't mismatch
2158 assert(format->components != 0);
2159 bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0),
2160 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2161 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2162 pred1->getDef(0));
2163 }
2164 su->setPredicate(CC_NOT_P, pred1->getDef(0));
2165
2166 // TODO: initialize def values to 0 when the surface operation is not
2167 // performed (not needed for stores). Also, fix the "address bounds test"
2168 // subtests from arb_shader_image_load_store-invalid for buffers, because it
2169 // seems like the predicate is not correctly set by suclamp.
2170 }
2171
2172 static DataType
2173 getSrcType(const TexInstruction::ImgFormatDesc *t, int c)
2174 {
2175 switch (t->type) {
2176 case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
2177 case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
2178 case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
2179 case UINT:
2180 return (t->bits[c] == 8 ? TYPE_U8 :
2181 (t->bits[c] == 16 ? TYPE_U16 : TYPE_U32));
2182 case SINT:
2183 return (t->bits[c] == 8 ? TYPE_S8 :
2184 (t->bits[c] == 16 ? TYPE_S16 : TYPE_S32));
2185 }
2186 return TYPE_NONE;
2187 }
2188
2189 static DataType
2190 getDestType(const ImgType type) {
2191 switch (type) {
2192 case FLOAT:
2193 case UNORM:
2194 case SNORM:
2195 return TYPE_F32;
2196 case UINT:
2197 return TYPE_U32;
2198 case SINT:
2199 return TYPE_S32;
2200 default:
2201 assert(!"Impossible type");
2202 return TYPE_NONE;
2203 }
2204 }
2205
2206 void
2207 NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
2208 {
2209 const TexInstruction::ImgFormatDesc *format = su->tex.format;
2210 int width = format->bits[0] + format->bits[1] +
2211 format->bits[2] + format->bits[3];
2212 Value *untypedDst[4] = {};
2213 Value *typedDst[4] = {};
2214
2215 // We must convert this to a generic load.
2216 su->op = OP_SULDB;
2217
2218 su->dType = typeOfSize(width / 8);
2219 su->sType = TYPE_U8;
2220
2221 for (int i = 0; i < width / 32; i++)
2222 untypedDst[i] = bld.getSSA();
2223 if (width < 32)
2224 untypedDst[0] = bld.getSSA();
2225
2226 for (int i = 0; i < 4; i++) {
2227 typedDst[i] = su->getDef(i);
2228 }
2229
2230 // Set the untyped dsts as the su's destinations
2231 for (int i = 0; i < 4; i++)
2232 su->setDef(i, untypedDst[i]);
2233
2234 bld.setPosition(su, true);
2235
2236 // Unpack each component into the typed dsts
2237 int bits = 0;
2238 for (int i = 0; i < 4; bits += format->bits[i], i++) {
2239 if (!typedDst[i])
2240 continue;
2241 if (i >= format->components) {
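// Components missing from the format read back as (0, 0, 0, 1).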
2242 if (format->type == FLOAT ||
2243 format->type == UNORM ||
2244 format->type == SNORM)
2245 bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
2246 else
2247 bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
2248 continue;
2249 }
2250
2251 // Get just that component's data into the relevant place
2252 if (format->bits[i] == 32)
2253 bld.mkMov(typedDst[i], untypedDst[i]);
2254 else if (format->bits[i] == 16)
2255 bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
2256 getSrcType(format, i), untypedDst[i / 2])
2257 ->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1);
2258 else if (format->bits[i] == 8)
2259 bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
2260 getSrcType(format, i), untypedDst[0])->subOp = i;
2261 else {
2262 bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32],
2263 bld.mkImm((bits % 32) | (format->bits[i] << 8)));
2264 if (format->type == UNORM || format->type == SNORM)
2265 bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i), typedDst[i]);
2266 }
2267
2268 // Normalize / convert as necessary
2269 if (format->type == UNORM)
2270 bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
2271 else if (format->type == SNORM)
2272 bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
2273 else if (format->type == FLOAT && format->bits[i] < 16) {
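// Packed small floats (10/11 bits) have a 5-bit exponent just like f16;
// shifting left by 15 - bits aligns exponent and mantissa with the f16
// layout so a plain f16 -> f32 CVT finishes the job.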
2274 bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
2275 bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]);
2276 }
2277 }
2278
2279 if (format->bgra) {
2280 std::swap(typedDst[0], typedDst[2]);
2281 }
2282 }
2283
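// If a surface load was predicated off (e.g. the image is not bound), its
// defs would be left undefined; union each def with a 0 written under the
// inverse predicate so the result is always well defined.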
2284 void
2285 NVC0LoweringPass::insertOOBSurfaceOpResult(TexInstruction *su)
2286 {
2287 if (!su->getPredicate())
2288 return;
2289
2290 bld.setPosition(su, true);
2291
2292 for (unsigned i = 0; su->defExists(i); ++i) {
2293 ValueDef &def = su->def(i);
2294
2295 Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2296 assert(su->cc == CC_NOT_P);
2297 mov->setPredicate(CC_P, su->getPredicate());
2298 Instruction *uni = bld.mkOp2(OP_UNION, TYPE_U32, bld.getSSA(), NULL, mov->getDef(0));
2299
2300 def.replace(uni->getDef(0), false);
2301 uni->setSrc(0, def.get());
2302 }
2303 }
2304
2305 void
2306 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
2307 {
2308 processSurfaceCoordsNVE4(su);
2309
2310 if (su->op == OP_SULDP) {
2311 convertSurfaceFormat(su);
2312 insertOOBSurfaceOpResult(su);
2313 }
2314
2315 if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
2316 assert(su->getPredicate());
2317 Value *pred =
2318 bld.mkOp2v(OP_OR, TYPE_U8, bld.getScratch(1, FILE_PREDICATE),
2319 su->getPredicate(), su->getSrc(2));
2320
2321 Instruction *red = bld.mkOp(OP_ATOM, su->dType, bld.getSSA());
2322 red->subOp = su->subOp;
2323 red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0));
2324 red->setSrc(1, su->getSrc(3));
2325 if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
2326 red->setSrc(2, su->getSrc(4));
2327 red->setIndirect(0, 0, su->getSrc(0));
2328
2329 // make sure to initialize dst value when the atomic operation is not
2330 // performed
2331 Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2332
2333 assert(su->cc == CC_NOT_P);
2334 red->setPredicate(su->cc, pred);
2335 mov->setPredicate(CC_P, pred);
2336
2337 bld.mkOp2(OP_UNION, TYPE_U32, su->getDef(0),
2338 red->getDef(0), mov->getDef(0));
2339
2340 delete_Instruction(bld.getProgram(), su);
2341 handleCasExch(red, true);
2342 }
2343
2344 if (su->op == OP_SUSTB || su->op == OP_SUSTP)
2345 su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
2346 }
2347
2348 void
2349 NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)
2350 {
2351 const int slot = su->tex.r;
2352 const int dim = su->tex.target.getDim();
2353 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2354 int c;
2355 Value *zero = bld.mkImm(0);
2356 Value *src[3];
2357 Value *v;
2358 Value *ind = su->getIndirectR();
2359
2360 bld.setPosition(su, false);
2361
2362 adjustCoordinatesMS(su);
2363
2364 if (ind) {
2365 Value *ptr;
2366 ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind, bld.mkImm(su->tex.r));
2367 ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
2368 su->setIndirectR(ptr);
2369 }
2370
2371 // get surface coordinates
2372 for (c = 0; c < arg; ++c)
2373 src[c] = su->getSrc(c);
2374 for (; c < 3; ++c)
2375 src[c] = zero;
2376
2377 // calculate pixel offset
2378 if (su->op == OP_SULDP || su->op == OP_SUREDP) {
2379 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless);
2380 su->setSrc(0, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[0], v));
2381 }
2382
2383 // add array layer offset
2384 if (su->tex.target.isArray() || su->tex.target.isCube()) {
2385 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
2386 assert(dim > 1);
2387 su->setSrc(2, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v));
2388 }
2389
2390 // prevent read fault when the image is not actually bound
2391 CmpInstruction *pred =
2392 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2393 TYPE_U32, bld.mkImm(0),
2394 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2395 if (su->op != OP_SUSTP && su->tex.format) {
2396 const TexInstruction::ImgFormatDesc *format = su->tex.format;
2397 int blockwidth = format->bits[0] + format->bits[1] +
2398 format->bits[2] + format->bits[3];
2399
2400 assert(format->components != 0);
2401 // make sure that the format doesn't mismatch when it's not FMT_NONE
2402 bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
2403 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2404 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2405 pred->getDef(0));
2406 }
2407 su->setPredicate(CC_NOT_P, pred->getDef(0));
2408 }
2409
2410 void
2411 NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
2412 {
2413 if (su->tex.target == TEX_TARGET_1D_ARRAY) {
2414 /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY
2415 * will simplify the lowering pass and the texture constraints. */
2416 su->moveSources(1, 1);
2417 su->setSrc(1, bld.loadImm(NULL, 0));
2418 su->tex.target = TEX_TARGET_2D_ARRAY;
2419 }
2420
2421 processSurfaceCoordsNVC0(su);
2422
2423 if (su->op == OP_SULDP) {
2424 convertSurfaceFormat(su);
2425 insertOOBSurfaceOpResult(su);
2426 }
2427
2428 if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
2429 const int dim = su->tex.target.getDim();
2430 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2431 LValue *addr = bld.getSSA(8);
2432 Value *def = su->getDef(0);
2433
2434 su->op = OP_SULEA;
2435
2436 // Set the destination to the address
2437 su->dType = TYPE_U64;
2438 su->setDef(0, addr);
2439 su->setDef(1, su->getPredicate());
2440
2441 bld.setPosition(su, true);
2442
2443 // Perform the atomic op
2444 Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA());
2445 red->subOp = su->subOp;
2446 red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0));
2447 red->setSrc(1, su->getSrc(arg));
2448 if (red->subOp == NV50_IR_SUBOP_ATOM_CAS)
2449 red->setSrc(2, su->getSrc(arg + 1));
2450 red->setIndirect(0, 0, addr);
2451
2452 // make sure to initialize dst value when the atomic operation is not
2453 // performed
2454 Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2455
2456 assert(su->cc == CC_NOT_P);
2457 red->setPredicate(su->cc, su->getPredicate());
2458 mov->setPredicate(CC_P, su->getPredicate());
2459
2460 bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0));
2461
2462 handleCasExch(red, false);
2463 }
2464 }
2465
2466 void
2467 NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
2468 {
2469 const int slot = su->tex.r;
2470 const int dim = su->tex.target.getDim();
2471 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2472 Value *ind = su->getIndirectR();
2473 Value *handle;
2474 int pos = 0;
2475
2476 bld.setPosition(su, false);
2477
2478 adjustCoordinatesMS(su);
2479
2480 // add texture handle
2481 switch (su->op) {
2482 case OP_SUSTP:
2483 pos = 4;
2484 break;
2485 case OP_SUREDP:
2486 pos = (su->subOp == NV50_IR_SUBOP_ATOM_CAS) ? 2 : 1;
2487 break;
2488 default:
2489 assert(pos == 0);
2490 break;
2491 }
2492 if (su->tex.bindless)
2493 handle = ind;
2494 else
2495 handle = loadTexHandle(ind, slot + 32);
2496 su->setSrc(arg + pos, handle);
2497
2498 // The address check doesn't make sense here. The format check could make
2499 // sense but it's a bit of a pain.
2500 if (su->tex.bindless)
2501 return;
2502
2503 // prevent read fault when the image is not actually bound
2504 CmpInstruction *pred =
2505 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2506 TYPE_U32, bld.mkImm(0),
2507 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2508 if (su->op != OP_SUSTP && su->tex.format) {
2509 const TexInstruction::ImgFormatDesc *format = su->tex.format;
2510 int blockwidth = format->bits[0] + format->bits[1] +
2511 format->bits[2] + format->bits[3];
2512
2513 assert(format->components != 0);
2514 // make sure that the format doesn't mismatch when it's not FMT_NONE
2515 bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
2516 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2517 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2518 pred->getDef(0));
2519 }
2520 su->setPredicate(CC_NOT_P, pred->getDef(0));
2521 }
2522
2523 void
2524 NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
2525 {
2526 processSurfaceCoordsGM107(su);
2527
2528 if (su->op == OP_SULDP) {
2529 convertSurfaceFormat(su);
2530 insertOOBSurfaceOpResult(su);
2531 }
2532
2533 if (su->op == OP_SUREDP) {
2534 Value *def = su->getDef(0);
2535
2536 su->op = OP_SUREDB;
2537
2538 // There may not be a predicate in the bindless case.
2539 if (su->getPredicate()) {
2540 su->setDef(0, bld.getSSA());
2541
2542 bld.setPosition(su, true);
2543
2544 // make sure to initialize dst value when the atomic operation is not
2545 // performed
2546 Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2547
2548 assert(su->cc == CC_NOT_P);
2549 mov->setPredicate(CC_P, su->getPredicate());
2550
2551 bld.mkOp2(OP_UNION, TYPE_U32, def, su->getDef(0), mov->getDef(0));
2552 }
2553 }
2554 }
2555
2556 bool
2557 NVC0LoweringPass::handleWRSV(Instruction *i)
2558 {
2559 Instruction *st;
2560 Symbol *sym;
2561 uint32_t addr;
2562
2563 // must replace, $sregs are not writable
2564 addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
2565 if (addr >= 0x400)
2566 return false;
2567 sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
2568
2569 st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
2570 i->getSrc(1));
2571 st->perPatch = i->perPatch;
2572
2573 bld.getBB()->remove(i);
2574 return true;
2575 }
2576
2577 void
2578 NVC0LoweringPass::handleLDST(Instruction *i)
2579 {
2580 if (i->src(0).getFile() == FILE_SHADER_INPUT) {
2581 if (prog->getType() == Program::TYPE_COMPUTE) {
2582 i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
2583 i->getSrc(0)->reg.fileIndex = 0;
2584 } else
2585 if (prog->getType() == Program::TYPE_GEOMETRY &&
2586 i->src(0).isIndirect(0)) {
2587 // XXX: this assumes vec4 units
2588 Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2589 i->getIndirect(0, 0), bld.mkImm(4));
2590 i->setIndirect(0, 0, ptr);
2591 i->op = OP_VFETCH;
2592 } else {
2593 i->op = OP_VFETCH;
2594 assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
2595 }
2596 } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
2597 int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
2598 Value *ind = i->getIndirect(0, 1);
2599
2600 if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
2601 prog->getType() == Program::TYPE_COMPUTE &&
2602 (fileIndex >= 6 || ind)) {
2603 // The launch descriptor only allows us to set up 8 CBs, but OpenGL
2604 // requires at least 12 UBOs. To bypass this limitation, for constant
2605 // buffers 7+, we store their addresses in the driver constbuf and
2606 // load directly from global memory.
2607 if (ind) {
2608 // Clamp the UBO index when an indirect access is used to avoid
2609 // loading information from the wrong place in the driver cb.
2610 // TODO - synchronize the max with the driver.
2611 ind = bld.mkOp2v(OP_MIN, TYPE_U32, bld.getSSA(),
2612 bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),
2613 ind, bld.loadImm(NULL, fileIndex)),
2614 bld.loadImm(NULL, 13));
2615 fileIndex = 0;
2616 }
2617
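// "offset" is the end of the access (base offset plus access size); the
// load gets predicated off below when that end exceeds the buffer length.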
2618 Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
2619 Value *ptr = loadUboInfo64(ind, fileIndex * 16);
2620 Value *length = loadUboLength32(ind, fileIndex * 16);
2621 Value *pred = new_LValue(func, FILE_PREDICATE);
2622 if (i->src(0).isIndirect(0)) {
2623 bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
2624 bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
2625 }
2626 i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
2627 i->setIndirect(0, 1, NULL);
2628 i->setIndirect(0, 0, ptr);
2629 bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
2630 i->setPredicate(CC_NOT_P, pred);
2631 Value *zero, *dst = i->getDef(0);
2632 i->setDef(0, bld.getSSA());
2633
2634 bld.setPosition(i, true);
2635 bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
2636 ->setPredicate(CC_P, pred);
2637 bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
2638 } else if (i->src(0).isIndirect(1)) {
2639 Value *ptr;
2640 if (i->src(0).isIndirect(0))
2641 ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
2642 i->getIndirect(0, 1), bld.mkImm(0x1010),
2643 i->getIndirect(0, 0));
2644 else
2645 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2646 i->getIndirect(0, 1), bld.mkImm(16));
2647 i->setIndirect(0, 1, NULL);
2648 i->setIndirect(0, 0, ptr);
2649 i->subOp = NV50_IR_SUBOP_LDC_IS;
2650 }
2651 } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
2652 assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
2653 i->op = OP_VFETCH;
2654 } else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) {
2655 Value *ind = i->getIndirect(0, 1);
2656 Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
2657 // XXX come up with a way not to do this for EVERY little access but
2658 // rather to batch these up somehow. Unfortunately we've lost the
2659 // information about the field width by the time we get here.
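// Same bounds check as the UBO path above: compare the end of the access
// against the buffer length.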
2660 Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
2661 Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
2662 Value *pred = new_LValue(func, FILE_PREDICATE);
2663 if (i->src(0).isIndirect(0)) {
2664 bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
2665 bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
2666 }
2667 i->setIndirect(0, 1, NULL);
2668 i->setIndirect(0, 0, ptr);
2669 i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
2670 bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
2671 i->setPredicate(CC_NOT_P, pred);
2672 if (i->defExists(0)) {
2673 Value *zero, *dst = i->getDef(0);
2674 i->setDef(0, bld.getSSA());
2675
2676 bld.setPosition(i, true);
2677 bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
2678 ->setPredicate(CC_P, pred);
2679 bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
2680 }
2681 }
2682 }
2683
2684 void
2685 NVC0LoweringPass::readTessCoord(LValue *dst, int c)
2686 {
2687 Value *laneid = bld.getSSA();
2688 Value *x, *y;
2689
2690 bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
2691
2692 if (c == 0) {
2693 x = dst;
2694 y = NULL;
2695 } else
2696 if (c == 1) {
2697 x = NULL;
2698 y = dst;
2699 } else {
2700 assert(c == 2);
2701 if (prog->driver->prop.tp.domain != PIPE_PRIM_TRIANGLES) {
2702 bld.mkMov(dst, bld.loadImm(NULL, 0));
2703 return;
2704 }
2705 x = bld.getSSA();
2706 y = bld.getSSA();
2707 }
2708 if (x)
2709 bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
2710 if (y)
2711 bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
2712
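// For triangle domains the tess coords are barycentric, so the third
// coordinate is reconstructed as 1 - x - y.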
2713 if (c == 2) {
2714 bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
2715 bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
2716 }
2717 }
2718
2719 bool
2720 NVC0LoweringPass::handleRDSV(Instruction *i)
2721 {
2722 Symbol *sym = i->getSrc(0)->asSym();
2723 const SVSemantic sv = sym->reg.data.sv.sv;
2724 Value *vtx = NULL;
2725 Instruction *ld;
2726 uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
2727
2728 if (addr >= 0x400) {
2729 // mov $sreg
2730 if (sym->reg.data.sv.index == 3) {
2731 // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
2732 i->op = OP_MOV;
2733 i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
2734 } else
2735 if (sv == SV_TID) {
2736 // Help CSE combine TID fetches
2737 Value *tid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(),
2738 bld.mkSysVal(SV_COMBINED_TID, 0));
2739 i->op = OP_EXTBF;
2740 i->setSrc(0, tid);
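// The combined TID packs x:y:z as 16:10:6 bits starting at the LSB;
// EXTBF's src1 is 0xssll (ss = size, ll = offset).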
2741 switch (sym->reg.data.sv.index) {
2742 case 0: i->setSrc(1, bld.mkImm(0x1000)); break;
2743 case 1: i->setSrc(1, bld.mkImm(0x0a10)); break;
2744 case 2: i->setSrc(1, bld.mkImm(0x061a)); break;
2745 }
2746 }
2747 if (sv == SV_VERTEX_COUNT) {
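// Unpack the count: extract 8 bits at offset 8 (EXTBF src1 is 0xssll).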
2748 bld.setPosition(i, true);
2749 bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));
2750 }
2751 return true;
2752 }
2753
2754 switch (sv) {
2755 case SV_POSITION:
2756 assert(prog->getType() == Program::TYPE_FRAGMENT);
2757 if (i->srcExists(1)) {
2758 // Pass offset through to the interpolation logic
2759 ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,
2760 i->getDef(0), addr, NULL);
2761 ld->setSrc(1, i->getSrc(1));
2762 } else {
2763 bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
2764 }
2765 break;
2766 case SV_FACE:
2767 {
2768 Value *face = i->getDef(0);
2769 bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
2770 if (i->dType == TYPE_F32) {
2771 bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
2772 bld.mkOp1(OP_NEG, TYPE_S32, face, face);
2773 bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
2774 }
2775 }
2776 break;
2777 case SV_TESS_COORD:
2778 assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
2779 readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
2780 break;
2781 case SV_NTID:
2782 case SV_NCTAID:
2783 case SV_GRIDID:
2784 assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
2785 if (sym->reg.data.sv.index == 3) {
2786 i->op = OP_MOV;
2787 i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
2788 return true;
2789 }
2790 // Fallthrough
2791 case SV_WORK_DIM:
2792 addr += prog->driver->prop.cp.gridInfoBase;
2793 bld.mkLoad(TYPE_U32, i->getDef(0),
2794 bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2795 TYPE_U32, addr), NULL);
2796 break;
2797 case SV_SAMPLE_INDEX:
2798 // TODO: Properly pass source as an address in the PIX address space
2799 // (which can be of the form [r0+offset]). But this is currently
2800 // unnecessary.
2801 ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
2802 ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2803 break;
2804 case SV_SAMPLE_POS: {
2805 Value *sampleID = bld.getScratch();
2806 ld = bld.mkOp1(OP_PIXLD, TYPE_U32, sampleID, bld.mkImm(0));
2807 ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2808 Value *offset = calculateSampleOffset(sampleID);
2809
2810 assert(prog->driver->prop.fp.readsSampleLocations);
2811
2812 if (targ->getChipset() >= NVISA_GM200_CHIPSET) {
2813 bld.mkLoad(TYPE_F32,
2814 i->getDef(0),
2815 bld.mkSymbol(
2816 FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2817 TYPE_U32, prog->driver->io.sampleInfoBase),
2818 offset);
2819 bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0),
2820 bld.mkImm(0x040c + sym->reg.data.sv.index * 16));
2821 bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_U32, i->getDef(0));
2822 bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(0), i->getDef(0), bld.mkImm(1.0f / 16.0f));
2823 } else {
2824 bld.mkLoad(TYPE_F32,
2825 i->getDef(0),
2826 bld.mkSymbol(
2827 FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2828 TYPE_U32, prog->driver->io.sampleInfoBase +
2829 4 * sym->reg.data.sv.index),
2830 offset);
2831 }
2832 break;
2833 }
2834 case SV_SAMPLE_MASK: {
2835 ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
2836 ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
2837 Instruction *sampleid =
2838 bld.mkOp1(OP_PIXLD, TYPE_U32, bld.getSSA(), bld.mkImm(0));
2839 sampleid->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2840 Value *masked =
2841 bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ld->getDef(0),
2842 bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2843 bld.loadImm(NULL, 1), sampleid->getDef(0)));
2844 if (prog->driver->prop.fp.persampleInvocation) {
2845 bld.mkMov(i->getDef(0), masked);
2846 } else {
2847 bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), ld->getDef(0), masked,
2848 bld.mkImm(0))
2849 ->subOp = 1;
2850 }
2851 break;
2852 }
2853 case SV_BASEVERTEX:
2854 case SV_BASEINSTANCE:
2855 case SV_DRAWID:
2856 ld = bld.mkLoad(TYPE_U32, i->getDef(0),
2857 bld.mkSymbol(FILE_MEMORY_CONST,
2858 prog->driver->io.auxCBSlot,
2859 TYPE_U32,
2860 prog->driver->io.drawInfoBase +
2861 4 * (sv - SV_BASEVERTEX)),
2862 NULL);
2863 break;
2864 default:
2865 if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
2866 vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
2867 if (prog->getType() == Program::TYPE_FRAGMENT) {
2868 bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL);
2869 } else {
2870 ld = bld.mkFetch(i->getDef(0), i->dType,
2871 FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
2872 ld->perPatch = i->perPatch;
2873 }
2874 break;
2875 }
2876 bld.getBB()->remove(i);
2877 return true;
2878 }
2879
2880 bool
2881 NVC0LoweringPass::handleDIV(Instruction *i)
2882 {
2883 if (!isFloatType(i->dType))
2884 return true;
2885 bld.setPosition(i, false);
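// Floating-point division is lowered to a * rcp(b).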
2886 Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
2887 i->op = OP_MUL;
2888 i->setSrc(1, rcp->getDef(0));
2889 return true;
2890 }
2891
2892 bool
2893 NVC0LoweringPass::handleMOD(Instruction *i)
2894 {
2895 if (!isFloatType(i->dType))
2896 return true;
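// mod(a, b) is expanded to a - b * trunc(a * rcp(b)).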
2897 LValue *value = bld.getScratch(typeSizeof(i->dType));
2898 bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
2899 bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
2900 bld.mkOp1(OP_TRUNC, i->dType, value, value);
2901 bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
2902 i->op = OP_SUB;
2903 i->setSrc(1, value);
2904 return true;
2905 }
2906
2907 bool
2908 NVC0LoweringPass::handleSQRT(Instruction *i)
2909 {
2910 if (targ->isOpSupported(OP_SQRT, i->dType))
2911 return true;
2912
2913 if (i->dType == TYPE_F64) {
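// sqrt(x) is approximated as x * rsq(x); the SELP forces the rsq result
// to 0 when x <= 0 so sqrt(0) doesn't turn into 0 * Inf = NaN.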
2914 Value *pred = bld.getSSA(1, FILE_PREDICATE);
2915 Value *zero = bld.loadImm(NULL, 0.0);
2916 Value *dst = bld.getSSA(8);
2917 bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
2918 bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
2919 bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);
2920 i->op = OP_MUL;
2921 i->setSrc(1, dst);
2922 // TODO: Handle this properly with a library function
2923 } else {
2924 bld.setPosition(i, true);
2925 i->op = OP_RSQ;
2926 bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
2927 }
2928
2929 return true;
2930 }
2931
2932 bool
2933 NVC0LoweringPass::handlePOW(Instruction *i)
2934 {
2935 LValue *val = bld.getScratch();
2936
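// pow(x, y) is lowered to exp2(y * log2(x)).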
2937 bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
2938 bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
2939 bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
2940
2941 i->op = OP_EX2;
2942 i->setSrc(0, val);
2943 i->setSrc(1, NULL);
2944
2945 return true;
2946 }
2947
2948 bool
2949 NVC0LoweringPass::handleEXPORT(Instruction *i)
2950 {
2951 if (prog->getType() == Program::TYPE_FRAGMENT) {
2952 int id = i->getSrc(0)->reg.data.offset / 4;
2953
2954 if (i->src(0).isIndirect(0)) // TODO, ugly
2955 return false;
2956 i->op = OP_MOV;
2957 i->subOp = NV50_IR_SUBOP_MOV_FINAL;
2958 i->src(0).set(i->src(1));
2959 i->setSrc(1, NULL);
2960 i->setDef(0, new_LValue(func, FILE_GPR));
2961 i->getDef(0)->reg.data.id = id;
2962
2963 prog->maxGPR = MAX2(prog->maxGPR, id);
2964 } else
2965 if (prog->getType() == Program::TYPE_GEOMETRY) {
2966 i->setIndirect(0, 1, gpEmitAddress);
2967 }
2968 return true;
2969 }
2970
2971 bool
2972 NVC0LoweringPass::handleOUT(Instruction *i)
2973 {
2974 Instruction *prev = i->prev;
2975 ImmediateValue stream, prevStream;
2976
2977 // Only merge if the stream ids match. Also, note that the previous
2978 // instruction would have already been lowered, so we take arg1 from it.
2979 if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
2980 i->src(0).getImmediate(stream) &&
2981 prev->src(1).getImmediate(prevStream) &&
2982 stream.reg.data.u32 == prevStream.reg.data.u32) {
2983 i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
2984 delete_Instruction(prog, i);
2985 } else {
2986 assert(gpEmitAddress);
2987 i->setDef(0, gpEmitAddress);
2988 i->setSrc(1, i->getSrc(0));
2989 i->setSrc(0, gpEmitAddress);
2990 }
2991 return true;
2992 }
2993
2994 Value *
2995 NVC0LoweringPass::calculateSampleOffset(Value *sampleID)
2996 {
2997 Value *offset = bld.getScratch();
2998 if (targ->getChipset() >= NVISA_GM200_CHIPSET) {
2999 // Sample location offsets (in bytes) are calculated like so:
3000 // offset = (SV_POSITION.y % 4 * 2) + (SV_POSITION.x % 2)
3001 // offset = offset * 32 + sampleID % 8 * 4;
3002 // which is equivalent to:
3003 // offset = ((SV_POSITION.y & 0x3) << 6) + ((SV_POSITION.x & 0x1) << 5);
3004 // offset += sampleID << 2
3005 
3006 // The second operand (src1) of the INSBF instructions is like so:
3007 // 0xssll where ss is the size and ll is the offset.
3008 // so: dest = src2 | (src0 & ((1 << ss) - 1)) << ll
3009
3010 // Add sample ID (offset = (sampleID & 0x7) << 2)
3011 bld.mkOp3(OP_INSBF, TYPE_U32, offset, sampleID, bld.mkImm(0x0302), bld.mkImm(0x0));
3012
3013 Symbol *xSym = bld.mkSysVal(SV_POSITION, 0);
3014 Symbol *ySym = bld.mkSysVal(SV_POSITION, 1);
3015 Value *coord = bld.getScratch();
3016
3017 // Add X coordinate (offset |= (SV_POSITION.x & 0x1) << 5)
3018 bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
3019 targ->getSVAddress(FILE_SHADER_INPUT, xSym), NULL);
3020 bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord)
3021 ->rnd = ROUND_ZI;
3022 bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0105), offset);
3023
3024 // Add Y coordinate (offset |= (SV_POSITION.y & 0x3) << 6)
3025 bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
3026 targ->getSVAddress(FILE_SHADER_INPUT, ySym), NULL);
3027 bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord)
3028 ->rnd = ROUND_ZI;
3029 bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0206), offset);
3030 } else {
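// Pre-GM200: sample positions are consecutive 32-bit x/y pairs, so the
// table offset is just sampleID * 8.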
3031 bld.mkOp2(OP_SHL, TYPE_U32, offset, sampleID, bld.mkImm(3));
3032 }
3033 return offset;
3034 }
3035
3036 // Handle programmable sample locations for GM20x+
3037 void
3038 NVC0LoweringPass::handlePIXLD(Instruction *i)
3039 {
3040 if (i->subOp != NV50_IR_SUBOP_PIXLD_OFFSET)
3041 return;
3042 if (targ->getChipset() < NVISA_GM200_CHIPSET)
3043 return;
3044
3045 assert(prog->driver->prop.fp.readsSampleLocations);
3046
3047 bld.mkLoad(TYPE_F32,
3048 i->getDef(0),
3049 bld.mkSymbol(
3050 FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
3051 TYPE_U32, prog->driver->io.sampleInfoBase),
3052 calculateSampleOffset(i->getSrc(0)));
3053
3054 bld.getBB()->remove(i);
3055 }
3056
3057 // Generate a binary predicate if an instruction is predicated by
3058 // e.g. an f32 value.
3059 void
3060 NVC0LoweringPass::checkPredicate(Instruction *insn)
3061 {
3062 Value *pred = insn->getPredicate();
3063 Value *pdst;
3064
3065 if (!pred || pred->reg.file == FILE_PREDICATE)
3066 return;
3067 pdst = new_LValue(func, FILE_PREDICATE);
3068
3069 // CAUTION: don't use pdst->getInsn, the definition might not be unique,
3070 // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
3071
3072 bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);
3073
3074 insn->setPredicate(insn->cc, pdst);
3075 }
3076
3077 //
3078 // - add quadop dance for texturing
3079 // - put FP outputs in GPRs
3080 // - convert instruction sequences
3081 //
3082 bool
3083 NVC0LoweringPass::visit(Instruction *i)
3084 {
3085 bool ret = true;
3086 bld.setPosition(i, false);
3087
3088 if (i->cc != CC_ALWAYS)
3089 checkPredicate(i);
3090
3091 switch (i->op) {
3092 case OP_TEX:
3093 case OP_TXB:
3094 case OP_TXL:
3095 case OP_TXF:
3096 case OP_TXG:
3097 return handleTEX(i->asTex());
3098 case OP_TXD:
3099 return handleTXD(i->asTex());
3100 case OP_TXLQ:
3101 return handleTXLQ(i->asTex());
3102 case OP_TXQ:
3103 return handleTXQ(i->asTex());
3104 case OP_EX2:
3105 bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
3106 i->setSrc(0, i->getDef(0));
3107 break;
3108 case OP_POW:
3109 return handlePOW(i);
3110 case OP_DIV:
3111 return handleDIV(i);
3112 case OP_MOD:
3113 return handleMOD(i);
3114 case OP_SQRT:
3115 return handleSQRT(i);
3116 case OP_EXPORT:
3117 ret = handleEXPORT(i);
3118 break;
3119 case OP_EMIT:
3120 case OP_RESTART:
3121 return handleOUT(i);
3122 case OP_RDSV:
3123 return handleRDSV(i);
3124 case OP_WRSV:
3125 return handleWRSV(i);
3126 case OP_STORE:
3127 case OP_LOAD:
3128 handleLDST(i);
3129 break;
3130 case OP_ATOM:
3131 {
3132 const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER;
3133 handleATOM(i);
3134 handleCasExch(i, cctl);
3135 }
3136 break;
3137 case OP_SULDB:
3138 case OP_SULDP:
3139 case OP_SUSTB:
3140 case OP_SUSTP:
3141 case OP_SUREDB:
3142 case OP_SUREDP:
3143 if (targ->getChipset() >= NVISA_GM107_CHIPSET)
3144 handleSurfaceOpGM107(i->asTex());
3145 else if (targ->getChipset() >= NVISA_GK104_CHIPSET)
3146 handleSurfaceOpNVE4(i->asTex());
3147 else
3148 handleSurfaceOpNVC0(i->asTex());
3149 break;
3150 case OP_SUQ:
3151 handleSUQ(i->asTex());
3152 break;
3153 case OP_BUFQ:
3154 handleBUFQ(i);
3155 break;
3156 case OP_PIXLD:
3157 handlePIXLD(i);
3158 break;
3159 default:
3160 break;
3161 }
3162
3163 /* Kepler+ has a special opcode to compute a new base address to be used
3164 * for indirect loads.
3165 *
3166 * Maxwell+ has an additional similar requirement for indirect
3167 * interpolation ops in frag shaders.
3168 */
3169 bool doAfetch = false;
3170 if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
3171 !i->perPatch &&
3172 (i->op == OP_VFETCH || i->op == OP_EXPORT) &&
3173 i->src(0).isIndirect(0)) {
3174 doAfetch = true;
3175 }
3176 if (targ->getChipset() >= NVISA_GM107_CHIPSET &&
3177 (i->op == OP_LINTERP || i->op == OP_PINTERP) &&
3178 i->src(0).isIndirect(0)) {
3179 doAfetch = true;
3180 }
3181
3182 if (doAfetch) {
3183 Value *addr = cloneShallow(func, i->getSrc(0));
3184 Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
3185 i->getSrc(0));
3186 afetch->setIndirect(0, 0, i->getIndirect(0, 0));
3187 addr->reg.data.offset = 0;
3188 i->setSrc(0, addr);
3189 i->setIndirect(0, 0, afetch->getDef(0));
3190 }
3191
3192 return ret;
3193 }
3194
3195 bool
3196 TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
3197 {
3198 if (stage == CG_STAGE_PRE_SSA) {
3199 NVC0LoweringPass pass(prog);
3200 return pass.run(prog, false, true);
3201 } else
3202 if (stage == CG_STAGE_POST_RA) {
3203 NVC0LegalizePostRA pass(prog);
3204 return pass.run(prog, false, true);
3205 } else
3206 if (stage == CG_STAGE_SSA) {
3207 NVC0LegalizeSSA pass;
3208 return pass.run(prog, false, true);
3209 }
3210 return false;
3211 }
3212
3213 } // namespace nv50_ir