nv50/ir: implement global atomics and handle it for nir
mesa.git: src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"

#include "codegen/nv50_ir_target_nvc0.h"
#include "codegen/nv50_ir_lowering_nvc0.h"

#include <limits>

namespace nv50_ir {

#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

//             UL UR LL LR
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))
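// For example, QUADOP(MOV2, ADD, MOV2, ADD) encodes to
// (3 << 6) | (0 << 4) | (3 << 2) | 0 = 0xcc: the UL/LL lanes keep their
// own value (MOV2) while the UR/LR lanes perform an ADD.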

void
NVC0LegalizeSSA::handleDIV(Instruction *i)
{
   FlowInstruction *call;
   int builtin;

   bld.setPosition(i, false);

   // Generate movs to the input regs for the call we want to generate
   for (int s = 0; i->srcExists(s); ++s) {
      Instruction *ld = i->getSrc(s)->getInsn();
      // check if we are moving an immediate, propagate it in that case
      if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV) ||
          !(ld->src(0).getFile() == FILE_IMMEDIATE))
         bld.mkMovToReg(s, i->getSrc(s));
      else {
         assert(ld->getSrc(0) != NULL);
         bld.mkMovToReg(s, ld->getSrc(0));
         // Clear the src, to make code elimination possible here before we
         // delete the instruction i later
         i->setSrc(s, NULL);
         if (ld->isDead())
            delete_Instruction(prog, ld);
      }
   }

   switch (i->dType) {
   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
   default:
      return;
   }
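   // Judging by the moves and clobbers below, the built-in takes its
   // operands in $r0/$r1 and returns the quotient in $r0 and the remainder
   // in $r1; the clobber masks (0xe for DIV, 0xd for MOD) cover $r0-$r3
   // minus the register carrying the result.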
   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
   bld.mkMovFromReg(i->getDef(0), i->op == OP_DIV ? 0 : 1);
   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);
}

void
NVC0LegalizeSSA::handleRCPRSQLib(Instruction *i, Value *src[])
{
   FlowInstruction *call;
   Value *def[2];
   int builtin;

   def[0] = bld.mkMovToReg(0, src[0])->getDef(0);
   def[1] = bld.mkMovToReg(1, src[1])->getDef(0);

   if (i->op == OP_RCP)
      builtin = NVC0_BUILTIN_RCP_F64;
   else
      builtin = NVC0_BUILTIN_RSQ_F64;

   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
   def[0] = bld.getSSA();
   def[1] = bld.getSSA();
   bld.mkMovFromReg(def[0], 0);
   bld.mkMovFromReg(def[1], 1);
   bld.mkClobber(FILE_GPR, 0x3fc, 2);
   bld.mkClobber(FILE_PREDICATE, i->op == OP_RSQ ? 0x3 : 0x1, 0);
   bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), def[0], def[1]);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);

   prog->fp64 = true;
}

void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
   assert(i->dType == TYPE_F64);
   // There are instructions that will compute the high 32 bits of the 64-bit
   // float. We will just stick 0 in the bottom 32 bits.

   bld.setPosition(i, false);

   // 1. Take the source and split it up.
   Value *src[2], *dst[2], *def = i->getDef(0);
   bld.mkSplit(src, 4, i->getSrc(0));

   int chip = prog->getTarget()->getChipset();
   if (chip >= NVISA_GK104_CHIPSET) {
      handleRCPRSQLib(i, src);
      return;
   }

   // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
   dst[0] = bld.loadImm(NULL, 0);
   dst[1] = bld.getSSA();

   // 3. The new version of the instruction takes the high 32 bits of the
   // source and outputs the high 32 bits of the destination.
   i->setSrc(0, src[1]);
   i->setDef(0, dst[1]);
   i->setType(TYPE_F32);
   i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;

   // 4. Recombine the two dst pieces back into the original destination.
   bld.setPosition(i, true);
   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
}

void
NVC0LegalizeSSA::handleFTZ(Instruction *i)
{
   // Only want to flush float inputs
   assert(i->sType == TYPE_F32);

   // If we're already flushing denorms (and NaN's) to zero, no need for this.
   if (i->dnz)
      return;

   // Only certain classes of operations can flush
   OpClass cls = prog->getTarget()->getOpClass(i->op);
   if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
       cls != OPCLASS_CONVERT)
      return;

   i->ftz = true;
}

void
NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i)
{
   if (i->tex.levelZero)
      return;

   ImmediateValue lod;

   // The LOD argument comes right after the coordinates (before depth bias,
   // offsets, etc).
   int arg = i->tex.target.getArgCount();

   // SM30+ stores the indirect handle as a separate arg, which comes before
   // the LOD.
   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET &&
       i->tex.rIndirectSrc >= 0)
      arg++;
   // SM20 stores indirect handle combined with array coordinate
   if (prog->getTarget()->getChipset() < NVISA_GK104_CHIPSET &&
       !i->tex.target.isArray() &&
       i->tex.rIndirectSrc >= 0)
      arg++;

   if (!i->src(arg).getImmediate(lod) || !lod.isInteger(0))
      return;

   if (i->op == OP_TXL)
      i->op = OP_TEX;
   i->tex.levelZero = true;
   i->moveSources(arg + 1, -1);
}

void
NVC0LegalizeSSA::handleShift(Instruction *lo)
{
   Value *shift = lo->getSrc(1);
   Value *dst64 = lo->getDef(0);
   Value *src[2], *dst[2];
   operation op = lo->op;

   bld.setPosition(lo, false);

   bld.mkSplit(src, 4, lo->getSrc(0));

   // SM30 and prior don't have the fancy new SHF.L/R ops. So the logic has to
   // be completely emulated. For SM35+, we can use the more directed SHF
   // operations.
   if (prog->getTarget()->getChipset() < NVISA_GK20A_CHIPSET) {
      // The strategy here is to handle shifts >= 32 and less than 32 as
      // separate parts.
      //
      // For SHL:
      // If the shift is <= 32, then
      //   (HI,LO) << x = (HI << x | (LO >> (32 - x)), LO << x)
      // If the shift is > 32, then
      //   (HI,LO) << x = (LO << (x - 32), 0)
      //
      // For SHR:
      // If the shift is <= 32, then
      //   (HI,LO) >> x = (HI >> x, (HI << (32 - x)) | LO >> x)
      // If the shift is > 32, then
      //   (HI,LO) >> x = (0, HI >> (x - 32))
      //
      // Note that on NVIDIA hardware, a shift > 32 yields a 0 value, which we
      // can use to our advantage. Also note the structural similarities
      // between the right/left cases. The main difference is swapping hi/lo
      // on input and output.
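      //
      // For example, (HI,LO) << 40: the (shift <= 32) predicate is false,
      // so the high word comes from the hi2 path as LO << (40 - 32) =
      // LO << 8, while LO << 40 in the low word clamps to 0 in hardware,
      // giving (LO << 8, 0) as required.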

      Value *x32_minus_shift, *pred, *hi1, *hi2;
      DataType type = isSignedIntType(lo->dType) ? TYPE_S32 : TYPE_U32;
      operation antiop = op == OP_SHR ? OP_SHL : OP_SHR;
      if (op == OP_SHR)
         std::swap(src[0], src[1]);
      bld.mkOp2(OP_ADD, TYPE_U32, (x32_minus_shift = bld.getSSA()), shift, bld.mkImm(0x20))
         ->src(0).mod = Modifier(NV50_IR_MOD_NEG);
      bld.mkCmp(OP_SET, CC_LE, TYPE_U8, (pred = bld.getSSA(1, FILE_PREDICATE)),
                TYPE_U32, shift, bld.mkImm(32));
      // Compute HI (shift <= 32)
      bld.mkOp2(OP_OR, TYPE_U32, (hi1 = bld.getSSA()),
                bld.mkOp2v(op, TYPE_U32, bld.getSSA(), src[1], shift),
                bld.mkOp2v(antiop, TYPE_U32, bld.getSSA(), src[0], x32_minus_shift))
         ->setPredicate(CC_P, pred);
      // Compute LO (all shift values)
      bld.mkOp2(op, type, (dst[0] = bld.getSSA()), src[0], shift);
      // Compute HI (shift > 32)
      bld.mkOp2(op, type, (hi2 = bld.getSSA()), src[0],
                bld.mkOp1v(OP_NEG, TYPE_S32, bld.getSSA(), x32_minus_shift))
         ->setPredicate(CC_NOT_P, pred);
      bld.mkOp2(OP_UNION, TYPE_U32, (dst[1] = bld.getSSA()), hi1, hi2);
      if (op == OP_SHR)
         std::swap(dst[0], dst[1]);
      bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
      delete_Instruction(prog, lo);
      return;
   }

   Instruction *hi = new_Instruction(func, op, TYPE_U32);
   lo->bb->insertAfter(lo, hi);

   hi->sType = lo->sType;
   lo->dType = TYPE_U32;

   hi->setDef(0, (dst[1] = bld.getSSA()));
   if (lo->op == OP_SHR)
      hi->subOp |= NV50_IR_SUBOP_SHIFT_HIGH;
   lo->setDef(0, (dst[0] = bld.getSSA()));

   bld.setPosition(hi, true);

   if (lo->op == OP_SHL)
      std::swap(hi, lo);

   hi->setSrc(0, new_ImmediateValue(prog, 0u));
   hi->setSrc(1, shift);
   hi->setSrc(2, lo->op == OP_SHL ? src[0] : src[1]);

   lo->setSrc(0, src[0]);
   lo->setSrc(1, shift);
   lo->setSrc(2, src[1]);

   bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
}

void
NVC0LegalizeSSA::handleSET(CmpInstruction *cmp)
{
   DataType hTy = cmp->sType == TYPE_S64 ? TYPE_S32 : TYPE_U32;
   Value *carry;
   Value *src0[2], *src1[2];
   bld.setPosition(cmp, false);

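   // The 64-bit compare is lowered to a 32-bit one: subtract the low words
   // to set the borrow flag, then have the (now 32-bit) SET consume that
   // flag while comparing the high words -- the usual multiword comparison.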
   bld.mkSplit(src0, 4, cmp->getSrc(0));
   bld.mkSplit(src1, 4, cmp->getSrc(1));
   bld.mkOp2(OP_SUB, hTy, NULL, src0[0], src1[0])
      ->setFlagsDef(0, (carry = bld.getSSA(1, FILE_FLAGS)));
   cmp->setFlagsSrc(cmp->srcCount(), carry);
   cmp->setSrc(0, src0[1]);
   cmp->setSrc(1, src1[1]);
   cmp->sType = hTy;
}

bool
NVC0LegalizeSSA::visit(Function *fn)
{
   bld.setProgram(fn->getProgram());
   return true;
}

bool
NVC0LegalizeSSA::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (i->sType == TYPE_F32 && prog->getType() != Program::TYPE_COMPUTE)
         handleFTZ(i);

      switch (i->op) {
      case OP_DIV:
      case OP_MOD:
         if (i->sType != TYPE_F32)
            handleDIV(i);
         break;
      case OP_RCP:
      case OP_RSQ:
         if (i->dType == TYPE_F64)
            handleRCPRSQ(i);
         break;
      case OP_TXL:
      case OP_TXF:
         handleTEXLOD(i->asTex());
         break;
      case OP_SHR:
      case OP_SHL:
         if (typeSizeof(i->sType) == 8)
            handleShift(i);
         break;
      case OP_SET:
      case OP_SET_AND:
      case OP_SET_OR:
      case OP_SET_XOR:
         if (typeSizeof(i->sType) == 8 && i->sType != TYPE_F64)
            handleSET(i->asCmp());
         break;
      default:
         break;
      }
   }
   return true;
}

NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
   : rZero(NULL),
     carry(NULL),
     pOne(NULL),
     needTexBar(prog->getTarget()->getChipset() >= 0xe0 &&
                prog->getTarget()->getChipset() < 0x110)
{
}

bool
NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
                                    const Instruction *early) const
{
   if (early->bb == later->bb)
      return early->serial < later->serial;
   return later->bb->dominatedBy(early->bb);
}

void
NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
                              Instruction *usei, const Instruction *texi)
{
   bool add = true;
   bool dominated = insnDominatedBy(usei, texi);
   // Uses before the tex have to all be included. Just because an earlier
   // instruction dominates another instruction doesn't mean that there's no
   // way to get from the tex to the later instruction. For example you could
   // have nested loops, with the tex in the inner loop, and uses before it in
   // both loops - even though the outer loop's instruction would dominate the
   // inner's, we still want a texbar before the inner loop's instruction.
   //
   // However we can still use the eliding logic between uses dominated by the
   // tex instruction, as that is unambiguously correct.
   if (dominated) {
      for (std::list<TexUse>::iterator it = uses.begin(); it != uses.end();) {
         if (it->after) {
            if (insnDominatedBy(usei, it->insn)) {
               add = false;
               break;
            }
            if (insnDominatedBy(it->insn, usei)) {
               it = uses.erase(it);
               continue;
            }
         }
         ++it;
      }
   }
   if (add)
      uses.push_back(TexUse(usei, texi, dominated));
}

// While it might be tempting to use an algorithm that just looks at tex
// uses, not all texture results are guaranteed to be used on all paths. In
// the case where along some control flow path a texture result is never used,
// we might reuse that register for something else, creating a
// write-after-write hazard. So we have to manually look through all
// instructions looking for ones that reference the registers in question.
void
NVC0LegalizePostRA::findFirstUses(
   Instruction *texi, std::list<TexUse> &uses)
{
   int minGPR = texi->def(0).rep()->reg.data.id;
   int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;

   unordered_set<const BasicBlock *> visited;
   findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
}

void
NVC0LegalizePostRA::findFirstUsesBB(
   int minGPR, int maxGPR, Instruction *start,
   const Instruction *texi, std::list<TexUse> &uses,
   unordered_set<const BasicBlock *> &visited)
{
   const BasicBlock *bb = start->bb;

   // We don't process the whole bb the first time around. This is correct,
   // however we might be in a loop and hit this BB again, and need to process
   // the full thing. So only mark a bb as visited if we processed it from the
   // beginning.
   if (start == bb->getEntry()) {
      if (visited.find(bb) != visited.end())
         return;
      visited.insert(bb);
   }

   for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
      if (insn->isNop())
         continue;

      for (int d = 0; insn->defExists(d); ++d) {
         const Value *def = insn->def(d).rep();
         if (insn->def(d).getFile() != FILE_GPR ||
             def->reg.data.id + def->reg.size / 4 - 1 < minGPR ||
             def->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }

      for (int s = 0; insn->srcExists(s); ++s) {
         const Value *src = insn->src(s).rep();
         if (insn->src(s).getFile() != FILE_GPR ||
             src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
             src->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }
   }

   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
      findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
                      texi, uses, visited);
   }
}

// Texture barriers:
// This pass is a bit long and ugly and can probably be optimized.
//
// 1. obtain a list of TEXes and their outputs' first use(s)
// 2. calculate the barrier level of each first use (minimal number of TEXes,
//    over all paths, between the TEX and the use in question)
// 3. for each barrier, if all paths from the source TEX to that barrier
//    contain a barrier of lesser level, it can be culled
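//
// For example, with "TEX $r0; TEX $r4; MUL ..., $r0, ..." (registers
// illustrative), the use of $r0 gets level 1: a TEXBAR.1 before the MUL
// waits until at most one texture load is still outstanding, i.e. the
// first TEX has completed while the second may still be in flight.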
bool
NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
{
   std::list<TexUse> *uses;
   std::vector<Instruction *> texes;
   std::vector<int> bbFirstTex;
   std::vector<int> bbFirstUse;
   std::vector<int> texCounts;
   std::vector<TexUse> useVec;
   ArrayList insns;

   fn->orderInstructions(insns);

   texCounts.resize(fn->allBBlocks.getSize(), 0);
   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());

   // tag BB CFG nodes by their id for later
   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
      if (bb)
         bb->cfg.tag = bb->getId();
   }

   // gather the first uses for each TEX
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
      if (isTextureOp(tex->op)) {
         texes.push_back(tex);
         if (!texCounts.at(tex->bb->getId()))
            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
         texCounts[tex->bb->getId()]++;
      }
   }
   insns.clear();
   if (texes.empty())
      return false;
   uses = new std::list<TexUse>[texes.size()];
   if (!uses)
      return false;
   for (size_t i = 0; i < texes.size(); ++i) {
      findFirstUses(texes[i], uses[i]);
   }

   // determine the barrier level at each use
   for (size_t i = 0; i < texes.size(); ++i) {
      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
           ++u) {
         BasicBlock *tb = texes[i]->bb;
         BasicBlock *ub = u->insn->bb;
         if (tb == ub) {
            u->level = 0;
            for (size_t j = i + 1; j < texes.size() &&
                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         } else {
            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
                                                      &ub->cfg, texCounts);
            if (u->level < 0) {
               WARN("Failed to find path TEX -> TEXBAR\n");
               u->level = 0;
               continue;
            }
            // this counted all TEXes in the origin block, correct that
            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
            // and did not count the TEXes in the destination block, add those
            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         }
         assert(u->level >= 0);
         useVec.push_back(*u);
      }
   }
   delete[] uses;

   // insert the barriers
   for (size_t i = 0; i < useVec.size(); ++i) {
      Instruction *prev = useVec[i].insn->prev;
      if (useVec[i].level < 0)
         continue;
      if (prev && prev->op == OP_TEXBAR) {
         if (prev->subOp > useVec[i].level)
            prev->subOp = useVec[i].level;
         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
      } else {
         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
         bar->fixed = 1;
         bar->subOp = useVec[i].level;
         // make use explicit to ease latency calculation
         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
      }
   }

   if (fn->getProgram()->optLevel < 3)
      return true;

   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single

   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitS.resize(fn->allBBlocks.getSize());

   // cull unneeded barriers (should do that earlier, but for simplicity)
   IteratorRef bi = fn->cfg.iteratorCFG();
   // first calculate min/max outstanding TEXes for each BB
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      int min = 0;
      int max = std::numeric_limits<int>::max();
      for (Instruction *i = bb->getFirst(); i; i = i->next) {
         if (isTextureOp(i->op)) {
            min++;
            if (max < std::numeric_limits<int>::max())
               max++;
         } else
         if (i->op == OP_TEXBAR) {
            min = MIN2(min, i->subOp);
            max = MIN2(max, i->subOp);
         }
      }
      // limits when looking at an isolated block
      limitS[bb->getId()].min = min;
      limitS[bb->getId()].max = max;
   }
   // propagate the min/max values
   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
      for (bi->reset(); !bi->end(); bi->next()) {
         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
         BasicBlock *bb = BasicBlock::get(n);
         const int bbId = bb->getId();
         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
            BasicBlock *in = BasicBlock::get(ei.getNode());
            const int inId = in->getId();
            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
         }
         // I just hope this is correct ...
         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
            // no barrier
            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
         } else {
            // block contained a barrier
            limitB[bbId].min = MIN2(limitS[bbId].max,
                                    limitT[bbId].min + limitS[bbId].min);
            limitB[bbId].max = MIN2(limitS[bbId].max,
                                    limitT[bbId].max + limitS[bbId].min);
         }
      }
   }
   // finally delete unnecessary barriers
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      Instruction *prev = NULL;
      Instruction *next;
      int max = limitT[bb->getId()].max;
      for (Instruction *i = bb->getFirst(); i; i = next) {
         next = i->next;
         if (i->op == OP_TEXBAR) {
            if (i->subOp >= max) {
               delete_Instruction(prog, i);
               i = NULL;
            } else {
               max = i->subOp;
               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
                  delete_Instruction(prog, prev);
                  prev = NULL;
               }
            }
         } else
         if (isTextureOp(i->op)) {
            max++;
         }
         if (i && !i->isNop())
            prev = i;
      }
   }
   return true;
}

bool
NVC0LegalizePostRA::visit(Function *fn)
{
   if (needTexBar)
      insertTextureBarriers(fn);

   rZero = new_LValue(fn, FILE_GPR);
   pOne = new_LValue(fn, FILE_PREDICATE);
   carry = new_LValue(fn, FILE_FLAGS);

   rZero->reg.data.id = (prog->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET) ? 255 : 63;
   carry->reg.data.id = 0;
   pOne->reg.data.id = 7;

   return true;
}

void
NVC0LegalizePostRA::replaceZero(Instruction *i)
{
   for (int s = 0; i->srcExists(s); ++s) {
      if (s == 2 && i->op == OP_SUCLAMP)
         continue;
      if (s == 1 && i->op == OP_SHLADD)
         continue;
      ImmediateValue *imm = i->getSrc(s)->asImm();
      if (imm) {
         if (i->op == OP_SELP && s == 2) {
            i->setSrc(s, pOne);
            if (imm->reg.data.u64 == 0)
               i->src(s).mod = i->src(s).mod ^ Modifier(NV50_IR_MOD_NOT);
         } else if (imm->reg.data.u64 == 0) {
            i->setSrc(s, rZero);
         }
      }
   }
}

// replace CONT with BRA for single unconditional continue
bool
NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
{
   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
      return false;
   Graph::EdgeIterator ei = bb->cfg.incident();
   if (ei.getType() != Graph::Edge::BACK)
      ei.next();
   if (ei.getType() != Graph::Edge::BACK)
      return false;
   BasicBlock *contBB = BasicBlock::get(ei.getNode());

   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
       contBB->getExit()->getPredicate())
      return false;
   contBB->getExit()->op = OP_BRA;
   bb->remove(bb->getEntry()); // delete PRECONT

   ei.next();
   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
   return true;
}

// replace branches to join blocks with join ops
void
NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
{
   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
      return;
   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      BasicBlock *in = BasicBlock::get(ei.getNode());
      Instruction *exit = in->getExit();
      if (!exit) {
         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
         // there should always be a terminator instruction
         WARN("inserted missing terminator in BB:%i\n", in->getId());
      } else
      if (exit->op == OP_BRA) {
         exit->op = OP_JOIN;
         exit->asFlow()->limit = 1; // must-not-propagate marker
      }
   }
   bb->remove(bb->getEntry());
}

// replaces instructions which would end up as f2f or i2i with faster
// alternatives:
//  - fabs(a)     -> fadd(0, abs a)
//  - fneg(a)     -> fadd(neg 0, neg a)
//  - ineg(a)     -> iadd(0, neg a)
//  - fneg(abs a) -> fadd(neg 0, neg abs a)
//  - sat(a)      -> sat add(0, a)
void
NVC0LegalizePostRA::replaceCvt(Instruction *cvt)
{
   if (!isFloatType(cvt->sType) && typeSizeof(cvt->sType) != 4)
      return;
   if (cvt->sType != cvt->dType)
      return;
   // we could make it work, but in this case we have optimizations disabled
   // and we don't really care either way.
   if (cvt->src(0).getFile() != FILE_GPR &&
       cvt->src(0).getFile() != FILE_MEMORY_CONST)
      return;

   Modifier mod0, mod1;

   switch (cvt->op) {
   case OP_ABS:
      if (cvt->src(0).mod)
         return;
      if (!isFloatType(cvt->sType))
         return;
      mod0 = 0;
      mod1 = NV50_IR_MOD_ABS;
      break;
   case OP_NEG:
      if (!isFloatType(cvt->sType) && cvt->src(0).mod)
         return;
      if (isFloatType(cvt->sType) &&
          (cvt->src(0).mod && cvt->src(0).mod != Modifier(NV50_IR_MOD_ABS)))
         return;

      mod0 = isFloatType(cvt->sType) ? NV50_IR_MOD_NEG : 0;
      mod1 = cvt->src(0).mod == Modifier(NV50_IR_MOD_ABS) ?
         NV50_IR_MOD_NEG_ABS : NV50_IR_MOD_NEG;
      break;
   case OP_SAT:
      if (!isFloatType(cvt->sType) && cvt->src(0).mod.abs())
         return;
      mod0 = 0;
      mod1 = cvt->src(0).mod;
      cvt->saturate = true;
      break;
   default:
      return;
   }

   cvt->op = OP_ADD;
   cvt->moveSources(0, 1);
   cvt->setSrc(0, rZero);
   cvt->src(0).mod = mod0;
   cvt->src(1).mod = mod1;
}

bool
NVC0LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->op == OP_EMIT || i->op == OP_RESTART) {
         if (!i->getDef(0)->refCount())
            i->setDef(0, NULL);
         if (i->src(0).getFile() == FILE_IMMEDIATE)
            i->setSrc(0, rZero); // initial value must be 0
         replaceZero(i);
      } else
      if (i->isNop()) {
         bb->remove(i);
      } else
      if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
          prog->getType() != Program::TYPE_COMPUTE) {
         // It seems like barriers are never required for tessellation since
         // the warp size is 32, and there are always at most 32 tcs threads.
         bb->remove(i);
      } else
      if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) {
         int offset = i->src(0).get()->reg.data.offset;
         if (abs(offset) >= 0x10000)
            i->src(0).get()->reg.fileIndex += offset >> 16;
         i->src(0).get()->reg.data.offset = (int)(short)offset;
      } else {
         // TODO: Move this to before register allocation for operations that
         // need the $c register !
         if (typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) {
            Instruction *hi;
            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
            if (hi)
               next = hi;
         }

         if (i->op == OP_SAT || i->op == OP_NEG || i->op == OP_ABS)
            replaceCvt(i);

         if (i->op != OP_MOV && i->op != OP_PFETCH)
            replaceZero(i);
      }
   }
   if (!bb->getEntry())
      return true;

   if (!tryReplaceContWithBra(bb))
      propagateJoin(bb);

   return true;
}

NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
{
   bld.setProgram(prog);
}

bool
NVC0LoweringPass::visit(Function *fn)
{
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      assert(!strncmp(fn->getName(), "MAIN", 4));
      // TODO: when we generate actual functions pass this value along somehow
      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
      if (fn->cfgExit) {
         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
         bld.mkMovToReg(0, gpEmitAddress);
      }
   }
   return true;
}

bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}

inline Value *
NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
{
   uint8_t  b = prog->driver->io.auxCBSlot;
   uint32_t off = prog->driver->io.texBindBase + slot * 4;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(2));

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

// move array source to first slot, convert to u16, add indirections
bool
NVC0LoweringPass::handleTEX(TexInstruction *i)
{
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int arg = i->tex.target.getArgCount();
   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
   const int chipset = prog->getTarget()->getChipset();

   /* Only normalize in the non-explicit derivatives case. For explicit
    * derivatives, this is handled in handleManualTXD.
    */
   if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
      Value *src[3], *val;
      int c;
      for (c = 0; c < 3; ++c)
         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
      val = bld.getScratch();
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
      for (c = 0; c < 3; ++c) {
         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
                                 i->getSrc(c), val));
      }
   }

   // Arguments to the TEX instruction are a little insane. Even though the
   // encoding is identical between SM20 and SM30, the arguments mean
   // different things between Fermi and Kepler+. A lot of arguments are
   // optional based on flags passed to the instruction. This summarizes the
   // order of things.
   //
   // Fermi:
   //  array/indirect
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets:
   //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
   //    - other: 4 bits each, single reg
   //
   // Kepler+:
   //  indirect handle
   //  array (+ offsets for txd in upper 16 bits)
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets (same as fermi, except txd which takes it with array)
   //
   // Maxwell (tex):
   //  array
   //  coords
   //  indirect handle
   //  sample
   //  lod bias
   //  depth compare
   //  offsets
   //
   // Maxwell (txd):
   //  indirect handle
   //  coords
   //  array + offsets
   //  derivatives
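   //
   // So, roughly, a Fermi TEX on a 2D array texture with an indirect
   // reference ends up as: s0 = the packed tic/tsc/array word (0xttxsaaaa
   // below), s1/s2 = x/y coords, then lod/bias and depth compare if
   // present, then the packed offsets.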

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
         // XXX this ignores tsc, and assumes a 1:1 mapping
         assert(i->tex.rIndirectSrc >= 0);
         if (!i->tex.bindless) {
            Value *hnd = loadTexHandle(i->getIndirectR(), i->tex.r);
            i->tex.r = 0xff;
            i->tex.s = 0x1f;
            i->setIndirectR(hnd);
         }
         i->setIndirectS(NULL);
      } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
         if (i->tex.r == 0xffff)
            i->tex.r = prog->driver->io.fbtexBindBase / 4;
         else
            i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s = 0; // only a single cX[] value possible here
      } else {
         Value *hnd = bld.getScratch();
         Value *rHnd = loadTexHandle(NULL, i->tex.r);
         Value *sHnd = loadTexHandle(NULL, i->tex.s);

         bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);

         i->tex.r = 0; // not used for indirect tex
         i->tex.s = 0;
         i->setIndirectR(hnd);
      }
      if (i->tex.target.isArray()) {
         LValue *layer = new_LValue(func, FILE_GPR);
         Value *src = i->getSrc(lyr);
         const int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
         if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
            for (int s = dim; s >= 1; --s)
               i->setSrc(s, i->getSrc(s - 1));
            i->setSrc(0, layer);
         } else {
            i->setSrc(dim, layer);
         }
      }
      // Move the indirect reference to the first place
      if (i->tex.rIndirectSrc >= 0 && (
                i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(0, 1);
         i->setSrc(0, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
      // Move the indirect reference to right after the coords
      else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(arg, 1);
         i->setSrc(arg, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
   } else
   // (nvc0) generate and move the tsc/tic/array source to the front
   if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      Value *ticRel = i->getIndirectR();
      Value *tscRel = i->getIndirectS();

      if (i->tex.r == 0xffff) {
         i->tex.r = 0x20;
         i->tex.s = 0x10;
      }

      if (ticRel) {
         i->setSrc(i->tex.rIndirectSrc, NULL);
         if (i->tex.r)
            ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                ticRel, bld.mkImm(i->tex.r));
      }
      if (tscRel) {
         i->setSrc(i->tex.sIndirectSrc, NULL);
         if (i->tex.s)
            tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                tscRel, bld.mkImm(i->tex.s));
      }

      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
      if (arrayIndex) {
         for (int s = dim; s >= 1; --s)
            i->setSrc(s, i->getSrc(s - 1));
         i->setSrc(0, arrayIndex);
      } else {
         i->moveSources(0, 1);
      }

      if (arrayIndex) {
         int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
      } else {
         bld.loadImm(src, 0);
      }

      if (ticRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
      if (tscRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);

      i->setSrc(0, src);
   }

   // For nvc0, the sample id has to be in the second operand, as the offset
   // does. Right now we don't know how to pass both in, and this case can't
   // happen with OpenGL. On nve0, the sample id is part of the texture
   // coordinate argument.
   assert(chipset >= NVISA_GK104_CHIPSET ||
          !i->tex.useOffsets || !i->tex.target.isMS());

   // offset is between lod and dc
   if (i->tex.useOffsets) {
      int n, c;
      int s = i->srcCount(0xff, true);
      if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
         if (i->tex.target.isShadow())
            s--;
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 1);
         if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
            i->moveSources(s + 1, 1);
      }
      if (i->op == OP_TXG) {
         // Either there is 1 offset, which goes into the 2 low bytes of the
         // first source, or there are 4 offsets, which go into 2 sources (8
         // values, 1 byte each).
         Value *offs[2] = {NULL, NULL};
         for (n = 0; n < i->tex.useOffsets; n++) {
            for (c = 0; c < 2; ++c) {
               if ((n % 2) == 0 && c == 0)
                  bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get());
               else
                  bld.mkOp3(OP_INSBF, TYPE_U32,
                            offs[n / 2],
                            i->offset[n][c].get(),
                            bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
                            offs[n / 2]);
            }
         }
         i->setSrc(s, offs[0]);
         if (offs[1])
            i->setSrc(s + 1, offs[1]);
      } else {
         unsigned imm = 0;
         assert(i->tex.useOffsets == 1);
         for (c = 0; c < 3; ++c) {
            ImmediateValue val;
            if (!i->offset[0][c].getImmediate(val))
               assert(!"non-immediate offset passed to non-TXG");
            imm |= (val.reg.data.u32 & 0xf) << (c * 4);
         }
         if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
            // The offset goes into the upper 16 bits of the array index. So
            // create it if it's not already there, and INSBF it if it already
            // is.
            s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
            if (chipset >= NVISA_GM107_CHIPSET)
               s += dim;
            if (i->tex.target.isArray()) {
               Value *offset = bld.getScratch();
               bld.mkOp3(OP_INSBF, TYPE_U32, offset,
                         bld.loadImm(NULL, imm), bld.mkImm(0xc10),
                         i->getSrc(s));
               i->setSrc(s, offset);
            } else {
               i->moveSources(s, 1);
               i->setSrc(s, bld.loadImm(NULL, imm << 16));
            }
         } else {
            i->setSrc(s, bld.loadImm(NULL, imm));
         }
      }
   }

   return true;
}

bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   // Always done from the l0 perspective. This is the way that NVIDIA's
   // driver does it, and doing it from the "current" lane's perspective
   // doesn't seem to always work for reasons that aren't altogether clear,
   // even in frag shaders.
   //
   // Note that we must move not only the coordinates into lane0, but also all
   // ancillary arguments, like array indices and depth compare as they may
   // differ between lanes. Offsets for TXD are supposed to be uniform, so we
   // leave them alone.
   static const uint8_t qOps[2] =
      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) };
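   // qOps[0] (0xcc) keeps the left column and adds into the +x lanes,
   // qOps[1] (0xf0) keeps the top row and adds into the +y lanes; quadop
   // 0x00 (ADD in every lane) with a zero operand, as used below, simply
   // broadcasts lane l's value across the quad.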

   Value *def[4][4];
   Value *crd[3], *arr[2], *shadow;
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();

   // This function is invoked after handleTEX lowering, so we have to expect
   // the arguments in the order that the hw wants them. For Fermi, array and
   // indirect are both in the leading arg, while for Kepler, array and
   // indirect are separate (and both precede the coordinates). Maxwell is
   // handled in a separate function.
   int array;
   if (targ->getChipset() < NVISA_GK104_CHIPSET)
      array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
   else
      array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();
   for (c = 0; c < array; ++c)
      arr[c] = bld.getScratch();
   shadow = bld.getScratch();

   for (l = 0; l < 4; ++l) {
      Value *src[3], *val;

      bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
      // we're using the texture result from lane 0 in all cases, so make sure
      // that lane 0 is pointing at the proper array index, indirect value,
      // and depth compare.
      if (l != 0) {
         for (c = 0; c < array; ++c)
            bld.mkQuadop(0x00, arr[c], l, i->getSrc(c), zero);
         if (i->tex.target.isShadow()) {
            // The next argument after coords is the depth compare
            bld.mkQuadop(0x00, shadow, l, i->getSrc(array + dim), zero);
         }
      }
      // mov position coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // normalize cube coordinates
      if (i->tex.target.isCube()) {
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
         val = bld.getScratch();
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
      } else {
         for (c = 0; c < dim; ++c)
            src[c] = crd[c];
      }
      // texture
      bld.insert(tex = cloneForward(func, i));
      if (l != 0) {
         for (c = 0; c < array; ++c)
            tex->setSrc(c, arr[c]);
         if (i->tex.target.isShadow())
            tex->setSrc(array + dim, shadow);
      }
      for (c = 0; c < dim; ++c)
         tex->setSrc(c + array, src[c]);
      // broadcast results from lane 0 to all lanes so that the moves *into*
      // the target lane pick up the proper value.
      if (l != 0)
         for (c = 0; i->defExists(c); ++c)
            bld.mkQuadop(0x00, tex->getDef(c), 0, tex->getDef(c), zero);
      bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }

   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}

bool
NVC0LoweringPass::handleTXD(TexInstruction *txd)
{
   int dim = txd->tex.target.getDim() + txd->tex.target.isCube();
   unsigned arg = txd->tex.target.getArgCount();
   unsigned expected_args = arg;
   const int chipset = prog->getTarget()->getChipset();

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (!txd->tex.target.isArray() && txd->tex.useOffsets)
         expected_args++;
      if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
         expected_args++;
   } else {
      if (txd->tex.useOffsets)
         expected_args++;
      if (!txd->tex.target.isArray() && (
                txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))
         expected_args++;
   }

   if (expected_args > 4 ||
       dim > 2 ||
       txd->tex.target.isShadow())
      txd->op = OP_TEX;

   handleTEX(txd);
   while (txd->srcExists(arg))
      ++arg;

   txd->tex.derivAll = true;
   if (txd->op == OP_TEX)
      return handleManualTXD(txd);

   assert(arg == expected_args);
   for (int c = 0; c < dim; ++c) {
      txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
      txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
      txd->dPdx[c].set(NULL);
      txd->dPdy[c].set(NULL);
   }

   // In this case we have fewer than 4 "real" arguments, which means that
   // handleTEX didn't apply any padding. However we have to make sure that
   // the second "group" of arguments still gets padded up to 4.
   if (chipset >= NVISA_GK104_CHIPSET) {
      int s = arg + 2 * dim;
      if (s >= 4 && s < 7) {
         if (txd->srcExists(s)) // move potential predicate out of the way
            txd->moveSources(s, 7 - s);
         while (s < 7)
            txd->setSrc(s++, bld.loadImm(NULL, 0));
      }
   }

   return true;
}

bool
NVC0LoweringPass::handleTXQ(TexInstruction *txq)
{
   const int chipset = prog->getTarget()->getChipset();
   if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)
      txq->tex.r += prog->driver->io.texBindBase / 4;

   if (txq->tex.rIndirectSrc < 0)
      return true;

   Value *ticRel = txq->getIndirectR();

   txq->setIndirectS(NULL);
   txq->tex.sIndirectSrc = -1;

   assert(ticRel);

   if (chipset < NVISA_GK104_CHIPSET) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      txq->setSrc(txq->tex.rIndirectSrc, NULL);
      if (txq->tex.r)
         ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                             ticRel, bld.mkImm(txq->tex.r));

      bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));

      txq->moveSources(0, 1);
      txq->setSrc(0, src);
   } else {
      Value *hnd = loadTexHandle(txq->getIndirectR(), txq->tex.r);
      txq->tex.r = 0xff;
      txq->tex.s = 0x1f;

      txq->setIndirectR(NULL);
      txq->moveSources(0, 1);
      txq->setSrc(0, hnd);
      txq->tex.rIndirectSrc = 0;
   }

   return true;
}

bool
NVC0LoweringPass::handleTXLQ(TexInstruction *i)
{
   /* The outputs are inverted compared to what the TGSI instruction
    * expects. Take that into account in the mask.
    */
   assert((i->tex.mask & ~3) == 0);
   if (i->tex.mask == 1)
      i->tex.mask = 2;
   else if (i->tex.mask == 2)
      i->tex.mask = 1;
   handleTEX(i);
   bld.setPosition(i, true);

   /* The returned values are not quite what we want:
    * (a) convert from s16/u16 to f32
    * (b) multiply by 1/256
    */
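   /* The raw LOD is effectively 8.8 fixed point, i.e. a raw value of
    * 0x180 becomes 384.0f / 256 = 1.5f after the convert and multiply.
    */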
   for (int def = 0; def < 2; ++def) {
      if (!i->defExists(def))
         continue;
      enum DataType type = TYPE_S16;
      if (i->tex.mask == 2 || def > 0)
         type = TYPE_U16;
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));
      bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
                i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
   }
   if (i->tex.mask == 3) {
      LValue *t = new_LValue(func, FILE_GPR);
      bld.mkMov(t, i->getDef(0));
      bld.mkMov(i->getDef(0), i->getDef(1));
      bld.mkMov(i->getDef(1), t);
   }
   return true;
}

bool
NVC0LoweringPass::handleBUFQ(Instruction *bufq)
{
   bufq->op = OP_MOV;
   bufq->setSrc(0, loadBufLength32(bufq->getIndirect(0, 1),
                                   bufq->getSrc(0)->reg.fileIndex * 16));
   bufq->setIndirect(0, 0, NULL);
   bufq->setIndirect(0, 1, NULL);
   return true;
}

void
NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)
{
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);

   BasicBlock *currBB = atom->bb;
   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
   BasicBlock *joinBB = atom->bb->splitAfter(atom);
   BasicBlock *setAndUnlockBB = new BasicBlock(func);
   BasicBlock *failLockBB = new BasicBlock(func);
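   // The control flow being built is:
   //   currBB -> tryLockBB -> (lock acquired ? setAndUnlockBB : failLockBB)
   //   setAndUnlockBB -> failLockBB
   //   failLockBB -> (store performed ? joinBB : tryLockBB, i.e. retry)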

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   CmpInstruction *pred =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0), bld.mkImm(1));

   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);

   bld.setPosition(tryLockBB, true);

   Instruction *ld =
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
                 atom->getIndirect(0, 0));
   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;

   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));
   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);

   tryLockBB->cfg.detach(&joinBB->cfg);
   bld.remove(atom);

   bld.setPosition(setAndUnlockBB, true);
   Value *stVal;
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
      // Read the old value, and write the new one.
      stVal = atom->getSrc(1);
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      CmpInstruction *set =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));

      bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),
                TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));
   } else {
      operation op;

      switch (atom->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         op = OP_ADD;
         break;
      case NV50_IR_SUBOP_ATOM_AND:
         op = OP_AND;
         break;
      case NV50_IR_SUBOP_ATOM_OR:
         op = OP_OR;
         break;
      case NV50_IR_SUBOP_ATOM_XOR:
         op = OP_XOR;
         break;
      case NV50_IR_SUBOP_ATOM_MIN:
         op = OP_MIN;
         break;
      case NV50_IR_SUBOP_ATOM_MAX:
         op = OP_MAX;
         break;
      default:
         assert(0);
         return;
      }

      stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),
                         atom->getSrc(1));
   }

   Instruction *st =
      bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
                  atom->getIndirect(0, 0), stVal);
   st->setDef(0, pred->getDef(0));
   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;

   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);
   // Loop back and retry until the unlocked store has been performed.
   bld.setPosition(failLockBB, true);
   bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);

   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
}

void
NVC0LoweringPass::handleSharedATOM(Instruction *atom)
{
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);

   BasicBlock *currBB = atom->bb;
   BasicBlock *tryLockAndSetBB = atom->bb->splitBefore(atom, false);
   BasicBlock *joinBB = atom->bb->splitAfter(atom);
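   // The locked load, the value update and the unlocked store all live in
   // tryLockAndSetBB, which branches back to itself (predicated on the lock
   // bit returned by the load) until the lock was acquired and the store
   // went through.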

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_ALWAYS, NULL);
   currBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::TREE);

   bld.setPosition(tryLockAndSetBB, true);

   Instruction *ld =
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
                 atom->getIndirect(0, 0));
   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;

   Value *stVal;
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
      // Read the old value, and write the new one.
      stVal = atom->getSrc(1);
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      CmpInstruction *set =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));
      set->setPredicate(CC_P, ld->getDef(1));

      Instruction *selp =
         bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), ld->getDef(0),
                   atom->getSrc(2), set->getDef(0));
      selp->src(2).mod = Modifier(NV50_IR_MOD_NOT);
      selp->setPredicate(CC_P, ld->getDef(1));

      stVal = selp->getDef(0);
   } else {
      operation op;

      switch (atom->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         op = OP_ADD;
         break;
      case NV50_IR_SUBOP_ATOM_AND:
         op = OP_AND;
         break;
      case NV50_IR_SUBOP_ATOM_OR:
         op = OP_OR;
         break;
      case NV50_IR_SUBOP_ATOM_XOR:
         op = OP_XOR;
         break;
      case NV50_IR_SUBOP_ATOM_MIN:
         op = OP_MIN;
         break;
      case NV50_IR_SUBOP_ATOM_MAX:
         op = OP_MAX;
         break;
      default:
         assert(0);
         return;
      }

      Instruction *i =
         bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
                   atom->getSrc(1));
      i->setPredicate(CC_P, ld->getDef(1));

      stVal = i->getDef(0);
   }

   Instruction *st =
      bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
                  atom->getIndirect(0, 0), stVal);
   st->setPredicate(CC_P, ld->getDef(1));
   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;

   // Loop until the lock is acquired.
   bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_NOT_P, ld->getDef(1));
   tryLockAndSetBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::BACK);
   tryLockAndSetBB->cfg.attach(&joinBB->cfg, Graph::Edge::CROSS);
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);

   bld.remove(atom);

   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
}

bool
NVC0LoweringPass::handleATOM(Instruction *atom)
{
   SVSemantic sv;
   Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base;

   switch (atom->src(0).getFile()) {
   case FILE_MEMORY_LOCAL:
      sv = SV_LBASE;
      break;
   case FILE_MEMORY_SHARED:
      // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
      // operations on shared memory. For Maxwell, ATOMS is enough.
      if (targ->getChipset() < NVISA_GK104_CHIPSET)
         handleSharedATOM(atom);
      else if (targ->getChipset() < NVISA_GM107_CHIPSET)
         handleSharedATOMNVE4(atom);
      return true;
   case FILE_MEMORY_GLOBAL:
      return true;
   default:
      assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER);
      base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
      assert(base->reg.size == 8);
      if (ptr)
         base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
      assert(base->reg.size == 8);
      atom->setIndirect(0, 0, base);
      atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;

      // Harden against out-of-bounds accesses
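      // (the atomic is predicated off whenever the end of the access,
      // offset + type size (+ the indirect offset, if any), lies beyond the
      // buffer length, and its result is forced to 0 by the UNION below)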
      Value *offset = bld.loadImm(NULL, atom->getSrc(0)->reg.data.offset + typeSizeof(atom->sType));
      Value *length = loadBufLength32(ind, atom->getSrc(0)->reg.fileIndex * 16);
      Value *pred = new_LValue(func, FILE_PREDICATE);
      if (ptr)
         bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, ptr);
      bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
      atom->setPredicate(CC_NOT_P, pred);
      if (atom->defExists(0)) {
         Value *zero, *dst = atom->getDef(0);
         atom->setDef(0, bld.getSSA());

         bld.setPosition(atom, true);
         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
            ->setPredicate(CC_P, pred);
         bld.mkOp2(OP_UNION, TYPE_U32, dst, atom->getDef(0), zero);
      }

      return true;
   }
   base =
      bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));

   atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
   atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
   if (ptr)
      base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
   atom->setIndirect(0, 1, NULL);
   atom->setIndirect(0, 0, base);

   return true;
}

bool
NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
{
   if (targ->getChipset() < NVISA_GM107_CHIPSET) {
      if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
         // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
         return false;
      }
   }

   if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
       cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
      return false;
   bld.setPosition(cas, true);

   if (needCctl) {
      Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
      cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
      cctl->fixed = 1;
      cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
      if (cas->isPredicated())
         cctl->setPredicate(cas->cc, cas->getPredicate());
   }

   if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      // CAS is crazy. Its 2nd source is a double reg, and the 3rd source
      // should be set to the high part of the double reg or bad things will
      // happen elsewhere in the universe.
      // Also, it sometimes returns the new value instead of the old one
      // under mysterious circumstances.
1723 Value *dreg = bld.getSSA(8);
1724 bld.setPosition(cas, false);
1725 bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
1726 cas->setSrc(1, dreg);
1727 cas->setSrc(2, dreg);
1728 }
1729
1730 return true;
1731 }
1732
1733 inline Value *
1734 NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
1735 {
1736 uint8_t b = prog->driver->io.auxCBSlot;
1737 off += base;
1738
1739 return bld.
1740 mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
1741 }
1742
1743 inline Value *
1744 NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
1745 {
1746 uint8_t b = prog->driver->io.auxCBSlot;
1747 off += base;
1748
1749 if (ptr)
1750 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
1751
1752 return bld.
1753 mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);
1754 }
1755
1756 inline Value *
1757 NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
1758 {
1759 uint8_t b = prog->driver->io.auxCBSlot;
1760 off += base;
1761
1762 if (ptr)
1763 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
1764
1765 return bld.
1766 mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
1767 }
1768
1769 inline Value *
1770 NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
1771 {
1772 return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
1773 }
1774
1775 inline Value *
1776 NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
1777 {
1778 return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
1779 }
1780
1781 inline Value *
1782 NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
1783 {
1784 return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
1785 }
1786
1787 inline Value *
1788 NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
1789 {
1790 return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
1791 }
1792
1793 inline Value *
1794 NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
1795 {
1796 uint8_t b = prog->driver->io.msInfoCBSlot;
1797 off += prog->driver->io.msInfoBase;
1798 return bld.
1799 mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
1800 }
1801
1802 inline Value *
1803 NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless)
1804 {
1805 uint32_t base = slot * NVC0_SU_INFO__STRIDE;
1806
1807 // We don't upload surface info for bindless images on GM107+
1808 assert(!bindless || targ->getChipset() < NVISA_GM107_CHIPSET);
1809
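// For indirect access, compute the record offset in-shader: clamp the slot
// index to the table size (the 511/7 masks match 512 bindless entries vs.
// 8 bound slots) and scale by the record stride (the SHL by 6 assumes
// 64-byte records).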
1810 if (ptr) {
1811 ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));
1812 if (bindless)
1813 ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(511));
1814 else
1815 ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
1816 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6));
1817 base = 0;
1818 }
1819 off += base;
1820
1821 return loadResInfo32(ptr, off, bindless ? prog->driver->io.bindlessBase :
1822 prog->driver->io.suInfoBase);
1823 }
1824
1825 Value *
1826 NVC0LoweringPass::loadMsAdjInfo32(TexInstruction::Target target, uint32_t index, int slot, Value *ind, bool bindless)
1827 {
1828 if (!bindless || targ->getChipset() < NVISA_GM107_CHIPSET)
1829 return loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(index), bindless);
1830
1831 assert(bindless);
1832
1833 Value *samples = bld.getSSA();
1834 // this TXQ won't be lowered again, since it is inserted before the current instruction
1835 TexInstruction *tex = new_TexInstruction(func, OP_TXQ);
1836 tex->tex.target = target;
1837 tex->tex.query = TXQ_TYPE;
1838 tex->tex.mask = 0x4;
1839 tex->tex.r = 0xff;
1840 tex->tex.s = 0x1f;
1841 tex->tex.rIndirectSrc = 0;
1842 tex->setDef(0, samples);
1843 tex->setSrc(0, ind);
1844 tex->setSrc(1, bld.loadImm(NULL, 0));
1845 bld.insert(tex);
1846
1847 // only works for sample counts of 1/2/4/8, but other counts aren't supported anyway
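// For the supported counts, the computed (x, y) shift amounts work out to:
//   1 sample -> (0, 0), 2 -> (1, 0), 4 -> (1, 1), 8 -> (2, 1)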
1848 switch (index) {
1849 case 0: {
1850 Value *tmp = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), samples, bld.mkImm(2));
1851 return bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(2));
1852 }
1853 case 1: {
1854 Value *tmp = bld.mkCmp(OP_SET, CC_GT, TYPE_U32, bld.getSSA(), TYPE_U32, samples, bld.mkImm(2))->getDef(0);
1855 return bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(1));
1856 }
1857 default: {
1858 assert(false);
1859 return NULL;
1860 }
1861 }
1862 }
1863
1864 static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
1865 {
1866 switch (su->tex.target.getEnum()) {
1867 case TEX_TARGET_BUFFER: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
1868 case TEX_TARGET_RECT: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1869 case TEX_TARGET_1D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1870 case TEX_TARGET_1D_ARRAY: return (c == 1) ?
1871 NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
1872 NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1873 case TEX_TARGET_2D: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1874 case TEX_TARGET_2D_MS: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1875 case TEX_TARGET_2D_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1876 case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1877 case TEX_TARGET_3D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1878 case TEX_TARGET_CUBE: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1879 case TEX_TARGET_CUBE_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1880 default:
1881 assert(0);
1882 return 0;
1883 }
1884 }
1885
1886 bool
1887 NVC0LoweringPass::handleSUQ(TexInstruction *suq)
1888 {
1889 int mask = suq->tex.mask;
1890 int dim = suq->tex.target.getDim();
1891 int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
1892 Value *ind = suq->getIndirectR();
1893 int slot = suq->tex.r;
1894 int c, d;
1895
1896 for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
1897 if (c >= arg || !(mask & 1))
1898 continue;
1899
1900 int offset;
1901
1902 if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
1903 offset = NVC0_SU_INFO_SIZE(2);
1904 } else {
1905 offset = NVC0_SU_INFO_SIZE(c);
1906 }
1907 bld.mkMov(suq->getDef(d++), loadSuInfo32(ind, slot, offset, suq->tex.bindless));
1908 if (c == 2 && suq->tex.target.isCube())
1909 bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
1910 bld.loadImm(NULL, 6));
1911 }
1912
1913 if (mask & 1) {
1914 if (suq->tex.target.isMS()) {
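// The MS info fields hold per-axis log2 scale factors, so the total
// sample count can be reconstructed as 1 << (ms_x + ms_y).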
1915 Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), suq->tex.bindless);
1916 Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), suq->tex.bindless);
1917 Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
1918 bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
1919 } else {
1920 bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
1921 }
1922 }
1923
1924 bld.remove(suq);
1925 return true;
1926 }
1927
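// Lowers MS image coordinates to coordinates into the underlying storage:
// x and y are scaled up by the per-axis sample-grid shifts, then offset by
// the per-sample (dx, dy) pair looked up in the MS info table.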
1928 void
1929 NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
1930 {
1931 const int arg = tex->tex.target.getArgCount();
1932 int slot = tex->tex.r;
1933
1934 if (tex->tex.target == TEX_TARGET_2D_MS)
1935 tex->tex.target = TEX_TARGET_2D;
1936 else
1937 if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
1938 tex->tex.target = TEX_TARGET_2D_ARRAY;
1939 else
1940 return;
1941
1942 Value *x = tex->getSrc(0);
1943 Value *y = tex->getSrc(1);
1944 Value *s = tex->getSrc(arg - 1);
1945
1946 Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
1947 Value *ind = tex->getIndirectR();
1948
1949 Value *ms_x = loadMsAdjInfo32(tex->tex.target, 0, slot, ind, tex->tex.bindless);
1950 Value *ms_y = loadMsAdjInfo32(tex->tex.target, 1, slot, ind, tex->tex.bindless);
1951
1952 bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
1953 bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
1954
1955 s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
1956 s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));
1957
1958 Value *dx = loadMsInfo32(ts, 0x0);
1959 Value *dy = loadMsInfo32(ts, 0x4);
1960
1961 bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
1962 bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
1963
1964 tex->setSrc(0, tx);
1965 tex->setSrc(1, ty);
1966 tex->moveSources(arg, -1);
1967 }
1968
1969 // Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
1970 // They're computed from the coordinates using the surface info in c[] space.
1971 void
1972 NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
1973 {
1974 Instruction *insn;
1975 const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
1976 const bool raw =
1977 su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
1978 const int slot = su->tex.r;
1979 const int dim = su->tex.target.getDim();
1980 const bool array = su->tex.target.isArray() || su->tex.target.isCube();
1981 const int arg = dim + array;
1982 int c;
1983 Value *zero = bld.mkImm(0);
1984 Value *p1 = NULL;
1985 Value *v;
1986 Value *src[3];
1987 Value *bf, *eau, *off;
1988 Value *addr, *pred;
1989 Value *ind = su->getIndirectR();
1990 Value *y, *z;
1991
1992 off = bld.getScratch(4);
1993 bf = bld.getScratch(4);
1994 addr = bld.getSSA(8);
1995 pred = bld.getScratch(1, FILE_PREDICATE);
1996
1997 bld.setPosition(su, false);
1998
1999 adjustCoordinatesMS(su);
2000
2001 // calculate clamped coordinates
2002 for (c = 0; c < arg; ++c) {
2003 int dimc = c;
2004
2005 if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) {
2006 // The array index is stored in the Z component for 1D arrays.
2007 dimc = 2;
2008 }
2009
2010 src[c] = bld.getScratch();
2011 if (c == 0 && raw)
2012 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X, su->tex.bindless);
2013 else
2014 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc), su->tex.bindless);
2015 bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
2016 ->subOp = getSuClampSubOp(su, dimc);
2017 }
2018 for (; c < 3; ++c)
2019 src[c] = zero;
2020
2021 if (dim == 2 && !array) {
2022 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
2023 src[2] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),
2024 v, bld.loadImm(NULL, 16));
2025
2026 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(2), su->tex.bindless);
2027 bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[2], src[2], v, zero)
2028 ->subOp = NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
2029 }
2030
2031 // set predicate output
2032 if (su->tex.target == TEX_TARGET_BUFFER) {
2033 src[0]->getInsn()->setFlagsDef(1, pred);
2034 } else
2035 if (array) {
2036 p1 = bld.getSSA(1, FILE_PREDICATE);
2037 src[dim]->getInsn()->setFlagsDef(1, p1);
2038 }
2039
2040 // calculate pixel offset
2041 if (dim == 1) {
2042 y = z = zero;
2043 if (su->tex.target != TEX_TARGET_BUFFER)
2044 bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
2045 } else {
2046 y = src[1];
2047 z = src[2];
2048
2049 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
2050 bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
2051 ->subOp = NV50_IR_SUBOP_MADSP(4,4,8); // u16l u16l u16l
2052
2053 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
2054 bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
2055 ->subOp = array ?
2056 NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
2057 }
2058
2059 // calculate effective address part 1
2060 if (su->tex.target == TEX_TARGET_BUFFER) {
2061 if (raw) {
2062 bf = src[0];
2063 } else {
2064 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);
2065 bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
2066 ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
2067 }
2068 } else {
2069 uint16_t subOp = 0;
2070
2071 switch (dim) {
2072 case 1:
2073 break;
2074 case 2:
2075 if (array) {
2076 z = off;
2077 } else {
2078 subOp = NV50_IR_SUBOP_SUBFM_3D;
2079 }
2080 break;
2081 default:
2082 subOp = NV50_IR_SUBOP_SUBFM_3D;
2083 assert(dim == 3);
2084 break;
2085 }
2086 insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
2087 insn->subOp = subOp;
2088 insn->setFlagsDef(1, pred);
2089 }
2090
2091 // part 2
2092 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless);
2093
2094 if (su->tex.target == TEX_TARGET_BUFFER) {
2095 eau = v;
2096 } else {
2097 eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
2098 }
2099 // add array layer offset
2100 if (array) {
2101 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
2102 if (dim == 1)
2103 bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
2104 ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
2105 else
2106 bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
2107 ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
2108 // combine predicates
2109 assert(p1);
2110 bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
2111 }
2112
2113 if (atom) {
2114 Value *lo = bf;
2115 if (su->tex.target == TEX_TARGET_BUFFER) {
2116 lo = zero;
2117 bld.mkMov(off, bf);
2118 }
2119 // bf == g[] address & 0xff
2120 // eau == g[] address >> 8
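// The byte permutes below reassemble that split form into a plain 64-bit
// address, roughly: bf = (eau << 8) | (bf & 0xff) (low word) and
// eau = eau >> 24 (high word).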
2121 bld.mkOp3(OP_PERMT, TYPE_U32, bf, lo, bld.loadImm(NULL, 0x6540), eau);
2122 bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
2123 } else
2124 if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
2125 // Convert from u32 to u8 address format, which is what the library code
2126 // implementing SULDP currently uses.
2127 // XXX: can SUEAU do this?
2128 // XXX: does it matter that we don't mask high bytes in bf?
2129 // Grrr.
2130 bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
2131 bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
2132 }
2133
2134 bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);
2135
2136 if (atom && su->tex.target == TEX_TARGET_BUFFER)
2137 bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);
2138
2139 // let's just set it to 0 for raw access and hope it works
2140 v = raw ?
2141 bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);
2142
2143 // get rid of old coordinate sources, make space for fmt info and predicate
2144 su->moveSources(arg, 3 - arg);
2145 // set 64 bit address and 32-bit format sources
2146 su->setSrc(0, addr);
2147 su->setSrc(1, v);
2148 su->setSrc(2, pred);
2149 su->setIndirectR(NULL);
2150
2151 // prevent read fault when the image is not actually bound
2152 CmpInstruction *pred1 =
2153 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2154 TYPE_U32, bld.mkImm(0),
2155 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2156
2157 if (su->op != OP_SUSTP && su->tex.format) {
2158 const TexInstruction::ImgFormatDesc *format = su->tex.format;
2159 int blockwidth = format->bits[0] + format->bits[1] +
2160 format->bits[2] + format->bits[3];
2161
2162 // make sure that the format's block size doesn't mismatch
2163 assert(format->components != 0);
2164 bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0),
2165 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2166 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2167 pred1->getDef(0));
2168 }
2169 su->setPredicate(CC_NOT_P, pred1->getDef(0));
2170
2171 // TODO: initialize def values to 0 when the surface operation is not
2172 // performed (not needed for stores). Also, fix the "address bounds test"
2173 // subtests from arb_shader_image_load_store-invalid for buffers, because it
2174 // seems like the predicate is not correctly set by suclamp.
2175 }
2176
2177 static DataType
2178 getSrcType(const TexInstruction::ImgFormatDesc *t, int c)
2179 {
2180 switch (t->type) {
2181 case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
2182 case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
2183 case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
2184 case UINT:
2185 return (t->bits[c] == 8 ? TYPE_U8 :
2186 (t->bits[c] == 16 ? TYPE_U16 : TYPE_U32));
2187 case SINT:
2188 return (t->bits[c] == 8 ? TYPE_S8 :
2189 (t->bits[c] == 16 ? TYPE_S16 : TYPE_S32));
2190 }
2191 return TYPE_NONE;
2192 }
2193
2194 static DataType
2195 getDestType(const ImgType type) {
2196 switch (type) {
2197 case FLOAT:
2198 case UNORM:
2199 case SNORM:
2200 return TYPE_F32;
2201 case UINT:
2202 return TYPE_U32;
2203 case SINT:
2204 return TYPE_S32;
2205 default:
2206 assert(!"Impossible type");
2207 return TYPE_NONE;
2208 }
2209 }
2210
2211 void
2212 NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su, Instruction **loaded)
2213 {
2214 const TexInstruction::ImgFormatDesc *format = su->tex.format;
2215 int width = format->bits[0] + format->bits[1] +
2216 format->bits[2] + format->bits[3];
2217 Value *untypedDst[4] = {};
2218 Value *typedDst[4] = {};
2219
2220 // We must convert this to a generic load.
2221 su->op = OP_SULDB;
2222
2223 su->dType = typeOfSize(width / 8);
2224 su->sType = TYPE_U8;
2225
2226 for (int i = 0; i < width / 32; i++)
2227 untypedDst[i] = bld.getSSA();
2228 if (width < 32)
2229 untypedDst[0] = bld.getSSA();
2230
2231 if (loaded && loaded[0]) {
2232 for (int i = 0; i < 4; i++) {
2233 if (loaded[i])
2234 typedDst[i] = loaded[i]->getDef(0);
2235 }
2236 } else {
2237 for (int i = 0; i < 4; i++) {
2238 typedDst[i] = su->getDef(i);
2239 }
2240 }
2241
2242 // Set the untyped dsts as the su's destinations
2243 if (loaded && loaded[0]) {
2244 for (int i = 0; i < 4; i++)
2245 if (loaded[i])
2246 loaded[i]->setDef(0, untypedDst[i]);
2247 } else {
2248 for (int i = 0; i < 4; i++)
2249 su->setDef(i, untypedDst[i]);
2250
2251 bld.setPosition(su, true);
2252 }
2253
2254 // Unpack each component into the typed dsts
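// For example (a sketch for an 8-bit UNORM component i): the CVT with
// subOp = i below extracts byte i as an integer, and the later MUL by
// 1.0f / 255 normalizes it to [0, 1].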
2255 int bits = 0;
2256 for (int i = 0; i < 4; bits += format->bits[i], i++) {
2257 if (!typedDst[i])
2258 continue;
2259
2260 if (loaded && loaded[0])
2261 bld.setPosition(loaded[i], true);
2262
2263 if (i >= format->components) {
2264 if (format->type == FLOAT ||
2265 format->type == UNORM ||
2266 format->type == SNORM)
2267 bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
2268 else
2269 bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
2270 continue;
2271 }
2272
2273 // Get just that component's data into the relevant place
2274 if (format->bits[i] == 32)
2275 bld.mkMov(typedDst[i], untypedDst[i]);
2276 else if (format->bits[i] == 16)
2277 bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
2278 getSrcType(format, i), untypedDst[i / 2])
2279 ->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1);
2280 else if (format->bits[i] == 8)
2281 bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
2282 getSrcType(format, i), untypedDst[0])->subOp = i;
2283 else {
2284 bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32],
2285 bld.mkImm((bits % 32) | (format->bits[i] << 8)));
2286 if (format->type == UNORM || format->type == SNORM)
2287 bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i), typedDst[i]);
2288 }
2289
2290 // Normalize / convert as necessary
2291 if (format->type == UNORM)
2292 bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
2293 else if (format->type == SNORM)
2294 bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
2295 else if (format->type == FLOAT && format->bits[i] < 16) {
2296 bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
2297 bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]);
2298 }
2299 }
2300
2301 if (format->bgra) {
2302 std::swap(typedDst[0], typedDst[2]);
2303 }
2304 }
2305
2306 void
2307 NVC0LoweringPass::insertOOBSurfaceOpResult(TexInstruction *su)
2308 {
2309 if (!su->getPredicate())
2310 return;
2311
2312 bld.setPosition(su, true);
2313
2314 for (unsigned i = 0; su->defExists(i); ++i) {
2315 ValueDef &def = su->def(i);
2316
2317 Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2318 assert(su->cc == CC_NOT_P);
2319 mov->setPredicate(CC_P, su->getPredicate());
2320 Instruction *uni = bld.mkOp2(OP_UNION, TYPE_U32, bld.getSSA(), NULL, mov->getDef(0));
2321
2322 def.replace(uni->getDef(0), false);
2323 uni->setSrc(0, def.get());
2324 }
2325 }
2326
2327 void
2328 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
2329 {
2330 processSurfaceCoordsNVE4(su);
2331
2332 if (su->op == OP_SULDP) {
2333 convertSurfaceFormat(su, NULL);
2334 insertOOBSurfaceOpResult(su);
2335 }
2336
2337 if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
2338 assert(su->getPredicate());
2339 Value *pred =
2340 bld.mkOp2v(OP_OR, TYPE_U8, bld.getScratch(1, FILE_PREDICATE),
2341 su->getPredicate(), su->getSrc(2));
2342
2343 Instruction *red = bld.mkOp(OP_ATOM, su->dType, bld.getSSA());
2344 red->subOp = su->subOp;
2345 red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0));
2346 red->setSrc(1, su->getSrc(3));
2347 if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
2348 red->setSrc(2, su->getSrc(4));
2349 red->setIndirect(0, 0, su->getSrc(0));
2350
2351 // make sure to initialize dst value when the atomic operation is not
2352 // performed
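// (the OP_UNION below also makes RA assign the atomic's def, the mov's
// def and the original destination to the same register; exactly one of
// the two predicated writers executes at runtime)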
2353 Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2354
2355 assert(su->cc == CC_NOT_P);
2356 red->setPredicate(su->cc, pred);
2357 mov->setPredicate(CC_P, pred);
2358
2359 bld.mkOp2(OP_UNION, TYPE_U32, su->getDef(0),
2360 red->getDef(0), mov->getDef(0));
2361
2362 delete_Instruction(bld.getProgram(), su);
2363 handleCasExch(red, true);
2364 }
2365
2366 if (su->op == OP_SUSTB || su->op == OP_SUSTP)
2367 su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
2368 }
2369
2370 void
2371 NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)
2372 {
2373 const int slot = su->tex.r;
2374 const int dim = su->tex.target.getDim();
2375 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2376 int c;
2377 Value *zero = bld.mkImm(0);
2378 Value *src[3];
2379 Value *v;
2380 Value *ind = su->getIndirectR();
2381
2382 bld.setPosition(su, false);
2383
2384 adjustCoordinatesMS(su);
2385
2386 if (ind) {
2387 Value *ptr;
2388 ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind, bld.mkImm(su->tex.r));
2389 ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
2390 su->setIndirectR(ptr);
2391 }
2392
2393 // get surface coordinates
2394 for (c = 0; c < arg; ++c)
2395 src[c] = su->getSrc(c);
2396 for (; c < 3; ++c)
2397 src[c] = zero;
2398
2399 // calculate pixel offset
2400 if (su->op == OP_SULDP || su->op == OP_SUREDP) {
2401 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless);
2402 su->setSrc(0, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[0], v));
2403 }
2404
2405 // add array layer offset
2406 if (su->tex.target.isArray() || su->tex.target.isCube()) {
2407 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
2408 assert(dim > 1);
2409 su->setSrc(2, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v));
2410 }
2411
2412 // prevent read fault when the image is not actually bound
2413 CmpInstruction *pred =
2414 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2415 TYPE_U32, bld.mkImm(0),
2416 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2417 if (su->op != OP_SUSTP && su->tex.format) {
2418 const TexInstruction::ImgFormatDesc *format = su->tex.format;
2419 int blockwidth = format->bits[0] + format->bits[1] +
2420 format->bits[2] + format->bits[3];
2421
2422 assert(format->components != 0);
2423 // make sure that the format's block size doesn't mismatch when it's not FMT_NONE
2424 bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
2425 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2426 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2427 pred->getDef(0));
2428 }
2429 su->setPredicate(CC_NOT_P, pred->getDef(0));
2430 }
2431
2432 void
2433 NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
2434 {
2435 if (su->tex.target == TEX_TARGET_1D_ARRAY) {
2436 /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY
2437 * will simplify the lowering pass and the texture constraints. */
2438 su->moveSources(1, 1);
2439 su->setSrc(1, bld.loadImm(NULL, 0));
2440 su->tex.target = TEX_TARGET_2D_ARRAY;
2441 }
2442
2443 processSurfaceCoordsNVC0(su);
2444
2445 if (su->op == OP_SULDP) {
2446 convertSurfaceFormat(su, NULL);
2447 insertOOBSurfaceOpResult(su);
2448 }
2449
2450 if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
2451 const int dim = su->tex.target.getDim();
2452 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2453 LValue *addr = bld.getSSA(8);
2454 Value *def = su->getDef(0);
2455
2456 su->op = OP_SULEA;
2457
2458 // Set the destination to the address
2459 su->dType = TYPE_U64;
2460 su->setDef(0, addr);
2461 su->setDef(1, su->getPredicate());
2462
2463 bld.setPosition(su, true);
2464
2465 // Perform the atomic op
2466 Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA());
2467 red->subOp = su->subOp;
2468 red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0));
2469 red->setSrc(1, su->getSrc(arg));
2470 if (red->subOp == NV50_IR_SUBOP_ATOM_CAS)
2471 red->setSrc(2, su->getSrc(arg + 1));
2472 red->setIndirect(0, 0, addr);
2473
2474 // make sure to initialize dst value when the atomic operation is not
2475 // performed
2476 Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2477
2478 assert(su->cc == CC_NOT_P);
2479 red->setPredicate(su->cc, su->getPredicate());
2480 mov->setPredicate(CC_P, su->getPredicate());
2481
2482 bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0));
2483
2484 handleCasExch(red, false);
2485 }
2486 }
2487
2488 TexInstruction *
2489 NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su, Instruction *ret[4])
2490 {
2491 const int slot = su->tex.r;
2492 const int dim = su->tex.target.getDim();
2493 const bool array = su->tex.target.isArray() || su->tex.target.isCube();
2494 const int arg = dim + array;
2495 Value *ind = su->getIndirectR();
2496 Value *handle;
2497 Instruction *pred = NULL, *pred2d = NULL;
2498 int pos = 0;
2499
2500 bld.setPosition(su, false);
2501
2502 adjustCoordinatesMS(su);
2503
2504 // add texture handle
2505 switch (su->op) {
2506 case OP_SUSTP:
2507 pos = 4;
2508 break;
2509 case OP_SUREDP:
2510 pos = (su->subOp == NV50_IR_SUBOP_ATOM_CAS) ? 2 : 1;
2511 break;
2512 default:
2513 assert(pos == 0);
2514 break;
2515 }
2516
2517 if (dim == 2 && !array) {
2518 // This might be a 2d slice of a 3d texture; try to load the z
2519 // coordinate in.
2520 Value *v;
2521 if (!su->tex.bindless)
2522 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
2523 else
2524 v = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), ind, bld.mkImm(11));
2525 Value *is_3d = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), v, bld.mkImm(1));
2526 pred2d = bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2527 TYPE_U32, bld.mkImm(0), is_3d);
2528
2529 bld.mkOp2(OP_SHR, TYPE_U32, v, v, bld.loadImm(NULL, 16));
2530 su->moveSources(dim, 1);
2531 su->setSrc(dim, v);
2532 su->tex.target = nv50_ir::TEX_TARGET_3D;
2533 pos++;
2534 }
2535
2536 if (su->tex.bindless)
2537 handle = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ind, bld.mkImm(2047));
2538 else
2539 handle = loadTexHandle(ind, slot + 32);
2540
2541 su->setSrc(arg + pos, handle);
2542
2543 // The address check doesn't make sense here. The format check could make
2544 // sense but it's a bit of a pain.
2545 if (!su->tex.bindless) {
2546 // prevent read fault when the image is not actually bound
2547 pred =
2548 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2549 TYPE_U32, bld.mkImm(0),
2550 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2551 if (su->op != OP_SUSTP && su->tex.format) {
2552 const TexInstruction::ImgFormatDesc *format = su->tex.format;
2553 int blockwidth = format->bits[0] + format->bits[1] +
2554 format->bits[2] + format->bits[3];
2555
2556 assert(format->components != 0);
2557 // make sure that the format's block size doesn't mismatch when it's not FMT_NONE
2558 bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
2559 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2560 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2561 pred->getDef(0));
2562 }
2563 }
2564
2565 // Now we have "pred", which (optionally) tells us whether the surface op
2566 // must be skipped entirely, and "pred2d", which tells us that, when we do
2567 // perform the op, we need both a 2d and a 3d version, selected by pred2d.
2568 TexInstruction *su2d = NULL;
2569 if (pred2d) {
2570 su2d = cloneForward(func, su)->asTex();
2571 for (unsigned i = 0; su->defExists(i); ++i)
2572 su2d->setDef(i, bld.getSSA());
2573 su2d->moveSources(dim + 1, -1);
2574 su2d->tex.target = nv50_ir::TEX_TARGET_2D;
2575 }
2576 if (pred2d && pred) {
2577 Instruction *pred3d = bld.mkOp2(OP_AND, TYPE_U8,
2578 bld.getSSA(1, FILE_PREDICATE),
2579 pred->getDef(0), pred2d->getDef(0));
2580 pred3d->src(0).mod = Modifier(NV50_IR_MOD_NOT);
2581 pred3d->src(1).mod = Modifier(NV50_IR_MOD_NOT);
2582 su->setPredicate(CC_P, pred3d->getDef(0));
2583 pred2d = bld.mkOp2(OP_AND, TYPE_U8, bld.getSSA(1, FILE_PREDICATE),
2584 pred->getDef(0), pred2d->getDef(0));
2585 pred2d->src(0).mod = Modifier(NV50_IR_MOD_NOT);
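// net effect: the 3d version runs when !pred && !pred2d, the 2d version
// when !pred && pred2d, and neither when pred flags the image as unbound
// or mismatched.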
2586 } else if (pred) {
2587 su->setPredicate(CC_NOT_P, pred->getDef(0));
2588 } else if (pred2d) {
2589 su->setPredicate(CC_NOT_P, pred2d->getDef(0));
2590 }
2591 if (su2d) {
2592 su2d->setPredicate(CC_P, pred2d->getDef(0));
2593 bld.insert(su2d);
2594
2595 // Create a UNION so that RA assigns the same registers
2596 bld.setPosition(su, true);
2597 for (unsigned i = 0; su->defExists(i); ++i) {
2598 assert(i < 4);
2599
2600 ValueDef &def = su->def(i);
2601 ValueDef &def2 = su2d->def(i);
2602 Instruction *mov = NULL;
2603
2604 if (pred) {
2605 mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2606 mov->setPredicate(CC_P, pred->getDef(0));
2607 }
2608
2609 Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,
2610 bld.getSSA(),
2611 NULL, def2.get());
2612 def.replace(uni->getDef(0), false);
2613 uni->setSrc(0, def.get());
2614 if (mov)
2615 uni->setSrc(2, mov->getDef(0));
2616 }
2617 } else if (pred) {
2618 // Create a UNION so that RA assigns the same registers
2619 bld.setPosition(su, true);
2620 for (unsigned i = 0; su->defExists(i); ++i) {
2621 assert(i < 4);
2622
2623 ValueDef &def = su->def(i);
2624
2625 Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2626 mov->setPredicate(CC_P, pred->getDef(0));
2627
2628 Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,
2629 bld.getSSA(),
2630 NULL, mov->getDef(0));
2631 def.replace(uni->getDef(0), false);
2632 uni->setSrc(0, def.get());
2633 }
2634 }
2635
2636 return su2d;
2637 }
2638
2639 void
2640 NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
2641 {
2642 // processSurfaceCoords also takes care of fixing up the outputs and
2643 // union'ing them with 0 as necessary. Additionally it may create a second
2644 // surface instruction which needs similar fixups.
2645
2646 Instruction *loaded[4] = {};
2647 TexInstruction *su2 = processSurfaceCoordsGM107(su, loaded);
2648
2649 if (su->op == OP_SULDP) {
2650 convertSurfaceFormat(su, loaded);
2651 }
2652
2653 if (su->op == OP_SUREDP) {
2654 su->op = OP_SUREDB;
2655 }
2656
2657 // If we fixed up the type of the regular surface load instruction, we also
2658 // have to fix up the copy.
2659 if (su2) {
2660 su2->op = su->op;
2661 su2->dType = su->dType;
2662 su2->sType = su->sType;
2663 }
2664 }
2665
2666 bool
2667 NVC0LoweringPass::handleWRSV(Instruction *i)
2668 {
2669 Instruction *st;
2670 Symbol *sym;
2671 uint32_t addr;
2672
2673 // must replace, $sreg registers are not writable
2674 addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
2675 if (addr >= 0x400)
2676 return false;
2677 sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
2678
2679 st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
2680 i->getSrc(1));
2681 st->perPatch = i->perPatch;
2682
2683 bld.getBB()->remove(i);
2684 return true;
2685 }
2686
2687 void
2688 NVC0LoweringPass::handleLDST(Instruction *i)
2689 {
2690 if (i->src(0).getFile() == FILE_SHADER_INPUT) {
2691 if (prog->getType() == Program::TYPE_COMPUTE) {
2692 i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
2693 i->getSrc(0)->reg.fileIndex = 0;
2694 } else
2695 if (prog->getType() == Program::TYPE_GEOMETRY &&
2696 i->src(0).isIndirect(0)) {
2697 // XXX: this assumes vec4 units
2698 Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2699 i->getIndirect(0, 0), bld.mkImm(4));
2700 i->setIndirect(0, 0, ptr);
2701 i->op = OP_VFETCH;
2702 } else {
2703 i->op = OP_VFETCH;
2704 assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
2705 }
2706 } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
2707 int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
2708 Value *ind = i->getIndirect(0, 1);
2709
2710 if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
2711 prog->getType() == Program::TYPE_COMPUTE &&
2712 (fileIndex >= 6 || ind)) {
2713 // The launch descriptor only allows setting up 8 CBs, but OpenGL
2714 // requires at least 12 UBOs. To bypass this limitation, for constant
2715 // buffers 7+, we store the addresses in the driver constbuf and
2716 // load directly from global memory.
2717 if (ind) {
2718 // Clamp the UBO index when an indirect access is used to avoid
2719 // loading information from the wrong place in the driver cb.
2720 // TODO - synchronize the max with the driver.
2721 ind = bld.mkOp2v(OP_MIN, TYPE_U32, bld.getSSA(),
2722 bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),
2723 ind, bld.loadImm(NULL, fileIndex)),
2724 bld.loadImm(NULL, 13));
2725 fileIndex = 0;
2726 }
2727
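// Bounds-check the access: 'offset' is the end of the access (start offset
// plus access size), so the load is out of bounds once it exceeds the
// buffer length, and the predicated UNION below forces its result to 0.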
2728 Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
2729 Value *ptr = loadUboInfo64(ind, fileIndex * 16);
2730 Value *length = loadUboLength32(ind, fileIndex * 16);
2731 Value *pred = new_LValue(func, FILE_PREDICATE);
2732 if (i->src(0).isIndirect(0)) {
2733 bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
2734 bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
2735 }
2736 i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
2737 i->setIndirect(0, 1, NULL);
2738 i->setIndirect(0, 0, ptr);
2739 bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
2740 i->setPredicate(CC_NOT_P, pred);
2741 Value *zero, *dst = i->getDef(0);
2742 i->setDef(0, bld.getSSA());
2743
2744 bld.setPosition(i, true);
2745 bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
2746 ->setPredicate(CC_P, pred);
2747 bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
2748 } else if (i->src(0).isIndirect(1)) {
2749 Value *ptr;
2750 if (i->src(0).isIndirect(0))
2751 ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
2752 i->getIndirect(0, 1), bld.mkImm(0x1010),
2753 i->getIndirect(0, 0));
2754 else
2755 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2756 i->getIndirect(0, 1), bld.mkImm(16));
2757 i->setIndirect(0, 1, NULL);
2758 i->setIndirect(0, 0, ptr);
2759 i->subOp = NV50_IR_SUBOP_LDC_IS;
2760 }
2761 } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
2762 assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
2763 i->op = OP_VFETCH;
2764 } else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) {
2765 Value *ind = i->getIndirect(0, 1);
2766 Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
2767 // XXX come up with a way not to do this for EVERY little access but
2768 // rather to batch these up somehow. Unfortunately we've lost the
2769 // information about the field width by the time we get here.
2770 Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
2771 Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
2772 Value *pred = new_LValue(func, FILE_PREDICATE);
2773 if (i->src(0).isIndirect(0)) {
2774 bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
2775 bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
2776 }
2777 i->setIndirect(0, 1, NULL);
2778 i->setIndirect(0, 0, ptr);
2779 i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
2780 bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
2781 i->setPredicate(CC_NOT_P, pred);
2782 if (i->defExists(0)) {
2783 Value *zero, *dst = i->getDef(0);
2784 i->setDef(0, bld.getSSA());
2785
2786 bld.setPosition(i, true);
2787 bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
2788 ->setPredicate(CC_P, pred);
2789 bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
2790 }
2791 }
2792 }
2793
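// Reads one component of the tessellation coordinate. u and v are fetched
// per-lane from the shader output space at 0x2f0/0x2f4; for the third
// component, triangle domains derive w = 1 - u - v and all other domains
// return 0.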
2794 void
2795 NVC0LoweringPass::readTessCoord(LValue *dst, int c)
2796 {
2797 Value *laneid = bld.getSSA();
2798 Value *x, *y;
2799
2800 bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
2801
2802 if (c == 0) {
2803 x = dst;
2804 y = NULL;
2805 } else
2806 if (c == 1) {
2807 x = NULL;
2808 y = dst;
2809 } else {
2810 assert(c == 2);
2811 if (prog->driver->prop.tp.domain != PIPE_PRIM_TRIANGLES) {
2812 bld.mkMov(dst, bld.loadImm(NULL, 0));
2813 return;
2814 }
2815 x = bld.getSSA();
2816 y = bld.getSSA();
2817 }
2818 if (x)
2819 bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
2820 if (y)
2821 bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
2822
2823 if (c == 2) {
2824 bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
2825 bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
2826 }
2827 }
2828
2829 bool
2830 NVC0LoweringPass::handleRDSV(Instruction *i)
2831 {
2832 Symbol *sym = i->getSrc(0)->asSym();
2833 const SVSemantic sv = sym->reg.data.sv.sv;
2834 Value *vtx = NULL;
2835 Instruction *ld;
2836 uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
2837
2838 if (addr >= 0x400) {
2839 // mov $sreg
2840 if (sym->reg.data.sv.index == 3) {
2841 // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
2842 i->op = OP_MOV;
2843 i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
2844 } else
2845 if (sv == SV_TID) {
2846 // Help CSE combine TID fetches
2847 Value *tid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(),
2848 bld.mkSysVal(SV_COMBINED_TID, 0));
2849 i->op = OP_EXTBF;
2850 i->setSrc(0, tid);
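// Assuming EXTBF uses the 0xssll size/offset encoding (as INSBF does in
// calculateSampleOffset()), the masks below select x from bits 0..15,
// y from bits 16..25 and z from bits 26..31 of the combined TID.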
2851 switch (sym->reg.data.sv.index) {
2852 case 0: i->setSrc(1, bld.mkImm(0x1000)); break;
2853 case 1: i->setSrc(1, bld.mkImm(0x0a10)); break;
2854 case 2: i->setSrc(1, bld.mkImm(0x061a)); break;
2855 }
2856 }
2857 if (sv == SV_VERTEX_COUNT) {
2858 bld.setPosition(i, true);
2859 bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));
2860 }
2861 return true;
2862 }
2863
2864 switch (sv) {
2865 case SV_POSITION:
2866 assert(prog->getType() == Program::TYPE_FRAGMENT);
2867 if (i->srcExists(1)) {
2868 // Pass offset through to the interpolation logic
2869 ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,
2870 i->getDef(0), addr, NULL);
2871 ld->setSrc(1, i->getSrc(1));
2872 } else {
2873 bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
2874 }
2875 break;
2876 case SV_FACE:
2877 {
2878 Value *face = i->getDef(0);
2879 bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
2880 if (i->dType == TYPE_F32) {
2881 bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
2882 bld.mkOp1(OP_NEG, TYPE_S32, face, face);
2883 bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
2884 }
2885 }
2886 break;
2887 case SV_TESS_COORD:
2888 assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
2889 readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
2890 break;
2891 case SV_NTID:
2892 case SV_NCTAID:
2893 case SV_GRIDID:
2894 assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
2895 if (sym->reg.data.sv.index == 3) {
2896 i->op = OP_MOV;
2897 i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
2898 return true;
2899 }
2900 // Fallthrough
2901 case SV_WORK_DIM:
2902 addr += prog->driver->prop.cp.gridInfoBase;
2903 bld.mkLoad(TYPE_U32, i->getDef(0),
2904 bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2905 TYPE_U32, addr), NULL);
2906 break;
2907 case SV_SAMPLE_INDEX:
2908 // TODO: Properly pass source as an address in the PIX address space
2909 // (which can be of the form [r0+offset]). But this is currently
2910 // unnecessary.
2911 ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
2912 ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2913 break;
2914 case SV_SAMPLE_POS: {
2915 Value *sampleID = bld.getScratch();
2916 ld = bld.mkOp1(OP_PIXLD, TYPE_U32, sampleID, bld.mkImm(0));
2917 ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2918 Value *offset = calculateSampleOffset(sampleID);
2919
2920 assert(prog->driver->prop.fp.readsSampleLocations);
2921
2922 if (targ->getChipset() >= NVISA_GM200_CHIPSET) {
2923 bld.mkLoad(TYPE_F32,
2924 i->getDef(0),
2925 bld.mkSymbol(
2926 FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2927 TYPE_U32, prog->driver->io.sampleInfoBase),
2928 offset);
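// Assuming the 0xssll EXTBF encoding, this extracts a 4-bit fixed-point
// sample location (x at bit 12, y at bit 28), converts it to float and
// scales by 1/16 to get a position in [0, 1).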
2929 bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0),
2930 bld.mkImm(0x040c + sym->reg.data.sv.index * 16));
2931 bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_U32, i->getDef(0));
2932 bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(0), i->getDef(0), bld.mkImm(1.0f / 16.0f));
2933 } else {
2934 bld.mkLoad(TYPE_F32,
2935 i->getDef(0),
2936 bld.mkSymbol(
2937 FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2938 TYPE_U32, prog->driver->io.sampleInfoBase +
2939 4 * sym->reg.data.sv.index),
2940 offset);
2941 }
2942 break;
2943 }
2944 case SV_SAMPLE_MASK: {
2945 ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
2946 ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
2947 Instruction *sampleid =
2948 bld.mkOp1(OP_PIXLD, TYPE_U32, bld.getSSA(), bld.mkImm(0));
2949 sampleid->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2950 Value *masked =
2951 bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ld->getDef(0),
2952 bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2953 bld.loadImm(NULL, 1), sampleid->getDef(0)));
2954 if (prog->driver->prop.fp.persampleInvocation) {
2955 bld.mkMov(i->getDef(0), masked);
2956 } else {
2957 bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), ld->getDef(0), masked,
2958 bld.mkImm(0))
2959 ->subOp = 1;
2960 }
2961 break;
2962 }
2963 case SV_BASEVERTEX:
2964 case SV_BASEINSTANCE:
2965 case SV_DRAWID:
2966 ld = bld.mkLoad(TYPE_U32, i->getDef(0),
2967 bld.mkSymbol(FILE_MEMORY_CONST,
2968 prog->driver->io.auxCBSlot,
2969 TYPE_U32,
2970 prog->driver->io.drawInfoBase +
2971 4 * (sv - SV_BASEVERTEX)),
2972 NULL);
2973 break;
2974 default:
2975 if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
2976 vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
2977 if (prog->getType() == Program::TYPE_FRAGMENT) {
2978 bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL);
2979 } else {
2980 ld = bld.mkFetch(i->getDef(0), i->dType,
2981 FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
2982 ld->perPatch = i->perPatch;
2983 }
2984 break;
2985 }
2986 bld.getBB()->remove(i);
2987 return true;
2988 }
2989
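// Lower floating-point DIV into a * rcp(b); other types are left untouched
// here.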
2990 bool
2991 NVC0LoweringPass::handleDIV(Instruction *i)
2992 {
2993 if (!isFloatType(i->dType))
2994 return true;
2995 bld.setPosition(i, false);
2996 Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
2997 i->op = OP_MUL;
2998 i->setSrc(1, rcp->getDef(0));
2999 return true;
3000 }
3001
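// Lower floating-point MOD to a - b * trunc(a * rcp(b)).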
3002 bool
3003 NVC0LoweringPass::handleMOD(Instruction *i)
3004 {
3005 if (!isFloatType(i->dType))
3006 return true;
3007 LValue *value = bld.getScratch(typeSizeof(i->dType));
3008 bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
3009 bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
3010 bld.mkOp1(OP_TRUNC, i->dType, value, value);
3011 bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
3012 i->op = OP_SUB;
3013 i->setSrc(1, value);
3014 return true;
3015 }
3016
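// Emulate SQRT where the target lacks it: for f64, sqrt(x) = x * rsq(x)
// with the result forced to 0 for x <= 0; for f32, sqrt(x) = rcp(rsq(x)).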
3017 bool
3018 NVC0LoweringPass::handleSQRT(Instruction *i)
3019 {
3020 if (targ->isOpSupported(OP_SQRT, i->dType))
3021 return true;
3022
3023 if (i->dType == TYPE_F64) {
3024 Value *pred = bld.getSSA(1, FILE_PREDICATE);
3025 Value *zero = bld.loadImm(NULL, 0.0);
3026 Value *dst = bld.getSSA(8);
3027 bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
3028 bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
3029 bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);
3030 i->op = OP_MUL;
3031 i->setSrc(1, dst);
3032 // TODO: Handle this properly with a library function
3033 } else {
3034 bld.setPosition(i, true);
3035 i->op = OP_RSQ;
3036 bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
3037 }
3038
3039 return true;
3040 }
3041
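// Lower POW(a, b) to EX2(PREEX2(b * LG2(a))).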
3042 bool
3043 NVC0LoweringPass::handlePOW(Instruction *i)
3044 {
3045 LValue *val = bld.getScratch();
3046
3047 bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
3048 bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
3049 bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
3050
3051 i->op = OP_EX2;
3052 i->setSrc(0, val);
3053 i->setSrc(1, NULL);
3054
3055 return true;
3056 }
3057
3058 bool
3059 NVC0LoweringPass::handleEXPORT(Instruction *i)
3060 {
3061 if (prog->getType() == Program::TYPE_FRAGMENT) {
3062 int id = i->getSrc(0)->reg.data.offset / 4;
3063
3064 if (i->src(0).isIndirect(0)) // TODO, ugly
3065 return false;
3066 i->op = OP_MOV;
3067 i->subOp = NV50_IR_SUBOP_MOV_FINAL;
3068 i->src(0).set(i->src(1));
3069 i->setSrc(1, NULL);
3070 i->setDef(0, new_LValue(func, FILE_GPR));
3071 i->getDef(0)->reg.data.id = id;
3072
3073 prog->maxGPR = MAX2(prog->maxGPR, id);
3074 } else
3075 if (prog->getType() == Program::TYPE_GEOMETRY) {
3076 i->setIndirect(0, 1, gpEmitAddress);
3077 }
3078 return true;
3079 }
3080
3081 bool
3082 NVC0LoweringPass::handleOUT(Instruction *i)
3083 {
3084 Instruction *prev = i->prev;
3085 ImmediateValue stream, prevStream;
3086
3087 // Only merge if the stream ids match. Also, note that the previous
3088 // instruction would have already been lowered, so we take arg1 from it.
3089 if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
3090 i->src(0).getImmediate(stream) &&
3091 prev->src(1).getImmediate(prevStream) &&
3092 stream.reg.data.u32 == prevStream.reg.data.u32) {
3093 i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
3094 delete_Instruction(prog, i);
3095 } else {
3096 assert(gpEmitAddress);
3097 i->setDef(0, gpEmitAddress);
3098 i->setSrc(1, i->getSrc(0));
3099 i->setSrc(0, gpEmitAddress);
3100 }
3101 return true;
3102 }
3103
3104 Value *
3105 NVC0LoweringPass::calculateSampleOffset(Value *sampleID)
3106 {
3107 Value *offset = bld.getScratch();
3108 if (targ->getChipset() >= NVISA_GM200_CHIPSET) {
3109 // Sample location offsets (in bytes) are calculated like so:
3110 // offset = (SV_POSITION.y % 4 * 2) + (SV_POSITION.x % 2)
3111 // offset = offset * 32 + sampleID % 8 * 4;
3112 // which is equivalent to:
3113 // offset = ((SV_POSITION.y & 0x3) << 6) + ((SV_POSITION.x & 0x1) << 5);
3114 // offset += sampleID << 2
3115
3116 // The second operand (src1) of the INSBF instructions is encoded as
3117 // 0xssll, where ss is the size and ll is the offset.
3118 // so: dest = src2 | ((src0 & ((1 << ss) - 1)) << ll)
3119
3120 // Add sample ID (offset = (sampleID & 0x7) << 2)
3121 bld.mkOp3(OP_INSBF, TYPE_U32, offset, sampleID, bld.mkImm(0x0302), bld.mkImm(0x0));
3122
3123 Symbol *xSym = bld.mkSysVal(SV_POSITION, 0);
3124 Symbol *ySym = bld.mkSysVal(SV_POSITION, 1);
3125 Value *coord = bld.getScratch();
3126
3127 // Add X coordinate (offset |= (SV_POSITION.x & 0x1) << 5)
3128 bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
3129 targ->getSVAddress(FILE_SHADER_INPUT, xSym), NULL);
3130 bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord)
3131 ->rnd = ROUND_ZI;
3132 bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0105), offset);
3133
3134 // Add Y coordinate (offset |= (SV_POSITION.y & 0x3) << 6)
3135 bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
3136 targ->getSVAddress(FILE_SHADER_INPUT, ySym), NULL);
3137 bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord)
3138 ->rnd = ROUND_ZI;
3139 bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0206), offset);
3140 } else {
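// Pre-GM200 each sample appears to store two 32-bit values (x, y),
// so the byte offset is simply sampleID * 8.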
3141 bld.mkOp2(OP_SHL, TYPE_U32, offset, sampleID, bld.mkImm(3));
3142 }
3143 return offset;
3144 }
3145
3146 // Handle programmable sample locations for GM20x+
3147 void
3148 NVC0LoweringPass::handlePIXLD(Instruction *i)
3149 {
3150 if (i->subOp != NV50_IR_SUBOP_PIXLD_OFFSET)
3151 return;
3152 if (targ->getChipset() < NVISA_GM200_CHIPSET)
3153 return;
3154
3155 assert(prog->driver->prop.fp.readsSampleLocations);
3156
3157 bld.mkLoad(TYPE_F32,
3158 i->getDef(0),
3159 bld.mkSymbol(
3160 FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
3161 TYPE_U32, prog->driver->io.sampleInfoBase),
3162 calculateSampleOffset(i->getSrc(0)));
3163
3164 bld.getBB()->remove(i);
3165 }
3166
3167 // Generate a binary predicate if an instruction is predicated by
3168 // e.g. an f32 value.
3169 void
3170 NVC0LoweringPass::checkPredicate(Instruction *insn)
3171 {
3172 Value *pred = insn->getPredicate();
3173 Value *pdst;
3174
3175 if (!pred || pred->reg.file == FILE_PREDICATE)
3176 return;
3177 pdst = new_LValue(func, FILE_PREDICATE);
3178
3179 // CAUTION: don't use pdst->getInsn, the definition might not be unique,
3180 // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
3181
3182 bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);
3183
3184 insn->setPredicate(insn->cc, pdst);
3185 }
3186
3187 //
3188 // - add quadop dance for texturing
3189 // - put FP outputs in GPRs
3190 // - convert instruction sequences
3191 //
3192 bool
3193 NVC0LoweringPass::visit(Instruction *i)
3194 {
3195 bool ret = true;
3196 bld.setPosition(i, false);
3197
3198 if (i->cc != CC_ALWAYS)
3199 checkPredicate(i);
3200
3201 switch (i->op) {
3202 case OP_TEX:
3203 case OP_TXB:
3204 case OP_TXL:
3205 case OP_TXF:
3206 case OP_TXG:
3207 return handleTEX(i->asTex());
3208 case OP_TXD:
3209 return handleTXD(i->asTex());
3210 case OP_TXLQ:
3211 return handleTXLQ(i->asTex());
3212 case OP_TXQ:
3213 return handleTXQ(i->asTex());
3214 case OP_EX2:
3215 bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
3216 i->setSrc(0, i->getDef(0));
3217 break;
3218 case OP_POW:
3219 return handlePOW(i);
3220 case OP_DIV:
3221 return handleDIV(i);
3222 case OP_MOD:
3223 return handleMOD(i);
3224 case OP_SQRT:
3225 return handleSQRT(i);
3226 case OP_EXPORT:
3227 ret = handleEXPORT(i);
3228 break;
3229 case OP_EMIT:
3230 case OP_RESTART:
3231 return handleOUT(i);
3232 case OP_RDSV:
3233 return handleRDSV(i);
3234 case OP_WRSV:
3235 return handleWRSV(i);
3236 case OP_STORE:
3237 case OP_LOAD:
3238 handleLDST(i);
3239 break;
3240 case OP_ATOM:
3241 {
3242 const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER;
3243 handleATOM(i);
3244 handleCasExch(i, cctl);
3245 }
3246 break;
3247 case OP_SULDB:
3248 case OP_SULDP:
3249 case OP_SUSTB:
3250 case OP_SUSTP:
3251 case OP_SUREDB:
3252 case OP_SUREDP:
3253 if (targ->getChipset() >= NVISA_GM107_CHIPSET)
3254 handleSurfaceOpGM107(i->asTex());
3255 else if (targ->getChipset() >= NVISA_GK104_CHIPSET)
3256 handleSurfaceOpNVE4(i->asTex());
3257 else
3258 handleSurfaceOpNVC0(i->asTex());
3259 break;
3260 case OP_SUQ:
3261 handleSUQ(i->asTex());
3262 break;
3263 case OP_BUFQ:
3264 handleBUFQ(i);
3265 break;
3266 case OP_PIXLD:
3267 handlePIXLD(i);
3268 break;
3269 default:
3270 break;
3271 }
3272
3273 /* Kepler+ has a special opcode to compute a new base address to be used
3274 * for indirect loads.
3275 *
3276 * Maxwell+ has an additional similar requirement for indirect
3277 * interpolation ops in frag shaders.
3278 */
3279 bool doAfetch = false;
3280 if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
3281 !i->perPatch &&
3282 (i->op == OP_VFETCH || i->op == OP_EXPORT) &&
3283 i->src(0).isIndirect(0)) {
3284 doAfetch = true;
3285 }
3286 if (targ->getChipset() >= NVISA_GM107_CHIPSET &&
3287 (i->op == OP_LINTERP || i->op == OP_PINTERP) &&
3288 i->src(0).isIndirect(0)) {
3289 doAfetch = true;
3290 }
3291
3292 if (doAfetch) {
3293 Value *addr = cloneShallow(func, i->getSrc(0));
3294 Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
3295 i->getSrc(0));
3296 afetch->setIndirect(0, 0, i->getIndirect(0, 0));
3297 addr->reg.data.offset = 0;
3298 i->setSrc(0, addr);
3299 i->setIndirect(0, 0, afetch->getDef(0));
3300 }
3301
3302 return ret;
3303 }
3304
3305 bool
3306 TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
3307 {
3308 if (stage == CG_STAGE_PRE_SSA) {
3309 NVC0LoweringPass pass(prog);
3310 return pass.run(prog, false, true);
3311 } else
3312 if (stage == CG_STAGE_POST_RA) {
3313 NVC0LegalizePostRA pass(prog);
3314 return pass.run(prog, false, true);
3315 } else
3316 if (stage == CG_STAGE_SSA) {
3317 NVC0LegalizeSSA pass;
3318 return pass.run(prog, false, true);
3319 }
3320 return false;
3321 }
3322
3323 } // namespace nv50_ir