src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp

   1 /*
   2  * Copyright 2011 Christoph Bumiller
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 #include "codegen/nv50_ir.h"
  24 #include "codegen/nv50_ir_build_util.h"
  25
  26 #include "codegen/nv50_ir_target_nvc0.h"
  27 #include "codegen/nv50_ir_lowering_nvc0.h"
  28
  29 #include <limits>
  30
  31 namespace nv50_ir {
  32
  33 #define QOP_ADD  0
  34 #define QOP_SUBR 1
  35 #define QOP_SUB  2
  36 #define QOP_MOV2 3
  37
  38 //             UL UR LL LR
  39 #define QUADOP(q, r, s, t)                      \
  40    ((QOP_##q << 6) | (QOP_##r << 4) |           \
  41     (QOP_##s << 2) | (QOP_##t << 0))
  42
  43 void
  44 NVC0LegalizeSSA::handleDIV(Instruction *i)
  45 {
  46    FlowInstruction *call;
  47    int builtin;
  48
  49    bld.setPosition(i, false);
  50
  51    // Generate movs to the input regs for the call we want to generate
  52    for (int s = 0; i->srcExists(s); ++s) {
  53       Instruction *ld = i->getSrc(s)->getInsn();
  54       // check if we are moving an immediate, propagate it in that case
  55       if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV) ||
  56             !(ld->src(0).getFile() == FILE_IMMEDIATE))
  57          bld.mkMovToReg(s, i->getSrc(s));
  58       else {
  59          assert(ld->getSrc(0) != NULL);
  60          bld.mkMovToReg(s, ld->getSrc(0));
  61          // Clear the src, to make code elimination possible here before we
  62          // delete the instruction i later
  63          i->setSrc(s, NULL);
  64          if (ld->isDead())
  65             delete_Instruction(prog, ld);
  66       }
  67    }
  68
  69    switch (i->dType) {
  70    case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
  71    case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
  72    default:
  73       return;
  74    }
  75    call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
  76    bld.mkMovFromReg(i->getDef(0), i->op == OP_DIV ? 0 : 1);
  77    bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
  78    bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);
  79
  80    call->fixed = 1;
  81    call->absolute = call->builtin = 1;
  82    call->target.builtin = builtin;
  83    delete_Instruction(prog, i);
  84 }
  85
  86 void
  87 NVC0LegalizeSSA::handleRCPRSQLib(Instruction *i, Value *src[])
  88 {
  89    FlowInstruction *call;
  90    Value *def[2];
  91    int builtin;
  92
  93    def[0] = bld.mkMovToReg(0, src[0])->getDef(0);
  94    def[1] = bld.mkMovToReg(1, src[1])->getDef(0);
  95
  96    if (i->op == OP_RCP)
  97       builtin = NVC0_BUILTIN_RCP_F64;
  98    else
  99       builtin = NVC0_BUILTIN_RSQ_F64;
 100
 101    call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
 102    def[0] = bld.getSSA();
 103    def[1] = bld.getSSA();
 104    bld.mkMovFromReg(def[0], 0);
 105    bld.mkMovFromReg(def[1], 1);
 106    bld.mkClobber(FILE_GPR, 0x3fc, 2);
 107    bld.mkClobber(FILE_PREDICATE, i->op == OP_RSQ ? 0x3 : 0x1, 0);
 108    bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), def[0], def[1]);
 109
 110    call->fixed = 1;
 111    call->absolute = call->builtin = 1;
 112    call->target.builtin = builtin;
 113    delete_Instruction(prog, i);
 114
 115    prog->fp64 = true;
 116 }
 117
 118 void
 119 NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
 120 {
 121    assert(i->dType == TYPE_F64);
 122    // There are instructions that will compute the high 32 bits of the 64-bit
 123    // float. We will just stick 0 in the bottom 32 bits.
 124
 125    bld.setPosition(i, false);
 126
 127    // 1. Take the source and it up.
 128    Value *src[2], *dst[2], *def = i->getDef(0);
 129    bld.mkSplit(src, 4, i->getSrc(0));
 130
 131    int chip = prog->getTarget()->getChipset();
 132    if (chip >= NVISA_GK104_CHIPSET) {
 133       handleRCPRSQLib(i, src);
 134       return;
 135    }
 136
 137    // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
 138    dst[0] = bld.loadImm(NULL, 0);
 139    dst[1] = bld.getSSA();
 140
 141    // 3. The new version of the instruction takes the high 32 bits of the
 142    // source and outputs the high 32 bits of the destination.
 143    i->setSrc(0, src[1]);
 144    i->setDef(0, dst[1]);
 145    i->setType(TYPE_F32);
 146    i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;
 147
 148    // 4. Recombine the two dst pieces back into the original destination.
 149    bld.setPosition(i, true);
 150    bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
 151 }
 152
 153 void
 154 NVC0LegalizeSSA::handleFTZ(Instruction *i)
 155 {
 156    // Only want to flush float inputs
 157    assert(i->sType == TYPE_F32);
 158
 159    // If we're already flushing denorms (and NaN's) to zero, no need for this.
 160    if (i->dnz)
 161       return;
 162
 163    // Only certain classes of operations can flush
 164    OpClass cls = prog->getTarget()->getOpClass(i->op);
 165    if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
 166        cls != OPCLASS_CONVERT)
 167       return;
 168
 169    i->ftz = true;
 170 }
 171
 172 void
 173 NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i)
 174 {
 175    if (i->tex.levelZero)
 176       return;
 177
 178    ImmediateValue lod;
 179
 180    // The LOD argument comes right after the coordinates (before depth bias,
 181    // offsets, etc).
 182    int arg = i->tex.target.getArgCount();
 183
 184    // SM30+ stores the indirect handle as a separate arg, which comes before
 185    // the LOD.
 186    if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET &&
 187        i->tex.rIndirectSrc >= 0)
 188       arg++;
 189    // SM20 stores indirect handle combined with array coordinate
 190    if (prog->getTarget()->getChipset() < NVISA_GK104_CHIPSET &&
 191        !i->tex.target.isArray() &&
 192        i->tex.rIndirectSrc >= 0)
 193       arg++;
 194
 195    if (!i->src(arg).getImmediate(lod) || !lod.isInteger(0))
 196       return;
 197
 198    if (i->op == OP_TXL)
 199       i->op = OP_TEX;
 200    i->tex.levelZero = true;
 201    i->moveSources(arg + 1, -1);
 202 }
 203
 204 void
 205 NVC0LegalizeSSA::handleShift(Instruction *lo)
 206 {
 207    Value *shift = lo->getSrc(1);
 208    Value *dst64 = lo->getDef(0);
 209    Value *src[2], *dst[2];
 210    operation op = lo->op;
 211
 212    bld.setPosition(lo, false);
 213
 214    bld.mkSplit(src, 4, lo->getSrc(0));
 215
 216    // SM30 and prior don't have the fancy new SHF.L/R ops. So the logic has to
 217    // be completely emulated. For SM35+, we can use the more directed SHF
 218    // operations.
 219    if (prog->getTarget()->getChipset() < NVISA_GK20A_CHIPSET) {
 220       // The strategy here is to handle shifts >= 32 and less than 32 as
 221       // separate parts.
 222       //
 223       // For SHL:
 224       // If the shift is <= 32, then
 225       //   (HI,LO) << x = (HI << x | (LO >> (32 - x)), LO << x)
 226       // If the shift is > 32, then
 227       //   (HI,LO) << x = (LO << (x - 32), 0)
 228       //
 229       // For SHR:
 230       // If the shift is <= 32, then
 231       //   (HI,LO) >> x = (HI >> x, (HI << (32 - x)) | LO >> x)
 232       // If the shift is > 32, then
 233       //   (HI,LO) >> x = (0, HI >> (x - 32))
 234       //
 235       // Note that on NVIDIA hardware, a shift > 32 yields a 0 value, which we
 236       // can use to our advantage. Also note the structural similarities
 237       // between the right/left cases. The main difference is swapping hi/lo
 238       // on input and output.
 239
 240       Value *x32_minus_shift, *pred, *hi1, *hi2;
 241       DataType type = isSignedIntType(lo->dType) ? TYPE_S32 : TYPE_U32;
 242       operation antiop = op == OP_SHR ? OP_SHL : OP_SHR;
 243       if (op == OP_SHR)
 244          std::swap(src[0], src[1]);
 245       bld.mkOp2(OP_ADD, TYPE_U32, (x32_minus_shift = bld.getSSA()), shift, bld.mkImm(0x20))
 246          ->src(0).mod = Modifier(NV50_IR_MOD_NEG);
 247       bld.mkCmp(OP_SET, CC_LE, TYPE_U8, (pred = bld.getSSA(1, FILE_PREDICATE)),
 248                 TYPE_U32, shift, bld.mkImm(32));
 249       // Compute HI (shift <= 32)
 250       bld.mkOp2(OP_OR, TYPE_U32, (hi1 = bld.getSSA()),
 251                 bld.mkOp2v(op, TYPE_U32, bld.getSSA(), src[1], shift),
 252                 bld.mkOp2v(antiop, TYPE_U32, bld.getSSA(), src[0], x32_minus_shift))
 253          ->setPredicate(CC_P, pred);
 254       // Compute LO (all shift values)
 255       bld.mkOp2(op, type, (dst[0] = bld.getSSA()), src[0], shift);
 256       // Compute HI (shift > 32)
 257       bld.mkOp2(op, type, (hi2 = bld.getSSA()), src[0],
 258                 bld.mkOp1v(OP_NEG, TYPE_S32, bld.getSSA(), x32_minus_shift))
 259          ->setPredicate(CC_NOT_P, pred);
 260       bld.mkOp2(OP_UNION, TYPE_U32, (dst[1] = bld.getSSA()), hi1, hi2);
 261       if (op == OP_SHR)
 262          std::swap(dst[0], dst[1]);
 263       bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
 264       delete_Instruction(prog, lo);
 265       return;
 266    }
 267
 268    Instruction *hi = new_Instruction(func, op, TYPE_U32);
 269    lo->bb->insertAfter(lo, hi);
 270
 271    hi->sType = lo->sType;
 272    lo->dType = TYPE_U32;
 273
 274    hi->setDef(0, (dst[1] = bld.getSSA()));
 275    if (lo->op == OP_SHR)
 276       hi->subOp |= NV50_IR_SUBOP_SHIFT_HIGH;
 277    lo->setDef(0, (dst[0] = bld.getSSA()));
 278
 279    bld.setPosition(hi, true);
 280
 281    if (lo->op == OP_SHL)
 282       std::swap(hi, lo);
 283
 284    hi->setSrc(0, new_ImmediateValue(prog, 0u));
 285    hi->setSrc(1, shift);
 286    hi->setSrc(2, lo->op == OP_SHL ? src[0] : src[1]);
 287
 288    lo->setSrc(0, src[0]);
 289    lo->setSrc(1, shift);
 290    lo->setSrc(2, src[1]);
 291
 292    bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
 293 }
 294
 295 void
 296 NVC0LegalizeSSA::handleSET(CmpInstruction *cmp)
 297 {
 298    DataType hTy = cmp->sType == TYPE_S64 ? TYPE_S32 : TYPE_U32;
 299    Value *carry;
 300    Value *src0[2], *src1[2];
 301    bld.setPosition(cmp, false);
 302
 303    bld.mkSplit(src0, 4, cmp->getSrc(0));
 304    bld.mkSplit(src1, 4, cmp->getSrc(1));
 305    bld.mkOp2(OP_SUB, hTy, NULL, src0[0], src1[0])
 306       ->setFlagsDef(0, (carry = bld.getSSA(1, FILE_FLAGS)));
 307    cmp->setFlagsSrc(cmp->srcCount(), carry);
 308    cmp->setSrc(0, src0[1]);
 309    cmp->setSrc(1, src1[1]);
 310    cmp->sType = hTy;
 311 }
 312
 313 void
 314 NVC0LegalizeSSA::handleBREV(Instruction *i)
 315 {
 316    i->op = OP_EXTBF;
 317    i->subOp = NV50_IR_SUBOP_EXTBF_REV;
 318    i->setSrc(1, bld.mkImm(0x2000));
 319 }
 320
 321 bool
 322 NVC0LegalizeSSA::visit(Function *fn)
 323 {
 324    bld.setProgram(fn->getProgram());
 325    return true;
 326 }
 327
 328 bool
 329 NVC0LegalizeSSA::visit(BasicBlock *bb)
 330 {
 331    Instruction *next;
 332    for (Instruction *i = bb->getEntry(); i; i = next) {
 333       next = i->next;
 334
 335       if (i->sType == TYPE_F32 && prog->getType() != Program::TYPE_COMPUTE)
 336          handleFTZ(i);
 337
 338       switch (i->op) {
 339       case OP_DIV:
 340       case OP_MOD:
 341          if (i->sType != TYPE_F32)
 342             handleDIV(i);
 343          break;
 344       case OP_RCP:
 345       case OP_RSQ:
 346          if (i->dType == TYPE_F64)
 347             handleRCPRSQ(i);
 348          break;
 349       case OP_TXL:
 350       case OP_TXF:
 351          handleTEXLOD(i->asTex());
 352          break;
 353       case OP_SHR:
 354       case OP_SHL:
 355          if (typeSizeof(i->sType) == 8)
 356             handleShift(i);
 357          break;
 358       case OP_SET:
 359       case OP_SET_AND:
 360       case OP_SET_OR:
 361       case OP_SET_XOR:
 362          if (typeSizeof(i->sType) == 8 && i->sType != TYPE_F64)
 363             handleSET(i->asCmp());
 364          break;
 365       case OP_BREV:
 366          handleBREV(i);
 367          break;
 368       default:
 369          break;
 370       }
 371    }
 372    return true;
 373 }
 374
 375 NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
 376    : rZero(NULL),
 377      carry(NULL),
 378      pOne(NULL),
 379      needTexBar(prog->getTarget()->getChipset() >= 0xe0 &&
 380                 prog->getTarget()->getChipset() < 0x110)
 381 {
 382 }
 383
 384 bool
 385 NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
 386                                     const Instruction *early) const
 387 {
 388    if (early->bb == later->bb)
 389       return early->serial < later->serial;
 390    return later->bb->dominatedBy(early->bb);
 391 }
 392
 393 void
 394 NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
 395                               Instruction *usei, const Instruction *texi)
 396 {
 397    bool add = true;
 398    bool dominated = insnDominatedBy(usei, texi);
 399    // Uses before the tex have to all be included. Just because an earlier
 400    // instruction dominates another instruction doesn't mean that there's no
 401    // way to get from the tex to the later instruction. For example you could
 402    // have nested loops, with the tex in the inner loop, and uses before it in
 403    // both loops - even though the outer loop's instruction would dominate the
 404    // inner's, we still want a texbar before the inner loop's instruction.
 405    //
 406    // However we can still use the eliding logic between uses dominated by the
 407    // tex instruction, as that is unambiguously correct.
 408    if (dominated) {
 409       for (std::list<TexUse>::iterator it = uses.begin(); it != uses.end();) {
 410          if (it->after) {
 411             if (insnDominatedBy(usei, it->insn)) {
 412                add = false;
 413                break;
 414             }
 415             if (insnDominatedBy(it->insn, usei)) {
 416                it = uses.erase(it);
 417                continue;
 418             }
 419          }
 420          ++it;
 421       }
 422    }
 423    if (add)
 424       uses.push_back(TexUse(usei, texi, dominated));
 425 }
 426
 427 // While it might be tempting to use the an algorithm that just looks at tex
 428 // uses, not all texture results are guaranteed to be used on all paths. In
 429 // the case where along some control flow path a texture result is never used,
 430 // we might reuse that register for something else, creating a
 431 // write-after-write hazard. So we have to manually look through all
 432 // instructions looking for ones that reference the registers in question.
 433 void
 434 NVC0LegalizePostRA::findFirstUses(
 435    Instruction *texi, std::list<TexUse> &uses)
 436 {
 437    int minGPR = texi->def(0).rep()->reg.data.id;
 438    int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;
 439
 440    unordered_set<const BasicBlock *> visited;
 441    findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
 442 }
 443
 444 void
 445 NVC0LegalizePostRA::findFirstUsesBB(
 446    int minGPR, int maxGPR, Instruction *start,
 447    const Instruction *texi, std::list<TexUse> &uses,
 448    unordered_set<const BasicBlock *> &visited)
 449 {
 450    const BasicBlock *bb = start->bb;
 451
 452    // We don't process the whole bb the first time around. This is correct,
 453    // however we might be in a loop and hit this BB again, and need to process
 454    // the full thing. So only mark a bb as visited if we processed it from the
 455    // beginning.
 456    if (start == bb->getEntry()) {
 457       if (visited.find(bb) != visited.end())
 458          return;
 459       visited.insert(bb);
 460    }
 461
 462    for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
 463       if (insn->isNop())
 464          continue;
 465
 466       for (int d = 0; insn->defExists(d); ++d) {
 467          const Value *def = insn->def(d).rep();
 468          if (insn->def(d).getFile() != FILE_GPR ||
 469              def->reg.data.id + def->reg.size / 4 - 1 < minGPR ||
 470              def->reg.data.id > maxGPR)
 471             continue;
 472          addTexUse(uses, insn, texi);
 473          return;
 474       }
 475
 476       for (int s = 0; insn->srcExists(s); ++s) {
 477          const Value *src = insn->src(s).rep();
 478          if (insn->src(s).getFile() != FILE_GPR ||
 479              src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
 480              src->reg.data.id > maxGPR)
 481             continue;
 482          addTexUse(uses, insn, texi);
 483          return;
 484       }
 485    }
 486
 487    for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
 488       findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
 489                       texi, uses, visited);
 490    }
 491 }
 492
 493 // Texture barriers:
 494 // This pass is a bit long and ugly and can probably be optimized.
 495 //
 496 // 1. obtain a list of TEXes and their outputs' first use(s)
 497 // 2. calculate the barrier level of each first use (minimal number of TEXes,
 498 //    over all paths, between the TEX and the use in question)
 499 // 3. for each barrier, if all paths from the source TEX to that barrier
 500 //    contain a barrier of lesser level, it can be culled
 501 bool
 502 NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
 503 {
 504    std::list<TexUse> *uses;
 505    std::vector<Instruction *> texes;
 506    std::vector<int> bbFirstTex;
 507    std::vector<int> bbFirstUse;
 508    std::vector<int> texCounts;
 509    std::vector<TexUse> useVec;
 510    ArrayList insns;
 511
 512    fn->orderInstructions(insns);
 513
 514    texCounts.resize(fn->allBBlocks.getSize(), 0);
 515    bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
 516    bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());
 517
 518    // tag BB CFG nodes by their id for later
 519    for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
 520       BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
 521       if (bb)
 522          bb->cfg.tag = bb->getId();
 523    }
 524
 525    // gather the first uses for each TEX
 526    for (int i = 0; i < insns.getSize(); ++i) {
 527       Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
 528       if (isTextureOp(tex->op)) {
 529          texes.push_back(tex);
 530          if (!texCounts.at(tex->bb->getId()))
 531             bbFirstTex[tex->bb->getId()] = texes.size() - 1;
 532          texCounts[tex->bb->getId()]++;
 533       }
 534    }
 535    insns.clear();
 536    if (texes.empty())
 537       return false;
 538    uses = new std::list<TexUse>[texes.size()];
 539    if (!uses)
 540       return false;
 541    for (size_t i = 0; i < texes.size(); ++i) {
 542       findFirstUses(texes[i], uses[i]);
 543    }
 544
 545    // determine the barrier level at each use
 546    for (size_t i = 0; i < texes.size(); ++i) {
 547       for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
 548            ++u) {
 549          BasicBlock *tb = texes[i]->bb;
 550          BasicBlock *ub = u->insn->bb;
 551          if (tb == ub) {
 552             u->level = 0;
 553             for (size_t j = i + 1; j < texes.size() &&
 554                     texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
 555                  ++j)
 556                u->level++;
 557          } else {
 558             u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
 559                                                       &ub->cfg, texCounts);
 560             if (u->level < 0) {
 561                WARN("Failed to find path TEX -> TEXBAR\n");
 562                u->level = 0;
 563                continue;
 564             }
 565             // this counted all TEXes in the origin block, correct that
 566             u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
 567             // and did not count the TEXes in the destination block, add those
 568             for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
 569                     texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
 570                  ++j)
 571                u->level++;
 572          }
 573          assert(u->level >= 0);
 574          useVec.push_back(*u);
 575       }
 576    }
 577    delete[] uses;
 578
 579    // insert the barriers
 580    for (size_t i = 0; i < useVec.size(); ++i) {
 581       Instruction *prev = useVec[i].insn->prev;
 582       if (useVec[i].level < 0)
 583          continue;
 584       if (prev && prev->op == OP_TEXBAR) {
 585          if (prev->subOp > useVec[i].level)
 586             prev->subOp = useVec[i].level;
 587          prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
 588       } else {
 589          Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
 590          bar->fixed = 1;
 591          bar->subOp = useVec[i].level;
 592          // make use explicit to ease latency calculation
 593          bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
 594          useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
 595       }
 596    }
 597
 598    if (fn->getProgram()->optLevel < 3)
 599       return true;
 600
 601    std::vector<Limits> limitT, limitB, limitS; // entry, exit, single
 602
 603    limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
 604    limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
 605    limitS.resize(fn->allBBlocks.getSize());
 606
 607    // cull unneeded barriers (should do that earlier, but for simplicity)
 608    IteratorRef bi = fn->cfg.iteratorCFG();
 609    // first calculate min/max outstanding TEXes for each BB
 610    for (bi->reset(); !bi->end(); bi->next()) {
 611       Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
 612       BasicBlock *bb = BasicBlock::get(n);
 613       int min = 0;
 614       int max = std::numeric_limits<int>::max();
 615       for (Instruction *i = bb->getFirst(); i; i = i->next) {
 616          if (isTextureOp(i->op)) {
 617             min++;
 618             if (max < std::numeric_limits<int>::max())
 619                max++;
 620          } else
 621          if (i->op == OP_TEXBAR) {
 622             min = MIN2(min, i->subOp);
 623             max = MIN2(max, i->subOp);
 624          }
 625       }
 626       // limits when looking at an isolated block
 627       limitS[bb->getId()].min = min;
 628       limitS[bb->getId()].max = max;
 629    }
 630    // propagate the min/max values
 631    for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
 632       for (bi->reset(); !bi->end(); bi->next()) {
 633          Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
 634          BasicBlock *bb = BasicBlock::get(n);
 635          const int bbId = bb->getId();
 636          for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
 637             BasicBlock *in = BasicBlock::get(ei.getNode());
 638             const int inId = in->getId();
 639             limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
 640             limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
 641          }
 642          // I just hope this is correct ...
 643          if (limitS[bbId].max == std::numeric_limits<int>::max()) {
 644             // no barrier
 645             limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
 646             limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
 647          } else {
 648             // block contained a barrier
 649             limitB[bbId].min = MIN2(limitS[bbId].max,
 650                                     limitT[bbId].min + limitS[bbId].min);
 651             limitB[bbId].max = MIN2(limitS[bbId].max,
 652                                     limitT[bbId].max + limitS[bbId].min);
 653          }
 654       }
 655    }
 656    // finally delete unnecessary barriers
 657    for (bi->reset(); !bi->end(); bi->next()) {
 658       Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
 659       BasicBlock *bb = BasicBlock::get(n);
 660       Instruction *prev = NULL;
 661       Instruction *next;
 662       int max = limitT[bb->getId()].max;
 663       for (Instruction *i = bb->getFirst(); i; i = next) {
 664          next = i->next;
 665          if (i->op == OP_TEXBAR) {
 666             if (i->subOp >= max) {
 667                delete_Instruction(prog, i);
 668                i = NULL;
 669             } else {
 670                max = i->subOp;
 671                if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
 672                   delete_Instruction(prog, prev);
 673                   prev = NULL;
 674                }
 675             }
 676          } else
 677          if (isTextureOp(i->op)) {
 678             max++;
 679          }
 680          if (i && !i->isNop())
 681             prev = i;
 682       }
 683    }
 684    return true;
 685 }
 686
 687 bool
 688 NVC0LegalizePostRA::visit(Function *fn)
 689 {
 690    if (needTexBar)
 691       insertTextureBarriers(fn);
 692
 693    rZero = new_LValue(fn, FILE_GPR);
 694    pOne = new_LValue(fn, FILE_PREDICATE);
 695    carry = new_LValue(fn, FILE_FLAGS);
 696
 697    rZero->reg.data.id = (prog->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET) ? 255 : 63;
 698    carry->reg.data.id = 0;
 699    pOne->reg.data.id = 7;
 700
 701    return true;
 702 }
 703
 704 void
 705 NVC0LegalizePostRA::replaceZero(Instruction *i)
 706 {
 707    for (int s = 0; i->srcExists(s); ++s) {
 708       if (s == 2 && i->op == OP_SUCLAMP)
 709          continue;
 710       if (s == 1 && i->op == OP_SHLADD)
 711          continue;
 712       ImmediateValue *imm = i->getSrc(s)->asImm();
 713       if (imm) {
 714          if (i->op == OP_SELP && s == 2) {
 715             i->setSrc(s, pOne);
 716             if (imm->reg.data.u64 == 0)
 717                i->src(s).mod = i->src(s).mod ^ Modifier(NV50_IR_MOD_NOT);
 718          } else if (imm->reg.data.u64 == 0) {
 719             i->setSrc(s, rZero);
 720          }
 721       }
 722    }
 723 }
 724
 725 // replace CONT with BRA for single unconditional continue
 726 bool
 727 NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
 728 {
 729    if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
 730       return false;
 731    Graph::EdgeIterator ei = bb->cfg.incident();
 732    if (ei.getType() != Graph::Edge::BACK)
 733       ei.next();
 734    if (ei.getType() != Graph::Edge::BACK)
 735       return false;
 736    BasicBlock *contBB = BasicBlock::get(ei.getNode());
 737
 738    if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
 739        contBB->getExit()->getPredicate())
 740       return false;
 741    contBB->getExit()->op = OP_BRA;
 742    bb->remove(bb->getEntry()); // delete PRECONT
 743
 744    ei.next();
 745    assert(ei.end() || ei.getType() != Graph::Edge::BACK);
 746    return true;
 747 }
 748
 749 // replace branches to join blocks with join ops
 750 void
 751 NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
 752 {
 753    if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
 754       return;
 755    for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
 756       BasicBlock *in = BasicBlock::get(ei.getNode());
 757       Instruction *exit = in->getExit();
 758       if (!exit) {
 759          in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
 760          // there should always be a terminator instruction
 761          WARN("inserted missing terminator in BB:%i\n", in->getId());
 762       } else
 763       if (exit->op == OP_BRA) {
 764          exit->op = OP_JOIN;
 765          exit->asFlow()->limit = 1; // must-not-propagate marker
 766       }
 767    }
 768    bb->remove(bb->getEntry());
 769 }
 770
 771 // replaces instructions which would end up as f2f or i2i with faster
 772 // alternatives:
 773 //  - fabs(a)     -> fadd(0, abs a)
 774 //  - fneg(a)     -> fadd(neg 0, neg a)
 775 //  - ineg(a)     -> iadd(0, neg a)
 776 //  - fneg(abs a) -> fadd(neg 0, neg abs a)
 777 //  - sat(a)      -> sat add(0, a)
 778 void
 779 NVC0LegalizePostRA::replaceCvt(Instruction *cvt)
 780 {
 781    if (!isFloatType(cvt->sType) && typeSizeof(cvt->sType) != 4)
 782       return;
 783    if (cvt->sType != cvt->dType)
 784       return;
 785    // we could make it work, but in this case we have optimizations disabled
 786    // and we don't really care either way.
 787    if (cvt->src(0).getFile() != FILE_GPR &&
 788        cvt->src(0).getFile() != FILE_MEMORY_CONST)
 789       return;
 790
 791    Modifier mod0, mod1;
 792
 793    switch (cvt->op) {
 794    case OP_ABS:
 795       if (cvt->src(0).mod)
 796          return;
 797       if (!isFloatType(cvt->sType))
 798          return;
 799       mod0 = 0;
 800       mod1 = NV50_IR_MOD_ABS;
 801       break;
 802    case OP_NEG:
 803       if (!isFloatType(cvt->sType) && cvt->src(0).mod)
 804          return;
 805       if (isFloatType(cvt->sType) &&
 806           (cvt->src(0).mod && cvt->src(0).mod != Modifier(NV50_IR_MOD_ABS)))
 807          return;
 808
 809       mod0 = isFloatType(cvt->sType) ? NV50_IR_MOD_NEG : 0;
 810       mod1 = cvt->src(0).mod == Modifier(NV50_IR_MOD_ABS) ?
 811          NV50_IR_MOD_NEG_ABS : NV50_IR_MOD_NEG;
 812       break;
 813    case OP_SAT:
 814       if (!isFloatType(cvt->sType) && cvt->src(0).mod.abs())
 815          return;
 816       mod0 = 0;
 817       mod1 = cvt->src(0).mod;
 818       cvt->saturate = true;
 819       break;
 820    default:
 821       return;
 822    }
 823
 824    cvt->op = OP_ADD;
 825    cvt->moveSources(0, 1);
 826    cvt->setSrc(0, rZero);
 827    cvt->src(0).mod = mod0;
 828    cvt->src(1).mod = mod1;
 829 }
 830
 831 bool
 832 NVC0LegalizePostRA::visit(BasicBlock *bb)
 833 {
 834    Instruction *i, *next;
 835
 836    // remove pseudo operations and non-fixed no-ops, split 64 bit operations
 837    for (i = bb->getFirst(); i; i = next) {
 838       next = i->next;
 839       if (i->op == OP_EMIT || i->op == OP_RESTART) {
 840          if (!i->getDef(0)->refCount())
 841             i->setDef(0, NULL);
 842          if (i->src(0).getFile() == FILE_IMMEDIATE)
 843             i->setSrc(0, rZero); // initial value must be 0
 844          replaceZero(i);
 845       } else
 846       if (i->isNop()) {
 847          bb->remove(i);
 848       } else
 849       if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
 850           prog->getType() != Program::TYPE_COMPUTE) {
 851          // It seems like barriers are never required for tessellation since
 852          // the warp size is 32, and there are always at most 32 tcs threads.
 853          bb->remove(i);
 854       } else
 855       if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) {
 856          int offset = i->src(0).get()->reg.data.offset;
 857          if (abs(offset) >= 0x10000)
 858             i->src(0).get()->reg.fileIndex += offset >> 16;
 859          i->src(0).get()->reg.data.offset = (int)(short)offset;
 860       } else {
 861          // TODO: Move this to before register allocation for operations that
 862          // need the $c register !
 863          if (typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) {
 864             Instruction *hi;
 865             hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
 866             if (hi)
 867                next = hi;
 868          }
 869
 870          if (i->op != OP_MOV && i->op != OP_PFETCH)
 871             replaceZero(i);
 872
 873          if (i->op == OP_SAT || i->op == OP_NEG || i->op == OP_ABS)
 874             replaceCvt(i);
 875       }
 876    }
 877    if (!bb->getEntry())
 878       return true;
 879
 880    if (!tryReplaceContWithBra(bb))
 881       propagateJoin(bb);
 882
 883    return true;
 884 }
 885
 886 NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
 887 {
 888    bld.setProgram(prog);
 889 }
 890
 891 bool
 892 NVC0LoweringPass::visit(Function *fn)
 893 {
 894    if (prog->getType() == Program::TYPE_GEOMETRY) {
 895       assert(!strncmp(fn->getName(), "MAIN", 4));
 896       // TODO: when we generate actual functions pass this value along somehow
 897       bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
 898       gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
 899       if (fn->cfgExit) {
 900          bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
 901          if (prog->getTarget()->getChipset() >= NVISA_GV100_CHIPSET)
 902             bld.mkOp1(OP_FINAL, TYPE_NONE, NULL, gpEmitAddress)->fixed = 1;
 903          bld.mkMovToReg(0, gpEmitAddress);
 904       }
 905    }
 906    return true;
 907 }
 908
 909 bool
 910 NVC0LoweringPass::visit(BasicBlock *bb)
 911 {
 912    return true;
 913 }
 914
 915 inline Value *
 916 NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
 917 {
 918    uint8_t b = prog->driver->io.auxCBSlot;
 919    uint32_t off = prog->driver->io.texBindBase + slot * 4;
 920
 921    if (ptr)
 922       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(2));
 923
 924    return bld.
 925       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
 926 }
 927
 928 // move array source to first slot, convert to u16, add indirections
 929 bool
 930 NVC0LoweringPass::handleTEX(TexInstruction *i)
 931 {
 932    const int dim = i->tex.target.getDim() + i->tex.target.isCube();
 933    const int arg = i->tex.target.getArgCount();
 934    const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
 935    const int chipset = prog->getTarget()->getChipset();
 936
 937    /* Only normalize in the non-explicit derivatives case. For explicit
 938     * derivatives, this is handled in handleManualTXD.
 939     */
 940    if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
 941       Value *src[3], *val;
 942       int c;
 943       for (c = 0; c < 3; ++c)
 944          src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
 945       val = bld.getScratch();
 946       bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
 947       bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
 948       bld.mkOp1(OP_RCP, TYPE_F32, val, val);
 949       for (c = 0; c < 3; ++c) {
 950          i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
 951                                  i->getSrc(c), val));
 952       }
 953    }
 954
 955    // Arguments to the TEX instruction are a little insane. Even though the
 956    // encoding is identical between SM20 and SM30, the arguments mean
 957    // different things between Fermi and Kepler+. A lot of arguments are
 958    // optional based on flags passed to the instruction. This summarizes the
 959    // order of things.
 960    //
 961    // Fermi:
 962    //  array/indirect
 963    //  coords
 964    //  sample
 965    //  lod bias
 966    //  depth compare
 967    //  offsets:
 968    //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
 969    //    - other: 4 bits each, single reg
 970    //
 971    // Kepler+:
 972    //  indirect handle
 973    //  array (+ offsets for txd in upper 16 bits)
 974    //  coords
 975    //  sample
 976    //  lod bias
 977    //  depth compare
 978    //  offsets (same as fermi, except txd which takes it with array)
 979    //
 980    // Maxwell (tex):
 981    //  array
 982    //  coords
 983    //  indirect handle
 984    //  sample
 985    //  lod bias
 986    //  depth compare
 987    //  offsets
 988    //
 989    // Maxwell (txd):
 990    //  indirect handle
 991    //  coords
 992    //  array + offsets
 993    //  derivatives
 994
 995    if (chipset >= NVISA_GK104_CHIPSET) {
 996       if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
 997          // XXX this ignores tsc, and assumes a 1:1 mapping
 998          assert(i->tex.rIndirectSrc >= 0);
 999          if (!i->tex.bindless) {
1000             Value *hnd = loadTexHandle(i->getIndirectR(), i->tex.r);
1001             i->tex.r = 0xff;
1002             i->tex.s = 0x1f;
1003             i->setIndirectR(hnd);
1004          }
1005          i->setIndirectS(NULL);
1006       } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
1007          if (i->tex.r == 0xffff)
1008             i->tex.r = prog->driver->io.fbtexBindBase / 4;
1009          else
1010             i->tex.r += prog->driver->io.texBindBase / 4;
1011          i->tex.s  = 0; // only a single cX[] value possible here
1012       } else {
1013          Value *hnd = bld.getScratch();
1014          Value *rHnd = loadTexHandle(NULL, i->tex.r);
1015          Value *sHnd = loadTexHandle(NULL, i->tex.s);
1016
1017          bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);
1018
1019          i->tex.r = 0; // not used for indirect tex
1020          i->tex.s = 0;
1021          i->setIndirectR(hnd);
1022       }
1023       if (i->tex.target.isArray()) {
1024          LValue *layer = new_LValue(func, FILE_GPR);
1025          Value *src = i->getSrc(lyr);
1026          const int sat = (i->op == OP_TXF) ? 1 : 0;
1027          DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
1028          bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
1029          if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
1030             for (int s = dim; s >= 1; --s)
1031                i->setSrc(s, i->getSrc(s - 1));
1032             i->setSrc(0, layer);
1033          } else {
1034             i->setSrc(dim, layer);
1035          }
1036       }
1037       // Move the indirect reference to the first place
1038       if (i->tex.rIndirectSrc >= 0 && (
1039                 i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
1040          Value *hnd = i->getIndirectR();
1041
1042          i->setIndirectR(NULL);
1043          i->moveSources(0, 1);
1044          i->setSrc(0, hnd);
1045          i->tex.rIndirectSrc = 0;
1046          i->tex.sIndirectSrc = -1;
1047       }
1048       // Move the indirect reference to right after the coords
1049       else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) {
1050          Value *hnd = i->getIndirectR();
1051
1052          i->setIndirectR(NULL);
1053          i->moveSources(arg, 1);
1054          i->setSrc(arg, hnd);
1055          i->tex.rIndirectSrc = 0;
1056          i->tex.sIndirectSrc = -1;
1057       }
1058    } else
1059    // (nvc0) generate and move the tsc/tic/array source to the front
1060    if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
1061       LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
1062
1063       Value *ticRel = i->getIndirectR();
1064       Value *tscRel = i->getIndirectS();
1065
1066       if (i->tex.r == 0xffff) {
1067          i->tex.r = 0x20;
1068          i->tex.s = 0x10;
1069       }
1070
1071       if (ticRel) {
1072          i->setSrc(i->tex.rIndirectSrc, NULL);
1073          if (i->tex.r)
1074             ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
1075                                 ticRel, bld.mkImm(i->tex.r));
1076       }
1077       if (tscRel) {
1078          i->setSrc(i->tex.sIndirectSrc, NULL);
1079          if (i->tex.s)
1080             tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
1081                                 tscRel, bld.mkImm(i->tex.s));
1082       }
1083
1084       Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
1085       if (arrayIndex) {
1086          for (int s = dim; s >= 1; --s)
1087             i->setSrc(s, i->getSrc(s - 1));
1088          i->setSrc(0, arrayIndex);
1089       } else {
1090          i->moveSources(0, 1);
1091       }
1092
1093       if (arrayIndex) {
1094          int sat = (i->op == OP_TXF) ? 1 : 0;
1095          DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
1096          bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
1097       } else {
1098          bld.loadImm(src, 0);
1099       }
1100
1101       if (ticRel)
1102          bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
1103       if (tscRel)
1104          bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
1105
1106       i->setSrc(0, src);
1107    }
1108
1109    // For nvc0, the sample id has to be in the second operand, as the offset
1110    // does. Right now we don't know how to pass both in, and this case can't
1111    // happen with OpenGL. On nve0, the sample id is part of the texture
1112    // coordinate argument.
1113    assert(chipset >= NVISA_GK104_CHIPSET ||
1114           !i->tex.useOffsets || !i->tex.target.isMS());
1115
1116    // offset is between lod and dc
1117    if (i->tex.useOffsets) {
1118       int n, c;
1119       int s = i->srcCount(0xff, true);
1120       if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
1121          if (i->tex.target.isShadow())
1122             s--;
1123          if (i->srcExists(s)) // move potential predicate out of the way
1124             i->moveSources(s, 1);
1125          if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
1126             i->moveSources(s + 1, 1);
1127       }
1128       if (i->op == OP_TXG) {
1129          // Either there is 1 offset, which goes into the 2 low bytes of the
1130          // first source, or there are 4 offsets, which go into 2 sources (8
1131          // values, 1 byte each).
1132          Value *offs[2] = {NULL, NULL};
1133          for (n = 0; n < i->tex.useOffsets; n++) {
1134             for (c = 0; c < 2; ++c) {
1135                if ((n % 2) == 0 && c == 0)
1136                   bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get());
1137                else
1138                   bld.mkOp3(OP_INSBF, TYPE_U32,
1139                             offs[n / 2],
1140                             i->offset[n][c].get(),
1141                             bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
1142                             offs[n / 2]);
1143             }
1144          }
1145          i->setSrc(s, offs[0]);
1146          if (offs[1])
1147             i->setSrc(s + 1, offs[1]);
1148       } else {
1149          unsigned imm = 0;
1150          assert(i->tex.useOffsets == 1);
1151          for (c = 0; c < 3; ++c) {
1152             ImmediateValue val;
1153             if (!i->offset[0][c].getImmediate(val))
1154                assert(!"non-immediate offset passed to non-TXG");
1155             imm |= (val.reg.data.u32 & 0xf) << (c * 4);
1156          }
1157          if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
1158             // The offset goes into the upper 16 bits of the array index. So
1159             // create it if it's not already there, and INSBF it if it already
1160             // is.
1161             s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
1162             if (chipset >= NVISA_GM107_CHIPSET)
1163                s += dim;
1164             if (i->tex.target.isArray()) {
1165                Value *offset = bld.getScratch();
1166                bld.mkOp3(OP_INSBF, TYPE_U32, offset,
1167                          bld.loadImm(NULL, imm), bld.mkImm(0xc10),
1168                          i->getSrc(s));
1169                i->setSrc(s, offset);
1170             } else {
1171                i->moveSources(s, 1);
1172                i->setSrc(s, bld.loadImm(NULL, imm << 16));
1173             }
1174          } else {
1175             i->setSrc(s, bld.loadImm(NULL, imm));
1176          }
1177       }
1178    }
1179
1180    return true;
1181 }
1182
1183 bool
1184 NVC0LoweringPass::handleManualTXD(TexInstruction *i)
1185 {
1186    // Always done from the l0 perspective. This is the way that NVIDIA's
1187    // driver does it, and doing it from the "current" lane's perpsective
1188    // doesn't seem to always work for reasons that aren't altogether clear,
1189    // even in frag shaders.
1190    //
1191    // Note that we must move not only the coordinates into lane0, but also all
1192    // ancillary arguments, like array indices and depth compare as they may
1193    // differ between lanes. Offsets for TXD are supposed to be uniform, so we
1194    // leave them alone.
1195    static const uint8_t qOps[2] =
1196       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) };
1197
1198    Value *def[4][4];
1199    Value *crd[3], *arr[2], *shadow;
1200    Instruction *tex;
1201    Value *zero = bld.loadImm(bld.getSSA(), 0);
1202    int l, c;
1203    const int dim = i->tex.target.getDim() + i->tex.target.isCube();
1204
1205    // This function is invoked after handleTEX lowering, so we have to expect
1206    // the arguments in the order that the hw wants them. For Fermi, array and
1207    // indirect are both in the leading arg, while for Kepler, array and
1208    // indirect are separate (and both precede the coordinates). Maxwell is
1209    // handled in a separate function.
1210    int array;
1211    if (targ->getChipset() < NVISA_GK104_CHIPSET)
1212       array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
1213    else
1214       array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);
1215
1216    i->op = OP_TEX; // no need to clone dPdx/dPdy later
1217
1218    for (c = 0; c < dim; ++c)
1219       crd[c] = bld.getScratch();
1220    for (c = 0; c < array; ++c)
1221       arr[c] = bld.getScratch();
1222    shadow = bld.getScratch();
1223
1224    for (l = 0; l < 4; ++l) {
1225       Value *src[3], *val;
1226
1227       bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
1228       // we're using the texture result from lane 0 in all cases, so make sure
1229       // that lane 0 is pointing at the proper array index, indirect value,
1230       // and depth compare.
1231       if (l != 0) {
1232          for (c = 0; c < array; ++c)
1233             bld.mkQuadop(0x00, arr[c], l, i->getSrc(c), zero);
1234          if (i->tex.target.isShadow()) {
1235             // The next argument after coords is the depth compare
1236             bld.mkQuadop(0x00, shadow, l, i->getSrc(array + dim), zero);
1237          }
1238       }
1239       // mov position coordinates from lane l to all lanes
1240       for (c = 0; c < dim; ++c)
1241          bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
1242       // add dPdx from lane l to lanes dx
1243       for (c = 0; c < dim; ++c)
1244          bld.mkQuadop(qOps[0], crd[c], l, i->dPdx[c].get(), crd[c]);
1245       // add dPdy from lane l to lanes dy
1246       for (c = 0; c < dim; ++c)
1247          bld.mkQuadop(qOps[1], crd[c], l, i->dPdy[c].get(), crd[c]);
1248       // normalize cube coordinates
1249       if (i->tex.target.isCube()) {
1250          for (c = 0; c < 3; ++c)
1251             src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
1252          val = bld.getScratch();
1253          bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
1254          bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
1255          bld.mkOp1(OP_RCP, TYPE_F32, val, val);
1256          for (c = 0; c < 3; ++c)
1257             src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
1258       } else {
1259          for (c = 0; c < dim; ++c)
1260             src[c] = crd[c];
1261       }
1262       // texture
1263       bld.insert(tex = cloneForward(func, i));
1264       if (l != 0) {
1265          for (c = 0; c < array; ++c)
1266             tex->setSrc(c, arr[c]);
1267          if (i->tex.target.isShadow())
1268             tex->setSrc(array + dim, shadow);
1269       }
1270       for (c = 0; c < dim; ++c)
1271          tex->setSrc(c + array, src[c]);
1272       // broadcast results from lane 0 to all lanes so that the moves *into*
1273       // the target lane pick up the proper value.
1274       if (l != 0)
1275          for (c = 0; i->defExists(c); ++c)
1276             bld.mkQuadop(0x00, tex->getDef(c), 0, tex->getDef(c), zero);
1277       bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
1278
1279       // save results
1280       for (c = 0; i->defExists(c); ++c) {
1281          Instruction *mov;
1282          def[c][l] = bld.getSSA();
1283          mov = bld.mkMov(def[c][l], tex->getDef(c));
1284          mov->fixed = 1;
1285          mov->lanes = 1 << l;
1286       }
1287    }
1288
1289    for (c = 0; i->defExists(c); ++c) {
1290       Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
1291       for (l = 0; l < 4; ++l)
1292          u->setSrc(l, def[c][l]);
1293    }
1294
1295    i->bb->remove(i);
1296    return true;
1297 }
1298
1299 bool
1300 NVC0LoweringPass::handleTXD(TexInstruction *txd)
1301 {
1302    int dim = txd->tex.target.getDim() + txd->tex.target.isCube();
1303    unsigned arg = txd->tex.target.getArgCount();
1304    unsigned expected_args = arg;
1305    const int chipset = prog->getTarget()->getChipset();
1306
1307    if (chipset >= NVISA_GK104_CHIPSET) {
1308       if (!txd->tex.target.isArray() && txd->tex.useOffsets)
1309          expected_args++;
1310       if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
1311          expected_args++;
1312    } else {
1313       if (txd->tex.useOffsets)
1314          expected_args++;
1315       if (!txd->tex.target.isArray() && (
1316                 txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))
1317          expected_args++;
1318    }
1319
1320    if (expected_args > 4 ||
1321        dim > 2 ||
1322        txd->tex.target.isShadow())
1323       txd->op = OP_TEX;
1324
1325    handleTEX(txd);
1326    while (txd->srcExists(arg))
1327       ++arg;
1328
1329    txd->tex.derivAll = true;
1330    if (txd->op == OP_TEX)
1331       return handleManualTXD(txd);
1332
1333    assert(arg == expected_args);
1334    for (int c = 0; c < dim; ++c) {
1335       txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
1336       txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
1337       txd->dPdx[c].set(NULL);
1338       txd->dPdy[c].set(NULL);
1339    }
1340
1341    // In this case we have fewer than 4 "real" arguments, which means that
1342    // handleTEX didn't apply any padding. However we have to make sure that
1343    // the second "group" of arguments still gets padded up to 4.
1344    if (chipset >= NVISA_GK104_CHIPSET) {
1345       int s = arg + 2 * dim;
1346       if (s >= 4 && s < 7) {
1347          if (txd->srcExists(s)) // move potential predicate out of the way
1348             txd->moveSources(s, 7 - s);
1349          while (s < 7)
1350             txd->setSrc(s++, bld.loadImm(NULL, 0));
1351       }
1352    }
1353
1354    return true;
1355 }
1356
1357 bool
1358 NVC0LoweringPass::handleTXQ(TexInstruction *txq)
1359 {
1360    const int chipset = prog->getTarget()->getChipset();
1361    if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)
1362       txq->tex.r += prog->driver->io.texBindBase / 4;
1363
1364    if (txq->tex.rIndirectSrc < 0)
1365       return true;
1366
1367    Value *ticRel = txq->getIndirectR();
1368
1369    txq->setIndirectS(NULL);
1370    txq->tex.sIndirectSrc = -1;
1371
1372    assert(ticRel);
1373
1374    if (chipset < NVISA_GK104_CHIPSET) {
1375       LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
1376
1377       txq->setSrc(txq->tex.rIndirectSrc, NULL);
1378       if (txq->tex.r)
1379          ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
1380                              ticRel, bld.mkImm(txq->tex.r));
1381
1382       bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));
1383
1384       txq->moveSources(0, 1);
1385       txq->setSrc(0, src);
1386    } else {
1387       Value *hnd = loadTexHandle(txq->getIndirectR(), txq->tex.r);
1388       txq->tex.r = 0xff;
1389       txq->tex.s = 0x1f;
1390
1391       txq->setIndirectR(NULL);
1392       txq->moveSources(0, 1);
1393       txq->setSrc(0, hnd);
1394       txq->tex.rIndirectSrc = 0;
1395    }
1396
1397    return true;
1398 }
1399
1400 bool
1401 NVC0LoweringPass::handleTXLQ(TexInstruction *i)
1402 {
1403    /* The outputs are inverted compared to what the TGSI instruction
1404     * expects. Take that into account in the mask.
1405     */
1406    assert((i->tex.mask & ~3) == 0);
1407    if (i->tex.mask == 1)
1408       i->tex.mask = 2;
1409    else if (i->tex.mask == 2)
1410       i->tex.mask = 1;
1411    handleTEX(i);
1412    bld.setPosition(i, true);
1413
1414    /* The returned values are not quite what we want:
1415     * (a) convert from s16/u16 to f32
1416     * (b) multiply by 1/256
1417     */
1418    for (int def = 0; def < 2; ++def) {
1419       if (!i->defExists(def))
1420          continue;
1421       enum DataType type = TYPE_S16;
1422       if (i->tex.mask == 2 || def > 0)
1423          type = TYPE_U16;
1424       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));
1425       bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
1426                 i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
1427    }
1428    if (i->tex.mask == 3) {
1429       LValue *t = new_LValue(func, FILE_GPR);
1430       bld.mkMov(t, i->getDef(0));
1431       bld.mkMov(i->getDef(0), i->getDef(1));
1432       bld.mkMov(i->getDef(1), t);
1433    }
1434    return true;
1435 }
1436
1437 bool
1438 NVC0LoweringPass::handleBUFQ(Instruction *bufq)
1439 {
1440    bufq->op = OP_MOV;
1441    bufq->setSrc(0, loadBufLength32(bufq->getIndirect(0, 1),
1442                                    bufq->getSrc(0)->reg.fileIndex * 16));
1443    bufq->setIndirect(0, 0, NULL);
1444    bufq->setIndirect(0, 1, NULL);
1445    return true;
1446 }
1447
1448 void
1449 NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)
1450 {
1451    assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
1452
1453    BasicBlock *currBB = atom->bb;
1454    BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
1455    BasicBlock *joinBB = atom->bb->splitAfter(atom);
1456    BasicBlock *setAndUnlockBB = new BasicBlock(func);
1457    BasicBlock *failLockBB = new BasicBlock(func);
1458
1459    bld.setPosition(currBB, true);
1460    assert(!currBB->joinAt);
1461    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
1462
1463    CmpInstruction *pred =
1464       bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
1465                 TYPE_U32, bld.mkImm(0), bld.mkImm(1));
1466
1467    bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
1468    currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);
1469
1470    bld.setPosition(tryLockBB, true);
1471
1472    Instruction *ld =
1473       bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
1474                  atom->getIndirect(0, 0));
1475    ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
1476    ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
1477
1478    bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));
1479    bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
1480    tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
1481    tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);
1482
1483    tryLockBB->cfg.detach(&joinBB->cfg);
1484    bld.remove(atom);
1485
1486    bld.setPosition(setAndUnlockBB, true);
1487    Value *stVal;
1488    if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
1489       // Read the old value, and write the new one.
1490       stVal = atom->getSrc(1);
1491    } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
1492       CmpInstruction *set =
1493          bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),
1494                    TYPE_U32, ld->getDef(0), atom->getSrc(1));
1495
1496       bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),
1497                 TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));
1498    } else {
1499       operation op;
1500
1501       switch (atom->subOp) {
1502       case NV50_IR_SUBOP_ATOM_ADD:
1503          op = OP_ADD;
1504          break;
1505       case NV50_IR_SUBOP_ATOM_AND:
1506          op = OP_AND;
1507          break;
1508       case NV50_IR_SUBOP_ATOM_OR:
1509          op = OP_OR;
1510          break;
1511       case NV50_IR_SUBOP_ATOM_XOR:
1512          op = OP_XOR;
1513          break;
1514       case NV50_IR_SUBOP_ATOM_MIN:
1515          op = OP_MIN;
1516          break;
1517       case NV50_IR_SUBOP_ATOM_MAX:
1518          op = OP_MAX;
1519          break;
1520       default:
1521          assert(0);
1522          return;
1523       }
1524
1525       stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),
1526                          atom->getSrc(1));
1527    }
1528
1529    Instruction *st =
1530       bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
1531                   atom->getIndirect(0, 0), stVal);
1532    st->setDef(0, pred->getDef(0));
1533    st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
1534
1535    bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
1536    setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);
1537
1538    // Lock until the store has not been performed.
1539    bld.setPosition(failLockBB, true);
1540    bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));
1541    bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
1542    failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
1543    failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);
1544
1545    bld.setPosition(joinBB, false);
1546    bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
1547 }
1548
1549 void
1550 NVC0LoweringPass::handleSharedATOM(Instruction *atom)
1551 {
1552    assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
1553
1554    BasicBlock *currBB = atom->bb;
1555    BasicBlock *tryLockAndSetBB = atom->bb->splitBefore(atom, false);
1556    BasicBlock *joinBB = atom->bb->splitAfter(atom);
1557
1558    bld.setPosition(currBB, true);
1559    assert(!currBB->joinAt);
1560    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
1561
1562    bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_ALWAYS, NULL);
1563    currBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::TREE);
1564
1565    bld.setPosition(tryLockAndSetBB, true);
1566
1567    Instruction *ld =
1568       bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
1569                  atom->getIndirect(0, 0));
1570    ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
1571    ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
1572
1573    Value *stVal;
1574    if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
1575       // Read the old value, and write the new one.
1576       stVal = atom->getSrc(1);
1577    } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
1578       CmpInstruction *set =
1579          bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
1580                    TYPE_U32, ld->getDef(0), atom->getSrc(1));
1581       set->setPredicate(CC_P, ld->getDef(1));
1582
1583       Instruction *selp =
1584          bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), ld->getDef(0),
1585                    atom->getSrc(2), set->getDef(0));
1586       selp->src(2).mod = Modifier(NV50_IR_MOD_NOT);
1587       selp->setPredicate(CC_P, ld->getDef(1));
1588
1589       stVal = selp->getDef(0);
1590    } else {
1591       operation op;
1592
1593       switch (atom->subOp) {
1594       case NV50_IR_SUBOP_ATOM_ADD:
1595          op = OP_ADD;
1596          break;
1597       case NV50_IR_SUBOP_ATOM_AND:
1598          op = OP_AND;
1599          break;
1600       case NV50_IR_SUBOP_ATOM_OR:
1601          op = OP_OR;
1602          break;
1603       case NV50_IR_SUBOP_ATOM_XOR:
1604          op = OP_XOR;
1605          break;
1606       case NV50_IR_SUBOP_ATOM_MIN:
1607          op = OP_MIN;
1608          break;
1609       case NV50_IR_SUBOP_ATOM_MAX:
1610          op = OP_MAX;
1611          break;
1612       default:
1613          assert(0);
1614          return;
1615       }
1616
1617       Instruction *i =
1618          bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
1619                    atom->getSrc(1));
1620       i->setPredicate(CC_P, ld->getDef(1));
1621
1622       stVal = i->getDef(0);
1623    }
1624
1625    Instruction *st =
1626       bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
1627                   atom->getIndirect(0, 0), stVal);
1628    st->setPredicate(CC_P, ld->getDef(1));
1629    st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
1630
1631    // Loop until the lock is acquired.
1632    bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_NOT_P, ld->getDef(1));
1633    tryLockAndSetBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::BACK);
1634    tryLockAndSetBB->cfg.attach(&joinBB->cfg, Graph::Edge::CROSS);
1635    bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
1636
1637    bld.remove(atom);
1638
1639    bld.setPosition(joinBB, false);
1640    bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
1641 }
1642
1643 bool
1644 NVC0LoweringPass::handleATOM(Instruction *atom)
1645 {
1646    SVSemantic sv;
1647    Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base;
1648
1649    switch (atom->src(0).getFile()) {
1650    case FILE_MEMORY_LOCAL:
1651       sv = SV_LBASE;
1652       break;
1653    case FILE_MEMORY_SHARED:
1654       // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
1655       // operations on shared memory. For Maxwell, ATOMS is enough.
1656       if (targ->getChipset() < NVISA_GK104_CHIPSET)
1657          handleSharedATOM(atom);
1658       else if (targ->getChipset() < NVISA_GM107_CHIPSET)
1659          handleSharedATOMNVE4(atom);
1660       return true;
1661    case FILE_MEMORY_GLOBAL:
1662       return true;
1663    default:
1664       assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER);
1665       base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
1666       assert(base->reg.size == 8);
1667       if (ptr)
1668          base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
1669       assert(base->reg.size == 8);
1670       atom->setIndirect(0, 0, base);
1671       atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
1672
1673       // Harden against out-of-bounds accesses
1674       Value *offset = bld.loadImm(NULL, atom->getSrc(0)->reg.data.offset + typeSizeof(atom->sType));
1675       Value *length = loadBufLength32(ind, atom->getSrc(0)->reg.fileIndex * 16);
1676       Value *pred = new_LValue(func, FILE_PREDICATE);
1677       if (ptr)
1678          bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, ptr);
1679       bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
1680       atom->setPredicate(CC_NOT_P, pred);
1681       if (atom->defExists(0)) {
1682          Value *zero, *dst = atom->getDef(0);
1683          atom->setDef(0, bld.getSSA());
1684
1685          bld.setPosition(atom, true);
1686          bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
1687             ->setPredicate(CC_P, pred);
1688          bld.mkOp2(OP_UNION, TYPE_U32, dst, atom->getDef(0), zero);
1689       }
1690
1691       return true;
1692    }
1693    base =
1694       bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
1695
1696    atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
1697    atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
1698    if (ptr)
1699       base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
1700    atom->setIndirect(0, 1, NULL);
1701    atom->setIndirect(0, 0, base);
1702
1703    return true;
1704 }
1705
1706 bool
1707 NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
1708 {
1709    if (targ->getChipset() < NVISA_GM107_CHIPSET) {
1710       if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
1711          // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
1712          return false;
1713       }
1714    }
1715
1716    if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
1717        cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
1718       return false;
1719    bld.setPosition(cas, true);
1720
1721    if (needCctl) {
1722       Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
1723       cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
1724       cctl->fixed = 1;
1725       cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
1726       if (cas->isPredicated())
1727          cctl->setPredicate(cas->cc, cas->getPredicate());
1728    }
1729
1730    if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS &&
1731        targ->getChipset() < NVISA_GV100_CHIPSET) {
1732       // CAS is crazy. It's 2nd source is a double reg, and the 3rd source
1733       // should be set to the high part of the double reg or bad things will
1734       // happen elsewhere in the universe.
1735       // Also, it sometimes returns the new value instead of the old one
1736       // under mysterious circumstances.
1737       Value *dreg = bld.getSSA(8);
1738       bld.setPosition(cas, false);
1739       bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
1740       cas->setSrc(1, dreg);
1741       cas->setSrc(2, dreg);
1742    }
1743
1744    return true;
1745 }
1746
1747 inline Value *
1748 NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
1749 {
1750    uint8_t b = prog->driver->io.auxCBSlot;
1751    off += base;
1752
1753    return bld.
1754       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
1755 }
1756
1757 inline Value *
1758 NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
1759 {
1760    uint8_t b = prog->driver->io.auxCBSlot;
1761    off += base;
1762
1763    if (ptr)
1764       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
1765
1766    return bld.
1767       mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);
1768 }
1769
1770 inline Value *
1771 NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
1772 {
1773    uint8_t b = prog->driver->io.auxCBSlot;
1774    off += base;
1775
1776    if (ptr)
1777       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
1778
1779    return bld.
1780       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
1781 }
1782
1783 inline Value *
1784 NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
1785 {
1786    return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
1787 }
1788
1789 inline Value *
1790 NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
1791 {
1792    return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
1793 }
1794
1795 inline Value *
1796 NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
1797 {
1798    return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
1799 }
1800
1801 inline Value *
1802 NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
1803 {
1804    return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
1805 }
1806
1807 inline Value *
1808 NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
1809 {
1810    uint8_t b = prog->driver->io.msInfoCBSlot;
1811    off += prog->driver->io.msInfoBase;
1812    return bld.
1813       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
1814 }
1815
1816 inline Value *
1817 NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless)
1818 {
1819    uint32_t base = slot * NVC0_SU_INFO__STRIDE;
1820
1821    // We don't upload surface info for bindless for GM107+
1822    assert(!bindless || targ->getChipset() < NVISA_GM107_CHIPSET);
1823
1824    if (ptr) {
1825       ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));
1826       if (bindless)
1827          ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(511));
1828       else
1829          ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
1830       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6));
1831       base = 0;
1832    }
1833    off += base;
1834
1835    return loadResInfo32(ptr, off, bindless ? prog->driver->io.bindlessBase :
1836                         prog->driver->io.suInfoBase);
1837 }
1838
1839 Value *
1840 NVC0LoweringPass::loadMsAdjInfo32(TexInstruction::Target target, uint32_t index, int slot, Value *ind, bool bindless)
1841 {
1842    if (!bindless || targ->getChipset() < NVISA_GM107_CHIPSET)
1843       return loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(index), bindless);
1844
1845    assert(bindless);
1846
1847    Value *samples = bld.getSSA();
1848    // this shouldn't be lowered because it's being inserted before the current instruction
1849    TexInstruction *tex = new_TexInstruction(func, OP_TXQ);
1850    tex->tex.target = target;
1851    tex->tex.query = TXQ_TYPE;
1852    tex->tex.mask = 0x4;
1853    tex->tex.r = 0xff;
1854    tex->tex.s = 0x1f;
1855    tex->tex.rIndirectSrc = 0;
1856    tex->setDef(0, samples);
1857    tex->setSrc(0, ind);
1858    tex->setSrc(1, bld.loadImm(NULL, 0));
1859    bld.insert(tex);
1860
1861    // doesn't work with sample counts other than 1/2/4/8 but they aren't supported
1862    switch (index) {
1863    case 0: {
1864       Value *tmp = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), samples, bld.mkImm(2));
1865       return bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(2));
1866    }
1867    case 1: {
1868       Value *tmp = bld.mkCmp(OP_SET, CC_GT, TYPE_U32, bld.getSSA(), TYPE_U32, samples, bld.mkImm(2))->getDef(0);
1869       return bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(1));
1870    }
1871    default: {
1872       assert(false);
1873       return NULL;
1874    }
1875    }
1876 }
1877
1878 static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
1879 {
1880    switch (su->tex.target.getEnum()) {
1881    case TEX_TARGET_BUFFER:      return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
1882    case TEX_TARGET_RECT:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1883    case TEX_TARGET_1D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1884    case TEX_TARGET_1D_ARRAY:    return (c == 1) ?
1885                                    NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
1886                                    NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1887    case TEX_TARGET_2D:          return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1888    case TEX_TARGET_2D_MS:       return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1889    case TEX_TARGET_2D_ARRAY:    return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1890    case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1891    case TEX_TARGET_3D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1892    case TEX_TARGET_CUBE:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1893    case TEX_TARGET_CUBE_ARRAY:  return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1894    default:
1895       assert(0);
1896       return 0;
1897    }
1898 }
1899
1900 bool
1901 NVC0LoweringPass::handleSUQ(TexInstruction *suq)
1902 {
1903    int mask = suq->tex.mask;
1904    int dim = suq->tex.target.getDim();
1905    int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
1906    Value *ind = suq->getIndirectR();
1907    int slot = suq->tex.r;
1908    int c, d;
1909
1910    for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
1911       if (c >= arg || !(mask & 1))
1912          continue;
1913
1914       int offset;
1915
1916       if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
1917          offset = NVC0_SU_INFO_SIZE(2);
1918       } else {
1919          offset = NVC0_SU_INFO_SIZE(c);
1920       }
1921       bld.mkMov(suq->getDef(d++), loadSuInfo32(ind, slot, offset, suq->tex.bindless));
1922       if (c == 2 && suq->tex.target.isCube())
1923          bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
1924                    bld.loadImm(NULL, 6));
1925    }
1926
1927    if (mask & 1) {
1928       if (suq->tex.target.isMS()) {
1929          Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), suq->tex.bindless);
1930          Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), suq->tex.bindless);
1931          Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
1932          bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
1933       } else {
1934          bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
1935       }
1936    }
1937
1938    bld.remove(suq);
1939    return true;
1940 }
1941
1942 void
1943 NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
1944 {
1945    const int arg = tex->tex.target.getArgCount();
1946    int slot = tex->tex.r;
1947
1948    if (tex->tex.target == TEX_TARGET_2D_MS)
1949       tex->tex.target = TEX_TARGET_2D;
1950    else
1951    if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
1952       tex->tex.target = TEX_TARGET_2D_ARRAY;
1953    else
1954       return;
1955
1956    Value *x = tex->getSrc(0);
1957    Value *y = tex->getSrc(1);
1958    Value *s = tex->getSrc(arg - 1);
1959
1960    Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
1961    Value *ind = tex->getIndirectR();
1962
1963    Value *ms_x = loadMsAdjInfo32(tex->tex.target, 0, slot, ind, tex->tex.bindless);
1964    Value *ms_y = loadMsAdjInfo32(tex->tex.target, 1, slot, ind, tex->tex.bindless);
1965
1966    bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
1967    bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
1968
1969    s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
1970    s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));
1971
1972    Value *dx = loadMsInfo32(ts, 0x0);
1973    Value *dy = loadMsInfo32(ts, 0x4);
1974
1975    bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
1976    bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
1977
1978    tex->setSrc(0, tx);
1979    tex->setSrc(1, ty);
1980    tex->moveSources(arg, -1);
1981 }
1982
1983 // Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
1984 // They're computed from the coordinates using the surface info in c[] space.
1985 void
1986 NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
1987 {
1988    Instruction *insn;
1989    const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
1990    const bool raw =
1991       su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
1992    const int slot = su->tex.r;
1993    const int dim = su->tex.target.getDim();
1994    const bool array = su->tex.target.isArray() || su->tex.target.isCube();
1995    const int arg = dim + array;
1996    int c;
1997    Value *zero = bld.mkImm(0);
1998    Value *p1 = NULL;
1999    Value *v;
2000    Value *src[3];
2001    Value *bf, *eau, *off;
2002    Value *addr, *pred;
2003    Value *ind = su->getIndirectR();
2004    Value *y, *z;
2005
2006    off = bld.getScratch(4);
2007    bf = bld.getScratch(4);
2008    addr = bld.getSSA(8);
2009    pred = bld.getScratch(1, FILE_PREDICATE);
2010
2011    bld.setPosition(su, false);
2012
2013    adjustCoordinatesMS(su);
2014
2015    // calculate clamped coordinates
2016    for (c = 0; c < arg; ++c) {
2017       int dimc = c;
2018
2019       if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) {
2020          // The array index is stored in the Z component for 1D arrays.
2021          dimc = 2;
2022       }
2023
2024       src[c] = bld.getScratch();
2025       if (c == 0 && raw)
2026          v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X, su->tex.bindless);
2027       else
2028          v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc), su->tex.bindless);
2029       bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
2030          ->subOp = getSuClampSubOp(su, dimc);
2031    }
2032    for (; c < 3; ++c)
2033       src[c] = zero;
2034
2035    if (dim == 2 && !array) {
2036       v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
2037       src[2] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),
2038                           v, bld.loadImm(NULL, 16));
2039
2040       v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(2), su->tex.bindless);
2041       bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[2], src[2], v, zero)
2042          ->subOp = NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
2043    }
2044
2045    // set predicate output
2046    if (su->tex.target == TEX_TARGET_BUFFER) {
2047       src[0]->getInsn()->setFlagsDef(1, pred);
2048    } else
2049    if (array) {
2050       p1 = bld.getSSA(1, FILE_PREDICATE);
2051       src[dim]->getInsn()->setFlagsDef(1, p1);
2052    }
2053
2054    // calculate pixel offset
2055    if (dim == 1) {
2056       y = z = zero;
2057       if (su->tex.target != TEX_TARGET_BUFFER)
2058          bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
2059    } else {
2060       y = src[1];
2061       z = src[2];
2062
2063       v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
2064       bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
2065          ->subOp = NV50_IR_SUBOP_MADSP(4,4,8); // u16l u16l u16l
2066
2067       v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
2068       bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
2069          ->subOp = array ?
2070          NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
2071    }
2072
2073    // calculate effective address part 1
2074    if (su->tex.target == TEX_TARGET_BUFFER) {
2075       if (raw) {
2076          bf = src[0];
2077       } else {
2078          v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);
2079          bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
2080             ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
2081       }
2082    } else {
2083       uint16_t subOp = 0;
2084
2085       switch (dim) {
2086       case 1:
2087          break;
2088       case 2:
2089          if (array) {
2090             z = off;
2091          } else {
2092             subOp = NV50_IR_SUBOP_SUBFM_3D;
2093          }
2094          break;
2095       default:
2096          subOp = NV50_IR_SUBOP_SUBFM_3D;
2097          assert(dim == 3);
2098          break;
2099       }
2100       insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
2101       insn->subOp = subOp;
2102       insn->setFlagsDef(1, pred);
2103    }
2104
2105    // part 2
2106    v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless);
2107
2108    if (su->tex.target == TEX_TARGET_BUFFER) {
2109       eau = v;
2110    } else {
2111       eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
2112    }
2113    // add array layer offset
2114    if (array) {
2115       v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
2116       if (dim == 1)
2117          bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
2118             ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
2119       else
2120          bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
2121             ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
2122       // combine predicates
2123       assert(p1);
2124       bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
2125    }
2126
2127    if (atom) {
2128       Value *lo = bf;
2129       if (su->tex.target == TEX_TARGET_BUFFER) {
2130          lo = zero;
2131          bld.mkMov(off, bf);
2132       }
2133       //  bf == g[] address & 0xff
2134       // eau == g[] address >> 8
2135       bld.mkOp3(OP_PERMT, TYPE_U32,  bf,   lo, bld.loadImm(NULL, 0x6540), eau);
2136       bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
2137    } else
2138    if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
2139       // Convert from u32 to u8 address format, which is what the library code
2140       // doing SULDP currently uses.
2141       // XXX: can SUEAU do this ?
2142       // XXX: does it matter that we don't mask high bytes in bf ?
2143       // Grrr.
2144       bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
2145       bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
2146    }
2147
2148    bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);
2149
2150    if (atom && su->tex.target == TEX_TARGET_BUFFER)
2151       bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);
2152
2153    // let's just set it 0 for raw access and hope it works
2154    v = raw ?
2155       bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);
2156
2157    // get rid of old coordinate sources, make space for fmt info and predicate
2158    su->moveSources(arg, 3 - arg);
2159    // set 64 bit address and 32-bit format sources
2160    su->setSrc(0, addr);
2161    su->setSrc(1, v);
2162    su->setSrc(2, pred);
2163    su->setIndirectR(NULL);
2164
2165    // prevent read fault when the image is not actually bound
2166    CmpInstruction *pred1 =
2167       bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2168                 TYPE_U32, bld.mkImm(0),
2169                 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2170
2171    if (su->op != OP_SUSTP && su->tex.format) {
2172       const TexInstruction::ImgFormatDesc *format = su->tex.format;
2173       int blockwidth = format->bits[0] + format->bits[1] +
2174                        format->bits[2] + format->bits[3];
2175
2176       // make sure that the format doesn't mismatch
2177       assert(format->components != 0);
2178       bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0),
2179                 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2180                 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2181                 pred1->getDef(0));
2182    }
2183    su->setPredicate(CC_NOT_P, pred1->getDef(0));
2184
2185    // TODO: initialize def values to 0 when the surface operation is not
2186    // performed (not needed for stores). Also, fix the "address bounds test"
2187    // subtests from arb_shader_image_load_store-invalid for buffers, because it
2188    // seems like that the predicate is not correctly set by suclamp.
2189 }
2190
2191 static DataType
2192 getSrcType(const TexInstruction::ImgFormatDesc *t, int c)
2193 {
2194    switch (t->type) {
2195    case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
2196    case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
2197    case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
2198    case UINT:
2199       return (t->bits[c] == 8 ? TYPE_U8 :
2200               (t->bits[c] == 16 ? TYPE_U16 : TYPE_U32));
2201    case SINT:
2202       return (t->bits[c] == 8 ? TYPE_S8 :
2203               (t->bits[c] == 16 ? TYPE_S16 : TYPE_S32));
2204    }
2205    return TYPE_NONE;
2206 }
2207
2208 static DataType
2209 getDestType(const ImgType type) {
2210    switch (type) {
2211    case FLOAT:
2212    case UNORM:
2213    case SNORM:
2214       return TYPE_F32;
2215    case UINT:
2216       return TYPE_U32;
2217    case SINT:
2218       return TYPE_S32;
2219    default:
2220       assert(!"Impossible type");
2221       return TYPE_NONE;
2222    }
2223 }
2224
2225 void
2226 NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su, Instruction **loaded)
2227 {
2228    const TexInstruction::ImgFormatDesc *format = su->tex.format;
2229    int width = format->bits[0] + format->bits[1] +
2230       format->bits[2] + format->bits[3];
2231    Value *untypedDst[4] = {};
2232    Value *typedDst[4] = {};
2233
2234    // We must convert this to a generic load.
2235    su->op = OP_SULDB;
2236
2237    su->dType = typeOfSize(width / 8);
2238    su->sType = TYPE_U8;
2239
2240    for (int i = 0; i < width / 32; i++)
2241       untypedDst[i] = bld.getSSA();
2242    if (width < 32)
2243       untypedDst[0] = bld.getSSA();
2244
2245    if (loaded && loaded[0]) {
2246       for (int i = 0; i < 4; i++) {
2247          if (loaded[i])
2248             typedDst[i] = loaded[i]->getDef(0);
2249       }
2250    } else {
2251       for (int i = 0; i < 4; i++) {
2252          typedDst[i] = su->getDef(i);
2253       }
2254    }
2255
2256    // Set the untyped dsts as the su's destinations
2257    if (loaded && loaded[0]) {
2258       for (int i = 0; i < 4; i++)
2259          if (loaded[i])
2260             loaded[i]->setDef(0, untypedDst[i]);
2261    } else {
2262       for (int i = 0; i < 4; i++)
2263          su->setDef(i, untypedDst[i]);
2264
2265       bld.setPosition(su, true);
2266    }
2267
2268    // Unpack each component into the typed dsts
2269    int bits = 0;
2270    for (int i = 0; i < 4; bits += format->bits[i], i++) {
2271       if (!typedDst[i])
2272          continue;
2273
2274       if (loaded && loaded[0])
2275          bld.setPosition(loaded[i], true);
2276
2277       if (i >= format->components) {
2278          if (format->type == FLOAT ||
2279              format->type == UNORM ||
2280              format->type == SNORM)
2281             bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
2282          else
2283             bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
2284          continue;
2285       }
2286
2287       // Get just that component's data into the relevant place
2288       if (format->bits[i] == 32)
2289          bld.mkMov(typedDst[i], untypedDst[i]);
2290       else if (format->bits[i] == 16)
2291          bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
2292                    getSrcType(format, i), untypedDst[i / 2])
2293          ->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1);
2294       else if (format->bits[i] == 8)
2295          bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
2296                    getSrcType(format, i), untypedDst[0])->subOp = i;
2297       else {
2298          bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32],
2299                    bld.mkImm((bits % 32) | (format->bits[i] << 8)));
2300          if (format->type == UNORM || format->type == SNORM)
2301             bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i), typedDst[i]);
2302       }
2303
2304       // Normalize / convert as necessary
2305       if (format->type == UNORM)
2306          bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
2307       else if (format->type == SNORM)
2308          bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
2309       else if (format->type == FLOAT && format->bits[i] < 16) {
2310          bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
2311          bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]);
2312       }
2313    }
2314
2315    if (format->bgra) {
2316       std::swap(typedDst[0], typedDst[2]);
2317    }
2318 }
2319
2320 void
2321 NVC0LoweringPass::insertOOBSurfaceOpResult(TexInstruction *su)
2322 {
2323    if (!su->getPredicate())
2324       return;
2325
2326    bld.setPosition(su, true);
2327
2328    for (unsigned i = 0; su->defExists(i); ++i) {
2329       ValueDef &def = su->def(i);
2330
2331       Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2332       assert(su->cc == CC_NOT_P);
2333       mov->setPredicate(CC_P, su->getPredicate());
2334       Instruction *uni = bld.mkOp2(OP_UNION, TYPE_U32, bld.getSSA(), NULL, mov->getDef(0));
2335
2336       def.replace(uni->getDef(0), false);
2337       uni->setSrc(0, def.get());
2338    }
2339 }
2340
2341 void
2342 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
2343 {
2344    processSurfaceCoordsNVE4(su);
2345
2346    if (su->op == OP_SULDP) {
2347       convertSurfaceFormat(su, NULL);
2348       insertOOBSurfaceOpResult(su);
2349    }
2350
2351    if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
2352       assert(su->getPredicate());
2353       Value *pred =
2354          bld.mkOp2v(OP_OR, TYPE_U8, bld.getScratch(1, FILE_PREDICATE),
2355                     su->getPredicate(), su->getSrc(2));
2356
2357       Instruction *red = bld.mkOp(OP_ATOM, su->dType, bld.getSSA());
2358       red->subOp = su->subOp;
2359       red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0));
2360       red->setSrc(1, su->getSrc(3));
2361       if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
2362          red->setSrc(2, su->getSrc(4));
2363       red->setIndirect(0, 0, su->getSrc(0));
2364
2365       // make sure to initialize dst value when the atomic operation is not
2366       // performed
2367       Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2368
2369       assert(su->cc == CC_NOT_P);
2370       red->setPredicate(su->cc, pred);
2371       mov->setPredicate(CC_P, pred);
2372
2373       bld.mkOp2(OP_UNION, TYPE_U32, su->getDef(0),
2374                 red->getDef(0), mov->getDef(0));
2375
2376       delete_Instruction(bld.getProgram(), su);
2377       handleCasExch(red, true);
2378    }
2379
2380    if (su->op == OP_SUSTB || su->op == OP_SUSTP)
2381       su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
2382 }
2383
2384 void
2385 NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)
2386 {
2387    const int slot = su->tex.r;
2388    const int dim = su->tex.target.getDim();
2389    const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2390    int c;
2391    Value *zero = bld.mkImm(0);
2392    Value *src[3];
2393    Value *v;
2394    Value *ind = su->getIndirectR();
2395
2396    bld.setPosition(su, false);
2397
2398    adjustCoordinatesMS(su);
2399
2400    if (ind) {
2401       Value *ptr;
2402       ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind, bld.mkImm(su->tex.r));
2403       ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
2404       su->setIndirectR(ptr);
2405    }
2406
2407    // get surface coordinates
2408    for (c = 0; c < arg; ++c)
2409       src[c] = su->getSrc(c);
2410    for (; c < 3; ++c)
2411       src[c] = zero;
2412
2413    // calculate pixel offset
2414    if (su->op == OP_SULDP || su->op == OP_SUREDP) {
2415       v = loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless);
2416       su->setSrc(0, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[0], v));
2417    }
2418
2419    // add array layer offset
2420    if (su->tex.target.isArray() || su->tex.target.isCube()) {
2421       v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
2422       assert(dim > 1);
2423       su->setSrc(2, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v));
2424    }
2425
2426    // prevent read fault when the image is not actually bound
2427    CmpInstruction *pred =
2428       bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2429                 TYPE_U32, bld.mkImm(0),
2430                 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2431    if (su->op != OP_SUSTP && su->tex.format) {
2432       const TexInstruction::ImgFormatDesc *format = su->tex.format;
2433       int blockwidth = format->bits[0] + format->bits[1] +
2434                        format->bits[2] + format->bits[3];
2435
2436       assert(format->components != 0);
2437       // make sure that the format doesn't mismatch when it's not FMT_NONE
2438       bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
2439                 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2440                 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2441                 pred->getDef(0));
2442    }
2443    su->setPredicate(CC_NOT_P, pred->getDef(0));
2444 }
2445
2446 void
2447 NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
2448 {
2449    if (su->tex.target == TEX_TARGET_1D_ARRAY) {
2450       /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY
2451        * will simplify the lowering pass and the texture constraints. */
2452       su->moveSources(1, 1);
2453       su->setSrc(1, bld.loadImm(NULL, 0));
2454       su->tex.target = TEX_TARGET_2D_ARRAY;
2455    }
2456
2457    processSurfaceCoordsNVC0(su);
2458
2459    if (su->op == OP_SULDP) {
2460       convertSurfaceFormat(su, NULL);
2461       insertOOBSurfaceOpResult(su);
2462    }
2463
2464    if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
2465       const int dim = su->tex.target.getDim();
2466       const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2467       LValue *addr = bld.getSSA(8);
2468       Value *def = su->getDef(0);
2469
2470       su->op = OP_SULEA;
2471
2472       // Set the destination to the address
2473       su->dType = TYPE_U64;
2474       su->setDef(0, addr);
2475       su->setDef(1, su->getPredicate());
2476
2477       bld.setPosition(su, true);
2478
2479       // Perform the atomic op
2480       Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA());
2481       red->subOp = su->subOp;
2482       red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0));
2483       red->setSrc(1, su->getSrc(arg));
2484       if (red->subOp == NV50_IR_SUBOP_ATOM_CAS)
2485          red->setSrc(2, su->getSrc(arg + 1));
2486       red->setIndirect(0, 0, addr);
2487
2488       // make sure to initialize dst value when the atomic operation is not
2489       // performed
2490       Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2491
2492       assert(su->cc == CC_NOT_P);
2493       red->setPredicate(su->cc, su->getPredicate());
2494       mov->setPredicate(CC_P, su->getPredicate());
2495
2496       bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0));
2497
2498       handleCasExch(red, false);
2499    }
2500 }
2501
2502 TexInstruction *
2503 NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su, Instruction *ret[4])
2504 {
2505    const int slot = su->tex.r;
2506    const int dim = su->tex.target.getDim();
2507    const bool array = su->tex.target.isArray() || su->tex.target.isCube();
2508    const int arg = dim + array;
2509    Value *ind = su->getIndirectR();
2510    Value *handle;
2511    Instruction *pred = NULL, *pred2d = NULL;
2512    int pos = 0;
2513
2514    bld.setPosition(su, false);
2515
2516    adjustCoordinatesMS(su);
2517
2518    // add texture handle
2519    switch (su->op) {
2520    case OP_SUSTP:
2521       pos = 4;
2522       break;
2523    case OP_SUREDP:
2524       pos = (su->subOp == NV50_IR_SUBOP_ATOM_CAS) ? 2 : 1;
2525       break;
2526    default:
2527       assert(pos == 0);
2528       break;
2529    }
2530
2531    if (dim == 2 && !array) {
2532       // This might be a 2d slice of a 3d texture, try to load the z
2533       // coordinate in.
2534       Value *v;
2535       if (!su->tex.bindless)
2536          v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
2537       else
2538          v = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), ind, bld.mkImm(11));
2539       Value *is_3d = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), v, bld.mkImm(1));
2540       pred2d = bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2541                          TYPE_U32, bld.mkImm(0), is_3d);
2542
2543       bld.mkOp2(OP_SHR, TYPE_U32, v, v, bld.loadImm(NULL, 16));
2544       su->moveSources(dim, 1);
2545       su->setSrc(dim, v);
2546       su->tex.target = nv50_ir::TEX_TARGET_3D;
2547       pos++;
2548    }
2549
2550    if (su->tex.bindless)
2551       handle = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ind, bld.mkImm(2047));
2552    else
2553       handle = loadTexHandle(ind, slot + 32);
2554
2555    su->setSrc(arg + pos, handle);
2556
2557    // The address check doesn't make sense here. The format check could make
2558    // sense but it's a bit of a pain.
2559    if (!su->tex.bindless) {
2560       // prevent read fault when the image is not actually bound
2561       pred =
2562          bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2563                    TYPE_U32, bld.mkImm(0),
2564                    loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2565       if (su->op != OP_SUSTP && su->tex.format) {
2566          const TexInstruction::ImgFormatDesc *format = su->tex.format;
2567          int blockwidth = format->bits[0] + format->bits[1] +
2568             format->bits[2] + format->bits[3];
2569
2570          assert(format->components != 0);
2571          // make sure that the format doesn't mismatch when it's not FMT_NONE
2572          bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
2573                    TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2574                    loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2575                    pred->getDef(0));
2576       }
2577    }
2578
2579    // Now we have "pred" which (optionally) contains whether to do the surface
2580    // op at all, and a "pred2d" which indicates that, in case of doing the
2581    // surface op, we have to create a 2d and 3d version, conditioned on pred2d.
2582    TexInstruction *su2d = NULL;
2583    if (pred2d) {
2584       su2d = cloneForward(func, su)->asTex();
2585       for (unsigned i = 0; su->defExists(i); ++i)
2586          su2d->setDef(i, bld.getSSA());
2587       su2d->moveSources(dim + 1, -1);
2588       su2d->tex.target = nv50_ir::TEX_TARGET_2D;
2589    }
2590    if (pred2d && pred) {
2591       Instruction *pred3d = bld.mkOp2(OP_AND, TYPE_U8,
2592                                       bld.getSSA(1, FILE_PREDICATE),
2593                                       pred->getDef(0), pred2d->getDef(0));
2594       pred3d->src(0).mod = Modifier(NV50_IR_MOD_NOT);
2595       pred3d->src(1).mod = Modifier(NV50_IR_MOD_NOT);
2596       su->setPredicate(CC_P, pred3d->getDef(0));
2597       pred2d = bld.mkOp2(OP_AND, TYPE_U8, bld.getSSA(1, FILE_PREDICATE),
2598                          pred->getDef(0), pred2d->getDef(0));
2599       pred2d->src(0).mod = Modifier(NV50_IR_MOD_NOT);
2600    } else if (pred) {
2601       su->setPredicate(CC_NOT_P, pred->getDef(0));
2602    } else if (pred2d) {
2603       su->setPredicate(CC_NOT_P, pred2d->getDef(0));
2604    }
2605    if (su2d) {
2606       su2d->setPredicate(CC_P, pred2d->getDef(0));
2607       bld.insert(su2d);
2608
2609       // Create a UNION so that RA assigns the same registers
2610       bld.setPosition(su, true);
2611       for (unsigned i = 0; su->defExists(i); ++i) {
2612          assert(i < 4);
2613
2614          ValueDef &def = su->def(i);
2615          ValueDef &def2 = su2d->def(i);
2616          Instruction *mov = NULL;
2617
2618          if (pred) {
2619             mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2620             mov->setPredicate(CC_P, pred->getDef(0));
2621          }
2622
2623          Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,
2624                                       bld.getSSA(),
2625                                       NULL, def2.get());
2626          def.replace(uni->getDef(0), false);
2627          uni->setSrc(0, def.get());
2628          if (mov)
2629             uni->setSrc(2, mov->getDef(0));
2630       }
2631    } else if (pred) {
2632       // Create a UNION so that RA assigns the same registers
2633       bld.setPosition(su, true);
2634       for (unsigned i = 0; su->defExists(i); ++i) {
2635          assert(i < 4);
2636
2637          ValueDef &def = su->def(i);
2638
2639          Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2640          mov->setPredicate(CC_P, pred->getDef(0));
2641
2642          Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32,
2643                                       bld.getSSA(),
2644                                       NULL, mov->getDef(0));
2645          def.replace(uni->getDef(0), false);
2646          uni->setSrc(0, def.get());
2647       }
2648    }
2649
2650    return su2d;
2651 }
2652
2653 void
2654 NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
2655 {
2656    // processSurfaceCoords also takes care of fixing up the outputs and
2657    // union'ing them with 0 as necessary. Additionally it may create a second
2658    // surface which needs some of the similar fixups.
2659
2660    Instruction *loaded[4] = {};
2661    TexInstruction *su2 = processSurfaceCoordsGM107(su, loaded);
2662
2663    if (su->op == OP_SULDP) {
2664       convertSurfaceFormat(su, loaded);
2665    }
2666
2667    if (su->op == OP_SUREDP) {
2668       su->op = OP_SUREDB;
2669    }
2670
2671    // If we fixed up the type of the regular surface load instruction, we also
2672    // have to fix up the copy.
2673    if (su2) {
2674       su2->op = su->op;
2675       su2->dType = su->dType;
2676       su2->sType = su->sType;
2677    }
2678 }
2679
2680 bool
2681 NVC0LoweringPass::handleWRSV(Instruction *i)
2682 {
2683    Instruction *st;
2684    Symbol *sym;
2685    uint32_t addr;
2686
2687    // must replace, $sreg are not writeable
2688    addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
2689    if (addr >= 0x400)
2690       return false;
2691    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
2692
2693    st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
2694                     i->getSrc(1));
2695    st->perPatch = i->perPatch;
2696
2697    bld.getBB()->remove(i);
2698    return true;
2699 }
2700
2701 void
2702 NVC0LoweringPass::handleLDST(Instruction *i)
2703 {
2704    if (i->src(0).getFile() == FILE_SHADER_INPUT) {
2705       if (prog->getType() == Program::TYPE_COMPUTE) {
2706          i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
2707          i->getSrc(0)->reg.fileIndex = 0;
2708       } else
2709       if (prog->getType() == Program::TYPE_GEOMETRY &&
2710           i->src(0).isIndirect(0)) {
2711          // XXX: this assumes vec4 units
2712          Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2713                                  i->getIndirect(0, 0), bld.mkImm(4));
2714          i->setIndirect(0, 0, ptr);
2715          i->op = OP_VFETCH;
2716       } else {
2717          i->op = OP_VFETCH;
2718          assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
2719       }
2720    } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
2721       int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
2722       Value *ind = i->getIndirect(0, 1);
2723
2724       if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
2725           prog->getType() == Program::TYPE_COMPUTE &&
2726           (fileIndex >= 6 || ind)) {
2727          // The launch descriptor only allows to set up 8 CBs, but OpenGL
2728          // requires at least 12 UBOs. To bypass this limitation, for constant
2729          // buffers 7+, we store the addrs into the driver constbuf and we
2730          // directly load from the global memory.
2731          if (ind) {
2732             // Clamp the UBO index when an indirect access is used to avoid
2733             // loading information from the wrong place in the driver cb.
2734             // TODO - synchronize the max with the driver.
2735             ind = bld.mkOp2v(OP_MIN, TYPE_U32, bld.getSSA(),
2736                              bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),
2737                                         ind, bld.loadImm(NULL, fileIndex)),
2738                              bld.loadImm(NULL, 13));
2739             fileIndex = 0;
2740          }
2741
2742          Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
2743          Value *ptr = loadUboInfo64(ind, fileIndex * 16);
2744          Value *length = loadUboLength32(ind, fileIndex * 16);
2745          Value *pred = new_LValue(func, FILE_PREDICATE);
2746          if (i->src(0).isIndirect(0)) {
2747             bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
2748             bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
2749          }
2750          i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
2751          i->setIndirect(0, 1, NULL);
2752          i->setIndirect(0, 0, ptr);
2753          bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
2754          i->setPredicate(CC_NOT_P, pred);
2755          Value *zero, *dst = i->getDef(0);
2756          i->setDef(0, bld.getSSA());
2757
2758          bld.setPosition(i, true);
2759          bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
2760             ->setPredicate(CC_P, pred);
2761          bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
2762       } else if (i->src(0).isIndirect(1)) {
2763          Value *ptr;
2764          if (i->src(0).isIndirect(0))
2765             ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
2766                              i->getIndirect(0, 1), bld.mkImm(0x1010),
2767                              i->getIndirect(0, 0));
2768          else
2769             ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2770                              i->getIndirect(0, 1), bld.mkImm(16));
2771          i->setIndirect(0, 1, NULL);
2772          i->setIndirect(0, 0, ptr);
2773          i->subOp = NV50_IR_SUBOP_LDC_IS;
2774       }
2775    } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
2776       assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
2777       i->op = OP_VFETCH;
2778    } else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) {
2779       Value *ind = i->getIndirect(0, 1);
2780       Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
2781       // XXX come up with a way not to do this for EVERY little access but
2782       // rather to batch these up somehow. Unfortunately we've lost the
2783       // information about the field width by the time we get here.
2784       Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
2785       Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
2786       Value *pred = new_LValue(func, FILE_PREDICATE);
2787       if (i->src(0).isIndirect(0)) {
2788          bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
2789          bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
2790       }
2791       i->setIndirect(0, 1, NULL);
2792       i->setIndirect(0, 0, ptr);
2793       i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
2794       bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
2795       i->setPredicate(CC_NOT_P, pred);
2796       if (i->defExists(0)) {
2797          Value *zero, *dst = i->getDef(0);
2798          i->setDef(0, bld.getSSA());
2799
2800          bld.setPosition(i, true);
2801          bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
2802             ->setPredicate(CC_P, pred);
2803          bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
2804       }
2805    }
2806 }
2807
2808 void
2809 NVC0LoweringPass::readTessCoord(LValue *dst, int c)
2810 {
2811    Value *laneid = bld.getSSA();
2812    Value *x, *y;
2813
2814    bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
2815
2816    if (c == 0) {
2817       x = dst;
2818       y = NULL;
2819    } else
2820    if (c == 1) {
2821       x = NULL;
2822       y = dst;
2823    } else {
2824       assert(c == 2);
2825       if (prog->driver_out->prop.tp.domain != PIPE_PRIM_TRIANGLES) {
2826          bld.mkMov(dst, bld.loadImm(NULL, 0));
2827          return;
2828       }
2829       x = bld.getSSA();
2830       y = bld.getSSA();
2831    }
2832    if (x)
2833       bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
2834    if (y)
2835       bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
2836
2837    if (c == 2) {
2838       bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
2839       bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
2840    }
2841 }
2842
2843 bool
2844 NVC0LoweringPass::handleRDSV(Instruction *i)
2845 {
2846    Symbol *sym = i->getSrc(0)->asSym();
2847    const SVSemantic sv = sym->reg.data.sv.sv;
2848    Value *vtx = NULL;
2849    Instruction *ld;
2850    uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
2851
2852    if (addr >= 0x400) {
2853       // mov $sreg
2854       if (sym->reg.data.sv.index == 3) {
2855          // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
2856          i->op = OP_MOV;
2857          i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
2858       } else
2859       if (sv == SV_TID) {
2860          // Help CSE combine TID fetches
2861          Value *tid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(),
2862                                  bld.mkSysVal(SV_COMBINED_TID, 0));
2863          i->op = OP_EXTBF;
2864          i->setSrc(0, tid);
2865          switch (sym->reg.data.sv.index) {
2866          case 0: i->setSrc(1, bld.mkImm(0x1000)); break;
2867          case 1: i->setSrc(1, bld.mkImm(0x0a10)); break;
2868          case 2: i->setSrc(1, bld.mkImm(0x061a)); break;
2869          }
2870       }
2871       if (sv == SV_VERTEX_COUNT) {
2872          bld.setPosition(i, true);
2873          bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));
2874       }
2875       return true;
2876    }
2877
2878    switch (sv) {
2879    case SV_POSITION:
2880       assert(prog->getType() == Program::TYPE_FRAGMENT);
2881       if (i->srcExists(1)) {
2882          // Pass offset through to the interpolation logic
2883          ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,
2884                            i->getDef(0), addr, NULL);
2885          ld->setSrc(1, i->getSrc(1));
2886       } else {
2887          bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
2888       }
2889       break;
2890    case SV_FACE:
2891    {
2892       Value *face = i->getDef(0);
2893       bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
2894       if (i->dType == TYPE_F32) {
2895          bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
2896          bld.mkOp1(OP_NEG, TYPE_S32, face, face);
2897          bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
2898       }
2899    }
2900       break;
2901    case SV_TESS_COORD:
2902       assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
2903       readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
2904       break;
2905    case SV_NTID:
2906    case SV_NCTAID:
2907    case SV_GRIDID:
2908       assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
2909       if (sym->reg.data.sv.index == 3) {
2910          i->op = OP_MOV;
2911          i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
2912          return true;
2913       }
2914       // Fallthrough
2915    case SV_WORK_DIM:
2916       addr += prog->driver->prop.cp.gridInfoBase;
2917       bld.mkLoad(TYPE_U32, i->getDef(0),
2918                  bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2919                               TYPE_U32, addr), NULL);
2920       break;
2921    case SV_SAMPLE_INDEX:
2922       // TODO: Properly pass source as an address in the PIX address space
2923       // (which can be of the form [r0+offset]). But this is currently
2924       // unnecessary.
2925       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
2926       ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2927       break;
2928    case SV_SAMPLE_POS: {
2929       Value *sampleID = bld.getScratch();
2930       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, sampleID, bld.mkImm(0));
2931       ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2932       Value *offset = calculateSampleOffset(sampleID);
2933
2934       assert(prog->driver_out->prop.fp.readsSampleLocations);
2935
2936       if (targ->getChipset() >= NVISA_GM200_CHIPSET) {
2937          bld.mkLoad(TYPE_F32,
2938                     i->getDef(0),
2939                     bld.mkSymbol(
2940                           FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2941                           TYPE_U32, prog->driver->io.sampleInfoBase),
2942                     offset);
2943          bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0),
2944                    bld.mkImm(0x040c + sym->reg.data.sv.index * 16));
2945          bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_U32, i->getDef(0));
2946          bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(0), i->getDef(0), bld.mkImm(1.0f / 16.0f));
2947       } else {
2948          bld.mkLoad(TYPE_F32,
2949                     i->getDef(0),
2950                     bld.mkSymbol(
2951                           FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2952                           TYPE_U32, prog->driver->io.sampleInfoBase +
2953                           4 * sym->reg.data.sv.index),
2954                     offset);
2955       }
2956       break;
2957    }
2958    case SV_SAMPLE_MASK: {
2959       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
2960       ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
2961       Instruction *sampleid =
2962          bld.mkOp1(OP_PIXLD, TYPE_U32, bld.getSSA(), bld.mkImm(0));
2963       sampleid->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2964       Value *masked =
2965          bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ld->getDef(0),
2966                     bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2967                                bld.loadImm(NULL, 1), sampleid->getDef(0)));
2968       if (prog->persampleInvocation) {
2969          bld.mkMov(i->getDef(0), masked);
2970       } else {
2971          bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), ld->getDef(0), masked,
2972                    bld.mkImm(0))
2973             ->subOp = 1;
2974       }
2975       break;
2976    }
2977    case SV_BASEVERTEX:
2978    case SV_BASEINSTANCE:
2979    case SV_DRAWID:
2980       ld = bld.mkLoad(TYPE_U32, i->getDef(0),
2981                       bld.mkSymbol(FILE_MEMORY_CONST,
2982                                    prog->driver->io.auxCBSlot,
2983                                    TYPE_U32,
2984                                    prog->driver->io.drawInfoBase +
2985                                    4 * (sv - SV_BASEVERTEX)),
2986                       NULL);
2987       break;
2988    default:
2989       if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
2990          vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
2991       if (prog->getType() == Program::TYPE_FRAGMENT) {
2992          bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL);
2993       } else {
2994          ld = bld.mkFetch(i->getDef(0), i->dType,
2995                           FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
2996          ld->perPatch = i->perPatch;
2997       }
2998       break;
2999    }
3000    bld.getBB()->remove(i);
3001    return true;
3002 }
3003
3004 bool
3005 NVC0LoweringPass::handleDIV(Instruction *i)
3006 {
3007    if (!isFloatType(i->dType))
3008       return true;
3009    bld.setPosition(i, false);
3010    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
3011    i->op = OP_MUL;
3012    i->setSrc(1, rcp->getDef(0));
3013    return true;
3014 }
3015
3016 bool
3017 NVC0LoweringPass::handleMOD(Instruction *i)
3018 {
3019    if (!isFloatType(i->dType))
3020       return true;
3021    LValue *value = bld.getScratch(typeSizeof(i->dType));
3022    bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
3023    bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
3024    bld.mkOp1(OP_TRUNC, i->dType, value, value);
3025    bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
3026    i->op = OP_SUB;
3027    i->setSrc(1, value);
3028    return true;
3029 }
3030
3031 bool
3032 NVC0LoweringPass::handleSQRT(Instruction *i)
3033 {
3034    if (targ->isOpSupported(OP_SQRT, i->dType))
3035       return true;
3036
3037    if (i->dType == TYPE_F64) {
3038       Value *pred = bld.getSSA(1, FILE_PREDICATE);
3039       Value *zero = bld.loadImm(NULL, 0.0);
3040       Value *dst = bld.getSSA(8);
3041       bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
3042       bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
3043       bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);
3044       i->op = OP_MUL;
3045       i->setSrc(1, dst);
3046       // TODO: Handle this properly with a library function
3047    } else {
3048       bld.setPosition(i, true);
3049       i->op = OP_RSQ;
3050       bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
3051    }
3052
3053    return true;
3054 }
3055
3056 bool
3057 NVC0LoweringPass::handlePOW(Instruction *i)
3058 {
3059    LValue *val = bld.getScratch();
3060
3061    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
3062    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
3063    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
3064
3065    i->op = OP_EX2;
3066    i->setSrc(0, val);
3067    i->setSrc(1, NULL);
3068
3069    return true;
3070 }
3071
3072 bool
3073 NVC0LoweringPass::handleEXPORT(Instruction *i)
3074 {
3075    if (prog->getType() == Program::TYPE_FRAGMENT) {
3076       int id = i->getSrc(0)->reg.data.offset / 4;
3077
3078       if (i->src(0).isIndirect(0)) // TODO, ugly
3079          return false;
3080       i->op = OP_MOV;
3081       i->subOp = NV50_IR_SUBOP_MOV_FINAL;
3082       i->src(0).set(i->src(1));
3083       i->setSrc(1, NULL);
3084       i->setDef(0, new_LValue(func, FILE_GPR));
3085       i->getDef(0)->reg.data.id = id;
3086
3087       prog->maxGPR = MAX2(prog->maxGPR, id);
3088    } else
3089    if (prog->getType() == Program::TYPE_GEOMETRY) {
3090       i->setIndirect(0, 1, gpEmitAddress);
3091    }
3092    return true;
3093 }
3094
3095 bool
3096 NVC0LoweringPass::handleOUT(Instruction *i)
3097 {
3098    Instruction *prev = i->prev;
3099    ImmediateValue stream, prevStream;
3100
3101    // Only merge if the stream ids match. Also, note that the previous
3102    // instruction would have already been lowered, so we take arg1 from it.
3103    if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
3104        i->src(0).getImmediate(stream) &&
3105        prev->src(1).getImmediate(prevStream) &&
3106        stream.reg.data.u32 == prevStream.reg.data.u32) {
3107       i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
3108       delete_Instruction(prog, i);
3109    } else {
3110       assert(gpEmitAddress);
3111       i->setDef(0, gpEmitAddress);
3112       i->setSrc(1, i->getSrc(0));
3113       i->setSrc(0, gpEmitAddress);
3114    }
3115    return true;
3116 }
3117
3118 Value *
3119 NVC0LoweringPass::calculateSampleOffset(Value *sampleID)
3120 {
3121    Value *offset = bld.getScratch();
3122    if (targ->getChipset() >= NVISA_GM200_CHIPSET) {
3123       // Sample location offsets (in bytes) are calculated like so:
3124       // offset = (SV_POSITION.y % 4 * 2) + (SV_POSITION.x % 2)
3125       // offset = offset * 32 + sampleID % 8 * 4;
3126       // which is equivalent to:
3127       // offset = (SV_POSITION.y & 0x3) << 6 + (SV_POSITION.x & 0x1) << 5;
3128       // offset += sampleID << 2
3129
3130       // The second operand (src1) of the INSBF instructions are like so:
3131       // 0xssll where ss is the size and ll is the offset.
3132       // so: dest = src2 | (src0 & (1 << ss - 1)) << ll
3133
3134       // Add sample ID (offset = (sampleID & 0x7) << 2)
3135       bld.mkOp3(OP_INSBF, TYPE_U32, offset, sampleID, bld.mkImm(0x0302), bld.mkImm(0x0));
3136
3137       Symbol *xSym = bld.mkSysVal(SV_POSITION, 0);
3138       Symbol *ySym = bld.mkSysVal(SV_POSITION, 1);
3139       Value *coord = bld.getScratch();
3140
3141       // Add X coordinate (offset |= (SV_POSITION.x & 0x1) << 5)
3142       bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
3143                    targ->getSVAddress(FILE_SHADER_INPUT, xSym), NULL);
3144       bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord)
3145          ->rnd = ROUND_ZI;
3146       bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0105), offset);
3147
3148       // Add Y coordinate (offset |= (SV_POSITION.y & 0x3) << 6)
3149       bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
3150                    targ->getSVAddress(FILE_SHADER_INPUT, ySym), NULL);
3151       bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord)
3152          ->rnd = ROUND_ZI;
3153       bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0206), offset);
3154    } else {
3155       bld.mkOp2(OP_SHL, TYPE_U32, offset, sampleID, bld.mkImm(3));
3156    }
3157    return offset;
3158 }
3159
3160 // Handle programmable sample locations for GM20x+
3161 void
3162 NVC0LoweringPass::handlePIXLD(Instruction *i)
3163 {
3164    if (i->subOp != NV50_IR_SUBOP_PIXLD_OFFSET)
3165       return;
3166    if (targ->getChipset() < NVISA_GM200_CHIPSET)
3167       return;
3168
3169    assert(prog->driver_out->prop.fp.readsSampleLocations);
3170
3171    bld.mkLoad(TYPE_F32,
3172               i->getDef(0),
3173               bld.mkSymbol(
3174                     FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
3175                     TYPE_U32, prog->driver->io.sampleInfoBase),
3176               calculateSampleOffset(i->getSrc(0)));
3177
3178    bld.getBB()->remove(i);
3179 }
3180
3181 // Generate a binary predicate if an instruction is predicated by
3182 // e.g. an f32 value.
3183 void
3184 NVC0LoweringPass::checkPredicate(Instruction *insn)
3185 {
3186    Value *pred = insn->getPredicate();
3187    Value *pdst;
3188
3189    if (!pred || pred->reg.file == FILE_PREDICATE)
3190       return;
3191    pdst = new_LValue(func, FILE_PREDICATE);
3192
3193    // CAUTION: don't use pdst->getInsn, the definition might not be unique,
3194    //  delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
3195
3196    bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);
3197
3198    insn->setPredicate(insn->cc, pdst);
3199 }
3200
3201 //
3202 // - add quadop dance for texturing
3203 // - put FP outputs in GPRs
3204 // - convert instruction sequences
3205 //
3206 bool
3207 NVC0LoweringPass::visit(Instruction *i)
3208 {
3209    bool ret = true;
3210    bld.setPosition(i, false);
3211
3212    if (i->cc != CC_ALWAYS)
3213       checkPredicate(i);
3214
3215    switch (i->op) {
3216    case OP_TEX:
3217    case OP_TXB:
3218    case OP_TXL:
3219    case OP_TXF:
3220    case OP_TXG:
3221       return handleTEX(i->asTex());
3222    case OP_TXD:
3223       return handleTXD(i->asTex());
3224    case OP_TXLQ:
3225       return handleTXLQ(i->asTex());
3226    case OP_TXQ:
3227      return handleTXQ(i->asTex());
3228    case OP_EX2:
3229       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
3230       i->setSrc(0, i->getDef(0));
3231       break;
3232    case OP_POW:
3233       return handlePOW(i);
3234    case OP_DIV:
3235       return handleDIV(i);
3236    case OP_MOD:
3237       return handleMOD(i);
3238    case OP_SQRT:
3239       return handleSQRT(i);
3240    case OP_EXPORT:
3241       ret = handleEXPORT(i);
3242       break;
3243    case OP_EMIT:
3244    case OP_RESTART:
3245       return handleOUT(i);
3246    case OP_RDSV:
3247       return handleRDSV(i);
3248    case OP_WRSV:
3249       return handleWRSV(i);
3250    case OP_STORE:
3251    case OP_LOAD:
3252       handleLDST(i);
3253       break;
3254    case OP_ATOM:
3255    {
3256       const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER;
3257       handleATOM(i);
3258       handleCasExch(i, cctl);
3259    }
3260       break;
3261    case OP_SULDB:
3262    case OP_SULDP:
3263    case OP_SUSTB:
3264    case OP_SUSTP:
3265    case OP_SUREDB:
3266    case OP_SUREDP:
3267       if (targ->getChipset() >= NVISA_GM107_CHIPSET)
3268          handleSurfaceOpGM107(i->asTex());
3269       else if (targ->getChipset() >= NVISA_GK104_CHIPSET)
3270          handleSurfaceOpNVE4(i->asTex());
3271       else
3272          handleSurfaceOpNVC0(i->asTex());
3273       break;
3274    case OP_SUQ:
3275       handleSUQ(i->asTex());
3276       break;
3277    case OP_BUFQ:
3278       handleBUFQ(i);
3279       break;
3280    case OP_PIXLD:
3281       handlePIXLD(i);
3282       break;
3283    default:
3284       break;
3285    }
3286
3287    /* Kepler+ has a special opcode to compute a new base address to be used
3288     * for indirect loads.
3289     *
3290     * Maxwell+ has an additional similar requirement for indirect
3291     * interpolation ops in frag shaders.
3292     */
3293    bool doAfetch = false;
3294    if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
3295        !i->perPatch &&
3296        (i->op == OP_VFETCH || i->op == OP_EXPORT) &&
3297        i->src(0).isIndirect(0)) {
3298       doAfetch = true;
3299    }
3300    if (targ->getChipset() >= NVISA_GM107_CHIPSET &&
3301        (i->op == OP_LINTERP || i->op == OP_PINTERP) &&
3302        i->src(0).isIndirect(0)) {
3303       doAfetch = true;
3304    }
3305
3306    if (doAfetch) {
3307       Value *addr = cloneShallow(func, i->getSrc(0));
3308       Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
3309                                       i->getSrc(0));
3310       afetch->setIndirect(0, 0, i->getIndirect(0, 0));
3311       addr->reg.data.offset = 0;
3312       i->setSrc(0, addr);
3313       i->setIndirect(0, 0, afetch->getDef(0));
3314    }
3315
3316    return ret;
3317 }
3318
3319 bool
3320 TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
3321 {
3322    if (stage == CG_STAGE_PRE_SSA) {
3323       NVC0LoweringPass pass(prog);
3324       return pass.run(prog, false, true);
3325    } else
3326    if (stage == CG_STAGE_POST_RA) {
3327       NVC0LegalizePostRA pass(prog);
3328       return pass.run(prog, false, true);
3329    } else
3330    if (stage == CG_STAGE_SSA) {
3331       NVC0LegalizeSSA pass;
3332       return pass.run(prog, false, true);
3333    }
3334    return false;
3335 }
3336
3337 } // namespace nv50_ir