gm107/ir: add fp64 rcp
src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"

#include "codegen/nv50_ir_target_nvc0.h"
#include "codegen/nv50_ir_lowering_nvc0.h"

#include <limits>

namespace nv50_ir {

#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// UL UR LL LR
#define QUADOP(q, r, s, t)                      \
   ((QOP_##q << 6) | (QOP_##r << 4) |           \
    (QOP_##s << 2) | (QOP_##t << 0))
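// The four 2-bit fields select a per-lane quad op, e.g.
//   QUADOP(MOV2, ADD, MOV2, ADD) == 0xcc (ADD in the UR/LR lanes)
//   QUADOP(MOV2, MOV2, ADD, ADD) == 0xf0 (ADD in the LL/LR lanes)
// handleManualTXD below uses exactly these two encodings.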

void
NVC0LegalizeSSA::handleDIV(Instruction *i)
{
   FlowInstruction *call;
   int builtin;

   bld.setPosition(i, false);

   // Generate movs to the input regs for the call we want to generate
   for (int s = 0; i->srcExists(s); ++s) {
      Instruction *ld = i->getSrc(s)->getInsn();
      // check if we are moving an immediate, propagate it in that case
      if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV) ||
          !(ld->src(0).getFile() == FILE_IMMEDIATE))
         bld.mkMovToReg(s, i->getSrc(s));
      else {
         assert(ld->getSrc(0) != NULL);
         bld.mkMovToReg(s, ld->getSrc(0));
         // Clear the src, to make code elimination possible here before we
         // delete the instruction i later
         i->setSrc(s, NULL);
         if (ld->isDead())
            delete_Instruction(prog, ld);
      }
   }

   switch (i->dType) {
   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
   default:
      return;
   }
   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
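   // The built-in leaves the quotient in reg 0 and the remainder in reg 1,
   // hence OP_DIV reads back reg 0 and OP_MOD reg 1 below.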
   bld.mkMovFromReg(i->getDef(0), i->op == OP_DIV ? 0 : 1);
   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);
}

void
NVC0LegalizeSSA::handleRCPRSQLib(Instruction *i, Value *src[])
{
   FlowInstruction *call;
   Value *def[2];
   int builtin;

   def[0] = bld.mkMovToReg(0, src[0])->getDef(0);
   def[1] = bld.mkMovToReg(1, src[1])->getDef(0);

   if (i->op == OP_RCP)
      builtin = NVC0_BUILTIN_RCP_F64;
   else
      builtin = NVC0_BUILTIN_RSQ_F64;

   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
   def[0] = bld.getSSA();
   def[1] = bld.getSSA();
   bld.mkMovFromReg(def[0], 0);
   bld.mkMovFromReg(def[1], 1);
   bld.mkClobber(FILE_GPR, 0x3fc, 2);
   bld.mkClobber(FILE_PREDICATE, i->op == OP_RSQ ? 0x3 : 0x1, 0);
   bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), def[0], def[1]);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);

   prog->fp64 = true;
}

void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
   assert(i->dType == TYPE_F64);
   // There are instructions that will compute the high 32 bits of the 64-bit
   // float. We will just stick 0 in the bottom 32 bits.
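   //
   // Roughly, on chips that don't take the library path below,
   //    rcp f64 $d, $s
   // becomes
   //    split $slo, $shi, $s          (2 x 32 bit)
   //    rcp f32 $dhi, $shi            (subOp NV50_IR_SUBOP_RCPRSQ_64H)
   //    mov $dlo, 0
   //    merge u64 $d, $dlo, $dhi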

   bld.setPosition(i, false);

   // 1. Take the source and split it up.
   Value *src[2], *dst[2], *def = i->getDef(0);
   bld.mkSplit(src, 4, i->getSrc(0));

   int chip = prog->getTarget()->getChipset();
   if (chip >= NVISA_GK104_CHIPSET && (i->op == OP_RCP || chip < NVISA_GM107_CHIPSET)) {
      handleRCPRSQLib(i, src);
      return;
   }

   // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
   dst[0] = bld.loadImm(NULL, 0);
   dst[1] = bld.getSSA();

   // 3. The new version of the instruction takes the high 32 bits of the
   // source and outputs the high 32 bits of the destination.
   i->setSrc(0, src[1]);
   i->setDef(0, dst[1]);
   i->setType(TYPE_F32);
   i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;

   // 4. Recombine the two dst pieces back into the original destination.
   bld.setPosition(i, true);
   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
}

void
NVC0LegalizeSSA::handleFTZ(Instruction *i)
{
   // Only want to flush float inputs
   assert(i->sType == TYPE_F32);

   // If we're already flushing denorms (and NaN's) to zero, no need for this.
   if (i->dnz)
      return;

   // Only certain classes of operations can flush
   OpClass cls = prog->getTarget()->getOpClass(i->op);
   if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
       cls != OPCLASS_CONVERT)
      return;

   i->ftz = true;
}

void
NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i)
{
   if (i->tex.levelZero)
      return;

   ImmediateValue lod;

   // The LOD argument comes right after the coordinates (before depth bias,
   // offsets, etc).
   int arg = i->tex.target.getArgCount();

   // SM30+ stores the indirect handle as a separate arg, which comes before
   // the LOD.
   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET &&
       i->tex.rIndirectSrc >= 0)
      arg++;
   // SM20 stores the indirect handle combined with the array coordinate
   if (prog->getTarget()->getChipset() < NVISA_GK104_CHIPSET &&
       !i->tex.target.isArray() &&
       i->tex.rIndirectSrc >= 0)
      arg++;

   if (!i->src(arg).getImmediate(lod) || !lod.isInteger(0))
      return;

   if (i->op == OP_TXL)
      i->op = OP_TEX;
   i->tex.levelZero = true;
   i->moveSources(arg + 1, -1);
}

void
NVC0LegalizeSSA::handleShift(Instruction *lo)
{
   Value *shift = lo->getSrc(1);
   Value *dst64 = lo->getDef(0);
   Value *src[2], *dst[2];
   operation op = lo->op;

   bld.setPosition(lo, false);

   bld.mkSplit(src, 4, lo->getSrc(0));

   // SM30 and prior don't have the fancy new SHF.L/R ops. So the logic has to
   // be completely emulated. For SM35+, we can use the more direct SHF
   // operations.
   if (prog->getTarget()->getChipset() < NVISA_GK20A_CHIPSET) {
      // The strategy here is to handle shifts >= 32 and less than 32 as
      // separate parts.
      //
      // For SHL:
      // If the shift is <= 32, then
      //   (HI,LO) << x = (HI << x | (LO >> (32 - x)), LO << x)
      // If the shift is > 32, then
      //   (HI,LO) << x = (LO << (x - 32), 0)
      //
      // For SHR:
      // If the shift is <= 32, then
      //   (HI,LO) >> x = (HI >> x, (HI << (32 - x)) | LO >> x)
      // If the shift is > 32, then
      //   (HI,LO) >> x = (0, HI >> (x - 32))
      //
      // Note that on NVIDIA hardware, a shift >= 32 yields a 0 value, which
      // we can use to our advantage. Also note the structural similarities
      // between the right/left cases. The main difference is swapping hi/lo
      // on input and output.
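      //
      // Worked example (SHL, x = 40): the predicate (x <= 32) is false, so
      // LO << 40 yields 0 for the low word, and the high word comes from the
      // CC_NOT_P path as LO << (40 - 32) = LO << 8. For x = 32 the CC_P path
      // yields HI' = (HI << 32) | (LO >> 0) = LO and LO' = LO << 32 = 0, as
      // required.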

      Value *x32_minus_shift, *pred, *hi1, *hi2;
      DataType type = isSignedIntType(lo->dType) ? TYPE_S32 : TYPE_U32;
      operation antiop = op == OP_SHR ? OP_SHL : OP_SHR;
      if (op == OP_SHR)
         std::swap(src[0], src[1]);
      bld.mkOp2(OP_ADD, TYPE_U32, (x32_minus_shift = bld.getSSA()), shift, bld.mkImm(0x20))
         ->src(0).mod = Modifier(NV50_IR_MOD_NEG);
      bld.mkCmp(OP_SET, CC_LE, TYPE_U8, (pred = bld.getSSA(1, FILE_PREDICATE)),
                TYPE_U32, shift, bld.mkImm(32));
      // Compute HI (shift <= 32)
      bld.mkOp2(OP_OR, TYPE_U32, (hi1 = bld.getSSA()),
                bld.mkOp2v(op, TYPE_U32, bld.getSSA(), src[1], shift),
                bld.mkOp2v(antiop, TYPE_U32, bld.getSSA(), src[0], x32_minus_shift))
         ->setPredicate(CC_P, pred);
      // Compute LO (all shift values)
      bld.mkOp2(op, type, (dst[0] = bld.getSSA()), src[0], shift);
      // Compute HI (shift > 32)
      bld.mkOp2(op, type, (hi2 = bld.getSSA()), src[0],
                bld.mkOp1v(OP_NEG, TYPE_S32, bld.getSSA(), x32_minus_shift))
         ->setPredicate(CC_NOT_P, pred);
      bld.mkOp2(OP_UNION, TYPE_U32, (dst[1] = bld.getSSA()), hi1, hi2);
      if (op == OP_SHR)
         std::swap(dst[0], dst[1]);
      bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
      delete_Instruction(prog, lo);
      return;
   }

   Instruction *hi = new_Instruction(func, op, TYPE_U32);
   lo->bb->insertAfter(lo, hi);

   hi->sType = lo->sType;
   lo->dType = TYPE_U32;

   hi->setDef(0, (dst[1] = bld.getSSA()));
   if (lo->op == OP_SHR)
      hi->subOp |= NV50_IR_SUBOP_SHIFT_HIGH;
   lo->setDef(0, (dst[0] = bld.getSSA()));

   bld.setPosition(hi, true);

   if (lo->op == OP_SHL)
      std::swap(hi, lo);

   hi->setSrc(0, new_ImmediateValue(prog, 0u));
   hi->setSrc(1, shift);
   hi->setSrc(2, lo->op == OP_SHL ? src[0] : src[1]);

   lo->setSrc(0, src[0]);
   lo->setSrc(1, shift);
   lo->setSrc(2, src[1]);

   bld.mkOp2(OP_MERGE, TYPE_U64, dst64, dst[0], dst[1]);
}

void
NVC0LegalizeSSA::handleSET(CmpInstruction *cmp)
{
   DataType hTy = cmp->sType == TYPE_S64 ? TYPE_S32 : TYPE_U32;
   Value *carry;
   Value *src0[2], *src1[2];
   bld.setPosition(cmp, false);

   bld.mkSplit(src0, 4, cmp->getSrc(0));
   bld.mkSplit(src1, 4, cmp->getSrc(1));
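   // The low halves are subtracted purely for the borrow: the resulting
   // carry flag is fed to the high-half compare as an extra source, so the
   // 64-bit relation is decided by the high words plus the low-word borrow.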
   bld.mkOp2(OP_SUB, hTy, NULL, src0[0], src1[0])
      ->setFlagsDef(0, (carry = bld.getSSA(1, FILE_FLAGS)));
   cmp->setFlagsSrc(cmp->srcCount(), carry);
   cmp->setSrc(0, src0[1]);
   cmp->setSrc(1, src1[1]);
   cmp->sType = hTy;
}

bool
NVC0LegalizeSSA::visit(Function *fn)
{
   bld.setProgram(fn->getProgram());
   return true;
}

bool
NVC0LegalizeSSA::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (i->sType == TYPE_F32 && prog->getType() != Program::TYPE_COMPUTE)
         handleFTZ(i);

      switch (i->op) {
      case OP_DIV:
      case OP_MOD:
         if (i->sType != TYPE_F32)
            handleDIV(i);
         break;
      case OP_RCP:
      case OP_RSQ:
         if (i->dType == TYPE_F64)
            handleRCPRSQ(i);
         break;
      case OP_TXL:
      case OP_TXF:
         handleTEXLOD(i->asTex());
         break;
      case OP_SHR:
      case OP_SHL:
         if (typeSizeof(i->sType) == 8)
            handleShift(i);
         break;
      case OP_SET:
      case OP_SET_AND:
      case OP_SET_OR:
      case OP_SET_XOR:
         if (typeSizeof(i->sType) == 8 && i->sType != TYPE_F64)
            handleSET(i->asCmp());
         break;
      default:
         break;
      }
   }
   return true;
}

NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
   : rZero(NULL),
     carry(NULL),
     pOne(NULL),
     needTexBar(prog->getTarget()->getChipset() >= 0xe0 &&
                prog->getTarget()->getChipset() < 0x110)
{
}

bool
NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
                                    const Instruction *early) const
{
   if (early->bb == later->bb)
      return early->serial < later->serial;
   return later->bb->dominatedBy(early->bb);
}

void
NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
                              Instruction *usei, const Instruction *texi)
{
   bool add = true;
   bool dominated = insnDominatedBy(usei, texi);
   // Uses before the tex have to all be included. Just because an earlier
   // instruction dominates another instruction doesn't mean that there's no
   // way to get from the tex to the later instruction. For example you could
   // have nested loops, with the tex in the inner loop, and uses before it in
   // both loops - even though the outer loop's instruction would dominate the
   // inner's, we still want a texbar before the inner loop's instruction.
   //
   // However we can still use the eliding logic between uses dominated by the
   // tex instruction, as that is unambiguously correct.
   if (dominated) {
      for (std::list<TexUse>::iterator it = uses.begin(); it != uses.end();) {
         if (it->after) {
            if (insnDominatedBy(usei, it->insn)) {
               add = false;
               break;
            }
            if (insnDominatedBy(it->insn, usei)) {
               it = uses.erase(it);
               continue;
            }
         }
         ++it;
      }
   }
   if (add)
      uses.push_back(TexUse(usei, texi, dominated));
}

// While it might be tempting to use an algorithm that just looks at tex
// uses, not all texture results are guaranteed to be used on all paths. In
// the case where along some control flow path a texture result is never used,
// we might reuse that register for something else, creating a
// write-after-write hazard. So we have to manually look through all
// instructions looking for ones that reference the registers in question.
void
NVC0LegalizePostRA::findFirstUses(
   Instruction *texi, std::list<TexUse> &uses)
{
   int minGPR = texi->def(0).rep()->reg.data.id;
   int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;

   unordered_set<const BasicBlock *> visited;
   findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
}

void
NVC0LegalizePostRA::findFirstUsesBB(
   int minGPR, int maxGPR, Instruction *start,
   const Instruction *texi, std::list<TexUse> &uses,
   unordered_set<const BasicBlock *> &visited)
{
   const BasicBlock *bb = start->bb;

   // We don't process the whole bb the first time around. This is correct,
   // however we might be in a loop and hit this BB again, and need to process
   // the full thing. So only mark a bb as visited if we processed it from the
   // beginning.
   if (start == bb->getEntry()) {
      if (visited.find(bb) != visited.end())
         return;
      visited.insert(bb);
   }

   for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
      if (insn->isNop())
         continue;

      for (int d = 0; insn->defExists(d); ++d) {
         const Value *def = insn->def(d).rep();
         if (insn->def(d).getFile() != FILE_GPR ||
             def->reg.data.id + def->reg.size / 4 - 1 < minGPR ||
             def->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }

      for (int s = 0; insn->srcExists(s); ++s) {
         const Value *src = insn->src(s).rep();
         if (insn->src(s).getFile() != FILE_GPR ||
             src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
             src->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }
   }

   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
      findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
                      texi, uses, visited);
   }
}

// Texture barriers:
// This pass is a bit long and ugly and can probably be optimized.
//
// 1. obtain a list of TEXes and their outputs' first use(s)
// 2. calculate the barrier level of each first use (minimal number of TEXes,
//    over all paths, between the TEX and the use in question)
// 3. for each barrier, if all paths from the source TEX to that barrier
//    contain a barrier of lesser level, it can be culled
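//
// For example, in "tex0; tex1; use(tex0)" the use gets level 1 (tex1 may
// still be outstanding), so a "texbar 1" placed before it only forces the
// older tex0 to complete.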
bool
NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
{
   std::list<TexUse> *uses;
   std::vector<Instruction *> texes;
   std::vector<int> bbFirstTex;
   std::vector<int> bbFirstUse;
   std::vector<int> texCounts;
   std::vector<TexUse> useVec;
   ArrayList insns;

   fn->orderInstructions(insns);

   texCounts.resize(fn->allBBlocks.getSize(), 0);
   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());

   // tag BB CFG nodes by their id for later
   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
      if (bb)
         bb->cfg.tag = bb->getId();
   }

   // gather the first uses for each TEX
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
      if (isTextureOp(tex->op)) {
         texes.push_back(tex);
         if (!texCounts.at(tex->bb->getId()))
            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
         texCounts[tex->bb->getId()]++;
      }
   }
   insns.clear();
   if (texes.empty())
      return false;
   uses = new std::list<TexUse>[texes.size()];
   if (!uses)
      return false;
   for (size_t i = 0; i < texes.size(); ++i) {
      findFirstUses(texes[i], uses[i]);
   }

   // determine the barrier level at each use
   for (size_t i = 0; i < texes.size(); ++i) {
      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
           ++u) {
         BasicBlock *tb = texes[i]->bb;
         BasicBlock *ub = u->insn->bb;
         if (tb == ub) {
            u->level = 0;
            for (size_t j = i + 1; j < texes.size() &&
                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         } else {
            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
                                                      &ub->cfg, texCounts);
            if (u->level < 0) {
               WARN("Failed to find path TEX -> TEXBAR\n");
               u->level = 0;
               continue;
            }
            // this counted all TEXes in the origin block, correct that
            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
            // and did not count the TEXes in the destination block, add those
            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         }
         assert(u->level >= 0);
         useVec.push_back(*u);
      }
   }
   delete[] uses;

   // insert the barriers
   for (size_t i = 0; i < useVec.size(); ++i) {
      Instruction *prev = useVec[i].insn->prev;
      if (useVec[i].level < 0)
         continue;
      if (prev && prev->op == OP_TEXBAR) {
         if (prev->subOp > useVec[i].level)
            prev->subOp = useVec[i].level;
         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
      } else {
         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
         bar->fixed = 1;
         bar->subOp = useVec[i].level;
         // make use explicit to ease latency calculation
         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
      }
   }

   if (fn->getProgram()->optLevel < 3)
      return true;

   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single

   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitS.resize(fn->allBBlocks.getSize());

   // cull unneeded barriers (should do that earlier, but for simplicity)
   IteratorRef bi = fn->cfg.iteratorCFG();
   // first calculate min/max outstanding TEXes for each BB
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      int min = 0;
      int max = std::numeric_limits<int>::max();
      for (Instruction *i = bb->getFirst(); i; i = i->next) {
         if (isTextureOp(i->op)) {
            min++;
            if (max < std::numeric_limits<int>::max())
               max++;
         } else
         if (i->op == OP_TEXBAR) {
            min = MIN2(min, i->subOp);
            max = MIN2(max, i->subOp);
         }
      }
      // limits when looking at an isolated block
      limitS[bb->getId()].min = min;
      limitS[bb->getId()].max = max;
   }
   // propagate the min/max values
   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
      for (bi->reset(); !bi->end(); bi->next()) {
         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
         BasicBlock *bb = BasicBlock::get(n);
         const int bbId = bb->getId();
         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
            BasicBlock *in = BasicBlock::get(ei.getNode());
            const int inId = in->getId();
            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
         }
         // I just hope this is correct ...
         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
            // no barrier
            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
         } else {
            // block contained a barrier
            limitB[bbId].min = MIN2(limitS[bbId].max,
                                    limitT[bbId].min + limitS[bbId].min);
            limitB[bbId].max = MIN2(limitS[bbId].max,
                                    limitT[bbId].max + limitS[bbId].min);
         }
      }
   }
   // finally delete unnecessary barriers
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      Instruction *prev = NULL;
      Instruction *next;
      int max = limitT[bb->getId()].max;
      for (Instruction *i = bb->getFirst(); i; i = next) {
         next = i->next;
         if (i->op == OP_TEXBAR) {
            if (i->subOp >= max) {
               delete_Instruction(prog, i);
               i = NULL;
            } else {
               max = i->subOp;
               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
                  delete_Instruction(prog, prev);
                  prev = NULL;
               }
            }
         } else
         if (isTextureOp(i->op)) {
            max++;
         }
         if (i && !i->isNop())
            prev = i;
      }
   }
   return true;
}

bool
NVC0LegalizePostRA::visit(Function *fn)
{
   if (needTexBar)
      insertTextureBarriers(fn);

   rZero = new_LValue(fn, FILE_GPR);
   pOne = new_LValue(fn, FILE_PREDICATE);
   carry = new_LValue(fn, FILE_FLAGS);

   rZero->reg.data.id = (prog->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET) ? 255 : 63;
   carry->reg.data.id = 0;
   pOne->reg.data.id = 7;

   return true;
}

void
NVC0LegalizePostRA::replaceZero(Instruction *i)
{
   for (int s = 0; i->srcExists(s); ++s) {
      if (s == 2 && i->op == OP_SUCLAMP)
         continue;
      if (s == 1 && i->op == OP_SHLADD)
         continue;
      ImmediateValue *imm = i->getSrc(s)->asImm();
      if (imm) {
         if (i->op == OP_SELP && s == 2) {
            i->setSrc(s, pOne);
            if (imm->reg.data.u64 == 0)
               i->src(s).mod = i->src(s).mod ^ Modifier(NV50_IR_MOD_NOT);
         } else if (imm->reg.data.u64 == 0) {
            i->setSrc(s, rZero);
         }
      }
   }
}

// replace CONT with BRA for single unconditional continue
bool
NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
{
   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
      return false;
   Graph::EdgeIterator ei = bb->cfg.incident();
   if (ei.getType() != Graph::Edge::BACK)
      ei.next();
   if (ei.getType() != Graph::Edge::BACK)
      return false;
   BasicBlock *contBB = BasicBlock::get(ei.getNode());

   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
       contBB->getExit()->getPredicate())
      return false;
   contBB->getExit()->op = OP_BRA;
   bb->remove(bb->getEntry()); // delete PRECONT

   ei.next();
   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
   return true;
}

// replace branches to join blocks with join ops
void
NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
{
   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
      return;
   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      BasicBlock *in = BasicBlock::get(ei.getNode());
      Instruction *exit = in->getExit();
      if (!exit) {
         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
         // there should always be a terminator instruction
         WARN("inserted missing terminator in BB:%i\n", in->getId());
      } else
      if (exit->op == OP_BRA) {
         exit->op = OP_JOIN;
         exit->asFlow()->limit = 1; // must-not-propagate marker
      }
   }
   bb->remove(bb->getEntry());
}

// replaces instructions which would end up as f2f or i2i with faster
// alternatives:
//  - fabs(a)     -> fadd(0, abs a)
//  - fneg(a)     -> fadd(neg 0, neg a)
//  - ineg(a)     -> iadd(0, neg a)
//  - fneg(abs a) -> fadd(neg 0, neg abs a)
//  - sat(a)      -> sat add(0, a)
void
NVC0LegalizePostRA::replaceCvt(Instruction *cvt)
{
   if (!isFloatType(cvt->sType) && typeSizeof(cvt->sType) != 4)
      return;
   if (cvt->sType != cvt->dType)
      return;
   // we could make it work, but in this case we have optimizations disabled
   // and we don't really care either way.
   if (cvt->src(0).getFile() != FILE_GPR &&
       cvt->src(0).getFile() != FILE_MEMORY_CONST)
      return;

   Modifier mod0, mod1;

   switch (cvt->op) {
   case OP_ABS:
      if (cvt->src(0).mod)
         return;
      if (!isFloatType(cvt->sType))
         return;
      mod0 = 0;
      mod1 = NV50_IR_MOD_ABS;
      break;
   case OP_NEG:
      if (!isFloatType(cvt->sType) && cvt->src(0).mod)
         return;
      if (isFloatType(cvt->sType) &&
          (cvt->src(0).mod && cvt->src(0).mod != Modifier(NV50_IR_MOD_ABS)))
         return;

      mod0 = isFloatType(cvt->sType) ? NV50_IR_MOD_NEG : 0;
      mod1 = cvt->src(0).mod == Modifier(NV50_IR_MOD_ABS) ?
         NV50_IR_MOD_NEG_ABS : NV50_IR_MOD_NEG;
      break;
   case OP_SAT:
      if (!isFloatType(cvt->sType) && cvt->src(0).mod.abs())
         return;
      mod0 = 0;
      mod1 = cvt->src(0).mod;
      cvt->saturate = true;
      break;
   default:
      return;
   }

   cvt->op = OP_ADD;
   cvt->moveSources(0, 1);
   cvt->setSrc(0, rZero);
   cvt->src(0).mod = mod0;
   cvt->src(1).mod = mod1;
}

bool
NVC0LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->op == OP_EMIT || i->op == OP_RESTART) {
         if (!i->getDef(0)->refCount())
            i->setDef(0, NULL);
         if (i->src(0).getFile() == FILE_IMMEDIATE)
            i->setSrc(0, rZero); // initial value must be 0
         replaceZero(i);
      } else
      if (i->isNop()) {
         bb->remove(i);
      } else
      if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
          prog->getType() != Program::TYPE_COMPUTE) {
         // It seems like barriers are never required for tessellation since
         // the warp size is 32, and there are always at most 32 tcs threads.
         bb->remove(i);
      } else
      if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) {
         int offset = i->src(0).get()->reg.data.offset;
         if (abs(offset) >= 0x10000)
            i->src(0).get()->reg.fileIndex += offset >> 16;
         i->src(0).get()->reg.data.offset = (int)(short)offset;
      } else {
         // TODO: Move this to before register allocation for operations that
         // need the $c register !
         if (typeSizeof(i->sType) == 8 || typeSizeof(i->dType) == 8) {
            Instruction *hi;
            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
            if (hi)
               next = hi;
         }

         if (i->op == OP_SAT || i->op == OP_NEG || i->op == OP_ABS)
            replaceCvt(i);

         if (i->op != OP_MOV && i->op != OP_PFETCH)
            replaceZero(i);
      }
   }
   if (!bb->getEntry())
      return true;

   if (!tryReplaceContWithBra(bb))
      propagateJoin(bb);

   return true;
}

NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
{
   bld.setProgram(prog);
}

bool
NVC0LoweringPass::visit(Function *fn)
{
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      assert(!strncmp(fn->getName(), "MAIN", 4));
      // TODO: when we generate actual functions pass this value along somehow
      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
      if (fn->cfgExit) {
         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
         bld.mkMovToReg(0, gpEmitAddress);
      }
   }
   return true;
}

bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}

inline Value *
NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   uint32_t off = prog->driver->io.texBindBase + slot * 4;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(2));

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

// move array source to first slot, convert to u16, add indirections
bool
NVC0LoweringPass::handleTEX(TexInstruction *i)
{
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int arg = i->tex.target.getArgCount();
   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
   const int chipset = prog->getTarget()->getChipset();

   /* Only normalize in the non-explicit derivatives case. For explicit
    * derivatives, this is handled in handleManualTXD.
    */
   if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
      Value *src[3], *val;
      int c;
      for (c = 0; c < 3; ++c)
         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
      val = bld.getScratch();
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
      for (c = 0; c < 3; ++c) {
         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
                                 i->getSrc(c), val));
      }
   }

   // Arguments to the TEX instruction are a little insane. Even though the
   // encoding is identical between SM20 and SM30, the arguments mean
   // different things between Fermi and Kepler+. A lot of arguments are
   // optional based on flags passed to the instruction. This summarizes the
   // order of things.
   //
   // Fermi:
   //  array/indirect
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets:
   //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
   //    - other: 4 bits each, single reg
   //
   // Kepler+:
   //  indirect handle
   //  array (+ offsets for txd in upper 16 bits)
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets (same as fermi, except txd which takes it with array)
   //
   // Maxwell (tex):
   //  array
   //  coords
   //  indirect handle
   //  sample
   //  lod bias
   //  depth compare
   //  offsets
   //
   // Maxwell (txd):
   //  indirect handle
   //  coords
   //  array + offsets
   //  derivatives

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
         // XXX this ignores tsc, and assumes a 1:1 mapping
         assert(i->tex.rIndirectSrc >= 0);
         if (!i->tex.bindless) {
            Value *hnd = loadTexHandle(i->getIndirectR(), i->tex.r);
            i->tex.r = 0xff;
            i->tex.s = 0x1f;
            i->setIndirectR(hnd);
         }
         i->setIndirectS(NULL);
      } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
         if (i->tex.r == 0xffff)
            i->tex.r = prog->driver->io.fbtexBindBase / 4;
         else
            i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s = 0; // only a single cX[] value possible here
      } else {
         Value *hnd = bld.getScratch();
         Value *rHnd = loadTexHandle(NULL, i->tex.r);
         Value *sHnd = loadTexHandle(NULL, i->tex.s);

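         // The INSBF immediate appears to encode (width << 8) | offset, so
         // 0x1400 takes the low 20 bits (the TIC index) from the texture
         // handle and merges them into the sampler handle's word.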
         bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);

         i->tex.r = 0; // not used for indirect tex
         i->tex.s = 0;
         i->setIndirectR(hnd);
      }
      if (i->tex.target.isArray()) {
         LValue *layer = new_LValue(func, FILE_GPR);
         Value *src = i->getSrc(lyr);
         const int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
         if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
            for (int s = dim; s >= 1; --s)
               i->setSrc(s, i->getSrc(s - 1));
            i->setSrc(0, layer);
         } else {
            i->setSrc(dim, layer);
         }
      }
      // Move the indirect reference to the first place
      if (i->tex.rIndirectSrc >= 0 && (
             i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(0, 1);
         i->setSrc(0, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
      // Move the indirect reference to right after the coords
      else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(arg, 1);
         i->setSrc(arg, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
   } else
   // (nvc0) generate and move the tsc/tic/array source to the front
   if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      Value *ticRel = i->getIndirectR();
      Value *tscRel = i->getIndirectS();

      if (i->tex.r == 0xffff) {
         i->tex.r = 0x20;
         i->tex.s = 0x10;
      }

      if (ticRel) {
         i->setSrc(i->tex.rIndirectSrc, NULL);
         if (i->tex.r)
            ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                ticRel, bld.mkImm(i->tex.r));
      }
      if (tscRel) {
         i->setSrc(i->tex.sIndirectSrc, NULL);
         if (i->tex.s)
            tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                tscRel, bld.mkImm(i->tex.s));
      }

      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
      if (arrayIndex) {
         for (int s = dim; s >= 1; --s)
            i->setSrc(s, i->getSrc(s - 1));
         i->setSrc(0, arrayIndex);
      } else {
         i->moveSources(0, 1);
      }

      if (arrayIndex) {
         int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
      } else {
         bld.loadImm(src, 0);
      }

      if (ticRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
      if (tscRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);

      i->setSrc(0, src);
   }

   // For nvc0, the sample id has to be in the second operand, as the offset
   // does. Right now we don't know how to pass both in, and this case can't
   // happen with OpenGL. On nve0, the sample id is part of the texture
   // coordinate argument.
   assert(chipset >= NVISA_GK104_CHIPSET ||
          !i->tex.useOffsets || !i->tex.target.isMS());

   // offset is between lod and dc
   if (i->tex.useOffsets) {
      int n, c;
      int s = i->srcCount(0xff, true);
      if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
         if (i->tex.target.isShadow())
            s--;
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 1);
         if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
            i->moveSources(s + 1, 1);
      }
      if (i->op == OP_TXG) {
         // Either there is 1 offset, which goes into the 2 low bytes of the
         // first source, or there are 4 offsets, which go into 2 sources (8
         // values, 1 byte each).
         Value *offs[2] = {NULL, NULL};
         for (n = 0; n < i->tex.useOffsets; n++) {
            for (c = 0; c < 2; ++c) {
               if ((n % 2) == 0 && c == 0)
                  bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get());
               else
                  bld.mkOp3(OP_INSBF, TYPE_U32,
                            offs[n / 2],
                            i->offset[n][c].get(),
                            bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
                            offs[n / 2]);
            }
         }
         i->setSrc(s, offs[0]);
         if (offs[1])
            i->setSrc(s + 1, offs[1]);
      } else {
         unsigned imm = 0;
         assert(i->tex.useOffsets == 1);
         for (c = 0; c < 3; ++c) {
            ImmediateValue val;
            if (!i->offset[0][c].getImmediate(val))
               assert(!"non-immediate offset passed to non-TXG");
            imm |= (val.reg.data.u32 & 0xf) << (c * 4);
         }
         if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
            // The offset goes into the upper 16 bits of the array index. So
            // create it if it's not already there, and INSBF it if it already
            // is.
            s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
            if (chipset >= NVISA_GM107_CHIPSET)
               s += dim;
            if (i->tex.target.isArray()) {
               Value *offset = bld.getScratch();
               bld.mkOp3(OP_INSBF, TYPE_U32, offset,
                         bld.loadImm(NULL, imm), bld.mkImm(0xc10),
                         i->getSrc(s));
               i->setSrc(s, offset);
            } else {
               i->moveSources(s, 1);
               i->setSrc(s, bld.loadImm(NULL, imm << 16));
            }
         } else {
            i->setSrc(s, bld.loadImm(NULL, imm));
         }
      }
   }

   return true;
}

bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   // Always done from the l0 perspective. This is the way that NVIDIA's
   // driver does it, and doing it from the "current" lane's perspective
   // doesn't seem to always work for reasons that aren't altogether clear,
   // even in frag shaders.
   //
   // Note that we must move not only the coordinates into lane0, but also all
   // ancillary arguments, like array indices and depth compare as they may
   // differ between lanes. Offsets for TXD are supposed to be uniform, so we
   // leave them alone.
   static const uint8_t qOps[2] =
      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) };
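   // Per the QUADOP table at the top of the file, qOps[0] (0xcc) applies ADD
   // in the right-hand lanes to offset a coordinate by dPdx, and qOps[1]
   // (0xf0) does the same for dPdy in the bottom lanes; the MOV2 slots leave
   // the remaining lanes' values effectively unchanged.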

   Value *def[4][4];
   Value *crd[3], *arr[2], *shadow;
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();

   // This function is invoked after handleTEX lowering, so we have to expect
   // the arguments in the order that the hw wants them. For Fermi, array and
   // indirect are both in the leading arg, while for Kepler, array and
   // indirect are separate (and both precede the coordinates). Maxwell is
   // handled in a separate function.
   int array;
   if (targ->getChipset() < NVISA_GK104_CHIPSET)
      array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
   else
      array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();
   for (c = 0; c < array; ++c)
      arr[c] = bld.getScratch();
   shadow = bld.getScratch();

   for (l = 0; l < 4; ++l) {
      Value *src[3], *val;

      bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
      // we're using the texture result from lane 0 in all cases, so make sure
      // that lane 0 is pointing at the proper array index, indirect value,
      // and depth compare.
      if (l != 0) {
         for (c = 0; c < array; ++c)
            bld.mkQuadop(0x00, arr[c], l, i->getSrc(c), zero);
         if (i->tex.target.isShadow()) {
            // The next argument after coords is the depth compare
            bld.mkQuadop(0x00, shadow, l, i->getSrc(array + dim), zero);
         }
      }
      // mov position coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // normalize cube coordinates
      if (i->tex.target.isCube()) {
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
         val = bld.getScratch();
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
      } else {
         for (c = 0; c < dim; ++c)
            src[c] = crd[c];
      }
      // texture
      bld.insert(tex = cloneForward(func, i));
      if (l != 0) {
         for (c = 0; c < array; ++c)
            tex->setSrc(c, arr[c]);
         if (i->tex.target.isShadow())
            tex->setSrc(array + dim, shadow);
      }
      for (c = 0; c < dim; ++c)
         tex->setSrc(c + array, src[c]);
      // broadcast results from lane 0 to all lanes so that the moves *into*
      // the target lane pick up the proper value.
      if (l != 0)
         for (c = 0; i->defExists(c); ++c)
            bld.mkQuadop(0x00, tex->getDef(c), 0, tex->getDef(c), zero);
      bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }

   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}

bool
NVC0LoweringPass::handleTXD(TexInstruction *txd)
{
   int dim = txd->tex.target.getDim() + txd->tex.target.isCube();
   unsigned arg = txd->tex.target.getArgCount();
   unsigned expected_args = arg;
   const int chipset = prog->getTarget()->getChipset();

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (!txd->tex.target.isArray() && txd->tex.useOffsets)
         expected_args++;
      if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
         expected_args++;
   } else {
      if (txd->tex.useOffsets)
         expected_args++;
      if (!txd->tex.target.isArray() && (
             txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))
         expected_args++;
   }

   if (expected_args > 4 ||
       dim > 2 ||
       txd->tex.target.isShadow())
      txd->op = OP_TEX;

   handleTEX(txd);
   while (txd->srcExists(arg))
      ++arg;

   txd->tex.derivAll = true;
   if (txd->op == OP_TEX)
      return handleManualTXD(txd);

   assert(arg == expected_args);
   for (int c = 0; c < dim; ++c) {
      txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
      txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
      txd->dPdx[c].set(NULL);
      txd->dPdy[c].set(NULL);
   }

   // In this case we have fewer than 4 "real" arguments, which means that
   // handleTEX didn't apply any padding. However we have to make sure that
   // the second "group" of arguments still gets padded up to 4.
   if (chipset >= NVISA_GK104_CHIPSET) {
      int s = arg + 2 * dim;
      if (s >= 4 && s < 7) {
         if (txd->srcExists(s)) // move potential predicate out of the way
            txd->moveSources(s, 7 - s);
         while (s < 7)
            txd->setSrc(s++, bld.loadImm(NULL, 0));
      }
   }

   return true;
}

bool
NVC0LoweringPass::handleTXQ(TexInstruction *txq)
{
   const int chipset = prog->getTarget()->getChipset();
   if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)
      txq->tex.r += prog->driver->io.texBindBase / 4;

   if (txq->tex.rIndirectSrc < 0)
      return true;

   Value *ticRel = txq->getIndirectR();

   txq->setIndirectS(NULL);
   txq->tex.sIndirectSrc = -1;

   assert(ticRel);

   if (chipset < NVISA_GK104_CHIPSET) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      txq->setSrc(txq->tex.rIndirectSrc, NULL);
      if (txq->tex.r)
         ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                             ticRel, bld.mkImm(txq->tex.r));

      bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));

      txq->moveSources(0, 1);
      txq->setSrc(0, src);
   } else {
      Value *hnd = loadTexHandle(txq->getIndirectR(), txq->tex.r);
      txq->tex.r = 0xff;
      txq->tex.s = 0x1f;

      txq->setIndirectR(NULL);
      txq->moveSources(0, 1);
      txq->setSrc(0, hnd);
      txq->tex.rIndirectSrc = 0;
   }

   return true;
}

bool
NVC0LoweringPass::handleTXLQ(TexInstruction *i)
{
   /* The outputs are inverted compared to what the TGSI instruction
    * expects. Take that into account in the mask.
    */
   assert((i->tex.mask & ~3) == 0);
   if (i->tex.mask == 1)
      i->tex.mask = 2;
   else if (i->tex.mask == 2)
      i->tex.mask = 1;
   handleTEX(i);
   bld.setPosition(i, true);

   /* The returned values are not quite what we want:
    * (a) convert from s16/u16 to f32
    * (b) multiply by 1/256
    */
   for (int def = 0; def < 2; ++def) {
      if (!i->defExists(def))
         continue;
      enum DataType type = TYPE_S16;
      if (i->tex.mask == 2 || def > 0)
         type = TYPE_U16;
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));
      bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
                i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
   }
   if (i->tex.mask == 3) {
      LValue *t = new_LValue(func, FILE_GPR);
      bld.mkMov(t, i->getDef(0));
      bld.mkMov(i->getDef(0), i->getDef(1));
      bld.mkMov(i->getDef(1), t);
   }
   return true;
}

bool
NVC0LoweringPass::handleBUFQ(Instruction *bufq)
{
   bufq->op = OP_MOV;
   bufq->setSrc(0, loadBufLength32(bufq->getIndirect(0, 1),
                                   bufq->getSrc(0)->reg.fileIndex * 16));
   bufq->setIndirect(0, 0, NULL);
   bufq->setIndirect(0, 1, NULL);
   return true;
}

void
NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)
{
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);

   BasicBlock *currBB = atom->bb;
   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
   BasicBlock *joinBB = atom->bb->splitAfter(atom);
   BasicBlock *setAndUnlockBB = new BasicBlock(func);
   BasicBlock *failLockBB = new BasicBlock(func);
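
   // Rough shape of the CFG built below:
   //
   //   currBB -> tryLockBB --(locked)--> setAndUnlockBB
   //                ^  \                      /
   //                 \  `-(not locked)-> failLockBB --(stored)--> joinBB
   //                  \______(retry)_________/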

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   CmpInstruction *pred =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0), bld.mkImm(1));

   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);

   bld.setPosition(tryLockBB, true);

   Instruction *ld =
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
                 atom->getIndirect(0, 0));
   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;

   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));
   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);

   tryLockBB->cfg.detach(&joinBB->cfg);
   bld.remove(atom);

   bld.setPosition(setAndUnlockBB, true);
   Value *stVal;
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
      // Read the old value, and write the new one.
      stVal = atom->getSrc(1);
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      CmpInstruction *set =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));

      bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),
                TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));
   } else {
      operation op;

      switch (atom->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         op = OP_ADD;
         break;
      case NV50_IR_SUBOP_ATOM_AND:
         op = OP_AND;
         break;
      case NV50_IR_SUBOP_ATOM_OR:
         op = OP_OR;
         break;
      case NV50_IR_SUBOP_ATOM_XOR:
         op = OP_XOR;
         break;
      case NV50_IR_SUBOP_ATOM_MIN:
         op = OP_MIN;
         break;
      case NV50_IR_SUBOP_ATOM_MAX:
         op = OP_MAX;
         break;
      default:
         assert(0);
         return;
      }

      stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),
                         atom->getSrc(1));
   }

   Instruction *st =
      bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
                  atom->getIndirect(0, 0), stVal);
   st->setDef(0, pred->getDef(0));
   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;

   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);

   // Retry the lock until the store has actually been performed.
   bld.setPosition(failLockBB, true);
   bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);

   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
}

void
NVC0LoweringPass::handleSharedATOM(Instruction *atom)
{
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);

   BasicBlock *currBB = atom->bb;
   BasicBlock *tryLockAndSetBB = atom->bb->splitBefore(atom, false);
   BasicBlock *joinBB = atom->bb->splitAfter(atom);

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_ALWAYS, NULL);
   currBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::TREE);

   bld.setPosition(tryLockAndSetBB, true);

   Instruction *ld =
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
                 atom->getIndirect(0, 0));
   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;

   Value *stVal;
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
      // Read the old value, and write the new one.
      stVal = atom->getSrc(1);
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      CmpInstruction *set =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));
      set->setPredicate(CC_P, ld->getDef(1));

      Instruction *selp =
         bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), ld->getDef(0),
                   atom->getSrc(2), set->getDef(0));
      selp->src(2).mod = Modifier(NV50_IR_MOD_NOT);
      selp->setPredicate(CC_P, ld->getDef(1));

      stVal = selp->getDef(0);
   } else {
      operation op;

      switch (atom->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         op = OP_ADD;
         break;
      case NV50_IR_SUBOP_ATOM_AND:
         op = OP_AND;
         break;
      case NV50_IR_SUBOP_ATOM_OR:
         op = OP_OR;
         break;
      case NV50_IR_SUBOP_ATOM_XOR:
         op = OP_XOR;
         break;
      case NV50_IR_SUBOP_ATOM_MIN:
         op = OP_MIN;
         break;
      case NV50_IR_SUBOP_ATOM_MAX:
         op = OP_MAX;
         break;
      default:
         assert(0);
         return;
      }

      Instruction *i =
         bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
                   atom->getSrc(1));
      i->setPredicate(CC_P, ld->getDef(1));

      stVal = i->getDef(0);
   }

   Instruction *st =
      bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
                  atom->getIndirect(0, 0), stVal);
   st->setPredicate(CC_P, ld->getDef(1));
   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;

   // Loop until the lock is acquired.
   bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_NOT_P, ld->getDef(1));
   tryLockAndSetBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::BACK);
   tryLockAndSetBB->cfg.attach(&joinBB->cfg, Graph::Edge::CROSS);
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);

   bld.remove(atom);

   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
}

bool
NVC0LoweringPass::handleATOM(Instruction *atom)
{
   SVSemantic sv;
   Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base;

   switch (atom->src(0).getFile()) {
   case FILE_MEMORY_LOCAL:
      sv = SV_LBASE;
      break;
   case FILE_MEMORY_SHARED:
      // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
      // operations on shared memory. For Maxwell, ATOMS is enough.
      if (targ->getChipset() < NVISA_GK104_CHIPSET)
         handleSharedATOM(atom);
      else if (targ->getChipset() < NVISA_GM107_CHIPSET)
         handleSharedATOMNVE4(atom);
      return true;
   default:
      assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER);
      base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
      assert(base->reg.size == 8);
      if (ptr)
         base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
      assert(base->reg.size == 8);
      atom->setIndirect(0, 0, base);
      atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;

      // Harden against out-of-bounds accesses
      Value *offset = bld.loadImm(NULL, atom->getSrc(0)->reg.data.offset + typeSizeof(atom->sType));
      Value *length = loadBufLength32(ind, atom->getSrc(0)->reg.fileIndex * 16);
      Value *pred = new_LValue(func, FILE_PREDICATE);
      if (ptr)
         bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, ptr);
      bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
      atom->setPredicate(CC_NOT_P, pred);
      if (atom->defExists(0)) {
         Value *zero, *dst = atom->getDef(0);
         atom->setDef(0, bld.getSSA());

         bld.setPosition(atom, true);
         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
            ->setPredicate(CC_P, pred);
         bld.mkOp2(OP_UNION, TYPE_U32, dst, atom->getDef(0), zero);
      }

      return true;
   }
   base =
      bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));

   atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
   atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
   if (ptr)
      base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
   atom->setIndirect(0, 1, NULL);
   atom->setIndirect(0, 0, base);

   return true;
}

bool
NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
{
   if (targ->getChipset() < NVISA_GM107_CHIPSET) {
      if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
         // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
         return false;
      }
   }

   if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
       cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
      return false;
   bld.setPosition(cas, true);

   if (needCctl) {
      Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
      cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
      cctl->fixed = 1;
      cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
      if (cas->isPredicated())
         cctl->setPredicate(cas->cc, cas->getPredicate());
   }

   if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      // CAS is crazy. Its 2nd source is a double reg, and the 3rd source
      // should be set to the high part of the double reg or bad things will
      // happen elsewhere in the universe.
      // Also, it sometimes returns the new value instead of the old one
      // under mysterious circumstances.
      Value *dreg = bld.getSSA(8);
      bld.setPosition(cas, false);
      bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
      cas->setSrc(1, dreg);
      cas->setSrc(2, dreg);
   }

   return true;
}

inline Value *
NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   off += base;

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

inline Value *
NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   off += base;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));

   return bld.
      mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);
}

inline Value *
NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   off += base;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
}

inline Value *
NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
{
   return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
}

inline Value *
NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
{
   return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
}

inline Value *
NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
{
   return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
}

inline Value *
NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
{
   return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
}

inline Value *
NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
{
   uint8_t b = prog->driver->io.msInfoCBSlot;
   off += prog->driver->io.msInfoBase;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

inline Value *
NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless)
{
   uint32_t base = slot * NVC0_SU_INFO__STRIDE;
1804
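// Indirect access: add the slot bias, mask to the valid range (8 bound
// slots, or 512 bindless handles), and shift by 6 to index the 64-byte
// info record for that slot.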
1805 if (ptr) {
1806 ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));
1807 if (bindless)
1808 ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(511));
1809 else
1810 ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
1811 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6));
1812 base = 0;
1813 }
1814 off += base;
1815
1816 return loadResInfo32(ptr, off, bindless ? prog->driver->io.bindlessBase :
1817 prog->driver->io.suInfoBase);
1818 }
1819
1820 Value *
1821 NVC0LoweringPass::loadMsAdjInfo32(TexInstruction::Target target, uint32_t index, int slot, Value *ind, bool bindless)
1822 {
1823 if (!bindless || targ->getChipset() < NVISA_GM107_CHIPSET)
1824 return loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(index), bindless);
1825
1826 assert(bindless);
1827
1828 Value *samples = bld.getSSA();
1829 // This TXQ won't be lowered by this pass, since it's inserted before the current instruction.
1830 TexInstruction *tex = new_TexInstruction(func, OP_TXQ);
1831 tex->tex.target = target;
1832 tex->tex.query = TXQ_TYPE;
1833 tex->tex.mask = 0x4;
1834 tex->tex.r = 0xff;
1835 tex->tex.s = 0x1f;
1836 tex->tex.rIndirectSrc = 0;
1837 tex->setDef(0, samples);
1838 tex->setSrc(0, ind);
1839 tex->setSrc(1, bld.loadImm(NULL, 0));
1840 bld.insert(tex);
1841
1842 // Doesn't work with sample counts other than 1/2/4/8, but those aren't supported anyway.
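// MS(0)/MS(1) are the per-axis log2 sample spreads; samples -> (x, y) maps
// as 1 -> (0, 0), 2 -> (1, 0), 4 -> (1, 1), 8 -> (2, 1), which the two
// cases below recompute from the TXQ sample count.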
1843 switch (index) {
1844 case 0: {
1845 Value *tmp = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), samples, bld.mkImm(2));
1846 return bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(2));
1847 }
1848 case 1: {
1849 Value *tmp = bld.mkCmp(OP_SET, CC_GT, TYPE_U32, bld.getSSA(), TYPE_U32, samples, bld.mkImm(2))->getDef(0);
1850 return bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(1));
1851 }
1852 default: {
1853 assert(false);
1854 return NULL;
1855 }
1856 }
1857 }
1858
1859 static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
1860 {
1861 switch (su->tex.target.getEnum()) {
1862 case TEX_TARGET_BUFFER: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
1863 case TEX_TARGET_RECT: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1864 case TEX_TARGET_1D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1865 case TEX_TARGET_1D_ARRAY: return (c == 1) ?
1866 NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
1867 NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1868 case TEX_TARGET_2D: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1869 case TEX_TARGET_2D_MS: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1870 case TEX_TARGET_2D_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1871 case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1872 case TEX_TARGET_3D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1873 case TEX_TARGET_CUBE: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1874 case TEX_TARGET_CUBE_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1875 default:
1876 assert(0);
1877 return 0;
1878 }
1879 }
1880
1881 bool
1882 NVC0LoweringPass::handleSUQ(TexInstruction *suq)
1883 {
1884 int mask = suq->tex.mask;
1885 int dim = suq->tex.target.getDim();
1886 int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
1887 Value *ind = suq->getIndirectR();
1888 int slot = suq->tex.r;
1889 int c, d;
1890
1891 for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
1892 if (c >= arg || !(mask & 1))
1893 continue;
1894
1895 int offset;
1896
1897 if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
1898 offset = NVC0_SU_INFO_SIZE(2);
1899 } else {
1900 offset = NVC0_SU_INFO_SIZE(c);
1901 }
1902 bld.mkMov(suq->getDef(d++), loadSuInfo32(ind, slot, offset, suq->tex.bindless));
1903 if (c == 2 && suq->tex.target.isCube())
1904 bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
1905 bld.loadImm(NULL, 6));
1906 }
1907
1908 if (mask & 1) {
1909 if (suq->tex.target.isMS()) {
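// The MS info values are the per-axis log2 sample spreads, so the total
// sample count is 1 << (ms_x + ms_y).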
1910 Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), suq->tex.bindless);
1911 Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), suq->tex.bindless);
1912 Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
1913 bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
1914 } else {
1915 bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
1916 }
1917 }
1918
1919 bld.remove(suq);
1920 return true;
1921 }
1922
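// Rewrites an MS surface/texture access into a non-MS one: scale x/y by the
// per-axis log2 sample spreads, add the sample's offset from the MS info
// table, and drop the sample-index source.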
1923 void
1924 NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
1925 {
1926 const int arg = tex->tex.target.getArgCount();
1927 int slot = tex->tex.r;
1928
1929 if (tex->tex.target == TEX_TARGET_2D_MS)
1930 tex->tex.target = TEX_TARGET_2D;
1931 else
1932 if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
1933 tex->tex.target = TEX_TARGET_2D_ARRAY;
1934 else
1935 return;
1936
1937 Value *x = tex->getSrc(0);
1938 Value *y = tex->getSrc(1);
1939 Value *s = tex->getSrc(arg - 1);
1940
1941 Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
1942 Value *ind = tex->getIndirectR();
1943
1944 Value *ms_x = loadMsAdjInfo32(tex->tex.target, 0, slot, ind, tex->tex.bindless);
1945 Value *ms_y = loadMsAdjInfo32(tex->tex.target, 1, slot, ind, tex->tex.bindless);
1946
1947 bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
1948 bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
1949
1950 s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
1951 s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));
1952
1953 Value *dx = loadMsInfo32(ts, 0x0);
1954 Value *dy = loadMsInfo32(ts, 0x4);
1955
1956 bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
1957 bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
1958
1959 tex->setSrc(0, tx);
1960 tex->setSrc(1, ty);
1961 tex->moveSources(arg, -1);
1962 }
1963
1964 // Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
1965 // They're computed from the coordinates using the surface info in c[] space.
1966 void
1967 NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
1968 {
1969 Instruction *insn;
1970 const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
1971 const bool raw =
1972 su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
1973 const int slot = su->tex.r;
1974 const int dim = su->tex.target.getDim();
1975 const bool array = su->tex.target.isArray() || su->tex.target.isCube();
1976 const int arg = dim + array;
1977 int c;
1978 Value *zero = bld.mkImm(0);
1979 Value *p1 = NULL;
1980 Value *v;
1981 Value *src[3];
1982 Value *bf, *eau, *off;
1983 Value *addr, *pred;
1984 Value *ind = su->getIndirectR();
1985 Value *y, *z;
1986
1987 off = bld.getScratch(4);
1988 bf = bld.getScratch(4);
1989 addr = bld.getSSA(8);
1990 pred = bld.getScratch(1, FILE_PREDICATE);
1991
1992 bld.setPosition(su, false);
1993
1994 adjustCoordinatesMS(su);
1995
1996 // calculate clamped coordinates
1997 for (c = 0; c < arg; ++c) {
1998 int dimc = c;
1999
2000 if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) {
2001 // The array index is stored in the Z component for 1D arrays.
2002 dimc = 2;
2003 }
2004
2005 src[c] = bld.getScratch();
2006 if (c == 0 && raw)
2007 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X, su->tex.bindless);
2008 else
2009 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc), su->tex.bindless);
2010 bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
2011 ->subOp = getSuClampSubOp(su, dimc);
2012 }
2013 for (; c < 3; ++c)
2014 src[c] = zero;
2015
2016 if (dim == 2 && !array) {
2017 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
2018 src[2] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(),
2019 v, bld.loadImm(NULL, 16));
2020
2021 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(2), su->tex.bindless);
2022 bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[2], src[2], v, zero)
2023 ->subOp = NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
2024 }
2025
2026 // set predicate output
2027 if (su->tex.target == TEX_TARGET_BUFFER) {
2028 src[0]->getInsn()->setFlagsDef(1, pred);
2029 } else
2030 if (array) {
2031 p1 = bld.getSSA(1, FILE_PREDICATE);
2032 src[dim]->getInsn()->setFlagsDef(1, p1);
2033 }
2034
2035 // calculate pixel offset
2036 if (dim == 1) {
2037 y = z = zero;
2038 if (su->tex.target != TEX_TARGET_BUFFER)
2039 bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
2040 } else {
2041 y = src[1];
2042 z = src[2];
2043
2044 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless);
2045 bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
2046 ->subOp = NV50_IR_SUBOP_MADSP(4,4,8); // u16l u16l u16l
2047
2048 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless);
2049 bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
2050 ->subOp = array ?
2051 NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
2052 }
2053
2054 // calculate effective address part 1
2055 if (su->tex.target == TEX_TARGET_BUFFER) {
2056 if (raw) {
2057 bf = src[0];
2058 } else {
2059 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);
2060 bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
2061 ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
2062 }
2063 } else {
2064 uint16_t subOp = 0;
2065
2066 switch (dim) {
2067 case 1:
2068 break;
2069 case 2:
2070 if (array) {
2071 z = off;
2072 } else {
2073 subOp = NV50_IR_SUBOP_SUBFM_3D;
2074 }
2075 break;
2076 default:
2077 subOp = NV50_IR_SUBOP_SUBFM_3D;
2078 assert(dim == 3);
2079 break;
2080 }
2081 insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
2082 insn->subOp = subOp;
2083 insn->setFlagsDef(1, pred);
2084 }
2085
2086 // part 2
2087 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless);
2088
2089 if (su->tex.target == TEX_TARGET_BUFFER) {
2090 eau = v;
2091 } else {
2092 eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
2093 }
2094 // add array layer offset
2095 if (array) {
2096 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
2097 if (dim == 1)
2098 bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
2099 ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
2100 else
2101 bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
2102 ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
2103 // combine predicates
2104 assert(p1);
2105 bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
2106 }
2107
2108 if (atom) {
2109 Value *lo = bf;
2110 if (su->tex.target == TEX_TARGET_BUFFER) {
2111 lo = zero;
2112 bld.mkMov(off, bf);
2113 }
2114 // bf == g[] address & 0xff
2115 // eau == g[] address >> 8
2116 bld.mkOp3(OP_PERMT, TYPE_U32, bf, lo, bld.loadImm(NULL, 0x6540), eau);
2117 bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
2118 } else
2119 if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
2120 // Convert from the u32 to the u8 address format, which is what the
2121 // library code implementing SULDP currently uses.
2122 // XXX: can SUEAU do this?
2123 // XXX: does it matter that we don't mask the high bytes in bf?
2124 // Grrr.
2125 bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
2126 bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
2127 }
2128
2129 bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);
2130
2131 if (atom && su->tex.target == TEX_TARGET_BUFFER)
2132 bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);
2133
2134 // let's just set it to 0 for raw access and hope it works
2135 v = raw ?
2136 bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT, su->tex.bindless);
2137
2138 // get rid of old coordinate sources, make space for fmt info and predicate
2139 su->moveSources(arg, 3 - arg);
2140 // set 64-bit address and 32-bit format sources
2141 su->setSrc(0, addr);
2142 su->setSrc(1, v);
2143 su->setSrc(2, pred);
2144 su->setIndirectR(NULL);
2145
2146 // prevent read fault when the image is not actually bound
2147 CmpInstruction *pred1 =
2148 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2149 TYPE_U32, bld.mkImm(0),
2150 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2151
2152 if (su->op != OP_SUSTP && su->tex.format) {
2153 const TexInstruction::ImgFormatDesc *format = su->tex.format;
2154 int blockwidth = format->bits[0] + format->bits[1] +
2155 format->bits[2] + format->bits[3];
2156
2157 // make sure that the format doesn't mismatch
2158 assert(format->components != 0);
2159 bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0),
2160 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2161 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2162 pred1->getDef(0));
2163 }
2164 su->setPredicate(CC_NOT_P, pred1->getDef(0));
2165
2166 // TODO: initialize def values to 0 when the surface operation is not
2167 // performed (not needed for stores). Also, fix the "address bounds test"
2168 // subtests from arb_shader_image_load_store-invalid for buffers, because it
2169 // seems like the predicate is not correctly set by suclamp.
2170 }
2171
2172 static DataType
2173 getSrcType(const TexInstruction::ImgFormatDesc *t, int c)
2174 {
2175 switch (t->type) {
2176 case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
2177 case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
2178 case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
2179 case UINT:
2180 return (t->bits[c] == 8 ? TYPE_U8 :
2181 (t->bits[c] == 16 ? TYPE_U16 : TYPE_U32));
2182 case SINT:
2183 return (t->bits[c] == 8 ? TYPE_S8 :
2184 (t->bits[c] == 16 ? TYPE_S16 : TYPE_S32));
2185 }
2186 return TYPE_NONE;
2187 }
2188
2189 static DataType
2190 getDestType(const ImgType type) {
2191 switch (type) {
2192 case FLOAT:
2193 case UNORM:
2194 case SNORM:
2195 return TYPE_F32;
2196 case UINT:
2197 return TYPE_U32;
2198 case SINT:
2199 return TYPE_S32;
2200 default:
2201 assert(!"Impossible type");
2202 return TYPE_NONE;
2203 }
2204 }
2205
2206 void
2207 NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
2208 {
2209 const TexInstruction::ImgFormatDesc *format = su->tex.format;
2210 int width = format->bits[0] + format->bits[1] +
2211 format->bits[2] + format->bits[3];
2212 Value *untypedDst[4] = {};
2213 Value *typedDst[4] = {};
2214
2215 // We must convert this to a generic load.
2216 su->op = OP_SULDB;
2217
2218 su->dType = typeOfSize(width / 8);
2219 su->sType = TYPE_U8;
2220
2221 for (int i = 0; i < width / 32; i++)
2222 untypedDst[i] = bld.getSSA();
2223 if (width < 32)
2224 untypedDst[0] = bld.getSSA();
2225
2226 for (int i = 0; i < 4; i++) {
2227 typedDst[i] = su->getDef(i);
2228 }
2229
2230 // Set the untyped dsts as the su's destinations
2231 for (int i = 0; i < 4; i++)
2232 su->setDef(i, untypedDst[i]);
2233
2234 bld.setPosition(su, true);
2235
2236 // Unpack each component into the typed dsts
2237 int bits = 0;
2238 for (int i = 0; i < 4; bits += format->bits[i], i++) {
2239 if (!typedDst[i])
2240 continue;
2241 if (i >= format->components) {
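// Components missing from the format read back as (0, 0, 0, 1).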
2242 if (format->type == FLOAT ||
2243 format->type == UNORM ||
2244 format->type == SNORM)
2245 bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
2246 else
2247 bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
2248 continue;
2249 }
2250
2251 // Get just that component's data into the relevant place
2252 if (format->bits[i] == 32)
2253 bld.mkMov(typedDst[i], untypedDst[i]);
2254 else if (format->bits[i] == 16)
2255 bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
2256 getSrcType(format, i), untypedDst[i / 2])
2257 ->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1);
2258 else if (format->bits[i] == 8)
2259 bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
2260 getSrcType(format, i), untypedDst[0])->subOp = i;
2261 else {
2262 bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32],
2263 bld.mkImm((bits % 32) | (format->bits[i] << 8)));
2264 if (format->type == UNORM || format->type == SNORM)
2265 bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i), typedDst[i]);
2266 }
2267
2268 // Normalize / convert as necessary
2269 if (format->type == UNORM)
2270 bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
2271 else if (format->type == SNORM)
2272 bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
2273 else if (format->type == FLOAT && format->bits[i] < 16) {
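// Packed small floats (10/11 bits) have a 5-bit exponent just like f16;
// shifting left by 15 - bits aligns exponent and mantissa with the f16
// layout so a plain f16 -> f32 CVT finishes the job.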
2274 bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
2275 bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]);
2276 }
2277 }
2278
2279 if (format->bgra) {
2280 std::swap(typedDst[0], typedDst[2]);
2281 }
2282 }
2283
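// If a surface load was predicated off (e.g. the image is not bound), its
// defs would be left undefined; union each def with a 0 written under the
// inverse predicate so the result is always well defined.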
2284 void
2285 NVC0LoweringPass::insertOOBSurfaceOpResult(TexInstruction *su)
2286 {
2287 if (!su->getPredicate())
2288 return;
2289
2290 bld.setPosition(su, true);
2291
2292 for (unsigned i = 0; su->defExists(i); ++i) {
2293 ValueDef &def = su->def(i);
2294
2295 Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2296 assert(su->cc == CC_NOT_P);
2297 mov->setPredicate(CC_P, su->getPredicate());
2298 Instruction *uni = bld.mkOp2(OP_UNION, TYPE_U32, bld.getSSA(), NULL, mov->getDef(0));
2299
2300 def.replace(uni->getDef(0), false);
2301 uni->setSrc(0, def.get());
2302 }
2303 }
2304
2305 void
2306 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
2307 {
2308 processSurfaceCoordsNVE4(su);
2309
2310 if (su->op == OP_SULDP) {
2311 convertSurfaceFormat(su);
2312 insertOOBSurfaceOpResult(su);
2313 }
2314
2315 if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
2316 assert(su->getPredicate());
2317 Value *pred =
2318 bld.mkOp2v(OP_OR, TYPE_U8, bld.getScratch(1, FILE_PREDICATE),
2319 su->getPredicate(), su->getSrc(2));
2320
2321 Instruction *red = bld.mkOp(OP_ATOM, su->dType, bld.getSSA());
2322 red->subOp = su->subOp;
2323 red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0));
2324 red->setSrc(1, su->getSrc(3));
2325 if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
2326 red->setSrc(2, su->getSrc(4));
2327 red->setIndirect(0, 0, su->getSrc(0));
2328
2329 // make sure to initialize dst value when the atomic operation is not
2330 // performed
2331 Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2332
2333 assert(su->cc == CC_NOT_P);
2334 red->setPredicate(su->cc, pred);
2335 mov->setPredicate(CC_P, pred);
2336
2337 bld.mkOp2(OP_UNION, TYPE_U32, su->getDef(0),
2338 red->getDef(0), mov->getDef(0));
2339
2340 delete_Instruction(bld.getProgram(), su);
2341 handleCasExch(red, true);
2342 }
2343
2344 if (su->op == OP_SUSTB || su->op == OP_SUSTP)
2345 su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
2346 }
2347
2348 void
2349 NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)
2350 {
2351 const int slot = su->tex.r;
2352 const int dim = su->tex.target.getDim();
2353 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2354 int c;
2355 Value *zero = bld.mkImm(0);
2356 Value *src[3];
2357 Value *v;
2358 Value *ind = su->getIndirectR();
2359
2360 bld.setPosition(su, false);
2361
2362 adjustCoordinatesMS(su);
2363
2364 if (ind) {
2365 Value *ptr;
2366 ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind, bld.mkImm(su->tex.r));
2367 ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
2368 su->setIndirectR(ptr);
2369 }
2370
2371 // get surface coordinates
2372 for (c = 0; c < arg; ++c)
2373 src[c] = su->getSrc(c);
2374 for (; c < 3; ++c)
2375 src[c] = zero;
2376
2377 // calculate pixel offset
2378 if (su->op == OP_SULDP || su->op == OP_SUREDP) {
2379 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless);
2380 su->setSrc(0, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[0], v));
2381 }
2382
2383 // add array layer offset
2384 if (su->tex.target.isArray() || su->tex.target.isCube()) {
2385 v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless);
2386 assert(dim > 1);
2387 su->setSrc(2, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v));
2388 }
2389
2390 // prevent read fault when the image is not actually bound
2391 CmpInstruction *pred =
2392 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2393 TYPE_U32, bld.mkImm(0),
2394 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2395 if (su->op != OP_SUSTP && su->tex.format) {
2396 const TexInstruction::ImgFormatDesc *format = su->tex.format;
2397 int blockwidth = format->bits[0] + format->bits[1] +
2398 format->bits[2] + format->bits[3];
2399
2400 assert(format->components != 0);
2401 // make sure that the format doesn't mismatch when it's not FMT_NONE
2402 bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
2403 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2404 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2405 pred->getDef(0));
2406 }
2407 su->setPredicate(CC_NOT_P, pred->getDef(0));
2408 }
2409
2410 void
2411 NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
2412 {
2413 if (su->tex.target == TEX_TARGET_1D_ARRAY) {
2414 /* As 1d arrays also need 3 coordinates, switching to TEX_TARGET_2D_ARRAY
2415 * will simplify the lowering pass and the texture constraints. */
2416 su->moveSources(1, 1);
2417 su->setSrc(1, bld.loadImm(NULL, 0));
2418 su->tex.target = TEX_TARGET_2D_ARRAY;
2419 }
2420
2421 processSurfaceCoordsNVC0(su);
2422
2423 if (su->op == OP_SULDP) {
2424 convertSurfaceFormat(su);
2425 insertOOBSurfaceOpResult(su);
2426 }
2427
2428 if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
2429 const int dim = su->tex.target.getDim();
2430 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2431 LValue *addr = bld.getSSA(8);
2432 Value *def = su->getDef(0);
2433
2434 su->op = OP_SULEA;
2435
2436 // Set the destination to the address
2437 su->dType = TYPE_U64;
2438 su->setDef(0, addr);
2439 su->setDef(1, su->getPredicate());
2440
2441 bld.setPosition(su, true);
2442
2443 // Perform the atomic op
2444 Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA());
2445 red->subOp = su->subOp;
2446 red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0));
2447 red->setSrc(1, su->getSrc(arg));
2448 if (red->subOp == NV50_IR_SUBOP_ATOM_CAS)
2449 red->setSrc(2, su->getSrc(arg + 1));
2450 red->setIndirect(0, 0, addr);
2451
2452 // make sure to initialize dst value when the atomic operation is not
2453 // performed
2454 Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2455
2456 assert(su->cc == CC_NOT_P);
2457 red->setPredicate(su->cc, su->getPredicate());
2458 mov->setPredicate(CC_P, su->getPredicate());
2459
2460 bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0));
2461
2462 handleCasExch(red, false);
2463 }
2464 }
2465
2466 void
2467 NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
2468 {
2469 const int slot = su->tex.r;
2470 const int dim = su->tex.target.getDim();
2471 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
2472 Value *ind = su->getIndirectR();
2473 Value *handle;
2474 int pos = 0;
2475
2476 bld.setPosition(su, false);
2477
2478 adjustCoordinatesMS(su);
2479
2480 // add texture handle
2481 switch (su->op) {
2482 case OP_SUSTP:
2483 pos = 4;
2484 break;
2485 case OP_SUREDP:
2486 pos = (su->subOp == NV50_IR_SUBOP_ATOM_CAS) ? 2 : 1;
2487 break;
2488 default:
2489 assert(pos == 0);
2490 break;
2491 }
2492 if (su->tex.bindless)
2493 handle = ind;
2494 else
2495 handle = loadTexHandle(ind, slot + 32);
2496 su->setSrc(arg + pos, handle);
2497
2498 // The address check doesn't make sense here. The format check could make
2499 // sense but it's a bit of a pain.
2500 if (su->tex.bindless)
2501 return;
2502
2503 // prevent read fault when the image is not actually bound
2504 CmpInstruction *pred =
2505 bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
2506 TYPE_U32, bld.mkImm(0),
2507 loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless));
2508 if (su->op != OP_SUSTP && su->tex.format) {
2509 const TexInstruction::ImgFormatDesc *format = su->tex.format;
2510 int blockwidth = format->bits[0] + format->bits[1] +
2511 format->bits[2] + format->bits[3];
2512
2513 assert(format->components != 0);
2514 // make sure that the format doesn't mismatch when it's not FMT_NONE
2515 bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
2516 TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
2517 loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless),
2518 pred->getDef(0));
2519 }
2520 su->setPredicate(CC_NOT_P, pred->getDef(0));
2521 }
2522
2523 void
2524 NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
2525 {
2526 processSurfaceCoordsGM107(su);
2527
2528 if (su->op == OP_SULDP) {
2529 convertSurfaceFormat(su);
2530 insertOOBSurfaceOpResult(su);
2531 }
2532
2533 if (su->op == OP_SUREDP) {
2534 Value *def = su->getDef(0);
2535
2536 su->op = OP_SUREDB;
2537
2538 // There may not be a predicate in the bindless case.
2539 if (su->getPredicate()) {
2540 su->setDef(0, bld.getSSA());
2541
2542 bld.setPosition(su, true);
2543
2544 // make sure to initialize dst value when the atomic operation is not
2545 // performed
2546 Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));
2547
2548 assert(su->cc == CC_NOT_P);
2549 mov->setPredicate(CC_P, su->getPredicate());
2550
2551 bld.mkOp2(OP_UNION, TYPE_U32, def, su->getDef(0), mov->getDef(0));
2552 }
2553 }
2554 }
2555
2556 bool
2557 NVC0LoweringPass::handleWRSV(Instruction *i)
2558 {
2559 Instruction *st;
2560 Symbol *sym;
2561 uint32_t addr;
2562
2563 // must replace, $sregs are not writable
2564 addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
2565 if (addr >= 0x400)
2566 return false;
2567 sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
2568
2569 st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
2570 i->getSrc(1));
2571 st->perPatch = i->perPatch;
2572
2573 bld.getBB()->remove(i);
2574 return true;
2575 }
2576
2577 void
2578 NVC0LoweringPass::handleLDST(Instruction *i)
2579 {
2580 if (i->src(0).getFile() == FILE_SHADER_INPUT) {
2581 if (prog->getType() == Program::TYPE_COMPUTE) {
2582 i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
2583 i->getSrc(0)->reg.fileIndex = 0;
2584 } else
2585 if (prog->getType() == Program::TYPE_GEOMETRY &&
2586 i->src(0).isIndirect(0)) {
2587 // XXX: this assumes vec4 units
2588 Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2589 i->getIndirect(0, 0), bld.mkImm(4));
2590 i->setIndirect(0, 0, ptr);
2591 i->op = OP_VFETCH;
2592 } else {
2593 i->op = OP_VFETCH;
2594 assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
2595 }
2596 } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
2597 int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
2598 Value *ind = i->getIndirect(0, 1);
2599
2600 if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
2601 prog->getType() == Program::TYPE_COMPUTE &&
2602 (fileIndex >= 6 || ind)) {
2603 // The launch descriptor only allows us to set up 8 CBs, but OpenGL
2604 // requires at least 12 UBOs. To bypass this limitation, for constant
2605 // buffers 7+, we store their addresses in the driver constbuf and
2606 // load directly from global memory.
2607 if (ind) {
2608 // Clamp the UBO index when an indirect access is used to avoid
2609 // loading information from the wrong place in the driver cb.
2610 // TODO - synchronize the max with the driver.
2611 ind = bld.mkOp2v(OP_MIN, TYPE_U32, bld.getSSA(),
2612 bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),
2613 ind, bld.loadImm(NULL, fileIndex)),
2614 bld.loadImm(NULL, 13));
2615 fileIndex = 0;
2616 }
2617
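// "offset" is the end of the access (base offset plus access size); the
// load gets predicated off below when that end exceeds the buffer length.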
2618 Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
2619 Value *ptr = loadUboInfo64(ind, fileIndex * 16);
2620 Value *length = loadUboLength32(ind, fileIndex * 16);
2621 Value *pred = new_LValue(func, FILE_PREDICATE);
2622 if (i->src(0).isIndirect(0)) {
2623 bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
2624 bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
2625 }
2626 i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
2627 i->setIndirect(0, 1, NULL);
2628 i->setIndirect(0, 0, ptr);
2629 bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
2630 i->setPredicate(CC_NOT_P, pred);
2631 Value *zero, *dst = i->getDef(0);
2632 i->setDef(0, bld.getSSA());
2633
2634 bld.setPosition(i, true);
2635 bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
2636 ->setPredicate(CC_P, pred);
2637 bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
2638 } else if (i->src(0).isIndirect(1)) {
2639 Value *ptr;
2640 if (i->src(0).isIndirect(0))
2641 ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
2642 i->getIndirect(0, 1), bld.mkImm(0x1010),
2643 i->getIndirect(0, 0));
2644 else
2645 ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2646 i->getIndirect(0, 1), bld.mkImm(16));
2647 i->setIndirect(0, 1, NULL);
2648 i->setIndirect(0, 0, ptr);
2649 i->subOp = NV50_IR_SUBOP_LDC_IS;
2650 }
2651 } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
2652 assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
2653 i->op = OP_VFETCH;
2654 } else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) {
2655 Value *ind = i->getIndirect(0, 1);
2656 Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
2657 // XXX come up with a way not to do this for EVERY little access but
2658 // rather to batch these up somehow. Unfortunately we've lost the
2659 // information about the field width by the time we get here.
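// Same bounds check as the UBO path above: compare the end of the access
// against the buffer length.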
2660 Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
2661 Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
2662 Value *pred = new_LValue(func, FILE_PREDICATE);
2663 if (i->src(0).isIndirect(0)) {
2664 bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
2665 bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
2666 }
2667 i->setIndirect(0, 1, NULL);
2668 i->setIndirect(0, 0, ptr);
2669 i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
2670 bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
2671 i->setPredicate(CC_NOT_P, pred);
2672 if (i->defExists(0)) {
2673 Value *zero, *dst = i->getDef(0);
2674 i->setDef(0, bld.getSSA());
2675
2676 bld.setPosition(i, true);
2677 bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
2678 ->setPredicate(CC_P, pred);
2679 bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
2680 }
2681 }
2682 }
2683
2684 void
2685 NVC0LoweringPass::readTessCoord(LValue *dst, int c)
2686 {
2687 Value *laneid = bld.getSSA();
2688 Value *x, *y;
2689
2690 bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
2691
2692 if (c == 0) {
2693 x = dst;
2694 y = NULL;
2695 } else
2696 if (c == 1) {
2697 x = NULL;
2698 y = dst;
2699 } else {
2700 assert(c == 2);
2701 if (prog->driver->prop.tp.domain != PIPE_PRIM_TRIANGLES) {
2702 bld.mkMov(dst, bld.loadImm(NULL, 0));
2703 return;
2704 }
2705 x = bld.getSSA();
2706 y = bld.getSSA();
2707 }
2708 if (x)
2709 bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
2710 if (y)
2711 bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
2712
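// For triangle domains the tess coords are barycentric, so the third
// coordinate is reconstructed as 1 - x - y.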
2713 if (c == 2) {
2714 bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
2715 bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
2716 }
2717 }
2718
2719 bool
2720 NVC0LoweringPass::handleRDSV(Instruction *i)
2721 {
2722 Symbol *sym = i->getSrc(0)->asSym();
2723 const SVSemantic sv = sym->reg.data.sv.sv;
2724 Value *vtx = NULL;
2725 Instruction *ld;
2726 uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
2727
2728 if (addr >= 0x400) {
2729 // mov $sreg
2730 if (sym->reg.data.sv.index == 3) {
2731 // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
2732 i->op = OP_MOV;
2733 i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
2734 } else
2735 if (sv == SV_TID) {
2736 // Help CSE combine TID fetches
2737 Value *tid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(),
2738 bld.mkSysVal(SV_COMBINED_TID, 0));
2739 i->op = OP_EXTBF;
2740 i->setSrc(0, tid);
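// The combined TID packs x:y:z as 16:10:6 bits starting at the LSB;
// EXTBF's src1 is 0xssll (ss = size, ll = offset).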
2741 switch (sym->reg.data.sv.index) {
2742 case 0: i->setSrc(1, bld.mkImm(0x1000)); break;
2743 case 1: i->setSrc(1, bld.mkImm(0x0a10)); break;
2744 case 2: i->setSrc(1, bld.mkImm(0x061a)); break;
2745 }
2746 }
2747 if (sv == SV_VERTEX_COUNT) {
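// Unpack the count: extract 8 bits at offset 8 (EXTBF src1 is 0xssll).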
2748 bld.setPosition(i, true);
2749 bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));
2750 }
2751 return true;
2752 }
2753
2754 switch (sv) {
2755 case SV_POSITION:
2756 assert(prog->getType() == Program::TYPE_FRAGMENT);
2757 if (i->srcExists(1)) {
2758 // Pass offset through to the interpolation logic
2759 ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,
2760 i->getDef(0), addr, NULL);
2761 ld->setSrc(1, i->getSrc(1));
2762 } else {
2763 bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
2764 }
2765 break;
2766 case SV_FACE:
2767 {
2768 Value *face = i->getDef(0);
2769 bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
2770 if (i->dType == TYPE_F32) {
2771 bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
2772 bld.mkOp1(OP_NEG, TYPE_S32, face, face);
2773 bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
2774 }
2775 }
2776 break;
2777 case SV_TESS_COORD:
2778 assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
2779 readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
2780 break;
2781 case SV_NTID:
2782 case SV_NCTAID:
2783 case SV_GRIDID:
2784 assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
2785 if (sym->reg.data.sv.index == 3) {
2786 i->op = OP_MOV;
2787 i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
2788 return true;
2789 }
2790 // Fallthrough
2791 case SV_WORK_DIM:
2792 addr += prog->driver->prop.cp.gridInfoBase;
2793 bld.mkLoad(TYPE_U32, i->getDef(0),
2794 bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2795 TYPE_U32, addr), NULL);
2796 break;
2797 case SV_SAMPLE_INDEX:
2798 // TODO: Properly pass source as an address in the PIX address space
2799 // (which can be of the form [r0+offset]). But this is currently
2800 // unnecessary.
2801 ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
2802 ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2803 break;
2804 case SV_SAMPLE_POS: {
2805 Value *sampleID = bld.getScratch();
2806 ld = bld.mkOp1(OP_PIXLD, TYPE_U32, sampleID, bld.mkImm(0));
2807 ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2808 Value *offset = calculateSampleOffset(sampleID);
2809
2810 assert(prog->driver->prop.fp.readsSampleLocations);
2811
2812 if (targ->getChipset() >= NVISA_GM200_CHIPSET) {
2813 bld.mkLoad(TYPE_F32,
2814 i->getDef(0),
2815 bld.mkSymbol(
2816 FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2817 TYPE_U32, prog->driver->io.sampleInfoBase),
2818 offset);
2819 bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0),
2820 bld.mkImm(0x040c + sym->reg.data.sv.index * 16));
2821 bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_U32, i->getDef(0));
2822 bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(0), i->getDef(0), bld.mkImm(1.0f / 16.0f));
2823 } else {
2824 bld.mkLoad(TYPE_F32,
2825 i->getDef(0),
2826 bld.mkSymbol(
2827 FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
2828 TYPE_U32, prog->driver->io.sampleInfoBase +
2829 4 * sym->reg.data.sv.index),
2830 offset);
2831 }
2832 break;
2833 }
2834 case SV_SAMPLE_MASK: {
2835 ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
2836 ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
2837 Instruction *sampleid =
2838 bld.mkOp1(OP_PIXLD, TYPE_U32, bld.getSSA(), bld.mkImm(0));
2839 sampleid->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
2840 Value *masked =
2841 bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ld->getDef(0),
2842 bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
2843 bld.loadImm(NULL, 1), sampleid->getDef(0)));
2844 if (prog->driver->prop.fp.persampleInvocation) {
2845 bld.mkMov(i->getDef(0), masked);
2846 } else {
2847 bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), ld->getDef(0), masked,
2848 bld.mkImm(0))
2849 ->subOp = 1;
2850 }
2851 break;
2852 }
2853 case SV_BASEVERTEX:
2854 case SV_BASEINSTANCE:
2855 case SV_DRAWID:
2856 ld = bld.mkLoad(TYPE_U32, i->getDef(0),
2857 bld.mkSymbol(FILE_MEMORY_CONST,
2858 prog->driver->io.auxCBSlot,
2859 TYPE_U32,
2860 prog->driver->io.drawInfoBase +
2861 4 * (sv - SV_BASEVERTEX)),
2862 NULL);
2863 break;
2864 default:
2865 if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
2866 vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
2867 if (prog->getType() == Program::TYPE_FRAGMENT) {
2868 bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL);
2869 } else {
2870 ld = bld.mkFetch(i->getDef(0), i->dType,
2871 FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
2872 ld->perPatch = i->perPatch;
2873 }
2874 break;
2875 }
2876 bld.getBB()->remove(i);
2877 return true;
2878 }
2879
2880 bool
2881 NVC0LoweringPass::handleDIV(Instruction *i)
2882 {
2883 if (!isFloatType(i->dType))
2884 return true;
2885 bld.setPosition(i, false);
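// Floating-point division is lowered to a * rcp(b).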
2886 Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
2887 i->op = OP_MUL;
2888 i->setSrc(1, rcp->getDef(0));
2889 return true;
2890 }
2891
2892 bool
2893 NVC0LoweringPass::handleMOD(Instruction *i)
2894 {
2895 if (!isFloatType(i->dType))
2896 return true;
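// mod(a, b) is expanded to a - b * trunc(a * rcp(b)).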
2897 LValue *value = bld.getScratch(typeSizeof(i->dType));
2898 bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
2899 bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
2900 bld.mkOp1(OP_TRUNC, i->dType, value, value);
2901 bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
2902 i->op = OP_SUB;
2903 i->setSrc(1, value);
2904 return true;
2905 }
2906
2907 bool
2908 NVC0LoweringPass::handleSQRT(Instruction *i)
2909 {
2910 if (targ->isOpSupported(OP_SQRT, i->dType))
2911 return true;
2912
2913 if (i->dType == TYPE_F64) {
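// sqrt(x) is approximated as x * rsq(x); the SELP forces the rsq result
// to 0 when x <= 0 so sqrt(0) doesn't turn into 0 * Inf = NaN.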
2914 Value *pred = bld.getSSA(1, FILE_PREDICATE);
2915 Value *zero = bld.loadImm(NULL, 0.0);
2916 Value *dst = bld.getSSA(8);
2917 bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
2918 bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
2919 bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);
2920 i->op = OP_MUL;
2921 i->setSrc(1, dst);
2922 // TODO: Handle this properly with a library function
2923 } else {
2924 bld.setPosition(i, true);
2925 i->op = OP_RSQ;
2926 bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
2927 }
2928
2929 return true;
2930 }
2931
2932 bool
2933 NVC0LoweringPass::handlePOW(Instruction *i)
2934 {
2935 LValue *val = bld.getScratch();
2936
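// pow(x, y) is lowered to exp2(y * log2(x)).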
2937 bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
2938 bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
2939 bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
2940
2941 i->op = OP_EX2;
2942 i->setSrc(0, val);
2943 i->setSrc(1, NULL);
2944
2945 return true;
2946 }
2947
2948 bool
2949 NVC0LoweringPass::handleEXPORT(Instruction *i)
2950 {
2951 if (prog->getType() == Program::TYPE_FRAGMENT) {
2952 int id = i->getSrc(0)->reg.data.offset / 4;
2953
2954 if (i->src(0).isIndirect(0)) // TODO, ugly
2955 return false;
2956 i->op = OP_MOV;
2957 i->subOp = NV50_IR_SUBOP_MOV_FINAL;
2958 i->src(0).set(i->src(1));
2959 i->setSrc(1, NULL);
2960 i->setDef(0, new_LValue(func, FILE_GPR));
2961 i->getDef(0)->reg.data.id = id;
2962
2963 prog->maxGPR = MAX2(prog->maxGPR, id);
2964 } else
2965 if (prog->getType() == Program::TYPE_GEOMETRY) {
2966 i->setIndirect(0, 1, gpEmitAddress);
2967 }
2968 return true;
2969 }
2970
2971 bool
2972 NVC0LoweringPass::handleOUT(Instruction *i)
2973 {
2974 Instruction *prev = i->prev;
2975 ImmediateValue stream, prevStream;
2976
2977 // Only merge if the stream ids match. Also, note that the previous
2978 // instruction would have already been lowered, so we take arg1 from it.
2979 if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
2980 i->src(0).getImmediate(stream) &&
2981 prev->src(1).getImmediate(prevStream) &&
2982 stream.reg.data.u32 == prevStream.reg.data.u32) {
2983 i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
2984 delete_Instruction(prog, i);
2985 } else {
2986 assert(gpEmitAddress);
2987 i->setDef(0, gpEmitAddress);
2988 i->setSrc(1, i->getSrc(0));
2989 i->setSrc(0, gpEmitAddress);
2990 }
2991 return true;
2992 }
2993
2994 Value *
2995 NVC0LoweringPass::calculateSampleOffset(Value *sampleID)
2996 {
2997 Value *offset = bld.getScratch();
2998 if (targ->getChipset() >= NVISA_GM200_CHIPSET) {
2999 // Sample location offsets (in bytes) are calculated like so:
3000 // offset = (SV_POSITION.y % 4 * 2) + (SV_POSITION.x % 2)
3001 // offset = offset * 32 + sampleID % 8 * 4;
3002 // which is equivalent to:
3003 // offset = ((SV_POSITION.y & 0x3) << 6) + ((SV_POSITION.x & 0x1) << 5);
3004 // offset += sampleID << 2
3005 
3006 // The second operand (src1) of the INSBF instructions is like so:
3007 // 0xssll where ss is the size and ll is the offset.
3008 // so: dest = src2 | (src0 & ((1 << ss) - 1)) << ll
3009
3010 // Add sample ID (offset = (sampleID & 0x7) << 2)
3011 bld.mkOp3(OP_INSBF, TYPE_U32, offset, sampleID, bld.mkImm(0x0302), bld.mkImm(0x0));
3012
3013 Symbol *xSym = bld.mkSysVal(SV_POSITION, 0);
3014 Symbol *ySym = bld.mkSysVal(SV_POSITION, 1);
3015 Value *coord = bld.getScratch();
3016
3017 // Add X coordinate (offset |= (SV_POSITION.x & 0x1) << 5)
3018 bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
3019 targ->getSVAddress(FILE_SHADER_INPUT, xSym), NULL);
3020 bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord)
3021 ->rnd = ROUND_ZI;
3022 bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0105), offset);
3023
3024 // Add Y coordinate (offset |= (SV_POSITION.y & 0x3) << 6)
3025 bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
3026 targ->getSVAddress(FILE_SHADER_INPUT, ySym), NULL);
3027 bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord)
3028 ->rnd = ROUND_ZI;
3029 bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0206), offset);
3030 } else {
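// Pre-GM200: sample positions are consecutive 32-bit x/y pairs, so the
// table offset is just sampleID * 8.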
3031 bld.mkOp2(OP_SHL, TYPE_U32, offset, sampleID, bld.mkImm(3));
3032 }
3033 return offset;
3034 }
3035
3036 // Handle programmable sample locations for GM20x+
3037 void
3038 NVC0LoweringPass::handlePIXLD(Instruction *i)
3039 {
3040 if (i->subOp != NV50_IR_SUBOP_PIXLD_OFFSET)
3041 return;
3042 if (targ->getChipset() < NVISA_GM200_CHIPSET)
3043 return;
3044
3045 assert(prog->driver->prop.fp.readsSampleLocations);
3046
3047 bld.mkLoad(TYPE_F32,
3048 i->getDef(0),
3049 bld.mkSymbol(
3050 FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
3051 TYPE_U32, prog->driver->io.sampleInfoBase),
3052 calculateSampleOffset(i->getSrc(0)));
3053
3054 bld.getBB()->remove(i);
3055 }
3056
3057 // Generate a binary predicate if an instruction is predicated by
3058 // e.g. an f32 value.
3059 void
3060 NVC0LoweringPass::checkPredicate(Instruction *insn)
3061 {
3062 Value *pred = insn->getPredicate();
3063 Value *pdst;
3064
3065 if (!pred || pred->reg.file == FILE_PREDICATE)
3066 return;
3067 pdst = new_LValue(func, FILE_PREDICATE);
3068
3069 // CAUTION: don't use pdst->getInsn, the definition might not be unique,
3070 // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
3071
3072 bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);
3073
3074 insn->setPredicate(insn->cc, pdst);
3075 }
3076
3077 //
3078 // - add quadop dance for texturing
3079 // - put FP outputs in GPRs
3080 // - convert instruction sequences
3081 //
3082 bool
3083 NVC0LoweringPass::visit(Instruction *i)
3084 {
3085 bool ret = true;
3086 bld.setPosition(i, false);
3087
3088 if (i->cc != CC_ALWAYS)
3089 checkPredicate(i);
3090
3091 switch (i->op) {
3092 case OP_TEX:
3093 case OP_TXB:
3094 case OP_TXL:
3095 case OP_TXF:
3096 case OP_TXG:
3097 return handleTEX(i->asTex());
3098 case OP_TXD:
3099 return handleTXD(i->asTex());
3100 case OP_TXLQ:
3101 return handleTXLQ(i->asTex());
3102 case OP_TXQ:
3103 return handleTXQ(i->asTex());
3104 case OP_EX2:
3105 bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
3106 i->setSrc(0, i->getDef(0));
3107 break;
3108 case OP_POW:
3109 return handlePOW(i);
3110 case OP_DIV:
3111 return handleDIV(i);
3112 case OP_MOD:
3113 return handleMOD(i);
3114 case OP_SQRT:
3115 return handleSQRT(i);
3116 case OP_EXPORT:
3117 ret = handleEXPORT(i);
3118 break;
3119 case OP_EMIT:
3120 case OP_RESTART:
3121 return handleOUT(i);
3122 case OP_RDSV:
3123 return handleRDSV(i);
3124 case OP_WRSV:
3125 return handleWRSV(i);
3126 case OP_STORE:
3127 case OP_LOAD:
3128 handleLDST(i);
3129 break;
3130 case OP_ATOM:
3131 {
3132 const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER;
3133 handleATOM(i);
3134 handleCasExch(i, cctl);
3135 }
3136 break;
3137 case OP_SULDB:
3138 case OP_SULDP:
3139 case OP_SUSTB:
3140 case OP_SUSTP:
3141 case OP_SUREDB:
3142 case OP_SUREDP:
3143 if (targ->getChipset() >= NVISA_GM107_CHIPSET)
3144 handleSurfaceOpGM107(i->asTex());
3145 else if (targ->getChipset() >= NVISA_GK104_CHIPSET)
3146 handleSurfaceOpNVE4(i->asTex());
3147 else
3148 handleSurfaceOpNVC0(i->asTex());
3149 break;
3150 case OP_SUQ:
3151 handleSUQ(i->asTex());
3152 break;
3153 case OP_BUFQ:
3154 handleBUFQ(i);
3155 break;
3156 case OP_PIXLD:
3157 handlePIXLD(i);
3158 break;
3159 default:
3160 break;
3161 }
3162
3163 /* Kepler+ has a special opcode to compute a new base address to be used
3164 * for indirect loads.
3165 *
3166 * Maxwell+ has an additional similar requirement for indirect
3167 * interpolation ops in frag shaders.
3168 */
3169 bool doAfetch = false;
3170 if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
3171 !i->perPatch &&
3172 (i->op == OP_VFETCH || i->op == OP_EXPORT) &&
3173 i->src(0).isIndirect(0)) {
3174 doAfetch = true;
3175 }
3176 if (targ->getChipset() >= NVISA_GM107_CHIPSET &&
3177 (i->op == OP_LINTERP || i->op == OP_PINTERP) &&
3178 i->src(0).isIndirect(0)) {
3179 doAfetch = true;
3180 }
3181
3182 if (doAfetch) {
3183 Value *addr = cloneShallow(func, i->getSrc(0));
3184 Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
3185 i->getSrc(0));
3186 afetch->setIndirect(0, 0, i->getIndirect(0, 0));
3187 addr->reg.data.offset = 0;
3188 i->setSrc(0, addr);
3189 i->setIndirect(0, 0, afetch->getDef(0));
3190 }
3191
3192 return ret;
3193 }
3194
3195 bool
3196 TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
3197 {
3198 if (stage == CG_STAGE_PRE_SSA) {
3199 NVC0LoweringPass pass(prog);
3200 return pass.run(prog, false, true);
3201 } else
3202 if (stage == CG_STAGE_POST_RA) {
3203 NVC0LegalizePostRA pass(prog);
3204 return pass.run(prog, false, true);
3205 } else
3206 if (stage == CG_STAGE_SSA) {
3207 NVC0LegalizeSSA pass;
3208 return pass.run(prog, false, true);
3209 }
3210 return false;
3211 }
3212
3213 } // namespace nv50_ir