nv50/ir: implement splitting of 64 bit ops after RA
[mesa.git] / src / gallium / drivers / nvc0 / codegen / nv50_ir_lowering_nvc0.cpp
1 /*
2 * Copyright 2011 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23 #include "nv50/codegen/nv50_ir.h"
24 #include "nv50/codegen/nv50_ir_build_util.h"
25
26 #include "nv50_ir_target_nvc0.h"
27
28 #include <limits>
29
30 namespace nv50_ir {
31
32 #define QOP_ADD 0
33 #define QOP_SUBR 1
34 #define QOP_SUB 2
35 #define QOP_MOV2 3
36
37 // UL UR LL LR
38 #define QUADOP(q, r, s, t) \
39 ((QOP_##q << 6) | (QOP_##r << 4) | \
40 (QOP_##s << 2) | (QOP_##t << 0))
41
// Lowers constructs that have no direct hardware equivalent while the
// program is still in SSA form (i.e. before register allocation).
class NVC0LegalizeSSA : public Pass
{
private:
   virtual bool visit(BasicBlock *);
   virtual bool visit(Function *);

   // we want to insert calls to the builtin library only after optimization
   void handleDIV(Instruction *);    // integer division, modulus
   void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt

private:
   BuildUtil bld;
};
55
56 void
57 NVC0LegalizeSSA::handleDIV(Instruction *i)
58 {
59 FlowInstruction *call;
60 int builtin;
61 Value *def[2];
62
63 bld.setPosition(i, false);
64 def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
65 def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
66 switch (i->dType) {
67 case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
68 case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
69 default:
70 return;
71 }
72 call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
73 bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
74 bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
75 bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);
76
77 call->fixed = 1;
78 call->absolute = call->builtin = 1;
79 call->target.builtin = builtin;
80 delete_Instruction(prog, i);
81 }
82
// Lower double precision OP_RCP/OP_RSQ; not implemented yet.
void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
   // TODO
}
88
// Bind the builder to the program once per function; the actual rewriting
// happens in the per-BB visit below.
bool
NVC0LegalizeSSA::visit(Function *fn)
{
   bld.setProgram(fn->getProgram());
   return true;
}
95
96 bool
97 NVC0LegalizeSSA::visit(BasicBlock *bb)
98 {
99 Instruction *next;
100 for (Instruction *i = bb->getEntry(); i; i = next) {
101 next = i->next;
102 if (i->dType == TYPE_F32)
103 continue;
104 switch (i->op) {
105 case OP_DIV:
106 case OP_MOD:
107 handleDIV(i);
108 break;
109 case OP_RCP:
110 case OP_RSQ:
111 if (i->dType == TYPE_F64)
112 handleRCPRSQ(i);
113 break;
114 default:
115 break;
116 }
117 }
118 return true;
119 }
120
// Fixups that must run after register allocation: texture barrier
// insertion (chipset >= 0xe0), replacing zero immediates with the zero
// register, splitting 64 bit operations, and CONT/JOIN cleanup.
class NVC0LegalizePostRA : public Pass
{
public:
   NVC0LegalizePostRA(const Program *);

private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   void replaceZero(Instruction *);
   bool tryReplaceContWithBra(BasicBlock *);
   void propagateJoin(BasicBlock *);

   // a first use of a TEX result, as relevant for barrier placement
   struct TexUse
   {
      TexUse(Instruction *use, const Instruction *tex)
         : insn(use), tex(tex), level(-1) { }
      Instruction *insn;
      const Instruction *tex; // or split / mov
      int level; // barrier level; -1 = not yet computed
   };
   // min/max number of outstanding TEXes at a program point
   struct Limits
   {
      Limits() { }
      Limits(int min, int max) : min(min), max(max) { }
      int min, max;
   };
   bool insertTextureBarriers(Function *);
   inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
   void findFirstUses(const Instruction *tex, const Instruction *def,
                      std::list<TexUse>&);
   void findOverwritingDefs(const Instruction *tex, Instruction *insn,
                            const BasicBlock *term,
                            std::list<TexUse>&);
   void addTexUse(std::list<TexUse>&, Instruction *, const Instruction *);
   const Instruction *recurseDef(const Instruction *);

private:
   LValue *rZero; // substituted for zero immediates (see replaceZero)
   LValue *carry; // $c flags register, used when splitting 64 bit ops
   const bool needTexBar; // true on chipsets (>= 0xe0) requiring TEXBAR
};
163
// Texture barriers are only required on chipsets >= 0xe0.
NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
   : needTexBar(prog->getTarget()->getChipset() >= 0xe0)
{
}
168
169 bool
170 NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
171 const Instruction *early) const
172 {
173 if (early->bb == later->bb)
174 return early->serial < later->serial;
175 return later->bb->dominatedBy(early->bb);
176 }
177
178 void
179 NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
180 Instruction *usei, const Instruction *insn)
181 {
182 bool add = true;
183 for (std::list<TexUse>::iterator it = uses.begin();
184 it != uses.end();) {
185 if (insnDominatedBy(usei, it->insn)) {
186 add = false;
187 break;
188 }
189 if (insnDominatedBy(it->insn, usei))
190 it = uses.erase(it);
191 else
192 ++it;
193 }
194 if (add)
195 uses.push_back(TexUse(usei, insn));
196 }
197
198 void
199 NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
200 Instruction *insn,
201 const BasicBlock *term,
202 std::list<TexUse> &uses)
203 {
204 while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
205 insn = insn->getSrc(0)->getUniqueInsn();
206
207 if (!insn || !insn->bb->reachableBy(texi->bb, term))
208 return;
209
210 switch (insn->op) {
211 /* Values not connected to the tex's definition through any of these should
212 * not be conflicting.
213 */
214 case OP_SPLIT:
215 case OP_MERGE:
216 case OP_PHI:
217 case OP_UNION:
218 /* recurse again */
219 for (int s = 0; insn->srcExists(s); ++s)
220 findOverwritingDefs(texi, insn->getSrc(s)->getUniqueInsn(), term,
221 uses);
222 break;
223 default:
224 // if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered ?
225 addTexUse(uses, insn, texi);
226 break;
227 }
228 }
229
// Collect the first "real" instructions consuming the results of 'texi',
// following the values through pseudo ops (SPLIT/MERGE/PHI/UNION and
// register-preserving MOVs) that don't manifest in the machine code.
void
NVC0LegalizePostRA::findFirstUses(const Instruction *texi,
                                  const Instruction *insn,
                                  std::list<TexUse> &uses)
{
   for (int d = 0; insn->defExists(d); ++d) {
      Value *v = insn->getDef(d);
      for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
         Instruction *usei = (*u)->getInsn();

         if (usei->op == OP_PHI || usei->op == OP_UNION) {
            // need a barrier before WAW cases
            for (int s = 0; usei->srcExists(s); ++s) {
               // check the *other* sources: their defs may overwrite the
               // registers the TEX writes
               Instruction *defi = usei->getSrc(s)->getUniqueInsn();
               if (defi && &usei->src(s) != *u)
                  findOverwritingDefs(texi, defi, usei->bb, uses);
            }
         }

         if (usei->op == OP_SPLIT ||
             usei->op == OP_MERGE ||
             usei->op == OP_PHI ||
             usei->op == OP_UNION) {
            // these uses don't manifest in the machine code
            findFirstUses(texi, usei, uses);
         } else
         if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
             usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
            // a same-register move is a no-op too; keep following
            findFirstUses(texi, usei, uses);
         } else {
            addTexUse(uses, usei, insn);
         }
      }
   }
}
265
// Texture barriers:
// This pass is a bit long and ugly and can probably be optimized.
//
// 1. obtain a list of TEXes and their outputs' first use(s)
// 2. calculate the barrier level of each first use (minimal number of TEXes,
//    over all paths, between the TEX and the use in question)
// 3. for each barrier, if all paths from the source TEX to that barrier
//    contain a barrier of lesser level, it can be culled
bool
NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
{
   std::list<TexUse> *uses;
   std::vector<Instruction *> texes;
   std::vector<int> bbFirstTex;
   std::vector<int> bbFirstUse;
   std::vector<int> texCounts;
   std::vector<TexUse> useVec;
   ArrayList insns;

   fn->orderInstructions(insns); // assigns the serial numbers used below

   texCounts.resize(fn->allBBlocks.getSize(), 0);
   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());

   // tag BB CFG nodes by their id for later
   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
      if (bb)
         bb->cfg.tag = bb->getId();
   }

   // gather the first uses for each TEX
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
      if (isTextureOp(tex->op)) {
         texes.push_back(tex);
         // remember the index (into texes) of the first TEX of each block
         if (!texCounts.at(tex->bb->getId()))
            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
         texCounts[tex->bb->getId()]++;
      }
   }
   insns.clear();
   if (texes.empty())
      return false;
   uses = new std::list<TexUse>[texes.size()];
   if (!uses)
      return false;
   for (size_t i = 0; i < texes.size(); ++i)
      findFirstUses(texes[i], texes[i], uses[i]);

   // determine the barrier level at each use
   for (size_t i = 0; i < texes.size(); ++i) {
      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
           ++u) {
         BasicBlock *tb = texes[i]->bb;
         BasicBlock *ub = u->insn->bb;
         if (tb == ub) {
            // same block: count the TEXes issued between TEX and use
            u->level = 0;
            for (size_t j = i + 1; j < texes.size() &&
                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         } else {
            // different blocks: cheapest path through the CFG, weighted by
            // per-block TEX counts
            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
                                                      &ub->cfg, texCounts);
            if (u->level < 0) {
               WARN("Failed to find path TEX -> TEXBAR\n");
               u->level = 0;
               continue;
            }
            // this counted all TEXes in the origin block, correct that
            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
            // and did not count the TEXes in the destination block, add those
            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         }
         assert(u->level >= 0);
         useVec.push_back(*u);
      }
   }
   delete[] uses;
   uses = NULL;

   // insert the barriers
   for (size_t i = 0; i < useVec.size(); ++i) {
      Instruction *prev = useVec[i].insn->prev;
      if (useVec[i].level < 0)
         continue;
      if (prev && prev->op == OP_TEXBAR) {
         // merge with an immediately preceding barrier, keep the lower level
         if (prev->subOp > useVec[i].level)
            prev->subOp = useVec[i].level;
         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
      } else {
         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
         bar->fixed = 1;
         bar->subOp = useVec[i].level;
         // make use explicit to ease latency calculation
         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
      }
   }

   // the culling below is only done at high optimization levels
   if (fn->getProgram()->optLevel < 3) {
      if (uses)
         delete[] uses;
      return true;
   }

   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single

   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitS.resize(fn->allBBlocks.getSize());

   // cull unneeded barriers (should do that earlier, but for simplicity)
   IteratorRef bi = fn->cfg.iteratorCFG();
   // first calculate min/max outstanding TEXes for each BB
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      int min = 0;
      int max = std::numeric_limits<int>::max();
      for (Instruction *i = bb->getFirst(); i; i = i->next) {
         if (isTextureOp(i->op)) {
            min++;
            if (max < std::numeric_limits<int>::max())
               max++;
         } else
         if (i->op == OP_TEXBAR) {
            // a barrier caps the number of outstanding TEXes at its level
            min = MIN2(min, i->subOp);
            max = MIN2(max, i->subOp);
         }
      }
      // limits when looking at an isolated block
      limitS[bb->getId()].min = min;
      limitS[bb->getId()].max = max;
   }
   // propagate the min/max values
   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
      for (bi->reset(); !bi->end(); bi->next()) {
         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
         BasicBlock *bb = BasicBlock::get(n);
         const int bbId = bb->getId();
         // entry limits are the max over the predecessors' exit limits
         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
            BasicBlock *in = BasicBlock::get(ei.getNode());
            const int inId = in->getId();
            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
         }
         // I just hope this is correct ...
         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
            // no barrier
            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
         } else {
            // block contained a barrier
            limitB[bbId].min = MIN2(limitS[bbId].max,
                                    limitT[bbId].min + limitS[bbId].min);
            limitB[bbId].max = MIN2(limitS[bbId].max,
                                    limitT[bbId].max + limitS[bbId].min);
         }
      }
   }
   // finally delete unnecessary barriers
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      Instruction *prev = NULL;
      Instruction *next;
      int max = limitT[bb->getId()].max;
      for (Instruction *i = bb->getFirst(); i; i = next) {
         next = i->next;
         if (i->op == OP_TEXBAR) {
            if (i->subOp >= max) {
               // barrier can't lower the outstanding count -> redundant
               delete_Instruction(prog, i);
            } else {
               max = i->subOp;
               // a directly preceding barrier of a higher level is subsumed
               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
                  delete_Instruction(prog, prev);
                  prev = NULL;
               }
            }
         } else
         if (isTextureOp(i->op)) {
            max++;
         }
         if (!i->isNop())
            prev = i;
      }
   }
   if (uses)
      delete[] uses;
   return true;
}
463
bool
NVC0LegalizePostRA::visit(Function *fn)
{
   if (needTexBar)
      insertTextureBarriers(fn);

   // Set up the fixed values used by the per-BB pass: rZero gets the id one
   // past the allocatable GPR range (presumably the hardware zero register —
   // TODO confirm against the emitter), carry is flags register 0 ($c0).
   rZero = new_LValue(fn, FILE_GPR);
   carry = new_LValue(fn, FILE_FLAGS);

   rZero->reg.data.id = prog->getTarget()->getFileSize(FILE_GPR);
   carry->reg.data.id = 0;

   return true;
}
478
479 void
480 NVC0LegalizePostRA::replaceZero(Instruction *i)
481 {
482 for (int s = 0; i->srcExists(s); ++s) {
483 if (s == 2 && i->op == OP_SUCLAMP)
484 continue;
485 ImmediateValue *imm = i->getSrc(s)->asImm();
486 if (imm && imm->reg.data.u64 == 0)
487 i->setSrc(s, rZero);
488 }
489 }
490
// replace CONT with BRA for single unconditional continue
bool
NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
{
   // only applies to a PRECONT block with exactly two incoming edges
   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
      return false;
   // locate the back edge among the two incident edges
   Graph::EdgeIterator ei = bb->cfg.incident();
   if (ei.getType() != Graph::Edge::BACK)
      ei.next();
   if (ei.getType() != Graph::Edge::BACK)
      return false;
   BasicBlock *contBB = BasicBlock::get(ei.getNode());

   // only an unpredicated CONT terminating that block can become a BRA
   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
       contBB->getExit()->getPredicate())
      return false;
   contBB->getExit()->op = OP_BRA;
   bb->remove(bb->getEntry()); // delete PRECONT

   // sanity: there must not be a second back edge
   ei.next();
   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
   return true;
}
514
// replace branches to join blocks with join ops
void
NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
{
   // the block must start with a JOIN that is still allowed to propagate
   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
      return;
   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      BasicBlock *in = BasicBlock::get(ei.getNode());
      Instruction *exit = in->getExit();
      if (!exit) {
         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
         // there should always be a terminator instruction
         WARN("inserted missing terminator in BB:%i\n", in->getId());
      } else
      if (exit->op == OP_BRA) {
         // turn the branch itself into the join
         exit->op = OP_JOIN;
         exit->asFlow()->limit = 1; // must-not-propagate marker
      }
   }
   // the JOIN now lives in the predecessors' exits
   bb->remove(bb->getEntry());
}
536
bool
NVC0LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->op == OP_EMIT || i->op == OP_RESTART) {
         // drop unused defs; the emit counter source must start at zero
         if (!i->getDef(0)->refCount())
            i->setDef(0, NULL);
         if (i->src(0).getFile() == FILE_IMMEDIATE)
            i->setSrc(0, rZero); // initial value must be 0
      } else
      if (i->isNop()) {
         bb->remove(i);
      } else {
         // TODO: Move this to before register allocation for operations that
         // need the $c register !
         if (typeSizeof(i->dType) == 8) {
            Instruction *hi;
            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
            if (hi)
               next = hi; // revisit the newly created high-word instruction
         }

         if (i->op != OP_MOV && i->op != OP_PFETCH)
            replaceZero(i);
      }
   }
   // nothing left to inspect in an emptied block
   if (!bb->getEntry())
      return true;

   // flow cleanup: continue->branch conversion, else join propagation
   if (!tryReplaceContWithBra(bb))
      propagateJoin(bb);

   return true;
}
575
// Main nvc0/nve4 lowering pass: translates generic IR operations (texture
// ops, surface ops, atomics, system-value accesses, exports, ...) into
// hardware-supported forms.
class NVC0LoweringPass : public Pass
{
public:
   NVC0LoweringPass(Program *);

private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);
   virtual bool visit(Instruction *);

   bool handleRDSV(Instruction *);
   bool handleWRSV(Instruction *);
   bool handleEXPORT(Instruction *);
   bool handleOUT(Instruction *);
   bool handleDIV(Instruction *);
   bool handleMOD(Instruction *);
   bool handleSQRT(Instruction *);
   bool handlePOW(Instruction *);
   bool handleTEX(TexInstruction *);
   bool handleTXD(TexInstruction *);
   bool handleTXQ(TexInstruction *);
   bool handleManualTXD(TexInstruction *);
   bool handleATOM(Instruction *);
   void handleSurfaceOpNVE4(TexInstruction *);

   void checkPredicate(Instruction *);

   // read one tessellation coordinate component into dst
   void readTessCoord(LValue *dst, int c);

   // 32 bit loads from the driver's surface / multisample info areas in c[]
   Value *loadResInfo32(Value *ptr, uint32_t off);
   Value *loadMsInfo32(Value *ptr, uint32_t off);

   void adjustCoordinatesMS(TexInstruction *);
   void processSurfaceCoordsNVE4(TexInstruction *);

private:
   const Target *const targ;

   BuildUtil bld;

   Symbol *gMemBase;
   LValue *gpEmitAddress; // emit address for geometry programs, set up in
                          // visit(Function *)
};
619
// Bind the builder to the program; gMemBase starts out unset.
NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
{
   bld.setProgram(prog);
   gMemBase = NULL;
}
625
// For geometry programs, initialize the emit address at function entry and
// move it to register 0 at the exit.
bool
NVC0LoweringPass::visit(Function *fn)
{
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      assert(!strncmp(fn->getName(), "MAIN", 4));
      // TODO: when we generate actual functions pass this value along somehow
      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
      if (fn->cfgExit) {
         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
         bld.mkMovToReg(0, gpEmitAddress);
      }
   }
   return true;
}
641
// No per-block work; lowering is done in visit(Instruction *).
bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}
647
// move array source to first slot, convert to u16, add indirections
bool
NVC0LoweringPass::handleTEX(TexInstruction *i)
{
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int arg = i->tex.target.getArgCount();

   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET) {
      // GK104+ path
      if (i->tex.r == i->tex.s) {
         i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s = 0; // only a single cX[] value possible here
      } else {
         // TODO: extract handles and use register to select TIC/TSC entries
      }
      if (i->tex.target.isArray()) {
         // move the layer index into source slot 0, converted to u16
         LValue *layer = new_LValue(func, FILE_GPR);
         Value *src = i->getSrc(arg - 1);
         // TXF takes an integer layer index, the others a float one
         const int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
         // shift coordinates up by one slot to make room in slot 0
         for (int s = dim; s >= 1; --s)
            i->setSrc(s, i->getSrc(s - 1));
         i->setSrc(0, layer);
      }
      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
         // indirect resource/sampler: load values from c15[] and combine
         // them into a single register source
         Value *tmp[2];
         Symbol *bind;
         Value *rRel = i->getIndirectR();
         Value *sRel = i->getIndirectS();
         Value *shCnt = bld.loadImm(NULL, 2);

         if (rRel) {
            tmp[0] = bld.getScratch();
            bind = bld.mkSymbol(FILE_MEMORY_CONST, 15, TYPE_U32, i->tex.r * 4);
            bld.mkOp2(OP_SHL, TYPE_U32, tmp[0], rRel, shCnt);
            tmp[1] = bld.mkLoadv(TYPE_U32, bind, tmp[0]);
            // keep only the low 24 bits of the loaded value
            bld.mkOp2(OP_AND, TYPE_U32, tmp[0], tmp[1],
                      bld.loadImm(tmp[0], 0x00ffffffu));
            rRel = tmp[0];
            i->setSrc(i->tex.rIndirectSrc, NULL);
         }
         if (sRel) {
            tmp[0] = bld.getScratch();
            bind = bld.mkSymbol(FILE_MEMORY_CONST, 15, TYPE_U32, i->tex.s * 4);
            bld.mkOp2(OP_SHL, TYPE_U32, tmp[0], sRel, shCnt);
            tmp[1] = bld.mkLoadv(TYPE_U32, bind, tmp[0]);
            // keep only the high 8 bits of the loaded value
            bld.mkOp2(OP_AND, TYPE_U32, tmp[0], tmp[1],
                      bld.loadImm(tmp[0], 0xff000000u));
            sRel = tmp[0];
            i->setSrc(i->tex.sIndirectSrc, NULL);
         }
         bld.mkOp2(OP_OR, TYPE_U32, rRel, rRel, sRel);

         // place the combined value into the lowest indirect source slot
         int min = i->tex.rIndirectSrc;
         if (min < 0 || min > i->tex.sIndirectSrc)
            min = i->tex.sIndirectSrc;
         for (int s = min; s >= 1; --s)
            i->setSrc(s, i->getSrc(s - 1));
         i->setSrc(0, rRel);
      }
   } else
   // (nvc0) generate and move the tsc/tic/array source to the front
   if (dim != arg || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(arg - 1) : NULL;
      for (int s = dim; s >= 1; --s)
         i->setSrc(s, i->getSrc(s - 1));
      i->setSrc(0, arrayIndex);

      Value *ticRel = i->getIndirectR();
      Value *tscRel = i->getIndirectS();

      if (arrayIndex) {
         // TXF takes an integer layer index, the others a float one
         int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
      } else {
         bld.loadImm(src, 0);
      }

      // insert the indirect TIC/TSC indices into 'src' via bitfield insert;
      // the immediates appear to encode (width << 8) | offset per the
      // 0xttxsaaaa layout above — TODO confirm against the INSBF emitter
      if (ticRel) {
         i->setSrc(i->tex.rIndirectSrc, NULL);
         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
      }
      if (tscRel) {
         i->setSrc(i->tex.sIndirectSrc, NULL);
         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
      }

      i->setSrc(0, src);
   }

   // offset is last source (lod 1st, dc 2nd)
   if (i->tex.useOffsets) {
      // pack up to 3 offsets x 3 components, 4 bits each, into an immediate
      uint32_t value = 0;
      int n, c;
      int s = i->srcCount(0xff);
      for (n = 0; n < i->tex.useOffsets; ++n)
         for (c = 0; c < 3; ++c)
            value |= (i->tex.offset[n][c] & 0xf) << (n * 12 + c * 4);
      i->setSrc(s, bld.loadImm(NULL, value));
   }

   return true;
}
754
// Expand TXD into per-lane TEX operations: for each of the 4 quad lanes,
// broadcast that lane's coordinates to the whole quad, apply its dPdx/dPdy
// via quadops, issue a TEX, and keep the result only in that lane; finally
// union the four per-lane results into the original defs.
bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   // quadop subop pairs (dPdx-op, dPdy-op) selecting how the derivative is
   // combined into each lane, for source lane l = 0..3
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4]; // def[component][lane]
   Value *crd[3];    // per-iteration coordinate temporaries
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim();

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, crd[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l; // only lane l retains this iteration's value
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   // each output component is the union of its four per-lane values
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}
812
813 bool
814 NVC0LoweringPass::handleTXD(TexInstruction *txd)
815 {
816 int dim = txd->tex.target.getDim();
817 int arg = txd->tex.target.getArgCount();
818
819 handleTEX(txd);
820 while (txd->srcExists(arg))
821 ++arg;
822
823 txd->tex.derivAll = true;
824 if (dim > 2 ||
825 txd->tex.target.isCube() ||
826 arg > 4 ||
827 txd->tex.target.isShadow())
828 return handleManualTXD(txd);
829
830 for (int c = 0; c < dim; ++c) {
831 txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
832 txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
833 txd->dPdx[c].set(NULL);
834 txd->dPdy[c].set(NULL);
835 }
836 return true;
837 }
838
// Nothing to lower for a direct TXQ yet.
bool
NVC0LoweringPass::handleTXQ(TexInstruction *txq)
{
   // TODO: indirect resource/sampler index
   return true;
}
845
// Convert local/shared-memory atomics to global ones by adding the base of
// the respective memory window (read via SV_LBASE / SV_SBASE) to the
// address; global atomics pass through unchanged.
bool
NVC0LoweringPass::handleATOM(Instruction *atom)
{
   SVSemantic sv;

   switch (atom->src(0).getFile()) {
   case FILE_MEMORY_LOCAL:
      sv = SV_LBASE;
      break;
   case FILE_MEMORY_SHARED:
      sv = SV_SBASE;
      break;
   default:
      assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
      return true;
   }
   // window base address from the system value
   Value *base =
      bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
   Value *ptr = atom->getIndirect(0, 0);

   // retarget the memory symbol at global memory; fold the original
   // indirect pointer (if any) into the new base
   atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
   atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
   if (ptr)
      base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
   atom->setIndirect(0, 0, base);

   return true;
}
874
875 inline Value *
876 NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
877 {
878 uint8_t b = prog->driver->io.resInfoCBSlot;
879 off += prog->driver->io.suInfoBase;
880 return bld.
881 mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
882 }
883
884 inline Value *
885 NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
886 {
887 uint8_t b = prog->driver->io.msInfoCBSlot;
888 off += prog->driver->io.msInfoBase;
889 return bld.
890 mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
891 }
892
893 /* On nvc0, surface info is obtained via the surface binding points passed
894 * to the SULD/SUST instructions.
895 * On nve4, surface info is stored in c[] and is used by various special
896 * instructions, e.g. for clamping coordinates or generating an address.
897 * They couldn't just have added an equivalent to TIC now, couldn't they ?
898 */
899 #define NVE4_SU_INFO_ADDR 0x00
900 #define NVE4_SU_INFO_FMT 0x04
901 #define NVE4_SU_INFO_DIM_X 0x08
902 #define NVE4_SU_INFO_PITCH 0x0c
903 #define NVE4_SU_INFO_DIM_Y 0x10
904 #define NVE4_SU_INFO_ARRAY 0x14
905 #define NVE4_SU_INFO_DIM_Z 0x18
906 #define NVE4_SU_INFO_UNK1C 0x1c
907 #define NVE4_SU_INFO_WIDTH 0x20
908 #define NVE4_SU_INFO_HEIGHT 0x24
909 #define NVE4_SU_INFO_DEPTH 0x28
910 #define NVE4_SU_INFO_TARGET 0x2c
911 #define NVE4_SU_INFO_CALL 0x30
912 #define NVE4_SU_INFO_RAW_X 0x34
913 #define NVE4_SU_INFO_MS_X 0x38
914 #define NVE4_SU_INFO_MS_Y 0x3c
915
916 #define NVE4_SU_INFO__STRIDE 0x40
917
918 #define NVE4_SU_INFO_DIM(i) (0x08 + (i) * 8)
919 #define NVE4_SU_INFO_SIZE(i) (0x20 + (i) * 4)
920 #define NVE4_SU_INFO_MS(i) (0x38 + (i) * 4)
921
922 static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
923 {
924 switch (su->tex.target.getEnum()) {
925 case TEX_TARGET_BUFFER: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
926 case TEX_TARGET_RECT: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
927 case TEX_TARGET_1D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
928 case TEX_TARGET_1D_ARRAY: return (c == 1) ?
929 NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
930 NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
931 case TEX_TARGET_2D: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
932 case TEX_TARGET_2D_MS: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
933 case TEX_TARGET_2D_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
934 case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
935 case TEX_TARGET_3D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
936 case TEX_TARGET_CUBE: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
937 case TEX_TARGET_CUBE_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
938 default:
939 assert(0);
940 return 0;
941 }
942 }
943
// Lower multisampled texture targets to their non-MS equivalents: scale
// the x/y coordinates by the per-surface shift amounts and add the
// per-sample offsets from the MS info table, then drop the sample index.
void
NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
{
   const uint16_t base = tex->tex.r * NVE4_SU_INFO__STRIDE;
   const int arg = tex->tex.target.getArgCount();

   // only the two MS targets need adjustment; demote them in place
   if (tex->tex.target == TEX_TARGET_2D_MS)
      tex->tex.target = TEX_TARGET_2D;
   else
   if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
      tex->tex.target = TEX_TARGET_2D_ARRAY;
   else
      return;

   Value *x = tex->getSrc(0);
   Value *y = tex->getSrc(1);
   Value *s = tex->getSrc(arg - 1); // sample index

   Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();

   // per-surface shift amounts (presumably log2 of the sample grid size —
   // TODO confirm against the state setup code)
   Value *ms_x = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(0));
   Value *ms_y = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(1));

   bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
   bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);

   // MS info table index: (s & 7) * 8 bytes
   s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
   s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));

   // per-sample x/y offsets
   Value *dx = loadMsInfo32(ts, 0x0);
   Value *dy = loadMsInfo32(ts, 0x4);

   bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
   bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);

   tex->setSrc(0, tx);
   tex->setSrc(1, ty);
   tex->moveSources(arg, -1); // remove the sample-index source slot
}
983
984 // Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
985 // They're computed from the coordinates using the surface info in c[] space.
// Computes the clamped coordinates, the pixel/byte offsets and the final
// 64-bit address for an NVE4 surface access, then rewrites the surface
// instruction's sources to (64-bit address, format info, bound-check
// predicate). The per-surface dimensions/pitch/address are read from the
// driver's resource info records in c[] space.
void
NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
{
   Instruction *insn;
   const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
   const bool raw =
      su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
   const int idx = su->tex.r;                                // resource index
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() ? 1 : 0); // coord count
   const uint16_t base = idx * NVE4_SU_INFO__STRIDE;  // this surface's record
   int c;
   Value *zero = bld.mkImm(0);
   Value *p1 = NULL;              // out-of-bounds flag of the layer coordinate
   Value *v;
   Value *src[3];                 // clamped coordinates
   Value *bf, *eau, *off;         // byte offset / effective-address parts
   Value *addr, *pred;

   off = bld.getScratch(4);
   bf = bld.getScratch(4);
   addr = bld.getSSA(8);          // 64 bit
   pred = bld.getScratch(1, FILE_PREDICATE);

   bld.setPosition(su, false);

   adjustCoordinatesMS(su);

   // calculate clamped coordinates
   for (c = 0; c < arg; ++c) {
      src[c] = bld.getScratch();
      if (c == 0 && raw)
         // raw accesses clamp x against the byte-scaled width
         v = loadResInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
      else
         v = loadResInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
         ->subOp = getSuClampSubOp(su, c);
   }
   // unused coordinates are zero
   for (; c < 3; ++c)
      src[c] = zero;

   // set predicate output
   if (su->tex.target == TEX_TARGET_BUFFER) {
      src[0]->getInsn()->setFlagsDef(1, pred);
   } else
   if (su->tex.target.isArray()) {
      // the layer coordinate gets a separate flag, merged into pred below
      p1 = bld.getSSA(1, FILE_PREDICATE);
      src[dim]->getInsn()->setFlagsDef(1, p1);
   }

   // calculate pixel offset
   if (dim == 1) {
      if (su->tex.target != TEX_TARGET_BUFFER)
         bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
   } else
   if (dim == 3) {
      v = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
         ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l

      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
         ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
   } else {
      assert(dim == 2);
      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
         ->subOp = su->tex.target.isArray() ?
         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
   }

   // calculate effective address part 1
   if (su->tex.target == TEX_TARGET_BUFFER) {
      if (raw) {
         bf = src[0];
      } else {
         // shift the element index by the amount stored in the format info
         v = loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
         bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
            ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
      }
   } else {
      Value *y = src[1];
      Value *z = src[2];
      uint16_t subOp = 0;

      switch (dim) {
      case 1:
         y = zero;
         z = zero;
         break;
      case 2:
         z = off;
         if (!su->tex.target.isArray()) {
            // 2D (non-array) is addressed like a 3D surface here
            z = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
            subOp = NV50_IR_SUBOP_SUBFM_3D;
         }
         break;
      default:
         subOp = NV50_IR_SUBOP_SUBFM_3D;
         assert(dim == 3);
         break;
      }
      insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
      insn->subOp = subOp;
      insn->setFlagsDef(1, pred);
   }

   // part 2
   v = loadResInfo32(NULL, base + NVE4_SU_INFO_ADDR);

   if (su->tex.target == TEX_TARGET_BUFFER) {
      eau = v;
   } else {
      eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
   }
   // add array layer offset
   if (su->tex.target.isArray()) {
      v = loadResInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
      if (dim == 1)
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
            ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
      else
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
            ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
      // combine predicates
      assert(p1);
      bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
   }

   if (atom) {
      // atomics want the raw g[] address split differently:
      Value *lo = bf;
      if (su->tex.target == TEX_TARGET_BUFFER) {
         lo = zero;
         bld.mkMov(off, bf);
      }
      // bf == g[] address & 0xff
      // eau == g[] address >> 8
      bld.mkOp3(OP_PERMT, TYPE_U32, bf, lo, bld.loadImm(NULL, 0x6540), eau);
      bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
   } else
   if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
      // Convert from u32 to u8 address format, which is what the library code
      // doing SULDP currently uses.
      // XXX: can SUEAU do this ?
      // XXX: does it matter that we don't mask high bytes in bf ?
      // Grrr.
      bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
      bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
   }

   bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);

   if (atom && su->tex.target == TEX_TARGET_BUFFER)
      bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);

   // let's just set it 0 for raw access and hope it works
   v = raw ?
      bld.mkImm(0) : loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);

   // get rid of old coordinate sources, make space for fmt info and predicate
   su->moveSources(arg, 3 - arg);
   // set 64 bit address and 32-bit format sources
   su->setSrc(0, addr);
   su->setSrc(1, v);
   su->setSrc(2, pred);
}
1152
1153 void
1154 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
1155 {
1156 processSurfaceCoordsNVE4(su);
1157
1158 // Who do we hate more ? The person who decided that nvc0's SULD doesn't
1159 // have to support conversion or the person who decided that, in OpenCL,
1160 // you don't have to specify the format here like you do in OpenGL ?
1161
1162 if (su->op == OP_SULDP) {
1163 // We don't patch shaders. Ever.
1164 // You get an indirect call to our library blob here.
1165 // But at least it's uniform.
1166 FlowInstruction *call;
1167 LValue *p[3];
1168 LValue *r[5];
1169 uint16_t base = su->tex.r * NVE4_SU_INFO__STRIDE + NVE4_SU_INFO_CALL;
1170
1171 for (int i = 0; i < 4; ++i)
1172 (r[i] = bld.getScratch(4, FILE_GPR))->reg.data.id = i;
1173 for (int i = 0; i < 3; ++i)
1174 (p[i] = bld.getScratch(1, FILE_PREDICATE))->reg.data.id = i;
1175 (r[4] = bld.getScratch(8, FILE_GPR))->reg.data.id = 4;
1176
1177 bld.mkMov(p[1], bld.mkImm((su->cache == CACHE_CA) ? 1 : 0), TYPE_U8);
1178 bld.mkMov(p[2], bld.mkImm((su->cache == CACHE_CG) ? 1 : 0), TYPE_U8);
1179 bld.mkMov(p[0], su->getSrc(2), TYPE_U8);
1180 bld.mkMov(r[4], su->getSrc(0), TYPE_U64);
1181 bld.mkMov(r[2], su->getSrc(1), TYPE_U32);
1182
1183 call = bld.mkFlow(OP_CALL, NULL, su->cc, su->getPredicate());
1184
1185 call->indirect = 1;
1186 call->absolute = 1;
1187 call->setSrc(0, bld.mkSymbol(FILE_MEMORY_CONST,
1188 prog->driver->io.resInfoCBSlot, TYPE_U32,
1189 prog->driver->io.suInfoBase + base));
1190 call->setSrc(1, r[2]);
1191 call->setSrc(2, r[4]);
1192 for (int i = 0; i < 3; ++i)
1193 call->setSrc(3 + i, p[i]);
1194 for (int i = 0; i < 4; ++i) {
1195 call->setDef(i, r[i]);
1196 bld.mkMov(su->getDef(i), r[i]);
1197 }
1198 call->setDef(4, p[1]);
1199 delete_Instruction(bld.getProgram(), su);
1200 }
1201
1202 if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
1203 Value *pred = su->getSrc(2);
1204 CondCode cc = CC_NOT_P;
1205 if (su->getPredicate()) {
1206 pred = bld.getScratch(1, FILE_PREDICATE);
1207 cc = su->cc;
1208 if (cc == CC_NOT_P) {
1209 bld.mkOp2(OP_OR, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
1210 } else {
1211 bld.mkOp2(OP_AND, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
1212 pred->getInsn()->src(1).mod = Modifier(NV50_IR_MOD_NOT);
1213 }
1214 }
1215 Instruction *red = bld.mkOp(OP_ATOM, su->dType, su->getDef(0));
1216 red->subOp = su->subOp;
1217 if (!gMemBase)
1218 gMemBase = bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0);
1219 red->setSrc(0, gMemBase);
1220 red->setSrc(1, su->getSrc(3));
1221 if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
1222 red->setSrc(2, su->getSrc(4));
1223 red->setIndirect(0, 0, su->getSrc(0));
1224 red->setPredicate(cc, pred);
1225 delete_Instruction(bld.getProgram(), su);
1226 } else {
1227 su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
1228 }
1229 }
1230
1231 bool
1232 NVC0LoweringPass::handleWRSV(Instruction *i)
1233 {
1234 Instruction *st;
1235 Symbol *sym;
1236 uint32_t addr;
1237
1238 // must replace, $sreg are not writeable
1239 addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
1240 if (addr >= 0x400)
1241 return false;
1242 sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
1243
1244 st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
1245 i->getSrc(1));
1246 st->perPatch = i->perPatch;
1247
1248 bld.getBB()->remove(i);
1249 return true;
1250 }
1251
1252 void
1253 NVC0LoweringPass::readTessCoord(LValue *dst, int c)
1254 {
1255 Value *laneid = bld.getSSA();
1256 Value *x, *y;
1257
1258 bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
1259
1260 if (c == 0) {
1261 x = dst;
1262 y = NULL;
1263 } else
1264 if (c == 1) {
1265 x = NULL;
1266 y = dst;
1267 } else {
1268 assert(c == 2);
1269 x = bld.getSSA();
1270 y = bld.getSSA();
1271 }
1272 if (x)
1273 bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
1274 if (y)
1275 bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
1276
1277 if (c == 2) {
1278 bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
1279 bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
1280 }
1281 }
1282
// Lower RDSV (read system value). System values without a real $sreg
// backing (address < 0x400) are replaced by interpolation, input fetches
// or small computations; actual $sreg reads are kept as-is.
bool
NVC0LoweringPass::handleRDSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();
   Value *vtx = NULL;
   Instruction *ld;
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);

   if (addr >= 0x400) // mov $sreg
      return true;

   switch (i->getSrc(0)->reg.data.sv.sv) {
   case SV_POSITION:
      assert(prog->getType() == Program::TYPE_FRAGMENT);
      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
      break;
   case SV_FACE:
   {
      Value *face = i->getDef(0);
      bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
      if (i->dType == TYPE_F32) {
         // map the face value to +/-1.0f: isolate the sign bit, then xor
         // with the bit pattern of -1.0f (0xbf800000)
         bld.mkOp2(OP_AND, TYPE_U32, face, face, bld.mkImm(0x80000000));
         bld.mkOp2(OP_XOR, TYPE_U32, face, face, bld.mkImm(0xbf800000));
      }
   }
      break;
   case SV_TESS_COORD:
      assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
      readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
      break;
   default:
      // generic input fetch; in tess eval shaders the fetch is indexed by
      // the PFETCH result -- presumably the vertex address, TODO confirm
      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL)
         vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
      ld = bld.mkFetch(i->getDef(0), i->dType,
                       FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
      ld->perPatch = i->perPatch;
      break;
   }
   bld.getBB()->remove(i);
   return true;
}
1324
1325 bool
1326 NVC0LoweringPass::handleDIV(Instruction *i)
1327 {
1328 if (!isFloatType(i->dType))
1329 return true;
1330 bld.setPosition(i, false);
1331 Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
1332 i->op = OP_MUL;
1333 i->setSrc(1, rcp->getDef(0));
1334 return true;
1335 }
1336
1337 bool
1338 NVC0LoweringPass::handleMOD(Instruction *i)
1339 {
1340 if (i->dType != TYPE_F32)
1341 return true;
1342 LValue *value = bld.getScratch();
1343 bld.mkOp1(OP_RCP, TYPE_F32, value, i->getSrc(1));
1344 bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(0), value);
1345 bld.mkOp1(OP_TRUNC, TYPE_F32, value, value);
1346 bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(1), value);
1347 i->op = OP_SUB;
1348 i->setSrc(1, value);
1349 return true;
1350 }
1351
1352 bool
1353 NVC0LoweringPass::handleSQRT(Instruction *i)
1354 {
1355 Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
1356 bld.getSSA(), i->getSrc(0));
1357 i->op = OP_MUL;
1358 i->setSrc(1, rsq->getDef(0));
1359
1360 return true;
1361 }
1362
1363 bool
1364 NVC0LoweringPass::handlePOW(Instruction *i)
1365 {
1366 LValue *val = bld.getScratch();
1367
1368 bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
1369 bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
1370 bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
1371
1372 i->op = OP_EX2;
1373 i->setSrc(0, val);
1374 i->setSrc(1, NULL);
1375
1376 return true;
1377 }
1378
1379 bool
1380 NVC0LoweringPass::handleEXPORT(Instruction *i)
1381 {
1382 if (prog->getType() == Program::TYPE_FRAGMENT) {
1383 int id = i->getSrc(0)->reg.data.offset / 4;
1384
1385 if (i->src(0).isIndirect(0)) // TODO, ugly
1386 return false;
1387 i->op = OP_MOV;
1388 i->subOp = NV50_IR_SUBOP_MOV_FINAL;
1389 i->src(0).set(i->src(1));
1390 i->setSrc(1, NULL);
1391 i->setDef(0, new_LValue(func, FILE_GPR));
1392 i->getDef(0)->reg.data.id = id;
1393
1394 prog->maxGPR = MAX2(prog->maxGPR, id);
1395 } else
1396 if (prog->getType() == Program::TYPE_GEOMETRY) {
1397 i->setIndirect(0, 1, gpEmitAddress);
1398 }
1399 return true;
1400 }
1401
1402 bool
1403 NVC0LoweringPass::handleOUT(Instruction *i)
1404 {
1405 if (i->op == OP_RESTART && i->prev && i->prev->op == OP_EMIT) {
1406 i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
1407 delete_Instruction(prog, i);
1408 } else {
1409 assert(gpEmitAddress);
1410 i->setDef(0, gpEmitAddress);
1411 if (i->srcExists(0))
1412 i->setSrc(1, i->getSrc(0));
1413 i->setSrc(0, gpEmitAddress);
1414 }
1415 return true;
1416 }
1417
1418 // Generate a binary predicate if an instruction is predicated by
1419 // e.g. an f32 value.
1420 void
1421 NVC0LoweringPass::checkPredicate(Instruction *insn)
1422 {
1423 Value *pred = insn->getPredicate();
1424 Value *pdst;
1425
1426 if (!pred || pred->reg.file == FILE_PREDICATE)
1427 return;
1428 pdst = new_LValue(func, FILE_PREDICATE);
1429
1430 // CAUTION: don't use pdst->getInsn, the definition might not be unique,
1431 // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
1432
1433 bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, pdst, bld.mkImm(0), pred);
1434
1435 insn->setPredicate(insn->cc, pdst);
1436 }
1437
//
// - add quadop dance for texturing
// - put FP outputs in GPRs
// - convert instruction sequences
//
bool
NVC0LoweringPass::visit(Instruction *i)
{
   // new instructions are inserted in front of the one being lowered
   bld.setPosition(i, false);

   // make sure the predicate lives in FILE_PREDICATE first
   if (i->cc != CC_ALWAYS)
      checkPredicate(i);

   switch (i->op) {
   case OP_TEX:
   case OP_TXB:
   case OP_TXL:
   case OP_TXF:
   case OP_TXG:
      return handleTEX(i->asTex());
   case OP_TXD:
      return handleTXD(i->asTex());
   case OP_TXQ:
      return handleTXQ(i->asTex());
   case OP_EX2:
      // EX2 needs its source preprocessed by PREEX2
      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
      i->setSrc(0, i->getDef(0));
      break;
   case OP_POW:
      return handlePOW(i);
   case OP_DIV:
      return handleDIV(i);
   case OP_MOD:
      return handleMOD(i);
   case OP_SQRT:
      return handleSQRT(i);
   case OP_EXPORT:
      return handleEXPORT(i);
   case OP_EMIT:
   case OP_RESTART:
      return handleOUT(i);
   case OP_RDSV:
      return handleRDSV(i);
   case OP_WRSV:
      return handleWRSV(i);
   case OP_LOAD:
      if (i->src(0).getFile() == FILE_SHADER_INPUT) {
         if (prog->getType() == Program::TYPE_COMPUTE) {
            // compute shader inputs are laid out in constant space
            i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
            i->getSrc(0)->reg.fileIndex = 0;
         } else {
            // in other stages, attribute loads become VFETCH
            i->op = OP_VFETCH;
            assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
         }
      }
      break;
   case OP_ATOM:
      handleATOM(i);
      break;
   case OP_SULDB:
   case OP_SULDP:
   case OP_SUSTB:
   case OP_SUSTP:
   case OP_SUREDB:
   case OP_SUREDP:
      // surface ops only lowered here for Kepler and up
      if (targ->getChipset() >= NVISA_GK104_CHIPSET)
         handleSurfaceOpNVE4(i->asTex());
      break;
   default:
      break;
   }
   return true;
}
1511
1512 bool
1513 TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
1514 {
1515 if (stage == CG_STAGE_PRE_SSA) {
1516 NVC0LoweringPass pass(prog);
1517 return pass.run(prog, false, true);
1518 } else
1519 if (stage == CG_STAGE_POST_RA) {
1520 NVC0LegalizePostRA pass(prog);
1521 return pass.run(prog, false, true);
1522 } else
1523 if (stage == CG_STAGE_SSA) {
1524 NVC0LegalizeSSA pass;
1525 return pass.run(prog, false, true);
1526 }
1527 return false;
1528 }
1529
1530 } // namespace nv50_ir