src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp

   1 /*
   2  * Copyright 2011 Christoph Bumiller
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 #include "codegen/nv50_ir.h"
  24 #include "codegen/nv50_ir_build_util.h"
  25
  26 #include "codegen/nv50_ir_target_nv50.h"
  27
  28 namespace nv50_ir {
  29
  30 // nv50 doesn't support 32 bit integer multiplication
  31 //
  32 //       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
  33 // -------------------
  34 //    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
  35 // ah*bh 00 00                 (           carry1) << 16 + ( carry2)
  36 //       al*bl
  37 //    ah*bl 00
  38 //
  39 // fffe0001 + fffe0001
  40 static bool
  41 expandIntegerMUL(BuildUtil *bld, Instruction *mul)
  42 {
        // Expands the integer multiply `mul` into multiplies/MADs on half-width
        // operands and deletes `mul`. Only S32/U32/S64/U64 source types are
        // handled; for any other type nothing is emitted and false is returned.
        // With subOp NV50_IR_SUBOP_MUL_HIGH the upper half of the product is
        // written to mul's def, otherwise the lower half.
  43    const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
  44
  45    DataType fTy = mul->sType; // full type
  46    DataType hTy;
  47    switch (fTy) {
  48    case TYPE_S32: hTy = TYPE_S16; break;
  49    case TYPE_U32: hTy = TYPE_U16; break;
  50    case TYPE_U64: hTy = TYPE_U32; break;
  51    case TYPE_S64: hTy = TYPE_S32; break;
  52    default:
  53       return false;
  54    }
  55    unsigned int fullSize = typeSizeof(fTy);
  56    unsigned int halfSize = typeSizeof(hTy);
  57
        // Indices of i[] are not in emission order. Slots 2..5 are the
        // multiplies/MADs whose sType is narrowed at the end; i[5] is only
        // assigned (and only visited by the final loop) when highResult is set.
  58    Instruction *i[9];
  59
        // NOTE(review): the `true` argument presumably selects insertion after
        // `mul` - confirm against BuildUtil::setPosition.
  60    bld->setPosition(mul, true);
  61
  62    Value *a[2], *b[2];
  63    Value *c[2];
  64    Value *t[4];
  65    for (int j = 0; j < 4; ++j)
  66       t[j] = bld->getSSA(fullSize);
  67
  68    // split sources into halves
        // (index 0 is used as the low half and index 1 as the high half in the
        // formulas below)
  69    i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
  70    i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
  71
        // t0 = al * bh
        // t1 = ah * bl + t0          (sum of the cross products)
        // t2 = t1 << half width
        // t3 = al * bl + t2          (low half of the full product)
  72    i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
  73    i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
  74    i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
  75    i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
  76
  77    if (highResult) {
  78       Value *r[3];
           // NOTE(review): this is an `int` shift; for the 64-bit types halfSize
           // is 4, which makes the shift count 32 - confirm that 64-bit MUL_HIGH
           // cannot reach this path.
  79       Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
  80       c[0] = bld->getSSA(1, FILE_FLAGS);
  81       c[1] = bld->getSSA(1, FILE_FLAGS);
  82       for (int j = 0; j < 3; ++j)
  83          r[j] = bld->getSSA(fullSize);
  84
           // r0 = t1 >> half width
           // r1 = r0 + (1 << half width), predicated on the carry of the
           //      cross-product MAD (c[0])
           // r2 = union(r1, r0)
           // result = ah * bh + r2, with c[1] (carry of the low MAD) as flags
           //          source
  85       i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
  86       i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
  87       bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
  88       i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
  89
  90       // set carry defs / sources
  91       i[3]->setFlagsDef(1, c[0]);
  92       i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
  93       i[6]->setPredicate(CC_C, c[0]);
  94       i[5]->setFlagsSrc(3, c[1]);
  95    } else {
           // low result: t3 already holds it, just move it into mul's def
  96       bld->mkMov(mul->getDef(0), t[3]);
  97    }
  98    delete_Instruction(bld->getProgram(), mul);
  99
        // the multiplies were built with the full type; their operands are
        // half-width, so set the source type accordingly
 100    for (int j = 2; j <= (highResult ? 5 : 4); ++j)
 101       if (i[j])
 102          i[j]->sType = hTy;
 103
 104    return true;
 105 }
 106
 107 #define QOP_ADD  0 // 2-bit per-lane operation selectors consumed by QUADOP()
 108 #define QOP_SUBR 1 // NOTE(review): exact hardware meaning of each selector
 109 #define QOP_SUB  2 // is not visible in this file; the names suggest add,
 110 #define QOP_MOV2 3 // reversed subtract, subtract and "move 2nd operand"
 111
     // Packs four selectors into one byte, one per lane of a quad:
     // bits 7:6 = q, bits 5:4 = r, bits 3:2 = s, bits 1:0 = t.
     // Arguments are given without the QOP_ prefix, e.g. QUADOP(SUBR, ADD, ...).
 112 //             UL UR LL LR
 113 #define QUADOP(q, r, s, t)            \
 114    ((QOP_##q << 6) | (QOP_##r << 4) | \
 115     (QOP_##s << 2) | (QOP_##t << 0))
 116
 117 class NV50LegalizePostRA : public Pass
 118 {
     // Legalization pass whose visitors remove no-ops, emulate PRERET on
     // chipsets below 0xa0, split 64-bit operations, replace zero immediates
     // with the r63 value and patch the exports recorded in prog->targetPriv.
 119 private:
        // per-function setup: creates r63 and rewrites the recorded exports
 120    virtual bool visit(Function *);
        // per-block instruction walk
 121    virtual bool visit(BasicBlock *);
 122
        // turns one PRERET into the three-instruction emulation sequence
 123    void handlePRERET(FlowInstruction *);
        // replaces immediate sources equal to 0 with r63
 124    void replaceZero(Instruction *);
 125
        // GPR value with register id 63, created in visit(Function *)
 126    LValue *r63;
 127 };
 128
 129 bool
 130 NV50LegalizePostRA::visit(Function *fn)
 131 {
 132    Program *prog = fn->getProgram();
 133
 134    r63 = new_LValue(fn, FILE_GPR);
 135    r63->reg.data.id = 63;
 136
 137    // this is actually per-program, but we can do it all on visiting main()
 138    std::list<Instruction *> *outWrites =
 139       reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
 140
 141    if (outWrites) {
 142       for (std::list<Instruction *>::iterator it = outWrites->begin();
 143            it != outWrites->end(); ++it)
 144          (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
 145       // instructions will be deleted on exit
 146       outWrites->clear();
 147    }
 148
 149    return true;
 150 }
 151
 152 void
 153 NV50LegalizePostRA::replaceZero(Instruction *i)
 154 {
 155    for (int s = 0; i->srcExists(s); ++s) {
 156       ImmediateValue *imm = i->getSrc(s)->asImm();
 157       if (imm && imm->reg.data.u64 == 0)
 158          i->setSrc(s, r63);
 159    }
 160 }
 161
 162 // Emulate PRERET: jump to the target and call to the origin from there
 163 //
 164 // WARNING: atm only works if BBs are affected by at most a single PRERET
 165 //
 166 // BB:0
 167 // preret BB:3
 168 // (...)
 169 // BB:3
 170 // (...)
 171 //             --->
 172 // BB:0
 173 // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
 174 // (...)
 175 // BB:3
 176 // bra BB:3 + n1 (skip the call)
 177 // call BB:0 + n2 (skip bra at beginning of BB:0)
 178 // (...)
 179 void
 180 NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
 181 {
 182    BasicBlock *bbE = pre->bb;
 183    BasicBlock *bbT = pre->target.bb;
 184
 185    pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
 186    bbE->remove(pre);
 187    bbE->insertHead(pre);
 188
 189    Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
 190    Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
 191
 192    bbT->insertHead(call);
 193    bbT->insertHead(skip);
 194
 195    // NOTE: maybe split blocks to prevent the instructions from moving ?
 196
 197    skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
 198    call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
 199 }
 200
 201 bool
 202 NV50LegalizePostRA::visit(BasicBlock *bb)
 203 {
 204    Instruction *i, *next;
 205
 206    // remove pseudo operations and non-fixed no-ops, split 64 bit operations
 207    for (i = bb->getFirst(); i; i = next) {
 208       next = i->next;
 209       if (i->isNop()) {
 210          bb->remove(i);
 211       } else
 212       if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
 213          handlePRERET(i->asFlow());
 214       } else {
 215          // TODO: We will want to do this before register allocation,
 216          // since have to use a $c register for the carry flag.
 217          if (typeSizeof(i->dType) == 8) {
 218             Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
 219             if (hi)
 220                next = hi;
 221          }
 222
 223          if (i->op != OP_MOV && i->op != OP_PFETCH &&
 224              i->op != OP_BAR &&
 225              (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
 226             replaceZero(i);
 227       }
 228    }
 229    if (!bb->getEntry())
 230       return true;
 231
 232    return true;
 233 }
 234
 235 class NV50LegalizeSSA : public Pass
 236 {
     // Legalization pass working on SSA values (it creates values through
     // BuildUtil::getSSA): expands integer MUL/MAD/DIV/MOD, fixes up
     // definitions of address-file values and, when enabled, records exports
     // whose data can be written by the producing instruction directly.
 237 public:
 238    NV50LegalizeSSA(Program *);
 239
 240    virtual bool visit(BasicBlock *bb);
 241
 242 private:
        // records and unlinks an export if its data producer can store directly
 243    void propagateWriteToOutput(Instruction *);
        // 32-bit integer division via f32 reciprocal estimates
 244    void handleDIV(Instruction *);
        // 32-bit integer modulo as a - (a / b) * b
 245    void handleMOD(Instruction *);
        // integer MUL/MAD wider than 16 bit -> expandIntegerMUL()
 246    void handleMUL(Instruction *);
        // makes an instruction defining an address-file value legal
 247    void handleAddrDef(Instruction *);
 248
        // true for SHL(GPR, immediate 0)
 249    inline bool isARL(const Instruction *) const;
 250
 251    BuildUtil bld;
 252
        // list taken from prog->targetPriv, or NULL when export propagation is
        // disabled (see the constructor)
 253    std::list<Instruction *> *outWrites;
 254 };
 255
 256 NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
 257 {
 258    bld.setProgram(prog);
 259
 260    if (prog->optLevel >= 2 &&
 261        (prog->getType() == Program::TYPE_GEOMETRY ||
 262         prog->getType() == Program::TYPE_VERTEX))
 263       outWrites =
 264          reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
 265    else
 266       outWrites = NULL;
 267 }
 268
 269 void
 270 NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
 271 {
 272    if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
 273       return;
 274
 275    // check def instruction can store
 276    Instruction *di = st->getSrc(1)->defs.front()->getInsn();
 277
 278    // TODO: move exports (if beneficial) in common opt pass
 279    if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
 280       return;
 281    for (int s = 0; di->srcExists(s); ++s)
 282       if (di->src(s).getFile() == FILE_IMMEDIATE)
 283          return;
 284
 285    // We cannot set defs to non-lvalues before register allocation, so
 286    // save & remove (to save registers) the exports and replace later.
 287    outWrites->push_back(st);
 288    st->bb->remove(st);
 289 }
 290
 291 bool
 292 NV50LegalizeSSA::isARL(const Instruction *i) const
 293 {
 294    ImmediateValue imm;
 295
 296    if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
 297       return false;
 298    if (!i->src(1).getImmediate(imm))
 299       return false;
 300    return imm.isInteger(0);
 301 }
 302
 303 void
 304 NV50LegalizeSSA::handleAddrDef(Instruction *i)
 305 {
 306    Instruction *arl;
 307
 308    i->getDef(0)->reg.size = 2; // $aX are only 16 bit
 309
 310    // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
 311    if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
 312       if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
 313          return;
 314       if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
 315          return;
 316    }
 317
 318    // turn $a sources into $r sources (can't operate on $a)
 319    for (int s = 0; i->srcExists(s); ++s) {
 320       Value *a = i->getSrc(s);
 321       Value *r;
 322       if (a->reg.file == FILE_ADDRESS) {
 323          if (a->getInsn() && isARL(a->getInsn())) {
 324             i->setSrc(s, a->getInsn()->getSrc(0));
 325          } else {
 326             bld.setPosition(i, false);
 327             r = bld.getSSA();
 328             bld.mkMov(r, a);
 329             i->setSrc(s, r);
 330          }
 331       }
 332    }
 333    if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
 334       return;
 335
 336    // turn result back into $a
 337    bld.setPosition(i, true);
 338    arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
 339    i->setDef(0, arl->getSrc(0));
 340 }
 341
 342 void
 343 NV50LegalizeSSA::handleMUL(Instruction *mul)
 344 {
 345    if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
 346       return;
 347    Value *def = mul->getDef(0);
 348    Value *pred = mul->getPredicate();
 349    CondCode cc = mul->cc;
 350    if (pred)
 351       mul->setPredicate(CC_ALWAYS, NULL);
 352
 353    if (mul->op == OP_MAD) {
 354       Instruction *add = mul;
 355       bld.setPosition(add, false);
 356       Value *res = cloneShallow(func, mul->getDef(0));
 357       mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
 358       add->op = OP_ADD;
 359       add->setSrc(0, mul->getDef(0));
 360       add->setSrc(1, add->getSrc(2));
 361       for (int s = 2; add->srcExists(s); ++s)
 362          add->setSrc(s, NULL);
 363       mul->subOp = add->subOp;
 364       add->subOp = 0;
 365    }
 366    expandIntegerMUL(&bld, mul);
 367    if (pred)
 368       def->getInsn()->setPredicate(cc, pred);
 369 }
 370
 371 // Use f32 division: first compute an approximate result, use it to reduce
 372 // the dividend, which should then be representable as f32, divide the reduced
 373 // dividend, and add the quotients.
 374 void
 375 NV50LegalizeSSA::handleDIV(Instruction *div)
 376 {
        // Only U32 and S32 divisions are lowered; any other source type leaves
        // `div` untouched. All helper instructions are inserted before `div`,
        // and `div` itself is rewritten at the end into the final SUB
        // (unsigned) or UNION (signed) that produces the quotient.
 377    const DataType ty = div->sType;
 378
 379    if (ty != TYPE_U32 && ty != TYPE_S32)
 380       return;
 381
 382    Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
 383
 384    bld.setPosition(div, false);
 385
        // a, b:   integer dividend / divisor used for the error computation
        // af, bf: their f32 conversions
 386    Value *a, *af = bld.getSSA();
 387    Value *b, *bf = bld.getSSA();
 388
 389    bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
 390    bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
 391
 392    if (isSignedType(ty)) {
           // signed: work on absolute values; the sign is restored at the end
 393       af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
 394       bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
 395       a = bld.getSSA();
 396       b = bld.getSSA();
 397       bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
 398       bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
 399    } else {
 400       a = div->getSrc(0);
 401       b = div->getSrc(1);
 402    }
 403
        // bf = 1 / b as f32, then an integer ADD of -2 on its bit pattern
        // NOTE(review): presumably biases the reciprocal downwards so the
        // quotient estimate does not overshoot - confirm.
 404    bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
 405    bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
 406
        // first estimate: q0 = (int)(af * bf), both steps with ROUND_Z
 407    bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
 408    bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
 409
 410    // get error of 1st result
        // aRf = a - q0 * b (integer, despite the name); the multiply is
        // expanded immediately
 411    expandIntegerMUL(&bld,
 412       bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
 413    bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
 414
        // aR = (f32)aRf, then a second estimate qR = (int)(aR * bf)
 415    bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
 416
 417    bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
 418    bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
 419       ->rnd = ROUND_Z;
 420    bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
 421
 422    // correction: if modulus >= divisor, add 1
        // m = a - q * b; s = SET(m >= b). The code subtracts s from q;
        // NOTE(review): this matches "add 1" only if SET writes all-ones (-1)
        // for true - confirm.
 423    expandIntegerMUL(&bld,
 424       bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
 425    bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
 426    bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
 427    if (!isSignedType(ty)) {
           // unsigned: div becomes SUB(q, s)
 428       div->op = OP_SUB;
 429       div->setSrc(0, q);
 430       div->setSrc(1, s);
 431    } else {
           // signed: apply the correction in a separate SUB, then select
           // between the negated and the plain quotient
 432       t = q;
 433       bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
 434       s = bld.getSSA();
 435       t = bld.getSSA();
 436       // fix the sign
           // XOR of the original operands only defines the flags value `cond`;
           // NEG is predicated on CC_S and MOV on CC_NS of it
 437       bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
 438          ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
 439       bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
 440       bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
 441
           // div becomes the UNION of the two predicated results
 442       div->op = OP_UNION;
 443       div->setSrc(0, s);
 444       div->setSrc(1, t);
 445    }
 446 }
 447
 448 void
 449 NV50LegalizeSSA::handleMOD(Instruction *mod)
 450 {
 451    if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
 452       return;
 453    bld.setPosition(mod, false);
 454
 455    Value *q = bld.getSSA();
 456    Value *m = bld.getSSA();
 457
 458    bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
 459    handleDIV(q->getInsn());
 460
 461    bld.setPosition(mod, false);
 462    expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
 463
 464    mod->op = OP_SUB;
 465    mod->setSrc(1, m);
 466 }
 467
 468 bool
 469 NV50LegalizeSSA::visit(BasicBlock *bb)
 470 {
 471    Instruction *insn, *next;
 472    // skipping PHIs (don't pass them to handleAddrDef) !
 473    for (insn = bb->getEntry(); insn; insn = next) {
 474       next = insn->next;
 475
 476       switch (insn->op) {
 477       case OP_EXPORT:
 478          if (outWrites)
 479             propagateWriteToOutput(insn);
 480          break;
 481       case OP_DIV:
 482          handleDIV(insn);
 483          break;
 484       case OP_MOD:
 485          handleMOD(insn);
 486          break;
 487       case OP_MAD:
 488       case OP_MUL:
 489          handleMUL(insn);
 490          break;
 491       default:
 492          break;
 493       }
 494
 495       if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
 496          handleAddrDef(insn);
 497    }
 498    return true;
 499 }
 500
 501 class NV50LoweringPreSSA : public Pass
 502 {
     // Lowering pass whose handlers rewrite individual operations (system
     // value reads/writes, exports, DIV/SQRT/POW, SET/SLCT/SELP, texture ops,
     // CALL/PRECONT/CONT) into forms built from other operations.
 503 public:
 504    NV50LoweringPreSSA(Program *);
 505
 506 private:
 507    virtual bool visit(Instruction *);
 508    virtual bool visit(Function *);
 509
        // system value reads / writes
 510    bool handleRDSV(Instruction *);
 511    bool handleWRSV(Instruction *);
 512
 513    bool handleEXPORT(Instruction *);
 514
        // arithmetic
 515    bool handleDIV(Instruction *);
 516    bool handleSQRT(Instruction *);
 517    bool handlePOW(Instruction *);
 518
        // comparisons and selects
 519    bool handleSET(Instruction *);
 520    bool handleSLCT(CmpInstruction *);
 521    bool handleSELP(Instruction *);
 522
        // texturing; TXB/TXL/TXD call handleTEX() first
 523    bool handleTEX(TexInstruction *);
 524    bool handleTXB(TexInstruction *); // I really
 525    bool handleTXL(TexInstruction *); // hate
 526    bool handleTXD(TexInstruction *); // these 3
 527
        // control flow
 528    bool handleCALL(Instruction *);
 529    bool handlePRECONT(Instruction *);
 530    bool handleCONT(Instruction *);
 531
 532    void checkPredicate(Instruction *);
 533
 534 private:
        // target of the program, taken in the constructor
 535    const Target *const targ;
 536
 537    BuildUtil bld;
 538
        // copy of the implicit thread-id argument; only set for compute
        // programs (see visit(Function *)), NULL otherwise
 539    Value *tid;
 540 };
 541
 542 NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
 543    targ(prog->getTarget()), tid(NULL)
 544 {
 545    bld.setProgram(prog);
 546 }
 547
 548 bool
 549 NV50LoweringPreSSA::visit(Function *f)
 550 {
 551    BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
 552
 553    if (prog->getType() == Program::TYPE_COMPUTE) {
 554       // Add implicit "thread id" argument in $r0 to the function
 555       Value *arg = new_LValue(func, FILE_GPR);
 556       arg->reg.data.id = 0;
 557       f->ins.push_back(arg);
 558
 559       bld.setPosition(root, false);
 560       tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
 561    }
 562
 563    return true;
 564 }
 565
 566 bool
 567 NV50LoweringPreSSA::handleTEX(TexInstruction *i)
 568 {
 569    const int arg = i->tex.target.getArgCount();
 570    const int dref = arg;
 571    const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
 572
 573    // dref comes before bias/lod
 574    if (i->tex.target.isShadow())
 575       if (i->op == OP_TXB || i->op == OP_TXL)
 576          i->swapSources(dref, lod);
 577
 578    if (i->tex.target.isArray()) {
 579       if (i->op != OP_TXF) {
 580          // array index must be converted to u32, but it's already an integer
 581          // for TXF
 582          Value *layer = i->getSrc(arg - 1);
 583          LValue *src = new_LValue(func, FILE_GPR);
 584          bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
 585          bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
 586          i->setSrc(arg - 1, src);
 587       }
 588       if (i->tex.target.isCube()) {
 589          std::vector<Value *> acube, a2d;
 590          int c;
 591
 592          acube.resize(4);
 593          for (c = 0; c < 4; ++c)
 594             acube[c] = i->getSrc(c);
 595          a2d.resize(4);
 596          for (c = 0; c < 3; ++c)
 597             a2d[c] = new_LValue(func, FILE_GPR);
 598          a2d[3] = NULL;
 599
 600          bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
 601                    a2d, acube)->asTex()->tex.mask = 0x7;
 602
 603          for (c = 0; c < 3; ++c)
 604             i->setSrc(c, a2d[c]);
 605          i->setSrc(c, NULL);
 606          for (; i->srcExists(c + 1); ++c)
 607             i->setSrc(c, i->getSrc(c + 1));
 608
 609          i->tex.target = i->tex.target.isShadow() ?
 610             TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
 611       }
 612    }
 613
 614    // texel offsets are 3 immediate fields in the instruction,
 615    // nv50 cannot do textureGatherOffsets
 616    assert(i->tex.useOffsets <= 1);
 617
 618    return true;
 619 }
 620
 621 // Bias must be equal for all threads of a quad or lod calculation will fail.
 622 //
 623 // The lanes of a quad are grouped by the bit in the condition register they
 624 // have set, which is selected by differing bias values.
 625 // Move the input values for TEX into a new register set for each group and
 626 // execute TEX only for a specific group.
 627 // We always need to use 4 new registers for the inputs/outputs because the
 628 // implicitly calculated derivatives must be correct.
 629 //
 630 // TODO: move to SSA phase so we can easily determine whether bias is constant
 631 bool
 632 NV50LoweringPreSSA::handleTXB(TexInstruction *i)
 633 {
        // After the common handleTEX() fix-ups, a uniform bias needs no further
        // work. Otherwise four predicated clones of the instruction are
        // emitted (one per condition code in cc[]), their results are merged
        // into the original defs through OP_UNION, and `i` is deleted.
        // Always returns true.
 634    const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
 635    int l, d;
 636
 637    handleTEX(i);
        // the bias is the source right after the target's regular arguments
 638    Value *bias = i->getSrc(i->tex.target.getArgCount());
 639    if (bias->isUniform())
 640       return true;
 641
        // cond: UNION whose source 0 is the immediate 1; sources 1..3 are
        // filled in by the loop below, with the builder positioned at cond
        // using `false` (NOTE(review): presumably "insert before" - confirm)
 642    Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
 643                                  bld.loadImm(NULL, 1));
 644    bld.setPosition(cond, false);
 645
 646    for (l = 1; l < 4; ++l) {
 647       const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
 648       Value *bit = bld.getSSA();
 649       Value *pred = bld.getScratch(1, FILE_FLAGS);
 650       Value *imm = bld.loadImm(NULL, (1 << l));
           // quadop on (bias, bias) for lane l that only defines the flags
           // value `pred` (flagsDef = 0); bit = 1 << l where pred tests CC_EQ
 651       bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
 652       bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
 653       cond->setSrc(l, bit);
 654    }
        // flags = (u8)cond in a FILE_FLAGS scratch, emitted with the builder
        // positioned at cond using `true`
 655    Value *flags = bld.getScratch(1, FILE_FLAGS);
 656    bld.setPosition(cond, true);
 657    bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
 658
        // one clone per group, each predicated on a different condition code
 659    Instruction *tex[4];
 660    for (l = 0; l < 4; ++l) {
 661       (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
 662       bld.insert(tex[l]);
 663    }
 664
        // res[l][d]: result d of group l. Group 0 uses the clone's defs
        // directly; groups 1..3 get shallow clones of them filled by predicated
        // moves.
 665    Value *res[4][4];
 666    for (d = 0; i->defExists(d); ++d)
 667       res[0][d] = tex[0]->getDef(d);
 668    for (l = 1; l < 4; ++l) {
 669       for (d = 0; tex[l]->defExists(d); ++d) {
 670          res[l][d] = cloneShallow(func, res[0][d]);
 671          bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
 672       }
 673    }
 674
        // merge the four groups into each original def
 675    for (d = 0; i->defExists(d); ++d) {
 676       Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
 677       for (l = 0; l < 4; ++l)
 678          dst->setSrc(l, res[l][d]);
 679    }
 680    delete_Instruction(prog, i);
 681    return true;
 682 }
 683
 684 // LOD must be equal for all threads of a quad.
 685 // Unlike with TXB, here we can just diverge since there's no LOD calculation
 686 // that would require all 4 threads' sources to be set up properly.
 687 bool
 688 NV50LoweringPreSSA::handleTXL(TexInstruction *i)
 689 {
        // After the common handleTEX() fix-ups, a uniform LOD needs no further
        // work. Otherwise the block is split around the instruction and a
        // chain of per-lane tests, each followed by a conditional branch to
        // the texture block, is built in front of it. Always returns true.
 690    handleTEX(i);
        // the LOD is the source right after the target's regular arguments
 691    Value *lod = i->getSrc(i->tex.target.getArgCount());
 692    if (lod->isUniform())
 693       return true;
 694
        // currBB: code before the TXL, texiBB: block starting at the TXL,
        // joinBB: code following it
 695    BasicBlock *currBB = i->bb;
 696    BasicBlock *texiBB = i->bb->splitBefore(i, false);
 697    BasicBlock *joinBB = i->bb->splitAfter(i);
 698
 699    bld.setPosition(currBB, true);
 700    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
 701
 702    for (int l = 0; l <= 3; ++l) {
 703       const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
 704       Value *pred = bld.getScratch(1, FILE_FLAGS);
 705       bld.setPosition(currBB, true);
           // quadop on (lod, lod) for lane l that only defines the flags value
           // `pred`, followed by a fixed branch to the texture block on CC_EQ
 706       bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
 707       bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
 708       currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
 709       if (l <= 2) {
              // lanes 0..2: continue the chain in a new block attached to the
              // current one
 710          BasicBlock *laneBB = new BasicBlock(func);
 711          currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
 712          currBB = laneBB;
 713       }
 714    }
        // a JOIN goes to the head of the block following the TXL
 715    bld.setPosition(joinBB, false);
 716    bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
 717    return true;
 718 }
 719
 720 bool
 721 NV50LoweringPreSSA::handleTXD(TexInstruction *i)
 722 {
        // Rewrites a texture fetch with explicit derivatives as four plain TEX
        // operations, one per quad lane l. For each lane the coordinates are
        // rebuilt with quad operations from lane l's coordinates and its
        // dPdx/dPdy, a clone of the instruction is executed, and only lane l's
        // results are kept; the results are merged into the original defs with
        // OP_UNION. Always returns true.

        // qOps[l][0] is used with dPdx, qOps[l][1] with dPdy (see loop below)
 723    static const uint8_t qOps[4][2] =
 724    {
 725       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
 726       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
 727       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
 728       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
 729    };
        // def[c][l]: result component c captured from lane l
 730    Value *def[4][4];
 731    Value *crd[3];
 732    Instruction *tex;
 733    Value *zero = bld.loadImm(bld.getSSA(), 0);
 734    int l, c;
 735    const int dim = i->tex.target.getDim();
 736
 737    handleTEX(i);
 738    i->op = OP_TEX; // no need to clone dPdx/dPdy later
 739
        // scratch coordinates shared by all four iterations
 740    for (c = 0; c < dim; ++c)
 741       crd[c] = bld.getScratch();
 742
 743    bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
 744    for (l = 0; l < 4; ++l) {
 745       // mov coordinates from lane l to all lanes
 746       for (c = 0; c < dim; ++c)
 747          bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
 748       // add dPdx from lane l to lanes dx
 749       for (c = 0; c < dim; ++c)
 750          bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
 751       // add dPdy from lane l to lanes dy
 752       for (c = 0; c < dim; ++c)
 753          bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
 754       // texture
 755       bld.insert(tex = cloneForward(func, i));
 756       for (c = 0; c < dim; ++c)
 757          tex->setSrc(c, crd[c]);
 758       // save results
 759       for (c = 0; i->defExists(c); ++c) {
 760          Instruction *mov;
 761          def[c][l] = bld.getSSA();
 762          mov = bld.mkMov(def[c][l], tex->getDef(c));
              // fixed move restricted to lane l through the lanes mask
 763          mov->fixed = 1;
 764          mov->lanes = 1 << l;
 765       }
 766    }
 767    bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
 768
        // each original def becomes the UNION of the four per-lane results
 769    for (c = 0; i->defExists(c); ++c) {
 770       Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
 771       for (l = 0; l < 4; ++l)
 772          u->setSrc(l, def[c][l]);
 773    }
 774
        // the original instruction is only unlinked from its block here
 775    i->bb->remove(i);
 776    return true;
 777 }
 778
 779 bool
 780 NV50LoweringPreSSA::handleSET(Instruction *i)
 781 {
 782    if (i->dType == TYPE_F32) {
 783       bld.setPosition(i, true);
 784       i->dType = TYPE_U32;
 785       bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
 786       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
 787    }
 788    return true;
 789 }
 790
 791 bool
 792 NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
 793 {
 794    Value *src0 = bld.getSSA();
 795    Value *src1 = bld.getSSA();
 796    Value *pred = bld.getScratch(1, FILE_FLAGS);
 797
 798    Value *v0 = i->getSrc(0);
 799    Value *v1 = i->getSrc(1);
 800    // XXX: these probably shouldn't be immediates in the first place ...
 801    if (v0->asImm())
 802       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
 803    if (v1->asImm())
 804       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
 805
 806    bld.setPosition(i, true);
 807    bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
 808    bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
 809    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
 810
 811    bld.setPosition(i, false);
 812    i->op = OP_SET;
 813    i->setFlagsDef(0, pred);
 814    i->dType = TYPE_U8;
 815    i->setSrc(0, i->getSrc(2));
 816    i->setSrc(2, NULL);
 817    i->setSrc(1, bld.loadImm(NULL, 0));
 818
 819    return true;
 820 }
 821
 822 bool
 823 NV50LoweringPreSSA::handleSELP(Instruction *i)
 824 {
 825    Value *src0 = bld.getSSA();
 826    Value *src1 = bld.getSSA();
 827
 828    Value *v0 = i->getSrc(0);
 829    Value *v1 = i->getSrc(1);
 830    if (v0->asImm())
 831       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
 832    if (v1->asImm())
 833       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
 834
 835    bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
 836    bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
 837    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
 838    delete_Instruction(prog, i);
 839    return true;
 840 }
 841
 842 bool
 843 NV50LoweringPreSSA::handleWRSV(Instruction *i)
 844 {
 845    Symbol *sym = i->getSrc(0)->asSym();
 846
 847    // these are all shader outputs, $sreg are not writeable
 848    uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
 849    if (addr >= 0x400)
 850       return false;
 851    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
 852
 853    bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
 854
 855    bld.getBB()->remove(i);
 856    return true;
 857 }
 858
 859 bool
 860 NV50LoweringPreSSA::handleCALL(Instruction *i)
 861 {
 862    if (prog->getType() == Program::TYPE_COMPUTE) {
 863       // Add implicit "thread id" argument in $r0 to the function
 864       i->setSrc(i->srcCount(), tid);
 865    }
 866    return true;
 867 }
 868
 869 bool
 870 NV50LoweringPreSSA::handlePRECONT(Instruction *i)
 871 {
        // PRECONT is simply dropped: the instruction is deleted and true is
        // returned.
 872    delete_Instruction(prog, i);
 873    return true;
 874 }
 875
 876 bool
 877 NV50LoweringPreSSA::handleCONT(Instruction *i)
 878 {
        // CONT is turned into a plain branch by changing the opcode in place;
        // nothing else about the instruction is touched.
 879    i->op = OP_BRA;
 880    return true;
 881 }
 882
 883 bool
 884 NV50LoweringPreSSA::handleRDSV(Instruction *i)
 885 {
 886    Symbol *sym = i->getSrc(0)->asSym();
 887    uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
 888    Value *def = i->getDef(0);
 889    SVSemantic sv = sym->reg.data.sv.sv;
 890    int idx = sym->reg.data.sv.index;
 891
 892    if (addr >= 0x400) // mov $sreg
 893       return true;
 894
 895    switch (sv) {
 896    case SV_POSITION:
 897       assert(prog->getType() == Program::TYPE_FRAGMENT);
 898       bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
 899       break;
 900    case SV_FACE:
 901       bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
 902       if (i->dType == TYPE_F32) {
 903          bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
 904          bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
 905       }
 906       break;
 907    case SV_NCTAID:
 908    case SV_CTAID:
 909    case SV_NTID:
 910       if ((sv == SV_NCTAID && idx >= 2) ||
 911           (sv == SV_NTID && idx >= 3)) {
 912          bld.mkMov(def, bld.mkImm(1));
 913       } else if (sv == SV_CTAID && idx >= 2) {
 914          bld.mkMov(def, bld.mkImm(0));
 915       } else {
 916          Value *x = bld.getSSA(2);
 917          bld.mkOp1(OP_LOAD, TYPE_U16, x,
 918                    bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
 919          bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
 920       }
 921       break;
 922    case SV_TID:
 923       if (idx == 0) {
 924          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
 925       } else if (idx == 1) {
 926          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
 927          bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
 928       } else if (idx == 2) {
 929          bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
 930       } else {
 931          bld.mkMov(def, bld.mkImm(0));
 932       }
 933       break;
 934    default:
 935       bld.mkFetch(i->getDef(0), i->dType,
 936                   FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
 937       break;
 938    }
 939    bld.getBB()->remove(i);
 940    return true;
 941 }
 942
 943 bool
 944 NV50LoweringPreSSA::handleDIV(Instruction *i)
 945 {
 946    if (!isFloatType(i->dType))
 947       return true;
 948    bld.setPosition(i, false);
 949    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
 950    i->op = OP_MUL;
 951    i->setSrc(1, rcp->getDef(0));
 952    return true;
 953 }
 954
 955 bool
 956 NV50LoweringPreSSA::handleSQRT(Instruction *i)
 957 {
 958    Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
 959                                 bld.getSSA(), i->getSrc(0));
 960    i->op = OP_MUL;
 961    i->setSrc(1, rsq->getDef(0));
 962
 963    return true;
 964 }
 965
 966 bool
 967 NV50LoweringPreSSA::handlePOW(Instruction *i)
 968 {
 969    LValue *val = bld.getScratch();
 970
 971    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
 972    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
 973    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
 974
 975    i->op = OP_EX2;
 976    i->setSrc(0, val);
 977    i->setSrc(1, NULL);
 978
 979    return true;
 980 }
 981
 982 bool
 983 NV50LoweringPreSSA::handleEXPORT(Instruction *i)
 984 {
 985    if (prog->getType() == Program::TYPE_FRAGMENT) {
 986       if (i->getIndirect(0, 0)) {
 987          // TODO: redirect to l[] here, load to GPRs at exit
 988          return false;
 989       } else {
 990          int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
 991
 992          i->op = OP_MOV;
 993          i->subOp = NV50_IR_SUBOP_MOV_FINAL;
 994          i->src(0).set(i->src(1));
 995          i->setSrc(1, NULL);
 996          i->setDef(0, new_LValue(func, FILE_GPR));
 997          i->getDef(0)->reg.data.id = id;
 998
 999          prog->maxGPR = MAX2(prog->maxGPR, id);
1000       }
1001    }
1002    return true;
1003 }
1004
1005 // Set flags according to predicate and make the instruction read $cX.
1006 void
1007 NV50LoweringPreSSA::checkPredicate(Instruction *insn)
1008 {
1009    Value *pred = insn->getPredicate();
1010    Value *cdst;
1011
1012    if (!pred || pred->reg.file == FILE_FLAGS)
1013       return;
1014    cdst = bld.getSSA(1, FILE_FLAGS);
1015
1016    bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
1017
1018    insn->setPredicate(insn->cc, cdst);
1019 }
1020
1021 //
1022 // - add quadop dance for texturing
1023 // - put FP outputs in GPRs
1024 // - convert instruction sequences
1025 //
1026 bool
1027 NV50LoweringPreSSA::visit(Instruction *i)
1028 {
1029    bld.setPosition(i, false);
1030
1031    if (i->cc != CC_ALWAYS)
1032       checkPredicate(i);
1033
1034    switch (i->op) {
1035    case OP_TEX:
1036    case OP_TXF:
1037    case OP_TXG:
1038       return handleTEX(i->asTex());
1039    case OP_TXB:
1040       return handleTXB(i->asTex());
1041    case OP_TXL:
1042       return handleTXL(i->asTex());
1043    case OP_TXD:
1044       return handleTXD(i->asTex());
1045    case OP_EX2:
1046       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
1047       i->setSrc(0, i->getDef(0));
1048       break;
1049    case OP_SET:
1050       return handleSET(i);
1051    case OP_SLCT:
1052       return handleSLCT(i->asCmp());
1053    case OP_SELP:
1054       return handleSELP(i);
1055    case OP_POW:
1056       return handlePOW(i);
1057    case OP_DIV:
1058       return handleDIV(i);
1059    case OP_SQRT:
1060       return handleSQRT(i);
1061    case OP_EXPORT:
1062       return handleEXPORT(i);
1063    case OP_RDSV:
1064       return handleRDSV(i);
1065    case OP_WRSV:
1066       return handleWRSV(i);
1067    case OP_CALL:
1068       return handleCALL(i);
1069    case OP_PRECONT:
1070       return handlePRECONT(i);
1071    case OP_CONT:
1072       return handleCONT(i);
1073    default:
1074       break;
1075    }
1076    return true;
1077 }
1078
1079 bool
1080 TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
1081 {
1082    bool ret = false;
1083
1084    if (stage == CG_STAGE_PRE_SSA) {
1085       NV50LoweringPreSSA pass(prog);
1086       ret = pass.run(prog, false, true);
1087    } else
1088    if (stage == CG_STAGE_SSA) {
1089       if (!prog->targetPriv)
1090          prog->targetPriv = new std::list<Instruction *>();
1091       NV50LegalizeSSA pass(prog);
1092       ret = pass.run(prog, false, true);
1093    } else
1094    if (stage == CG_STAGE_POST_RA) {
1095       NV50LegalizePostRA pass;
1096       ret = pass.run(prog, false, true);
1097       if (prog->targetPriv)
1098          delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
1099    }
1100    return ret;
1101 }
1102
1103 } // namespace nv50_ir