src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp

   1 /*
   2  * Copyright 2011 Christoph Bumiller
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  18  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
  19  * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  20  * SOFTWARE.
  21  */
  22
  23 #include "nv50/codegen/nv50_ir.h"
  24 #include "nv50/codegen/nv50_ir_build_util.h"
  25
  26 #include "nv50_ir_target_nv50.h"
  27
  28 namespace nv50_ir {
  29
  30 // nv50 doesn't support 32 bit integer multiplication
  31 //
  32 //       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
  33 // -------------------
  34 //    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
  35 // ah*bh 00 00                 (           carry1) << 16 + ( carry2)
  36 //       al*bl
  37 //    ah*bl 00
  38 //
  39 // fffe0001 + fffe0001
  40 static bool
  41 expandIntegerMUL(BuildUtil *bld, Instruction *mul)
  42 {
  43    const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
  44
  45    DataType fTy = mul->sType; // full type
  46    DataType hTy;
  47    switch (fTy) {
  48    case TYPE_S32: hTy = TYPE_S16; break;
  49    case TYPE_U32: hTy = TYPE_U16; break;
  50    case TYPE_U64: hTy = TYPE_U32; break;
  51    case TYPE_S64: hTy = TYPE_S32; break;
  52    default:
  53       return false;
  54    }
  55    unsigned int fullSize = typeSizeof(fTy);
  56    unsigned int halfSize = typeSizeof(hTy);
  57
  58    Instruction *i[9];
  59
  60    Value *a[2] = { bld->getSSA(halfSize), bld->getSSA(halfSize) };
  61    Value *b[2] = { bld->getSSA(halfSize), bld->getSSA(halfSize) };
  62    Value *c[2];
  63    Value *t[4];
  64    for (int j = 0; j < 4; ++j)
  65       t[j] = bld->getSSA(fullSize);
  66
  67    (i[0] = bld->mkOp1(OP_SPLIT, fTy, a[0], mul->getSrc(0)))->setDef(1, a[1]);
  68    (i[1] = bld->mkOp1(OP_SPLIT, fTy, b[0], mul->getSrc(1)))->setDef(1, b[1]);
  69
  70    i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
  71    i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
  72    i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
  73    i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
  74
  75    if (highResult) {
  76       Value *r[3];
  77       Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
  78       c[0] = bld->getSSA(1, FILE_FLAGS);
  79       c[1] = bld->getSSA(1, FILE_FLAGS);
  80       for (int j = 0; j < 3; ++j)
  81          r[j] = bld->getSSA(fullSize);
  82
  83       i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
  84       i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
  85       bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
  86       i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
  87
  88       // set carry defs / sources
  89       i[3]->setFlagsDef(1, c[0]);
  90       i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
  91       i[6]->setPredicate(CC_C, c[0]);
  92       i[5]->setFlagsSrc(3, c[1]);
  93    } else {
  94       bld->mkMov(mul->getDef(0), t[3]);
  95    }
  96    delete_Instruction(bld->getProgram(), mul);
  97
  98    for (int j = 2; j <= (highResult ? 5 : 4); ++j)
  99       i[j]->sType = hTy;
 100
 101    return true;
 102 }
 103
 104 #define QOP_ADD  0
 105 #define QOP_SUBR 1
 106 #define QOP_SUB  2
 107 #define QOP_MOV2 3
 108
 109 #define QUADOP(q, r, s, t)            \
 110    ((QOP_##q << 0) | (QOP_##r << 2) | \
 111     (QOP_##s << 4) | (QOP_##t << 6))
 112
 113 class NV50LegalizePostRA : public Pass
 114 {
 115 private:
 116    virtual bool visit(Function *);
 117    virtual bool visit(BasicBlock *);
 118
 119    void handlePRERET(FlowInstruction *);
 120    void replaceZero(Instruction *);
 121    void split64BitOp(Instruction *);
 122
 123    LValue *r63;
 124 };
 125
 126 bool
 127 NV50LegalizePostRA::visit(Function *fn)
 128 {
 129    Program *prog = fn->getProgram();
 130
 131    r63 = new_LValue(fn, FILE_GPR);
 132    r63->reg.data.id = 63;
 133
 134    // this is actually per-program, but we can do it all on visiting main()
 135    std::list<Instruction *> *outWrites =
 136       reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
 137
 138    if (outWrites) {
 139       for (std::list<Instruction *>::iterator it = outWrites->begin();
 140            it != outWrites->end(); ++it)
 141          (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
 142       // instructions will be deleted on exit
 143       outWrites->clear();
 144    }
 145
 146    return true;
 147 }
 148
 149 void
 150 NV50LegalizePostRA::replaceZero(Instruction *i)
 151 {
 152    for (int s = 0; i->srcExists(s); ++s) {
 153       ImmediateValue *imm = i->getSrc(s)->asImm();
 154       if (imm && imm->reg.data.u64 == 0)
 155          i->setSrc(s, r63);
 156    }
 157 }
 158
 159 void
 160 NV50LegalizePostRA::split64BitOp(Instruction *i)
 161 {
 162    if (i->dType == TYPE_F64) {
 163       if (i->op == OP_MAD)
 164          i->op = OP_FMA;
 165       if (i->op == OP_ADD || i->op == OP_MUL || i->op == OP_FMA ||
 166           i->op == OP_CVT || i->op == OP_MIN || i->op == OP_MAX ||
 167           i->op == OP_SET)
 168          return;
 169       i->dType = i->sType = TYPE_U32;
 170
 171       i->bb->insertAfter(i, cloneForward(func, i));
 172    }
 173 }
 174
 175 // Emulate PRERET: jump to the target and call to the origin from there
 176 //
 177 // WARNING: atm only works if BBs are affected by at most a single PRERET
 178 //
 179 // BB:0
 180 // preret BB:3
 181 // (...)
 182 // BB:3
 183 // (...)
 184 //             --->
 185 // BB:0
 186 // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
 187 // (...)
 188 // BB:3
 189 // bra BB:3 + n1 (skip the call)
 190 // call BB:0 + n2 (skip bra at beginning of BB:0)
 191 // (...)
 192 void
 193 NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
 194 {
 195    BasicBlock *bbE = pre->bb;
 196    BasicBlock *bbT = pre->target.bb;
 197
 198    pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
 199    bbE->remove(pre);
 200    bbE->insertHead(pre);
 201
 202    Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
 203    Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
 204
 205    bbT->insertHead(call);
 206    bbT->insertHead(skip);
 207
 208    // NOTE: maybe split blocks to prevent the instructions from moving ?
 209
 210    skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
 211    call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
 212 }
 213
 214 bool
 215 NV50LegalizePostRA::visit(BasicBlock *bb)
 216 {
 217    Instruction *i, *next;
 218
 219    // remove pseudo operations and non-fixed no-ops, split 64 bit operations
 220    for (i = bb->getFirst(); i; i = next) {
 221       next = i->next;
 222       if (i->isNop()) {
 223          bb->remove(i);
 224       } else
 225       if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
 226          handlePRERET(i->asFlow());
 227       } else {
 228          if (i->op != OP_MOV && i->op != OP_PFETCH &&
 229              (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
 230             replaceZero(i);
 231          if (typeSizeof(i->dType) == 8)
 232             split64BitOp(i);
 233       }
 234    }
 235    if (!bb->getEntry())
 236       return true;
 237
 238    return true;
 239 }
 240
 241 class NV50LegalizeSSA : public Pass
 242 {
 243 public:
 244    NV50LegalizeSSA(Program *);
 245
 246    virtual bool visit(BasicBlock *bb);
 247
 248 private:
 249    void propagateWriteToOutput(Instruction *);
 250    void handleDIV(Instruction *);
 251    void handleMOD(Instruction *);
 252    void handleMUL(Instruction *);
 253    void handleAddrDef(Instruction *);
 254
 255    inline bool isARL(const Instruction *) const;
 256
 257    BuildUtil bld;
 258
 259    std::list<Instruction *> *outWrites;
 260 };
 261
 262 NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
 263 {
 264    bld.setProgram(prog);
 265
 266    if (prog->optLevel >= 2 &&
 267        (prog->getType() == Program::TYPE_GEOMETRY ||
 268         prog->getType() == Program::TYPE_VERTEX))
 269       outWrites =
 270          reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
 271    else
 272       outWrites = NULL;
 273 }
 274
 275 void
 276 NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
 277 {
 278    if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
 279       return;
 280
 281    // check def instruction can store
 282    Instruction *di = st->getSrc(1)->defs.front()->getInsn();
 283
 284    // TODO: move exports (if beneficial) in common opt pass
 285    if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
 286       return;
 287    for (int s = 0; di->srcExists(s); ++s)
 288       if (di->src(s).getFile() == FILE_IMMEDIATE)
 289          return;
 290
 291    // We cannot set defs to non-lvalues before register allocation, so
 292    // save & remove (to save registers) the exports and replace later.
 293    outWrites->push_back(st);
 294    st->bb->remove(st);
 295 }
 296
 297 bool
 298 NV50LegalizeSSA::isARL(const Instruction *i) const
 299 {
 300    ImmediateValue imm;
 301
 302    if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
 303       return false;
 304    if (!i->src(1).getImmediate(imm))
 305       return false;
 306    return imm.isInteger(0);
 307 }
 308
 309 void
 310 NV50LegalizeSSA::handleAddrDef(Instruction *i)
 311 {
 312    Instruction *arl;
 313
 314    i->getDef(0)->reg.size = 2; // $aX are only 16 bit
 315
 316    // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
 317    if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
 318       if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
 319          return;
 320       if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
 321          return;
 322    }
 323
 324    // turn $a sources into $r sources (can't operate on $a)
 325    for (int s = 0; i->srcExists(s); ++s) {
 326       Value *a = i->getSrc(s);
 327       Value *r;
 328       if (a->reg.file == FILE_ADDRESS) {
 329          if (a->getInsn() && isARL(a->getInsn())) {
 330             i->setSrc(s, a->getInsn()->getSrc(0));
 331          } else {
 332             bld.setPosition(i, false);
 333             r = bld.getSSA();
 334             bld.mkMov(r, a);
 335             i->setSrc(s, r);
 336          }
 337       }
 338    }
 339    if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
 340       return;
 341
 342    // turn result back into $a
 343    bld.setPosition(i, true);
 344    arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
 345    i->setDef(0, arl->getSrc(0));
 346 }
 347
 348 void
 349 NV50LegalizeSSA::handleMUL(Instruction *mul)
 350 {
 351    if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
 352       return;
 353    Value *def = mul->getDef(0);
 354    Value *pred = mul->getPredicate();
 355    CondCode cc = mul->cc;
 356    if (pred)
 357       mul->setPredicate(CC_ALWAYS, NULL);
 358
 359    if (mul->op == OP_MAD) {
 360       Instruction *add = mul;
 361       bld.setPosition(add, false);
 362       Value *res = cloneShallow(func, mul->getDef(0));
 363       mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
 364       add->op = OP_ADD;
 365       add->setSrc(0, mul->getDef(0));
 366       add->setSrc(1, add->getSrc(2));
 367       for (int s = 2; add->srcExists(s); ++s)
 368          add->setSrc(s, NULL);
 369       mul->subOp = add->subOp;
 370       add->subOp = 0;
 371    }
 372    expandIntegerMUL(&bld, mul);
 373    if (pred)
 374       def->getInsn()->setPredicate(cc, pred);
 375 }
 376
 377 // Use f32 division: first compute an approximate result, use it to reduce
 378 // the dividend, which should then be representable as f32, divide the reduced
 379 // dividend, and add the quotients.
 380 void
 381 NV50LegalizeSSA::handleDIV(Instruction *div)
 382 {
 383    const DataType ty = div->sType;
 384
 385    if (ty != TYPE_U32 && ty != TYPE_S32)
 386       return;
 387
 388    Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
 389
 390    bld.setPosition(div, false);
 391
 392    Value *a, *af = bld.getSSA();
 393    Value *b, *bf = bld.getSSA();
 394
 395    bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
 396    bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
 397
 398    if (isSignedType(ty)) {
 399       af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
 400       bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
 401       a = bld.getSSA();
 402       b = bld.getSSA();
 403       bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
 404       bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
 405    } else {
 406       a = div->getSrc(0);
 407       b = div->getSrc(1);
 408    }
 409
 410    bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
 411    bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
 412
 413    bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
 414    bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
 415
 416    // get error of 1st result
 417    expandIntegerMUL(&bld,
 418       bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
 419    bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
 420
 421    bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
 422
 423    bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
 424    bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
 425       ->rnd = ROUND_Z;
 426    bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
 427
 428    // correction: if modulus >= divisor, add 1
 429    expandIntegerMUL(&bld,
 430       bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
 431    bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
 432    bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b);
 433    if (!isSignedType(ty)) {
 434       div->op = OP_SUB;
 435       div->setSrc(0, q);
 436       div->setSrc(1, s);
 437    } else {
 438       t = q;
 439       bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
 440       s = bld.getSSA();
 441       t = bld.getSSA();
 442       // fix the sign
 443       bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
 444          ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
 445       bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
 446       bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
 447
 448       div->op = OP_UNION;
 449       div->setSrc(0, s);
 450       div->setSrc(1, t);
 451    }
 452 }
 453
 454 void
 455 NV50LegalizeSSA::handleMOD(Instruction *mod)
 456 {
 457    if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
 458       return;
 459    bld.setPosition(mod, false);
 460
 461    Value *q = bld.getSSA();
 462    Value *m = bld.getSSA();
 463
 464    bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
 465    handleDIV(q->getInsn());
 466
 467    bld.setPosition(mod, false);
 468    expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
 469
 470    mod->op = OP_SUB;
 471    mod->setSrc(1, m);
 472 }
 473
 474 bool
 475 NV50LegalizeSSA::visit(BasicBlock *bb)
 476 {
 477    Instruction *insn, *next;
 478    // skipping PHIs (don't pass them to handleAddrDef) !
 479    for (insn = bb->getEntry(); insn; insn = next) {
 480       next = insn->next;
 481
 482       switch (insn->op) {
 483       case OP_EXPORT:
 484          if (outWrites)
 485             propagateWriteToOutput(insn);
 486          break;
 487       case OP_DIV:
 488          handleDIV(insn);
 489          break;
 490       case OP_MOD:
 491          handleMOD(insn);
 492          break;
 493       case OP_MAD:
 494       case OP_MUL:
 495          handleMUL(insn);
 496          break;
 497       default:
 498          break;
 499       }
 500
 501       if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
 502          handleAddrDef(insn);
 503    }
 504    return true;
 505 }
 506
 507 class NV50LoweringPreSSA : public Pass
 508 {
 509 public:
 510    NV50LoweringPreSSA(Program *);
 511
 512 private:
 513    virtual bool visit(Instruction *);
 514    virtual bool visit(Function *);
 515
 516    bool handleRDSV(Instruction *);
 517    bool handleWRSV(Instruction *);
 518
 519    bool handleEXPORT(Instruction *);
 520
 521    bool handleMUL(Instruction *);
 522    bool handleDIV(Instruction *);
 523    bool handleSQRT(Instruction *);
 524    bool handlePOW(Instruction *);
 525
 526    bool handleSET(Instruction *);
 527    bool handleSLCT(CmpInstruction *);
 528    bool handleSELP(Instruction *);
 529
 530    bool handleTEX(TexInstruction *);
 531    bool handleTXB(TexInstruction *); // I really
 532    bool handleTXL(TexInstruction *); // hate
 533    bool handleTXD(TexInstruction *); // these 3
 534
 535    bool handleCALL(Instruction *);
 536    bool handlePRECONT(Instruction *);
 537    bool handleCONT(Instruction *);
 538
 539    void checkPredicate(Instruction *);
 540
 541 private:
 542    const Target *const targ;
 543
 544    BuildUtil bld;
 545
 546    Value *tid;
 547 };
 548
 549 NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
 550    targ(prog->getTarget()), tid(NULL)
 551 {
 552    bld.setProgram(prog);
 553 }
 554
 555 bool
 556 NV50LoweringPreSSA::visit(Function *f)
 557 {
 558    BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
 559
 560    if (prog->getType() == Program::TYPE_COMPUTE) {
 561       // Add implicit "thread id" argument in $r0 to the function
 562       Value *arg = new_LValue(func, FILE_GPR);
 563       arg->reg.data.id = 0;
 564       f->ins.push_back(arg);
 565
 566       bld.setPosition(root, false);
 567       tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
 568    }
 569
 570    return true;
 571 }
 572
 573 // move array source to first slot, convert to u16, add indirections
 574 bool
 575 NV50LoweringPreSSA::handleTEX(TexInstruction *i)
 576 {
 577    const int arg = i->tex.target.getArgCount();
 578    const int dref = arg;
 579    const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
 580
 581    // dref comes before bias/lod
 582    if (i->tex.target.isShadow())
 583       if (i->op == OP_TXB || i->op == OP_TXL)
 584          i->swapSources(dref, lod);
 585
 586    // array index must be converted to u32
 587    if (i->tex.target.isArray()) {
 588       Value *layer = i->getSrc(arg - 1);
 589       LValue *src = new_LValue(func, FILE_GPR);
 590       bld.mkCvt(OP_CVT, TYPE_U16, src, TYPE_F32, layer);
 591       i->setSrc(arg - 1, src);
 592
 593       if (i->tex.target.isCube()) {
 594          // Value *face = layer;
 595          Value *x, *y;
 596          x = new_LValue(func, FILE_GPR);
 597          y = new_LValue(func, FILE_GPR);
 598          layer = new_LValue(func, FILE_GPR);
 599
 600          i->tex.target = TEX_TARGET_2D_ARRAY;
 601
 602          // TODO: use TEXPREP to convert x,y,z,face -> x,y,layer
 603          bld.mkMov(x, i->getSrc(0));
 604          bld.mkMov(y, i->getSrc(1));
 605          bld.mkMov(layer, i->getSrc(3));
 606
 607          i->setSrc(0, x);
 608          i->setSrc(1, y);
 609          i->setSrc(2, layer);
 610          i->setSrc(3, i->getSrc(4));
 611          i->setSrc(4, NULL);
 612       }
 613    }
 614
 615    // texel offsets are 3 immediate fields in the instruction,
 616    // nv50 cannot do textureGatherOffsets
 617    assert(i->tex.useOffsets <= 1);
 618
 619    return true;
 620 }
 621
 622 // Bias must be equal for all threads of a quad or lod calculation will fail.
 623 //
 624 // The lanes of a quad are grouped by the bit in the condition register they
 625 // have set, which is selected by differing bias values.
 626 // Move the input values for TEX into a new register set for each group and
 627 // execute TEX only for a specific group.
 628 // We always need to use 4 new registers for the inputs/outputs because the
 629 // implicitly calculated derivatives must be correct.
 630 //
 631 // TODO: move to SSA phase so we can easily determine whether bias is constant
 632 bool
 633 NV50LoweringPreSSA::handleTXB(TexInstruction *i)
 634 {
 635    const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
 636    int l, d;
 637
 638    handleTEX(i);
 639    Value *bias = i->getSrc(i->tex.target.getArgCount());
 640    if (bias->isUniform())
 641       return true;
 642
 643    Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
 644                                  bld.loadImm(NULL, 1));
 645    bld.setPosition(cond, false);
 646
 647    for (l = 1; l < 4; ++l) {
 648       const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
 649       Value *bit = bld.getSSA();
 650       Value *pred = bld.getScratch(1, FILE_FLAGS);
 651       Value *imm = bld.loadImm(NULL, (1 << l));
 652       bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
 653       bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
 654       cond->setSrc(l, bit);
 655    }
 656    Value *flags = bld.getScratch(1, FILE_FLAGS);
 657    bld.setPosition(cond, true);
 658    bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
 659
 660    Instruction *tex[4];
 661    for (l = 0; l < 4; ++l) {
 662       (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
 663       bld.insert(tex[l]);
 664    }
 665
 666    Value *res[4][4];
 667    for (d = 0; i->defExists(d); ++d)
 668       res[0][d] = tex[0]->getDef(d);
 669    for (l = 1; l < 4; ++l) {
 670       for (d = 0; tex[l]->defExists(d); ++d) {
 671          res[l][d] = cloneShallow(func, res[0][d]);
 672          bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
 673       }
 674    }
 675
 676    for (d = 0; i->defExists(d); ++d) {
 677       Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
 678       for (l = 0; l < 4; ++l)
 679          dst->setSrc(l, res[l][d]);
 680    }
 681    delete_Instruction(prog, i);
 682    return true;
 683 }
 684
 685 // LOD must be equal for all threads of a quad.
 686 // Unlike with TXB, here we can just diverge since there's no LOD calculation
 687 // that would require all 4 threads' sources to be set up properly.
 688 bool
 689 NV50LoweringPreSSA::handleTXL(TexInstruction *i)
 690 {
 691    handleTEX(i);
 692    Value *lod = i->getSrc(i->tex.target.getArgCount());
 693    if (lod->isUniform())
 694       return true;
 695
 696    BasicBlock *currBB = i->bb;
 697    BasicBlock *texiBB = i->bb->splitBefore(i, false);
 698    BasicBlock *joinBB = i->bb->splitAfter(i);
 699
 700    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
 701
 702    for (int l = 0; l <= 3; ++l) {
 703       const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
 704       Value *pred = bld.getScratch(1, FILE_FLAGS);
 705       bld.setPosition(currBB, true);
 706       bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
 707       bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
 708       currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
 709       if (l <= 2) {
 710          BasicBlock *laneBB = new BasicBlock(func);
 711          currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
 712          currBB = laneBB;
 713       }
 714    }
 715    bld.setPosition(joinBB, false);
 716    bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
 717    return true;
 718 }
 719
 720 bool
 721 NV50LoweringPreSSA::handleTXD(TexInstruction *i)
 722 {
 723    static const uint8_t qOps[4][2] =
 724    {
 725       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
 726       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
 727       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
 728       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
 729    };
 730    Value *def[4][4];
 731    Value *crd[3];
 732    Instruction *tex;
 733    Value *zero = bld.loadImm(bld.getSSA(), 0);
 734    int l, c;
 735    const int dim = i->tex.target.getDim();
 736
 737    handleTEX(i);
 738    i->op = OP_TEX; // no need to clone dPdx/dPdy later
 739
 740    for (c = 0; c < dim; ++c)
 741       crd[c] = bld.getScratch();
 742
 743    bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
 744    for (l = 0; l < 4; ++l) {
 745       // mov coordinates from lane l to all lanes
 746       for (c = 0; c < dim; ++c)
 747          bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
 748       // add dPdx from lane l to lanes dx
 749       for (c = 0; c < dim; ++c)
 750          bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
 751       // add dPdy from lane l to lanes dy
 752       for (c = 0; c < dim; ++c)
 753          bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
 754       // texture
 755       bld.insert(tex = cloneForward(func, i));
 756       for (c = 0; c < dim; ++c)
 757          tex->setSrc(c, crd[c]);
 758       // save results
 759       for (c = 0; i->defExists(c); ++c) {
 760          Instruction *mov;
 761          def[c][l] = bld.getSSA();
 762          mov = bld.mkMov(def[c][l], tex->getDef(c));
 763          mov->fixed = 1;
 764          mov->lanes = 1 << l;
 765       }
 766    }
 767    bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
 768
 769    for (c = 0; i->defExists(c); ++c) {
 770       Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
 771       for (l = 0; l < 4; ++l)
 772          u->setSrc(l, def[c][l]);
 773    }
 774
 775    i->bb->remove(i);
 776    return true;
 777 }
 778
 779 bool
 780 NV50LoweringPreSSA::handleSET(Instruction *i)
 781 {
 782    if (i->dType == TYPE_F32) {
 783       bld.setPosition(i, true);
 784       i->dType = TYPE_U32;
 785       bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
 786       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
 787    }
 788    return true;
 789 }
 790
 791 bool
 792 NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
 793 {
 794    Value *src0 = bld.getSSA();
 795    Value *src1 = bld.getSSA();
 796    Value *pred = bld.getScratch(1, FILE_FLAGS);
 797
 798    Value *v0 = i->getSrc(0);
 799    Value *v1 = i->getSrc(1);
 800    // XXX: these probably shouldn't be immediates in the first place ...
 801    if (v0->asImm())
 802       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
 803    if (v1->asImm())
 804       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
 805
 806    bld.setPosition(i, true);
 807    bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
 808    bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
 809    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
 810
 811    bld.setPosition(i, false);
 812    i->op = OP_SET;
 813    i->setFlagsDef(0, pred);
 814    i->dType = TYPE_U8;
 815    i->setSrc(0, i->getSrc(2));
 816    i->setSrc(2, NULL);
 817    i->setSrc(1, bld.loadImm(NULL, 0));
 818
 819    return true;
 820 }
 821
 822 bool
 823 NV50LoweringPreSSA::handleSELP(Instruction *i)
 824 {
 825    Value *src0 = bld.getSSA();
 826    Value *src1 = bld.getSSA();
 827
 828    Value *v0 = i->getSrc(0);
 829    Value *v1 = i->getSrc(1);
 830    if (v0->asImm())
 831       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
 832    if (v1->asImm())
 833       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
 834
 835    bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
 836    bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
 837    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
 838    delete_Instruction(prog, i);
 839    return true;
 840 }
 841
 842 bool
 843 NV50LoweringPreSSA::handleWRSV(Instruction *i)
 844 {
 845    Symbol *sym = i->getSrc(0)->asSym();
 846
 847    // these are all shader outputs, $sreg are not writeable
 848    uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
 849    if (addr >= 0x400)
 850       return false;
 851    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
 852
 853    bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
 854
 855    bld.getBB()->remove(i);
 856    return true;
 857 }
 858
 859 bool
 860 NV50LoweringPreSSA::handleCALL(Instruction *i)
 861 {
 862    if (prog->getType() == Program::TYPE_COMPUTE) {
 863       // Add implicit "thread id" argument in $r0 to the function
 864       i->setSrc(i->srcCount(), tid);
 865    }
 866    return true;
 867 }
 868
 869 bool
 870 NV50LoweringPreSSA::handlePRECONT(Instruction *i)
 871 {
 872    delete_Instruction(prog, i);
 873    return true;
 874 }
 875
 876 bool
 877 NV50LoweringPreSSA::handleCONT(Instruction *i)
 878 {
 879    i->op = OP_BRA;
 880    return true;
 881 }
 882
 883 bool
 884 NV50LoweringPreSSA::handleRDSV(Instruction *i)
 885 {
 886    Symbol *sym = i->getSrc(0)->asSym();
 887    uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
 888    Value *def = i->getDef(0);
 889    SVSemantic sv = sym->reg.data.sv.sv;
 890    int idx = sym->reg.data.sv.index;
 891
 892    if (addr >= 0x400) // mov $sreg
 893       return true;
 894
 895    switch (sv) {
 896    case SV_POSITION:
 897       assert(prog->getType() == Program::TYPE_FRAGMENT);
 898       bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
 899       break;
 900    case SV_FACE:
 901       bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
 902       if (i->dType == TYPE_F32) {
 903          bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
 904          bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
 905       }
 906       break;
 907    case SV_NCTAID:
 908    case SV_CTAID:
 909    case SV_NTID:
 910       if ((sv == SV_NCTAID && idx >= 2) ||
 911           (sv == SV_NTID && idx >= 3)) {
 912          bld.mkMov(def, bld.mkImm(1));
 913       } else if (sv == SV_CTAID && idx >= 2) {
 914          bld.mkMov(def, bld.mkImm(0));
 915       } else {
 916          Value *x = bld.getSSA(2);
 917          bld.mkOp1(OP_LOAD, TYPE_U16, x,
 918                    bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
 919          bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
 920       }
 921       break;
 922    case SV_TID:
 923       if (idx == 0) {
 924          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
 925       } else if (idx == 1) {
 926          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
 927          bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
 928       } else if (idx == 2) {
 929          bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
 930       } else {
 931          bld.mkMov(def, bld.mkImm(0));
 932       }
 933       break;
 934    default:
 935       bld.mkFetch(i->getDef(0), i->dType,
 936                   FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
 937       break;
 938    }
 939    bld.getBB()->remove(i);
 940    return true;
 941 }
 942
 943 bool
 944 NV50LoweringPreSSA::handleMUL(Instruction *i)
 945 {
 946    if (!isFloatType(i->dType) && typeSizeof(i->sType) > 2)
 947       return expandIntegerMUL(&bld, i);
 948    return true;
 949 }
 950
 951 bool
 952 NV50LoweringPreSSA::handleDIV(Instruction *i)
 953 {
 954    if (!isFloatType(i->dType))
 955       return true;
 956    bld.setPosition(i, false);
 957    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
 958    i->op = OP_MUL;
 959    i->setSrc(1, rcp->getDef(0));
 960    return true;
 961 }
 962
 963 bool
 964 NV50LoweringPreSSA::handleSQRT(Instruction *i)
 965 {
 966    Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
 967                                 bld.getSSA(), i->getSrc(0));
 968    i->op = OP_MUL;
 969    i->setSrc(1, rsq->getDef(0));
 970
 971    return true;
 972 }
 973
 974 bool
 975 NV50LoweringPreSSA::handlePOW(Instruction *i)
 976 {
 977    LValue *val = bld.getScratch();
 978
 979    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
 980    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
 981    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
 982
 983    i->op = OP_EX2;
 984    i->setSrc(0, val);
 985    i->setSrc(1, NULL);
 986
 987    return true;
 988 }
 989
 990 bool
 991 NV50LoweringPreSSA::handleEXPORT(Instruction *i)
 992 {
 993    if (prog->getType() == Program::TYPE_FRAGMENT) {
 994       if (i->getIndirect(0, 0)) {
 995          // TODO: redirect to l[] here, load to GPRs at exit
 996          return false;
 997       } else {
 998          int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
 999
1000          i->op = OP_MOV;
1001          i->src(0).set(i->src(1));
1002          i->setSrc(1, NULL);
1003          i->setDef(0, new_LValue(func, FILE_GPR));
1004          i->getDef(0)->reg.data.id = id;
1005
1006          prog->maxGPR = MAX2(prog->maxGPR, id);
1007       }
1008    }
1009    return true;
1010 }
1011
1012 // Set flags according to predicate and make the instruction read $cX.
1013 void
1014 NV50LoweringPreSSA::checkPredicate(Instruction *insn)
1015 {
1016    Value *pred = insn->getPredicate();
1017    Value *cdst;
1018
1019    if (!pred || pred->reg.file == FILE_FLAGS)
1020       return;
1021    cdst = bld.getSSA(1, FILE_FLAGS);
1022
1023    bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, cdst, bld.loadImm(NULL, 0), pred);
1024
1025    insn->setPredicate(insn->cc, cdst);
1026 }
1027
1028 //
1029 // - add quadop dance for texturing
1030 // - put FP outputs in GPRs
1031 // - convert instruction sequences
1032 //
1033 bool
1034 NV50LoweringPreSSA::visit(Instruction *i)
1035 {
1036    if (i->prev)
1037       bld.setPosition(i->prev, true);
1038    else
1039    if (i->next)
1040       bld.setPosition(i->next, false);
1041    else
1042       bld.setPosition(i->bb, true);
1043
1044    if (i->cc != CC_ALWAYS)
1045       checkPredicate(i);
1046
1047    switch (i->op) {
1048    case OP_TEX:
1049    case OP_TXF:
1050    case OP_TXG:
1051       return handleTEX(i->asTex());
1052    case OP_TXB:
1053       return handleTXB(i->asTex());
1054    case OP_TXL:
1055       return handleTXL(i->asTex());
1056    case OP_TXD:
1057       return handleTXD(i->asTex());
1058    case OP_EX2:
1059       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
1060       i->setSrc(0, i->getDef(0));
1061       break;
1062    case OP_SET:
1063       return handleSET(i);
1064    case OP_SLCT:
1065       return handleSLCT(i->asCmp());
1066    case OP_SELP:
1067       return handleSELP(i);
1068    case OP_POW:
1069       return handlePOW(i);
1070    case OP_MUL:
1071       return handleMUL(i);
1072    case OP_DIV:
1073       return handleDIV(i);
1074    case OP_SQRT:
1075       return handleSQRT(i);
1076    case OP_EXPORT:
1077       return handleEXPORT(i);
1078    case OP_RDSV:
1079       return handleRDSV(i);
1080    case OP_WRSV:
1081       return handleWRSV(i);
1082    case OP_CALL:
1083       return handleCALL(i);
1084    case OP_PRECONT:
1085       return handlePRECONT(i);
1086    case OP_CONT:
1087       return handleCONT(i);
1088    default:
1089       break;
1090    }
1091    return true;
1092 }
1093
1094 bool
1095 TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
1096 {
1097    bool ret = false;
1098
1099    if (stage == CG_STAGE_PRE_SSA) {
1100       NV50LoweringPreSSA pass(prog);
1101       ret = pass.run(prog, false, true);
1102    } else
1103    if (stage == CG_STAGE_SSA) {
1104       if (!prog->targetPriv)
1105          prog->targetPriv = new std::list<Instruction *>();
1106       NV50LegalizeSSA pass(prog);
1107       ret = pass.run(prog, false, true);
1108    } else
1109    if (stage == CG_STAGE_POST_RA) {
1110       NV50LegalizePostRA pass;
1111       ret = pass.run(prog, false, true);
1112       if (prog->targetPriv)
1113          delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
1114    }
1115    return ret;
1116 }
1117
1118 } // namespace nv50_ir