src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp

   1 /*
   2  * Copyright 2011 Christoph Bumiller
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 #include "codegen/nv50_ir.h"
  24 #include "codegen/nv50_ir_build_util.h"
  25
  26 #include "codegen/nv50_ir_target_nv50.h"
  27
  28 namespace nv50_ir {
  29
// nv50 doesn't support 32 bit integer multiplication
//
//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
// -------------------
//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
// ah*bh 00 00                 (           carry1) << 16 + ( carry2)
//       al*bl
//    ah*bl 00
//
// fffe0001 + fffe0001
//
// Note that this sort of splitting doesn't work for signed values, so we
// compute the sign on those manually and then perform an unsigned multiply.
//
// Expands the integer multiply `mul` into a sequence of half-width
// multiplies and MADs emitted through `bld`, then deletes `mul`.
//
// Only 32 bit and 64 bit integer source types are handled (signed types are
// mapped to their unsigned counterpart); for any other source type the
// function returns false before emitting or modifying anything. On success
// it returns true, and the final value has been moved into mul's def 0:
// the low part normally, or the high part when mul->subOp is
// NV50_IR_SUBOP_MUL_HIGH.
static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;

   DataType fTy; // full type
   switch (mul->sType) {
   case TYPE_S32: fTy = TYPE_U32; break;
   case TYPE_S64: fTy = TYPE_U64; break;
   default: fTy = mul->sType; break;
   }

   DataType hTy; // half type
   switch (fTy) {
   case TYPE_U32: hTy = TYPE_U16; break;
   case TYPE_U64: hTy = TYPE_U32; break;
   default:
      // not a 32/64 bit integer multiply: leave the instruction untouched
      return false;
   }
   unsigned int fullSize = typeSizeof(fTy);
   unsigned int halfSize = typeSizeof(hTy);

   // Handles to the emitted instructions. Slots 0-4 and 7 are always filled,
   // slots 5, 6 and 8 only on the highResult path. Only slots 2..4 (and 5 for
   // highResult) are read back by the loop at the end of the function.
   Instruction *i[9];

   bld->setPosition(mul, true);

   Value *s[2];         // the two operands actually multiplied
   Value *a[2], *b[2];  // halves of s[0] / s[1] as produced by mkSplit
   Value *t[4];         // full-width temporaries for the low-result chain
   for (int j = 0; j < 4; ++j)
      t[j] = bld->getSSA(fullSize);

   s[0] = mul->getSrc(0);
   s[1] = mul->getSrc(1);

   // For a signed high multiply, operate on the absolute values; the sign is
   // re-applied further down.
   if (isSignedType(mul->sType) && highResult) {
      s[0] = bld->getSSA(fullSize);
      s[1] = bld->getSSA(fullSize);
      bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
      bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
   }

   // split sources into halves
   i[0] = bld->mkSplit(a, halfSize, s[0]);
   i[1] = bld->mkSplit(b, halfSize, s[1]);

   // Matching the formula in the header comment, index 0 plays the role of
   // the low half and index 1 the high half:
   //   t0 = a0 * b1
   //   t1 = a1 * b0 + t0          (sum of the cross terms)
   //   t2 = t1 << (half width in bits)
   //   t3 = a0 * b0 + t2          (low result)
   i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
   i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
   i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);

   if (highResult) {
      Value *c[2];   // flag values: c[0] defined by i[3], c[1] defined by i[4]
      Value *r[5];   // full-width temporaries for the high-result chain
      // NOTE(review): the shift is done on an int; for the 64 bit case
      // (halfSize == 4) this shifts by 32 - confirm whether that path is
      // ever taken with highResult set.
      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
      c[0] = bld->getSSA(1, FILE_FLAGS);
      c[1] = bld->getSSA(1, FILE_FLAGS);
      for (int j = 0; j < 5; ++j)
         r[j] = bld->getSSA(fullSize);

      // r0 = upper part of the cross-term sum
      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
      // r1 = r0 + (1 << half width), executed only under CC_C on c[0]
      // (predicate attached below); r3 = r0 under CC_NC; r2 merges the two.
      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
      bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
      // r4 = a1 * b1 + r2, additionally consuming c[1] as flags source
      i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);

      // set carry defs / sources
      i[3]->setFlagsDef(1, c[0]);
      // actual result required in negative case, but ignored for
      // unsigned. for some reason the compiler ends up dropping the whole
      // instruction if the destination is unused but the flags are.
      if (isSignedType(mul->sType))
         i[4]->setFlagsDef(1, c[1]);
      else
         i[4]->setFlagsDef(0, c[1]);
      i[6]->setPredicate(CC_C, c[0]);
      i[5]->setFlagsSrc(3, c[1]);

      if (isSignedType(mul->sType)) {
         Value *cc[2];  // cc[0]: flags of src0 ^ src1, cc[1]: flags of low + 1
         Value *rr[7];  // temporaries for the negation path
         Value *one = bld->getSSA(fullSize);
         bld->loadImm(one, 1);
         for (int j = 0; j < 7; j++)
            rr[j] = bld->getSSA(fullSize);

         // NOTE: this logic uses predicates because splitting basic blocks is
         // ~impossible during the SSA phase. The RA relies on a correlation
         // between edge order and phi node sources.

         // Set the sign of the result based on the inputs
         // (the XOR has no destination value; only its flags are kept)
         bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
            ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));

         // 1s complement of 64-bit value
         // (rr0 = ~high part, rr1 = ~low part, both only under CC_S)
         bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
            ->setPredicate(CC_S, cc[0]);
         bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
            ->setPredicate(CC_S, cc[0]);

         // add to low 32-bits, keep track of the carry
         // (no destination value; only the flags in cc[1] are used)
         Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
         n->setPredicate(CC_S, cc[0]);
         n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));

         // If there was a carry, add 1 to the upper 32 bits
         // XXX: These get executed even if they shouldn't be
         bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
            ->setPredicate(CC_C, cc[1]);
         bld->mkMov(rr[3], rr[0])
            ->setPredicate(CC_NC, cc[1]);
         bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);

         // Merge the results from the negative and non-negative paths
         bld->mkMov(rr[5], rr[4])
            ->setPredicate(CC_S, cc[0]);
         bld->mkMov(rr[6], r[4])
            ->setPredicate(CC_NS, cc[0]);
         bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
      } else {
         // unsigned high result
         bld->mkMov(mul->getDef(0), r[4]);
      }
   } else {
      // low result
      bld->mkMov(mul->getDef(0), t[3]);
   }
   delete_Instruction(bld->getProgram(), mul);

   // The multiplies/MADs were created with the full type; switch their
   // source type to the half type (i[5] only exists for highResult).
   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
      if (i[j])
         i[j]->sType = hTy;

   return true;
}
 176
// Per-lane operation selectors for QUADOP(); each value fits in two bits.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Packs four QOP_* selectors (given without the QOP_ prefix) into one 8 bit
// value, two bits per selector: q in bits 7:6, r in bits 5:4, s in bits 3:2
// and t in bits 1:0. The column labels below name the argument positions.
//             UL UR LL LR
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))
 186
// Legalization pass intended (per its name) to run after register
// allocation. It removes no-ops, emulates PRERET on chipsets below 0xa0,
// splits 64 bit operations and replaces zero immediates by a fixed register.
class NV50LegalizePostRA : public Pass
{
private:
   // Sets up r63 and patches the saved output writes back into their
   // defining instructions.
   virtual bool visit(Function *);
   // Walks all instructions of a block and applies the per-instruction fixups.
   virtual bool visit(BasicBlock *);

   // Rewrites a PRERET into the branch/call emulation sequence.
   void handlePRERET(FlowInstruction *);
   // Replaces every immediate source equal to 0 with r63.
   void replaceZero(Instruction *);

   // GPR value whose id is set to 63 or 127 in visit(Function *), used as
   // the stand-in for zero immediates.
   LValue *r63;
};
 198
 199 bool
 200 NV50LegalizePostRA::visit(Function *fn)
 201 {
 202    Program *prog = fn->getProgram();
 203
 204    r63 = new_LValue(fn, FILE_GPR);
 205    // GPR units on nv50 are in half-regs
 206    if (prog->maxGPR < 126)
 207       r63->reg.data.id = 63;
 208    else
 209       r63->reg.data.id = 127;
 210
 211    // this is actually per-program, but we can do it all on visiting main()
 212    std::list<Instruction *> *outWrites =
 213       reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
 214
 215    if (outWrites) {
 216       for (std::list<Instruction *>::iterator it = outWrites->begin();
 217            it != outWrites->end(); ++it)
 218          (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
 219       // instructions will be deleted on exit
 220       outWrites->clear();
 221    }
 222
 223    return true;
 224 }
 225
 226 void
 227 NV50LegalizePostRA::replaceZero(Instruction *i)
 228 {
 229    for (int s = 0; i->srcExists(s); ++s) {
 230       ImmediateValue *imm = i->getSrc(s)->asImm();
 231       if (imm && imm->reg.data.u64 == 0)
 232          i->setSrc(s, r63);
 233    }
 234 }
 235
 236 // Emulate PRERET: jump to the target and call to the origin from there
 237 //
 238 // WARNING: atm only works if BBs are affected by at most a single PRERET
 239 //
 240 // BB:0
 241 // preret BB:3
 242 // (...)
 243 // BB:3
 244 // (...)
 245 //             --->
 246 // BB:0
 247 // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
 248 // (...)
 249 // BB:3
 250 // bra BB:3 + n1 (skip the call)
 251 // call BB:0 + n2 (skip bra at beginning of BB:0)
 252 // (...)
 253 void
 254 NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
 255 {
 256    BasicBlock *bbE = pre->bb;
 257    BasicBlock *bbT = pre->target.bb;
 258
 259    pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
 260    bbE->remove(pre);
 261    bbE->insertHead(pre);
 262
 263    Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
 264    Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
 265
 266    bbT->insertHead(call);
 267    bbT->insertHead(skip);
 268
 269    // NOTE: maybe split blocks to prevent the instructions from moving ?
 270
 271    skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
 272    call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
 273 }
 274
 275 bool
 276 NV50LegalizePostRA::visit(BasicBlock *bb)
 277 {
 278    Instruction *i, *next;
 279
 280    // remove pseudo operations and non-fixed no-ops, split 64 bit operations
 281    for (i = bb->getFirst(); i; i = next) {
 282       next = i->next;
 283       if (i->isNop()) {
 284          bb->remove(i);
 285       } else
 286       if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
 287          handlePRERET(i->asFlow());
 288       } else {
 289          // TODO: We will want to do this before register allocation,
 290          // since have to use a $c register for the carry flag.
 291          if (typeSizeof(i->dType) == 8) {
 292             Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
 293             if (hi)
 294                next = hi;
 295          }
 296
 297          if (i->op != OP_PFETCH && i->op != OP_BAR &&
 298              (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
 299             replaceZero(i);
 300       }
 301    }
 302    if (!bb->getEntry())
 303       return true;
 304
 305    return true;
 306 }
 307
// Legalization pass operating on the SSA form (per its name): lowers
// integer DIV/MOD/MUL/MAD, fixes up instructions that define address
// registers and optionally queues output writes for later propagation.
class NV50LegalizeSSA : public Pass
{
public:
   NV50LegalizeSSA(Program *);

   // Dispatches every non-PHI instruction of the block to the handlers below.
   virtual bool visit(BasicBlock *bb);

private:
   // Queues an EXPORT in outWrites and removes it from its block when the
   // defining instruction qualifies.
   void propagateWriteToOutput(Instruction *);
   // Lowers 32 bit integer division using f32 arithmetic.
   void handleDIV(Instruction *);
   // Lowers 32 bit integer modulo via handleDIV and a multiply/subtract.
   void handleMOD(Instruction *);
   // Expands integer MUL/MAD wider than 16 bits via expandIntegerMUL.
   void handleMUL(Instruction *);
   // Makes an instruction that defines an address register legal.
   void handleAddrDef(Instruction *);

   // True for SHL(GPR, 0), the pattern used to move a GPR into $a.
   inline bool isARL(const Instruction *) const;

   BuildUtil bld;

   // List taken from Program::targetPriv, or NULL when output write
   // propagation is disabled (see the constructor).
   std::list<Instruction *> *outWrites;
};
 328
 329 NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
 330 {
 331    bld.setProgram(prog);
 332
 333    if (prog->optLevel >= 2 &&
 334        (prog->getType() == Program::TYPE_GEOMETRY ||
 335         prog->getType() == Program::TYPE_VERTEX))
 336       outWrites =
 337          reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
 338    else
 339       outWrites = NULL;
 340 }
 341
 342 void
 343 NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
 344 {
 345    if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
 346       return;
 347
 348    // check def instruction can store
 349    Instruction *di = st->getSrc(1)->defs.front()->getInsn();
 350
 351    // TODO: move exports (if beneficial) in common opt pass
 352    if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
 353       return;
 354
 355    for (int s = 0; di->srcExists(s); ++s)
 356       if (di->src(s).getFile() == FILE_IMMEDIATE)
 357          return;
 358
 359    if (prog->getType() == Program::TYPE_GEOMETRY) {
 360       // Only propagate output writes in geometry shaders when we can be sure
 361       // that we are propagating to the same output vertex.
 362       if (di->bb != st->bb)
 363          return;
 364       Instruction *i;
 365       for (i = di; i != st; i = i->next) {
 366          if (i->op == OP_EMIT || i->op == OP_RESTART)
 367             return;
 368       }
 369       assert(i); // st after di
 370    }
 371
 372    // We cannot set defs to non-lvalues before register allocation, so
 373    // save & remove (to save registers) the exports and replace later.
 374    outWrites->push_back(st);
 375    st->bb->remove(st);
 376 }
 377
 378 bool
 379 NV50LegalizeSSA::isARL(const Instruction *i) const
 380 {
 381    ImmediateValue imm;
 382
 383    if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
 384       return false;
 385    if (!i->src(1).getImmediate(imm))
 386       return false;
 387    return imm.isInteger(0);
 388 }
 389
 390 void
 391 NV50LegalizeSSA::handleAddrDef(Instruction *i)
 392 {
 393    Instruction *arl;
 394
 395    i->getDef(0)->reg.size = 2; // $aX are only 16 bit
 396
 397    // PFETCH can always write to $a
 398    if (i->op == OP_PFETCH)
 399       return;
 400    // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
 401    if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
 402       if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
 403          return;
 404       if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
 405          return;
 406    }
 407
 408    // turn $a sources into $r sources (can't operate on $a)
 409    for (int s = 0; i->srcExists(s); ++s) {
 410       Value *a = i->getSrc(s);
 411       Value *r;
 412       if (a->reg.file == FILE_ADDRESS) {
 413          if (a->getInsn() && isARL(a->getInsn())) {
 414             i->setSrc(s, a->getInsn()->getSrc(0));
 415          } else {
 416             bld.setPosition(i, false);
 417             r = bld.getSSA();
 418             bld.mkMov(r, a);
 419             i->setSrc(s, r);
 420          }
 421       }
 422    }
 423    if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
 424       return;
 425
 426    // turn result back into $a
 427    bld.setPosition(i, true);
 428    arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
 429    i->setDef(0, arl->getSrc(0));
 430 }
 431
 432 void
 433 NV50LegalizeSSA::handleMUL(Instruction *mul)
 434 {
 435    if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
 436       return;
 437    Value *def = mul->getDef(0);
 438    Value *pred = mul->getPredicate();
 439    CondCode cc = mul->cc;
 440    if (pred)
 441       mul->setPredicate(CC_ALWAYS, NULL);
 442
 443    if (mul->op == OP_MAD) {
 444       Instruction *add = mul;
 445       bld.setPosition(add, false);
 446       Value *res = cloneShallow(func, mul->getDef(0));
 447       mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
 448       add->op = OP_ADD;
 449       add->setSrc(0, mul->getDef(0));
 450       add->setSrc(1, add->getSrc(2));
 451       for (int s = 2; add->srcExists(s); ++s)
 452          add->setSrc(s, NULL);
 453       mul->subOp = add->subOp;
 454       add->subOp = 0;
 455    }
 456    expandIntegerMUL(&bld, mul);
 457    if (pred)
 458       def->getInsn()->setPredicate(cc, pred);
 459 }
 460
// Use f32 division: first compute an approximate result, use it to reduce
// the dividend, which should then be representable as f32, divide the reduced
// dividend, and add the quotients.
//
// Only U32 and S32 divisions are lowered; other types are left untouched.
// All helper instructions are emitted before `div`, and `div` itself is
// rewritten in place into the final SUB (unsigned) or UNION (signed).
void
NV50LegalizeSSA::handleDIV(Instruction *div)
{
   const DataType ty = div->sType;

   if (ty != TYPE_U32 && ty != TYPE_S32)
      return;

   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;

   bld.setPosition(div, false);

   // a / b: integer operands (absolute values for S32),
   // af / bf: their f32 conversions
   Value *a, *af = bld.getSSA();
   Value *b, *bf = bld.getSSA();

   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));

   if (isSignedType(ty)) {
      // work on magnitudes; the sign is fixed up at the end
      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      a = bld.getSSA();
      b = bld.getSSA();
      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
   } else {
      a = div->getSrc(0);
      b = div->getSrc(1);
   }

   // bf = 1 / bf, then an integer (U32) add of -2 on the result
   // NOTE(review): presumably this nudges the reciprocal downwards so the
   // quotient estimate never overshoots - confirm.
   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));

   // first quotient estimate: q0 = (int)(af * bf), rounding toward zero
   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;

   // get error of 1st result
   // (aRf = a - q0 * b as integer, aR = its f32 conversion)
   expandIntegerMUL(&bld,
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);

   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);

   // second estimate from the remaining error: qR = (int)(aR * bf)
   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
      ->rnd = ROUND_Z;
   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients

   // correction: if modulus >= divisor, add 1
   // (m = a - q * b, s = SET(m >= b); the add is done as q - s, so this
   // presumably relies on SET producing -1 for true - TODO confirm)
   expandIntegerMUL(&bld,
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
   if (!isSignedType(ty)) {
      // unsigned: the original instruction becomes q - s
      div->op = OP_SUB;
      div->setSrc(0, q);
      div->setSrc(1, s);
   } else {
      // signed: apply the correction to a new q, then select q or -q
      t = q;
      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
      s = bld.getSSA();
      t = bld.getSSA();
      // fix the sign
      // (flags of src0 ^ src1 decide: s = -q under CC_S, t = q under CC_NS)
      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);

      // the original instruction merges the two predicated results
      div->op = OP_UNION;
      div->setSrc(0, s);
      div->setSrc(1, t);
   }
}
 537
 538 void
 539 NV50LegalizeSSA::handleMOD(Instruction *mod)
 540 {
 541    if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
 542       return;
 543    bld.setPosition(mod, false);
 544
 545    Value *q = bld.getSSA();
 546    Value *m = bld.getSSA();
 547
 548    bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
 549    handleDIV(q->getInsn());
 550
 551    bld.setPosition(mod, false);
 552    expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
 553
 554    mod->op = OP_SUB;
 555    mod->setSrc(1, m);
 556 }
 557
 558 bool
 559 NV50LegalizeSSA::visit(BasicBlock *bb)
 560 {
 561    Instruction *insn, *next;
 562    // skipping PHIs (don't pass them to handleAddrDef) !
 563    for (insn = bb->getEntry(); insn; insn = next) {
 564       next = insn->next;
 565
 566       if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
 567          handleAddrDef(insn);
 568
 569       switch (insn->op) {
 570       case OP_EXPORT:
 571          if (outWrites)
 572             propagateWriteToOutput(insn);
 573          break;
 574       case OP_DIV:
 575          handleDIV(insn);
 576          break;
 577       case OP_MOD:
 578          handleMOD(insn);
 579          break;
 580       case OP_MAD:
 581       case OP_MUL:
 582          handleMUL(insn);
 583          break;
 584       default:
 585          break;
 586       }
 587    }
 588    return true;
 589 }
 590
// Lowering pass intended (per its name) to run before conversion to SSA
// form. Only part of the handlers are defined in the visible portion of the
// file, so the per-method notes below are limited to those.
class NV50LoweringPreSSA : public Pass
{
public:
   NV50LoweringPreSSA(Program *);

private:
   virtual bool visit(Instruction *);
   // For compute programs, adds the implicit thread id argument and sets tid.
   virtual bool visit(Function *);

   bool handleRDSV(Instruction *);
   bool handleWRSV(Instruction *);

   bool handlePFETCH(Instruction *);
   bool handleEXPORT(Instruction *);
   bool handleLOAD(Instruction *);

   bool handleDIV(Instruction *);
   bool handleSQRT(Instruction *);
   bool handlePOW(Instruction *);

   bool handleSET(Instruction *);
   bool handleSLCT(CmpInstruction *);
   bool handleSELP(Instruction *);

   // Common texture fixups: MS coordinates, dref/lod order, array layer,
   // cube arrays and immediate texel offsets.
   bool handleTEX(TexInstruction *);
   bool handleTXB(TexInstruction *); // I really
   bool handleTXL(TexInstruction *); // hate
   bool handleTXD(TexInstruction *); // these 3
   bool handleTXLQ(TexInstruction *);
   bool handleTXQ(TexInstruction *);

   bool handleCALL(Instruction *);
   bool handlePRECONT(Instruction *);
   bool handleCONT(Instruction *);

   void checkPredicate(Instruction *);
   // Loads the two per-texture MS values from the constant buffer and
   // returns them plus their sum.
   void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
   // Loads the x/y deltas for MS level `ms` and sample `s`.
   void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);

private:
   const Target *const targ; // prog->getTarget(), set in the constructor

   BuildUtil bld;

   // Thread id value; NULL until visit(Function *) sets it for compute
   // programs.
   Value *tid;
};
 637
 638 NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
 639    targ(prog->getTarget()), tid(NULL)
 640 {
 641    bld.setProgram(prog);
 642 }
 643
 644 bool
 645 NV50LoweringPreSSA::visit(Function *f)
 646 {
 647    BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
 648
 649    if (prog->getType() == Program::TYPE_COMPUTE) {
 650       // Add implicit "thread id" argument in $r0 to the function
 651       Value *arg = new_LValue(func, FILE_GPR);
 652       arg->reg.data.id = 0;
 653       f->ins.push_back(arg);
 654
 655       bld.setPosition(root, false);
 656       tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
 657    }
 658
 659    return true;
 660 }
 661
 662 void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
 663                                        Value **ms_x, Value **ms_y) {
 664    // This loads the texture-indexed ms setting from the constant buffer
 665    Value *tmp = new_LValue(func, FILE_GPR);
 666    uint8_t b = prog->driver->io.resInfoCBSlot;
 667    off += prog->driver->io.suInfoBase;
 668    if (prog->getType() > Program::TYPE_VERTEX)
 669       off += 16 * 2 * 4;
 670    if (prog->getType() > Program::TYPE_GEOMETRY)
 671       off += 16 * 2 * 4;
 672    *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
 673                              FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
 674    *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
 675                              FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
 676    *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);
 677 }
 678
 679 void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) {
 680    // Given a MS level, and a sample id, compute the delta x/y
 681    uint8_t b = prog->driver->io.msInfoCBSlot;
 682    Value *off = new_LValue(func, FILE_ADDRESS), *t = new_LValue(func, FILE_GPR);
 683
 684    // The required information is at mslevel * 16 * 4 + sample * 8
 685    // = (mslevel * 8 + sample) * 8
 686    bld.mkOp2(OP_SHL,
 687              TYPE_U32,
 688              off,
 689              bld.mkOp2v(OP_ADD, TYPE_U32, t,
 690                         bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)),
 691                         s),
 692              bld.mkImm(3));
 693    *dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
 694                            FILE_MEMORY_CONST, b, TYPE_U32,
 695                            prog->driver->io.msInfoBase), off);
 696    *dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
 697                            FILE_MEMORY_CONST, b, TYPE_U32,
 698                            prog->driver->io.msInfoBase + 4), off);
 699 }
 700
// Common lowering for texture instructions; also called by the TXB/TXL/TXD
// handlers. In order, it
//  - rewrites multisample coordinates so they address the requested sample,
//  - swaps the dref and bias/lod sources for shadow TXB/TXL,
//  - converts and clamps the array layer (except for TXF),
//  - turns cube array accesses into 2D array accesses via a TEXPREP,
//  - moves immediate texel offsets into the instruction's offset fields.
// Always returns true.
bool
NV50LoweringPreSSA::handleTEX(TexInstruction *i)
{
   const int arg = i->tex.target.getArgCount();
   // source slots of the depth reference and of the bias/lod value
   const int dref = arg;
   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;

   // handle MS, which means looking up the MS params for this texture, and
   // adjusting the input coordinates to point at the right sample.
   if (i->tex.target.isMS()) {
      Value *x = i->getSrc(0);
      Value *y = i->getSrc(1);
      Value *s = i->getSrc(arg - 1); // sample index: last coordinate argument
      Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),
         *ms, *ms_x, *ms_y, *dx, *dy;

      i->tex.target.clearMS();

      // each texture owns 4 * 2 bytes in the info table
      loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
      loadMsInfo(ms, s, &dx, &dy);

      // tx = (x << ms_x) + dx, ty = (y << ms_y) + dy
      bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
      bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
      bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
      bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
      i->setSrc(0, tx);
      i->setSrc(1, ty);
      // the sample index slot is replaced by a constant 0
      i->setSrc(arg - 1, bld.loadImm(NULL, 0));
   }

   // dref comes before bias/lod
   if (i->tex.target.isShadow())
      if (i->op == OP_TXB || i->op == OP_TXL)
         i->swapSources(dref, lod);

   if (i->tex.target.isArray()) {
      if (i->op != OP_TXF) {
         // array index must be converted to u32, but it's already an integer
         // for TXF
         Value *layer = i->getSrc(arg - 1);
         LValue *src = new_LValue(func, FILE_GPR);
         bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
         // clamp the layer index to at most 511
         bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
         i->setSrc(arg - 1, src);
      }
      if (i->tex.target.isCube() && i->srcCount() > 4) {
         std::vector<Value *> acube, a2d;
         int c;

         // TEXPREP consumes the first four sources and produces three values
         acube.resize(4);
         for (c = 0; c < 4; ++c)
            acube[c] = i->getSrc(c);
         a2d.resize(4);
         for (c = 0; c < 3; ++c)
            a2d[c] = new_LValue(func, FILE_GPR);
         a2d[3] = NULL;

         bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
                   a2d, acube)->asTex()->tex.mask = 0x7;

         // the three TEXPREP results replace sources 0..2; the remaining
         // sources shift down by one slot and the last slot is cleared
         for (c = 0; c < 3; ++c)
            i->setSrc(c, a2d[c]);
         for (; i->srcExists(c + 1); ++c)
            i->setSrc(c, i->getSrc(c + 1));
         i->setSrc(c, NULL);
         assert(c <= 4);

         i->tex.target = i->tex.target.isShadow() ?
            TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
      }
   }

   // texel offsets are 3 immediate fields in the instruction,
   // nv50 cannot do textureGatherOffsets
   assert(i->tex.useOffsets <= 1);
   if (i->tex.useOffsets) {
      for (int c = 0; c < 3; ++c) {
         ImmediateValue val;
         // NOTE(review): only an assert guards the non-immediate case; val is
         // still read below when asserts are compiled out.
         if (!i->offset[0][c].getImmediate(val))
            assert(!"non-immediate offset");
         i->tex.offset[c] = val.reg.data.u32;
         i->offset[0][c].set(NULL);
      }
   }

   return true;
}
 788
// Bias must be equal for all threads of a quad or lod calculation will fail.
//
// The lanes of a quad are grouped by the bit in the condition register they
// have set, which is selected by differing bias values.
// Move the input values for TEX into a new register set for each group and
// execute TEX only for a specific group.
// We always need to use 4 new registers for the inputs/outputs because the
// implicitly calculated derivatives must be correct.
//
// TODO: move to SSA phase so we can easily determine whether bias is constant
//
// Returns the result of handleTEX() for cube shadow targets, true otherwise.
// When the bias is not uniform, the original instruction is replaced by four
// predicated clones and deleted.
bool
NV50LoweringPreSSA::handleTXB(TexInstruction *i)
{
   // one condition code per lane group, tested against `flags` below
   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
   int l, d;

   // We can't actually apply bias *and* do a compare for a cube
   // texture. Since the compare has to be done before the filtering, just
   // drop the bias on the floor.
   if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {
      i->op = OP_TEX;
      // source 4 takes over slot 3, and slot 4 is cleared
      i->setSrc(3, i->getSrc(4));
      i->setSrc(4, NULL);
      return handleTEX(i);
   }

   handleTEX(i);
   Value *bias = i->getSrc(i->tex.target.getArgCount());
   // a uniform bias needs no per-lane handling
   if (bias->isUniform())
      return true;

   // cond merges (UNION) the constant 1 with the per-lane bits built below
   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
                                 bld.loadImm(NULL, 1));
   bld.setPosition(cond, false);

   for (l = 1; l < 4; ++l) {
      // compare the bias against lane l's bias in every lane; lanes where
      // the flags report equality (CC_EQ) get bit (1 << l)
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *bit = bld.getSSA();
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      Value *imm = bld.loadImm(NULL, (1 << l));
      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
      cond->setSrc(l, bit);
   }
   // convert the merged value into a flags value, emitted after cond
   Value *flags = bld.getScratch(1, FILE_FLAGS);
   bld.setPosition(cond, true);
   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;

   // one clone of the texture instruction per group, predicated on cc[l]
   Instruction *tex[4];
   for (l = 0; l < 4; ++l) {
      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
      bld.insert(tex[l]);
   }

   // res[l][d]: def d of group l; group 0 uses the clone's defs directly,
   // the other groups get predicated copies
   Value *res[4][4];
   for (d = 0; i->defExists(d); ++d)
      res[0][d] = tex[0]->getDef(d);
   for (l = 1; l < 4; ++l) {
      for (d = 0; tex[l]->defExists(d); ++d) {
         res[l][d] = cloneShallow(func, res[0][d]);
         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
      }
   }

   // merge the per-group results into the original destinations
   for (d = 0; i->defExists(d); ++d) {
      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
      for (l = 0; l < 4; ++l)
         dst->setSrc(l, res[l][d]);
   }
   delete_Instruction(prog, i);
   return true;
}
 861
// LOD must be equal for all threads of a quad.
// Unlike with TXB, here we can just diverge since there's no LOD calculation
// that would require all 4 threads' sources to be set up properly.
//
// When the LOD is not uniform, the texture instruction is isolated in its
// own basic block, and a chain of per-lane blocks branching to it is built
// in front, with a JOINAT before and a JOIN after. Always returns true.
bool
NV50LoweringPreSSA::handleTXL(TexInstruction *i)
{
   handleTEX(i);
   Value *lod = i->getSrc(i->tex.target.getArgCount());
   // a uniform LOD needs no divergence
   if (lod->isUniform())
      return true;

   // currBB: code before the texture op, texiBB: the texture op itself,
   // joinBB: everything after it
   BasicBlock *currBB = i->bb;
   BasicBlock *texiBB = i->bb->splitBefore(i, false);
   BasicBlock *joinBB = i->bb->splitAfter(i);

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   for (int l = 0; l <= 3; ++l) {
      // compare the LOD against lane l's LOD and branch to the texture block
      // where the flags report equality (CC_EQ)
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      bld.setPosition(currBB, true);
      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
      // every lane but the last continues in a new block for the next test
      if (l <= 2) {
         BasicBlock *laneBB = new BasicBlock(func);
         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
         currBB = laneBB;
      }
   }
   // reconverge at the head of the block following the texture op
   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
   return true;
}
 898
 899 bool
 900 NV50LoweringPreSSA::handleTXD(TexInstruction *i)
 901 {
 902    static const uint8_t qOps[4][2] =
 903    {
 904       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
 905       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
 906       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
 907       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
 908    };
 909    Value *def[4][4];
 910    Value *crd[3];
 911    Instruction *tex;
 912    Value *zero = bld.loadImm(bld.getSSA(), 0);
 913    int l, c;
 914    const int dim = i->tex.target.getDim();
 915
 916    handleTEX(i);
 917    i->op = OP_TEX; // no need to clone dPdx/dPdy later
 918
 919    for (c = 0; c < dim; ++c)
 920       crd[c] = bld.getScratch();
 921
 922    bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
 923    for (l = 0; l < 4; ++l) {
 924       // mov coordinates from lane l to all lanes
 925       for (c = 0; c < dim; ++c)
 926          bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
 927       // add dPdx from lane l to lanes dx
 928       for (c = 0; c < dim; ++c)
 929          bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
 930       // add dPdy from lane l to lanes dy
 931       for (c = 0; c < dim; ++c)
 932          bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
 933       // texture
 934       bld.insert(tex = cloneForward(func, i));
 935       for (c = 0; c < dim; ++c)
 936          tex->setSrc(c, crd[c]);
 937       // save results
 938       for (c = 0; i->defExists(c); ++c) {
 939          Instruction *mov;
 940          def[c][l] = bld.getSSA();
 941          mov = bld.mkMov(def[c][l], tex->getDef(c));
 942          mov->fixed = 1;
 943          mov->lanes = 1 << l;
 944       }
 945    }
 946    bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
 947
 948    for (c = 0; i->defExists(c); ++c) {
 949       Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
 950       for (l = 0; l < 4; ++l)
 951          u->setSrc(l, def[c][l]);
 952    }
 953
 954    i->bb->remove(i);
 955    return true;
 956 }
 957
 958 bool
 959 NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
 960 {
 961    handleTEX(i);
 962    bld.setPosition(i, true);
 963
 964    /* The returned values are not quite what we want:
 965     * (a) convert from s32 to f32
 966     * (b) multiply by 1/256
 967     */
 968    for (int def = 0; def < 2; ++def) {
 969       if (!i->defExists(def))
 970          continue;
 971       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
 972       bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
 973                 i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
 974    }
 975    return true;
 976 }
 977
 978 bool
 979 NV50LoweringPreSSA::handleTXQ(TexInstruction *i)
 980 {
 981    Value *ms, *ms_x, *ms_y;
 982    if (i->tex.query == TXQ_DIMS)
 983       return true;
 984    assert(i->tex.query == TXQ_TYPE);
 985    assert(i->tex.mask == 4);
 986
 987    loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
 988    bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms);
 989    i->bb->remove(i);
 990
 991    return true;
 992 }
 993
 994
 995 bool
 996 NV50LoweringPreSSA::handleSET(Instruction *i)
 997 {
 998    if (i->dType == TYPE_F32) {
 999       bld.setPosition(i, true);
1000       i->dType = TYPE_U32;
1001       bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
1002       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
1003    }
1004    return true;
1005 }
1006
1007 bool
1008 NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
1009 {
1010    Value *src0 = bld.getSSA();
1011    Value *src1 = bld.getSSA();
1012    Value *pred = bld.getScratch(1, FILE_FLAGS);
1013
1014    Value *v0 = i->getSrc(0);
1015    Value *v1 = i->getSrc(1);
1016    // XXX: these probably shouldn't be immediates in the first place ...
1017    if (v0->asImm())
1018       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1019    if (v1->asImm())
1020       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1021
1022    bld.setPosition(i, true);
1023    bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
1024    bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
1025    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1026
1027    bld.setPosition(i, false);
1028    i->op = OP_SET;
1029    i->setFlagsDef(0, pred);
1030    i->dType = TYPE_U8;
1031    i->setSrc(0, i->getSrc(2));
1032    i->setSrc(2, NULL);
1033    i->setSrc(1, bld.loadImm(NULL, 0));
1034
1035    return true;
1036 }
1037
1038 bool
1039 NV50LoweringPreSSA::handleSELP(Instruction *i)
1040 {
1041    Value *src0 = bld.getSSA();
1042    Value *src1 = bld.getSSA();
1043
1044    Value *v0 = i->getSrc(0);
1045    Value *v1 = i->getSrc(1);
1046    if (v0->asImm())
1047       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1048    if (v1->asImm())
1049       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1050
1051    bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
1052    bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
1053    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1054    delete_Instruction(prog, i);
1055    return true;
1056 }
1057
1058 bool
1059 NV50LoweringPreSSA::handleWRSV(Instruction *i)
1060 {
1061    Symbol *sym = i->getSrc(0)->asSym();
1062
1063    // these are all shader outputs, $sreg are not writeable
1064    uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
1065    if (addr >= 0x400)
1066       return false;
1067    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
1068
1069    bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
1070
1071    bld.getBB()->remove(i);
1072    return true;
1073 }
1074
1075 bool
1076 NV50LoweringPreSSA::handleCALL(Instruction *i)
1077 {
1078    if (prog->getType() == Program::TYPE_COMPUTE) {
1079       // Add implicit "thread id" argument in $r0 to the function
1080       i->setSrc(i->srcCount(), tid);
1081    }
1082    return true;
1083 }
1084
1085 bool
1086 NV50LoweringPreSSA::handlePRECONT(Instruction *i)
1087 {
1088    delete_Instruction(prog, i);
1089    return true;
1090 }
1091
1092 bool
1093 NV50LoweringPreSSA::handleCONT(Instruction *i)
1094 {
1095    i->op = OP_BRA;
1096    return true;
1097 }
1098
1099 bool
1100 NV50LoweringPreSSA::handleRDSV(Instruction *i)
1101 {
1102    Symbol *sym = i->getSrc(0)->asSym();
1103    uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
1104    Value *def = i->getDef(0);
1105    SVSemantic sv = sym->reg.data.sv.sv;
1106    int idx = sym->reg.data.sv.index;
1107
1108    if (addr >= 0x400) // mov $sreg
1109       return true;
1110
1111    switch (sv) {
1112    case SV_POSITION:
1113       assert(prog->getType() == Program::TYPE_FRAGMENT);
1114       bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
1115       break;
1116    case SV_FACE:
1117       bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
1118       if (i->dType == TYPE_F32) {
1119          bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));
1120          bld.mkOp1(OP_NEG, TYPE_S32, def, def);
1121          bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);
1122       }
1123       break;
1124    case SV_NCTAID:
1125    case SV_CTAID:
1126    case SV_NTID:
1127       if ((sv == SV_NCTAID && idx >= 2) ||
1128           (sv == SV_NTID && idx >= 3)) {
1129          bld.mkMov(def, bld.mkImm(1));
1130       } else if (sv == SV_CTAID && idx >= 2) {
1131          bld.mkMov(def, bld.mkImm(0));
1132       } else {
1133          Value *x = bld.getSSA(2);
1134          bld.mkOp1(OP_LOAD, TYPE_U16, x,
1135                    bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
1136          bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
1137       }
1138       break;
1139    case SV_TID:
1140       if (idx == 0) {
1141          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
1142       } else if (idx == 1) {
1143          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
1144          bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
1145       } else if (idx == 2) {
1146          bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
1147       } else {
1148          bld.mkMov(def, bld.mkImm(0));
1149       }
1150       break;
1151    case SV_SAMPLE_POS: {
1152       Value *off = new_LValue(func, FILE_ADDRESS);
1153       bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
1154       bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
1155       bld.mkLoad(TYPE_F32,
1156                  def,
1157                  bld.mkSymbol(
1158                        FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
1159                        TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
1160                  off);
1161       break;
1162    }
1163    default:
1164       bld.mkFetch(i->getDef(0), i->dType,
1165                   FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
1166       break;
1167    }
1168    bld.getBB()->remove(i);
1169    return true;
1170 }
1171
1172 bool
1173 NV50LoweringPreSSA::handleDIV(Instruction *i)
1174 {
1175    if (!isFloatType(i->dType))
1176       return true;
1177    bld.setPosition(i, false);
1178    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
1179    i->op = OP_MUL;
1180    i->setSrc(1, rcp->getDef(0));
1181    return true;
1182 }
1183
1184 bool
1185 NV50LoweringPreSSA::handleSQRT(Instruction *i)
1186 {
1187    Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
1188                                 bld.getSSA(), i->getSrc(0));
1189    i->op = OP_MUL;
1190    i->setSrc(1, rsq->getDef(0));
1191
1192    return true;
1193 }
1194
1195 bool
1196 NV50LoweringPreSSA::handlePOW(Instruction *i)
1197 {
1198    LValue *val = bld.getScratch();
1199
1200    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
1201    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
1202    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
1203
1204    i->op = OP_EX2;
1205    i->setSrc(0, val);
1206    i->setSrc(1, NULL);
1207
1208    return true;
1209 }
1210
1211 bool
1212 NV50LoweringPreSSA::handleEXPORT(Instruction *i)
1213 {
1214    if (prog->getType() == Program::TYPE_FRAGMENT) {
1215       if (i->getIndirect(0, 0)) {
1216          // TODO: redirect to l[] here, load to GPRs at exit
1217          return false;
1218       } else {
1219          int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
1220
1221          i->op = OP_MOV;
1222          i->subOp = NV50_IR_SUBOP_MOV_FINAL;
1223          i->src(0).set(i->src(1));
1224          i->setSrc(1, NULL);
1225          i->setDef(0, new_LValue(func, FILE_GPR));
1226          i->getDef(0)->reg.data.id = id;
1227
1228          prog->maxGPR = MAX2(prog->maxGPR, id);
1229       }
1230    }
1231    return true;
1232 }
1233
1234 // Handle indirect addressing in geometry shaders:
1235 //
1236 // ld $r0 a[$a1][$a2+k] ->
1237 // ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
1238 //
1239 bool
1240 NV50LoweringPreSSA::handleLOAD(Instruction *i)
1241 {
1242    ValueRef src = i->src(0);
1243
1244    if (src.isIndirect(1)) {
1245       assert(prog->getType() == Program::TYPE_GEOMETRY);
1246       Value *addr = i->getIndirect(0, 1);
1247
1248       if (src.isIndirect(0)) {
1249          // base address is in an address register, so move to a GPR
1250          Value *base = bld.getScratch();
1251          bld.mkMov(base, addr);
1252
1253          Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
1254          Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
1255          Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1256                                     i->getIndirect(0, 0), bld.mkImm(2));
1257
1258          // Calculate final address: addr = base + attr*vstride; use 16-bit
1259          // multiplication since 32-bit would be lowered to multiple
1260          // instructions, and we only need the low 16 bits of the result
1261          Value *a[2], *b[2];
1262          bld.mkSplit(a, 2, attrib);
1263          bld.mkSplit(b, 2, vstride);
1264          Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
1265                                  base);
1266
1267          // move address from GPR into an address register
1268          addr = bld.getSSA(2, FILE_ADDRESS);
1269          bld.mkMov(addr, sum);
1270       }
1271
1272       i->setIndirect(0, 1, NULL);
1273       i->setIndirect(0, 0, addr);
1274    }
1275
1276    return true;
1277 }
1278
1279 bool
1280 NV50LoweringPreSSA::handlePFETCH(Instruction *i)
1281 {
1282    assert(prog->getType() == Program::TYPE_GEOMETRY);
1283
1284    // NOTE: cannot use getImmediate here, not in SSA form yet, move to
1285    // later phase if that assertion ever triggers:
1286
1287    ImmediateValue *imm = i->getSrc(0)->asImm();
1288    assert(imm);
1289
1290    assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens
1291
1292    if (i->srcExists(1)) {
1293       // indirect addressing of vertex in primitive space
1294
1295       LValue *val = bld.getScratch();
1296       Value *ptr = bld.getSSA(2, FILE_ADDRESS);
1297       bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
1298       bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);
1299
1300       // NOTE: PFETCH directly to an $aX only works with direct addressing
1301       i->op = OP_SHL;
1302       i->setSrc(0, val);
1303       i->setSrc(1, bld.mkImm(0));
1304    }
1305
1306    return true;
1307 }
1308
1309 // Set flags according to predicate and make the instruction read $cX.
1310 void
1311 NV50LoweringPreSSA::checkPredicate(Instruction *insn)
1312 {
1313    Value *pred = insn->getPredicate();
1314    Value *cdst;
1315
1316    // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
1317    if (!pred ||
1318        pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
1319       return;
1320
1321    cdst = bld.getSSA(1, FILE_FLAGS);
1322
1323    bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
1324
1325    insn->setPredicate(insn->cc, cdst);
1326 }
1327
1328 //
1329 // - add quadop dance for texturing
1330 // - put FP outputs in GPRs
1331 // - convert instruction sequences
1332 //
1333 bool
1334 NV50LoweringPreSSA::visit(Instruction *i)
1335 {
1336    bld.setPosition(i, false);
1337
1338    if (i->cc != CC_ALWAYS)
1339       checkPredicate(i);
1340
1341    switch (i->op) {
1342    case OP_TEX:
1343    case OP_TXF:
1344    case OP_TXG:
1345       return handleTEX(i->asTex());
1346    case OP_TXB:
1347       return handleTXB(i->asTex());
1348    case OP_TXL:
1349       return handleTXL(i->asTex());
1350    case OP_TXD:
1351       return handleTXD(i->asTex());
1352    case OP_TXLQ:
1353       return handleTXLQ(i->asTex());
1354    case OP_TXQ:
1355       return handleTXQ(i->asTex());
1356    case OP_EX2:
1357       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
1358       i->setSrc(0, i->getDef(0));
1359       break;
1360    case OP_SET:
1361       return handleSET(i);
1362    case OP_SLCT:
1363       return handleSLCT(i->asCmp());
1364    case OP_SELP:
1365       return handleSELP(i);
1366    case OP_POW:
1367       return handlePOW(i);
1368    case OP_DIV:
1369       return handleDIV(i);
1370    case OP_SQRT:
1371       return handleSQRT(i);
1372    case OP_EXPORT:
1373       return handleEXPORT(i);
1374    case OP_LOAD:
1375       return handleLOAD(i);
1376    case OP_RDSV:
1377       return handleRDSV(i);
1378    case OP_WRSV:
1379       return handleWRSV(i);
1380    case OP_CALL:
1381       return handleCALL(i);
1382    case OP_PRECONT:
1383       return handlePRECONT(i);
1384    case OP_CONT:
1385       return handleCONT(i);
1386    case OP_PFETCH:
1387       return handlePFETCH(i);
1388    default:
1389       break;
1390    }
1391    return true;
1392 }
1393
1394 bool
1395 TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
1396 {
1397    bool ret = false;
1398
1399    if (stage == CG_STAGE_PRE_SSA) {
1400       NV50LoweringPreSSA pass(prog);
1401       ret = pass.run(prog, false, true);
1402    } else
1403    if (stage == CG_STAGE_SSA) {
1404       if (!prog->targetPriv)
1405          prog->targetPriv = new std::list<Instruction *>();
1406       NV50LegalizeSSA pass(prog);
1407       ret = pass.run(prog, false, true);
1408    } else
1409    if (stage == CG_STAGE_POST_RA) {
1410       NV50LegalizePostRA pass;
1411       ret = pass.run(prog, false, true);
1412       if (prog->targetPriv)
1413          delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
1414    }
1415    return ret;
1416 }
1417
1418 } // namespace nv50_ir