src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp

   1 /*
   2  * Copyright 2011 Christoph Bumiller
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 #include "codegen/nv50_ir.h"
  24 #include "codegen/nv50_ir_build_util.h"
  25
  26 #include "codegen/nv50_ir_target_nv50.h"
  27
  28 namespace nv50_ir {
  29
  30 // nv50 doesn't support 32 bit integer multiplication
  31 //
  32 //       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
  33 // -------------------
  34 //    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
  35 // ah*bh 00 00                 (           carry1) << 16 + ( carry2)
  36 //       al*bl
  37 //    ah*bl 00
  38 //
  39 // fffe0001 + fffe0001
  40 //
  41 // Note that this sort of splitting doesn't work for signed values, so we
  42 // compute the sign on those manually and then perform an unsigned multiply.
  43 static bool
  44 expandIntegerMUL(BuildUtil *bld, Instruction *mul)
  45 {
  46    const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
  47
  48    DataType fTy; // full type
  49    switch (mul->sType) {
  50    case TYPE_S32: fTy = TYPE_U32; break;
  51    case TYPE_S64: fTy = TYPE_U64; break;
  52    default: fTy = mul->sType; break;
  53    }
  54
  55    DataType hTy; // half type
  56    switch (fTy) {
  57    case TYPE_U32: hTy = TYPE_U16; break;
  58    case TYPE_U64: hTy = TYPE_U32; break;
  59    default:
  60       return false;
  61    }
  62    unsigned int fullSize = typeSizeof(fTy);
  63    unsigned int halfSize = typeSizeof(hTy);
  64
  65    Instruction *i[9];
  66
  67    bld->setPosition(mul, true);
  68
  69    Value *s[2];
  70    Value *a[2], *b[2];
  71    Value *t[4];
  72    for (int j = 0; j < 4; ++j)
  73       t[j] = bld->getSSA(fullSize);
  74
  75    s[0] = mul->getSrc(0);
  76    s[1] = mul->getSrc(1);
  77
  78    if (isSignedType(mul->sType)) {
  79       s[0] = bld->getSSA(fullSize);
  80       s[1] = bld->getSSA(fullSize);
  81       bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
  82       bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
  83    }
  84
  85    // split sources into halves
  86    i[0] = bld->mkSplit(a, halfSize, s[0]);
  87    i[1] = bld->mkSplit(b, halfSize, s[1]);
  88
  89    i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
  90    i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
  91    i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
  92    i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
  93
  94    if (highResult) {
  95       Value *c[2];
  96       Value *r[5];
  97       Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
  98       c[0] = bld->getSSA(1, FILE_FLAGS);
  99       c[1] = bld->getSSA(1, FILE_FLAGS);
 100       for (int j = 0; j < 5; ++j)
 101          r[j] = bld->getSSA(fullSize);
 102
 103       i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
 104       i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
 105       bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
 106       bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
 107       i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);
 108
 109       // set carry defs / sources
 110       i[3]->setFlagsDef(1, c[0]);
 111       // actual result required in negative case, but ignored for
 112       // unsigned. for some reason the compiler ends up dropping the whole
 113       // instruction if the destination is unused but the flags are.
 114       if (isSignedType(mul->sType))
 115          i[4]->setFlagsDef(1, c[1]);
 116       else
 117          i[4]->setFlagsDef(0, c[1]);
 118       i[6]->setPredicate(CC_C, c[0]);
 119       i[5]->setFlagsSrc(3, c[1]);
 120
 121       if (isSignedType(mul->sType)) {
 122          Value *cc[2];
 123          Value *rr[7];
 124          Value *one = bld->getSSA(fullSize);
 125          bld->loadImm(one, 1);
 126          for (int j = 0; j < 7; j++)
 127             rr[j] = bld->getSSA(fullSize);
 128
 129          // NOTE: this logic uses predicates because splitting basic blocks is
 130          // ~impossible during the SSA phase. The RA relies on a correlation
 131          // between edge order and phi node sources.
 132
 133          // Set the sign of the result based on the inputs
 134          bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
 135             ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));
 136
 137          // 1s complement of 64-bit value
 138          bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
 139             ->setPredicate(CC_S, cc[0]);
 140          bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
 141             ->setPredicate(CC_S, cc[0]);
 142
 143          // add to low 32-bits, keep track of the carry
 144          Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
 145          n->setPredicate(CC_S, cc[0]);
 146          n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));
 147
 148          // If there was a carry, add 1 to the upper 32 bits
 149          // XXX: These get executed even if they shouldn't be
 150          bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
 151             ->setPredicate(CC_C, cc[1]);
 152          bld->mkMov(rr[3], rr[0])
 153             ->setPredicate(CC_NC, cc[1]);
 154          bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);
 155
 156          // Merge the results from the negative and non-negative paths
 157          bld->mkMov(rr[5], rr[4])
 158             ->setPredicate(CC_S, cc[0]);
 159          bld->mkMov(rr[6], r[4])
 160             ->setPredicate(CC_NS, cc[0]);
 161          bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
 162       } else {
 163          bld->mkMov(mul->getDef(0), r[4]);
 164       }
 165    } else {
 166       bld->mkMov(mul->getDef(0), t[3]);
 167    }
 168    delete_Instruction(bld->getProgram(), mul);
 169
 170    for (int j = 2; j <= (highResult ? 5 : 4); ++j)
 171       if (i[j])
 172          i[j]->sType = hTy;
 173
 174    return true;
 175 }
 176
 177 #define QOP_ADD  0
 178 #define QOP_SUBR 1
 179 #define QOP_SUB  2
 180 #define QOP_MOV2 3
 181
 182 //             UL UR LL LR
 183 #define QUADOP(q, r, s, t)            \
 184    ((QOP_##q << 6) | (QOP_##r << 4) | \
 185     (QOP_##s << 2) | (QOP_##t << 0))
 186
// Pass that fixes up instructions after register allocation: removes no-ops,
// emulates PRERET on chipsets below 0xa0, splits 64 bit operations and
// replaces zero immediates by a register.
class NV50LegalizePostRA : public Pass
{
private:
   // Creates r63 and patches the saved output writes (prog->targetPriv)
   // back into the instructions producing the written values.
   virtual bool visit(Function *);
   // Per-block legalization of individual instructions.
   virtual bool visit(BasicBlock *);

   // Rewrites a PRERET into the emulation sequence (see comment at the
   // definition).
   void handlePRERET(FlowInstruction *);
   // Replaces every immediate source that is 0 by r63.
   void replaceZero(Instruction *);

   LValue *r63; // GPR with id 63, set up in visit(Function *)
};
 198
 199 bool
 200 NV50LegalizePostRA::visit(Function *fn)
 201 {
 202    Program *prog = fn->getProgram();
 203
 204    r63 = new_LValue(fn, FILE_GPR);
 205    r63->reg.data.id = 63;
 206
 207    // this is actually per-program, but we can do it all on visiting main()
 208    std::list<Instruction *> *outWrites =
 209       reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
 210
 211    if (outWrites) {
 212       for (std::list<Instruction *>::iterator it = outWrites->begin();
 213            it != outWrites->end(); ++it)
 214          (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
 215       // instructions will be deleted on exit
 216       outWrites->clear();
 217    }
 218
 219    return true;
 220 }
 221
 222 void
 223 NV50LegalizePostRA::replaceZero(Instruction *i)
 224 {
 225    for (int s = 0; i->srcExists(s); ++s) {
 226       ImmediateValue *imm = i->getSrc(s)->asImm();
 227       if (imm && imm->reg.data.u64 == 0)
 228          i->setSrc(s, r63);
 229    }
 230 }
 231
 232 // Emulate PRERET: jump to the target and call to the origin from there
 233 //
 234 // WARNING: atm only works if BBs are affected by at most a single PRERET
 235 //
 236 // BB:0
 237 // preret BB:3
 238 // (...)
 239 // BB:3
 240 // (...)
 241 //             --->
 242 // BB:0
 243 // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
 244 // (...)
 245 // BB:3
 246 // bra BB:3 + n1 (skip the call)
 247 // call BB:0 + n2 (skip bra at beginning of BB:0)
 248 // (...)
 249 void
 250 NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
 251 {
 252    BasicBlock *bbE = pre->bb;
 253    BasicBlock *bbT = pre->target.bb;
 254
 255    pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
 256    bbE->remove(pre);
 257    bbE->insertHead(pre);
 258
 259    Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
 260    Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
 261
 262    bbT->insertHead(call);
 263    bbT->insertHead(skip);
 264
 265    // NOTE: maybe split blocks to prevent the instructions from moving ?
 266
 267    skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
 268    call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
 269 }
 270
 271 bool
 272 NV50LegalizePostRA::visit(BasicBlock *bb)
 273 {
 274    Instruction *i, *next;
 275
 276    // remove pseudo operations and non-fixed no-ops, split 64 bit operations
 277    for (i = bb->getFirst(); i; i = next) {
 278       next = i->next;
 279       if (i->isNop()) {
 280          bb->remove(i);
 281       } else
 282       if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
 283          handlePRERET(i->asFlow());
 284       } else {
 285          // TODO: We will want to do this before register allocation,
 286          // since have to use a $c register for the carry flag.
 287          if (typeSizeof(i->dType) == 8) {
 288             Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
 289             if (hi)
 290                next = hi;
 291          }
 292
 293          if (i->op != OP_MOV && i->op != OP_PFETCH &&
 294              i->op != OP_BAR &&
 295              (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
 296             replaceZero(i);
 297       }
 298    }
 299    if (!bb->getEntry())
 300       return true;
 301
 302    return true;
 303 }
 304
// Legalization pass that lowers integer DIV/MOD/MUL/MAD, fixes up
// instructions defining address registers and, optionally, collects output
// writes (EXPORT) for later propagation.
class NV50LegalizeSSA : public Pass
{
public:
   NV50LegalizeSSA(Program *);

   virtual bool visit(BasicBlock *bb);

private:
   // Saves an EXPORT in outWrites and removes it from its block when the
   // producer of the exported value qualifies.
   void propagateWriteToOutput(Instruction *);
   // Lower 32 bit integer division / modulo.
   void handleDIV(Instruction *);
   void handleMOD(Instruction *);
   // Lower integer MUL / MAD wider than 16 bit via expandIntegerMUL().
   void handleMUL(Instruction *);
   // Legalize an instruction whose first def is in FILE_ADDRESS.
   void handleAddrDef(Instruction *);

   // True for SHL(GPR, 0).
   inline bool isARL(const Instruction *) const;

   BuildUtil bld;

   // Taken from prog->targetPriv for vertex / geometry programs at
   // optLevel >= 2, NULL otherwise.
   std::list<Instruction *> *outWrites;
};
 325
 326 NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
 327 {
 328    bld.setProgram(prog);
 329
 330    if (prog->optLevel >= 2 &&
 331        (prog->getType() == Program::TYPE_GEOMETRY ||
 332         prog->getType() == Program::TYPE_VERTEX))
 333       outWrites =
 334          reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
 335    else
 336       outWrites = NULL;
 337 }
 338
 339 void
 340 NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
 341 {
 342    if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
 343       return;
 344
 345    // check def instruction can store
 346    Instruction *di = st->getSrc(1)->defs.front()->getInsn();
 347
 348    // TODO: move exports (if beneficial) in common opt pass
 349    if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
 350       return;
 351
 352    for (int s = 0; di->srcExists(s); ++s)
 353       if (di->src(s).getFile() == FILE_IMMEDIATE)
 354          return;
 355
 356    if (prog->getType() == Program::TYPE_GEOMETRY) {
 357       // Only propagate output writes in geometry shaders when we can be sure
 358       // that we are propagating to the same output vertex.
 359       if (di->bb != st->bb)
 360          return;
 361       Instruction *i;
 362       for (i = di; i != st; i = i->next) {
 363          if (i->op == OP_EMIT || i->op == OP_RESTART)
 364             return;
 365       }
 366       assert(i); // st after di
 367    }
 368
 369    // We cannot set defs to non-lvalues before register allocation, so
 370    // save & remove (to save registers) the exports and replace later.
 371    outWrites->push_back(st);
 372    st->bb->remove(st);
 373 }
 374
 375 bool
 376 NV50LegalizeSSA::isARL(const Instruction *i) const
 377 {
 378    ImmediateValue imm;
 379
 380    if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
 381       return false;
 382    if (!i->src(1).getImmediate(imm))
 383       return false;
 384    return imm.isInteger(0);
 385 }
 386
 387 void
 388 NV50LegalizeSSA::handleAddrDef(Instruction *i)
 389 {
 390    Instruction *arl;
 391
 392    i->getDef(0)->reg.size = 2; // $aX are only 16 bit
 393
 394    // PFETCH can always write to $a
 395    if (i->op == OP_PFETCH)
 396       return;
 397    // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
 398    if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
 399       if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
 400          return;
 401       if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
 402          return;
 403    }
 404
 405    // turn $a sources into $r sources (can't operate on $a)
 406    for (int s = 0; i->srcExists(s); ++s) {
 407       Value *a = i->getSrc(s);
 408       Value *r;
 409       if (a->reg.file == FILE_ADDRESS) {
 410          if (a->getInsn() && isARL(a->getInsn())) {
 411             i->setSrc(s, a->getInsn()->getSrc(0));
 412          } else {
 413             bld.setPosition(i, false);
 414             r = bld.getSSA();
 415             bld.mkMov(r, a);
 416             i->setSrc(s, r);
 417          }
 418       }
 419    }
 420    if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
 421       return;
 422
 423    // turn result back into $a
 424    bld.setPosition(i, true);
 425    arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
 426    i->setDef(0, arl->getSrc(0));
 427 }
 428
 429 void
 430 NV50LegalizeSSA::handleMUL(Instruction *mul)
 431 {
 432    if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
 433       return;
 434    Value *def = mul->getDef(0);
 435    Value *pred = mul->getPredicate();
 436    CondCode cc = mul->cc;
 437    if (pred)
 438       mul->setPredicate(CC_ALWAYS, NULL);
 439
 440    if (mul->op == OP_MAD) {
 441       Instruction *add = mul;
 442       bld.setPosition(add, false);
 443       Value *res = cloneShallow(func, mul->getDef(0));
 444       mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
 445       add->op = OP_ADD;
 446       add->setSrc(0, mul->getDef(0));
 447       add->setSrc(1, add->getSrc(2));
 448       for (int s = 2; add->srcExists(s); ++s)
 449          add->setSrc(s, NULL);
 450       mul->subOp = add->subOp;
 451       add->subOp = 0;
 452    }
 453    expandIntegerMUL(&bld, mul);
 454    if (pred)
 455       def->getInsn()->setPredicate(cc, pred);
 456 }
 457
 458 // Use f32 division: first compute an approximate result, use it to reduce
 459 // the dividend, which should then be representable as f32, divide the reduced
 460 // dividend, and add the quotients.
//
// Only U32 and S32 divisions are lowered; other types are left untouched.
// The replacement code is emitted in front of @div, and @div itself is
// turned into the final SUB (unsigned) or UNION (signed) of the sequence.
void
NV50LegalizeSSA::handleDIV(Instruction *div)
{
   const DataType ty = div->sType;

   if (ty != TYPE_U32 && ty != TYPE_S32)
      return;

   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;

   bld.setPosition(div, false);

   // af / bf: f32 conversions of the operands, a / b: integer operands
   Value *a, *af = bld.getSSA();
   Value *b, *bf = bld.getSSA();

   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));

   if (isSignedType(ty)) {
      // work on absolute values, the sign is fixed up at the end
      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      a = bld.getSSA();
      b = bld.getSSA();
      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
   } else {
      a = div->getSrc(0);
      b = div->getSrc(1);
   }

   // bf = 1 / bf, then an integer (U32) add of -2 on the float's bits
   // NOTE(review): presumably this lowers the reciprocal slightly so the
   // quotient estimate never overshoots - confirm.
   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));

   // first quotient estimate: q0 = (int)(af * bf), rounding towards zero
   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;

   // get error of 1st result
   // (aRf is the integer difference a - q0 * b, aR its f32 conversion)
   expandIntegerMUL(&bld,
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);

   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);

   // second estimate from the reduced dividend: qR = (int)(aR * bf)
   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
      ->rnd = ROUND_Z;
   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients

   // correction: if modulus >= divisor, add 1
   // NOTE(review): the "add 1" is implemented as q - s below, which assumes
   // SET yields -1 when the comparison holds - confirm.
   expandIntegerMUL(&bld,
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
   if (!isSignedType(ty)) {
      // reuse div as the final q - s
      div->op = OP_SUB;
      div->setSrc(0, q);
      div->setSrc(1, s);
   } else {
      t = q;
      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
      s = bld.getSSA();
      t = bld.getSSA();
      // fix the sign
      // (flags come from XOR of the original sources; negate q if the sign
      // flag is set, copy it otherwise, and let div merge the two)
      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);

      div->op = OP_UNION;
      div->setSrc(0, s);
      div->setSrc(1, t);
   }
}
 534
 535 void
 536 NV50LegalizeSSA::handleMOD(Instruction *mod)
 537 {
 538    if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
 539       return;
 540    bld.setPosition(mod, false);
 541
 542    Value *q = bld.getSSA();
 543    Value *m = bld.getSSA();
 544
 545    bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
 546    handleDIV(q->getInsn());
 547
 548    bld.setPosition(mod, false);
 549    expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
 550
 551    mod->op = OP_SUB;
 552    mod->setSrc(1, m);
 553 }
 554
 555 bool
 556 NV50LegalizeSSA::visit(BasicBlock *bb)
 557 {
 558    Instruction *insn, *next;
 559    // skipping PHIs (don't pass them to handleAddrDef) !
 560    for (insn = bb->getEntry(); insn; insn = next) {
 561       next = insn->next;
 562
 563       if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
 564          handleAddrDef(insn);
 565
 566       switch (insn->op) {
 567       case OP_EXPORT:
 568          if (outWrites)
 569             propagateWriteToOutput(insn);
 570          break;
 571       case OP_DIV:
 572          handleDIV(insn);
 573          break;
 574       case OP_MOD:
 575          handleMOD(insn);
 576          break;
 577       case OP_MAD:
 578       case OP_MUL:
 579          handleMUL(insn);
 580          break;
 581       default:
 582          break;
 583       }
 584    }
 585    return true;
 586 }
 587
// Lowering pass for nv50; the name indicates it runs before SSA construction.
// The handle* members each lower one kind of instruction.
class NV50LoweringPreSSA : public Pass
{
public:
   NV50LoweringPreSSA(Program *);

private:
   virtual bool visit(Instruction *);
   // Adds the implicit thread id argument for compute programs.
   virtual bool visit(Function *);

   bool handleRDSV(Instruction *);
   bool handleWRSV(Instruction *);

   bool handlePFETCH(Instruction *);
   bool handleEXPORT(Instruction *);
   bool handleLOAD(Instruction *);

   bool handleDIV(Instruction *);
   bool handleSQRT(Instruction *);
   bool handlePOW(Instruction *);

   bool handleSET(Instruction *);
   bool handleSLCT(CmpInstruction *);
   bool handleSELP(Instruction *);

   // Common texture lowering, also called first by TXB / TXL / TXD / TXLQ.
   bool handleTEX(TexInstruction *);
   bool handleTXB(TexInstruction *); // I really
   bool handleTXL(TexInstruction *); // hate
   bool handleTXD(TexInstruction *); // these 3
   bool handleTXLQ(TexInstruction *);

   bool handleCALL(Instruction *);
   bool handlePRECONT(Instruction *);
   bool handleCONT(Instruction *);

   void checkPredicate(Instruction *);
   // Loads the two per-texture MS values found at constant buffer offset
   // @off (plus driver / program type dependent bases) and their sum.
   void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
   // Loads the x / y deltas for sample @s at MS level @ms.
   void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);

private:
   const Target *const targ; // prog->getTarget(), fixed at construction

   BuildUtil bld;

   // NULL until visit(Function *) sets it for compute programs
   Value *tid;
};
 633
 634 NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
 635    targ(prog->getTarget()), tid(NULL)
 636 {
 637    bld.setProgram(prog);
 638 }
 639
 640 bool
 641 NV50LoweringPreSSA::visit(Function *f)
 642 {
 643    BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
 644
 645    if (prog->getType() == Program::TYPE_COMPUTE) {
 646       // Add implicit "thread id" argument in $r0 to the function
 647       Value *arg = new_LValue(func, FILE_GPR);
 648       arg->reg.data.id = 0;
 649       f->ins.push_back(arg);
 650
 651       bld.setPosition(root, false);
 652       tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
 653    }
 654
 655    return true;
 656 }
 657
 658 void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
 659                                        Value **ms_x, Value **ms_y) {
 660    // This loads the texture-indexed ms setting from the constant buffer
 661    Value *tmp = new_LValue(func, FILE_GPR);
 662    uint8_t b = prog->driver->io.resInfoCBSlot;
 663    off += prog->driver->io.suInfoBase;
 664    if (prog->getType() > Program::TYPE_VERTEX)
 665       off += 16 * 2 * 4;
 666    if (prog->getType() > Program::TYPE_GEOMETRY)
 667       off += 16 * 2 * 4;
 668    *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
 669                              FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
 670    *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
 671                              FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
 672    *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);
 673 }
 674
 675 void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) {
 676    // Given a MS level, and a sample id, compute the delta x/y
 677    uint8_t b = prog->driver->io.msInfoCBSlot;
 678    Value *off = new_LValue(func, FILE_ADDRESS), *t = new_LValue(func, FILE_GPR);
 679
 680    // The required information is at mslevel * 16 * 4 + sample * 8
 681    // = (mslevel * 8 + sample) * 8
 682    bld.mkOp2(OP_SHL,
 683              TYPE_U32,
 684              off,
 685              bld.mkOp2v(OP_ADD, TYPE_U32, t,
 686                         bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)),
 687                         s),
 688              bld.mkImm(3));
 689    *dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
 690                            FILE_MEMORY_CONST, b, TYPE_U32,
 691                            prog->driver->io.msInfoBase), off);
 692    *dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
 693                            FILE_MEMORY_CONST, b, TYPE_U32,
 694                            prog->driver->io.msInfoBase + 4), off);
 695 }
 696
 697 bool
 698 NV50LoweringPreSSA::handleTEX(TexInstruction *i)
 699 {
 700    const int arg = i->tex.target.getArgCount();
 701    const int dref = arg;
 702    const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
 703
 704    // handle MS, which means looking up the MS params for this texture, and
 705    // adjusting the input coordinates to point at the right sample.
 706    if (i->tex.target.isMS()) {
 707       Value *x = i->getSrc(0);
 708       Value *y = i->getSrc(1);
 709       Value *s = i->getSrc(arg - 1);
 710       Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),
 711          *ms, *ms_x, *ms_y, *dx, *dy;
 712
 713       i->tex.target.clearMS();
 714
 715       loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
 716       loadMsInfo(ms, s, &dx, &dy);
 717
 718       bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
 719       bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
 720       bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
 721       bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
 722       i->setSrc(0, tx);
 723       i->setSrc(1, ty);
 724       i->setSrc(arg - 1, bld.loadImm(NULL, 0));
 725    }
 726
 727    // dref comes before bias/lod
 728    if (i->tex.target.isShadow())
 729       if (i->op == OP_TXB || i->op == OP_TXL)
 730          i->swapSources(dref, lod);
 731
 732    if (i->tex.target.isArray()) {
 733       if (i->op != OP_TXF) {
 734          // array index must be converted to u32, but it's already an integer
 735          // for TXF
 736          Value *layer = i->getSrc(arg - 1);
 737          LValue *src = new_LValue(func, FILE_GPR);
 738          bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
 739          bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
 740          i->setSrc(arg - 1, src);
 741       }
 742       if (i->tex.target.isCube() && i->srcCount() > 4) {
 743          std::vector<Value *> acube, a2d;
 744          int c;
 745
 746          acube.resize(4);
 747          for (c = 0; c < 4; ++c)
 748             acube[c] = i->getSrc(c);
 749          a2d.resize(4);
 750          for (c = 0; c < 3; ++c)
 751             a2d[c] = new_LValue(func, FILE_GPR);
 752          a2d[3] = NULL;
 753
 754          bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
 755                    a2d, acube)->asTex()->tex.mask = 0x7;
 756
 757          for (c = 0; c < 3; ++c)
 758             i->setSrc(c, a2d[c]);
 759          for (; i->srcExists(c + 1); ++c)
 760             i->setSrc(c, i->getSrc(c + 1));
 761          i->setSrc(c, NULL);
 762          assert(c <= 4);
 763
 764          i->tex.target = i->tex.target.isShadow() ?
 765             TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
 766       }
 767    }
 768
 769    // texel offsets are 3 immediate fields in the instruction,
 770    // nv50 cannot do textureGatherOffsets
 771    assert(i->tex.useOffsets <= 1);
 772    if (i->tex.useOffsets) {
 773       for (int c = 0; c < 3; ++c) {
 774          ImmediateValue val;
 775          assert(i->offset[0][c].getImmediate(val));
 776          i->tex.offset[c] = val.reg.data.u32;
 777          i->offset[0][c].set(NULL);
 778       }
 779    }
 780
 781    return true;
 782 }
 783
 784 // Bias must be equal for all threads of a quad or lod calculation will fail.
 785 //
 786 // The lanes of a quad are grouped by the bit in the condition register they
 787 // have set, which is selected by differing bias values.
 788 // Move the input values for TEX into a new register set for each group and
 789 // execute TEX only for a specific group.
 790 // We always need to use 4 new registers for the inputs/outputs because the
 791 // implicitly calculated derivatives must be correct.
 792 //
 793 // TODO: move to SSA phase so we can easily determine whether bias is constant
bool
NV50LoweringPreSSA::handleTXB(TexInstruction *i)
{
   // condition codes the 4 per-group TEX clones are predicated on
   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
   int l, d;

   handleTEX(i);
   // the bias is the source right after the regular texture arguments
   Value *bias = i->getSrc(i->tex.target.getArgCount());
   if (bias->isUniform())
      return true;

   // cond merges up to 4 values: the immediate 1 as source 0, and one bit
   // (1 << l) per lane l = 1..3, added in the loop below
   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
                                 bld.loadImm(NULL, 1));
   bld.setPosition(cond, false);

   for (l = 1; l < 4; ++l) {
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *bit = bld.getSSA();
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      Value *imm = bld.loadImm(NULL, (1 << l));
      // quad operation on bias against lane l's bias, writing flags to pred
      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
      // bit = 1 << l, only where the flags compare equal
      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
      cond->setSrc(l, bit);
   }
   // convert the merged bits into a flags value, placed after cond
   Value *flags = bld.getScratch(1, FILE_FLAGS);
   bld.setPosition(cond, true);
   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));

   // one clone of the texture instruction per condition code
   Instruction *tex[4];
   for (l = 0; l < 4; ++l) {
      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
      bld.insert(tex[l]);
   }

   // res[l][d]: result d of clone l; clones 1..3 are copied into fresh
   // values under the same predicate as the clone
   Value *res[4][4];
   for (d = 0; i->defExists(d); ++d)
      res[0][d] = tex[0]->getDef(d);
   for (l = 1; l < 4; ++l) {
      for (d = 0; tex[l]->defExists(d); ++d) {
         res[l][d] = cloneShallow(func, res[0][d]);
         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
      }
   }

   // merge the 4 per-group results into the original defs
   for (d = 0; i->defExists(d); ++d) {
      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
      for (l = 0; l < 4; ++l)
         dst->setSrc(l, res[l][d]);
   }
   // the original instruction has been fully replaced
   delete_Instruction(prog, i);
   return true;
}
 846
 847 // LOD must be equal for all threads of a quad.
 848 // Unlike with TXB, here we can just diverge since there's no LOD calculation
 849 // that would require all 4 threads' sources to be set up properly.
 850 bool
 851 NV50LoweringPreSSA::handleTXL(TexInstruction *i)
 852 {
 853    handleTEX(i);
 854    Value *lod = i->getSrc(i->tex.target.getArgCount());
 855    if (lod->isUniform())
 856       return true;
 857
 858    BasicBlock *currBB = i->bb;
 859    BasicBlock *texiBB = i->bb->splitBefore(i, false);
 860    BasicBlock *joinBB = i->bb->splitAfter(i);
 861
 862    bld.setPosition(currBB, true);
 863    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
 864
 865    for (int l = 0; l <= 3; ++l) {
 866       const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
 867       Value *pred = bld.getScratch(1, FILE_FLAGS);
 868       bld.setPosition(currBB, true);
 869       bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
 870       bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
 871       currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
 872       if (l <= 2) {
 873          BasicBlock *laneBB = new BasicBlock(func);
 874          currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
 875          currBB = laneBB;
 876       }
 877    }
 878    bld.setPosition(joinBB, false);
 879    bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
 880    return true;
 881 }
 882
// Lowers a texture instruction with explicit derivatives: for each of the 4
// lanes of a quad, that lane's coordinates are broadcast, dPdx / dPdy are
// applied through quad operations, and a plain TEX clone is emitted whose
// results are kept for that lane only.  The per-lane results are merged
// into the original defs and the original instruction is removed.
bool
NV50LoweringPreSSA::handleTXD(TexInstruction *i)
{
   // quad operation codes per lane: [l][0] applies dPdx, [l][1] applies dPdy
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4]; // indexed [result component][lane]
   Value *crd[3];    // scratch coordinates, reused for every lane
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim();

   handleTEX(i);
   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   // the per-lane sequences are bracketed by QUADON / QUADPOP
   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, crd[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         // fixed mov restricted to lane l through its lane mask
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   // merge the 4 per-lane copies of each result into the original def
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   // the original instruction is only unlinked from its block here
   i->bb->remove(i);
   return true;
}
 941
 942 bool
 943 NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
 944 {
 945    handleTEX(i);
 946    bld.setPosition(i, true);
 947
 948    /* The returned values are not quite what we want:
 949     * (a) convert from s32 to f32
 950     * (b) multiply by 1/256
 951     */
 952    for (int def = 0; def < 2; ++def) {
 953       if (!i->defExists(def))
 954          continue;
 955       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
 956       bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
 957                 i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
 958    }
 959    return true;
 960 }
 961
 962 bool
 963 NV50LoweringPreSSA::handleSET(Instruction *i)
 964 {
 965    if (i->dType == TYPE_F32) {
 966       bld.setPosition(i, true);
 967       i->dType = TYPE_U32;
 968       bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
 969       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
 970    }
 971    return true;
 972 }
 973
 974 bool
 975 NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
 976 {
 977    Value *src0 = bld.getSSA();
 978    Value *src1 = bld.getSSA();
 979    Value *pred = bld.getScratch(1, FILE_FLAGS);
 980
 981    Value *v0 = i->getSrc(0);
 982    Value *v1 = i->getSrc(1);
 983    // XXX: these probably shouldn't be immediates in the first place ...
 984    if (v0->asImm())
 985       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
 986    if (v1->asImm())
 987       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
 988
 989    bld.setPosition(i, true);
 990    bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
 991    bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
 992    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
 993
 994    bld.setPosition(i, false);
 995    i->op = OP_SET;
 996    i->setFlagsDef(0, pred);
 997    i->dType = TYPE_U8;
 998    i->setSrc(0, i->getSrc(2));
 999    i->setSrc(2, NULL);
1000    i->setSrc(1, bld.loadImm(NULL, 0));
1001
1002    return true;
1003 }
1004
1005 bool
1006 NV50LoweringPreSSA::handleSELP(Instruction *i)
1007 {
1008    Value *src0 = bld.getSSA();
1009    Value *src1 = bld.getSSA();
1010
1011    Value *v0 = i->getSrc(0);
1012    Value *v1 = i->getSrc(1);
1013    if (v0->asImm())
1014       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1015    if (v1->asImm())
1016       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1017
1018    bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
1019    bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
1020    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1021    delete_Instruction(prog, i);
1022    return true;
1023 }
1024
1025 bool
1026 NV50LoweringPreSSA::handleWRSV(Instruction *i)
1027 {
1028    Symbol *sym = i->getSrc(0)->asSym();
1029
1030    // these are all shader outputs, $sreg are not writeable
1031    uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
1032    if (addr >= 0x400)
1033       return false;
1034    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
1035
1036    bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
1037
1038    bld.getBB()->remove(i);
1039    return true;
1040 }
1041
1042 bool
1043 NV50LoweringPreSSA::handleCALL(Instruction *i)
1044 {
1045    if (prog->getType() == Program::TYPE_COMPUTE) {
1046       // Add implicit "thread id" argument in $r0 to the function
1047       i->setSrc(i->srcCount(), tid);
1048    }
1049    return true;
1050 }
1051
1052 bool
1053 NV50LoweringPreSSA::handlePRECONT(Instruction *i)
1054 {
1055    delete_Instruction(prog, i);
1056    return true;
1057 }
1058
1059 bool
1060 NV50LoweringPreSSA::handleCONT(Instruction *i)
1061 {
1062    i->op = OP_BRA;
1063    return true;
1064 }
1065
1066 bool
1067 NV50LoweringPreSSA::handleRDSV(Instruction *i)
1068 {
1069    Symbol *sym = i->getSrc(0)->asSym();
1070    uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
1071    Value *def = i->getDef(0);
1072    SVSemantic sv = sym->reg.data.sv.sv;
1073    int idx = sym->reg.data.sv.index;
1074
1075    if (addr >= 0x400) // mov $sreg
1076       return true;
1077
1078    switch (sv) {
1079    case SV_POSITION:
1080       assert(prog->getType() == Program::TYPE_FRAGMENT);
1081       bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
1082       break;
1083    case SV_FACE:
1084       bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
1085       if (i->dType == TYPE_F32) {
1086          bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
1087          bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
1088       }
1089       break;
1090    case SV_NCTAID:
1091    case SV_CTAID:
1092    case SV_NTID:
1093       if ((sv == SV_NCTAID && idx >= 2) ||
1094           (sv == SV_NTID && idx >= 3)) {
1095          bld.mkMov(def, bld.mkImm(1));
1096       } else if (sv == SV_CTAID && idx >= 2) {
1097          bld.mkMov(def, bld.mkImm(0));
1098       } else {
1099          Value *x = bld.getSSA(2);
1100          bld.mkOp1(OP_LOAD, TYPE_U16, x,
1101                    bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
1102          bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
1103       }
1104       break;
1105    case SV_TID:
1106       if (idx == 0) {
1107          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
1108       } else if (idx == 1) {
1109          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
1110          bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
1111       } else if (idx == 2) {
1112          bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
1113       } else {
1114          bld.mkMov(def, bld.mkImm(0));
1115       }
1116       break;
1117    case SV_SAMPLE_POS: {
1118       Value *off = new_LValue(func, FILE_ADDRESS);
1119       bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
1120       bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
1121       bld.mkLoad(TYPE_F32,
1122                  def,
1123                  bld.mkSymbol(
1124                        FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
1125                        TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
1126                  off);
1127       break;
1128    }
1129    default:
1130       bld.mkFetch(i->getDef(0), i->dType,
1131                   FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
1132       break;
1133    }
1134    bld.getBB()->remove(i);
1135    return true;
1136 }
1137
1138 bool
1139 NV50LoweringPreSSA::handleDIV(Instruction *i)
1140 {
1141    if (!isFloatType(i->dType))
1142       return true;
1143    bld.setPosition(i, false);
1144    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
1145    i->op = OP_MUL;
1146    i->setSrc(1, rcp->getDef(0));
1147    return true;
1148 }
1149
1150 bool
1151 NV50LoweringPreSSA::handleSQRT(Instruction *i)
1152 {
1153    Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
1154                                 bld.getSSA(), i->getSrc(0));
1155    i->op = OP_MUL;
1156    i->setSrc(1, rsq->getDef(0));
1157
1158    return true;
1159 }
1160
1161 bool
1162 NV50LoweringPreSSA::handlePOW(Instruction *i)
1163 {
1164    LValue *val = bld.getScratch();
1165
1166    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
1167    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
1168    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
1169
1170    i->op = OP_EX2;
1171    i->setSrc(0, val);
1172    i->setSrc(1, NULL);
1173
1174    return true;
1175 }
1176
1177 bool
1178 NV50LoweringPreSSA::handleEXPORT(Instruction *i)
1179 {
1180    if (prog->getType() == Program::TYPE_FRAGMENT) {
1181       if (i->getIndirect(0, 0)) {
1182          // TODO: redirect to l[] here, load to GPRs at exit
1183          return false;
1184       } else {
1185          int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
1186
1187          i->op = OP_MOV;
1188          i->subOp = NV50_IR_SUBOP_MOV_FINAL;
1189          i->src(0).set(i->src(1));
1190          i->setSrc(1, NULL);
1191          i->setDef(0, new_LValue(func, FILE_GPR));
1192          i->getDef(0)->reg.data.id = id;
1193
1194          prog->maxGPR = MAX2(prog->maxGPR, id);
1195       }
1196    }
1197    return true;
1198 }
1199
1200 // Handle indirect addressing in geometry shaders:
1201 //
1202 // ld $r0 a[$a1][$a2+k] ->
1203 // ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
1204 //
1205 bool
1206 NV50LoweringPreSSA::handleLOAD(Instruction *i)
1207 {
1208    ValueRef src = i->src(0);
1209
1210    if (src.isIndirect(1)) {
1211       assert(prog->getType() == Program::TYPE_GEOMETRY);
1212       Value *addr = i->getIndirect(0, 1);
1213
1214       if (src.isIndirect(0)) {
1215          // base address is in an address register, so move to a GPR
1216          Value *base = bld.getScratch();
1217          bld.mkMov(base, addr);
1218
1219          Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
1220          Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
1221          Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1222                                     i->getIndirect(0, 0), bld.mkImm(2));
1223
1224          // Calculate final address: addr = base + attr*vstride; use 16-bit
1225          // multiplication since 32-bit would be lowered to multiple
1226          // instructions, and we only need the low 16 bits of the result
1227          Value *a[2], *b[2];
1228          bld.mkSplit(a, 2, attrib);
1229          bld.mkSplit(b, 2, vstride);
1230          Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
1231                                  base);
1232
1233          // move address from GPR into an address register
1234          addr = bld.getSSA(2, FILE_ADDRESS);
1235          bld.mkMov(addr, sum);
1236       }
1237
1238       i->setIndirect(0, 1, NULL);
1239       i->setIndirect(0, 0, addr);
1240    }
1241
1242    return true;
1243 }
1244
1245 bool
1246 NV50LoweringPreSSA::handlePFETCH(Instruction *i)
1247 {
1248    assert(prog->getType() == Program::TYPE_GEOMETRY);
1249
1250    // NOTE: cannot use getImmediate here, not in SSA form yet, move to
1251    // later phase if that assertion ever triggers:
1252
1253    ImmediateValue *imm = i->getSrc(0)->asImm();
1254    assert(imm);
1255
1256    assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens
1257
1258    if (i->srcExists(1)) {
1259       // indirect addressing of vertex in primitive space
1260
1261       LValue *val = bld.getScratch();
1262       Value *ptr = bld.getSSA(2, FILE_ADDRESS);
1263       bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
1264       bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);
1265
1266       // NOTE: PFETCH directly to an $aX only works with direct addressing
1267       i->op = OP_SHL;
1268       i->setSrc(0, val);
1269       i->setSrc(1, bld.mkImm(0));
1270    }
1271
1272    return true;
1273 }
1274
1275 // Set flags according to predicate and make the instruction read $cX.
1276 void
1277 NV50LoweringPreSSA::checkPredicate(Instruction *insn)
1278 {
1279    Value *pred = insn->getPredicate();
1280    Value *cdst;
1281
1282    // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
1283    if (!pred ||
1284        pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
1285       return;
1286
1287    cdst = bld.getSSA(1, FILE_FLAGS);
1288
1289    bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
1290
1291    insn->setPredicate(insn->cc, cdst);
1292 }
1293
1294 //
1295 // - add quadop dance for texturing
1296 // - put FP outputs in GPRs
1297 // - convert instruction sequences
1298 //
1299 bool
1300 NV50LoweringPreSSA::visit(Instruction *i)
1301 {
1302    bld.setPosition(i, false);
1303
1304    if (i->cc != CC_ALWAYS)
1305       checkPredicate(i);
1306
1307    switch (i->op) {
1308    case OP_TEX:
1309    case OP_TXF:
1310    case OP_TXG:
1311       return handleTEX(i->asTex());
1312    case OP_TXB:
1313       return handleTXB(i->asTex());
1314    case OP_TXL:
1315       return handleTXL(i->asTex());
1316    case OP_TXD:
1317       return handleTXD(i->asTex());
1318    case OP_TXLQ:
1319       return handleTXLQ(i->asTex());
1320    case OP_EX2:
1321       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
1322       i->setSrc(0, i->getDef(0));
1323       break;
1324    case OP_SET:
1325       return handleSET(i);
1326    case OP_SLCT:
1327       return handleSLCT(i->asCmp());
1328    case OP_SELP:
1329       return handleSELP(i);
1330    case OP_POW:
1331       return handlePOW(i);
1332    case OP_DIV:
1333       return handleDIV(i);
1334    case OP_SQRT:
1335       return handleSQRT(i);
1336    case OP_EXPORT:
1337       return handleEXPORT(i);
1338    case OP_LOAD:
1339       return handleLOAD(i);
1340    case OP_RDSV:
1341       return handleRDSV(i);
1342    case OP_WRSV:
1343       return handleWRSV(i);
1344    case OP_CALL:
1345       return handleCALL(i);
1346    case OP_PRECONT:
1347       return handlePRECONT(i);
1348    case OP_CONT:
1349       return handleCONT(i);
1350    case OP_PFETCH:
1351       return handlePFETCH(i);
1352    default:
1353       break;
1354    }
1355    return true;
1356 }
1357
1358 bool
1359 TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
1360 {
1361    bool ret = false;
1362
1363    if (stage == CG_STAGE_PRE_SSA) {
1364       NV50LoweringPreSSA pass(prog);
1365       ret = pass.run(prog, false, true);
1366    } else
1367    if (stage == CG_STAGE_SSA) {
1368       if (!prog->targetPriv)
1369          prog->targetPriv = new std::list<Instruction *>();
1370       NV50LegalizeSSA pass(prog);
1371       ret = pass.run(prog, false, true);
1372    } else
1373    if (stage == CG_STAGE_POST_RA) {
1374       NV50LegalizePostRA pass;
1375       ret = pass.run(prog, false, true);
1376       if (prog->targetPriv)
1377          delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
1378    }
1379    return ret;
1380 }
1381
1382 } // namespace nv50_ir