src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp

   1 /*
   2  * Copyright 2011 Christoph Bumiller
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 #include "codegen/nv50_ir.h"
  24 #include "codegen/nv50_ir_build_util.h"
  25
  26 #include "codegen/nv50_ir_target_nv50.h"
  27
  28 namespace nv50_ir {
  29
  30 // nv50 doesn't support 32 bit integer multiplication
  31 //
  32 //       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
  33 // -------------------
  34 //    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
  35 // ah*bh 00 00                 (           carry1) << 16 + ( carry2)
  36 //       al*bl
  37 //    ah*bl 00
  38 //
  39 // fffe0001 + fffe0001
  40 //
  41 // Note that this sort of splitting doesn't work for signed values, so we
  42 // compute the sign on those manually and then perform an unsigned multiply.
  43 static bool
  44 expandIntegerMUL(BuildUtil *bld, Instruction *mul)
  45 {
  46    const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
  47    ImmediateValue src1;
  48    bool src1imm = mul->src(1).getImmediate(src1);
  49
  50    DataType fTy; // full type
  51    switch (mul->sType) {
  52    case TYPE_S32: fTy = TYPE_U32; break;
  53    case TYPE_S64: fTy = TYPE_U64; break;
  54    default: fTy = mul->sType; break;
  55    }
  56
  57    DataType hTy; // half type
  58    switch (fTy) {
  59    case TYPE_U32: hTy = TYPE_U16; break;
  60    case TYPE_U64: hTy = TYPE_U32; break;
  61    default:
  62       return false;
  63    }
  64    unsigned int fullSize = typeSizeof(fTy);
  65    unsigned int halfSize = typeSizeof(hTy);
  66
  67    Instruction *i[9];
  68
  69    bld->setPosition(mul, true);
  70
  71    Value *s[2];
  72    Value *a[2], *b[2];
  73    Value *t[4];
  74    for (int j = 0; j < 4; ++j)
  75       t[j] = bld->getSSA(fullSize);
  76
  77    if (isSignedType(mul->sType) && highResult) {
  78       s[0] = bld->getSSA(fullSize);
  79       s[1] = bld->getSSA(fullSize);
  80       bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
  81       bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
  82       src1.reg.data.s32 = abs(src1.reg.data.s32);
  83    } else {
  84       s[0] = mul->getSrc(0);
  85       s[1] = mul->getSrc(1);
  86    }
  87
  88    // split sources into halves
  89    i[0] = bld->mkSplit(a, halfSize, s[0]);
  90    i[1] = bld->mkSplit(b, halfSize, s[1]);
  91
  92    if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {
  93       i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],
  94                                bld->mkImm(src1.reg.data.u32 & 0xffff));
  95    } else {
  96       i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],
  97                         src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);
  98       if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
  99          i[3] = i[2];
 100          t[1] = t[0];
 101       } else {
 102          i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
 103       }
 104    }
 105    i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
 106    if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
 107       i[4] = i[3];
 108       t[3] = t[2];
 109    } else {
 110       i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
 111    }
 112
 113    if (highResult) {
 114       Value *c[2];
 115       Value *r[5];
 116       Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
 117       c[0] = bld->getSSA(1, FILE_FLAGS);
 118       c[1] = bld->getSSA(1, FILE_FLAGS);
 119       for (int j = 0; j < 5; ++j)
 120          r[j] = bld->getSSA(fullSize);
 121
 122       i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
 123       i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
 124       bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
 125       bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
 126       i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);
 127
 128       // set carry defs / sources
 129       i[3]->setFlagsDef(1, c[0]);
 130       // actual result required in negative case, but ignored for
 131       // unsigned. for some reason the compiler ends up dropping the whole
 132       // instruction if the destination is unused but the flags are.
 133       if (isSignedType(mul->sType))
 134          i[4]->setFlagsDef(1, c[1]);
 135       else
 136          i[4]->setFlagsDef(0, c[1]);
 137       i[6]->setPredicate(CC_C, c[0]);
 138       i[5]->setFlagsSrc(3, c[1]);
 139
 140       if (isSignedType(mul->sType)) {
 141          Value *cc[2];
 142          Value *rr[7];
 143          Value *one = bld->getSSA(fullSize);
 144          bld->loadImm(one, 1);
 145          for (int j = 0; j < 7; j++)
 146             rr[j] = bld->getSSA(fullSize);
 147
 148          // NOTE: this logic uses predicates because splitting basic blocks is
 149          // ~impossible during the SSA phase. The RA relies on a correlation
 150          // between edge order and phi node sources.
 151
 152          // Set the sign of the result based on the inputs
 153          bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
 154             ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));
 155
 156          // 1s complement of 64-bit value
 157          bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
 158             ->setPredicate(CC_S, cc[0]);
 159          bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
 160             ->setPredicate(CC_S, cc[0]);
 161
 162          // add to low 32-bits, keep track of the carry
 163          Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
 164          n->setPredicate(CC_S, cc[0]);
 165          n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));
 166
 167          // If there was a carry, add 1 to the upper 32 bits
 168          // XXX: These get executed even if they shouldn't be
 169          bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
 170             ->setPredicate(CC_C, cc[1]);
 171          bld->mkMov(rr[3], rr[0])
 172             ->setPredicate(CC_NC, cc[1]);
 173          bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);
 174
 175          // Merge the results from the negative and non-negative paths
 176          bld->mkMov(rr[5], rr[4])
 177             ->setPredicate(CC_S, cc[0]);
 178          bld->mkMov(rr[6], r[4])
 179             ->setPredicate(CC_NS, cc[0]);
 180          bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
 181       } else {
 182          bld->mkMov(mul->getDef(0), r[4]);
 183       }
 184    } else {
 185       bld->mkMov(mul->getDef(0), t[3]);
 186    }
 187    delete_Instruction(bld->getProgram(), mul);
 188
 189    for (int j = 2; j <= (highResult ? 5 : 4); ++j)
 190       if (i[j])
 191          i[j]->sType = hTy;
 192
 193    return true;
 194 }
 195
 196 #define QOP_ADD  0
 197 #define QOP_SUBR 1
 198 #define QOP_SUB  2
 199 #define QOP_MOV2 3
 200
 201 //             UL UR LL LR
 202 #define QUADOP(q, r, s, t)            \
 203    ((QOP_##q << 6) | (QOP_##r << 4) | \
 204     (QOP_##s << 2) | (QOP_##t << 0))
 205
 206 class NV50LegalizePostRA : public Pass
 207 {
 208 private:
 209    virtual bool visit(Function *);
 210    virtual bool visit(BasicBlock *);
 211
 212    void handlePRERET(FlowInstruction *);
 213    void replaceZero(Instruction *);
 214
 215    LValue *r63;
 216 };
 217
 218 bool
 219 NV50LegalizePostRA::visit(Function *fn)
 220 {
 221    Program *prog = fn->getProgram();
 222
 223    r63 = new_LValue(fn, FILE_GPR);
 224    // GPR units on nv50 are in half-regs
 225    if (prog->maxGPR < 126)
 226       r63->reg.data.id = 63;
 227    else
 228       r63->reg.data.id = 127;
 229
 230    // this is actually per-program, but we can do it all on visiting main()
 231    std::list<Instruction *> *outWrites =
 232       reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
 233
 234    if (outWrites) {
 235       for (std::list<Instruction *>::iterator it = outWrites->begin();
 236            it != outWrites->end(); ++it)
 237          (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
 238       // instructions will be deleted on exit
 239       outWrites->clear();
 240    }
 241
 242    return true;
 243 }
 244
 245 void
 246 NV50LegalizePostRA::replaceZero(Instruction *i)
 247 {
 248    for (int s = 0; i->srcExists(s); ++s) {
 249       ImmediateValue *imm = i->getSrc(s)->asImm();
 250       if (imm && imm->reg.data.u64 == 0)
 251          i->setSrc(s, r63);
 252    }
 253 }
 254
 255 // Emulate PRERET: jump to the target and call to the origin from there
 256 //
 257 // WARNING: atm only works if BBs are affected by at most a single PRERET
 258 //
 259 // BB:0
 260 // preret BB:3
 261 // (...)
 262 // BB:3
 263 // (...)
 264 //             --->
 265 // BB:0
 266 // bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
 267 // (...)
 268 // BB:3
 269 // bra BB:3 + n1 (skip the call)
 270 // call BB:0 + n2 (skip bra at beginning of BB:0)
 271 // (...)
 272 void
 273 NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
 274 {
 275    BasicBlock *bbE = pre->bb;
 276    BasicBlock *bbT = pre->target.bb;
 277
 278    pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
 279    bbE->remove(pre);
 280    bbE->insertHead(pre);
 281
 282    Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
 283    Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
 284
 285    bbT->insertHead(call);
 286    bbT->insertHead(skip);
 287
 288    // NOTE: maybe split blocks to prevent the instructions from moving ?
 289
 290    skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
 291    call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
 292 }
 293
 294 bool
 295 NV50LegalizePostRA::visit(BasicBlock *bb)
 296 {
 297    Instruction *i, *next;
 298
 299    // remove pseudo operations and non-fixed no-ops, split 64 bit operations
 300    for (i = bb->getFirst(); i; i = next) {
 301       next = i->next;
 302       if (i->isNop()) {
 303          bb->remove(i);
 304       } else
 305       if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
 306          handlePRERET(i->asFlow());
 307       } else {
 308          // TODO: We will want to do this before register allocation,
 309          // since have to use a $c register for the carry flag.
 310          if (typeSizeof(i->dType) == 8) {
 311             Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
 312             if (hi)
 313                next = hi;
 314          }
 315
 316          if (i->op != OP_PFETCH && i->op != OP_BAR &&
 317              (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
 318             replaceZero(i);
 319       }
 320    }
 321    if (!bb->getEntry())
 322       return true;
 323
 324    return true;
 325 }
 326
 327 class NV50LegalizeSSA : public Pass
 328 {
 329 public:
 330    NV50LegalizeSSA(Program *);
 331
 332    virtual bool visit(BasicBlock *bb);
 333
 334 private:
 335    void propagateWriteToOutput(Instruction *);
 336    void handleDIV(Instruction *);
 337    void handleMOD(Instruction *);
 338    void handleMUL(Instruction *);
 339    void handleAddrDef(Instruction *);
 340
 341    inline bool isARL(const Instruction *) const;
 342
 343    BuildUtil bld;
 344
 345    std::list<Instruction *> *outWrites;
 346 };
 347
 348 NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
 349 {
 350    bld.setProgram(prog);
 351
 352    if (prog->optLevel >= 2 &&
 353        (prog->getType() == Program::TYPE_GEOMETRY ||
 354         prog->getType() == Program::TYPE_VERTEX))
 355       outWrites =
 356          reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
 357    else
 358       outWrites = NULL;
 359 }
 360
 361 void
 362 NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
 363 {
 364    if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
 365       return;
 366
 367    // check def instruction can store
 368    Instruction *di = st->getSrc(1)->defs.front()->getInsn();
 369
 370    // TODO: move exports (if beneficial) in common opt pass
 371    if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
 372       return;
 373
 374    for (int s = 0; di->srcExists(s); ++s)
 375       if (di->src(s).getFile() == FILE_IMMEDIATE ||
 376           di->src(s).getFile() == FILE_MEMORY_LOCAL)
 377          return;
 378
 379    if (prog->getType() == Program::TYPE_GEOMETRY) {
 380       // Only propagate output writes in geometry shaders when we can be sure
 381       // that we are propagating to the same output vertex.
 382       if (di->bb != st->bb)
 383          return;
 384       Instruction *i;
 385       for (i = di; i != st; i = i->next) {
 386          if (i->op == OP_EMIT || i->op == OP_RESTART)
 387             return;
 388       }
 389       assert(i); // st after di
 390    }
 391
 392    // We cannot set defs to non-lvalues before register allocation, so
 393    // save & remove (to save registers) the exports and replace later.
 394    outWrites->push_back(st);
 395    st->bb->remove(st);
 396 }
 397
 398 bool
 399 NV50LegalizeSSA::isARL(const Instruction *i) const
 400 {
 401    ImmediateValue imm;
 402
 403    if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
 404       return false;
 405    if (!i->src(1).getImmediate(imm))
 406       return false;
 407    return imm.isInteger(0);
 408 }
 409
 410 void
 411 NV50LegalizeSSA::handleAddrDef(Instruction *i)
 412 {
 413    Instruction *arl;
 414
 415    i->getDef(0)->reg.size = 2; // $aX are only 16 bit
 416
 417    // PFETCH can always write to $a
 418    if (i->op == OP_PFETCH)
 419       return;
 420    // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
 421    if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
 422       if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
 423          return;
 424       if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
 425          return;
 426    }
 427
 428    // turn $a sources into $r sources (can't operate on $a)
 429    for (int s = 0; i->srcExists(s); ++s) {
 430       Value *a = i->getSrc(s);
 431       Value *r;
 432       if (a->reg.file == FILE_ADDRESS) {
 433          if (a->getInsn() && isARL(a->getInsn())) {
 434             i->setSrc(s, a->getInsn()->getSrc(0));
 435          } else {
 436             bld.setPosition(i, false);
 437             r = bld.getSSA();
 438             bld.mkMov(r, a);
 439             i->setSrc(s, r);
 440          }
 441       }
 442    }
 443    if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
 444       return;
 445
 446    // turn result back into $a
 447    bld.setPosition(i, true);
 448    arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
 449    i->setDef(0, arl->getSrc(0));
 450 }
 451
 452 void
 453 NV50LegalizeSSA::handleMUL(Instruction *mul)
 454 {
 455    if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
 456       return;
 457    Value *def = mul->getDef(0);
 458    Value *pred = mul->getPredicate();
 459    CondCode cc = mul->cc;
 460    if (pred)
 461       mul->setPredicate(CC_ALWAYS, NULL);
 462
 463    if (mul->op == OP_MAD) {
 464       Instruction *add = mul;
 465       bld.setPosition(add, false);
 466       Value *res = cloneShallow(func, mul->getDef(0));
 467       mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
 468       add->op = OP_ADD;
 469       add->setSrc(0, mul->getDef(0));
 470       add->setSrc(1, add->getSrc(2));
 471       for (int s = 2; add->srcExists(s); ++s)
 472          add->setSrc(s, NULL);
 473       mul->subOp = add->subOp;
 474       add->subOp = 0;
 475    }
 476    expandIntegerMUL(&bld, mul);
 477    if (pred)
 478       def->getInsn()->setPredicate(cc, pred);
 479 }
 480
 481 // Use f32 division: first compute an approximate result, use it to reduce
 482 // the dividend, which should then be representable as f32, divide the reduced
 483 // dividend, and add the quotients.
 484 void
 485 NV50LegalizeSSA::handleDIV(Instruction *div)
 486 {
 487    const DataType ty = div->sType;
 488
 489    if (ty != TYPE_U32 && ty != TYPE_S32)
 490       return;
 491
 492    Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
 493
 494    bld.setPosition(div, false);
 495
 496    Value *a, *af = bld.getSSA();
 497    Value *b, *bf = bld.getSSA();
 498
 499    bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
 500    bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
 501
 502    if (isSignedType(ty)) {
 503       af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
 504       bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
 505       a = bld.getSSA();
 506       b = bld.getSSA();
 507       bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
 508       bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
 509    } else {
 510       a = div->getSrc(0);
 511       b = div->getSrc(1);
 512    }
 513
 514    bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
 515    bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
 516
 517    bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
 518    bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
 519
 520    // get error of 1st result
 521    expandIntegerMUL(&bld,
 522       bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
 523    bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
 524
 525    bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
 526
 527    bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
 528    bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
 529       ->rnd = ROUND_Z;
 530    bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
 531
 532    // correction: if modulus >= divisor, add 1
 533    expandIntegerMUL(&bld,
 534       bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
 535    bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
 536    bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
 537    if (!isSignedType(ty)) {
 538       div->op = OP_SUB;
 539       div->setSrc(0, q);
 540       div->setSrc(1, s);
 541    } else {
 542       t = q;
 543       bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
 544       s = bld.getSSA();
 545       t = bld.getSSA();
 546       // fix the sign
 547       bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
 548          ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
 549       bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
 550       bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
 551
 552       div->op = OP_UNION;
 553       div->setSrc(0, s);
 554       div->setSrc(1, t);
 555    }
 556 }
 557
 558 void
 559 NV50LegalizeSSA::handleMOD(Instruction *mod)
 560 {
 561    if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
 562       return;
 563    bld.setPosition(mod, false);
 564
 565    Value *q = bld.getSSA();
 566    Value *m = bld.getSSA();
 567
 568    bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
 569    handleDIV(q->getInsn());
 570
 571    bld.setPosition(mod, false);
 572    expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
 573
 574    mod->op = OP_SUB;
 575    mod->setSrc(1, m);
 576 }
 577
 578 bool
 579 NV50LegalizeSSA::visit(BasicBlock *bb)
 580 {
 581    Instruction *insn, *next;
 582    // skipping PHIs (don't pass them to handleAddrDef) !
 583    for (insn = bb->getEntry(); insn; insn = next) {
 584       next = insn->next;
 585
 586       if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
 587          handleAddrDef(insn);
 588
 589       switch (insn->op) {
 590       case OP_EXPORT:
 591          if (outWrites)
 592             propagateWriteToOutput(insn);
 593          break;
 594       case OP_DIV:
 595          handleDIV(insn);
 596          break;
 597       case OP_MOD:
 598          handleMOD(insn);
 599          break;
 600       case OP_MAD:
 601       case OP_MUL:
 602          handleMUL(insn);
 603          break;
 604       default:
 605          break;
 606       }
 607    }
 608    return true;
 609 }
 610
 611 class NV50LoweringPreSSA : public Pass
 612 {
 613 public:
 614    NV50LoweringPreSSA(Program *);
 615
 616 private:
 617    virtual bool visit(Instruction *);
 618    virtual bool visit(Function *);
 619
 620    bool handleRDSV(Instruction *);
 621    bool handleWRSV(Instruction *);
 622
 623    bool handlePFETCH(Instruction *);
 624    bool handleEXPORT(Instruction *);
 625    bool handleLOAD(Instruction *);
 626
 627    bool handleDIV(Instruction *);
 628    bool handleSQRT(Instruction *);
 629    bool handlePOW(Instruction *);
 630
 631    bool handleSET(Instruction *);
 632    bool handleSLCT(CmpInstruction *);
 633    bool handleSELP(Instruction *);
 634
 635    bool handleTEX(TexInstruction *);
 636    bool handleTXB(TexInstruction *); // I really
 637    bool handleTXL(TexInstruction *); // hate
 638    bool handleTXD(TexInstruction *); // these 3
 639    bool handleTXLQ(TexInstruction *);
 640    bool handleTXQ(TexInstruction *);
 641
 642    bool handleCALL(Instruction *);
 643    bool handlePRECONT(Instruction *);
 644    bool handleCONT(Instruction *);
 645
 646    void checkPredicate(Instruction *);
 647    void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
 648    void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);
 649
 650 private:
 651    const Target *const targ;
 652
 653    BuildUtil bld;
 654
 655    Value *tid;
 656 };
 657
 658 NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
 659    targ(prog->getTarget()), tid(NULL)
 660 {
 661    bld.setProgram(prog);
 662 }
 663
 664 bool
 665 NV50LoweringPreSSA::visit(Function *f)
 666 {
 667    BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
 668
 669    if (prog->getType() == Program::TYPE_COMPUTE) {
 670       // Add implicit "thread id" argument in $r0 to the function
 671       Value *arg = new_LValue(func, FILE_GPR);
 672       arg->reg.data.id = 0;
 673       f->ins.push_back(arg);
 674
 675       bld.setPosition(root, false);
 676       tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
 677    }
 678
 679    return true;
 680 }
 681
 682 void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
 683                                        Value **ms_x, Value **ms_y) {
 684    // This loads the texture-indexed ms setting from the constant buffer
 685    Value *tmp = new_LValue(func, FILE_GPR);
 686    uint8_t b = prog->driver->io.auxCBSlot;
 687    off += prog->driver->io.suInfoBase;
 688    if (prog->getType() > Program::TYPE_VERTEX)
 689       off += 16 * 2 * 4;
 690    if (prog->getType() > Program::TYPE_GEOMETRY)
 691       off += 16 * 2 * 4;
 692    *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
 693                              FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
 694    *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
 695                              FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
 696    *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);
 697 }
 698
 699 void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) {
 700    // Given a MS level, and a sample id, compute the delta x/y
 701    uint8_t b = prog->driver->io.msInfoCBSlot;
 702    Value *off = new_LValue(func, FILE_ADDRESS), *t = new_LValue(func, FILE_GPR);
 703
 704    // The required information is at mslevel * 16 * 4 + sample * 8
 705    // = (mslevel * 8 + sample) * 8
 706    bld.mkOp2(OP_SHL,
 707              TYPE_U32,
 708              off,
 709              bld.mkOp2v(OP_ADD, TYPE_U32, t,
 710                         bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)),
 711                         s),
 712              bld.mkImm(3));
 713    *dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
 714                            FILE_MEMORY_CONST, b, TYPE_U32,
 715                            prog->driver->io.msInfoBase), off);
 716    *dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
 717                            FILE_MEMORY_CONST, b, TYPE_U32,
 718                            prog->driver->io.msInfoBase + 4), off);
 719 }
 720
 721 bool
 722 NV50LoweringPreSSA::handleTEX(TexInstruction *i)
 723 {
 724    const int arg = i->tex.target.getArgCount();
 725    const int dref = arg;
 726    const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
 727
 728    /* Only normalize in the non-explicit derivatives case.
 729     */
 730    if (i->tex.target.isCube() && i->op != OP_TXD) {
 731       Value *src[3], *val;
 732       int c;
 733       for (c = 0; c < 3; ++c)
 734          src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
 735       val = bld.getScratch();
 736       bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
 737       bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
 738       bld.mkOp1(OP_RCP, TYPE_F32, val, val);
 739       for (c = 0; c < 3; ++c) {
 740          i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
 741                                  i->getSrc(c), val));
 742       }
 743    }
 744
 745    // handle MS, which means looking up the MS params for this texture, and
 746    // adjusting the input coordinates to point at the right sample.
 747    if (i->tex.target.isMS()) {
 748       Value *x = i->getSrc(0);
 749       Value *y = i->getSrc(1);
 750       Value *s = i->getSrc(arg - 1);
 751       Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),
 752          *ms, *ms_x, *ms_y, *dx, *dy;
 753
 754       i->tex.target.clearMS();
 755
 756       loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
 757       loadMsInfo(ms, s, &dx, &dy);
 758
 759       bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
 760       bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
 761       bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
 762       bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
 763       i->setSrc(0, tx);
 764       i->setSrc(1, ty);
 765       i->setSrc(arg - 1, bld.loadImm(NULL, 0));
 766    }
 767
 768    // dref comes before bias/lod
 769    if (i->tex.target.isShadow())
 770       if (i->op == OP_TXB || i->op == OP_TXL)
 771          i->swapSources(dref, lod);
 772
 773    if (i->tex.target.isArray()) {
 774       if (i->op != OP_TXF) {
 775          // array index must be converted to u32, but it's already an integer
 776          // for TXF
 777          Value *layer = i->getSrc(arg - 1);
 778          LValue *src = new_LValue(func, FILE_GPR);
 779          bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
 780          bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
 781          i->setSrc(arg - 1, src);
 782       }
 783       if (i->tex.target.isCube() && i->srcCount() > 4) {
 784          std::vector<Value *> acube, a2d;
 785          int c;
 786
 787          acube.resize(4);
 788          for (c = 0; c < 4; ++c)
 789             acube[c] = i->getSrc(c);
 790          a2d.resize(4);
 791          for (c = 0; c < 3; ++c)
 792             a2d[c] = new_LValue(func, FILE_GPR);
 793          a2d[3] = NULL;
 794
 795          bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
 796                    a2d, acube)->asTex()->tex.mask = 0x7;
 797
 798          for (c = 0; c < 3; ++c)
 799             i->setSrc(c, a2d[c]);
 800          for (; i->srcExists(c + 1); ++c)
 801             i->setSrc(c, i->getSrc(c + 1));
 802          i->setSrc(c, NULL);
 803          assert(c <= 4);
 804
 805          i->tex.target = i->tex.target.isShadow() ?
 806             TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
 807       }
 808    }
 809
 810    // texel offsets are 3 immediate fields in the instruction,
 811    // nv50 cannot do textureGatherOffsets
 812    assert(i->tex.useOffsets <= 1);
 813    if (i->tex.useOffsets) {
 814       for (int c = 0; c < 3; ++c) {
 815          ImmediateValue val;
 816          if (!i->offset[0][c].getImmediate(val))
 817             assert(!"non-immediate offset");
 818          i->tex.offset[c] = val.reg.data.u32;
 819          i->offset[0][c].set(NULL);
 820       }
 821    }
 822
 823    return true;
 824 }
 825
 826 // Bias must be equal for all threads of a quad or lod calculation will fail.
 827 //
 828 // The lanes of a quad are grouped by the bit in the condition register they
 829 // have set, which is selected by differing bias values.
 830 // Move the input values for TEX into a new register set for each group and
 831 // execute TEX only for a specific group.
 832 // We always need to use 4 new registers for the inputs/outputs because the
 833 // implicitly calculated derivatives must be correct.
 834 //
 835 // TODO: move to SSA phase so we can easily determine whether bias is constant
 836 bool
 837 NV50LoweringPreSSA::handleTXB(TexInstruction *i)
 838 {
 839    const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
 840    int l, d;
 841
 842    // We can't actually apply bias *and* do a compare for a cube
 843    // texture. Since the compare has to be done before the filtering, just
 844    // drop the bias on the floor.
 845    if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {
 846       i->op = OP_TEX;
 847       i->setSrc(3, i->getSrc(4));
 848       i->setSrc(4, NULL);
 849       return handleTEX(i);
 850    }
 851
 852    handleTEX(i);
 853    Value *bias = i->getSrc(i->tex.target.getArgCount());
 854    if (bias->isUniform())
 855       return true;
 856
 857    Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
 858                                  bld.loadImm(NULL, 1));
 859    bld.setPosition(cond, false);
 860
 861    for (l = 1; l < 4; ++l) {
 862       const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
 863       Value *bit = bld.getSSA();
 864       Value *pred = bld.getScratch(1, FILE_FLAGS);
 865       Value *imm = bld.loadImm(NULL, (1 << l));
 866       bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
 867       bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
 868       cond->setSrc(l, bit);
 869    }
 870    Value *flags = bld.getScratch(1, FILE_FLAGS);
 871    bld.setPosition(cond, true);
 872    bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;
 873
 874    Instruction *tex[4];
 875    for (l = 0; l < 4; ++l) {
 876       (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
 877       bld.insert(tex[l]);
 878    }
 879
 880    Value *res[4][4];
 881    for (d = 0; i->defExists(d); ++d)
 882       res[0][d] = tex[0]->getDef(d);
 883    for (l = 1; l < 4; ++l) {
 884       for (d = 0; tex[l]->defExists(d); ++d) {
 885          res[l][d] = cloneShallow(func, res[0][d]);
 886          bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
 887       }
 888    }
 889
 890    for (d = 0; i->defExists(d); ++d) {
 891       Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
 892       for (l = 0; l < 4; ++l)
 893          dst->setSrc(l, res[l][d]);
 894    }
 895    delete_Instruction(prog, i);
 896    return true;
 897 }
 898
 899 // LOD must be equal for all threads of a quad.
 900 // Unlike with TXB, here we can just diverge since there's no LOD calculation
 901 // that would require all 4 threads' sources to be set up properly.
 902 bool
 903 NV50LoweringPreSSA::handleTXL(TexInstruction *i)
 904 {
 905    handleTEX(i);
 906    Value *lod = i->getSrc(i->tex.target.getArgCount());
 907    if (lod->isUniform())
 908       return true;
 909
 910    BasicBlock *currBB = i->bb;
 911    BasicBlock *texiBB = i->bb->splitBefore(i, false);
 912    BasicBlock *joinBB = i->bb->splitAfter(i);
 913
 914    bld.setPosition(currBB, true);
 915    assert(!currBB->joinAt);
 916    currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
 917
 918    for (int l = 0; l <= 3; ++l) {
 919       const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
 920       Value *pred = bld.getScratch(1, FILE_FLAGS);
 921       bld.setPosition(currBB, true);
 922       bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
 923       bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
 924       currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
 925       if (l <= 2) {
 926          BasicBlock *laneBB = new BasicBlock(func);
 927          currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
 928          currBB = laneBB;
 929       }
 930    }
 931    bld.setPosition(joinBB, false);
 932    bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
 933    return true;
 934 }
 935
 936 bool
 937 NV50LoweringPreSSA::handleTXD(TexInstruction *i)
 938 {
 939    static const uint8_t qOps[4][2] =
 940    {
 941       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
 942       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
 943       { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
 944       { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
 945    };
 946    Value *def[4][4];
 947    Value *crd[3];
 948    Instruction *tex;
 949    Value *zero = bld.loadImm(bld.getSSA(), 0);
 950    int l, c;
 951    const int dim = i->tex.target.getDim() + i->tex.target.isCube();
 952
 953    handleTEX(i);
 954    i->op = OP_TEX; // no need to clone dPdx/dPdy later
 955    i->tex.derivAll = true;
 956
 957    for (c = 0; c < dim; ++c)
 958       crd[c] = bld.getScratch();
 959
 960    bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
 961    for (l = 0; l < 4; ++l) {
 962       Value *src[3], *val;
 963       // mov coordinates from lane l to all lanes
 964       for (c = 0; c < dim; ++c)
 965          bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
 966       // add dPdx from lane l to lanes dx
 967       for (c = 0; c < dim; ++c)
 968          bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
 969       // add dPdy from lane l to lanes dy
 970       for (c = 0; c < dim; ++c)
 971          bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
 972       // normalize cube coordinates if necessary
 973       if (i->tex.target.isCube()) {
 974          for (c = 0; c < 3; ++c)
 975             src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
 976          val = bld.getScratch();
 977          bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
 978          bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
 979          bld.mkOp1(OP_RCP, TYPE_F32, val, val);
 980          for (c = 0; c < 3; ++c)
 981             src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
 982       } else {
 983          for (c = 0; c < dim; ++c)
 984             src[c] = crd[c];
 985       }
 986       // texture
 987       bld.insert(tex = cloneForward(func, i));
 988       for (c = 0; c < dim; ++c)
 989          tex->setSrc(c, src[c]);
 990       // save results
 991       for (c = 0; i->defExists(c); ++c) {
 992          Instruction *mov;
 993          def[c][l] = bld.getSSA();
 994          mov = bld.mkMov(def[c][l], tex->getDef(c));
 995          mov->fixed = 1;
 996          mov->lanes = 1 << l;
 997       }
 998    }
 999    bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
1000
1001    for (c = 0; i->defExists(c); ++c) {
1002       Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
1003       for (l = 0; l < 4; ++l)
1004          u->setSrc(l, def[c][l]);
1005    }
1006
1007    i->bb->remove(i);
1008    return true;
1009 }
1010
1011 bool
1012 NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
1013 {
1014    handleTEX(i);
1015    bld.setPosition(i, true);
1016
1017    /* The returned values are not quite what we want:
1018     * (a) convert from s32 to f32
1019     * (b) multiply by 1/256
1020     */
1021    for (int def = 0; def < 2; ++def) {
1022       if (!i->defExists(def))
1023          continue;
1024       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
1025       bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
1026                 i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
1027    }
1028    return true;
1029 }
1030
1031 bool
1032 NV50LoweringPreSSA::handleTXQ(TexInstruction *i)
1033 {
1034    Value *ms, *ms_x, *ms_y;
1035    if (i->tex.query == TXQ_DIMS)
1036       return true;
1037    assert(i->tex.query == TXQ_TYPE);
1038    assert(i->tex.mask == 4);
1039
1040    loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
1041    bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms);
1042    i->bb->remove(i);
1043
1044    return true;
1045 }
1046
1047
1048 bool
1049 NV50LoweringPreSSA::handleSET(Instruction *i)
1050 {
1051    if (i->dType == TYPE_F32) {
1052       bld.setPosition(i, true);
1053       i->dType = TYPE_U32;
1054       bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
1055       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
1056    }
1057    return true;
1058 }
1059
1060 bool
1061 NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
1062 {
1063    Value *src0 = bld.getSSA();
1064    Value *src1 = bld.getSSA();
1065    Value *pred = bld.getScratch(1, FILE_FLAGS);
1066
1067    Value *v0 = i->getSrc(0);
1068    Value *v1 = i->getSrc(1);
1069    // XXX: these probably shouldn't be immediates in the first place ...
1070    if (v0->asImm())
1071       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1072    if (v1->asImm())
1073       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1074
1075    bld.setPosition(i, true);
1076    bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
1077    bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
1078    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1079
1080    bld.setPosition(i, false);
1081    i->op = OP_SET;
1082    i->setFlagsDef(0, pred);
1083    i->dType = TYPE_U8;
1084    i->setSrc(0, i->getSrc(2));
1085    i->setSrc(2, NULL);
1086    i->setSrc(1, bld.loadImm(NULL, 0));
1087
1088    return true;
1089 }
1090
1091 bool
1092 NV50LoweringPreSSA::handleSELP(Instruction *i)
1093 {
1094    Value *src0 = bld.getSSA();
1095    Value *src1 = bld.getSSA();
1096
1097    Value *v0 = i->getSrc(0);
1098    Value *v1 = i->getSrc(1);
1099    if (v0->asImm())
1100       v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1101    if (v1->asImm())
1102       v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1103
1104    bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
1105    bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
1106    bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1107    delete_Instruction(prog, i);
1108    return true;
1109 }
1110
1111 bool
1112 NV50LoweringPreSSA::handleWRSV(Instruction *i)
1113 {
1114    Symbol *sym = i->getSrc(0)->asSym();
1115
1116    // these are all shader outputs, $sreg are not writeable
1117    uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
1118    if (addr >= 0x400)
1119       return false;
1120    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
1121
1122    bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
1123
1124    bld.getBB()->remove(i);
1125    return true;
1126 }
1127
1128 bool
1129 NV50LoweringPreSSA::handleCALL(Instruction *i)
1130 {
1131    if (prog->getType() == Program::TYPE_COMPUTE) {
1132       // Add implicit "thread id" argument in $r0 to the function
1133       i->setSrc(i->srcCount(), tid);
1134    }
1135    return true;
1136 }
1137
1138 bool
1139 NV50LoweringPreSSA::handlePRECONT(Instruction *i)
1140 {
1141    delete_Instruction(prog, i);
1142    return true;
1143 }
1144
1145 bool
1146 NV50LoweringPreSSA::handleCONT(Instruction *i)
1147 {
1148    i->op = OP_BRA;
1149    return true;
1150 }
1151
1152 bool
1153 NV50LoweringPreSSA::handleRDSV(Instruction *i)
1154 {
1155    Symbol *sym = i->getSrc(0)->asSym();
1156    uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
1157    Value *def = i->getDef(0);
1158    SVSemantic sv = sym->reg.data.sv.sv;
1159    int idx = sym->reg.data.sv.index;
1160
1161    if (addr >= 0x400) // mov $sreg
1162       return true;
1163
1164    switch (sv) {
1165    case SV_POSITION:
1166       assert(prog->getType() == Program::TYPE_FRAGMENT);
1167       bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
1168       break;
1169    case SV_FACE:
1170       bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
1171       if (i->dType == TYPE_F32) {
1172          bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));
1173          bld.mkOp1(OP_NEG, TYPE_S32, def, def);
1174          bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);
1175       }
1176       break;
1177    case SV_NCTAID:
1178    case SV_CTAID:
1179    case SV_NTID:
1180       if ((sv == SV_NCTAID && idx >= 2) ||
1181           (sv == SV_NTID && idx >= 3)) {
1182          bld.mkMov(def, bld.mkImm(1));
1183       } else if (sv == SV_CTAID && idx >= 2) {
1184          bld.mkMov(def, bld.mkImm(0));
1185       } else {
1186          Value *x = bld.getSSA(2);
1187          bld.mkOp1(OP_LOAD, TYPE_U16, x,
1188                    bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
1189          bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
1190       }
1191       break;
1192    case SV_TID:
1193       if (idx == 0) {
1194          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
1195       } else if (idx == 1) {
1196          bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
1197          bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
1198       } else if (idx == 2) {
1199          bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
1200       } else {
1201          bld.mkMov(def, bld.mkImm(0));
1202       }
1203       break;
1204    case SV_SAMPLE_POS: {
1205       Value *off = new_LValue(func, FILE_ADDRESS);
1206       bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
1207       bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
1208       bld.mkLoad(TYPE_F32,
1209                  def,
1210                  bld.mkSymbol(
1211                        FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
1212                        TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
1213                  off);
1214       break;
1215    }
1216    default:
1217       bld.mkFetch(i->getDef(0), i->dType,
1218                   FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
1219       break;
1220    }
1221    bld.getBB()->remove(i);
1222    return true;
1223 }
1224
1225 bool
1226 NV50LoweringPreSSA::handleDIV(Instruction *i)
1227 {
1228    if (!isFloatType(i->dType))
1229       return true;
1230    bld.setPosition(i, false);
1231    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
1232    i->op = OP_MUL;
1233    i->setSrc(1, rcp->getDef(0));
1234    return true;
1235 }
1236
1237 bool
1238 NV50LoweringPreSSA::handleSQRT(Instruction *i)
1239 {
1240    bld.setPosition(i, true);
1241    i->op = OP_RSQ;
1242    bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
1243
1244    return true;
1245 }
1246
1247 bool
1248 NV50LoweringPreSSA::handlePOW(Instruction *i)
1249 {
1250    LValue *val = bld.getScratch();
1251
1252    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
1253    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
1254    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
1255
1256    i->op = OP_EX2;
1257    i->setSrc(0, val);
1258    i->setSrc(1, NULL);
1259
1260    return true;
1261 }
1262
1263 bool
1264 NV50LoweringPreSSA::handleEXPORT(Instruction *i)
1265 {
1266    if (prog->getType() == Program::TYPE_FRAGMENT) {
1267       if (i->getIndirect(0, 0)) {
1268          // TODO: redirect to l[] here, load to GPRs at exit
1269          return false;
1270       } else {
1271          int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
1272
1273          i->op = OP_MOV;
1274          i->subOp = NV50_IR_SUBOP_MOV_FINAL;
1275          i->src(0).set(i->src(1));
1276          i->setSrc(1, NULL);
1277          i->setDef(0, new_LValue(func, FILE_GPR));
1278          i->getDef(0)->reg.data.id = id;
1279
1280          prog->maxGPR = MAX2(prog->maxGPR, id * 2);
1281       }
1282    }
1283    return true;
1284 }
1285
1286 // Handle indirect addressing in geometry shaders:
1287 //
1288 // ld $r0 a[$a1][$a2+k] ->
1289 // ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
1290 //
1291 bool
1292 NV50LoweringPreSSA::handleLOAD(Instruction *i)
1293 {
1294    ValueRef src = i->src(0);
1295
1296    if (src.isIndirect(1)) {
1297       assert(prog->getType() == Program::TYPE_GEOMETRY);
1298       Value *addr = i->getIndirect(0, 1);
1299
1300       if (src.isIndirect(0)) {
1301          // base address is in an address register, so move to a GPR
1302          Value *base = bld.getScratch();
1303          bld.mkMov(base, addr);
1304
1305          Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
1306          Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
1307          Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1308                                     i->getIndirect(0, 0), bld.mkImm(2));
1309
1310          // Calculate final address: addr = base + attr*vstride; use 16-bit
1311          // multiplication since 32-bit would be lowered to multiple
1312          // instructions, and we only need the low 16 bits of the result
1313          Value *a[2], *b[2];
1314          bld.mkSplit(a, 2, attrib);
1315          bld.mkSplit(b, 2, vstride);
1316          Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
1317                                  base);
1318
1319          // move address from GPR into an address register
1320          addr = bld.getSSA(2, FILE_ADDRESS);
1321          bld.mkMov(addr, sum);
1322       }
1323
1324       i->setIndirect(0, 1, NULL);
1325       i->setIndirect(0, 0, addr);
1326    }
1327
1328    return true;
1329 }
1330
1331 bool
1332 NV50LoweringPreSSA::handlePFETCH(Instruction *i)
1333 {
1334    assert(prog->getType() == Program::TYPE_GEOMETRY);
1335
1336    // NOTE: cannot use getImmediate here, not in SSA form yet, move to
1337    // later phase if that assertion ever triggers:
1338
1339    ImmediateValue *imm = i->getSrc(0)->asImm();
1340    assert(imm);
1341
1342    assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens
1343
1344    if (i->srcExists(1)) {
1345       // indirect addressing of vertex in primitive space
1346
1347       LValue *val = bld.getScratch();
1348       Value *ptr = bld.getSSA(2, FILE_ADDRESS);
1349       bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
1350       bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);
1351
1352       // NOTE: PFETCH directly to an $aX only works with direct addressing
1353       i->op = OP_SHL;
1354       i->setSrc(0, val);
1355       i->setSrc(1, bld.mkImm(0));
1356    }
1357
1358    return true;
1359 }
1360
1361 // Set flags according to predicate and make the instruction read $cX.
1362 void
1363 NV50LoweringPreSSA::checkPredicate(Instruction *insn)
1364 {
1365    Value *pred = insn->getPredicate();
1366    Value *cdst;
1367
1368    // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
1369    if (!pred ||
1370        pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
1371       return;
1372
1373    cdst = bld.getSSA(1, FILE_FLAGS);
1374
1375    bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
1376
1377    insn->setPredicate(insn->cc, cdst);
1378 }
1379
1380 //
1381 // - add quadop dance for texturing
1382 // - put FP outputs in GPRs
1383 // - convert instruction sequences
1384 //
1385 bool
1386 NV50LoweringPreSSA::visit(Instruction *i)
1387 {
1388    bld.setPosition(i, false);
1389
1390    if (i->cc != CC_ALWAYS)
1391       checkPredicate(i);
1392
1393    switch (i->op) {
1394    case OP_TEX:
1395    case OP_TXF:
1396    case OP_TXG:
1397       return handleTEX(i->asTex());
1398    case OP_TXB:
1399       return handleTXB(i->asTex());
1400    case OP_TXL:
1401       return handleTXL(i->asTex());
1402    case OP_TXD:
1403       return handleTXD(i->asTex());
1404    case OP_TXLQ:
1405       return handleTXLQ(i->asTex());
1406    case OP_TXQ:
1407       return handleTXQ(i->asTex());
1408    case OP_EX2:
1409       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
1410       i->setSrc(0, i->getDef(0));
1411       break;
1412    case OP_SET:
1413       return handleSET(i);
1414    case OP_SLCT:
1415       return handleSLCT(i->asCmp());
1416    case OP_SELP:
1417       return handleSELP(i);
1418    case OP_POW:
1419       return handlePOW(i);
1420    case OP_DIV:
1421       return handleDIV(i);
1422    case OP_SQRT:
1423       return handleSQRT(i);
1424    case OP_EXPORT:
1425       return handleEXPORT(i);
1426    case OP_LOAD:
1427       return handleLOAD(i);
1428    case OP_RDSV:
1429       return handleRDSV(i);
1430    case OP_WRSV:
1431       return handleWRSV(i);
1432    case OP_CALL:
1433       return handleCALL(i);
1434    case OP_PRECONT:
1435       return handlePRECONT(i);
1436    case OP_CONT:
1437       return handleCONT(i);
1438    case OP_PFETCH:
1439       return handlePFETCH(i);
1440    default:
1441       break;
1442    }
1443    return true;
1444 }
1445
1446 bool
1447 TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
1448 {
1449    bool ret = false;
1450
1451    if (stage == CG_STAGE_PRE_SSA) {
1452       NV50LoweringPreSSA pass(prog);
1453       ret = pass.run(prog, false, true);
1454    } else
1455    if (stage == CG_STAGE_SSA) {
1456       if (!prog->targetPriv)
1457          prog->targetPriv = new std::list<Instruction *>();
1458       NV50LegalizeSSA pass(prog);
1459       ret = pass.run(prog, false, true);
1460    } else
1461    if (stage == CG_STAGE_POST_RA) {
1462       NV50LegalizePostRA pass;
1463       ret = pass.run(prog, false, true);
1464       if (prog->targetPriv)
1465          delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
1466    }
1467    return ret;
1468 }
1469
1470 } // namespace nv50_ir