nv50/ir: fix ConstantFolding::createMul for 64 bit muls
[mesa.git] / src / gallium / drivers / nouveau / codegen / nv50_ir_peephole.cpp
1 /*
2 * Copyright 2011 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23 #include "codegen/nv50_ir.h"
24 #include "codegen/nv50_ir_target.h"
25 #include "codegen/nv50_ir_build_util.h"
26
27 extern "C" {
28 #include "util/u_math.h"
29 }
30
31 namespace nv50_ir {
32
33 bool
34 Instruction::isNop() const
35 {
36 if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
37 return true;
38 if (terminator || join) // XXX: should terminator imply flow ?
39 return false;
40 if (op == OP_ATOM)
41 return false;
42 if (!fixed && op == OP_NOP)
43 return true;
44
45 if (defExists(0) && def(0).rep()->reg.data.id < 0) {
46 for (int d = 1; defExists(d); ++d)
47 if (def(d).rep()->reg.data.id >= 0)
48 WARN("part of vector result is unused !\n");
49 return true;
50 }
51
52 if (op == OP_MOV || op == OP_UNION) {
53 if (!getDef(0)->equals(getSrc(0)))
54 return false;
55 if (op == OP_UNION)
56 if (!def(0).rep()->equals(getSrc(1)))
57 return false;
58 return true;
59 }
60
61 return false;
62 }
63
64 bool Instruction::isDead() const
65 {
66 if (op == OP_STORE ||
67 op == OP_EXPORT ||
68 op == OP_ATOM ||
69 op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB ||
70 op == OP_WRSV)
71 return false;
72
73 for (int d = 0; defExists(d); ++d)
74 if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
75 return false;
76
77 if (terminator || asFlow())
78 return false;
79 if (fixed)
80 return false;
81
82 return true;
83 }
84
85 // =============================================================================
86
87 class CopyPropagation : public Pass
88 {
89 private:
90 virtual bool visit(BasicBlock *);
91 };
92
93 // Propagate all MOVs forward to make subsequent optimization easier, except if
94 // the sources stem from a phi, in which case we don't want to mess up potential
95 // swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
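// Illustrative example (editorial, not from the original source):
//   mov %r1, %r0
//   add %r2, %r1, %r3   ->   add %r2, %r0, %r3
// after which the MOV is dead and gets removed by DCE.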
96 bool
97 CopyPropagation::visit(BasicBlock *bb)
98 {
99 Instruction *mov, *si, *next;
100
101 for (mov = bb->getEntry(); mov; mov = next) {
102 next = mov->next;
103 if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
104 continue;
105 if (mov->getPredicate())
106 continue;
107 if (mov->def(0).getFile() != mov->src(0).getFile())
108 continue;
109 si = mov->getSrc(0)->getInsn();
110 if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
111 // propagate
112 mov->def(0).replace(mov->getSrc(0), false);
113 delete_Instruction(prog, mov);
114 }
115 }
116 return true;
117 }
118
119 // =============================================================================
120
121 class MergeSplits : public Pass
122 {
123 private:
124 virtual bool visit(BasicBlock *);
125 };
126
127 // For SPLIT / MERGE pairs that operate on the same registers, replace the
128 // post-merge def with the SPLIT's source.
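// Illustrative pattern (editorial):
//   split %b0:%b1 = %a (64 bit) ; merge %c = %b0:%b1
// after which all uses of %c simply refer to %a.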
129 bool
130 MergeSplits::visit(BasicBlock *bb)
131 {
132 Instruction *i, *next, *si;
133
134 for (i = bb->getEntry(); i; i = next) {
135 next = i->next;
136 if (i->op != OP_MERGE || typeSizeof(i->dType) != 8)
137 continue;
138 si = i->getSrc(0)->getInsn();
139 if (si->op != OP_SPLIT || si != i->getSrc(1)->getInsn())
140 continue;
141 i->def(0).replace(si->getSrc(0), false);
142 delete_Instruction(prog, i);
143 }
144
145 return true;
146 }
147
148 // =============================================================================
149
150 class LoadPropagation : public Pass
151 {
152 private:
153 virtual bool visit(BasicBlock *);
154
155 void checkSwapSrc01(Instruction *);
156
157 bool isCSpaceLoad(Instruction *);
158 bool isImmdLoad(Instruction *);
159 bool isAttribOrSharedLoad(Instruction *);
160 };
161
162 bool
163 LoadPropagation::isCSpaceLoad(Instruction *ld)
164 {
165 return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
166 }
167
168 bool
169 LoadPropagation::isImmdLoad(Instruction *ld)
170 {
171 if (!ld || (ld->op != OP_MOV) ||
172 ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
173 return false;
174
175 // A 0 can be replaced with a register, so it doesn't count as an immediate.
176 ImmediateValue val;
177 return ld->src(0).getImmediate(val) && !val.isInteger(0);
178 }
179
180 bool
181 LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
182 {
183 return ld &&
184 (ld->op == OP_VFETCH ||
185 (ld->op == OP_LOAD &&
186 (ld->src(0).getFile() == FILE_SHADER_INPUT ||
187 ld->src(0).getFile() == FILE_MEMORY_SHARED)));
188 }
189
190 void
191 LoadPropagation::checkSwapSrc01(Instruction *insn)
192 {
193 const Target *targ = prog->getTarget();
194 if (!targ->getOpInfo(insn).commutative) {
195 if (insn->op != OP_SET && insn->op != OP_SLCT &&
196 insn->op != OP_SUB && insn->op != OP_XMAD)
197 return;
198 // XMAD is only commutative if neither the CBCC nor the MRG flag is set.
199 if (insn->op == OP_XMAD &&
200 (insn->subOp & NV50_IR_SUBOP_XMAD_CMODE_MASK) == NV50_IR_SUBOP_XMAD_CBCC)
201 return;
202 if (insn->op == OP_XMAD && (insn->subOp & NV50_IR_SUBOP_XMAD_MRG))
203 return;
204 }
205 if (insn->src(1).getFile() != FILE_GPR)
206 return;
207 // This is the special OP_SET used for alphatesting; we can't reverse its
208 // arguments, as that would confuse the fixup code.
209 if (insn->op == OP_SET && insn->subOp)
210 return;
211
212 Instruction *i0 = insn->getSrc(0)->getInsn();
213 Instruction *i1 = insn->getSrc(1)->getInsn();
214
215 // Swap sources to inline the less frequently used source. That way,
216 // optimistically, the instruction producing it can eventually be removed.
217 int i0refs = insn->getSrc(0)->refCount();
218 int i1refs = insn->getSrc(1)->refCount();
219
220 if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
221 if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
222 !targ->insnCanLoad(insn, 1, i1) ||
223 i0refs < i1refs)
224 insn->swapSources(0, 1);
225 else
226 return;
227 } else
228 if (isAttribOrSharedLoad(i1)) {
229 if (!isAttribOrSharedLoad(i0))
230 insn->swapSources(0, 1);
231 else
232 return;
233 } else {
234 return;
235 }
236
237 if (insn->op == OP_SET || insn->op == OP_SET_AND ||
238 insn->op == OP_SET_OR || insn->op == OP_SET_XOR)
239 insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
240 else
241 if (insn->op == OP_SLCT)
242 insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
243 else
244 if (insn->op == OP_SUB) {
245 insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
246 insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
247 } else
248 if (insn->op == OP_XMAD) {
249 // swap h1 flags
250 uint16_t h1 = (insn->subOp >> 1 & NV50_IR_SUBOP_XMAD_H1(0)) |
251 (insn->subOp << 1 & NV50_IR_SUBOP_XMAD_H1(1));
252 insn->subOp = (insn->subOp & ~NV50_IR_SUBOP_XMAD_H1_MASK) | h1;
253 }
254 }
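// Illustrative effect (editorial, with made-up operands): a compare such as
//   set lt $r1, c0[0x0], $r0
// can become
//   set gt $r1, $r0, c0[0x0]
// the condition is reversed so the result is unchanged, while the
// constant-buffer operand moves into source slot 1 where it can be inlined.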
255
256 bool
257 LoadPropagation::visit(BasicBlock *bb)
258 {
259 const Target *targ = prog->getTarget();
260 Instruction *next;
261
262 for (Instruction *i = bb->getEntry(); i; i = next) {
263 next = i->next;
264
265 if (i->op == OP_CALL) // calls have args as sources, they must be in regs
266 continue;
267
268 if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
269 continue;
270
271 if (i->srcExists(1))
272 checkSwapSrc01(i);
273
274 for (int s = 0; i->srcExists(s); ++s) {
275 Instruction *ld = i->getSrc(s)->getInsn();
276
277 if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
278 continue;
279 if (!targ->insnCanLoad(i, s, ld))
280 continue;
281
282 // propagate !
283 i->setSrc(s, ld->getSrc(0));
284 if (ld->src(0).isIndirect(0))
285 i->setIndirect(s, 0, ld->getIndirect(0, 0));
286
287 if (ld->getDef(0)->refCount() == 0)
288 delete_Instruction(prog, ld);
289 }
290 }
291 return true;
292 }
293
294 // =============================================================================
295
296 class IndirectPropagation : public Pass
297 {
298 private:
299 virtual bool visit(BasicBlock *);
300
301 BuildUtil bld;
302 };
303
304 bool
305 IndirectPropagation::visit(BasicBlock *bb)
306 {
307 const Target *targ = prog->getTarget();
308 Instruction *next;
309
310 for (Instruction *i = bb->getEntry(); i; i = next) {
311 next = i->next;
312
313 bld.setPosition(i, false);
314
315 for (int s = 0; i->srcExists(s); ++s) {
316 Instruction *insn;
317 ImmediateValue imm;
318 if (!i->src(s).isIndirect(0))
319 continue;
320 insn = i->getIndirect(s, 0)->getInsn();
321 if (!insn)
322 continue;
323 if (insn->op == OP_ADD && !isFloatType(insn->dType)) {
324 if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
325 !insn->src(1).getImmediate(imm) ||
326 !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
327 continue;
328 i->setIndirect(s, 0, insn->getSrc(0));
329 i->setSrc(s, cloneShallow(func, i->getSrc(s)));
330 i->src(s).get()->reg.data.offset += imm.reg.data.u32;
331 } else if (insn->op == OP_SUB && !isFloatType(insn->dType)) {
332 if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
333 !insn->src(1).getImmediate(imm) ||
334 !targ->insnCanLoadOffset(i, s, -imm.reg.data.s32))
335 continue;
336 i->setIndirect(s, 0, insn->getSrc(0));
337 i->setSrc(s, cloneShallow(func, i->getSrc(s)));
338 i->src(s).get()->reg.data.offset -= imm.reg.data.u32;
339 } else if (insn->op == OP_MOV) {
340 if (!insn->src(0).getImmediate(imm) ||
341 !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
342 continue;
343 i->setIndirect(s, 0, NULL);
344 i->setSrc(s, cloneShallow(func, i->getSrc(s)));
345 i->src(s).get()->reg.data.offset += imm.reg.data.u32;
346 } else if (insn->op == OP_SHLADD) {
347 if (!insn->src(2).getImmediate(imm) ||
348 !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
349 continue;
350 i->setIndirect(s, 0, bld.mkOp2v(
351 OP_SHL, TYPE_U32, bld.getSSA(), insn->getSrc(0), insn->getSrc(1)));
352 i->setSrc(s, cloneShallow(func, i->getSrc(s)));
353 i->src(s).get()->reg.data.offset += imm.reg.data.u32;
354 }
355 }
356 }
357 return true;
358 }
359
360 // =============================================================================
361
362 // Evaluate constant expressions.
363 class ConstantFolding : public Pass
364 {
365 public:
366 bool foldAll(Program *);
367
368 private:
369 virtual bool visit(BasicBlock *);
370
371 void expr(Instruction *, ImmediateValue&, ImmediateValue&);
372 void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
373 void opnd(Instruction *, ImmediateValue&, int s);
374 void opnd3(Instruction *, ImmediateValue&);
375
376 void unary(Instruction *, const ImmediateValue&);
377
378 void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);
379
380 CmpInstruction *findOriginForTestWithZero(Value *);
381
382 bool createMul(DataType ty, Value *def, Value *a, int64_t b, Value *c);
383
384 unsigned int foldCount;
385
386 BuildUtil bld;
387 };
388
389 // TODO: remember generated immediates and only revisit these
390 bool
391 ConstantFolding::foldAll(Program *prog)
392 {
393 unsigned int iterCount = 0;
394 do {
395 foldCount = 0;
396 if (!run(prog))
397 return false;
398 } while (foldCount && ++iterCount < 2);
399 return true;
400 }
401
402 bool
403 ConstantFolding::visit(BasicBlock *bb)
404 {
405 Instruction *i, *next;
406
407 for (i = bb->getEntry(); i; i = next) {
408 next = i->next;
409 if (i->op == OP_MOV || i->op == OP_CALL)
410 continue;
411
412 ImmediateValue src0, src1, src2;
413
414 if (i->srcExists(2) &&
415 i->src(0).getImmediate(src0) &&
416 i->src(1).getImmediate(src1) &&
417 i->src(2).getImmediate(src2))
418 expr(i, src0, src1, src2);
419 else
420 if (i->srcExists(1) &&
421 i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
422 expr(i, src0, src1);
423 else
424 if (i->srcExists(0) && i->src(0).getImmediate(src0))
425 opnd(i, src0, 0);
426 else
427 if (i->srcExists(1) && i->src(1).getImmediate(src1))
428 opnd(i, src1, 1);
429 if (i->srcExists(2) && i->src(2).getImmediate(src2))
430 opnd3(i, src2);
431 }
432 return true;
433 }
434
435 CmpInstruction *
436 ConstantFolding::findOriginForTestWithZero(Value *value)
437 {
438 if (!value)
439 return NULL;
440 Instruction *insn = value->getInsn();
441 if (!insn)
442 return NULL;
443
444 if (insn->asCmp() && insn->op != OP_SLCT)
445 return insn->asCmp();
446
447 /* Sometimes mov's will sneak in as a result of other folding. This gets
448 * cleaned up later.
449 */
450 if (insn->op == OP_MOV)
451 return findOriginForTestWithZero(insn->getSrc(0));
452
453 /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
454 if (insn->op == OP_AND) {
455 int s = 0;
456 ImmediateValue imm;
457 if (!insn->src(s).getImmediate(imm)) {
458 s = 1;
459 if (!insn->src(s).getImmediate(imm))
460 return NULL;
461 }
462 if (imm.reg.data.f32 != 1.0f)
463 return NULL;
464 /* TODO: Come up with a way to handle the condition being inverted */
465 if (insn->src(!s).mod != Modifier(0))
466 return NULL;
467 return findOriginForTestWithZero(insn->getSrc(!s));
468 }
469
470 return NULL;
471 }
472
473 void
474 Modifier::applyTo(ImmediateValue& imm) const
475 {
476 if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. b128)
477 return;
478 switch (imm.reg.type) {
479 case TYPE_F32:
480 if (bits & NV50_IR_MOD_ABS)
481 imm.reg.data.f32 = fabsf(imm.reg.data.f32);
482 if (bits & NV50_IR_MOD_NEG)
483 imm.reg.data.f32 = -imm.reg.data.f32;
484 if (bits & NV50_IR_MOD_SAT) {
485 if (imm.reg.data.f32 < 0.0f)
486 imm.reg.data.f32 = 0.0f;
487 else
488 if (imm.reg.data.f32 > 1.0f)
489 imm.reg.data.f32 = 1.0f;
490 }
491 assert(!(bits & NV50_IR_MOD_NOT));
492 break;
493
494 case TYPE_S8: // NOTE: will be extended
495 case TYPE_S16:
496 case TYPE_S32:
497 case TYPE_U8: // NOTE: treated as signed
498 case TYPE_U16:
499 case TYPE_U32:
500 if (bits & NV50_IR_MOD_ABS)
501 imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
502 imm.reg.data.s32 : -imm.reg.data.s32;
503 if (bits & NV50_IR_MOD_NEG)
504 imm.reg.data.s32 = -imm.reg.data.s32;
505 if (bits & NV50_IR_MOD_NOT)
506 imm.reg.data.s32 = ~imm.reg.data.s32;
507 break;
508
509 case TYPE_F64:
510 if (bits & NV50_IR_MOD_ABS)
511 imm.reg.data.f64 = fabs(imm.reg.data.f64);
512 if (bits & NV50_IR_MOD_NEG)
513 imm.reg.data.f64 = -imm.reg.data.f64;
514 if (bits & NV50_IR_MOD_SAT) {
515 if (imm.reg.data.f64 < 0.0)
516 imm.reg.data.f64 = 0.0;
517 else
518 if (imm.reg.data.f64 > 1.0)
519 imm.reg.data.f64 = 1.0;
520 }
521 assert(!(bits & NV50_IR_MOD_NOT));
522 break;
523
524 default:
525 assert(!"invalid/unhandled type");
526 imm.reg.data.u64 = 0;
527 break;
528 }
529 }
530
531 operation
532 Modifier::getOp() const
533 {
534 switch (bits) {
535 case NV50_IR_MOD_ABS: return OP_ABS;
536 case NV50_IR_MOD_NEG: return OP_NEG;
537 case NV50_IR_MOD_SAT: return OP_SAT;
538 case NV50_IR_MOD_NOT: return OP_NOT;
539 case 0:
540 return OP_MOV;
541 default:
542 return OP_CVT;
543 }
544 }
545
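// Illustrative fold (editorial): "add u32 %r0, 0x3, 0x4" has two immediate
// sources, so expr() below rewrites it in place to "mov %r0, 0x7".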
546 void
547 ConstantFolding::expr(Instruction *i,
548 ImmediateValue &imm0, ImmediateValue &imm1)
549 {
550 struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
551 struct Storage res;
552 DataType type = i->dType;
553
554 memset(&res.data, 0, sizeof(res.data));
555
556 switch (i->op) {
557 case OP_MAD:
558 case OP_FMA:
559 case OP_MUL:
560 if (i->dnz && i->dType == TYPE_F32) {
561 if (!isfinite(a->data.f32))
562 a->data.f32 = 0.0f;
563 if (!isfinite(b->data.f32))
564 b->data.f32 = 0.0f;
565 }
566 switch (i->dType) {
567 case TYPE_F32:
568 res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor);
569 break;
570 case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
571 case TYPE_S32:
572 if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
573 res.data.s32 = ((int64_t)a->data.s32 * b->data.s32) >> 32;
574 break;
575 }
576 /* fallthrough */
577 case TYPE_U32:
578 if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
579 res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32) >> 32;
580 break;
581 }
582 res.data.u32 = a->data.u32 * b->data.u32; break;
583 default:
584 return;
585 }
586 break;
587 case OP_DIV:
588 if (b->data.u32 == 0)
589 break;
590 switch (i->dType) {
591 case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
592 case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
593 case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
594 case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
595 default:
596 return;
597 }
598 break;
599 case OP_ADD:
600 switch (i->dType) {
601 case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
602 case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
603 case TYPE_S32:
604 case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
605 default:
606 return;
607 }
608 break;
609 case OP_SUB:
610 switch (i->dType) {
611 case TYPE_F32: res.data.f32 = a->data.f32 - b->data.f32; break;
612 case TYPE_F64: res.data.f64 = a->data.f64 - b->data.f64; break;
613 case TYPE_S32:
614 case TYPE_U32: res.data.u32 = a->data.u32 - b->data.u32; break;
615 default:
616 return;
617 }
618 break;
619 case OP_POW:
620 switch (i->dType) {
621 case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
622 case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
623 default:
624 return;
625 }
626 break;
627 case OP_MAX:
628 switch (i->dType) {
629 case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
630 case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
631 case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
632 case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
633 default:
634 return;
635 }
636 break;
637 case OP_MIN:
638 switch (i->dType) {
639 case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
640 case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
641 case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
642 case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
643 default:
644 return;
645 }
646 break;
647 case OP_AND:
648 res.data.u64 = a->data.u64 & b->data.u64;
649 break;
650 case OP_OR:
651 res.data.u64 = a->data.u64 | b->data.u64;
652 break;
653 case OP_XOR:
654 res.data.u64 = a->data.u64 ^ b->data.u64;
655 break;
656 case OP_SHL:
657 res.data.u32 = a->data.u32 << b->data.u32;
658 break;
659 case OP_SHR:
660 switch (i->dType) {
661 case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
662 case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
663 default:
664 return;
665 }
666 break;
667 case OP_SLCT:
668 if (a->data.u32 != b->data.u32)
669 return;
670 res.data.u32 = a->data.u32;
671 break;
672 case OP_EXTBF: {
673 int offset = b->data.u32 & 0xff;
674 int width = (b->data.u32 >> 8) & 0xff;
675 int rshift = offset;
676 int lshift = 0;
677 if (width == 0) {
678 res.data.u32 = 0;
679 break;
680 }
681 if (width + offset < 32) {
682 rshift = 32 - width;
683 lshift = 32 - width - offset;
684 }
685 if (i->subOp == NV50_IR_SUBOP_EXTBF_REV)
686 res.data.u32 = util_bitreverse(a->data.u32);
687 else
688 res.data.u32 = a->data.u32;
689 switch (i->dType) {
690 case TYPE_S32: res.data.s32 = (res.data.s32 << lshift) >> rshift; break;
691 case TYPE_U32: res.data.u32 = (res.data.u32 << lshift) >> rshift; break;
692 default:
693 return;
694 }
695 break;
696 }
697 case OP_POPCNT:
698 res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
699 break;
700 case OP_PFETCH:
701 // The two arguments to pfetch are logically added together. Normally
702 // the second argument will not be constant, but that can happen.
703 res.data.u32 = a->data.u32 + b->data.u32;
704 type = TYPE_U32;
705 break;
706 case OP_MERGE:
707 switch (i->dType) {
708 case TYPE_U64:
709 case TYPE_S64:
710 case TYPE_F64:
711 res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
712 break;
713 default:
714 return;
715 }
716 break;
717 default:
718 return;
719 }
720 ++foldCount;
721
722 i->src(0).mod = Modifier(0);
723 i->src(1).mod = Modifier(0);
724 i->postFactor = 0;
725
726 i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
727 i->setSrc(1, NULL);
728
729 i->getSrc(0)->reg.data = res.data;
730 i->getSrc(0)->reg.type = type;
731 i->getSrc(0)->reg.size = typeSizeof(type);
732
733 switch (i->op) {
734 case OP_MAD:
735 case OP_FMA: {
736 ImmediateValue src0, src1 = *i->getSrc(0)->asImm();
737
738 // Move the immediate into position 1, where we know it might be
739 // emittable. However, it might not be emittable anyway, as there may be
740 // other restrictions, so move it into a separate LValue.
741 bld.setPosition(i, false);
742 i->op = OP_ADD;
743 i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
744 i->setSrc(0, i->getSrc(2));
745 i->src(0).mod = i->src(2).mod;
746 i->setSrc(2, NULL);
747
748 if (i->src(0).getImmediate(src0))
749 expr(i, src0, src1);
750 else
751 opnd(i, src1, 1);
752 break;
753 }
754 case OP_PFETCH:
755 // Leave PFETCH alone... we just folded its 2 args into 1.
756 break;
757 default:
758 i->op = i->saturate ? OP_SAT : OP_MOV;
759 if (i->saturate)
760 unary(i, *i->getSrc(0)->asImm());
761 break;
762 }
763 i->subOp = 0;
764 }
765
766 void
767 ConstantFolding::expr(Instruction *i,
768 ImmediateValue &imm0,
769 ImmediateValue &imm1,
770 ImmediateValue &imm2)
771 {
772 struct Storage *const a = &imm0.reg, *const b = &imm1.reg, *const c = &imm2.reg;
773 struct Storage res;
774
775 memset(&res.data, 0, sizeof(res.data));
776
777 switch (i->op) {
778 case OP_INSBF: {
779 int offset = b->data.u32 & 0xff;
780 int width = (b->data.u32 >> 8) & 0xff;
781 unsigned bitmask = ((1 << width) - 1) << offset;
782 res.data.u32 = ((a->data.u32 << offset) & bitmask) | (c->data.u32 & ~bitmask);
783 break;
784 }
785 case OP_MAD:
786 case OP_FMA: {
787 switch (i->dType) {
788 case TYPE_F32:
789 res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor) +
790 c->data.f32;
791 break;
792 case TYPE_F64:
793 res.data.f64 = a->data.f64 * b->data.f64 + c->data.f64;
794 break;
795 case TYPE_S32:
796 if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
797 res.data.s32 = ((int64_t)a->data.s32 * b->data.s32 >> 32) + c->data.s32;
798 break;
799 }
800 /* fallthrough */
801 case TYPE_U32:
802 if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
803 res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32 >> 32) + c->data.u32;
804 break;
805 }
806 res.data.u32 = a->data.u32 * b->data.u32 + c->data.u32;
807 break;
808 default:
809 return;
810 }
811 break;
812 }
813 case OP_SHLADD:
814 res.data.u32 = (a->data.u32 << b->data.u32) + c->data.u32;
815 break;
816 default:
817 return;
818 }
819
820 ++foldCount;
821 i->src(0).mod = Modifier(0);
822 i->src(1).mod = Modifier(0);
823 i->src(2).mod = Modifier(0);
824
825 i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
826 i->setSrc(1, NULL);
827 i->setSrc(2, NULL);
828
829 i->getSrc(0)->reg.data = res.data;
830 i->getSrc(0)->reg.type = i->dType;
831 i->getSrc(0)->reg.size = typeSizeof(i->dType);
832
833 i->op = OP_MOV;
834 }
835
836 void
837 ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
838 {
839 Storage res;
840
841 if (i->dType != TYPE_F32)
842 return;
843 switch (i->op) {
844 case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
845 case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
846 case OP_SAT: res.data.f32 = CLAMP(imm.reg.data.f32, 0.0f, 1.0f); break;
847 case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
848 case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
849 case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
850 case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
851 case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
852 case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
853 case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
854 case OP_PRESIN:
855 case OP_PREEX2:
856 // these should be handled in subsequent OP_SIN/COS/EX2
857 res.data.f32 = imm.reg.data.f32;
858 break;
859 default:
860 return;
861 }
862 i->op = OP_MOV;
863 i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
864 i->src(0).mod = Modifier(0);
865 }
866
867 void
868 ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
869 const int s, ImmediateValue& imm2)
870 {
871 const int t = s ? 0 : 1;
872 Instruction *insn;
873 Instruction *mul1 = NULL; // mul1 before mul2
874 int e = 0;
875 float f = imm2.reg.data.f32 * exp2f(mul2->postFactor);
876 ImmediateValue imm1;
877
878 assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);
879
880 if (mul2->getSrc(t)->refCount() == 1) {
881 insn = mul2->getSrc(t)->getInsn();
882 if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
883 mul1 = insn;
884 if (mul1 && !mul1->saturate) {
885 int s1;
886
887 if (mul1->src(s1 = 0).getImmediate(imm1) ||
888 mul1->src(s1 = 1).getImmediate(imm1)) {
889 bld.setPosition(mul1, false);
890 // a = mul r, imm1
891 // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
892 mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
893 mul1->src(s1).mod = Modifier(0);
894 mul2->def(0).replace(mul1->getDef(0), false);
895 mul1->saturate = mul2->saturate;
896 } else
897 if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
898 // c = mul a, b
899 // d = mul c, imm -> d = mul_x_imm a, b
900 mul1->postFactor = e;
901 mul2->def(0).replace(mul1->getDef(0), false);
902 if (f < 0)
903 mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
904 mul1->saturate = mul2->saturate;
905 }
906 return;
907 }
908 }
909 if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
910 // b = mul a, imm
911 // d = mul b, c -> d = mul_x_imm a, c
912 int s2, t2;
913 insn = (*mul2->getDef(0)->uses.begin())->getInsn();
914 if (!insn)
915 return;
916 mul1 = mul2;
917 mul2 = NULL;
918 s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
919 t2 = s2 ? 0 : 1;
920 if (insn->op == OP_MUL && insn->dType == TYPE_F32)
921 if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
922 mul2 = insn;
923 if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
924 mul2->postFactor = e;
925 mul2->setSrc(s2, mul1->src(t));
926 if (f < 0)
927 mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
928 }
929 }
930 }
931
932 void
933 ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
934 {
935 switch (i->op) {
936 case OP_MAD:
937 case OP_FMA:
938 if (imm2.isInteger(0)) {
939 i->op = OP_MUL;
940 i->setSrc(2, NULL);
941 foldCount++;
942 return;
943 }
944 break;
945 case OP_SHLADD:
946 if (imm2.isInteger(0)) {
947 i->op = OP_SHL;
948 i->setSrc(2, NULL);
949 foldCount++;
950 return;
951 }
952 break;
953 default:
954 return;
955 }
956 }
957
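// Editorial overview of the strength reduction below (values illustrative,
// not exhaustive):
//   b = 8                -> def = a << 3
//   b = 5  (2^2 + 1)     -> def = shladd(a, 2, a)       (32-bit types only)
//   b = 7  (2^3 - 1)     -> def = shladd(a, 3, -a)      (32-bit types only)
//   b = -5               -> def = shladd(-a, 2, -a)     (32-bit types only)
//   b in [0, 0xffff]     -> two XMADs, where supported  (32-bit types only)
// Returns false if no pattern applies and the caller must keep the MUL.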
958 bool
959 ConstantFolding::createMul(DataType ty, Value *def, Value *a, int64_t b, Value *c)
960 {
961 const Target *target = prog->getTarget();
962 int64_t absB = llabs(b);
963
964 //a * (2^shl) -> a << shl
965 if (b >= 0 && util_is_power_of_two_or_zero64(b)) {
966 int shl = util_logbase2_64(b);
967
968 Value *res = c ? bld.getSSA(typeSizeof(ty)) : def;
969 bld.mkOp2(OP_SHL, ty, res, a, bld.mkImm(shl));
970 if (c)
971 bld.mkOp2(OP_ADD, ty, def, res, c);
972
973 return true;
974 }
975
976 //a * (2^shl + 1) -> a << shl + a
977 //a * -(2^shl + 1) -> -a << shl - a
978 //a * (2^shl - 1) -> a << shl - a
979 //a * -(2^shl - 1) -> -a << shl + a
980 if (typeSizeof(ty) == 4 &&
981 (util_is_power_of_two_or_zero64(absB - 1) ||
982 util_is_power_of_two_or_zero64(absB + 1)) &&
983 target->isOpSupported(OP_SHLADD, TYPE_U32)) {
984 bool subA = util_is_power_of_two_or_zero64(absB + 1);
985 int shl = subA ? util_logbase2_64(absB + 1) : util_logbase2_64(absB - 1);
986
987 Value *res = c ? bld.getSSA() : def;
988 Instruction *insn = bld.mkOp3(OP_SHLADD, TYPE_U32, res, a, bld.mkImm(shl), a);
989 if (b < 0)
990 insn->src(0).mod = Modifier(NV50_IR_MOD_NEG);
991 if (subA != (b < 0))
992 insn->src(2).mod = Modifier(NV50_IR_MOD_NEG);
993
994 if (c)
995 bld.mkOp2(OP_ADD, TYPE_U32, def, res, c);
996
997 return true;
998 }
999
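// Editorial note on the XMAD pair below (our reading of the subOp flags,
// not a statement from the source): XMAD multiplies two 16-bit halves and
// adds a 32-bit value, so a 32x16 multiply takes two of them. The first
// computes lo16(a) * b + c; the second, with PSL | H1(0), adds
// (hi16(a) * b) << 16 on top, which is exact since b fits in 16 bits.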
1000 if (typeSizeof(ty) == 4 && b >= 0 && b <= 0xffff &&
1001 target->isOpSupported(OP_XMAD, TYPE_U32)) {
1002 Value *tmp = bld.mkOp3v(OP_XMAD, TYPE_U32, bld.getSSA(),
1003 a, bld.mkImm((uint32_t)b), c ? c : bld.mkImm(0));
1004 bld.mkOp3(OP_XMAD, TYPE_U32, def, a, bld.mkImm((uint32_t)b), tmp)->subOp =
1005 NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_H1(0);
1006
1007 return true;
1008 }
1009
1010 return false;
1011 }
1012
1013 void
1014 ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
1015 {
1016 const int t = !s;
1017 const operation op = i->op;
1018 Instruction *newi = i;
1019
1020 switch (i->op) {
1021 case OP_SPLIT: {
1022 bld.setPosition(i, false);
1023
1024 uint8_t size = i->getDef(0)->reg.size;
1025 uint8_t bitsize = size * 8;
1026 uint32_t mask = (1ULL << bitsize) - 1;
1027 assert(bitsize <= 32);
1028
1029 uint64_t val = imm0.reg.data.u64;
1030 for (int8_t d = 0; i->defExists(d); ++d) {
1031 Value *def = i->getDef(d);
1032 assert(def->reg.size == size);
1033
1034 newi = bld.mkMov(def, bld.mkImm((uint32_t)(val & mask)), TYPE_U32);
1035 val >>= bitsize;
1036 }
1037 delete_Instruction(prog, i);
1038 break;
1039 }
1040 case OP_MUL:
1041 if (i->dType == TYPE_F32)
1042 tryCollapseChainedMULs(i, s, imm0);
1043
1044 if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
1045 assert(!isFloatType(i->sType));
1046 if (imm0.isInteger(1) && i->dType == TYPE_S32) {
1047 bld.setPosition(i, false);
1048 // The result is the sign of the source (0 or -1), i.e. a compare with zero.
1049 newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0),
1050 TYPE_S32, i->getSrc(t), bld.mkImm(0));
1051 delete_Instruction(prog, i);
1052 } else if (imm0.isInteger(0) || imm0.isInteger(1)) {
1053 // The high bits can't be set in this case (either mul by 0 or
1054 // unsigned by 1)
1055 i->op = OP_MOV;
1056 i->subOp = 0;
1057 i->setSrc(0, new_ImmediateValue(prog, 0u));
1058 i->src(0).mod = Modifier(0);
1059 i->setSrc(1, NULL);
1060 } else if (!imm0.isNegative() && imm0.isPow2()) {
1061 // Translate into a shift
1062 imm0.applyLog2();
1063 i->op = OP_SHR;
1064 i->subOp = 0;
1065 imm0.reg.data.u32 = 32 - imm0.reg.data.u32;
1066 i->setSrc(0, i->getSrc(t));
1067 i->src(0).mod = i->src(t).mod;
1068 i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
1069 i->src(1).mod = 0;
1070 }
1071 } else
1072 if (imm0.isInteger(0)) {
1073 i->op = OP_MOV;
1074 i->setSrc(0, new_ImmediateValue(prog, 0u));
1075 i->src(0).mod = Modifier(0);
1076 i->postFactor = 0;
1077 i->setSrc(1, NULL);
1078 } else
1079 if (!i->postFactor && (imm0.isInteger(1) || imm0.isInteger(-1))) {
1080 if (imm0.isNegative())
1081 i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
1082 i->op = i->src(t).mod.getOp();
1083 if (s == 0) {
1084 i->setSrc(0, i->getSrc(1));
1085 i->src(0).mod = i->src(1).mod;
1086 i->src(1).mod = 0;
1087 }
1088 if (i->op != OP_CVT)
1089 i->src(0).mod = 0;
1090 i->setSrc(1, NULL);
1091 } else
1092 if (!i->postFactor && (imm0.isInteger(2) || imm0.isInteger(-2))) {
1093 if (imm0.isNegative())
1094 i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
1095 i->op = OP_ADD;
1096 i->setSrc(s, i->getSrc(t));
1097 i->src(s).mod = i->src(t).mod;
1098 } else
1099 if (!isFloatType(i->dType) && !i->src(t).mod) {
1100 bld.setPosition(i, false);
1101 int64_t b = typeSizeof(i->dType) == 8 ? imm0.reg.data.s64 : imm0.reg.data.s32;
1102 if (createMul(i->dType, i->getDef(0), i->getSrc(t), b, NULL))
1103 delete_Instruction(prog, i);
1104 } else
1105 if (i->postFactor && i->sType == TYPE_F32) {
1106 /* Can't emit a postfactor with an immediate, have to fold it in */
1107 i->setSrc(s, new_ImmediateValue(
1108 prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
1109 i->postFactor = 0;
1110 }
1111 break;
1112 case OP_FMA:
1113 case OP_MAD:
1114 if (imm0.isInteger(0)) {
1115 i->setSrc(0, i->getSrc(2));
1116 i->src(0).mod = i->src(2).mod;
1117 i->setSrc(1, NULL);
1118 i->setSrc(2, NULL);
1119 i->op = i->src(0).mod.getOp();
1120 if (i->op != OP_CVT)
1121 i->src(0).mod = 0;
1122 } else
1123 if (i->subOp != NV50_IR_SUBOP_MUL_HIGH &&
1124 (imm0.isInteger(1) || imm0.isInteger(-1))) {
1125 if (imm0.isNegative())
1126 i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
1127 if (s == 0) {
1128 i->setSrc(0, i->getSrc(1));
1129 i->src(0).mod = i->src(1).mod;
1130 }
1131 i->setSrc(1, i->getSrc(2));
1132 i->src(1).mod = i->src(2).mod;
1133 i->setSrc(2, NULL);
1134 i->op = OP_ADD;
1135 } else
1136 if (!isFloatType(i->dType) && !i->subOp && !i->src(t).mod && !i->src(2).mod) {
1137 bld.setPosition(i, false);
1138 int64_t b = typeSizeof(i->dType) == 8 ? imm0.reg.data.s64 : imm0.reg.data.s32;
1139 if (createMul(i->dType, i->getDef(0), i->getSrc(t), b, i->getSrc(2)))
1140 delete_Instruction(prog, i);
1141 }
1142 break;
1143 case OP_SUB:
1144 if (imm0.isInteger(0) && s == 0 && typeSizeof(i->dType) == 8 &&
1145 !isFloatType(i->dType))
1146 break;
1147 /* fallthrough */
1148 case OP_ADD:
1149 if (i->usesFlags())
1150 break;
1151 if (imm0.isInteger(0)) {
1152 if (s == 0) {
1153 i->setSrc(0, i->getSrc(1));
1154 i->src(0).mod = i->src(1).mod;
1155 if (i->op == OP_SUB)
1156 i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
1157 }
1158 i->setSrc(1, NULL);
1159 i->op = i->src(0).mod.getOp();
1160 if (i->op != OP_CVT)
1161 i->src(0).mod = Modifier(0);
1162 }
1163 break;
1164
1165 case OP_DIV:
1166 if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
1167 break;
1168 bld.setPosition(i, false);
1169 if (imm0.reg.data.u32 == 0) {
1170 break;
1171 } else
1172 if (imm0.reg.data.u32 == 1) {
1173 i->op = OP_MOV;
1174 i->setSrc(1, NULL);
1175 } else
1176 if (i->dType == TYPE_U32 && imm0.isPow2()) {
1177 i->op = OP_SHR;
1178 i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
1179 } else
1180 if (i->dType == TYPE_U32) {
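// Editorial sketch (classic round-up reciprocal, cf. Granlund/Montgomery):
// for d = 7 this picks l = 3 and m = 0x24924925, and the emitted code
// computes q = (mulhi(a, m) + ((a - mulhi(a, m)) >> 1)) >> 2.
// E.g. a = 21: mulhi = 3, so q = (3 + (18 >> 1)) >> 2 = 3.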
1181 Instruction *mul;
1182 Value *tA, *tB;
1183 const uint32_t d = imm0.reg.data.u32;
1184 uint32_t m;
1185 int r, s;
1186 uint32_t l = util_logbase2(d);
1187 if (((uint32_t)1 << l) < d)
1188 ++l;
1189 m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
1190 r = l ? 1 : 0;
1191 s = l ? (l - 1) : 0;
1192
1193 tA = bld.getSSA();
1194 tB = bld.getSSA();
1195 mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
1196 bld.loadImm(NULL, m));
1197 mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
1198 bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
1199 tA = bld.getSSA();
1200 if (r)
1201 bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
1202 else
1203 tA = tB;
1204 tB = s ? bld.getSSA() : i->getDef(0);
1205 newi = bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
1206 if (s)
1207 bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));
1208
1209 delete_Instruction(prog, i);
1210 } else
1211 if (imm0.reg.data.s32 == -1) {
1212 i->op = OP_NEG;
1213 i->setSrc(1, NULL);
1214 } else {
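// Editorial sketch of the signed variant: the mad.hi folds the required
// "+ a" into the multiply, the SHR is arithmetic, and the SET/SUB pair
// adds 1 for negative a (SET yields -1, which the SUB cancels out).
// E.g. d = 7, a = 21: m = 0x92492493, mulhi(a, m) + a = 12, 12 >> 2 = 3.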
1215 LValue *tA, *tB;
1216 LValue *tD;
1217 const int32_t d = imm0.reg.data.s32;
1218 int32_t m;
1219 int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
1220 if ((1 << l) < abs(d))
1221 ++l;
1222 if (!l)
1223 l = 1;
1224 m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);
1225
1226 tA = bld.getSSA();
1227 tB = bld.getSSA();
1228 bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
1229 i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
1230 if (l > 1)
1231 bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
1232 else
1233 tB = tA;
1234 tA = bld.getSSA();
1235 bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, TYPE_S32, i->getSrc(0), bld.mkImm(0));
1236 tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
1237 newi = bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
1238 if (d < 0)
1239 bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tD);
1240
1241 delete_Instruction(prog, i);
1242 }
1243 break;
1244
1245 case OP_MOD:
1246 if (s == 1 && imm0.isPow2()) {
1247 bld.setPosition(i, false);
1248 if (i->sType == TYPE_U32) {
1249 i->op = OP_AND;
1250 i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
1251 } else if (i->sType == TYPE_S32) {
1252 // Do it on the absolute value of the input, and then restore the
1253 // sign. The only odd case is MIN_INT, but that should work out
1254 // as well, since MIN_INT mod any power of 2 is 0.
1255 //
1256 // Technically we don't have to do any of this since MOD is
1257 // undefined with negative arguments in GLSL, but this seems like
1258 // the nice thing to do.
1259 Value *abs = bld.mkOp1v(OP_ABS, TYPE_S32, bld.getSSA(), i->getSrc(0));
1260 Value *neg, *v1, *v2;
1261 bld.mkCmp(OP_SET, CC_LT, TYPE_S32,
1262 (neg = bld.getSSA(1, prog->getTarget()->nativeFile(FILE_PREDICATE))),
1263 TYPE_S32, i->getSrc(0), bld.loadImm(NULL, 0));
1264 Value *mod = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), abs,
1265 bld.loadImm(NULL, imm0.reg.data.u32 - 1));
1266 bld.mkOp1(OP_NEG, TYPE_S32, (v1 = bld.getSSA()), mod)
1267 ->setPredicate(CC_P, neg);
1268 bld.mkOp1(OP_MOV, TYPE_S32, (v2 = bld.getSSA()), mod)
1269 ->setPredicate(CC_NOT_P, neg);
1270 newi = bld.mkOp2(OP_UNION, TYPE_S32, i->getDef(0), v1, v2);
1271
1272 delete_Instruction(prog, i);
1273 }
1274 } else if (s == 1) {
1275 // In this case, we still want the optimized lowering that we get
1276 // from having division by an immediate.
1277 //
1278 // a % b == a - (a/b) * b
1279 bld.setPosition(i, false);
1280 Value *div = bld.mkOp2v(OP_DIV, i->sType, bld.getSSA(),
1281 i->getSrc(0), i->getSrc(1));
1282 newi = bld.mkOp2(OP_ADD, i->sType, i->getDef(0), i->getSrc(0),
1283 bld.mkOp2v(OP_MUL, i->sType, bld.getSSA(), div, i->getSrc(1)));
1284 // TODO: Check that target supports this. In this case, we know that
1285 // all backends do.
1286 newi->src(1).mod = Modifier(NV50_IR_MOD_NEG);
1287
1288 delete_Instruction(prog, i);
1289 }
1290 break;
1291
1292 case OP_SET: // TODO: SET_AND,OR,XOR
1293 {
1294 /* This optimizes the case where the output of a set is being compared
1295 * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
1296 * can be a lot cleverer in our comparison.
1297 */
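/* Illustrative case (editorial): "set $r1 lt f32 $r2 $r3" followed by
 * "set $p0 eq u32 $r1 0x0" collapses to "set $p0 ge f32 $r2 $r3",
 * dropping the intermediate boolean entirely.
 */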
1298 CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
1299 CondCode cc, ccZ;
1300 if (imm0.reg.data.u32 != 0 || !si)
1301 return;
1302 cc = si->setCond;
1303 ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
1304 // We do everything assuming var (cmp) 0; reverse the condition if 0
1305 // comes first.
1306 if (s == 0)
1307 ccZ = reverseCondCode(ccZ);
1308 // If there is a negative modifier, we need to undo that, by flipping
1309 // the comparison to zero.
1310 if (i->src(t).mod.neg())
1311 ccZ = reverseCondCode(ccZ);
1312 // If this is a signed comparison, we expect the input to be a regular
1313 // boolean, i.e. 0/-1. However the rest of the logic assumes that true
1314 // is positive, so just flip the sign.
1315 if (i->sType == TYPE_S32) {
1316 assert(!isFloatType(si->dType));
1317 ccZ = reverseCondCode(ccZ);
1318 }
1319 switch (ccZ) {
1320 case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
1321 case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
1322 case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
1323 case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
1324 case CC_GT: break; // bool > 0 -- bool
1325 case CC_NE: break; // bool != 0 -- bool
1326 default:
1327 return;
1328 }
1329
1330 // Update the condition of this SET to be identical to the origin set,
1331 // but with the updated condition code. The original SET should get
1332 // DCE'd, ideally.
1333 i->op = si->op;
1334 i->asCmp()->setCond = cc;
1335 i->setSrc(0, si->src(0));
1336 i->setSrc(1, si->src(1));
1337 if (si->srcExists(2))
1338 i->setSrc(2, si->src(2));
1339 i->sType = si->sType;
1340 }
1341 break;
1342
1343 case OP_AND:
1344 {
1345 Instruction *src = i->getSrc(t)->getInsn();
1346 ImmediateValue imm1;
1347 if (imm0.reg.data.u32 == 0) {
1348 i->op = OP_MOV;
1349 i->setSrc(0, new_ImmediateValue(prog, 0u));
1350 i->src(0).mod = Modifier(0);
1351 i->setSrc(1, NULL);
1352 } else if (imm0.reg.data.u32 == ~0U) {
1353 i->op = i->src(t).mod.getOp();
1354 if (t) {
1355 i->setSrc(0, i->getSrc(t));
1356 i->src(0).mod = i->src(t).mod;
1357 }
1358 i->setSrc(1, NULL);
1359 } else if (src->asCmp()) {
1360 CmpInstruction *cmp = src->asCmp();
1361 if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
1362 return;
1363 if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
1364 return;
1365 if (imm0.reg.data.f32 != 1.0)
1366 return;
1367 if (cmp->dType != TYPE_U32)
1368 return;
1369
1370 cmp->dType = TYPE_F32;
1371 if (i->src(t).mod != Modifier(0)) {
1372 assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
1373 i->src(t).mod = Modifier(0);
1374 cmp->setCond = inverseCondCode(cmp->setCond);
1375 }
1376 i->op = OP_MOV;
1377 i->setSrc(s, NULL);
1378 if (t) {
1379 i->setSrc(0, i->getSrc(t));
1380 i->setSrc(t, NULL);
1381 }
1382 } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
1383 src->op == OP_SHR &&
1384 src->src(1).getImmediate(imm1) &&
1385 i->src(t).mod == Modifier(0) &&
1386 util_is_power_of_two_or_zero(imm0.reg.data.u32 + 1)) {
1387 // low byte = offset, high byte = width
1388 uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
1389 i->op = OP_EXTBF;
1390 i->setSrc(0, src->getSrc(0));
1391 i->setSrc(1, new_ImmediateValue(prog, ext));
1392 } else if (src->op == OP_SHL &&
1393 src->src(1).getImmediate(imm1) &&
1394 i->src(t).mod == Modifier(0) &&
1395 util_is_power_of_two_or_zero(~imm0.reg.data.u32 + 1) &&
1396 util_last_bit(~imm0.reg.data.u32) <= imm1.reg.data.u32) {
1397 i->op = OP_MOV;
1398 i->setSrc(s, NULL);
1399 if (t) {
1400 i->setSrc(0, i->getSrc(t));
1401 i->setSrc(t, NULL);
1402 }
1403 }
1404 }
1405 break;
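// (Editorial examples for the AND folds above: AND(SHR(x, 8), 0xff)
// becomes EXTBF(x, 0x808); AND(SHL(x, 24), 0xff000000) becomes a plain
// MOV of the SHL result, since the mask clears nothing.)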
1406
1407 case OP_SHL:
1408 {
1409 if (s != 1 || i->src(0).mod != Modifier(0))
1410 break;
1411 // try to concatenate shifts
1412 Instruction *si = i->getSrc(0)->getInsn();
1413 if (!si)
1414 break;
1415 ImmediateValue imm1;
1416 switch (si->op) {
1417 case OP_SHL:
1418 if (si->src(1).getImmediate(imm1)) {
1419 bld.setPosition(i, false);
1420 i->setSrc(0, si->getSrc(0));
1421 i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
1422 }
1423 break;
1424 case OP_SHR:
1425 if (si->src(1).getImmediate(imm1) && imm0.reg.data.u32 == imm1.reg.data.u32) {
1426 bld.setPosition(i, false);
1427 i->op = OP_AND;
1428 i->setSrc(0, si->getSrc(0));
1429 i->setSrc(1, bld.loadImm(NULL, ~((1 << imm0.reg.data.u32) - 1)));
1430 }
1431 break;
1432 case OP_MUL:
1433 int muls;
1434 if (isFloatType(si->dType))
1435 return;
1436 if (si->src(1).getImmediate(imm1))
1437 muls = 1;
1438 else if (si->src(0).getImmediate(imm1))
1439 muls = 0;
1440 else
1441 return;
1442
1443 bld.setPosition(i, false);
1444 i->op = OP_MUL;
1445 i->setSrc(0, si->getSrc(!muls));
1446 i->setSrc(1, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
1447 break;
1448 case OP_SUB:
1449 case OP_ADD:
1450 int adds;
1451 if (isFloatType(si->dType))
1452 return;
1453 if (si->op != OP_SUB && si->src(0).getImmediate(imm1))
1454 adds = 0;
1455 else if (si->src(1).getImmediate(imm1))
1456 adds = 1;
1457 else
1458 return;
1459 if (si->src(!adds).mod != Modifier(0))
1460 return;
1461 // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))
1462
1463 // This is more operations, but if one of x, y is an immediate, then
1464 // we can get a situation where (a) we can use ISCADD, or (b) we can
1465 // propagate the add into an indirect load's offset.
1466 bld.setPosition(i, false);
1467 i->op = si->op;
1468 i->setSrc(adds, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
1469 i->setSrc(!adds, bld.mkOp2v(OP_SHL, i->dType,
1470 bld.getSSA(i->def(0).getSize(), i->def(0).getFile()),
1471 si->getSrc(!adds),
1472 bld.mkImm(imm0.reg.data.u32)));
1473 break;
1474 default:
1475 return;
1476 }
1477 }
1478 break;
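// (Editorial examples for the SHL folds above: SHL(SHL(x, 2), 3) becomes
// SHL(x, 5); SHL(SHR(x, 4), 4) becomes AND(x, 0xfffffff0); and
// SHL(ADD(x, 8), 2) becomes ADD(SHL(x, 2), 32).)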
1479
1480 case OP_ABS:
1481 case OP_NEG:
1482 case OP_SAT:
1483 case OP_LG2:
1484 case OP_RCP:
1485 case OP_SQRT:
1486 case OP_RSQ:
1487 case OP_PRESIN:
1488 case OP_SIN:
1489 case OP_COS:
1490 case OP_PREEX2:
1491 case OP_EX2:
1492 unary(i, imm0);
1493 break;
1494 case OP_BFIND: {
1495 int32_t res;
1496 switch (i->dType) {
1497 case TYPE_S32: res = util_last_bit_signed(imm0.reg.data.s32) - 1; break;
1498 case TYPE_U32: res = util_last_bit(imm0.reg.data.u32) - 1; break;
1499 default:
1500 return;
1501 }
1502 if (i->subOp == NV50_IR_SUBOP_BFIND_SAMT && res >= 0)
1503 res = 31 - res;
1504 bld.setPosition(i, false); /* make sure bld is init'ed */
1505 i->setSrc(0, bld.mkImm(res));
1506 i->setSrc(1, NULL);
1507 i->op = OP_MOV;
1508 i->subOp = 0;
1509 break;
1510 }
1511 case OP_POPCNT: {
1512 // Only deal with 1-arg POPCNT here
1513 if (i->srcExists(1))
1514 break;
1515 uint32_t res = util_bitcount(imm0.reg.data.u32);
1516 i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
1517 i->setSrc(1, NULL);
1518 i->op = OP_MOV;
1519 break;
1520 }
1521 case OP_CVT: {
1522 Storage res;
1523
1524 // TODO: handle 64-bit values properly
1525 if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
1526 return;
1527
1528 // TODO: handle single byte/word extractions
1529 if (i->subOp)
1530 return;
1531
1532 bld.setPosition(i, true); /* make sure bld is init'ed */
1533
1534 #define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
1535 case type: \
1536 switch (i->sType) { \
1537 case TYPE_F64: \
1538 res.data.dst = util_iround(i->saturate ? \
1539 CLAMP(imm0.reg.data.f64, fmin, fmax) : \
1540 imm0.reg.data.f64); \
1541 break; \
1542 case TYPE_F32: \
1543 res.data.dst = util_iround(i->saturate ? \
1544 CLAMP(imm0.reg.data.f32, fmin, fmax) : \
1545 imm0.reg.data.f32); \
1546 break; \
1547 case TYPE_S32: \
1548 res.data.dst = i->saturate ? \
1549 CLAMP(imm0.reg.data.s32, imin, imax) : \
1550 imm0.reg.data.s32; \
1551 break; \
1552 case TYPE_U32: \
1553 res.data.dst = i->saturate ? \
1554 CLAMP(imm0.reg.data.u32, umin, umax) : \
1555 imm0.reg.data.u32; \
1556 break; \
1557 case TYPE_S16: \
1558 res.data.dst = i->saturate ? \
1559 CLAMP(imm0.reg.data.s16, imin, imax) : \
1560 imm0.reg.data.s16; \
1561 break; \
1562 case TYPE_U16: \
1563 res.data.dst = i->saturate ? \
1564 CLAMP(imm0.reg.data.u16, umin, umax) : \
1565 imm0.reg.data.u16; \
1566 break; \
1567 default: return; \
1568 } \
1569 i->setSrc(0, bld.mkImm(res.data.dst)); \
1570 break
1571
1572 switch(i->dType) {
1573 CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
1574 CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
1575 CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
1576 CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
1577 case TYPE_F32:
1578 switch (i->sType) {
1579 case TYPE_F64:
1580 res.data.f32 = i->saturate ?
1581 CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
1582 imm0.reg.data.f64;
1583 break;
1584 case TYPE_F32:
1585 res.data.f32 = i->saturate ?
1586 CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
1587 imm0.reg.data.f32;
1588 break;
1589 case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
1590 case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
1591 case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
1592 case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
1593 default:
1594 return;
1595 }
1596 i->setSrc(0, bld.mkImm(res.data.f32));
1597 break;
1598 case TYPE_F64:
1599 switch (i->sType) {
1600 case TYPE_F64:
1601 res.data.f64 = i->saturate ?
1602 CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
1603 imm0.reg.data.f64;
1604 break;
1605 case TYPE_F32:
1606 res.data.f64 = i->saturate ?
1607 CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
1608 imm0.reg.data.f32;
1609 break;
1610 case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
1611 case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
1612 case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
1613 case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
1614 default:
1615 return;
1616 }
1617 i->setSrc(0, bld.mkImm(res.data.f64));
1618 break;
1619 default:
1620 return;
1621 }
1622 #undef CASE
1623
1624 i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
1625 i->op = OP_MOV;
1626 i->saturate = 0;
1627 i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
1628 break;
1629 }
1630 default:
1631 return;
1632 }
1633
1634 // This can get left behind by some of the optimizations which simplify
1635 // saturatable values.
1636 if (newi->op == OP_MOV && newi->saturate) {
1637 ImmediateValue tmp;
1638 newi->saturate = 0;
1639 newi->op = OP_SAT;
1640 if (newi->src(0).getImmediate(tmp))
1641 unary(newi, tmp);
1642 }
1643
1644 if (newi->op != op)
1645 foldCount++;
1646 }
1647
1648 // =============================================================================
1649
1650 // Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
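// Illustrative case (editorial): "neg %r1, %r0; add %r2, %r1, %r3" becomes
// "add %r2, -%r0, %r3" when the target supports a NEG modifier on that source.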
1651 class ModifierFolding : public Pass
1652 {
1653 private:
1654 virtual bool visit(BasicBlock *);
1655 };
1656
1657 bool
1658 ModifierFolding::visit(BasicBlock *bb)
1659 {
1660 const Target *target = prog->getTarget();
1661
1662 Instruction *i, *next, *mi;
1663 Modifier mod;
1664
1665 for (i = bb->getEntry(); i; i = next) {
1666 next = i->next;
1667
1668 if (0 && i->op == OP_SUB) {
1669 // turn "sub" into "add neg" (do we really want this ?)
1670 i->op = OP_ADD;
1671 i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
1672 }
1673
1674 for (int s = 0; s < 3 && i->srcExists(s); ++s) {
1675 mi = i->getSrc(s)->getInsn();
1676 if (!mi ||
1677 mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
1678 continue;
1679 if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
1680 if ((i->op != OP_ADD &&
1681 i->op != OP_MUL) ||
1682 (mi->op != OP_ABS &&
1683 mi->op != OP_NEG))
1684 continue;
1685 } else
1686 if (i->sType != mi->dType) {
1687 continue;
1688 }
1689 if ((mod = Modifier(mi->op)) == Modifier(0))
1690 continue;
1691 mod *= mi->src(0).mod;
1692
1693 if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
1694 // abs neg [abs] = abs
1695 mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
1696 } else
1697 if ((i->op == OP_NEG) && mod.neg()) {
1698 assert(s == 0);
1699 // neg as both opcode and modifier on same insn is prohibited
1700 // neg neg abs = abs, neg neg = identity
1701 mod = mod & Modifier(~NV50_IR_MOD_NEG);
1702 i->op = mod.getOp();
1703 mod = mod & Modifier(~NV50_IR_MOD_ABS);
1704 if (mod == Modifier(0))
1705 i->op = OP_MOV;
1706 }
1707
1708 if (target->isModSupported(i, s, mod)) {
1709 i->setSrc(s, mi->getSrc(0));
1710 i->src(s).mod *= mod;
1711 }
1712 }
1713
1714 if (i->op == OP_SAT) {
1715 mi = i->getSrc(0)->getInsn();
1716 if (mi &&
1717 mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
1718 mi->saturate = 1;
1719 mi->setDef(0, i->getDef(0));
1720 delete_Instruction(prog, i);
1721 }
1722 }
1723 }
1724
1725 return true;
1726 }
1727
1728 // =============================================================================
1729
1730 // MUL + ADD -> MAD/FMA
1731 // MIN/MAX(a, a) -> a, etc.
1732 // SLCT(a, b, const) -> cc(const) ? a : b
1733 // RCP(RCP(a)) -> a
1734 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
1735 // EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
1736 class AlgebraicOpt : public Pass
1737 {
1738 private:
1739 virtual bool visit(BasicBlock *);
1740
1741 void handleABS(Instruction *);
1742 bool handleADD(Instruction *);
1743 bool tryADDToMADOrSAD(Instruction *, operation toOp);
1744 void handleMINMAX(Instruction *);
1745 void handleRCP(Instruction *);
1746 void handleSLCT(Instruction *);
1747 void handleLOGOP(Instruction *);
1748 void handleCVT_NEG(Instruction *);
1749 void handleCVT_CVT(Instruction *);
1750 void handleCVT_EXTBF(Instruction *);
1751 void handleSUCLAMP(Instruction *);
1752 void handleNEG(Instruction *);
1753 void handleEXTBF_RDSV(Instruction *);
1754
1755 BuildUtil bld;
1756 };
1757
1758 void
1759 AlgebraicOpt::handleABS(Instruction *abs)
1760 {
1761 Instruction *sub = abs->getSrc(0)->getInsn();
1762 DataType ty;
1763 if (!sub ||
1764 !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
1765 return;
1766 // expect not to have mods yet; if we do, bail
1767 if (sub->src(0).mod || sub->src(1).mod)
1768 return;
1769 // hidden conversion ?
1770 ty = intTypeToSigned(sub->dType);
1771 if (abs->dType != abs->sType || ty != abs->sType)
1772 return;
1773
1774 if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
1775 sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
1776 sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
1777 return;
1778
1779 Value *src0 = sub->getSrc(0);
1780 Value *src1 = sub->getSrc(1);
1781
1782 if (sub->op == OP_ADD) {
1783 Instruction *neg = sub->getSrc(1)->getInsn();
1784 if (neg && neg->op != OP_NEG) {
1785 neg = sub->getSrc(0)->getInsn();
1786 src0 = sub->getSrc(1);
1787 }
1788 if (!neg || neg->op != OP_NEG ||
1789 neg->dType != neg->sType || neg->sType != ty)
1790 return;
1791 src1 = neg->getSrc(0);
1792 }
1793
1794 // found ABS(SUB)
1795 abs->moveSources(1, 2); // move sources >=1 up by 2
1796 abs->op = OP_SAD;
1797 abs->setType(sub->dType);
1798 abs->setSrc(0, src0);
1799 abs->setSrc(1, src1);
1800 bld.setPosition(abs, false);
1801 abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
1802 }
1803
1804 bool
1805 AlgebraicOpt::handleADD(Instruction *add)
1806 {
1807 Value *src0 = add->getSrc(0);
1808 Value *src1 = add->getSrc(1);
1809
1810 if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
1811 return false;
1812
1813 bool changed = false;
1814 // we can't optimize to MAD if the add is precise
1815 if (!add->precise && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
1816 changed = tryADDToMADOrSAD(add, OP_MAD);
1817 if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
1818 changed = tryADDToMADOrSAD(add, OP_SAD);
1819 return changed;
1820 }
1821
1822 // ADD(SAD(a,b,0), c) -> SAD(a,b,c)
1823 // ADD(MUL(a,b), c) -> MAD(a,b,c)
1824 bool
1825 AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
1826 {
1827 Value *src0 = add->getSrc(0);
1828 Value *src1 = add->getSrc(1);
1829 Value *src;
1830 int s;
1831 const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
1832 const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
1833 Modifier mod[4];
1834
1835 if (src0->refCount() == 1 &&
1836 src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
1837 s = 0;
1838 else
1839 if (src1->refCount() == 1 &&
1840 src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
1841 s = 1;
1842 else
1843 return false;
1844
1845 src = add->getSrc(s);
1846
1847 if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb)
1848 return false;
1849
1850 if (src->getInsn()->saturate || src->getInsn()->postFactor ||
1851 src->getInsn()->dnz || src->getInsn()->precise)
1852 return false;
1853
1854 if (toOp == OP_SAD) {
1855 ImmediateValue imm;
1856 if (!src->getInsn()->src(2).getImmediate(imm))
1857 return false;
1858 if (!imm.isInteger(0))
1859 return false;
1860 }
1861
1862 if (typeSizeof(add->dType) != typeSizeof(src->getInsn()->dType) ||
1863 isFloatType(add->dType) != isFloatType(src->getInsn()->dType))
1864 return false;
1865
1866 mod[0] = add->src(0).mod;
1867 mod[1] = add->src(1).mod;
1868 mod[2] = src->getUniqueInsn()->src(0).mod;
1869 mod[3] = src->getUniqueInsn()->src(1).mod;
1870
1871 if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
1872 return false;
1873
1874 add->op = toOp;
1875 add->subOp = src->getInsn()->subOp; // potentially mul-high
1876 add->dnz = src->getInsn()->dnz;
1877 add->dType = src->getInsn()->dType; // sign matters for imad hi
1878 add->sType = src->getInsn()->sType;
1879
1880 add->setSrc(2, add->src(s ? 0 : 1));
1881
1882 add->setSrc(0, src->getInsn()->getSrc(0));
1883 add->src(0).mod = mod[2] ^ mod[s];
1884 add->setSrc(1, src->getInsn()->getSrc(1));
1885 add->src(1).mod = mod[3];
1886
1887 return true;
1888 }
1889
1890 void
1891 AlgebraicOpt::handleMINMAX(Instruction *minmax)
1892 {
1893 Value *src0 = minmax->getSrc(0);
1894 Value *src1 = minmax->getSrc(1);
1895
1896 if (src0 != src1 || src0->reg.file != FILE_GPR)
1897 return;
1898 if (minmax->src(0).mod == minmax->src(1).mod) {
1899 if (minmax->def(0).mayReplace(minmax->src(0))) {
1900 minmax->def(0).replace(minmax->src(0), false);
1901 minmax->bb->remove(minmax);
1902 } else {
1903 minmax->op = OP_CVT;
1904 minmax->setSrc(1, NULL);
1905 }
1906 } else {
1907 // TODO:
1908 // min(x, -x) = -abs(x)
1909 // min(x, -abs(x)) = -abs(x)
1910 // min(x, abs(x)) = x
1911 // max(x, -abs(x)) = x
1912 // max(x, abs(x)) = abs(x)
1913 // max(x, -x) = abs(x)
1914 }
1915 }
1916
1917 // rcp(rcp(a)) = a
1918 // rcp(sqrt(a)) = rsq(a)
1919 void
1920 AlgebraicOpt::handleRCP(Instruction *rcp)
1921 {
1922 Instruction *si = rcp->getSrc(0)->getUniqueInsn();
1923
1924 if (!si)
1925 return;
1926
1927 if (si->op == OP_RCP) {
1928 Modifier mod = rcp->src(0).mod * si->src(0).mod;
1929 rcp->op = mod.getOp();
1930 rcp->setSrc(0, si->getSrc(0));
1931 } else if (si->op == OP_SQRT) {
1932 rcp->op = OP_RSQ;
1933 rcp->setSrc(0, si->getSrc(0));
1934 rcp->src(0).mod = rcp->src(0).mod * si->src(0).mod;
1935 }
1936 }
1937
1938 void
1939 AlgebraicOpt::handleSLCT(Instruction *slct)
1940 {
1941 if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
1942 if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
1943 slct->setSrc(0, slct->getSrc(1));
1944 } else
1945 if (slct->getSrc(0) != slct->getSrc(1)) {
1946 return;
1947 }
1948 slct->op = OP_MOV;
1949 slct->setSrc(1, NULL);
1950 slct->setSrc(2, NULL);
1951 }
1952
1953 void
1954 AlgebraicOpt::handleLOGOP(Instruction *logop)
1955 {
1956 Value *src0 = logop->getSrc(0);
1957 Value *src1 = logop->getSrc(1);
1958
1959 if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
1960 return;
1961
1962 if (src0 == src1) {
1963 if ((logop->op == OP_AND || logop->op == OP_OR) &&
1964 logop->def(0).mayReplace(logop->src(0))) {
1965 logop->def(0).replace(logop->src(0), false);
1966 delete_Instruction(prog, logop);
1967 }
1968 } else {
1969 // try AND(SET, SET) -> SET_AND(SET)
1970 Instruction *set0 = src0->getInsn();
1971 Instruction *set1 = src1->getInsn();
1972
1973 if (!set0 || set0->fixed || !set1 || set1->fixed)
1974 return;
1975 if (set1->op != OP_SET) {
1976 Instruction *xchg = set0;
1977 set0 = set1;
1978 set1 = xchg;
1979 if (set1->op != OP_SET)
1980 return;
1981 }
1982 operation redOp = (logop->op == OP_AND ? OP_SET_AND :
1983 logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
1984 if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
1985 return;
1986 if (set0->op != OP_SET &&
1987 set0->op != OP_SET_AND &&
1988 set0->op != OP_SET_OR &&
1989 set0->op != OP_SET_XOR)
1990 return;
1991 if (set0->getDef(0)->refCount() > 1 &&
1992 set1->getDef(0)->refCount() > 1)
1993 return;
1994 if (set0->getPredicate() || set1->getPredicate())
1995 return;
1996 // check that they don't source each other
1997 for (int s = 0; s < 2; ++s)
1998 if (set0->getSrc(s) == set1->getDef(0) ||
1999 set1->getSrc(s) == set0->getDef(0))
2000 return;
2001
2002 set0 = cloneForward(func, set0);
2003 set1 = cloneShallow(func, set1);
2004 logop->bb->insertAfter(logop, set1);
2005 logop->bb->insertAfter(logop, set0);
2006
2007 set0->dType = TYPE_U8;
2008 set0->getDef(0)->reg.file = FILE_PREDICATE;
2009 set0->getDef(0)->reg.size = 1;
2010 set1->setSrc(2, set0->getDef(0));
2011 set1->op = redOp;
2012 set1->setDef(0, logop->getDef(0));
2013 delete_Instruction(prog, logop);
2014 }
2015 }
2016
2017 // F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
2018 // nv50:
2019 // F2I(NEG(I2F(ABS(SET))))
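// A minimal illustrative sketch (hypothetical nvc0-style chain):
//   %s = set f32 %x, %y     (writes 1.0f or 0.0f)
//   %n = neg f32 %s
//   %i = cvt s32 %n     ->  %i = set u32 %x, %y   (writes -1 or 0)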
2020 void
2021 AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
2022 {
2023 Instruction *insn = cvt->getSrc(0)->getInsn();
2024 if (cvt->sType != TYPE_F32 ||
2025 cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
2026 return;
2027 if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
2028 return;
2029 if (insn->src(0).mod != Modifier(0))
2030 return;
2031 insn = insn->getSrc(0)->getInsn();
2032
2033 // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
2034 if (insn && insn->op == OP_CVT &&
2035 insn->dType == TYPE_F32 &&
2036 insn->sType == TYPE_S32) {
2037 insn = insn->getSrc(0)->getInsn();
2038 if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
2039 insn->src(0).mod)
2040 return;
2041 insn = insn->getSrc(0)->getInsn();
2042 if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
2043 return;
2044 } else
2045 if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
2046 return;
2047 }
2048
2049 Instruction *bset = cloneShallow(func, insn);
2050 bset->dType = TYPE_U32;
2051 bset->setDef(0, cvt->getDef(0));
2052 cvt->bb->insertAfter(cvt, bset);
2053 delete_Instruction(prog, cvt);
2054 }
2055
2056 // F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
2057 // does a type conversion, this becomes trickier as there might be range
2058 // changes/etc. We could handle those in theory as long as the range was being
2059 // reduced or kept the same.
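// A minimal illustrative sketch (hypothetical f32 chain):
//   %t = trunc f32 %x
//   %i = cvt s32 %t   ->   %i = cvt s32 %x   with rnd = round-towards-zero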
2060 void
2061 AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
2062 {
2063 Instruction *insn = cvt->getSrc(0)->getInsn();
2064 RoundMode rnd = insn ? insn->rnd : ROUND_N; // insn may be NULL, e.g. for function inputs
2065
2066 if (!insn || insn->saturate ||
2067 insn->subOp ||
2068 insn->dType != insn->sType ||
2069 insn->dType != cvt->sType)
2070 return;
2071
2072 switch (insn->op) {
2073 case OP_CEIL:
2074 rnd = ROUND_PI;
2075 break;
2076 case OP_FLOOR:
2077 rnd = ROUND_MI;
2078 break;
2079 case OP_TRUNC:
2080 rnd = ROUND_ZI;
2081 break;
2082 case OP_CVT:
2083 break;
2084 default:
2085 return;
2086 }
2087
2088 if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
2089 rnd = (RoundMode)(rnd & 3);
2090
2091 cvt->rnd = rnd;
2092 cvt->setSrc(0, insn->getSrc(0));
2093 cvt->src(0).mod *= insn->src(0).mod;
2094 cvt->sType = insn->sType;
2095 }
2096
2097 // Some shaders extract packed bytes out of words and convert them to
2098 // e.g. float. The Fermi+ CVT instruction can extract those directly, as can
2099 // nv50 for word sizes.
2100 //
2101 // CVT(EXTBF(x, byte/word))
2102 // CVT(AND(bytemask, x))
2103 // CVT(AND(bytemask, SHR(x, 8/16/24)))
2104 // CVT(SHR(x, 16/24))
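// A minimal illustrative sketch (hypothetical snippet, extracting byte 1):
//   %s = shr u32 %x, 8
//   %m = and u32 %s, 0xff
//   %f = cvt f32 %m     ->  %f = cvt f32 %x   (sType TYPE_U8, subOp 1)
// i.e. the AND/SHR pair collapses into the CVT's built-in byte select.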
2105 void
2106 AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
2107 {
2108 Instruction *insn = cvt->getSrc(0)->getInsn();
2109 ImmediateValue imm;
2110 Value *arg = NULL;
2111 unsigned width, offset;
2112 if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
2113 return;
2114 if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
2115 width = (imm.reg.data.u32 >> 8) & 0xff;
2116 offset = imm.reg.data.u32 & 0xff;
2117 arg = insn->getSrc(0);
2118
2119 if (width != 8 && width != 16)
2120 return;
2121 if (width == 8 && offset & 0x7)
2122 return;
2123 if (width == 16 && offset & 0xf)
2124 return;
2125 } else if (insn->op == OP_AND) {
2126 int s;
2127 if (insn->src(0).getImmediate(imm))
2128 s = 0;
2129 else if (insn->src(1).getImmediate(imm))
2130 s = 1;
2131 else
2132 return;
2133
2134 if (imm.reg.data.u32 == 0xff)
2135 width = 8;
2136 else if (imm.reg.data.u32 == 0xffff)
2137 width = 16;
2138 else
2139 return;
2140
2141 arg = insn->getSrc(!s);
2142 Instruction *shift = arg->getInsn();
2143 offset = 0;
2144 if (shift && shift->op == OP_SHR &&
2145 shift->sType == cvt->sType &&
2146 shift->src(1).getImmediate(imm) &&
2147 ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
2148 (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
2149 arg = shift->getSrc(0);
2150 offset = imm.reg.data.u32;
2151 }
2152 // We just AND'd the high bits away, which means this is effectively an
2153 // unsigned value.
2154 cvt->sType = TYPE_U32;
2155 } else if (insn->op == OP_SHR &&
2156 insn->sType == cvt->sType &&
2157 insn->src(1).getImmediate(imm)) {
2158 arg = insn->getSrc(0);
2159 if (imm.reg.data.u32 == 24) {
2160 width = 8;
2161 offset = 24;
2162 } else if (imm.reg.data.u32 == 16) {
2163 width = 16;
2164 offset = 16;
2165 } else {
2166 return;
2167 }
2168 }
2169
2170 if (!arg)
2171 return;
2172
2173 // Irrespective of what came earlier, we can undo a shift on the argument
2174 // by adjusting the offset.
2175 Instruction *shift = arg->getInsn();
2176 if (shift && shift->op == OP_SHL &&
2177 shift->src(1).getImmediate(imm) &&
2178 ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
2179 (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
2180 imm.reg.data.u32 <= offset) {
2181 arg = shift->getSrc(0);
2182 offset -= imm.reg.data.u32;
2183 }
2184
2185 // The unpackSnorm lowering still leaves a few shifts behind, but it's too
2186 // annoying to detect them.
2187
2188 if (width == 8) {
2189 cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
2190 } else {
2191 assert(width == 16);
2192 cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
2193 }
2194 cvt->setSrc(0, arg);
2195 cvt->subOp = offset >> 3;
2196 }
2197
2198 // SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
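// A minimal illustrative sketch (hypothetical, %k being the clamp bound):
//   %t = add u32 %b, 3
//   suclamp %d, %t, %k, 0   ->   suclamp %d, %b, %k, 3
// valid only while the folded immediate stays in the signed 6-bit range
// [-32, 31] checked below.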
2199 void
2200 AlgebraicOpt::handleSUCLAMP(Instruction *insn)
2201 {
2202 ImmediateValue imm;
2203 int32_t val = insn->getSrc(2)->asImm()->reg.data.s32;
2204 int s;
2205 Instruction *add;
2206
2207 assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR);
2208
2209 // look for ADD (TODO: only count references by non-SUCLAMP)
2210 if (insn->getSrc(0)->refCount() > 1)
2211 return;
2212 add = insn->getSrc(0)->getInsn();
2213 if (!add || add->op != OP_ADD ||
2214 (add->dType != TYPE_U32 &&
2215 add->dType != TYPE_S32))
2216 return;
2217
2218 // look for immediate
2219 for (s = 0; s < 2; ++s)
2220 if (add->src(s).getImmediate(imm))
2221 break;
2222 if (s >= 2)
2223 return;
2224 s = s ? 0 : 1;
2225 // determine if immediate fits
2226 val += imm.reg.data.s32;
2227 if (val > 31 || val < -32)
2228 return;
2229 // determine if other addend fits
2230 if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0))
2231 return;
2232
2233 bld.setPosition(insn, false); // make sure bld is init'ed
2234 // replace sources
2235 insn->setSrc(2, bld.mkImm(val));
2236 insn->setSrc(0, add->getSrc(s));
2237 }
2238
2239 // NEG(AND(SET, 1)) -> SET
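// A minimal illustrative sketch (hypothetical integer chain):
//   %s = set u32 %a, %b     (writes 0xffffffff or 0)
//   %m = and u32 %s, 1      (writes 1 or 0)
//   %n = neg s32 %m         (writes -1 or 0, i.e. the value of %s again)
// -> all uses of %n are redirected to read %s directly.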
2240 void
2241 AlgebraicOpt::handleNEG(Instruction *i) {
2242 Instruction *src = i->getSrc(0)->getInsn();
2243 ImmediateValue imm;
2244 int b;
2245
2246 if (isFloatType(i->sType) || !src || src->op != OP_AND)
2247 return;
2248
2249 if (src->src(0).getImmediate(imm))
2250 b = 1;
2251 else if (src->src(1).getImmediate(imm))
2252 b = 0;
2253 else
2254 return;
2255
2256 if (!imm.isInteger(1))
2257 return;
2258
2259 Instruction *set = src->getSrc(b)->getInsn();
2260 if (set && (set->op == OP_SET || set->op == OP_SET_AND ||
2261 set->op == OP_SET_OR || set->op == OP_SET_XOR) &&
2262 !isFloatType(set->dType)) {
2263 i->def(0).replace(set->getDef(0), false);
2264 }
2265 }
2266
2267 // EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
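// The EXTBF second-source immediate encodes (width << 8) | offset, so the
// three magic values below select the COMBINED_TID bitfields:
//   0x1000 -> bits  0..15 (TID.X), 0x0a10 -> bits 16..25 (TID.Y),
//   0x061a -> bits 26..31 (TID.Z).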
2268 void
2269 AlgebraicOpt::handleEXTBF_RDSV(Instruction *i)
2270 {
2271 Instruction *rdsv = i->getSrc(0)->getUniqueInsn();
2272 if (!rdsv || rdsv->op != OP_RDSV ||
2273 rdsv->getSrc(0)->asSym()->reg.data.sv.sv != SV_COMBINED_TID)
2274 return;
2275 // Avoid creating more RDSV instructions
2276 if (rdsv->getDef(0)->refCount() > 1)
2277 return;
2278
2279 ImmediateValue imm;
2280 if (!i->src(1).getImmediate(imm))
2281 return;
2282
2283 int index;
2284 if (imm.isInteger(0x1000))
2285 index = 0;
2286 else
2287 if (imm.isInteger(0x0a10))
2288 index = 1;
2289 else
2290 if (imm.isInteger(0x061a))
2291 index = 2;
2292 else
2293 return;
2294
2295 bld.setPosition(i, false);
2296
2297 i->op = OP_RDSV;
2298 i->setSrc(0, bld.mkSysVal(SV_TID, index));
2299 i->setSrc(1, NULL);
2300 }
2301
2302 bool
2303 AlgebraicOpt::visit(BasicBlock *bb)
2304 {
2305 Instruction *next;
2306 for (Instruction *i = bb->getEntry(); i; i = next) {
2307 next = i->next;
2308 switch (i->op) {
2309 case OP_ABS:
2310 handleABS(i);
2311 break;
2312 case OP_ADD:
2313 handleADD(i);
2314 break;
2315 case OP_RCP:
2316 handleRCP(i);
2317 break;
2318 case OP_MIN:
2319 case OP_MAX:
2320 handleMINMAX(i);
2321 break;
2322 case OP_SLCT:
2323 handleSLCT(i);
2324 break;
2325 case OP_AND:
2326 case OP_OR:
2327 case OP_XOR:
2328 handleLOGOP(i);
2329 break;
2330 case OP_CVT:
2331 handleCVT_NEG(i);
2332 handleCVT_CVT(i);
2333 if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
2334 handleCVT_EXTBF(i);
2335 break;
2336 case OP_SUCLAMP:
2337 handleSUCLAMP(i);
2338 break;
2339 case OP_NEG:
2340 handleNEG(i);
2341 break;
2342 case OP_EXTBF:
2343 handleEXTBF_RDSV(i);
2344 break;
2345 default:
2346 break;
2347 }
2348 }
2349
2350 return true;
2351 }
2352
2353 // =============================================================================
2354
2355 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
2356 // MUL(a, b) -> a few XMADs
2357 // MAD/FMA(a, b, c) -> a few XMADs
2358 class LateAlgebraicOpt : public Pass
2359 {
2360 private:
2361 virtual bool visit(Instruction *);
2362
2363 void handleADD(Instruction *);
2364 void handleMULMAD(Instruction *);
2365 bool tryADDToSHLADD(Instruction *);
2366
2367 BuildUtil bld;
2368 };
2369
2370 void
2371 LateAlgebraicOpt::handleADD(Instruction *add)
2372 {
2373 Value *src0 = add->getSrc(0);
2374 Value *src1 = add->getSrc(1);
2375
2376 if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
2377 return;
2378
2379 if (prog->getTarget()->isOpSupported(OP_SHLADD, add->dType))
2380 tryADDToSHLADD(add);
2381 }
2382
2383 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
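// A minimal illustrative sketch (hypothetical snippet; the shift amount
// must be an immediate):
//   %t = shl u32 %a, 4
//   %d = add u32 %t, %c   ->   %d = shladd u32 %a, 4, %c   // (a << 4) + c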
2384 bool
2385 LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
2386 {
2387 Value *src0 = add->getSrc(0);
2388 Value *src1 = add->getSrc(1);
2389 ImmediateValue imm;
2390 Instruction *shl;
2391 Value *src;
2392 int s;
2393
2394 if (add->saturate || add->usesFlags() || typeSizeof(add->dType) == 8
2395 || isFloatType(add->dType))
2396 return false;
2397
2398 if (src0->getUniqueInsn() && src0->getUniqueInsn()->op == OP_SHL)
2399 s = 0;
2400 else
2401 if (src1->getUniqueInsn() && src1->getUniqueInsn()->op == OP_SHL)
2402 s = 1;
2403 else
2404 return false;
2405
2406 src = add->getSrc(s);
2407 shl = src->getUniqueInsn();
2408
2409 if (shl->bb != add->bb || shl->usesFlags() || shl->subOp || shl->src(0).mod)
2410 return false;
2411
2412 if (!shl->src(1).getImmediate(imm))
2413 return false;
2414
2415 add->op = OP_SHLADD;
2416 add->setSrc(2, add->src(!s));
2417 // SHL can't have any modifiers, but the ADD source may have had
2418 // one. Preserve it.
2419 add->setSrc(0, shl->getSrc(0));
2420 if (s == 1)
2421 add->src(0).mod = add->src(1).mod;
2422 add->setSrc(1, new_ImmediateValue(shl->bb->getProgram(), imm.reg.data.u32));
2423 add->src(1).mod = Modifier(0);
2424
2425 return true;
2426 }
2427
2428 // MUL(a, b) -> a few XMADs
2429 // MAD/FMA(a, b, c) -> a few XMADs
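// The rewrite below relies on the identity (with .lo/.hi the 16-bit halves
// of 32-bit values, everything mod 2^32):
//   a * b + c = (a.lo * b.lo + c) + ((a.hi * b.lo + a.lo * b.hi) << 16)
// The three XMADs (16x16-bit multiply-add units) emitted below compute
// exactly this, using the MRG/PSL/CBCC subOps for the 16-bit shuffling.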
2430 void
2431 LateAlgebraicOpt::handleMULMAD(Instruction *i)
2432 {
2433 // TODO: handle NV50_IR_SUBOP_MUL_HIGH
2434 if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32))
2435 return;
2436 if (isFloatType(i->dType) || typeSizeof(i->dType) != 4)
2437 return;
2438 if (i->subOp || i->usesFlags() || i->flagsDef >= 0)
2439 return;
2440
2441 assert(!i->src(0).mod);
2442 assert(!i->src(1).mod);
2443 assert(i->op == OP_MUL ? 1 : !i->src(2).mod);
2444
2445 bld.setPosition(i, false);
2446
2447 Value *a = i->getSrc(0);
2448 Value *b = i->getSrc(1);
2449 Value *c = i->op == OP_MUL ? bld.mkImm(0) : i->getSrc(2);
2450
2451 Value *tmp0 = bld.getSSA();
2452 Value *tmp1 = bld.getSSA();
2453
2454 Instruction *insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp0, b, a, c);
2455 insn->setPredicate(i->cc, i->getPredicate());
2456
2457 insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp1, b, a, bld.mkImm(0));
2458 insn->setPredicate(i->cc, i->getPredicate());
2459 insn->subOp = NV50_IR_SUBOP_XMAD_MRG | NV50_IR_SUBOP_XMAD_H1(1);
2460
2461 Value *pred = i->getPredicate();
2462 i->setPredicate(i->cc, NULL);
2463
2464 i->op = OP_XMAD;
2465 i->setSrc(0, b);
2466 i->setSrc(1, tmp1);
2467 i->setSrc(2, tmp0);
2468 i->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC;
2469 i->subOp |= NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);
2470
2471 i->setPredicate(i->cc, pred);
2472 }
2473
2474 bool
2475 LateAlgebraicOpt::visit(Instruction *i)
2476 {
2477 switch (i->op) {
2478 case OP_ADD:
2479 handleADD(i);
2480 break;
2481 case OP_MUL:
2482 case OP_MAD:
2483 case OP_FMA:
2484 handleMULMAD(i);
2485 break;
2486 default:
2487 break;
2488 }
2489
2490 return true;
2491 }
2492
2493 // =============================================================================
2494
2495 // Split 64-bit MUL and MAD
2496 class Split64BitOpPreRA : public Pass
2497 {
2498 private:
2499 virtual bool visit(BasicBlock *);
2500 void split64MulMad(Function *, Instruction *, DataType);
2501
2502 BuildUtil bld;
2503 };
2504
2505 bool
2506 Split64BitOpPreRA::visit(BasicBlock *bb)
2507 {
2508 Instruction *i, *next;
2509 Modifier mod;
2510
2511 for (i = bb->getEntry(); i; i = next) {
2512 next = i->next;
2513
2514 DataType hTy;
2515 switch (i->dType) {
2516 case TYPE_U64: hTy = TYPE_U32; break;
2517 case TYPE_S64: hTy = TYPE_S32; break;
2518 default:
2519 continue;
2520 }
2521
2522 if (i->op == OP_MAD || i->op == OP_MUL)
2523 split64MulMad(func, i, hTy);
2524 }
2525
2526 return true;
2527 }
2528
2529 void
2530 Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy)
2531 {
2532 assert(i->op == OP_MAD || i->op == OP_MUL);
2533 assert(!isFloatType(i->dType) && !isFloatType(i->sType));
2534 assert(typeSizeof(hTy) == 4);
2535
2536 bld.setPosition(i, true);
2537
2538 Value *zero = bld.mkImm(0u);
2539 Value *carry = bld.getSSA(1, FILE_FLAGS);
2540
2541 // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
2542 // values (a, b and c might be 32-bit values), using 32-bit operations. This
2543 // gives the following operations:
2544 // * `d.low = low(a.low * b.low) (+ c.low)?`
2545 // * `d.high = low(a.high * b.low) + low(a.low * b.high)
2546 // + high(a.low * b.low) (+ c.high)?`
2547 //
2548 // To compute the high bits, we can split in the following operations:
2549 // * `tmp1 = low(a.high * b.low) (+ c.high)?`
2550 // * `tmp2 = low(a.low * b.high) + tmp1`
2551 // * `d.high = high(a.low * b.low) + tmp2`
2552 //
2553 // mkSplit puts the lower bits at index 0 and the higher bits at index 1
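//
// Sketch of why this is exact mod 2^64: writing a = a.high * 2^32 + a.low
// (likewise b),
//   a * b = a.low * b.low + (a.high * b.low + a.low * b.high) * 2^32
// since the a.high * b.high term lies entirely beyond bit 63; this is
// exactly what the operations listed above compute.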
2554
2555 Value *op1[2];
2556 if (i->getSrc(0)->reg.size == 8)
2557 bld.mkSplit(op1, 4, i->getSrc(0));
2558 else {
2559 op1[0] = i->getSrc(0);
2560 op1[1] = zero;
2561 }
2562 Value *op2[2];
2563 if (i->getSrc(1)->reg.size == 8)
2564 bld.mkSplit(op2, 4, i->getSrc(1));
2565 else {
2566 op2[0] = i->getSrc(1);
2567 op2[1] = zero;
2568 }
2569
2570 Value *op3[2] = { NULL, NULL };
2571 if (i->op == OP_MAD) {
2572 if (i->getSrc(2)->reg.size == 8)
2573 bld.mkSplit(op3, 4, i->getSrc(2));
2574 else {
2575 op3[0] = i->getSrc(2);
2576 op3[1] = zero;
2577 }
2578 }
2579
2580 Value *tmpRes1Hi = bld.getSSA();
2581 if (i->op == OP_MAD)
2582 bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]);
2583 else
2584 bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]);
2585
2586 Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], tmpRes1Hi);
2587
2588 Value *def[2] = { bld.getSSA(), bld.getSSA() };
2589
2590 // If it was a MAD, add the carry from the low bits.
2591 // It is not needed for a MUL, since high(a.low * b.low) was already added
2592 // to d.high.
2593 if (i->op == OP_MAD)
2594 bld.mkOp3(OP_MAD, hTy, def[0], op1[0], op2[0], op3[0])->setFlagsDef(1, carry);
2595 else
2596 bld.mkOp2(OP_MUL, hTy, def[0], op1[0], op2[0]);
2597
2598 Instruction *hiPart3 = bld.mkOp3(OP_MAD, hTy, def[1], op1[0], op2[0], tmpRes2Hi);
2599 hiPart3->subOp = NV50_IR_SUBOP_MUL_HIGH;
2600 if (i->op == OP_MAD)
2601 hiPart3->setFlagsSrc(3, carry);
2602
2603 bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), def[0], def[1]);
2604
2605 delete_Instruction(fn->getProgram(), i);
2606 }
2607
2608 // =============================================================================
2609
2610 static inline void
2611 updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
2612 {
2613 if (offset != ldst->getSrc(0)->reg.data.offset) {
2614 if (ldst->getSrc(0)->refCount() > 1)
2615 ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
2616 ldst->getSrc(0)->reg.data.offset = offset;
2617 }
2618 }
2619
2620 // Combine loads and stores, forward stores to loads where possible.
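// Illustrative examples (hypothetical l[] accesses, no indirect addressing):
//   st u32 l[0x10], %a ; ld u32 %x, l[0x10]  ->  uses of %x read %a instead
//   ld u32 %x, l[0x10] ; ld u32 %y, l[0x14]  ->  one 64-bit load (combine)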
2621 class MemoryOpt : public Pass
2622 {
2623 private:
2624 class Record
2625 {
2626 public:
2627 Record *next;
2628 Instruction *insn;
2629 const Value *rel[2];
2630 const Value *base;
2631 int32_t offset;
2632 int8_t fileIndex;
2633 uint8_t size;
2634 bool locked;
2635 Record *prev;
2636
2637 bool overlaps(const Instruction *ldst) const;
2638
2639 inline void link(Record **);
2640 inline void unlink(Record **);
2641 inline void set(const Instruction *ldst);
2642 };
2643
2644 public:
2645 MemoryOpt();
2646
2647 Record *loads[DATA_FILE_COUNT];
2648 Record *stores[DATA_FILE_COUNT];
2649
2650 MemoryPool recordPool;
2651
2652 private:
2653 virtual bool visit(BasicBlock *);
2654 bool runOpt(BasicBlock *);
2655
2656 Record **getList(const Instruction *);
2657
2658 Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;
2659
2660 // merge @insn into load/store instruction from @rec
2661 bool combineLd(Record *rec, Instruction *ld);
2662 bool combineSt(Record *rec, Instruction *st);
2663
2664 bool replaceLdFromLd(Instruction *ld, Record *ldRec);
2665 bool replaceLdFromSt(Instruction *ld, Record *stRec);
2666 bool replaceStFromSt(Instruction *restrict st, Record *stRec);
2667
2668 void addRecord(Instruction *ldst);
2669 void purgeRecords(Instruction *const st, DataFile);
2670 void lockStores(Instruction *const ld);
2671 void reset();
2672
2673 private:
2674 Record *prevRecord;
2675 };
2676
2677 MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
2678 {
2679 for (int i = 0; i < DATA_FILE_COUNT; ++i) {
2680 loads[i] = NULL;
2681 stores[i] = NULL;
2682 }
2683 prevRecord = NULL;
2684 }
2685
2686 void
2687 MemoryOpt::reset()
2688 {
2689 for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
2690 Record *it, *next;
2691 for (it = loads[i]; it; it = next) {
2692 next = it->next;
2693 recordPool.release(it);
2694 }
2695 loads[i] = NULL;
2696 for (it = stores[i]; it; it = next) {
2697 next = it->next;
2698 recordPool.release(it);
2699 }
2700 stores[i] = NULL;
2701 }
2702 }
2703
2704 bool
2705 MemoryOpt::combineLd(Record *rec, Instruction *ld)
2706 {
2707 int32_t offRc = rec->offset;
2708 int32_t offLd = ld->getSrc(0)->reg.data.offset;
2709 int sizeRc = rec->size;
2710 int sizeLd = typeSizeof(ld->dType);
2711 int size = sizeRc + sizeLd;
2712 int d, j;
2713
2714 if (!prog->getTarget()->
2715 isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
2716 return false;
2717 // no unaligned loads
2718 if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
2719 ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
2720 return false;
2721 // for compute, indirect loads are not guaranteed to be aligned
2722 if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
2723 return false;
2724
2725 assert(sizeRc + sizeLd <= 16 && offRc != offLd);
2726
2727 // lock any stores that overlap with the load being merged into the
2728 // existing record.
2729 lockStores(ld);
2730
2731 for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);
2732
2733 if (offLd < offRc) {
2734 int sz;
2735 for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
2736 // d: nr of definitions in ld
2737 // j: nr of definitions in rec->insn, move:
2738 for (d = d + j - 1; j > 0; --j, --d)
2739 rec->insn->setDef(d, rec->insn->getDef(j - 1));
2740
2741 if (rec->insn->getSrc(0)->refCount() > 1)
2742 rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
2743 rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;
2744
2745 d = 0;
2746 } else {
2747 d = j;
2748 }
2749 // move definitions of @ld to @rec->insn
2750 for (j = 0; sizeLd; ++j, ++d) {
2751 sizeLd -= ld->getDef(j)->reg.size;
2752 rec->insn->setDef(d, ld->getDef(j));
2753 }
2754
2755 rec->size = size;
2756 rec->insn->getSrc(0)->reg.size = size;
2757 rec->insn->setType(typeOfSize(size));
2758
2759 delete_Instruction(prog, ld);
2760
2761 return true;
2762 }
2763
2764 bool
2765 MemoryOpt::combineSt(Record *rec, Instruction *st)
2766 {
2767 int32_t offRc = rec->offset;
2768 int32_t offSt = st->getSrc(0)->reg.data.offset;
2769 int sizeRc = rec->size;
2770 int sizeSt = typeSizeof(st->dType);
2771 int s = sizeSt / 4;
2772 int size = sizeRc + sizeSt;
2773 int j, k;
2774 Value *src[4]; // no modifiers in ValueRef allowed for st
2775 Value *extra[3];
2776
2777 if (!prog->getTarget()->
2778 isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
2779 return false;
2780 // no unaligned stores
2781 if (size == 8 && MIN2(offRc, offSt) & 0x7)
2782 return false;
2783 // for compute indirect stores are not guaranteed to be aligned
2784 if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
2785 return false;
2786
2787 // remove any existing load/store records for the store being merged into
2788 // the existing record.
2789 purgeRecords(st, DATA_FILE_COUNT);
2790
2791 st->takeExtraSources(0, extra); // save predicate and indirect address
2792
2793 if (offRc < offSt) {
2794 // save values from @st
2795 for (s = 0; sizeSt; ++s) {
2796 sizeSt -= st->getSrc(s + 1)->reg.size;
2797 src[s] = st->getSrc(s + 1);
2798 }
2799 // set record's values as low sources of @st
2800 for (j = 1; sizeRc; ++j) {
2801 sizeRc -= rec->insn->getSrc(j)->reg.size;
2802 st->setSrc(j, rec->insn->getSrc(j));
2803 }
2804 // set saved values as high sources of @st
2805 for (k = j, j = 0; j < s; ++j)
2806 st->setSrc(k++, src[j]);
2807
2808 updateLdStOffset(st, offRc, func);
2809 } else {
2810 for (j = 1; sizeSt; ++j)
2811 sizeSt -= st->getSrc(j)->reg.size;
2812 for (s = 1; sizeRc; ++j, ++s) {
2813 sizeRc -= rec->insn->getSrc(s)->reg.size;
2814 st->setSrc(j, rec->insn->getSrc(s));
2815 }
2816 rec->offset = offSt;
2817 }
2818 st->putExtraSources(0, extra); // restore pointer and predicate
2819
2820 delete_Instruction(prog, rec->insn);
2821 rec->insn = st;
2822 rec->size = size;
2823 rec->insn->getSrc(0)->reg.size = size;
2824 rec->insn->setType(typeOfSize(size));
2825 return true;
2826 }
2827
2828 void
2829 MemoryOpt::Record::set(const Instruction *ldst)
2830 {
2831 const Symbol *mem = ldst->getSrc(0)->asSym();
2832 fileIndex = mem->reg.fileIndex;
2833 rel[0] = ldst->getIndirect(0, 0);
2834 rel[1] = ldst->getIndirect(0, 1);
2835 offset = mem->reg.data.offset;
2836 base = mem->getBase();
2837 size = typeSizeof(ldst->sType);
2838 }
2839
2840 void
2841 MemoryOpt::Record::link(Record **list)
2842 {
2843 next = *list;
2844 if (next)
2845 next->prev = this;
2846 prev = NULL;
2847 *list = this;
2848 }
2849
2850 void
2851 MemoryOpt::Record::unlink(Record **list)
2852 {
2853 if (next)
2854 next->prev = prev;
2855 if (prev)
2856 prev->next = next;
2857 else
2858 *list = next;
2859 }
2860
2861 MemoryOpt::Record **
2862 MemoryOpt::getList(const Instruction *insn)
2863 {
2864 if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
2865 return &loads[insn->src(0).getFile()];
2866 return &stores[insn->src(0).getFile()];
2867 }
2868
2869 void
2870 MemoryOpt::addRecord(Instruction *i)
2871 {
2872 Record **list = getList(i);
2873 Record *it = reinterpret_cast<Record *>(recordPool.allocate());
2874
2875 it->link(list);
2876 it->set(i);
2877 it->insn = i;
2878 it->locked = false;
2879 }
2880
2881 MemoryOpt::Record *
2882 MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
2883 {
2884 const Symbol *sym = insn->getSrc(0)->asSym();
2885 const int size = typeSizeof(insn->sType);
2886 Record *rec = NULL;
2887 Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];
2888
2889 for (; it; it = it->next) {
2890 if (it->locked && insn->op != OP_LOAD && insn->op != OP_VFETCH)
2891 continue;
2892 if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
2893 it->rel[0] != insn->getIndirect(0, 0) ||
2894 it->fileIndex != sym->reg.fileIndex ||
2895 it->rel[1] != insn->getIndirect(0, 1))
2896 continue;
2897
2898 if (it->offset < sym->reg.data.offset) {
2899 if (it->offset + it->size >= sym->reg.data.offset) {
2900 isAdj = (it->offset + it->size == sym->reg.data.offset);
2901 if (!isAdj)
2902 return it;
2903 if (!(it->offset & 0x7))
2904 rec = it;
2905 }
2906 } else {
2907 isAdj = it->offset != sym->reg.data.offset;
2908 if (size <= it->size && !isAdj)
2909 return it;
2910 else
2911 if (!(sym->reg.data.offset & 0x7))
2912 if (it->offset - size <= sym->reg.data.offset)
2913 rec = it;
2914 }
2915 }
2916 return rec;
2917 }
2918
2919 bool
2920 MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
2921 {
2922 Instruction *st = rec->insn;
2923 int32_t offSt = rec->offset;
2924 int32_t offLd = ld->getSrc(0)->reg.data.offset;
2925 int d, s;
2926
2927 for (s = 1; offSt != offLd && st->srcExists(s); ++s)
2928 offSt += st->getSrc(s)->reg.size;
2929 if (offSt != offLd)
2930 return false;
2931
2932 for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
2933 if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
2934 return false;
2935 if (st->getSrc(s)->reg.file != FILE_GPR)
2936 return false;
2937 ld->def(d).replace(st->src(s), false);
2938 }
2939 ld->bb->remove(ld);
2940 return true;
2941 }
2942
2943 bool
2944 MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
2945 {
2946 Instruction *ldR = rec->insn;
2947 int32_t offR = rec->offset;
2948 int32_t offE = ldE->getSrc(0)->reg.data.offset;
2949 int dR, dE;
2950
2951 assert(offR <= offE);
2952 for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
2953 offR += ldR->getDef(dR)->reg.size;
2954 if (offR != offE)
2955 return false;
2956
2957 for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
2958 if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
2959 return false;
2960 ldE->def(dE).replace(ldR->getDef(dR), false);
2961 }
2962
2963 delete_Instruction(prog, ldE);
2964 return true;
2965 }
2966
2967 bool
2968 MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
2969 {
2970 const Instruction *const ri = rec->insn;
2971 Value *extra[3];
2972
2973 int32_t offS = st->getSrc(0)->reg.data.offset;
2974 int32_t offR = rec->offset;
2975 int32_t endS = offS + typeSizeof(st->dType);
2976 int32_t endR = offR + typeSizeof(ri->dType);
2977
2978 rec->size = MAX2(endS, endR) - MIN2(offS, offR);
2979
2980 st->takeExtraSources(0, extra);
2981
2982 if (offR < offS) {
2983 Value *vals[10];
2984 int s, n;
2985 int k = 0;
2986 // get non-replaced sources of ri
2987 for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
2988 vals[k++] = ri->getSrc(s);
2989 n = s;
2990 // get replaced sources of st
2991 for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
2992 vals[k++] = st->getSrc(s);
2993 // skip replaced sources of ri
2994 for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
2995 // get non-replaced sources after values covered by st
2996 for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
2997 vals[k++] = ri->getSrc(s);
2998 assert((unsigned int)k <= ARRAY_SIZE(vals));
2999 for (s = 0; s < k; ++s)
3000 st->setSrc(s + 1, vals[s]);
3001 st->setSrc(0, ri->getSrc(0));
3002 } else
3003 if (endR > endS) {
3004 int j, s;
3005 for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
3006 for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
3007 for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
3008 st->setSrc(s++, ri->getSrc(j));
3009 }
3010 st->putExtraSources(0, extra);
3011
3012 delete_Instruction(prog, rec->insn);
3013
3014 rec->insn = st;
3015 rec->offset = st->getSrc(0)->reg.data.offset;
3016
3017 st->setType(typeOfSize(rec->size));
3018
3019 return true;
3020 }
3021
3022 bool
3023 MemoryOpt::Record::overlaps(const Instruction *ldst) const
3024 {
3025 Record that;
3026 that.set(ldst);
3027
3028 // This assumes that images/buffers can't overlap. They can.
3029 // TODO: Plumb the restrict logic through, and only skip when it's a
3030 // restrict situation, or there can implicitly be no writes.
3031 if (this->fileIndex != that.fileIndex && this->rel[1] == that.rel[1])
3032 return false;
3033
3034 if (this->rel[0] || that.rel[0])
3035 return this->base == that.base;
3036
3037 return
3038 (this->offset < that.offset + that.size) &&
3039 (this->offset + this->size > that.offset);
3040 }
3041
3042 // Once @ld has read a location, stores that affect its result must not be
3043 // eliminated when we find later stores to the same location, nor may they
3044 // be merged with later stores.
3045 // The stored value can, however, still be used to determine the value
3046 // returned by future loads.
3047 void
3048 MemoryOpt::lockStores(Instruction *const ld)
3049 {
3050 for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
3051 if (!r->locked && r->overlaps(ld))
3052 r->locked = true;
3053 }
3054
3055 // Prior loads from the location of @st are no longer valid.
3056 // Stores to the location of @st may no longer be used to derive
3057 // the value at it nor be coalesced into later stores.
3058 void
3059 MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
3060 {
3061 if (st)
3062 f = st->src(0).getFile();
3063
3064 for (Record *r = loads[f]; r; r = r->next)
3065 if (!st || r->overlaps(st))
3066 r->unlink(&loads[f]);
3067
3068 for (Record *r = stores[f]; r; r = r->next)
3069 if (!st || r->overlaps(st))
3070 r->unlink(&stores[f]);
3071 }
3072
3073 bool
3074 MemoryOpt::visit(BasicBlock *bb)
3075 {
3076 bool ret = runOpt(bb);
3077 // Run again; a single pass won't combine four 32-bit ld/st into one 128-bit
3078 // ld/st where 96-bit memory operations are forbidden.
3079 if (ret)
3080 ret = runOpt(bb);
3081 return ret;
3082 }
3083
3084 bool
3085 MemoryOpt::runOpt(BasicBlock *bb)
3086 {
3087 Instruction *ldst, *next;
3088 Record *rec;
3089 bool isAdjacent = true;
3090
3091 for (ldst = bb->getEntry(); ldst; ldst = next) {
3092 bool keep = true;
3093 bool isLoad = true;
3094 next = ldst->next;
3095
3096 if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
3097 if (ldst->isDead()) {
3098 // might have been produced by earlier optimization
3099 delete_Instruction(prog, ldst);
3100 continue;
3101 }
3102 } else
3103 if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
3104 if (typeSizeof(ldst->dType) == 4 &&
3105 ldst->src(1).getFile() == FILE_GPR &&
3106 ldst->getSrc(1)->getInsn()->op == OP_NOP) {
3107 delete_Instruction(prog, ldst);
3108 continue;
3109 }
3110 isLoad = false;
3111 } else {
3112 // TODO: maybe have all fixed ops act as barrier ?
3113 if (ldst->op == OP_CALL ||
3114 ldst->op == OP_BAR ||
3115 ldst->op == OP_MEMBAR) {
3116 purgeRecords(NULL, FILE_MEMORY_LOCAL);
3117 purgeRecords(NULL, FILE_MEMORY_GLOBAL);
3118 purgeRecords(NULL, FILE_MEMORY_SHARED);
3119 purgeRecords(NULL, FILE_SHADER_OUTPUT);
3120 } else
3121 if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
3122 if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
3123 purgeRecords(NULL, FILE_MEMORY_LOCAL);
3124 purgeRecords(NULL, FILE_MEMORY_GLOBAL);
3125 purgeRecords(NULL, FILE_MEMORY_SHARED);
3126 } else {
3127 purgeRecords(NULL, ldst->src(0).getFile());
3128 }
3129 } else
3130 if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
3131 purgeRecords(NULL, FILE_SHADER_OUTPUT);
3132 }
3133 continue;
3134 }
3135 if (ldst->getPredicate()) // TODO: handle predicated ld/st
3136 continue;
3137 if (ldst->perPatch) // TODO: create separate per-patch lists
3138 continue;
3139
3140 if (isLoad) {
3141 DataFile file = ldst->src(0).getFile();
3142
3143 // if ld l[]/g[] look for previous store to eliminate the reload
3144 if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
3145 // TODO: shared memory ?
3146 rec = findRecord(ldst, false, isAdjacent);
3147 if (rec && !isAdjacent)
3148 keep = !replaceLdFromSt(ldst, rec);
3149 }
3150
3151 // or look for ld from the same location and replace this one
3152 rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
3153 if (rec) {
3154 if (!isAdjacent)
3155 keep = !replaceLdFromLd(ldst, rec);
3156 else
3157 // or combine a previous load with this one
3158 keep = !combineLd(rec, ldst);
3159 }
3160 if (keep)
3161 lockStores(ldst);
3162 } else {
3163 rec = findRecord(ldst, false, isAdjacent);
3164 if (rec) {
3165 if (!isAdjacent)
3166 keep = !replaceStFromSt(ldst, rec);
3167 else
3168 keep = !combineSt(rec, ldst);
3169 }
3170 if (keep)
3171 purgeRecords(ldst, DATA_FILE_COUNT);
3172 }
3173 if (keep)
3174 addRecord(ldst);
3175 }
3176 reset();
3177
3178 return true;
3179 }
3180
3181 // =============================================================================
3182
3183 // Turn control flow into predicated instructions (after register allocation !).
3184 // TODO:
3185 // Could move this to before register allocation on NVC0 and also handle nested
3186 // constructs.
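//
// A minimal illustrative sketch (hypothetical): for a small if/else diamond,
//   @p0 bra ... ; <then-block> ; <else-block>
// both sides are emitted unconditionally, the then-side predicated on one
// polarity of $p0 and the else-side on the other, with the flow ops removed.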
3187 class FlatteningPass : public Pass
3188 {
3189 private:
3190 virtual bool visit(Function *);
3191 virtual bool visit(BasicBlock *);
3192
3193 bool tryPredicateConditional(BasicBlock *);
3194 void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
3195 void tryPropagateBranch(BasicBlock *);
3196 inline bool isConstantCondition(Value *pred);
3197 inline bool mayPredicate(const Instruction *, const Value *pred) const;
3198 inline void removeFlow(Instruction *);
3199
3200 uint8_t gpr_unit;
3201 };
3202
3203 bool
3204 FlatteningPass::isConstantCondition(Value *pred)
3205 {
3206 Instruction *insn = pred->getUniqueInsn();
3207 assert(insn);
3208 if (insn->op != OP_SET || insn->srcExists(2))
3209 return false;
3210
3211 for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
3212 Instruction *ld = insn->getSrc(s)->getUniqueInsn();
3213 DataFile file;
3214 if (ld) {
3215 if (ld->op != OP_MOV && ld->op != OP_LOAD)
3216 return false;
3217 if (ld->src(0).isIndirect(0))
3218 return false;
3219 file = ld->src(0).getFile();
3220 } else {
3221 file = insn->src(s).getFile();
3222 // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
3223 // in register "units", which can vary between targets.
3224 if (file == FILE_GPR) {
3225 Value *v = insn->getSrc(s);
3226 int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
3227 int units = bytes >> gpr_unit;
3228 if (units > prog->maxGPR)
3229 file = FILE_IMMEDIATE;
3230 }
3231 }
3232 if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
3233 return false;
3234 }
3235 return true;
3236 }
3237
3238 void
3239 FlatteningPass::removeFlow(Instruction *insn)
3240 {
3241 FlowInstruction *term = insn ? insn->asFlow() : NULL;
3242 if (!term)
3243 return;
3244 Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();
3245
3246 if (term->op == OP_BRA) {
3247 // TODO: this might get more difficult when we get arbitrary BRAs
3248 if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
3249 return;
3250 } else
3251 if (term->op != OP_JOIN)
3252 return;
3253
3254 Value *pred = term->getPredicate();
3255
3256 delete_Instruction(prog, term);
3257
3258 if (pred && pred->refCount() == 0) {
3259 Instruction *pSet = pred->getUniqueInsn();
3260 pred->join->reg.data.id = -1; // deallocate
3261 if (pSet->isDead())
3262 delete_Instruction(prog, pSet);
3263 }
3264 }
3265
3266 void
3267 FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
3268 {
3269 for (Instruction *i = bb->getEntry(); i; i = i->next) {
3270 if (i->isNop())
3271 continue;
3272 assert(!i->getPredicate());
3273 i->setPredicate(cc, pred);
3274 }
3275 removeFlow(bb->getExit());
3276 }
3277
3278 bool
3279 FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
3280 {
3281 if (insn->isPseudo())
3282 return true;
3283 // TODO: calls where we don't know which registers are modified
3284
3285 if (!prog->getTarget()->mayPredicate(insn, pred))
3286 return false;
3287 for (int d = 0; insn->defExists(d); ++d)
3288 if (insn->getDef(d)->equals(pred))
3289 return false;
3290 return true;
3291 }
3292
3293 // If we jump to BRA/RET/EXIT, replace the jump with it.
3294 // NOTE: We do not update the CFG anymore here !
3295 //
3296 // TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
3297 // BB:0
3298 // @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
3299 // BB1:
3300 // bra BB:3
3301 // BB2:
3302 // ...
3303 // BB3:
3304 // ...
3305 void
3306 FlatteningPass::tryPropagateBranch(BasicBlock *bb)
3307 {
3308 for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
3309 BasicBlock *bf = i->asFlow()->target.bb;
3310
3311 if (bf->getInsnCount() != 1)
3312 continue;
3313
3314 FlowInstruction *bra = i->asFlow();
3315 FlowInstruction *rep = bf->getExit()->asFlow();
3316
3317 if (!rep || rep->getPredicate())
3318 continue;
3319 if (rep->op != OP_BRA &&
3320 rep->op != OP_JOIN &&
3321 rep->op != OP_EXIT)
3322 continue;
3323
3324 // TODO: If there are multiple branches to @rep, only the first would
3325 // be replaced, so only remove them after this pass is done ?
3326 // Also, need to check all incident blocks for fall-through exits and
3327 // add the branch there.
3328 bra->op = rep->op;
3329 bra->target.bb = rep->target.bb;
3330 if (bf->cfg.incidentCount() == 1)
3331 bf->remove(rep);
3332 }
3333 }
3334
3335 bool
3336 FlatteningPass::visit(Function *fn)
3337 {
3338 gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);
3339
3340 return true;
3341 }
3342
3343 bool
3344 FlatteningPass::visit(BasicBlock *bb)
3345 {
3346 if (tryPredicateConditional(bb))
3347 return true;
3348
3349 // try to attach join to previous instruction
3350 if (prog->getTarget()->hasJoin) {
3351 Instruction *insn = bb->getExit();
3352 if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
3353 insn = insn->prev;
3354 if (insn && !insn->getPredicate() &&
3355 !insn->asFlow() &&
3356 insn->op != OP_DISCARD &&
3357 insn->op != OP_TEXBAR &&
3358 !isTextureOp(insn->op) && // probably just nve4
3359 !isSurfaceOp(insn->op) && // not confirmed
3360 insn->op != OP_LINTERP && // probably just nve4
3361 insn->op != OP_PINTERP && // probably just nve4
3362 ((insn->op != OP_LOAD && insn->op != OP_STORE && insn->op != OP_ATOM) ||
3363 (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
3364 !insn->isNop()) {
3365 insn->join = 1;
3366 bb->remove(bb->getExit());
3367 return true;
3368 }
3369 }
3370 }
3371
3372 tryPropagateBranch(bb);
3373
3374 return true;
3375 }
3376
3377 bool
3378 FlatteningPass::tryPredicateConditional(BasicBlock *bb)
3379 {
3380 BasicBlock *bL = NULL, *bR = NULL;
3381 unsigned int nL = 0, nR = 0, limit = 12;
3382 Instruction *insn;
3383 unsigned int mask;
3384
3385 mask = bb->initiatesSimpleConditional();
3386 if (!mask)
3387 return false;
3388
3389 assert(bb->getExit());
3390 Value *pred = bb->getExit()->getPredicate();
3391 assert(pred);
3392
3393 if (isConstantCondition(pred))
3394 limit = 4;
3395
3396 Graph::EdgeIterator ei = bb->cfg.outgoing();
3397
3398 if (mask & 1) {
3399 bL = BasicBlock::get(ei.getNode());
3400 for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
3401 if (!mayPredicate(insn, pred))
3402 return false;
3403 if (nL > limit)
3404 return false; // too long, do a real branch
3405 }
3406 ei.next();
3407
3408 if (mask & 2) {
3409 bR = BasicBlock::get(ei.getNode());
3410 for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
3411 if (!mayPredicate(insn, pred))
3412 return false;
3413 if (nR > limit)
3414 return false; // too long, do a real branch
3415 }
3416
3417 if (bL)
3418 predicateInstructions(bL, pred, bb->getExit()->cc);
3419 if (bR)
3420 predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));
3421
3422 if (bb->joinAt) {
3423 bb->remove(bb->joinAt);
3424 bb->joinAt = NULL;
3425 }
3426 removeFlow(bb->getExit()); // delete the branch/join at the fork point
3427
3428 // remove potential join operations at the end of the conditional
3429 if (prog->getTarget()->joinAnterior) {
3430 bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
3431 if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
3432 removeFlow(bb->getEntry());
3433 }
3434
3435 return true;
3436 }
3437
3438 // =============================================================================
3439
3440 // Fold Immediate into MAD; must be done after register allocation due to
3441 // constraint SDST == SSRC2
3442 // TODO:
3443 // Does NVC0+ have other situations where this pass makes sense?
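// A minimal illustrative sketch (hypothetical post-RA f32 code, NVC0 path):
//   mov $r1, 0x40000000 ; mad f32 $r0, $r2, $r1, $r0
// becomes
//   mad f32 $r0, $r2, 2.0, $r0   (the now-dead mov is deleted below)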
3444 class PostRaLoadPropagation : public Pass
3445 {
3446 private:
3447 virtual bool visit(Instruction *);
3448
3449 void handleMADforNV50(Instruction *);
3450 void handleMADforNVC0(Instruction *);
3451 };
3452
3453 static bool
3454 post_ra_dead(Instruction *i)
3455 {
3456 for (int d = 0; i->defExists(d); ++d)
3457 if (i->getDef(d)->refCount())
3458 return false;
3459 return true;
3460 }
3461
3462 // Fold Immediate into MAD; must be done after register allocation due to
3463 // constraint SDST == SSRC2
3464 void
3465 PostRaLoadPropagation::handleMADforNV50(Instruction *i)
3466 {
3467 if (i->def(0).getFile() != FILE_GPR ||
3468 i->src(0).getFile() != FILE_GPR ||
3469 i->src(1).getFile() != FILE_GPR ||
3470 i->src(2).getFile() != FILE_GPR ||
3471 i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
3472 return;
3473
3474 if (i->getDef(0)->reg.data.id >= 64 ||
3475 i->getSrc(0)->reg.data.id >= 64)
3476 return;
3477
3478 if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
3479 return;
3480
3481 if (i->getPredicate())
3482 return;
3483
3484 Value *vtmp;
3485 Instruction *def = i->getSrc(1)->getInsn();
3486
3487 if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
3488 def = def->getSrc(0)->getInsn();
3489 if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
3490 vtmp = i->getSrc(1);
3491 if (isFloatType(i->sType)) {
3492 i->setSrc(1, def->getSrc(0));
3493 } else {
3494 ImmediateValue val;
3495 // getImmediate() has side-effects on the argument so this *shouldn't*
3496 // be folded into the assert()
3497 MAYBE_UNUSED bool ret = def->src(0).getImmediate(val);
3498 assert(ret);
3499 if (i->getSrc(1)->reg.data.id & 1)
3500 val.reg.data.u32 >>= 16;
3501 val.reg.data.u32 &= 0xffff;
3502 i->setSrc(1, new_ImmediateValue(prog, val.reg.data.u32));
3503 }
3504
3505 /* There's no post-RA dead code elimination, so do it here
3506 * XXX: if we add more code-removing post-RA passes, we might
3507 * want to create a post-RA dead-code elim pass */
3508 if (post_ra_dead(vtmp->getInsn())) {
3509 Value *src = vtmp->getInsn()->getSrc(0);
3510 // Careful -- splits will have already been removed from the
3511 // function. Don't double-delete.
3512 if (vtmp->getInsn()->bb)
3513 delete_Instruction(prog, vtmp->getInsn());
3514 if (src->getInsn() && post_ra_dead(src->getInsn()))
3515 delete_Instruction(prog, src->getInsn());
3516 }
3517 }
3518 }
3519
3520 void
3521 PostRaLoadPropagation::handleMADforNVC0(Instruction *i)
3522 {
3523 if (i->def(0).getFile() != FILE_GPR ||
3524 i->src(0).getFile() != FILE_GPR ||
3525 i->src(1).getFile() != FILE_GPR ||
3526 i->src(2).getFile() != FILE_GPR ||
3527 i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
3528 return;
3529
3530 // TODO: gm107 can also do this for S32, maybe other chipsets as well
3531 if (i->dType != TYPE_F32)
3532 return;
3533
3534 if ((i->src(2).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG))
3535 return;
3536
3537 ImmediateValue val;
3538 int s;
3539
3540 if (i->src(0).getImmediate(val))
3541 s = 1;
3542 else if (i->src(1).getImmediate(val))
3543 s = 0;
3544 else
3545 return;
3546
3547 if ((i->src(s).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG))
3548 return;
3549
3550 if (s == 1)
3551 i->swapSources(0, 1);
3552
3553 Instruction *imm = i->getSrc(1)->getInsn();
3554 i->setSrc(1, imm->getSrc(0));
3555 if (post_ra_dead(imm))
3556 delete_Instruction(prog, imm);
3557 }
3558
3559 bool
3560 PostRaLoadPropagation::visit(Instruction *i)
3561 {
3562 switch (i->op) {
3563 case OP_FMA:
3564 case OP_MAD:
3565 if (prog->getTarget()->getChipset() < 0xc0)
3566 handleMADforNV50(i);
3567 else
3568 handleMADforNVC0(i);
3569 break;
3570 default:
3571 break;
3572 }
3573
3574 return true;
3575 }
3576
3577 // =============================================================================
3578
3579 // Common subexpression elimination. Naive O(n^2) implementation.
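// A minimal illustrative sketch (hypothetical, within one basic block):
//   %a = add u32 %x, %y ; ... ; %b = add u32 %x, %y
// -> uses of %b are redirected to %a and the second add is deleted.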
3580 class LocalCSE : public Pass
3581 {
3582 private:
3583 virtual bool visit(BasicBlock *);
3584
3585 inline bool tryReplace(Instruction **, Instruction *);
3586
3587 DLList ops[OP_LAST + 1];
3588 };
3589
3590 class GlobalCSE : public Pass
3591 {
3592 private:
3593 virtual bool visit(BasicBlock *);
3594 };
3595
3596 bool
3597 Instruction::isActionEqual(const Instruction *that) const
3598 {
3599 if (this->op != that->op ||
3600 this->dType != that->dType ||
3601 this->sType != that->sType)
3602 return false;
3603 if (this->cc != that->cc)
3604 return false;
3605
3606 if (this->asTex()) {
3607 if (memcmp(&this->asTex()->tex,
3608 &that->asTex()->tex,
3609 sizeof(this->asTex()->tex)))
3610 return false;
3611 } else
3612 if (this->asCmp()) {
3613 if (this->asCmp()->setCond != that->asCmp()->setCond)
3614 return false;
3615 } else
3616 if (this->asFlow()) {
3617 return false;
3618 } else
3619 if (this->op == OP_PHI && this->bb != that->bb) {
3620 /* TODO: we could probably be a bit smarter here by following the
3621 * control flow, but honestly, it is quite painful to check */
3622 return false;
3623 } else {
3624 if (this->ipa != that->ipa ||
3625 this->lanes != that->lanes ||
3626 this->perPatch != that->perPatch)
3627 return false;
3628 if (this->postFactor != that->postFactor)
3629 return false;
3630 }
3631
3632 if (this->subOp != that->subOp ||
3633 this->saturate != that->saturate ||
3634 this->rnd != that->rnd ||
3635 this->ftz != that->ftz ||
3636 this->dnz != that->dnz ||
3637 this->cache != that->cache ||
3638 this->mask != that->mask)
3639 return false;
3640
3641 return true;
3642 }
3643
3644 bool
3645 Instruction::isResultEqual(const Instruction *that) const
3646 {
3647 unsigned int d, s;
3648
3649 // NOTE: location of discard only affects tex with liveOnly and quadops
3650 if (!this->defExists(0) && this->op != OP_DISCARD)
3651 return false;
3652
3653 if (!isActionEqual(that))
3654 return false;
3655
3656 if (this->predSrc != that->predSrc)
3657 return false;
3658
3659 for (d = 0; this->defExists(d); ++d) {
3660 if (!that->defExists(d) ||
3661 !this->getDef(d)->equals(that->getDef(d), false))
3662 return false;
3663 }
3664 if (that->defExists(d))
3665 return false;
3666
3667 for (s = 0; this->srcExists(s); ++s) {
3668 if (!that->srcExists(s))
3669 return false;
3670 if (this->src(s).mod != that->src(s).mod)
3671 return false;
3672 if (!this->getSrc(s)->equals(that->getSrc(s), true))
3673 return false;
3674 }
3675 if (that->srcExists(s))
3676 return false;
3677
3678 if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
3679 switch (src(0).getFile()) {
3680 case FILE_MEMORY_CONST:
3681 case FILE_SHADER_INPUT:
3682 return true;
3683 case FILE_SHADER_OUTPUT:
3684 return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
3685 default:
3686 return false;
3687 }
3688 }
3689
3690 return true;
3691 }
3692
3693 // pull through common expressions from different in-blocks
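// A minimal illustrative sketch (hypothetical): if every phi source is the
// sole use of an equal expression,
//   BB1: %a = add u32 %x, %y    BB2: %b = add u32 %x, %y
//   BB3: %p = phi %a, %b
// one copy of the add is moved into BB3's head and the phi deleted.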
3694 bool
3695 GlobalCSE::visit(BasicBlock *bb)
3696 {
3697 Instruction *phi, *next, *ik;
3698 int s;
3699
3700 // TODO: maybe do this with OP_UNION, too
3701
3702 for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
3703 next = phi->next;
3704 if (phi->getSrc(0)->refCount() > 1)
3705 continue;
3706 ik = phi->getSrc(0)->getInsn();
3707 if (!ik)
3708 continue; // probably a function input
3709 if (ik->defCount(0xff) > 1)
3710 continue; // too painful to check if we can really push this forward
3711 for (s = 1; phi->srcExists(s); ++s) {
3712 if (phi->getSrc(s)->refCount() > 1)
3713 break;
3714 if (!phi->getSrc(s)->getInsn() ||
3715 !phi->getSrc(s)->getInsn()->isResultEqual(ik))
3716 break;
3717 }
3718 if (!phi->srcExists(s)) {
3719 assert(ik->op != OP_PHI);
3720 Instruction *entry = bb->getEntry();
3721 ik->bb->remove(ik);
3722 if (!entry || entry->op != OP_JOIN)
3723 bb->insertHead(ik);
3724 else
3725 bb->insertAfter(entry, ik);
3726 ik->setDef(0, phi->getDef(0));
3727 delete_Instruction(prog, phi);
3728 }
3729 }
3730
3731 return true;
3732 }
3733
3734 bool
3735 LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
3736 {
3737 Instruction *old = *ptr;
3738
3739 // TODO: maybe relax this later (causes trouble with OP_UNION)
3740 if (i->isPredicated())
3741 return false;
3742
3743 if (!old->isResultEqual(i))
3744 return false;
3745
3746 for (int d = 0; old->defExists(d); ++d)
3747 old->def(d).replace(i->getDef(d), false);
3748 delete_Instruction(prog, old);
3749 *ptr = NULL;
3750 return true;
3751 }
3752
3753 bool
3754 LocalCSE::visit(BasicBlock *bb)
3755 {
3756 unsigned int replaced;
3757
3758 do {
3759 Instruction *ir, *next;
3760
3761 replaced = 0;
3762
3763 // will need to know the order of instructions
3764 int serial = 0;
3765 for (ir = bb->getFirst(); ir; ir = ir->next)
3766 ir->serial = serial++;
3767
3768 for (ir = bb->getFirst(); ir; ir = next) {
3769 int s;
3770 Value *src = NULL;
3771
3772 next = ir->next;
3773
3774 if (ir->fixed) {
3775 ops[ir->op].insert(ir);
3776 continue;
3777 }
3778
3779 for (s = 0; ir->srcExists(s); ++s)
3780 if (ir->getSrc(s)->asLValue())
3781 if (!src || ir->getSrc(s)->refCount() < src->refCount())
3782 src = ir->getSrc(s);
3783
3784 if (src) {
3785 for (Value::UseIterator it = src->uses.begin();
3786 it != src->uses.end(); ++it) {
3787 Instruction *ik = (*it)->getInsn();
3788 if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
3789 if (tryReplace(&ir, ik))
3790 break;
3791 }
3792 } else {
3793 DLLIST_FOR_EACH(&ops[ir->op], iter)
3794 {
3795 Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
3796 if (tryReplace(&ir, ik))
3797 break;
3798 }
3799 }
3800
3801 if (ir)
3802 ops[ir->op].insert(ir);
3803 else
3804 ++replaced;
3805 }
3806 for (unsigned int i = 0; i <= OP_LAST; ++i)
3807 ops[i].clear();
3808
3809 } while (replaced);
3810
3811 return true;
3812 }
3813
3814 // =============================================================================
3815
3816 // Remove computations of unused values.
3817 class DeadCodeElim : public Pass
3818 {
3819 public:
3820 bool buryAll(Program *);
3821
3822 private:
3823 virtual bool visit(BasicBlock *);
3824
3825 void checkSplitLoad(Instruction *ld); // for partially dead loads
3826
3827 unsigned int deadCount;
3828 };
3829
3830 bool
3831 DeadCodeElim::buryAll(Program *prog)
3832 {
3833 do {
3834 deadCount = 0;
3835 if (!this->run(prog, false, false))
3836 return false;
3837 } while (deadCount);
3838
3839 return true;
3840 }
3841
3842 bool
3843 DeadCodeElim::visit(BasicBlock *bb)
3844 {
3845 Instruction *prev;
3846
3847 for (Instruction *i = bb->getExit(); i; i = prev) {
3848 prev = i->prev;
3849 if (i->isDead()) {
3850 ++deadCount;
3851 delete_Instruction(prog, i);
3852 } else
3853 if (i->defExists(1) &&
3854 i->subOp == 0 &&
3855 (i->op == OP_VFETCH || i->op == OP_LOAD)) {
3856 checkSplitLoad(i);
3857 } else
3858 if (i->defExists(0) && !i->getDef(0)->refCount()) {
3859 if (i->op == OP_ATOM ||
3860 i->op == OP_SUREDP ||
3861 i->op == OP_SUREDB) {
3862 i->setDef(0, NULL);
3863 if (i->op == OP_ATOM && i->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
3864 i->cache = CACHE_CV;
3865 i->op = OP_STORE;
3866 i->subOp = 0;
3867 }
3868 } else if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
3869 i->setDef(0, i->getDef(1));
3870 i->setDef(1, NULL);
3871 }
3872 }
3873 }
3874 return true;
3875 }
3876
3877 // Each load can go into up to 4 destinations, any of which might potentially
3878 // be dead (i.e. a hole). These can always be split into 2 loads, independent
3879 // of where the holes are. We find the first contiguous region, put it into
3880 // the first load, and then put the second contiguous region into the second
3881 // load. There can be at most 2 contiguous regions.
3882 //
3883 // Note that there are some restrictions, for example it's not possible to do
3884 // a 64-bit load that's not 64-bit aligned, so such a load has to be split
3885 // up. Also hardware doesn't support 96-bit loads, so those also have to be
3886 // split into a 64-bit and 32-bit load.
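//
// A minimal illustrative sketch (hypothetical vec4 load with a dead lane):
//   ld v4u32 { %a, %dead, %c, %d }, l[0x00]
// becomes
//   ld u32 %a, l[0x00]   +   ld u64 { %c, %d }, l[0x08]
// i.e. one load per contiguous live region.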
3887 void
3888 DeadCodeElim::checkSplitLoad(Instruction *ld1)
3889 {
3890 Instruction *ld2 = NULL; // can get at most 2 loads
3891 Value *def1[4];
3892 Value *def2[4];
3893 int32_t addr1, addr2;
3894 int32_t size1, size2;
3895 int d, n1, n2;
3896 uint32_t mask = 0xffffffff;
3897
3898 for (d = 0; ld1->defExists(d); ++d)
3899 if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
3900 mask &= ~(1 << d);
3901 if (mask == 0xffffffff)
3902 return;
3903
3904 addr1 = ld1->getSrc(0)->reg.data.offset;
3905 n1 = n2 = 0;
3906 size1 = size2 = 0;
3907
3908 // Compute address/width for first load
3909 for (d = 0; ld1->defExists(d); ++d) {
3910 if (mask & (1 << d)) {
3911 if (size1 && (addr1 & 0x7))
3912 break;
3913 def1[n1] = ld1->getDef(d);
3914 size1 += def1[n1++]->reg.size;
3915 } else
3916 if (!n1) {
3917 addr1 += ld1->getDef(d)->reg.size;
3918 } else {
3919 break;
3920 }
3921 }
3922
3923 // Scale back the size of the first load until the target supports an access
3924 // of that size. This typically happens for TYPE_B96 loads.
3925 while (n1 &&
3926 !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
3927 typeOfSize(size1))) {
3928 size1 -= def1[--n1]->reg.size;
3929 d--;
3930 }
3931
3932 // Compute address/width for second load
3933 for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
3934 if (mask & (1 << d)) {
3935 assert(!size2 || !(addr2 & 0x7));
3936 def2[n2] = ld1->getDef(d);
3937 size2 += def2[n2++]->reg.size;
3938 } else if (!n2) {
3939 assert(!n2);
3940 addr2 += ld1->getDef(d)->reg.size;
3941 } else {
3942 break;
3943 }
3944 }
3945
3946 // Make sure that we've processed all the values
3947 for (; ld1->defExists(d); ++d)
3948 assert(!(mask & (1 << d)));
3949
3950 updateLdStOffset(ld1, addr1, func);
3951 ld1->setType(typeOfSize(size1));
3952 for (d = 0; d < 4; ++d)
3953 ld1->setDef(d, (d < n1) ? def1[d] : NULL);
3954
3955 if (!n2)
3956 return;
3957
3958 ld2 = cloneShallow(func, ld1);
3959 updateLdStOffset(ld2, addr2, func);
3960 ld2->setType(typeOfSize(size2));
3961 for (d = 0; d < 4; ++d)
3962 ld2->setDef(d, (d < n2) ? def2[d] : NULL);
3963
3964 ld1->bb->insertAfter(ld1, ld2);
3965 }
3966
3967 // =============================================================================
3968
3969 #define RUN_PASS(l, n, f) \
3970 if (level >= (l)) { \
3971 if (dbgFlags & NV50_IR_DEBUG_VERBOSE) \
3972 INFO("PEEPHOLE: %s\n", #n); \
3973 n pass; \
3974 if (!pass.f(this)) \
3975 return false; \
3976 }
3977
3978 bool
3979 Program::optimizeSSA(int level)
3980 {
3981 RUN_PASS(1, DeadCodeElim, buryAll);
3982 RUN_PASS(1, CopyPropagation, run);
3983 RUN_PASS(1, MergeSplits, run);
3984 RUN_PASS(2, GlobalCSE, run);
3985 RUN_PASS(1, LocalCSE, run);
3986 RUN_PASS(2, AlgebraicOpt, run);
3987 RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
3988 RUN_PASS(1, ConstantFolding, foldAll);
3989 RUN_PASS(0, Split64BitOpPreRA, run);
3990 RUN_PASS(2, LateAlgebraicOpt, run);
3991 RUN_PASS(1, LoadPropagation, run);
3992 RUN_PASS(1, IndirectPropagation, run);
3993 RUN_PASS(2, MemoryOpt, run);
3994 RUN_PASS(2, LocalCSE, run);
3995 RUN_PASS(0, DeadCodeElim, buryAll);
3996
3997 return true;
3998 }
3999
4000 bool
4001 Program::optimizePostRA(int level)
4002 {
4003 RUN_PASS(2, FlatteningPass, run);
4004 RUN_PASS(2, PostRaLoadPropagation, run);
4005
4006 return true;
4007 }
4008
4009 }