src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp

   1 /*
   2  * Copyright 2011 Christoph Bumiller
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 #include "codegen/nv50_ir.h"
  24 #include "codegen/nv50_ir_target.h"
  25 #include "codegen/nv50_ir_build_util.h"
  26
  27 extern "C" {
  28 #include "util/u_math.h"
  29 }
  30
  31 namespace nv50_ir {
  32
  33 bool
  34 Instruction::isNop() const
  35 {
  36    if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
  37       return true;
  38    if (terminator || join) // XXX: should terminator imply flow ?
  39       return false;
  40    if (op == OP_ATOM)
  41       return false;
  42    if (!fixed && op == OP_NOP)
  43       return true;
  44
  45    if (defExists(0) && def(0).rep()->reg.data.id < 0) {
  46       for (int d = 1; defExists(d); ++d)
  47          if (def(d).rep()->reg.data.id >= 0)
  48             WARN("part of vector result is unused !\n");
  49       return true;
  50    }
  51
  52    if (op == OP_MOV || op == OP_UNION) {
  53       if (!getDef(0)->equals(getSrc(0)))
  54          return false;
  55       if (op == OP_UNION)
  56          if (!def(0).rep()->equals(getSrc(1)))
  57             return false;
  58       return true;
  59    }
  60
  61    return false;
  62 }
  63
  64 bool Instruction::isDead() const
  65 {
  66    if (op == OP_STORE ||
  67        op == OP_EXPORT ||
  68        op == OP_ATOM ||
  69        op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB ||
  70        op == OP_WRSV)
  71       return false;
  72
  73    for (int d = 0; defExists(d); ++d)
  74       if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
  75          return false;
  76
  77    if (terminator || asFlow())
  78       return false;
  79    if (fixed)
  80       return false;
  81
  82    return true;
  83 };
  84
  85 // =============================================================================
  86
  87 class CopyPropagation : public Pass
  88 {
  89 private:
  90    virtual bool visit(BasicBlock *);
  91 };
  92
  93 // Propagate all MOVs forward to make subsequent optimization easier, except if
  94 // the sources stem from a phi, in which case we don't want to mess up potential
  95 // swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
  96 bool
  97 CopyPropagation::visit(BasicBlock *bb)
  98 {
  99    Instruction *mov, *si, *next;
 100
 101    for (mov = bb->getEntry(); mov; mov = next) {
 102       next = mov->next;
 103       if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
 104          continue;
 105       if (mov->getPredicate())
 106          continue;
 107       if (mov->def(0).getFile() != mov->src(0).getFile())
 108          continue;
 109       si = mov->getSrc(0)->getInsn();
 110       if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
 111          // propagate
 112          mov->def(0).replace(mov->getSrc(0), false);
 113          delete_Instruction(prog, mov);
 114       }
 115    }
 116    return true;
 117 }
 118
 119 // =============================================================================
 120
 121 class MergeSplits : public Pass
 122 {
 123 private:
 124    virtual bool visit(BasicBlock *);
 125 };
 126
 127 // For SPLIT / MERGE pairs that operate on the same registers, replace the
 128 // post-merge def with the SPLIT's source.
 129 bool
 130 MergeSplits::visit(BasicBlock *bb)
 131 {
 132    Instruction *i, *next, *si;
 133
 134    for (i = bb->getEntry(); i; i = next) {
 135       next = i->next;
 136       if (i->op != OP_MERGE || typeSizeof(i->dType) != 8)
 137          continue;
 138       si = i->getSrc(0)->getInsn();
 139       if (si->op != OP_SPLIT || si != i->getSrc(1)->getInsn())
 140          continue;
 141       i->def(0).replace(si->getSrc(0), false);
 142       delete_Instruction(prog, i);
 143    }
 144
 145    return true;
 146 }
 147
 148 // =============================================================================
 149
 150 class LoadPropagation : public Pass
 151 {
 152 private:
 153    virtual bool visit(BasicBlock *);
 154
 155    void checkSwapSrc01(Instruction *);
 156
 157    bool isCSpaceLoad(Instruction *);
 158    bool isImmdLoad(Instruction *);
 159    bool isAttribOrSharedLoad(Instruction *);
 160 };
 161
 162 bool
 163 LoadPropagation::isCSpaceLoad(Instruction *ld)
 164 {
 165    return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
 166 }
 167
 168 bool
 169 LoadPropagation::isImmdLoad(Instruction *ld)
 170 {
 171    if (!ld || (ld->op != OP_MOV) ||
 172        ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
 173       return false;
 174
 175    // A 0 can be replaced with a register, so it doesn't count as an immediate.
 176    ImmediateValue val;
 177    return ld->src(0).getImmediate(val) && !val.isInteger(0);
 178 }
 179
 180 bool
 181 LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
 182 {
 183    return ld &&
 184       (ld->op == OP_VFETCH ||
 185        (ld->op == OP_LOAD &&
 186         (ld->src(0).getFile() == FILE_SHADER_INPUT ||
 187          ld->src(0).getFile() == FILE_MEMORY_SHARED)));
 188 }
 189
 190 void
 191 LoadPropagation::checkSwapSrc01(Instruction *insn)
 192 {
 193    const Target *targ = prog->getTarget();
 194    if (!targ->getOpInfo(insn).commutative)
 195       if (insn->op != OP_SET && insn->op != OP_SLCT)
 196          return;
 197    if (insn->src(1).getFile() != FILE_GPR)
 198       return;
 199
 200    Instruction *i0 = insn->getSrc(0)->getInsn();
 201    Instruction *i1 = insn->getSrc(1)->getInsn();
 202
 203    // Swap sources to inline the less frequently used source. That way,
 204    // optimistically, it will eventually be able to remove the instruction.
 205    int i0refs = insn->getSrc(0)->refCount();
 206    int i1refs = insn->getSrc(1)->refCount();
 207
 208    if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
 209       if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
 210           !targ->insnCanLoad(insn, 1, i1) ||
 211           i0refs < i1refs)
 212          insn->swapSources(0, 1);
 213       else
 214          return;
 215    } else
 216    if (isAttribOrSharedLoad(i1)) {
 217       if (!isAttribOrSharedLoad(i0))
 218          insn->swapSources(0, 1);
 219       else
 220          return;
 221    } else {
 222       return;
 223    }
 224
 225    if (insn->op == OP_SET || insn->op == OP_SET_AND ||
 226        insn->op == OP_SET_OR || insn->op == OP_SET_XOR)
 227       insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
 228    else
 229    if (insn->op == OP_SLCT)
 230       insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
 231 }
 232
 233 bool
 234 LoadPropagation::visit(BasicBlock *bb)
 235 {
 236    const Target *targ = prog->getTarget();
 237    Instruction *next;
 238
 239    for (Instruction *i = bb->getEntry(); i; i = next) {
 240       next = i->next;
 241
 242       if (i->op == OP_CALL) // calls have args as sources, they must be in regs
 243          continue;
 244
 245       if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
 246          continue;
 247
 248       if (i->srcExists(1))
 249          checkSwapSrc01(i);
 250
 251       for (int s = 0; i->srcExists(s); ++s) {
 252          Instruction *ld = i->getSrc(s)->getInsn();
 253
 254          if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
 255             continue;
 256          if (!targ->insnCanLoad(i, s, ld))
 257             continue;
 258
 259          // propagate !
 260          i->setSrc(s, ld->getSrc(0));
 261          if (ld->src(0).isIndirect(0))
 262             i->setIndirect(s, 0, ld->getIndirect(0, 0));
 263
 264          if (ld->getDef(0)->refCount() == 0)
 265             delete_Instruction(prog, ld);
 266       }
 267    }
 268    return true;
 269 }
 270
 271 // =============================================================================
 272
 273 class IndirectPropagation : public Pass
 274 {
 275 private:
 276    virtual bool visit(BasicBlock *);
 277 };
 278
 279 bool
 280 IndirectPropagation::visit(BasicBlock *bb)
 281 {
 282    const Target *targ = prog->getTarget();
 283    Instruction *next;
 284
 285    for (Instruction *i = bb->getEntry(); i; i = next) {
 286       next = i->next;
 287
 288       for (int s = 0; i->srcExists(s); ++s) {
 289          Instruction *insn;
 290          ImmediateValue imm;
 291          if (!i->src(s).isIndirect(0))
 292             continue;
 293          insn = i->getIndirect(s, 0)->getInsn();
 294          if (!insn)
 295             continue;
 296          if (insn->op == OP_ADD && !isFloatType(insn->dType)) {
 297             if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
 298                 !insn->src(1).getImmediate(imm) ||
 299                 !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
 300                continue;
 301             i->setIndirect(s, 0, insn->getSrc(0));
 302             i->setSrc(s, cloneShallow(func, i->getSrc(s)));
 303             i->src(s).get()->reg.data.offset += imm.reg.data.u32;
 304          } else if (insn->op == OP_SUB && !isFloatType(insn->dType)) {
 305             if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
 306                 !insn->src(1).getImmediate(imm) ||
 307                 !targ->insnCanLoadOffset(i, s, -imm.reg.data.s32))
 308                continue;
 309             i->setIndirect(s, 0, insn->getSrc(0));
 310             i->setSrc(s, cloneShallow(func, i->getSrc(s)));
 311             i->src(s).get()->reg.data.offset -= imm.reg.data.u32;
 312          } else if (insn->op == OP_MOV) {
 313             if (!insn->src(0).getImmediate(imm) ||
 314                 !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
 315                continue;
 316             i->setIndirect(s, 0, NULL);
 317             i->setSrc(s, cloneShallow(func, i->getSrc(s)));
 318             i->src(s).get()->reg.data.offset += imm.reg.data.u32;
 319          }
 320       }
 321    }
 322    return true;
 323 }
 324
 325 // =============================================================================
 326
 327 // Evaluate constant expressions.
 328 class ConstantFolding : public Pass
 329 {
 330 public:
 331    bool foldAll(Program *);
 332
 333 private:
 334    virtual bool visit(BasicBlock *);
 335
 336    void expr(Instruction *, ImmediateValue&, ImmediateValue&);
 337    void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
 338    void opnd(Instruction *, ImmediateValue&, int s);
 339
 340    void unary(Instruction *, const ImmediateValue&);
 341
 342    void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);
 343
 344    CmpInstruction *findOriginForTestWithZero(Value *);
 345
 346    unsigned int foldCount;
 347
 348    BuildUtil bld;
 349 };
 350
 351 // TODO: remember generated immediates and only revisit these
 352 bool
 353 ConstantFolding::foldAll(Program *prog)
 354 {
 355    unsigned int iterCount = 0;
 356    do {
 357       foldCount = 0;
 358       if (!run(prog))
 359          return false;
 360    } while (foldCount && ++iterCount < 2);
 361    return true;
 362 }
 363
 364 bool
 365 ConstantFolding::visit(BasicBlock *bb)
 366 {
 367    Instruction *i, *next;
 368
 369    for (i = bb->getEntry(); i; i = next) {
 370       next = i->next;
 371       if (i->op == OP_MOV || i->op == OP_CALL)
 372          continue;
 373
 374       ImmediateValue src0, src1, src2;
 375
 376       if (i->srcExists(2) &&
 377           i->src(0).getImmediate(src0) &&
 378           i->src(1).getImmediate(src1) &&
 379           i->src(2).getImmediate(src2))
 380          expr(i, src0, src1, src2);
 381       else
 382       if (i->srcExists(1) &&
 383           i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
 384          expr(i, src0, src1);
 385       else
 386       if (i->srcExists(0) && i->src(0).getImmediate(src0))
 387          opnd(i, src0, 0);
 388       else
 389       if (i->srcExists(1) && i->src(1).getImmediate(src1))
 390          opnd(i, src1, 1);
 391    }
 392    return true;
 393 }
 394
 395 CmpInstruction *
 396 ConstantFolding::findOriginForTestWithZero(Value *value)
 397 {
 398    if (!value)
 399       return NULL;
 400    Instruction *insn = value->getInsn();
 401
 402    if (insn->asCmp() && insn->op != OP_SLCT)
 403       return insn->asCmp();
 404
 405    /* Sometimes mov's will sneak in as a result of other folding. This gets
 406     * cleaned up later.
 407     */
 408    if (insn->op == OP_MOV)
 409       return findOriginForTestWithZero(insn->getSrc(0));
 410
 411    /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
 412    if (insn->op == OP_AND) {
 413       int s = 0;
 414       ImmediateValue imm;
 415       if (!insn->src(s).getImmediate(imm)) {
 416          s = 1;
 417          if (!insn->src(s).getImmediate(imm))
 418             return NULL;
 419       }
 420       if (imm.reg.data.f32 != 1.0f)
 421          return NULL;
 422       /* TODO: Come up with a way to handle the condition being inverted */
 423       if (insn->src(!s).mod != Modifier(0))
 424          return NULL;
 425       return findOriginForTestWithZero(insn->getSrc(!s));
 426    }
 427
 428    return NULL;
 429 }
 430
 431 void
 432 Modifier::applyTo(ImmediateValue& imm) const
 433 {
 434    if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. b128)
 435       return;
 436    switch (imm.reg.type) {
 437    case TYPE_F32:
 438       if (bits & NV50_IR_MOD_ABS)
 439          imm.reg.data.f32 = fabsf(imm.reg.data.f32);
 440       if (bits & NV50_IR_MOD_NEG)
 441          imm.reg.data.f32 = -imm.reg.data.f32;
 442       if (bits & NV50_IR_MOD_SAT) {
 443          if (imm.reg.data.f32 < 0.0f)
 444             imm.reg.data.f32 = 0.0f;
 445          else
 446          if (imm.reg.data.f32 > 1.0f)
 447             imm.reg.data.f32 = 1.0f;
 448       }
 449       assert(!(bits & NV50_IR_MOD_NOT));
 450       break;
 451
 452    case TYPE_S8: // NOTE: will be extended
 453    case TYPE_S16:
 454    case TYPE_S32:
 455    case TYPE_U8: // NOTE: treated as signed
 456    case TYPE_U16:
 457    case TYPE_U32:
 458       if (bits & NV50_IR_MOD_ABS)
 459          imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
 460             imm.reg.data.s32 : -imm.reg.data.s32;
 461       if (bits & NV50_IR_MOD_NEG)
 462          imm.reg.data.s32 = -imm.reg.data.s32;
 463       if (bits & NV50_IR_MOD_NOT)
 464          imm.reg.data.s32 = ~imm.reg.data.s32;
 465       break;
 466
 467    case TYPE_F64:
 468       if (bits & NV50_IR_MOD_ABS)
 469          imm.reg.data.f64 = fabs(imm.reg.data.f64);
 470       if (bits & NV50_IR_MOD_NEG)
 471          imm.reg.data.f64 = -imm.reg.data.f64;
 472       if (bits & NV50_IR_MOD_SAT) {
 473          if (imm.reg.data.f64 < 0.0)
 474             imm.reg.data.f64 = 0.0;
 475          else
 476          if (imm.reg.data.f64 > 1.0)
 477             imm.reg.data.f64 = 1.0;
 478       }
 479       assert(!(bits & NV50_IR_MOD_NOT));
 480       break;
 481
 482    default:
 483       assert(!"invalid/unhandled type");
 484       imm.reg.data.u64 = 0;
 485       break;
 486    }
 487 }
 488
 489 operation
 490 Modifier::getOp() const
 491 {
 492    switch (bits) {
 493    case NV50_IR_MOD_ABS: return OP_ABS;
 494    case NV50_IR_MOD_NEG: return OP_NEG;
 495    case NV50_IR_MOD_SAT: return OP_SAT;
 496    case NV50_IR_MOD_NOT: return OP_NOT;
 497    case 0:
 498       return OP_MOV;
 499    default:
 500       return OP_CVT;
 501    }
 502 }
 503
 504 void
 505 ConstantFolding::expr(Instruction *i,
 506                       ImmediateValue &imm0, ImmediateValue &imm1)
 507 {
 508    struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
 509    struct Storage res;
 510    DataType type = i->dType;
 511
 512    memset(&res.data, 0, sizeof(res.data));
 513
 514    switch (i->op) {
 515    case OP_MAD:
 516    case OP_FMA:
 517    case OP_MUL:
 518       if (i->dnz && i->dType == TYPE_F32) {
 519          if (!isfinite(a->data.f32))
 520             a->data.f32 = 0.0f;
 521          if (!isfinite(b->data.f32))
 522             b->data.f32 = 0.0f;
 523       }
 524       switch (i->dType) {
 525       case TYPE_F32:
 526          res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor);
 527          break;
 528       case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
 529       case TYPE_S32:
 530          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
 531             res.data.s32 = ((int64_t)a->data.s32 * b->data.s32) >> 32;
 532             break;
 533          }
 534          /* fallthrough */
 535       case TYPE_U32:
 536          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
 537             res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32) >> 32;
 538             break;
 539          }
 540          res.data.u32 = a->data.u32 * b->data.u32; break;
 541       default:
 542          return;
 543       }
 544       break;
 545    case OP_DIV:
 546       if (b->data.u32 == 0)
 547          break;
 548       switch (i->dType) {
 549       case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
 550       case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
 551       case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
 552       case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
 553       default:
 554          return;
 555       }
 556       break;
 557    case OP_ADD:
 558       switch (i->dType) {
 559       case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
 560       case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
 561       case TYPE_S32:
 562       case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
 563       default:
 564          return;
 565       }
 566       break;
 567    case OP_POW:
 568       switch (i->dType) {
 569       case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
 570       case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
 571       default:
 572          return;
 573       }
 574       break;
 575    case OP_MAX:
 576       switch (i->dType) {
 577       case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
 578       case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
 579       case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
 580       case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
 581       default:
 582          return;
 583       }
 584       break;
 585    case OP_MIN:
 586       switch (i->dType) {
 587       case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
 588       case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
 589       case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
 590       case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
 591       default:
 592          return;
 593       }
 594       break;
 595    case OP_AND:
 596       res.data.u64 = a->data.u64 & b->data.u64;
 597       break;
 598    case OP_OR:
 599       res.data.u64 = a->data.u64 | b->data.u64;
 600       break;
 601    case OP_XOR:
 602       res.data.u64 = a->data.u64 ^ b->data.u64;
 603       break;
 604    case OP_SHL:
 605       res.data.u32 = a->data.u32 << b->data.u32;
 606       break;
 607    case OP_SHR:
 608       switch (i->dType) {
 609       case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
 610       case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
 611       default:
 612          return;
 613       }
 614       break;
 615    case OP_SLCT:
 616       if (a->data.u32 != b->data.u32)
 617          return;
 618       res.data.u32 = a->data.u32;
 619       break;
 620    case OP_EXTBF: {
 621       int offset = b->data.u32 & 0xff;
 622       int width = (b->data.u32 >> 8) & 0xff;
 623       int rshift = offset;
 624       int lshift = 0;
 625       if (width == 0) {
 626          res.data.u32 = 0;
 627          break;
 628       }
 629       if (width + offset < 32) {
 630          rshift = 32 - width;
 631          lshift = 32 - width - offset;
 632       }
 633       if (i->subOp == NV50_IR_SUBOP_EXTBF_REV)
 634          res.data.u32 = util_bitreverse(a->data.u32);
 635       else
 636          res.data.u32 = a->data.u32;
 637       switch (i->dType) {
 638       case TYPE_S32: res.data.s32 = (res.data.s32 << lshift) >> rshift; break;
 639       case TYPE_U32: res.data.u32 = (res.data.u32 << lshift) >> rshift; break;
 640       default:
 641          return;
 642       }
 643       break;
 644    }
 645    case OP_POPCNT:
 646       res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
 647       break;
 648    case OP_PFETCH:
 649       // The two arguments to pfetch are logically added together. Normally
 650       // the second argument will not be constant, but that can happen.
 651       res.data.u32 = a->data.u32 + b->data.u32;
 652       type = TYPE_U32;
 653       break;
 654    case OP_MERGE:
 655       switch (i->dType) {
 656       case TYPE_U64:
 657       case TYPE_S64:
 658       case TYPE_F64:
 659          res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
 660          break;
 661       default:
 662          return;
 663       }
 664       break;
 665    default:
 666       return;
 667    }
 668    ++foldCount;
 669
 670    i->src(0).mod = Modifier(0);
 671    i->src(1).mod = Modifier(0);
 672    i->postFactor = 0;
 673
 674    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
 675    i->setSrc(1, NULL);
 676
 677    i->getSrc(0)->reg.data = res.data;
 678    i->getSrc(0)->reg.type = type;
 679    i->getSrc(0)->reg.size = typeSizeof(type);
 680
 681    switch (i->op) {
 682    case OP_MAD:
 683    case OP_FMA: {
 684       ImmediateValue src0, src1 = *i->getSrc(0)->asImm();
 685
 686       // Move the immediate into position 1, where we know it might be
 687       // emittable. However it might not be anyways, as there may be other
 688       // restrictions, so move it into a separate LValue.
 689       bld.setPosition(i, false);
 690       i->op = OP_ADD;
 691       i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
 692       i->setSrc(0, i->getSrc(2));
 693       i->src(0).mod = i->src(2).mod;
 694       i->setSrc(2, NULL);
 695
 696       if (i->src(0).getImmediate(src0))
 697          expr(i, src0, src1);
 698       else
 699          opnd(i, src1, 1);
 700       break;
 701    }
 702    case OP_PFETCH:
 703       // Leave PFETCH alone... we just folded its 2 args into 1.
 704       break;
 705    default:
 706       i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */
 707       break;
 708    }
 709    i->subOp = 0;
 710 }
 711
 712 void
 713 ConstantFolding::expr(Instruction *i,
 714                       ImmediateValue &imm0,
 715                       ImmediateValue &imm1,
 716                       ImmediateValue &imm2)
 717 {
 718    struct Storage *const a = &imm0.reg, *const b = &imm1.reg, *const c = &imm2.reg;
 719    struct Storage res;
 720
 721    memset(&res.data, 0, sizeof(res.data));
 722
 723    switch (i->op) {
 724    case OP_INSBF: {
 725       int offset = b->data.u32 & 0xff;
 726       int width = (b->data.u32 >> 8) & 0xff;
 727       unsigned bitmask = ((1 << width) - 1) << offset;
 728       res.data.u32 = ((a->data.u32 << offset) & bitmask) | (c->data.u32 & ~bitmask);
 729       break;
 730    }
 731    case OP_MAD:
 732    case OP_FMA: {
 733       switch (i->dType) {
 734       case TYPE_F32:
 735          res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor) +
 736             c->data.f32;
 737          break;
 738       case TYPE_F64:
 739          res.data.f64 = a->data.f64 * b->data.f64 + c->data.f64;
 740          break;
 741       case TYPE_S32:
 742          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
 743             res.data.s32 = ((int64_t)a->data.s32 * b->data.s32 >> 32) + c->data.s32;
 744             break;
 745          }
 746          /* fallthrough */
 747       case TYPE_U32:
 748          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
 749             res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32 >> 32) + c->data.u32;
 750             break;
 751          }
 752          res.data.u32 = a->data.u32 * b->data.u32 + c->data.u32;
 753          break;
 754       default:
 755          return;
 756       }
 757       break;
 758    }
 759    default:
 760       return;
 761    }
 762
 763    ++foldCount;
 764    i->src(0).mod = Modifier(0);
 765    i->src(1).mod = Modifier(0);
 766    i->src(2).mod = Modifier(0);
 767
 768    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
 769    i->setSrc(1, NULL);
 770    i->setSrc(2, NULL);
 771
 772    i->getSrc(0)->reg.data = res.data;
 773    i->getSrc(0)->reg.type = i->dType;
 774    i->getSrc(0)->reg.size = typeSizeof(i->dType);
 775
 776    i->op = OP_MOV;
 777 }
 778
 779 void
 780 ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
 781 {
 782    Storage res;
 783
 784    if (i->dType != TYPE_F32)
 785       return;
 786    switch (i->op) {
 787    case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
 788    case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
 789    case OP_SAT: res.data.f32 = CLAMP(imm.reg.data.f32, 0.0f, 1.0f); break;
 790    case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
 791    case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
 792    case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
 793    case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
 794    case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
 795    case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
 796    case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
 797    case OP_PRESIN:
 798    case OP_PREEX2:
 799       // these should be handled in subsequent OP_SIN/COS/EX2
 800       res.data.f32 = imm.reg.data.f32;
 801       break;
 802    default:
 803       return;
 804    }
 805    i->op = OP_MOV;
 806    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
 807    i->src(0).mod = Modifier(0);
 808 }
 809
 810 void
 811 ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
 812                                         const int s, ImmediateValue& imm2)
 813 {
 814    const int t = s ? 0 : 1;
 815    Instruction *insn;
 816    Instruction *mul1 = NULL; // mul1 before mul2
 817    int e = 0;
 818    float f = imm2.reg.data.f32 * exp2f(mul2->postFactor);
 819    ImmediateValue imm1;
 820
 821    assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);
 822
 823    if (mul2->getSrc(t)->refCount() == 1) {
 824       insn = mul2->getSrc(t)->getInsn();
 825       if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
 826          mul1 = insn;
 827       if (mul1 && !mul1->saturate) {
 828          int s1;
 829
 830          if (mul1->src(s1 = 0).getImmediate(imm1) ||
 831              mul1->src(s1 = 1).getImmediate(imm1)) {
 832             bld.setPosition(mul1, false);
 833             // a = mul r, imm1
 834             // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
 835             mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
 836             mul1->src(s1).mod = Modifier(0);
 837             mul2->def(0).replace(mul1->getDef(0), false);
 838             mul1->saturate = mul2->saturate;
 839          } else
 840          if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
 841             // c = mul a, b
 842             // d = mul c, imm   -> d = mul_x_imm a, b
 843             mul1->postFactor = e;
 844             mul2->def(0).replace(mul1->getDef(0), false);
 845             if (f < 0)
 846                mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
 847             mul1->saturate = mul2->saturate;
 848          }
 849          return;
 850       }
 851    }
 852    if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
 853       // b = mul a, imm
 854       // d = mul b, c   -> d = mul_x_imm a, c
 855       int s2, t2;
 856       insn = (*mul2->getDef(0)->uses.begin())->getInsn();
 857       if (!insn)
 858          return;
 859       mul1 = mul2;
 860       mul2 = NULL;
 861       s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
 862       t2 = s2 ? 0 : 1;
 863       if (insn->op == OP_MUL && insn->dType == TYPE_F32)
 864          if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
 865             mul2 = insn;
 866       if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
 867          mul2->postFactor = e;
 868          mul2->setSrc(s2, mul1->src(t));
 869          if (f < 0)
 870             mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
 871       }
 872    }
 873 }
 874
 875 void
 876 ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
 877 {
 878    const int t = !s;
 879    const operation op = i->op;
 880    Instruction *newi = i;
 881
 882    switch (i->op) {
 883    case OP_MUL:
 884       if (i->dType == TYPE_F32)
 885          tryCollapseChainedMULs(i, s, imm0);
 886
 887       if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
 888          assert(!isFloatType(i->sType));
 889          if (imm0.isInteger(1) && i->dType == TYPE_S32) {
 890             bld.setPosition(i, false);
 891             // Need to set to the sign value, which is a compare.
 892             newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0),
 893                              TYPE_S32, i->getSrc(t), bld.mkImm(0));
 894             delete_Instruction(prog, i);
 895          } else if (imm0.isInteger(0) || imm0.isInteger(1)) {
 896             // The high bits can't be set in this case (either mul by 0 or
 897             // unsigned by 1)
 898             i->op = OP_MOV;
 899             i->subOp = 0;
 900             i->setSrc(0, new_ImmediateValue(prog, 0u));
 901             i->src(0).mod = Modifier(0);
 902             i->setSrc(1, NULL);
 903          } else if (!imm0.isNegative() && imm0.isPow2()) {
 904             // Translate into a shift
 905             imm0.applyLog2();
 906             i->op = OP_SHR;
 907             i->subOp = 0;
 908             imm0.reg.data.u32 = 32 - imm0.reg.data.u32;
 909             i->setSrc(0, i->getSrc(t));
 910             i->src(0).mod = i->src(t).mod;
 911             i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
 912             i->src(1).mod = 0;
 913          }
 914       } else
 915       if (imm0.isInteger(0)) {
 916          i->op = OP_MOV;
 917          i->setSrc(0, new_ImmediateValue(prog, 0u));
 918          i->src(0).mod = Modifier(0);
 919          i->postFactor = 0;
 920          i->setSrc(1, NULL);
 921       } else
 922       if (!i->postFactor && (imm0.isInteger(1) || imm0.isInteger(-1))) {
 923          if (imm0.isNegative())
 924             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
 925          i->op = i->src(t).mod.getOp();
 926          if (s == 0) {
 927             i->setSrc(0, i->getSrc(1));
 928             i->src(0).mod = i->src(1).mod;
 929             i->src(1).mod = 0;
 930          }
 931          if (i->op != OP_CVT)
 932             i->src(0).mod = 0;
 933          i->setSrc(1, NULL);
 934       } else
 935       if (!i->postFactor && (imm0.isInteger(2) || imm0.isInteger(-2))) {
 936          if (imm0.isNegative())
 937             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
 938          i->op = OP_ADD;
 939          i->setSrc(s, i->getSrc(t));
 940          i->src(s).mod = i->src(t).mod;
 941       } else
 942       if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
 943          i->op = OP_SHL;
 944          imm0.applyLog2();
 945          i->setSrc(0, i->getSrc(t));
 946          i->src(0).mod = i->src(t).mod;
 947          i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
 948          i->src(1).mod = 0;
 949       } else
 950       if (i->postFactor && i->sType == TYPE_F32) {
 951          /* Can't emit a postfactor with an immediate, have to fold it in */
 952          i->setSrc(s, new_ImmediateValue(
 953                       prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
 954          i->postFactor = 0;
 955       }
 956       break;
 957    case OP_MAD:
 958       if (imm0.isInteger(0)) {
 959          i->setSrc(0, i->getSrc(2));
 960          i->src(0).mod = i->src(2).mod;
 961          i->setSrc(1, NULL);
 962          i->setSrc(2, NULL);
 963          i->op = i->src(0).mod.getOp();
 964          if (i->op != OP_CVT)
 965             i->src(0).mod = 0;
 966       } else
 967       if (i->subOp != NV50_IR_SUBOP_MUL_HIGH &&
 968           (imm0.isInteger(1) || imm0.isInteger(-1))) {
 969          if (imm0.isNegative())
 970             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
 971          if (s == 0) {
 972             i->setSrc(0, i->getSrc(1));
 973             i->src(0).mod = i->src(1).mod;
 974          }
 975          i->setSrc(1, i->getSrc(2));
 976          i->src(1).mod = i->src(2).mod;
 977          i->setSrc(2, NULL);
 978          i->op = OP_ADD;
 979       }
 980       break;
 981    case OP_ADD:
 982       if (i->usesFlags())
 983          break;
 984       if (imm0.isInteger(0)) {
 985          if (s == 0) {
 986             i->setSrc(0, i->getSrc(1));
 987             i->src(0).mod = i->src(1).mod;
 988          }
 989          i->setSrc(1, NULL);
 990          i->op = i->src(0).mod.getOp();
 991          if (i->op != OP_CVT)
 992             i->src(0).mod = Modifier(0);
 993       }
 994       break;
 995
 996    case OP_DIV:
 997       if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
 998          break;
 999       bld.setPosition(i, false);
1000       if (imm0.reg.data.u32 == 0) {
1001          break;
1002       } else
1003       if (imm0.reg.data.u32 == 1) {
1004          i->op = OP_MOV;
1005          i->setSrc(1, NULL);
1006       } else
1007       if (i->dType == TYPE_U32 && imm0.isPow2()) {
1008          i->op = OP_SHR;
1009          i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
1010       } else
1011       if (i->dType == TYPE_U32) {
1012          Instruction *mul;
1013          Value *tA, *tB;
1014          const uint32_t d = imm0.reg.data.u32;
1015          uint32_t m;
1016          int r, s;
1017          uint32_t l = util_logbase2(d);
1018          if (((uint32_t)1 << l) < d)
1019             ++l;
1020          m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
1021          r = l ? 1 : 0;
1022          s = l ? (l - 1) : 0;
1023
1024          tA = bld.getSSA();
1025          tB = bld.getSSA();
1026          mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
1027                          bld.loadImm(NULL, m));
1028          mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
1029          bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
1030          tA = bld.getSSA();
1031          if (r)
1032             bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
1033          else
1034             tA = tB;
1035          tB = s ? bld.getSSA() : i->getDef(0);
1036          newi = bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
1037          if (s)
1038             bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));
1039
1040          delete_Instruction(prog, i);
1041       } else
1042       if (imm0.reg.data.s32 == -1) {
1043          i->op = OP_NEG;
1044          i->setSrc(1, NULL);
1045       } else {
1046          LValue *tA, *tB;
1047          LValue *tD;
1048          const int32_t d = imm0.reg.data.s32;
1049          int32_t m;
1050          int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
1051          if ((1 << l) < abs(d))
1052             ++l;
1053          if (!l)
1054             l = 1;
1055          m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);
1056
1057          tA = bld.getSSA();
1058          tB = bld.getSSA();
1059          bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
1060                    i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
1061          if (l > 1)
1062             bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
1063          else
1064             tB = tA;
1065          tA = bld.getSSA();
1066          bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, TYPE_S32, i->getSrc(0), bld.mkImm(0));
1067          tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
1068          newi = bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
1069          if (d < 0)
1070             bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB);
1071
1072          delete_Instruction(prog, i);
1073       }
1074       break;
1075
1076    case OP_MOD:
1077       if (i->sType == TYPE_U32 && imm0.isPow2()) {
1078          bld.setPosition(i, false);
1079          i->op = OP_AND;
1080          i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
1081       }
1082       break;
1083
1084    case OP_SET: // TODO: SET_AND,OR,XOR
1085    {
1086       /* This optimizes the case where the output of a set is being compared
1087        * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
1088        * can be a lot cleverer in our comparison.
1089        */
1090       CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
1091       CondCode cc, ccZ;
1092       if (imm0.reg.data.u32 != 0 || !si)
1093          return;
1094       cc = si->setCond;
1095       ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
1096       // We do everything assuming var (cmp) 0, reverse the condition if 0 is
1097       // first.
1098       if (s == 0)
1099          ccZ = reverseCondCode(ccZ);
1100       // If there is a negative modifier, we need to undo that, by flipping
1101       // the comparison to zero.
1102       if (i->src(t).mod.neg())
1103          ccZ = reverseCondCode(ccZ);
1104       // If this is a signed comparison, we expect the input to be a regular
1105       // boolean, i.e. 0/-1. However the rest of the logic assumes that true
1106       // is positive, so just flip the sign.
1107       if (i->sType == TYPE_S32) {
1108          assert(!isFloatType(si->dType));
1109          ccZ = reverseCondCode(ccZ);
1110       }
1111       switch (ccZ) {
1112       case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
1113       case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
1114       case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
1115       case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
1116       case CC_GT: break; // bool > 0 -- bool
1117       case CC_NE: break; // bool != 0 -- bool
1118       default:
1119          return;
1120       }
1121
1122       // Update the condition of this SET to be identical to the origin set,
1123       // but with the updated condition code. The original SET should get
1124       // DCE'd, ideally.
1125       i->op = si->op;
1126       i->asCmp()->setCond = cc;
1127       i->setSrc(0, si->src(0));
1128       i->setSrc(1, si->src(1));
1129       if (si->srcExists(2))
1130          i->setSrc(2, si->src(2));
1131       i->sType = si->sType;
1132    }
1133       break;
1134
1135    case OP_AND:
1136    {
1137       Instruction *src = i->getSrc(t)->getInsn();
1138       ImmediateValue imm1;
1139       if (imm0.reg.data.u32 == 0) {
1140          i->op = OP_MOV;
1141          i->setSrc(0, new_ImmediateValue(prog, 0u));
1142          i->src(0).mod = Modifier(0);
1143          i->setSrc(1, NULL);
1144       } else if (imm0.reg.data.u32 == ~0U) {
1145          i->op = i->src(t).mod.getOp();
1146          if (t) {
1147             i->setSrc(0, i->getSrc(t));
1148             i->src(0).mod = i->src(t).mod;
1149          }
1150          i->setSrc(1, NULL);
1151       } else if (src->asCmp()) {
1152          CmpInstruction *cmp = src->asCmp();
1153          if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
1154             return;
1155          if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
1156             return;
1157          if (imm0.reg.data.f32 != 1.0)
1158             return;
1159          if (cmp->dType != TYPE_U32)
1160             return;
1161
1162          cmp->dType = TYPE_F32;
1163          if (i->src(t).mod != Modifier(0)) {
1164             assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
1165             i->src(t).mod = Modifier(0);
1166             cmp->setCond = inverseCondCode(cmp->setCond);
1167          }
1168          i->op = OP_MOV;
1169          i->setSrc(s, NULL);
1170          if (t) {
1171             i->setSrc(0, i->getSrc(t));
1172             i->setSrc(t, NULL);
1173          }
1174       } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
1175                  src->op == OP_SHR &&
1176                  src->src(1).getImmediate(imm1) &&
1177                  i->src(t).mod == Modifier(0) &&
1178                  util_is_power_of_two(imm0.reg.data.u32 + 1)) {
1179          // low byte = offset, high byte = width
1180          uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
1181          i->op = OP_EXTBF;
1182          i->setSrc(0, src->getSrc(0));
1183          i->setSrc(1, new_ImmediateValue(prog, ext));
1184       }
1185    }
1186       break;
1187
1188    case OP_SHL:
1189    {
1190       if (s != 1 || i->src(0).mod != Modifier(0))
1191          break;
1192       // try to concatenate shifts
1193       Instruction *si = i->getSrc(0)->getInsn();
1194       if (!si)
1195          break;
1196       ImmediateValue imm1;
1197       switch (si->op) {
1198       case OP_SHL:
1199          if (si->src(1).getImmediate(imm1)) {
1200             bld.setPosition(i, false);
1201             i->setSrc(0, si->getSrc(0));
1202             i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
1203          }
1204          break;
1205       case OP_SHR:
1206          if (si->src(1).getImmediate(imm1) && imm0.reg.data.u32 == imm1.reg.data.u32) {
1207             bld.setPosition(i, false);
1208             i->op = OP_AND;
1209             i->setSrc(0, si->getSrc(0));
1210             i->setSrc(1, bld.loadImm(NULL, ~((1 << imm0.reg.data.u32) - 1)));
1211          }
1212          break;
1213       case OP_MUL:
1214          int muls;
1215          if (isFloatType(si->dType))
1216             return;
1217          if (si->src(1).getImmediate(imm1))
1218             muls = 1;
1219          else if (si->src(0).getImmediate(imm1))
1220             muls = 0;
1221          else
1222             return;
1223
1224          bld.setPosition(i, false);
1225          i->op = OP_MUL;
1226          i->setSrc(0, si->getSrc(!muls));
1227          i->setSrc(1, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
1228          break;
1229       case OP_SUB:
1230       case OP_ADD:
1231          int adds;
1232          if (isFloatType(si->dType))
1233             return;
1234          if (si->op != OP_SUB && si->src(0).getImmediate(imm1))
1235             adds = 0;
1236          else if (si->src(1).getImmediate(imm1))
1237             adds = 1;
1238          else
1239             return;
1240          if (si->src(!adds).mod != Modifier(0))
1241             return;
1242          // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))
1243
1244          // This is more operations, but if one of x, y is an immediate, then
1245          // we can get a situation where (a) we can use ISCADD, or (b)
1246          // propagate the add bit into an indirect load.
1247          bld.setPosition(i, false);
1248          i->op = si->op;
1249          i->setSrc(adds, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
1250          i->setSrc(!adds, bld.mkOp2v(OP_SHL, i->dType,
1251                                      bld.getSSA(i->def(0).getSize(), i->def(0).getFile()),
1252                                      si->getSrc(!adds),
1253                                      bld.mkImm(imm0.reg.data.u32)));
1254          break;
1255       default:
1256          return;
1257       }
1258    }
1259       break;
1260
1261    case OP_ABS:
1262    case OP_NEG:
1263    case OP_SAT:
1264    case OP_LG2:
1265    case OP_RCP:
1266    case OP_SQRT:
1267    case OP_RSQ:
1268    case OP_PRESIN:
1269    case OP_SIN:
1270    case OP_COS:
1271    case OP_PREEX2:
1272    case OP_EX2:
1273       unary(i, imm0);
1274       break;
1275    case OP_BFIND: {
1276       int32_t res;
1277       switch (i->dType) {
1278       case TYPE_S32: res = util_last_bit_signed(imm0.reg.data.s32) - 1; break;
1279       case TYPE_U32: res = util_last_bit(imm0.reg.data.u32) - 1; break;
1280       default:
1281          return;
1282       }
1283       if (i->subOp == NV50_IR_SUBOP_BFIND_SAMT && res >= 0)
1284          res = 31 - res;
1285       bld.setPosition(i, false); /* make sure bld is init'ed */
1286       i->setSrc(0, bld.mkImm(res));
1287       i->setSrc(1, NULL);
1288       i->op = OP_MOV;
1289       i->subOp = 0;
1290       break;
1291    }
1292    case OP_POPCNT: {
1293       // Only deal with 1-arg POPCNT here
1294       if (i->srcExists(1))
1295          break;
1296       uint32_t res = util_bitcount(imm0.reg.data.u32);
1297       i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
1298       i->setSrc(1, NULL);
1299       i->op = OP_MOV;
1300       break;
1301    }
1302    case OP_CVT: {
1303       Storage res;
1304
1305       // TODO: handle 64-bit values properly
1306       if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
1307          return;
1308
1309       // TODO: handle single byte/word extractions
1310       if (i->subOp)
1311          return;
1312
1313       bld.setPosition(i, true); /* make sure bld is init'ed */
1314
1315 #define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
1316    case type: \
1317       switch (i->sType) { \
1318       case TYPE_F64: \
1319          res.data.dst = util_iround(i->saturate ? \
1320                                     CLAMP(imm0.reg.data.f64, fmin, fmax) : \
1321                                     imm0.reg.data.f64); \
1322          break; \
1323       case TYPE_F32: \
1324          res.data.dst = util_iround(i->saturate ? \
1325                                     CLAMP(imm0.reg.data.f32, fmin, fmax) : \
1326                                     imm0.reg.data.f32); \
1327          break; \
1328       case TYPE_S32: \
1329          res.data.dst = i->saturate ? \
1330                         CLAMP(imm0.reg.data.s32, imin, imax) : \
1331                         imm0.reg.data.s32; \
1332          break; \
1333       case TYPE_U32: \
1334          res.data.dst = i->saturate ? \
1335                         CLAMP(imm0.reg.data.u32, umin, umax) : \
1336                         imm0.reg.data.u32; \
1337          break; \
1338       case TYPE_S16: \
1339          res.data.dst = i->saturate ? \
1340                         CLAMP(imm0.reg.data.s16, imin, imax) : \
1341                         imm0.reg.data.s16; \
1342          break; \
1343       case TYPE_U16: \
1344          res.data.dst = i->saturate ? \
1345                         CLAMP(imm0.reg.data.u16, umin, umax) : \
1346                         imm0.reg.data.u16; \
1347          break; \
1348       default: return; \
1349       } \
1350       i->setSrc(0, bld.mkImm(res.data.dst)); \
1351       break
1352
1353       switch(i->dType) {
1354       CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
1355       CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
1356       CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
1357       CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
1358       case TYPE_F32:
1359          switch (i->sType) {
1360          case TYPE_F64:
1361             res.data.f32 = i->saturate ?
1362                CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
1363                imm0.reg.data.f64;
1364             break;
1365          case TYPE_F32:
1366             res.data.f32 = i->saturate ?
1367                CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
1368                imm0.reg.data.f32;
1369             break;
1370          case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
1371          case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
1372          case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
1373          case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
1374          default:
1375             return;
1376          }
1377          i->setSrc(0, bld.mkImm(res.data.f32));
1378          break;
1379       case TYPE_F64:
1380          switch (i->sType) {
1381          case TYPE_F64:
1382             res.data.f64 = i->saturate ?
1383                CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
1384                imm0.reg.data.f64;
1385             break;
1386          case TYPE_F32:
1387             res.data.f64 = i->saturate ?
1388                CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
1389                imm0.reg.data.f32;
1390             break;
1391          case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
1392          case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
1393          case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
1394          case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
1395          default:
1396             return;
1397          }
1398          i->setSrc(0, bld.mkImm(res.data.f64));
1399          break;
1400       default:
1401          return;
1402       }
1403 #undef CASE
1404
1405       i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
1406       i->op = OP_MOV;
1407       i->saturate = 0;
1408       i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
1409       break;
1410    }
1411    default:
1412       return;
1413    }
1414    if (newi->op != op)
1415       foldCount++;
1416 }
1417
1418 // =============================================================================
1419
1420 // Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
1421 class ModifierFolding : public Pass
1422 {
1423 private:
1424    virtual bool visit(BasicBlock *);
1425 };
1426
1427 bool
1428 ModifierFolding::visit(BasicBlock *bb)
1429 {
1430    const Target *target = prog->getTarget();
1431
1432    Instruction *i, *next, *mi;
1433    Modifier mod;
1434
1435    for (i = bb->getEntry(); i; i = next) {
1436       next = i->next;
1437
1438       if (0 && i->op == OP_SUB) {
1439          // turn "sub" into "add neg" (do we really want this ?)
1440          i->op = OP_ADD;
1441          i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
1442       }
1443
1444       for (int s = 0; s < 3 && i->srcExists(s); ++s) {
1445          mi = i->getSrc(s)->getInsn();
1446          if (!mi ||
1447              mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
1448             continue;
1449          if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
1450             if ((i->op != OP_ADD &&
1451                  i->op != OP_MUL) ||
1452                 (mi->op != OP_ABS &&
1453                  mi->op != OP_NEG))
1454                continue;
1455          } else
1456          if (i->sType != mi->dType) {
1457             continue;
1458          }
1459          if ((mod = Modifier(mi->op)) == Modifier(0))
1460             continue;
1461          mod *= mi->src(0).mod;
1462
1463          if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
1464             // abs neg [abs] = abs
1465             mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
1466          } else
1467          if ((i->op == OP_NEG) && mod.neg()) {
1468             assert(s == 0);
1469             // neg as both opcode and modifier on same insn is prohibited
1470             // neg neg abs = abs, neg neg = identity
1471             mod = mod & Modifier(~NV50_IR_MOD_NEG);
1472             i->op = mod.getOp();
1473             mod = mod & Modifier(~NV50_IR_MOD_ABS);
1474             if (mod == Modifier(0))
1475                i->op = OP_MOV;
1476          }
1477
1478          if (target->isModSupported(i, s, mod)) {
1479             i->setSrc(s, mi->getSrc(0));
1480             i->src(s).mod *= mod;
1481          }
1482       }
1483
1484       if (i->op == OP_SAT) {
1485          mi = i->getSrc(0)->getInsn();
1486          if (mi &&
1487              mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
1488             mi->saturate = 1;
1489             mi->setDef(0, i->getDef(0));
1490             delete_Instruction(prog, i);
1491          }
1492       }
1493    }
1494
1495    return true;
1496 }
1497
1498 // =============================================================================
1499
1500 // MUL + ADD -> MAD/FMA
1501 // MIN/MAX(a, a) -> a, etc.
1502 // SLCT(a, b, const) -> cc(const) ? a : b
1503 // RCP(RCP(a)) -> a
1504 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
1505 class AlgebraicOpt : public Pass
1506 {
1507 private:
1508    virtual bool visit(BasicBlock *);
1509
1510    void handleABS(Instruction *);
1511    bool handleADD(Instruction *);
1512    bool tryADDToMADOrSAD(Instruction *, operation toOp);
1513    void handleMINMAX(Instruction *);
1514    void handleRCP(Instruction *);
1515    void handleSLCT(Instruction *);
1516    void handleLOGOP(Instruction *);
1517    void handleCVT_NEG(Instruction *);
1518    void handleCVT_CVT(Instruction *);
1519    void handleCVT_EXTBF(Instruction *);
1520    void handleSUCLAMP(Instruction *);
1521
1522    BuildUtil bld;
1523 };
1524
1525 void
1526 AlgebraicOpt::handleABS(Instruction *abs)
1527 {
1528    Instruction *sub = abs->getSrc(0)->getInsn();
1529    DataType ty;
1530    if (!sub ||
1531        !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
1532       return;
1533    // expect not to have mods yet, if we do, bail
1534    if (sub->src(0).mod || sub->src(1).mod)
1535       return;
1536    // hidden conversion ?
1537    ty = intTypeToSigned(sub->dType);
1538    if (abs->dType != abs->sType || ty != abs->sType)
1539       return;
1540
1541    if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
1542        sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
1543        sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
1544          return;
1545
1546    Value *src0 = sub->getSrc(0);
1547    Value *src1 = sub->getSrc(1);
1548
1549    if (sub->op == OP_ADD) {
1550       Instruction *neg = sub->getSrc(1)->getInsn();
1551       if (neg && neg->op != OP_NEG) {
1552          neg = sub->getSrc(0)->getInsn();
1553          src0 = sub->getSrc(1);
1554       }
1555       if (!neg || neg->op != OP_NEG ||
1556           neg->dType != neg->sType || neg->sType != ty)
1557          return;
1558       src1 = neg->getSrc(0);
1559    }
1560
1561    // found ABS(SUB))
1562    abs->moveSources(1, 2); // move sources >=1 up by 2
1563    abs->op = OP_SAD;
1564    abs->setType(sub->dType);
1565    abs->setSrc(0, src0);
1566    abs->setSrc(1, src1);
1567    bld.setPosition(abs, false);
1568    abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
1569 }
1570
1571 bool
1572 AlgebraicOpt::handleADD(Instruction *add)
1573 {
1574    Value *src0 = add->getSrc(0);
1575    Value *src1 = add->getSrc(1);
1576
1577    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
1578       return false;
1579
1580    bool changed = false;
1581    if (!changed && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
1582       changed = tryADDToMADOrSAD(add, OP_MAD);
1583    if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
1584       changed = tryADDToMADOrSAD(add, OP_SAD);
1585    return changed;
1586 }
1587
1588 // ADD(SAD(a,b,0), c) -> SAD(a,b,c)
1589 // ADD(MUL(a,b), c) -> MAD(a,b,c)
1590 bool
1591 AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
1592 {
1593    Value *src0 = add->getSrc(0);
1594    Value *src1 = add->getSrc(1);
1595    Value *src;
1596    int s;
1597    const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
1598    const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
1599    Modifier mod[4];
1600
1601    if (src0->refCount() == 1 &&
1602        src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
1603       s = 0;
1604    else
1605    if (src1->refCount() == 1 &&
1606        src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
1607       s = 1;
1608    else
1609       return false;
1610
1611    src = add->getSrc(s);
1612
1613    if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb)
1614       return false;
1615
1616    if (src->getInsn()->postFactor)
1617       return false;
1618    if (toOp == OP_SAD) {
1619       ImmediateValue imm;
1620       if (!src->getInsn()->src(2).getImmediate(imm))
1621          return false;
1622       if (!imm.isInteger(0))
1623          return false;
1624    }
1625
1626    if (typeSizeof(add->dType) != typeSizeof(src->getInsn()->dType) ||
1627        isFloatType(add->dType) != isFloatType(src->getInsn()->dType))
1628       return false;
1629
1630    mod[0] = add->src(0).mod;
1631    mod[1] = add->src(1).mod;
1632    mod[2] = src->getUniqueInsn()->src(0).mod;
1633    mod[3] = src->getUniqueInsn()->src(1).mod;
1634
1635    if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
1636       return false;
1637
1638    add->op = toOp;
1639    add->subOp = src->getInsn()->subOp; // potentially mul-high
1640    add->dType = src->getInsn()->dType; // sign matters for imad hi
1641    add->sType = src->getInsn()->sType;
1642
1643    add->setSrc(2, add->src(s ? 0 : 1));
1644
1645    add->setSrc(0, src->getInsn()->getSrc(0));
1646    add->src(0).mod = mod[2] ^ mod[s];
1647    add->setSrc(1, src->getInsn()->getSrc(1));
1648    add->src(1).mod = mod[3];
1649
1650    return true;
1651 }
1652
1653 void
1654 AlgebraicOpt::handleMINMAX(Instruction *minmax)
1655 {
1656    Value *src0 = minmax->getSrc(0);
1657    Value *src1 = minmax->getSrc(1);
1658
1659    if (src0 != src1 || src0->reg.file != FILE_GPR)
1660       return;
1661    if (minmax->src(0).mod == minmax->src(1).mod) {
1662       if (minmax->def(0).mayReplace(minmax->src(0))) {
1663          minmax->def(0).replace(minmax->src(0), false);
1664          minmax->bb->remove(minmax);
1665       } else {
1666          minmax->op = OP_CVT;
1667          minmax->setSrc(1, NULL);
1668       }
1669    } else {
1670       // TODO:
1671       // min(x, -x) = -abs(x)
1672       // min(x, -abs(x)) = -abs(x)
1673       // min(x, abs(x)) = x
1674       // max(x, -abs(x)) = x
1675       // max(x, abs(x)) = abs(x)
1676       // max(x, -x) = abs(x)
1677    }
1678 }
1679
1680 void
1681 AlgebraicOpt::handleRCP(Instruction *rcp)
1682 {
1683    Instruction *si = rcp->getSrc(0)->getUniqueInsn();
1684
1685    if (si && si->op == OP_RCP) {
1686       Modifier mod = rcp->src(0).mod * si->src(0).mod;
1687       rcp->op = mod.getOp();
1688       rcp->setSrc(0, si->getSrc(0));
1689    }
1690 }
1691
1692 void
1693 AlgebraicOpt::handleSLCT(Instruction *slct)
1694 {
1695    if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
1696       if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
1697          slct->setSrc(0, slct->getSrc(1));
1698    } else
1699    if (slct->getSrc(0) != slct->getSrc(1)) {
1700       return;
1701    }
1702    slct->op = OP_MOV;
1703    slct->setSrc(1, NULL);
1704    slct->setSrc(2, NULL);
1705 }
1706
1707 void
1708 AlgebraicOpt::handleLOGOP(Instruction *logop)
1709 {
1710    Value *src0 = logop->getSrc(0);
1711    Value *src1 = logop->getSrc(1);
1712
1713    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
1714       return;
1715
1716    if (src0 == src1) {
1717       if ((logop->op == OP_AND || logop->op == OP_OR) &&
1718           logop->def(0).mayReplace(logop->src(0))) {
1719          logop->def(0).replace(logop->src(0), false);
1720          delete_Instruction(prog, logop);
1721       }
1722    } else {
1723       // try AND(SET, SET) -> SET_AND(SET)
1724       Instruction *set0 = src0->getInsn();
1725       Instruction *set1 = src1->getInsn();
1726
1727       if (!set0 || set0->fixed || !set1 || set1->fixed)
1728          return;
1729       if (set1->op != OP_SET) {
1730          Instruction *xchg = set0;
1731          set0 = set1;
1732          set1 = xchg;
1733          if (set1->op != OP_SET)
1734             return;
1735       }
1736       operation redOp = (logop->op == OP_AND ? OP_SET_AND :
1737                          logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
1738       if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
1739          return;
1740       if (set0->op != OP_SET &&
1741           set0->op != OP_SET_AND &&
1742           set0->op != OP_SET_OR &&
1743           set0->op != OP_SET_XOR)
1744          return;
1745       if (set0->getDef(0)->refCount() > 1 &&
1746           set1->getDef(0)->refCount() > 1)
1747          return;
1748       if (set0->getPredicate() || set1->getPredicate())
1749          return;
1750       // check that they don't source each other
1751       for (int s = 0; s < 2; ++s)
1752          if (set0->getSrc(s) == set1->getDef(0) ||
1753              set1->getSrc(s) == set0->getDef(0))
1754             return;
1755
1756       set0 = cloneForward(func, set0);
1757       set1 = cloneShallow(func, set1);
1758       logop->bb->insertAfter(logop, set1);
1759       logop->bb->insertAfter(logop, set0);
1760
1761       set0->dType = TYPE_U8;
1762       set0->getDef(0)->reg.file = FILE_PREDICATE;
1763       set0->getDef(0)->reg.size = 1;
1764       set1->setSrc(2, set0->getDef(0));
1765       set1->op = redOp;
1766       set1->setDef(0, logop->getDef(0));
1767       delete_Instruction(prog, logop);
1768    }
1769 }
1770
1771 // F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
1772 // nv50:
1773 //  F2I(NEG(I2F(ABS(SET))))
1774 void
1775 AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
1776 {
1777    Instruction *insn = cvt->getSrc(0)->getInsn();
1778    if (cvt->sType != TYPE_F32 ||
1779        cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
1780       return;
1781    if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
1782       return;
1783    if (insn->src(0).mod != Modifier(0))
1784       return;
1785    insn = insn->getSrc(0)->getInsn();
1786
1787    // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
1788    if (insn && insn->op == OP_CVT &&
1789        insn->dType == TYPE_F32 &&
1790        insn->sType == TYPE_S32) {
1791       insn = insn->getSrc(0)->getInsn();
1792       if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
1793           insn->src(0).mod)
1794          return;
1795       insn = insn->getSrc(0)->getInsn();
1796       if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
1797          return;
1798    } else
1799    if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
1800       return;
1801    }
1802
1803    Instruction *bset = cloneShallow(func, insn);
1804    bset->dType = TYPE_U32;
1805    bset->setDef(0, cvt->getDef(0));
1806    cvt->bb->insertAfter(cvt, bset);
1807    delete_Instruction(prog, cvt);
1808 }
1809
1810 // F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
1811 // does a type conversion, this becomes trickier as there might be range
1812 // changes/etc. We could handle those in theory as long as the range was being
1813 // reduced or kept the same.
1814 void
1815 AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
1816 {
1817    Instruction *insn = cvt->getSrc(0)->getInsn();
1818    RoundMode rnd = insn->rnd;
1819
1820    if (insn->saturate ||
1821        insn->subOp ||
1822        insn->dType != insn->sType ||
1823        insn->dType != cvt->sType)
1824       return;
1825
1826    switch (insn->op) {
1827    case OP_CEIL:
1828       rnd = ROUND_PI;
1829       break;
1830    case OP_FLOOR:
1831       rnd = ROUND_MI;
1832       break;
1833    case OP_TRUNC:
1834       rnd = ROUND_ZI;
1835       break;
1836    case OP_CVT:
1837       break;
1838    default:
1839       return;
1840    }
1841
1842    if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
1843       rnd = (RoundMode)(rnd & 3);
1844
1845    cvt->rnd = rnd;
1846    cvt->setSrc(0, insn->getSrc(0));
1847    cvt->src(0).mod *= insn->src(0).mod;
1848    cvt->sType = insn->sType;
1849 }
1850
1851 // Some shaders extract packed bytes out of words and convert them to
1852 // e.g. float. The Fermi+ CVT instruction can extract those directly, as can
1853 // nv50 for word sizes.
1854 //
1855 // CVT(EXTBF(x, byte/word))
1856 // CVT(AND(bytemask, x))
1857 // CVT(AND(bytemask, SHR(x, 8/16/24)))
1858 // CVT(SHR(x, 16/24))
1859 void
1860 AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
1861 {
1862    Instruction *insn = cvt->getSrc(0)->getInsn();
1863    ImmediateValue imm;
1864    Value *arg = NULL;
1865    unsigned width, offset;
1866    if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
1867       return;
1868    if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
1869       width = (imm.reg.data.u32 >> 8) & 0xff;
1870       offset = imm.reg.data.u32 & 0xff;
1871       arg = insn->getSrc(0);
1872
1873       if (width != 8 && width != 16)
1874          return;
1875       if (width == 8 && offset & 0x7)
1876          return;
1877       if (width == 16 && offset & 0xf)
1878          return;
1879    } else if (insn->op == OP_AND) {
1880       int s;
1881       if (insn->src(0).getImmediate(imm))
1882          s = 0;
1883       else if (insn->src(1).getImmediate(imm))
1884          s = 1;
1885       else
1886          return;
1887
1888       if (imm.reg.data.u32 == 0xff)
1889          width = 8;
1890       else if (imm.reg.data.u32 == 0xffff)
1891          width = 16;
1892       else
1893          return;
1894
1895       arg = insn->getSrc(!s);
1896       Instruction *shift = arg->getInsn();
1897       offset = 0;
1898       if (shift && shift->op == OP_SHR &&
1899           shift->sType == cvt->sType &&
1900           shift->src(1).getImmediate(imm) &&
1901           ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
1902            (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
1903          arg = shift->getSrc(0);
1904          offset = imm.reg.data.u32;
1905       }
1906       // We just AND'd the high bits away, which means this is effectively an
1907       // unsigned value.
1908       cvt->sType = TYPE_U32;
1909    } else if (insn->op == OP_SHR &&
1910               insn->sType == cvt->sType &&
1911               insn->src(1).getImmediate(imm)) {
1912       arg = insn->getSrc(0);
1913       if (imm.reg.data.u32 == 24) {
1914          width = 8;
1915          offset = 24;
1916       } else if (imm.reg.data.u32 == 16) {
1917          width = 16;
1918          offset = 16;
1919       } else {
1920          return;
1921       }
1922    }
1923
1924    if (!arg)
1925       return;
1926
1927    // Irrespective of what came earlier, we can undo a shift on the argument
1928    // by adjusting the offset.
1929    Instruction *shift = arg->getInsn();
1930    if (shift && shift->op == OP_SHL &&
1931        shift->src(1).getImmediate(imm) &&
1932        ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
1933         (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
1934        imm.reg.data.u32 <= offset) {
1935       arg = shift->getSrc(0);
1936       offset -= imm.reg.data.u32;
1937    }
1938
1939    // The unpackSnorm lowering still leaves a few shifts behind, but it's too
1940    // annoying to detect them.
1941
1942    if (width == 8) {
1943       cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
1944    } else {
1945       assert(width == 16);
1946       cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
1947    }
1948    cvt->setSrc(0, arg);
1949    cvt->subOp = offset >> 3;
1950 }
1951
1952 // SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
1953 void
1954 AlgebraicOpt::handleSUCLAMP(Instruction *insn)
1955 {
1956    ImmediateValue imm;
1957    int32_t val = insn->getSrc(2)->asImm()->reg.data.s32;
1958    int s;
1959    Instruction *add;
1960
1961    assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR);
1962
1963    // look for ADD (TODO: only count references by non-SUCLAMP)
1964    if (insn->getSrc(0)->refCount() > 1)
1965       return;
1966    add = insn->getSrc(0)->getInsn();
1967    if (!add || add->op != OP_ADD ||
1968        (add->dType != TYPE_U32 &&
1969         add->dType != TYPE_S32))
1970       return;
1971
1972    // look for immediate
1973    for (s = 0; s < 2; ++s)
1974       if (add->src(s).getImmediate(imm))
1975          break;
1976    if (s >= 2)
1977       return;
1978    s = s ? 0 : 1;
1979    // determine if immediate fits
1980    val += imm.reg.data.s32;
1981    if (val > 31 || val < -32)
1982       return;
1983    // determine if other addend fits
1984    if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0))
1985       return;
1986
1987    bld.setPosition(insn, false); // make sure bld is init'ed
1988    // replace sources
1989    insn->setSrc(2, bld.mkImm(val));
1990    insn->setSrc(0, add->getSrc(s));
1991 }
1992
1993 bool
1994 AlgebraicOpt::visit(BasicBlock *bb)
1995 {
1996    Instruction *next;
1997    for (Instruction *i = bb->getEntry(); i; i = next) {
1998       next = i->next;
1999       switch (i->op) {
2000       case OP_ABS:
2001          handleABS(i);
2002          break;
2003       case OP_ADD:
2004          handleADD(i);
2005          break;
2006       case OP_RCP:
2007          handleRCP(i);
2008          break;
2009       case OP_MIN:
2010       case OP_MAX:
2011          handleMINMAX(i);
2012          break;
2013       case OP_SLCT:
2014          handleSLCT(i);
2015          break;
2016       case OP_AND:
2017       case OP_OR:
2018       case OP_XOR:
2019          handleLOGOP(i);
2020          break;
2021       case OP_CVT:
2022          handleCVT_NEG(i);
2023          handleCVT_CVT(i);
2024          if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
2025              handleCVT_EXTBF(i);
2026          break;
2027       case OP_SUCLAMP:
2028          handleSUCLAMP(i);
2029          break;
2030       default:
2031          break;
2032       }
2033    }
2034
2035    return true;
2036 }
2037
2038 // =============================================================================
2039
2040 static inline void
2041 updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
2042 {
2043    if (offset != ldst->getSrc(0)->reg.data.offset) {
2044       if (ldst->getSrc(0)->refCount() > 1)
2045          ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
2046       ldst->getSrc(0)->reg.data.offset = offset;
2047    }
2048 }
2049
2050 // Combine loads and stores, forward stores to loads where possible.
2051 class MemoryOpt : public Pass
2052 {
2053 private:
2054    class Record
2055    {
2056    public:
2057       Record *next;
2058       Instruction *insn;
2059       const Value *rel[2];
2060       const Value *base;
2061       int32_t offset;
2062       int8_t fileIndex;
2063       uint8_t size;
2064       bool locked;
2065       Record *prev;
2066
2067       bool overlaps(const Instruction *ldst) const;
2068
2069       inline void link(Record **);
2070       inline void unlink(Record **);
2071       inline void set(const Instruction *ldst);
2072    };
2073
2074 public:
2075    MemoryOpt();
2076
2077    Record *loads[DATA_FILE_COUNT];
2078    Record *stores[DATA_FILE_COUNT];
2079
2080    MemoryPool recordPool;
2081
2082 private:
2083    virtual bool visit(BasicBlock *);
2084    bool runOpt(BasicBlock *);
2085
2086    Record **getList(const Instruction *);
2087
2088    Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;
2089
2090    // merge @insn into load/store instruction from @rec
2091    bool combineLd(Record *rec, Instruction *ld);
2092    bool combineSt(Record *rec, Instruction *st);
2093
2094    bool replaceLdFromLd(Instruction *ld, Record *ldRec);
2095    bool replaceLdFromSt(Instruction *ld, Record *stRec);
2096    bool replaceStFromSt(Instruction *restrict st, Record *stRec);
2097
2098    void addRecord(Instruction *ldst);
2099    void purgeRecords(Instruction *const st, DataFile);
2100    void lockStores(Instruction *const ld);
2101    void reset();
2102
2103 private:
2104    Record *prevRecord;
2105 };
2106
2107 MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
2108 {
2109    for (int i = 0; i < DATA_FILE_COUNT; ++i) {
2110       loads[i] = NULL;
2111       stores[i] = NULL;
2112    }
2113    prevRecord = NULL;
2114 }
2115
2116 void
2117 MemoryOpt::reset()
2118 {
2119    for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
2120       Record *it, *next;
2121       for (it = loads[i]; it; it = next) {
2122          next = it->next;
2123          recordPool.release(it);
2124       }
2125       loads[i] = NULL;
2126       for (it = stores[i]; it; it = next) {
2127          next = it->next;
2128          recordPool.release(it);
2129       }
2130       stores[i] = NULL;
2131    }
2132 }
2133
2134 bool
2135 MemoryOpt::combineLd(Record *rec, Instruction *ld)
2136 {
2137    int32_t offRc = rec->offset;
2138    int32_t offLd = ld->getSrc(0)->reg.data.offset;
2139    int sizeRc = rec->size;
2140    int sizeLd = typeSizeof(ld->dType);
2141    int size = sizeRc + sizeLd;
2142    int d, j;
2143
2144    if (!prog->getTarget()->
2145        isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
2146       return false;
2147    // no unaligned loads
2148    if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
2149        ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
2150       return false;
2151
2152    assert(sizeRc + sizeLd <= 16 && offRc != offLd);
2153
2154    for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);
2155
2156    if (offLd < offRc) {
2157       int sz;
2158       for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
2159       // d: nr of definitions in ld
2160       // j: nr of definitions in rec->insn, move:
2161       for (d = d + j - 1; j > 0; --j, --d)
2162          rec->insn->setDef(d, rec->insn->getDef(j - 1));
2163
2164       if (rec->insn->getSrc(0)->refCount() > 1)
2165          rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
2166       rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;
2167
2168       d = 0;
2169    } else {
2170       d = j;
2171    }
2172    // move definitions of @ld to @rec->insn
2173    for (j = 0; sizeLd; ++j, ++d) {
2174       sizeLd -= ld->getDef(j)->reg.size;
2175       rec->insn->setDef(d, ld->getDef(j));
2176    }
2177
2178    rec->size = size;
2179    rec->insn->getSrc(0)->reg.size = size;
2180    rec->insn->setType(typeOfSize(size));
2181
2182    delete_Instruction(prog, ld);
2183
2184    return true;
2185 }
2186
2187 bool
2188 MemoryOpt::combineSt(Record *rec, Instruction *st)
2189 {
2190    int32_t offRc = rec->offset;
2191    int32_t offSt = st->getSrc(0)->reg.data.offset;
2192    int sizeRc = rec->size;
2193    int sizeSt = typeSizeof(st->dType);
2194    int s = sizeSt / 4;
2195    int size = sizeRc + sizeSt;
2196    int j, k;
2197    Value *src[4]; // no modifiers in ValueRef allowed for st
2198    Value *extra[3];
2199
2200    if (!prog->getTarget()->
2201        isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
2202       return false;
2203    if (size == 8 && MIN2(offRc, offSt) & 0x7)
2204       return false;
2205
2206    st->takeExtraSources(0, extra); // save predicate and indirect address
2207
2208    if (offRc < offSt) {
2209       // save values from @st
2210       for (s = 0; sizeSt; ++s) {
2211          sizeSt -= st->getSrc(s + 1)->reg.size;
2212          src[s] = st->getSrc(s + 1);
2213       }
2214       // set record's values as low sources of @st
2215       for (j = 1; sizeRc; ++j) {
2216          sizeRc -= rec->insn->getSrc(j)->reg.size;
2217          st->setSrc(j, rec->insn->getSrc(j));
2218       }
2219       // set saved values as high sources of @st
2220       for (k = j, j = 0; j < s; ++j)
2221          st->setSrc(k++, src[j]);
2222
2223       updateLdStOffset(st, offRc, func);
2224    } else {
2225       for (j = 1; sizeSt; ++j)
2226          sizeSt -= st->getSrc(j)->reg.size;
2227       for (s = 1; sizeRc; ++j, ++s) {
2228          sizeRc -= rec->insn->getSrc(s)->reg.size;
2229          st->setSrc(j, rec->insn->getSrc(s));
2230       }
2231       rec->offset = offSt;
2232    }
2233    st->putExtraSources(0, extra); // restore pointer and predicate
2234
2235    delete_Instruction(prog, rec->insn);
2236    rec->insn = st;
2237    rec->size = size;
2238    rec->insn->getSrc(0)->reg.size = size;
2239    rec->insn->setType(typeOfSize(size));
2240    return true;
2241 }
2242
2243 void
2244 MemoryOpt::Record::set(const Instruction *ldst)
2245 {
2246    const Symbol *mem = ldst->getSrc(0)->asSym();
2247    fileIndex = mem->reg.fileIndex;
2248    rel[0] = ldst->getIndirect(0, 0);
2249    rel[1] = ldst->getIndirect(0, 1);
2250    offset = mem->reg.data.offset;
2251    base = mem->getBase();
2252    size = typeSizeof(ldst->sType);
2253 }
2254
2255 void
2256 MemoryOpt::Record::link(Record **list)
2257 {
2258    next = *list;
2259    if (next)
2260       next->prev = this;
2261    prev = NULL;
2262    *list = this;
2263 }
2264
2265 void
2266 MemoryOpt::Record::unlink(Record **list)
2267 {
2268    if (next)
2269       next->prev = prev;
2270    if (prev)
2271       prev->next = next;
2272    else
2273       *list = next;
2274 }
2275
2276 MemoryOpt::Record **
2277 MemoryOpt::getList(const Instruction *insn)
2278 {
2279    if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
2280       return &loads[insn->src(0).getFile()];
2281    return &stores[insn->src(0).getFile()];
2282 }
2283
2284 void
2285 MemoryOpt::addRecord(Instruction *i)
2286 {
2287    Record **list = getList(i);
2288    Record *it = reinterpret_cast<Record *>(recordPool.allocate());
2289
2290    it->link(list);
2291    it->set(i);
2292    it->insn = i;
2293    it->locked = false;
2294 }
2295
2296 MemoryOpt::Record *
2297 MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
2298 {
2299    const Symbol *sym = insn->getSrc(0)->asSym();
2300    const int size = typeSizeof(insn->sType);
2301    Record *rec = NULL;
2302    Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];
2303
2304    for (; it; it = it->next) {
2305       if (it->locked && insn->op != OP_LOAD)
2306          continue;
2307       if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
2308           it->rel[0] != insn->getIndirect(0, 0) ||
2309           it->fileIndex != sym->reg.fileIndex ||
2310           it->rel[1] != insn->getIndirect(0, 1))
2311          continue;
2312
2313       if (it->offset < sym->reg.data.offset) {
2314          if (it->offset + it->size >= sym->reg.data.offset) {
2315             isAdj = (it->offset + it->size == sym->reg.data.offset);
2316             if (!isAdj)
2317                return it;
2318             if (!(it->offset & 0x7))
2319                rec = it;
2320          }
2321       } else {
2322          isAdj = it->offset != sym->reg.data.offset;
2323          if (size <= it->size && !isAdj)
2324             return it;
2325          else
2326          if (!(sym->reg.data.offset & 0x7))
2327             if (it->offset - size <= sym->reg.data.offset)
2328                rec = it;
2329       }
2330    }
2331    return rec;
2332 }
2333
2334 bool
2335 MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
2336 {
2337    Instruction *st = rec->insn;
2338    int32_t offSt = rec->offset;
2339    int32_t offLd = ld->getSrc(0)->reg.data.offset;
2340    int d, s;
2341
2342    for (s = 1; offSt != offLd && st->srcExists(s); ++s)
2343       offSt += st->getSrc(s)->reg.size;
2344    if (offSt != offLd)
2345       return false;
2346
2347    for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
2348       if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
2349          return false;
2350       if (st->getSrc(s)->reg.file != FILE_GPR)
2351          return false;
2352       ld->def(d).replace(st->src(s), false);
2353    }
2354    ld->bb->remove(ld);
2355    return true;
2356 }
2357
2358 bool
2359 MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
2360 {
2361    Instruction *ldR = rec->insn;
2362    int32_t offR = rec->offset;
2363    int32_t offE = ldE->getSrc(0)->reg.data.offset;
2364    int dR, dE;
2365
2366    assert(offR <= offE);
2367    for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
2368       offR += ldR->getDef(dR)->reg.size;
2369    if (offR != offE)
2370       return false;
2371
2372    for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
2373       if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
2374          return false;
2375       ldE->def(dE).replace(ldR->getDef(dR), false);
2376    }
2377
2378    delete_Instruction(prog, ldE);
2379    return true;
2380 }
2381
2382 bool
2383 MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
2384 {
2385    const Instruction *const ri = rec->insn;
2386    Value *extra[3];
2387
2388    int32_t offS = st->getSrc(0)->reg.data.offset;
2389    int32_t offR = rec->offset;
2390    int32_t endS = offS + typeSizeof(st->dType);
2391    int32_t endR = offR + typeSizeof(ri->dType);
2392
2393    rec->size = MAX2(endS, endR) - MIN2(offS, offR);
2394
2395    st->takeExtraSources(0, extra);
2396
2397    if (offR < offS) {
2398       Value *vals[10];
2399       int s, n;
2400       int k = 0;
2401       // get non-replaced sources of ri
2402       for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
2403          vals[k++] = ri->getSrc(s);
2404       n = s;
2405       // get replaced sources of st
2406       for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
2407          vals[k++] = st->getSrc(s);
2408       // skip replaced sources of ri
2409       for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
2410       // get non-replaced sources after values covered by st
2411       for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
2412          vals[k++] = ri->getSrc(s);
2413       assert((unsigned int)k <= Elements(vals));
2414       for (s = 0; s < k; ++s)
2415          st->setSrc(s + 1, vals[s]);
2416       st->setSrc(0, ri->getSrc(0));
2417    } else
2418    if (endR > endS) {
2419       int j, s;
2420       for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
2421       for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
2422       for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
2423          st->setSrc(s++, ri->getSrc(j));
2424    }
2425    st->putExtraSources(0, extra);
2426
2427    delete_Instruction(prog, rec->insn);
2428
2429    rec->insn = st;
2430    rec->offset = st->getSrc(0)->reg.data.offset;
2431
2432    st->setType(typeOfSize(rec->size));
2433
2434    return true;
2435 }
2436
2437 bool
2438 MemoryOpt::Record::overlaps(const Instruction *ldst) const
2439 {
2440    Record that;
2441    that.set(ldst);
2442
2443    if (this->fileIndex != that.fileIndex)
2444       return false;
2445
2446    if (this->rel[0] || that.rel[0])
2447       return this->base == that.base;
2448    return
2449       (this->offset < that.offset + that.size) &&
2450       (this->offset + this->size > that.offset);
2451 }
2452
2453 // We must not eliminate stores that affect the result of @ld if
2454 // we find later stores to the same location, and we may no longer
2455 // merge them with later stores.
2456 // The stored value can, however, still be used to determine the value
2457 // returned by future loads.
2458 void
2459 MemoryOpt::lockStores(Instruction *const ld)
2460 {
2461    for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
2462       if (!r->locked && r->overlaps(ld))
2463          r->locked = true;
2464 }
2465
2466 // Prior loads from the location of @st are no longer valid.
2467 // Stores to the location of @st may no longer be used to derive
2468 // the value at it nor be coalesced into later stores.
2469 void
2470 MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
2471 {
2472    if (st)
2473       f = st->src(0).getFile();
2474
2475    for (Record *r = loads[f]; r; r = r->next)
2476       if (!st || r->overlaps(st))
2477          r->unlink(&loads[f]);
2478
2479    for (Record *r = stores[f]; r; r = r->next)
2480       if (!st || r->overlaps(st))
2481          r->unlink(&stores[f]);
2482 }
2483
2484 bool
2485 MemoryOpt::visit(BasicBlock *bb)
2486 {
2487    bool ret = runOpt(bb);
2488    // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
2489    // where 96 bit memory operations are forbidden.
2490    if (ret)
2491       ret = runOpt(bb);
2492    return ret;
2493 }
2494
2495 bool
2496 MemoryOpt::runOpt(BasicBlock *bb)
2497 {
2498    Instruction *ldst, *next;
2499    Record *rec;
2500    bool isAdjacent = true;
2501
2502    for (ldst = bb->getEntry(); ldst; ldst = next) {
2503       bool keep = true;
2504       bool isLoad = true;
2505       next = ldst->next;
2506
2507       if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
2508          if (ldst->isDead()) {
2509             // might have been produced by earlier optimization
2510             delete_Instruction(prog, ldst);
2511             continue;
2512          }
2513       } else
2514       if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
2515          isLoad = false;
2516       } else {
2517          // TODO: maybe have all fixed ops act as barrier ?
2518          if (ldst->op == OP_CALL ||
2519              ldst->op == OP_BAR ||
2520              ldst->op == OP_MEMBAR) {
2521             purgeRecords(NULL, FILE_MEMORY_LOCAL);
2522             purgeRecords(NULL, FILE_MEMORY_GLOBAL);
2523             purgeRecords(NULL, FILE_MEMORY_SHARED);
2524             purgeRecords(NULL, FILE_SHADER_OUTPUT);
2525          } else
2526          if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
2527             if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
2528                purgeRecords(NULL, FILE_MEMORY_LOCAL);
2529                purgeRecords(NULL, FILE_MEMORY_GLOBAL);
2530                purgeRecords(NULL, FILE_MEMORY_SHARED);
2531             } else {
2532                purgeRecords(NULL, ldst->src(0).getFile());
2533             }
2534          } else
2535          if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
2536             purgeRecords(NULL, FILE_SHADER_OUTPUT);
2537          }
2538          continue;
2539       }
2540       if (ldst->getPredicate()) // TODO: handle predicated ld/st
2541          continue;
2542       if (ldst->perPatch) // TODO: create separate per-patch lists
2543          continue;
2544
2545       if (isLoad) {
2546          DataFile file = ldst->src(0).getFile();
2547
2548          // if ld l[]/g[] look for previous store to eliminate the reload
2549          if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
2550             // TODO: shared memory ?
2551             rec = findRecord(ldst, false, isAdjacent);
2552             if (rec && !isAdjacent)
2553                keep = !replaceLdFromSt(ldst, rec);
2554          }
2555
2556          // or look for ld from the same location and replace this one
2557          rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
2558          if (rec) {
2559             if (!isAdjacent)
2560                keep = !replaceLdFromLd(ldst, rec);
2561             else
2562                // or combine a previous load with this one
2563                keep = !combineLd(rec, ldst);
2564          }
2565          if (keep)
2566             lockStores(ldst);
2567       } else {
2568          rec = findRecord(ldst, false, isAdjacent);
2569          if (rec) {
2570             if (!isAdjacent)
2571                keep = !replaceStFromSt(ldst, rec);
2572             else
2573                keep = !combineSt(rec, ldst);
2574          }
2575          if (keep)
2576             purgeRecords(ldst, DATA_FILE_COUNT);
2577       }
2578       if (keep)
2579          addRecord(ldst);
2580    }
2581    reset();
2582
2583    return true;
2584 }
2585
2586 // =============================================================================
2587
2588 // Turn control flow into predicated instructions (after register allocation !).
2589 // TODO:
2590 // Could move this to before register allocation on NVC0 and also handle nested
2591 // constructs.
2592 class FlatteningPass : public Pass
2593 {
2594 private:
2595    virtual bool visit(Function *);
2596    virtual bool visit(BasicBlock *);
2597
2598    bool tryPredicateConditional(BasicBlock *);
2599    void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
2600    void tryPropagateBranch(BasicBlock *);
2601    inline bool isConstantCondition(Value *pred);
2602    inline bool mayPredicate(const Instruction *, const Value *pred) const;
2603    inline void removeFlow(Instruction *);
2604
2605    uint8_t gpr_unit;
2606 };
2607
2608 bool
2609 FlatteningPass::isConstantCondition(Value *pred)
2610 {
2611    Instruction *insn = pred->getUniqueInsn();
2612    assert(insn);
2613    if (insn->op != OP_SET || insn->srcExists(2))
2614       return false;
2615
2616    for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
2617       Instruction *ld = insn->getSrc(s)->getUniqueInsn();
2618       DataFile file;
2619       if (ld) {
2620          if (ld->op != OP_MOV && ld->op != OP_LOAD)
2621             return false;
2622          if (ld->src(0).isIndirect(0))
2623             return false;
2624          file = ld->src(0).getFile();
2625       } else {
2626          file = insn->src(s).getFile();
2627          // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
2628          // in register "units", which can vary between targets.
2629          if (file == FILE_GPR) {
2630             Value *v = insn->getSrc(s);
2631             int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
2632             int units = bytes >> gpr_unit;
2633             if (units > prog->maxGPR)
2634                file = FILE_IMMEDIATE;
2635          }
2636       }
2637       if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
2638          return false;
2639    }
2640    return true;
2641 }
2642
2643 void
2644 FlatteningPass::removeFlow(Instruction *insn)
2645 {
2646    FlowInstruction *term = insn ? insn->asFlow() : NULL;
2647    if (!term)
2648       return;
2649    Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();
2650
2651    if (term->op == OP_BRA) {
2652       // TODO: this might get more difficult when we get arbitrary BRAs
2653       if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
2654          return;
2655    } else
2656    if (term->op != OP_JOIN)
2657       return;
2658
2659    Value *pred = term->getPredicate();
2660
2661    delete_Instruction(prog, term);
2662
2663    if (pred && pred->refCount() == 0) {
2664       Instruction *pSet = pred->getUniqueInsn();
2665       pred->join->reg.data.id = -1; // deallocate
2666       if (pSet->isDead())
2667          delete_Instruction(prog, pSet);
2668    }
2669 }
2670
2671 void
2672 FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
2673 {
2674    for (Instruction *i = bb->getEntry(); i; i = i->next) {
2675       if (i->isNop())
2676          continue;
2677       assert(!i->getPredicate());
2678       i->setPredicate(cc, pred);
2679    }
2680    removeFlow(bb->getExit());
2681 }
2682
2683 bool
2684 FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
2685 {
2686    if (insn->isPseudo())
2687       return true;
2688    // TODO: calls where we don't know which registers are modified
2689
2690    if (!prog->getTarget()->mayPredicate(insn, pred))
2691       return false;
2692    for (int d = 0; insn->defExists(d); ++d)
2693       if (insn->getDef(d)->equals(pred))
2694          return false;
2695    return true;
2696 }
2697
2698 // If we jump to BRA/RET/EXIT, replace the jump with it.
2699 // NOTE: We do not update the CFG anymore here !
2700 //
2701 // TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
2702 //  BB:0
2703 //   @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
2704 //  BB1:
2705 //   bra BB:3
2706 //  BB2:
2707 //   ...
2708 //  BB3:
2709 //   ...
2710 void
2711 FlatteningPass::tryPropagateBranch(BasicBlock *bb)
2712 {
2713    for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
2714       BasicBlock *bf = i->asFlow()->target.bb;
2715
2716       if (bf->getInsnCount() != 1)
2717          continue;
2718
2719       FlowInstruction *bra = i->asFlow();
2720       FlowInstruction *rep = bf->getExit()->asFlow();
2721
2722       if (!rep || rep->getPredicate())
2723          continue;
2724       if (rep->op != OP_BRA &&
2725           rep->op != OP_JOIN &&
2726           rep->op != OP_EXIT)
2727          continue;
2728
2729       // TODO: If there are multiple branches to @rep, only the first would
2730       // be replaced, so only remove them after this pass is done ?
2731       // Also, need to check all incident blocks for fall-through exits and
2732       // add the branch there.
2733       bra->op = rep->op;
2734       bra->target.bb = rep->target.bb;
2735       if (bf->cfg.incidentCount() == 1)
2736          bf->remove(rep);
2737    }
2738 }
2739
2740 bool
2741 FlatteningPass::visit(Function *fn)
2742 {
2743    gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);
2744
2745    return true;
2746 }
2747
2748 bool
2749 FlatteningPass::visit(BasicBlock *bb)
2750 {
2751    if (tryPredicateConditional(bb))
2752       return true;
2753
2754    // try to attach join to previous instruction
2755    if (prog->getTarget()->hasJoin) {
2756       Instruction *insn = bb->getExit();
2757       if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
2758          insn = insn->prev;
2759          if (insn && !insn->getPredicate() &&
2760              !insn->asFlow() &&
2761              insn->op != OP_TEXBAR &&
2762              !isTextureOp(insn->op) && // probably just nve4
2763              !isSurfaceOp(insn->op) && // not confirmed
2764              insn->op != OP_LINTERP && // probably just nve4
2765              insn->op != OP_PINTERP && // probably just nve4
2766              ((insn->op != OP_LOAD && insn->op != OP_STORE) ||
2767               (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
2768              !insn->isNop()) {
2769             insn->join = 1;
2770             bb->remove(bb->getExit());
2771             return true;
2772          }
2773       }
2774    }
2775
2776    tryPropagateBranch(bb);
2777
2778    return true;
2779 }
2780
2781 bool
2782 FlatteningPass::tryPredicateConditional(BasicBlock *bb)
2783 {
2784    BasicBlock *bL = NULL, *bR = NULL;
2785    unsigned int nL = 0, nR = 0, limit = 12;
2786    Instruction *insn;
2787    unsigned int mask;
2788
2789    mask = bb->initiatesSimpleConditional();
2790    if (!mask)
2791       return false;
2792
2793    assert(bb->getExit());
2794    Value *pred = bb->getExit()->getPredicate();
2795    assert(pred);
2796
2797    if (isConstantCondition(pred))
2798       limit = 4;
2799
2800    Graph::EdgeIterator ei = bb->cfg.outgoing();
2801
2802    if (mask & 1) {
2803       bL = BasicBlock::get(ei.getNode());
2804       for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
2805          if (!mayPredicate(insn, pred))
2806             return false;
2807       if (nL > limit)
2808          return false; // too long, do a real branch
2809    }
2810    ei.next();
2811
2812    if (mask & 2) {
2813       bR = BasicBlock::get(ei.getNode());
2814       for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
2815          if (!mayPredicate(insn, pred))
2816             return false;
2817       if (nR > limit)
2818          return false; // too long, do a real branch
2819    }
2820
2821    if (bL)
2822       predicateInstructions(bL, pred, bb->getExit()->cc);
2823    if (bR)
2824       predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));
2825
2826    if (bb->joinAt) {
2827       bb->remove(bb->joinAt);
2828       bb->joinAt = NULL;
2829    }
2830    removeFlow(bb->getExit()); // delete the branch/join at the fork point
2831
2832    // remove potential join operations at the end of the conditional
2833    if (prog->getTarget()->joinAnterior) {
2834       bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
2835       if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
2836          removeFlow(bb->getEntry());
2837    }
2838
2839    return true;
2840 }
2841
2842 // =============================================================================
2843
2844 // Fold Immediate into MAD; must be done after register allocation due to
2845 // constraint SDST == SSRC2
2846 // TODO:
2847 // Does NVC0+ have other situations where this pass makes sense?
2848 class NV50PostRaConstantFolding : public Pass
2849 {
2850 private:
2851    virtual bool visit(BasicBlock *);
2852 };
2853
2854 static bool
2855 post_ra_dead(Instruction *i)
2856 {
2857    for (int d = 0; i->defExists(d); ++d)
2858       if (i->getDef(d)->refCount())
2859          return false;
2860    return true;
2861 }
2862
2863 bool
2864 NV50PostRaConstantFolding::visit(BasicBlock *bb)
2865 {
2866    Value *vtmp;
2867    Instruction *def;
2868
2869    for (Instruction *i = bb->getFirst(); i; i = i->next) {
2870       switch (i->op) {
2871       case OP_MAD:
2872          if (i->def(0).getFile() != FILE_GPR ||
2873              i->src(0).getFile() != FILE_GPR ||
2874              i->src(1).getFile() != FILE_GPR ||
2875              i->src(2).getFile() != FILE_GPR ||
2876              i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
2877             break;
2878
2879          if (i->getDef(0)->reg.data.id >= 64 ||
2880              i->getSrc(0)->reg.data.id >= 64)
2881             break;
2882
2883          if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
2884             break;
2885
2886          if (i->getPredicate())
2887             break;
2888
2889          def = i->getSrc(1)->getInsn();
2890          if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
2891             def = def->getSrc(0)->getInsn();
2892          if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
2893             vtmp = i->getSrc(1);
2894             if (isFloatType(i->sType)) {
2895                i->setSrc(1, def->getSrc(0));
2896             } else {
2897                ImmediateValue val;
2898                bool ret = def->src(0).getImmediate(val);
2899                assert(ret);
2900                if (i->getSrc(1)->reg.data.id & 1)
2901                   val.reg.data.u32 >>= 16;
2902                val.reg.data.u32 &= 0xffff;
2903                i->setSrc(1, new_ImmediateValue(bb->getProgram(), val.reg.data.u32));
2904             }
2905
2906             /* There's no post-RA dead code elimination, so do it here
2907              * XXX: if we add more code-removing post-RA passes, we might
2908              *      want to create a post-RA dead-code elim pass */
2909             if (post_ra_dead(vtmp->getInsn())) {
2910                Value *src = vtmp->getInsn()->getSrc(0);
2911                // Careful -- splits will have already been removed from the
2912                // functions. Don't double-delete.
2913                if (vtmp->getInsn()->bb)
2914                   delete_Instruction(prog, vtmp->getInsn());
2915                if (src->getInsn() && post_ra_dead(src->getInsn()))
2916                   delete_Instruction(prog, src->getInsn());
2917             }
2918
2919             break;
2920          }
2921          break;
2922       default:
2923          break;
2924       }
2925    }
2926
2927    return true;
2928 }
2929
2930 // =============================================================================
2931
2932 // Common subexpression elimination. Stupid O^2 implementation.
2933 class LocalCSE : public Pass
2934 {
2935 private:
2936    virtual bool visit(BasicBlock *);
2937
2938    inline bool tryReplace(Instruction **, Instruction *);
2939
2940    DLList ops[OP_LAST + 1];
2941 };
2942
2943 class GlobalCSE : public Pass
2944 {
2945 private:
2946    virtual bool visit(BasicBlock *);
2947 };
2948
2949 bool
2950 Instruction::isActionEqual(const Instruction *that) const
2951 {
2952    if (this->op != that->op ||
2953        this->dType != that->dType ||
2954        this->sType != that->sType)
2955       return false;
2956    if (this->cc != that->cc)
2957       return false;
2958
2959    if (this->asTex()) {
2960       if (memcmp(&this->asTex()->tex,
2961                  &that->asTex()->tex,
2962                  sizeof(this->asTex()->tex)))
2963          return false;
2964    } else
2965    if (this->asCmp()) {
2966       if (this->asCmp()->setCond != that->asCmp()->setCond)
2967          return false;
2968    } else
2969    if (this->asFlow()) {
2970       return false;
2971    } else {
2972       if (this->ipa != that->ipa ||
2973           this->lanes != that->lanes ||
2974           this->perPatch != that->perPatch)
2975          return false;
2976       if (this->postFactor != that->postFactor)
2977          return false;
2978    }
2979
2980    if (this->subOp != that->subOp ||
2981        this->saturate != that->saturate ||
2982        this->rnd != that->rnd ||
2983        this->ftz != that->ftz ||
2984        this->dnz != that->dnz ||
2985        this->cache != that->cache ||
2986        this->mask != that->mask)
2987       return false;
2988
2989    return true;
2990 }
2991
2992 bool
2993 Instruction::isResultEqual(const Instruction *that) const
2994 {
2995    unsigned int d, s;
2996
2997    // NOTE: location of discard only affects tex with liveOnly and quadops
2998    if (!this->defExists(0) && this->op != OP_DISCARD)
2999       return false;
3000
3001    if (!isActionEqual(that))
3002       return false;
3003
3004    if (this->predSrc != that->predSrc)
3005       return false;
3006
3007    for (d = 0; this->defExists(d); ++d) {
3008       if (!that->defExists(d) ||
3009           !this->getDef(d)->equals(that->getDef(d), false))
3010          return false;
3011    }
3012    if (that->defExists(d))
3013       return false;
3014
3015    for (s = 0; this->srcExists(s); ++s) {
3016       if (!that->srcExists(s))
3017          return false;
3018       if (this->src(s).mod != that->src(s).mod)
3019          return false;
3020       if (!this->getSrc(s)->equals(that->getSrc(s), true))
3021          return false;
3022    }
3023    if (that->srcExists(s))
3024       return false;
3025
3026    if (op == OP_LOAD || op == OP_VFETCH) {
3027       switch (src(0).getFile()) {
3028       case FILE_MEMORY_CONST:
3029       case FILE_SHADER_INPUT:
3030          return true;
3031       case FILE_SHADER_OUTPUT:
3032          return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
3033       default:
3034          return false;
3035       }
3036    }
3037
3038    return true;
3039 }
3040
3041 // pull through common expressions from different in-blocks
3042 bool
3043 GlobalCSE::visit(BasicBlock *bb)
3044 {
3045    Instruction *phi, *next, *ik;
3046    int s;
3047
3048    // TODO: maybe do this with OP_UNION, too
3049
3050    for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
3051       next = phi->next;
3052       if (phi->getSrc(0)->refCount() > 1)
3053          continue;
3054       ik = phi->getSrc(0)->getInsn();
3055       if (!ik)
3056          continue; // probably a function input
3057       for (s = 1; phi->srcExists(s); ++s) {
3058          if (phi->getSrc(s)->refCount() > 1)
3059             break;
3060          if (!phi->getSrc(s)->getInsn() ||
3061              !phi->getSrc(s)->getInsn()->isResultEqual(ik))
3062             break;
3063       }
3064       if (!phi->srcExists(s)) {
3065          Instruction *entry = bb->getEntry();
3066          ik->bb->remove(ik);
3067          if (!entry || entry->op != OP_JOIN)
3068             bb->insertHead(ik);
3069          else
3070             bb->insertAfter(entry, ik);
3071          ik->setDef(0, phi->getDef(0));
3072          delete_Instruction(prog, phi);
3073       }
3074    }
3075
3076    return true;
3077 }
3078
3079 bool
3080 LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
3081 {
3082    Instruction *old = *ptr;
3083
3084    // TODO: maybe relax this later (causes trouble with OP_UNION)
3085    if (i->isPredicated())
3086       return false;
3087
3088    if (!old->isResultEqual(i))
3089       return false;
3090
3091    for (int d = 0; old->defExists(d); ++d)
3092       old->def(d).replace(i->getDef(d), false);
3093    delete_Instruction(prog, old);
3094    *ptr = NULL;
3095    return true;
3096 }
3097
3098 bool
3099 LocalCSE::visit(BasicBlock *bb)
3100 {
3101    unsigned int replaced;
3102
3103    do {
3104       Instruction *ir, *next;
3105
3106       replaced = 0;
3107
3108       // will need to know the order of instructions
3109       int serial = 0;
3110       for (ir = bb->getFirst(); ir; ir = ir->next)
3111          ir->serial = serial++;
3112
3113       for (ir = bb->getEntry(); ir; ir = next) {
3114          int s;
3115          Value *src = NULL;
3116
3117          next = ir->next;
3118
3119          if (ir->fixed) {
3120             ops[ir->op].insert(ir);
3121             continue;
3122          }
3123
3124          for (s = 0; ir->srcExists(s); ++s)
3125             if (ir->getSrc(s)->asLValue())
3126                if (!src || ir->getSrc(s)->refCount() < src->refCount())
3127                   src = ir->getSrc(s);
3128
3129          if (src) {
3130             for (Value::UseIterator it = src->uses.begin();
3131                  it != src->uses.end(); ++it) {
3132                Instruction *ik = (*it)->getInsn();
3133                if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
3134                   if (tryReplace(&ir, ik))
3135                      break;
3136             }
3137          } else {
3138             DLLIST_FOR_EACH(&ops[ir->op], iter)
3139             {
3140                Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
3141                if (tryReplace(&ir, ik))
3142                   break;
3143             }
3144          }
3145
3146          if (ir)
3147             ops[ir->op].insert(ir);
3148          else
3149             ++replaced;
3150       }
3151       for (unsigned int i = 0; i <= OP_LAST; ++i)
3152          ops[i].clear();
3153
3154    } while (replaced);
3155
3156    return true;
3157 }
3158
3159 // =============================================================================
3160
3161 // Remove computations of unused values.
3162 class DeadCodeElim : public Pass
3163 {
3164 public:
3165    bool buryAll(Program *);
3166
3167 private:
3168    virtual bool visit(BasicBlock *);
3169
3170    void checkSplitLoad(Instruction *ld); // for partially dead loads
3171
3172    unsigned int deadCount;
3173 };
3174
3175 bool
3176 DeadCodeElim::buryAll(Program *prog)
3177 {
3178    do {
3179       deadCount = 0;
3180       if (!this->run(prog, false, false))
3181          return false;
3182    } while (deadCount);
3183
3184    return true;
3185 }
3186
3187 bool
3188 DeadCodeElim::visit(BasicBlock *bb)
3189 {
3190    Instruction *next;
3191
3192    for (Instruction *i = bb->getFirst(); i; i = next) {
3193       next = i->next;
3194       if (i->isDead()) {
3195          ++deadCount;
3196          delete_Instruction(prog, i);
3197       } else
3198       if (i->defExists(1) && (i->op == OP_VFETCH || i->op == OP_LOAD)) {
3199          checkSplitLoad(i);
3200       } else
3201       if (i->defExists(0) && !i->getDef(0)->refCount()) {
3202          if (i->op == OP_ATOM ||
3203              i->op == OP_SUREDP ||
3204              i->op == OP_SUREDB)
3205             i->setDef(0, NULL);
3206       }
3207    }
3208    return true;
3209 }
3210
3211 // Each load can go into up to 4 destinations, any of which might potentially
3212 // be dead (i.e. a hole). These can always be split into 2 loads, independent
3213 // of where the holes are. We find the first contiguous region, put it into
3214 // the first load, and then put the second contiguous region into the second
3215 // load. There can be at most 2 contiguous regions.
3216 //
3217 // Note that there are some restrictions, for example it's not possible to do
3218 // a 64-bit load that's not 64-bit aligned, so such a load has to be split
3219 // up. Also hardware doesn't support 96-bit loads, so those also have to be
3220 // split into a 64-bit and 32-bit load.
3221 void
3222 DeadCodeElim::checkSplitLoad(Instruction *ld1)
3223 {
3224    Instruction *ld2 = NULL; // can get at most 2 loads
3225    Value *def1[4];
3226    Value *def2[4];
3227    int32_t addr1, addr2;
3228    int32_t size1, size2;
3229    int d, n1, n2;
3230    uint32_t mask = 0xffffffff;
3231
3232    for (d = 0; ld1->defExists(d); ++d)
3233       if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
3234          mask &= ~(1 << d);
3235    if (mask == 0xffffffff)
3236       return;
3237
3238    addr1 = ld1->getSrc(0)->reg.data.offset;
3239    n1 = n2 = 0;
3240    size1 = size2 = 0;
3241
3242    // Compute address/width for first load
3243    for (d = 0; ld1->defExists(d); ++d) {
3244       if (mask & (1 << d)) {
3245          if (size1 && (addr1 & 0x7))
3246             break;
3247          def1[n1] = ld1->getDef(d);
3248          size1 += def1[n1++]->reg.size;
3249       } else
3250       if (!n1) {
3251          addr1 += ld1->getDef(d)->reg.size;
3252       } else {
3253          break;
3254       }
3255    }
3256
3257    // Scale back the size of the first load until it can be loaded. This
3258    // typically happens for TYPE_B96 loads.
3259    while (n1 &&
3260           !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
3261                                                 typeOfSize(size1))) {
3262       size1 -= def1[--n1]->reg.size;
3263       d--;
3264    }
3265
3266    // Compute address/width for second load
3267    for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
3268       if (mask & (1 << d)) {
3269          assert(!size2 || !(addr2 & 0x7));
3270          def2[n2] = ld1->getDef(d);
3271          size2 += def2[n2++]->reg.size;
3272       } else if (!n2) {
3273          assert(!n2);
3274          addr2 += ld1->getDef(d)->reg.size;
3275       } else {
3276          break;
3277       }
3278    }
3279
3280    // Make sure that we've processed all the values
3281    for (; ld1->defExists(d); ++d)
3282       assert(!(mask & (1 << d)));
3283
3284    updateLdStOffset(ld1, addr1, func);
3285    ld1->setType(typeOfSize(size1));
3286    for (d = 0; d < 4; ++d)
3287       ld1->setDef(d, (d < n1) ? def1[d] : NULL);
3288
3289    if (!n2)
3290       return;
3291
3292    ld2 = cloneShallow(func, ld1);
3293    updateLdStOffset(ld2, addr2, func);
3294    ld2->setType(typeOfSize(size2));
3295    for (d = 0; d < 4; ++d)
3296       ld2->setDef(d, (d < n2) ? def2[d] : NULL);
3297
3298    ld1->bb->insertAfter(ld1, ld2);
3299 }
3300
3301 // =============================================================================
3302
3303 #define RUN_PASS(l, n, f)                       \
3304    if (level >= (l)) {                          \
3305       if (dbgFlags & NV50_IR_DEBUG_VERBOSE)     \
3306          INFO("PEEPHOLE: %s\n", #n);            \
3307       n pass;                                   \
3308       if (!pass.f(this))                        \
3309          return false;                          \
3310    }
3311
3312 bool
3313 Program::optimizeSSA(int level)
3314 {
3315    RUN_PASS(1, DeadCodeElim, buryAll);
3316    RUN_PASS(1, CopyPropagation, run);
3317    RUN_PASS(1, MergeSplits, run);
3318    RUN_PASS(2, GlobalCSE, run);
3319    RUN_PASS(1, LocalCSE, run);
3320    RUN_PASS(2, AlgebraicOpt, run);
3321    RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
3322    RUN_PASS(1, ConstantFolding, foldAll);
3323    RUN_PASS(1, LoadPropagation, run);
3324    RUN_PASS(1, IndirectPropagation, run);
3325    RUN_PASS(2, MemoryOpt, run);
3326    RUN_PASS(2, LocalCSE, run);
3327    RUN_PASS(0, DeadCodeElim, buryAll);
3328
3329    return true;
3330 }
3331
3332 bool
3333 Program::optimizePostRA(int level)
3334 {
3335    RUN_PASS(2, FlatteningPass, run);
3336    if (getTarget()->getChipset() < 0xc0)
3337       RUN_PASS(2, NV50PostRaConstantFolding, run);
3338
3339    return true;
3340 }
3341
3342 }