src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp

   1 /*
   2  * Copyright 2011 Christoph Bumiller
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 #include "codegen/nv50_ir.h"
  24 #include "codegen/nv50_ir_target.h"
  25 #include "codegen/nv50_ir_build_util.h"
  26
  27 extern "C" {
  28 #include "util/u_math.h"
  29 }
  30
  31 namespace nv50_ir {
  32
  33 bool
  34 Instruction::isNop() const
  35 {
  36    if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
  37       return true;
  38    if (terminator || join) // XXX: should terminator imply flow ?
  39       return false;
  40    if (op == OP_ATOM)
  41       return false;
  42    if (!fixed && op == OP_NOP)
  43       return true;
  44
  45    if (defExists(0) && def(0).rep()->reg.data.id < 0) {
  46       for (int d = 1; defExists(d); ++d)
  47          if (def(d).rep()->reg.data.id >= 0)
  48             WARN("part of vector result is unused !\n");
  49       return true;
  50    }
  51
  52    if (op == OP_MOV || op == OP_UNION) {
  53       if (!getDef(0)->equals(getSrc(0)))
  54          return false;
  55       if (op == OP_UNION)
  56          if (!def(0).rep()->equals(getSrc(1)))
  57             return false;
  58       return true;
  59    }
  60
  61    return false;
  62 }
  63
  64 bool Instruction::isDead() const
  65 {
  66    if (op == OP_STORE ||
  67        op == OP_EXPORT ||
  68        op == OP_ATOM ||
  69        op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB ||
  70        op == OP_WRSV)
  71       return false;
  72
  73    for (int d = 0; defExists(d); ++d)
  74       if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
  75          return false;
  76
  77    if (terminator || asFlow())
  78       return false;
  79    if (fixed)
  80       return false;
  81
  82    return true;
  83 };
  84
  85 // =============================================================================
  86
  87 class CopyPropagation : public Pass
  88 {
  89 private:
  90    virtual bool visit(BasicBlock *);
  91 };
  92
  93 // Propagate all MOVs forward to make subsequent optimization easier, except if
  94 // the sources stem from a phi, in which case we don't want to mess up potential
  95 // swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
  96 bool
  97 CopyPropagation::visit(BasicBlock *bb)
  98 {
  99    Instruction *mov, *si, *next;
 100
 101    for (mov = bb->getEntry(); mov; mov = next) {
 102       next = mov->next;
 103       if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
 104          continue;
 105       if (mov->getPredicate())
 106          continue;
 107       if (mov->def(0).getFile() != mov->src(0).getFile())
 108          continue;
 109       si = mov->getSrc(0)->getInsn();
 110       if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
 111          // propagate
 112          mov->def(0).replace(mov->getSrc(0), false);
 113          delete_Instruction(prog, mov);
 114       }
 115    }
 116    return true;
 117 }
 118
 119 // =============================================================================
 120
 121 class MergeSplits : public Pass
 122 {
 123 private:
 124    virtual bool visit(BasicBlock *);
 125 };
 126
 127 // For SPLIT / MERGE pairs that operate on the same registers, replace the
 128 // post-merge def with the SPLIT's source.
 129 bool
 130 MergeSplits::visit(BasicBlock *bb)
 131 {
 132    Instruction *i, *next, *si;
 133
 134    for (i = bb->getEntry(); i; i = next) {
 135       next = i->next;
 136       if (i->op != OP_MERGE || typeSizeof(i->dType) != 8)
 137          continue;
 138       si = i->getSrc(0)->getInsn();
 139       if (si->op != OP_SPLIT || si != i->getSrc(1)->getInsn())
 140          continue;
 141       i->def(0).replace(si->getSrc(0), false);
 142       delete_Instruction(prog, i);
 143    }
 144
 145    return true;
 146 }
 147
 148 // =============================================================================
 149
 150 class LoadPropagation : public Pass
 151 {
 152 private:
 153    virtual bool visit(BasicBlock *);
 154
 155    void checkSwapSrc01(Instruction *);
 156
 157    bool isCSpaceLoad(Instruction *);
 158    bool isImmdLoad(Instruction *);
 159    bool isAttribOrSharedLoad(Instruction *);
 160 };
 161
 162 bool
 163 LoadPropagation::isCSpaceLoad(Instruction *ld)
 164 {
 165    return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
 166 }
 167
 168 bool
 169 LoadPropagation::isImmdLoad(Instruction *ld)
 170 {
 171    if (!ld || (ld->op != OP_MOV) ||
 172        ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
 173       return false;
 174
 175    // A 0 can be replaced with a register, so it doesn't count as an immediate.
 176    ImmediateValue val;
 177    return ld->src(0).getImmediate(val) && !val.isInteger(0);
 178 }
 179
 180 bool
 181 LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
 182 {
 183    return ld &&
 184       (ld->op == OP_VFETCH ||
 185        (ld->op == OP_LOAD &&
 186         (ld->src(0).getFile() == FILE_SHADER_INPUT ||
 187          ld->src(0).getFile() == FILE_MEMORY_SHARED)));
 188 }
 189
 190 void
 191 LoadPropagation::checkSwapSrc01(Instruction *insn)
 192 {
 193    const Target *targ = prog->getTarget();
 194    if (!targ->getOpInfo(insn).commutative)
 195       if (insn->op != OP_SET && insn->op != OP_SLCT && insn->op != OP_SUB)
 196          return;
 197    if (insn->src(1).getFile() != FILE_GPR)
 198       return;
 199    // This is the special OP_SET used for alphatesting, we can't reverse its
 200    // arguments as that will confuse the fixup code.
 201    if (insn->op == OP_SET && insn->subOp)
 202       return;
 203
 204    Instruction *i0 = insn->getSrc(0)->getInsn();
 205    Instruction *i1 = insn->getSrc(1)->getInsn();
 206
 207    // Swap sources to inline the less frequently used source. That way,
 208    // optimistically, it will eventually be able to remove the instruction.
 209    int i0refs = insn->getSrc(0)->refCount();
 210    int i1refs = insn->getSrc(1)->refCount();
 211
 212    if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
 213       if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
 214           !targ->insnCanLoad(insn, 1, i1) ||
 215           i0refs < i1refs)
 216          insn->swapSources(0, 1);
 217       else
 218          return;
 219    } else
 220    if (isAttribOrSharedLoad(i1)) {
 221       if (!isAttribOrSharedLoad(i0))
 222          insn->swapSources(0, 1);
 223       else
 224          return;
 225    } else {
 226       return;
 227    }
 228
 229    if (insn->op == OP_SET || insn->op == OP_SET_AND ||
 230        insn->op == OP_SET_OR || insn->op == OP_SET_XOR)
 231       insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
 232    else
 233    if (insn->op == OP_SLCT)
 234       insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
 235    else
 236    if (insn->op == OP_SUB) {
 237       insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
 238       insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
 239    }
 240 }
 241
 242 bool
 243 LoadPropagation::visit(BasicBlock *bb)
 244 {
 245    const Target *targ = prog->getTarget();
 246    Instruction *next;
 247
 248    for (Instruction *i = bb->getEntry(); i; i = next) {
 249       next = i->next;
 250
 251       if (i->op == OP_CALL) // calls have args as sources, they must be in regs
 252          continue;
 253
 254       if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
 255          continue;
 256
 257       if (i->srcExists(1))
 258          checkSwapSrc01(i);
 259
 260       for (int s = 0; i->srcExists(s); ++s) {
 261          Instruction *ld = i->getSrc(s)->getInsn();
 262
 263          if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
 264             continue;
 265          if (!targ->insnCanLoad(i, s, ld))
 266             continue;
 267
 268          // propagate !
 269          i->setSrc(s, ld->getSrc(0));
 270          if (ld->src(0).isIndirect(0))
 271             i->setIndirect(s, 0, ld->getIndirect(0, 0));
 272
 273          if (ld->getDef(0)->refCount() == 0)
 274             delete_Instruction(prog, ld);
 275       }
 276    }
 277    return true;
 278 }
 279
 280 // =============================================================================
 281
 282 class IndirectPropagation : public Pass
 283 {
 284 private:
 285    virtual bool visit(BasicBlock *);
 286 };
 287
 288 bool
 289 IndirectPropagation::visit(BasicBlock *bb)
 290 {
 291    const Target *targ = prog->getTarget();
 292    Instruction *next;
 293
 294    for (Instruction *i = bb->getEntry(); i; i = next) {
 295       next = i->next;
 296
 297       for (int s = 0; i->srcExists(s); ++s) {
 298          Instruction *insn;
 299          ImmediateValue imm;
 300          if (!i->src(s).isIndirect(0))
 301             continue;
 302          insn = i->getIndirect(s, 0)->getInsn();
 303          if (!insn)
 304             continue;
 305          if (insn->op == OP_ADD && !isFloatType(insn->dType)) {
 306             if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
 307                 !insn->src(1).getImmediate(imm) ||
 308                 !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
 309                continue;
 310             i->setIndirect(s, 0, insn->getSrc(0));
 311             i->setSrc(s, cloneShallow(func, i->getSrc(s)));
 312             i->src(s).get()->reg.data.offset += imm.reg.data.u32;
 313          } else if (insn->op == OP_SUB && !isFloatType(insn->dType)) {
 314             if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
 315                 !insn->src(1).getImmediate(imm) ||
 316                 !targ->insnCanLoadOffset(i, s, -imm.reg.data.s32))
 317                continue;
 318             i->setIndirect(s, 0, insn->getSrc(0));
 319             i->setSrc(s, cloneShallow(func, i->getSrc(s)));
 320             i->src(s).get()->reg.data.offset -= imm.reg.data.u32;
 321          } else if (insn->op == OP_MOV) {
 322             if (!insn->src(0).getImmediate(imm) ||
 323                 !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
 324                continue;
 325             i->setIndirect(s, 0, NULL);
 326             i->setSrc(s, cloneShallow(func, i->getSrc(s)));
 327             i->src(s).get()->reg.data.offset += imm.reg.data.u32;
 328          }
 329       }
 330    }
 331    return true;
 332 }
 333
 334 // =============================================================================
 335
 336 // Evaluate constant expressions.
 337 class ConstantFolding : public Pass
 338 {
 339 public:
 340    bool foldAll(Program *);
 341
 342 private:
 343    virtual bool visit(BasicBlock *);
 344
 345    void expr(Instruction *, ImmediateValue&, ImmediateValue&);
 346    void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
 347    void opnd(Instruction *, ImmediateValue&, int s);
 348    void opnd3(Instruction *, ImmediateValue&);
 349
 350    void unary(Instruction *, const ImmediateValue&);
 351
 352    void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);
 353
 354    CmpInstruction *findOriginForTestWithZero(Value *);
 355
 356    unsigned int foldCount;
 357
 358    BuildUtil bld;
 359 };
 360
 361 // TODO: remember generated immediates and only revisit these
 362 bool
 363 ConstantFolding::foldAll(Program *prog)
 364 {
 365    unsigned int iterCount = 0;
 366    do {
 367       foldCount = 0;
 368       if (!run(prog))
 369          return false;
 370    } while (foldCount && ++iterCount < 2);
 371    return true;
 372 }
 373
 374 bool
 375 ConstantFolding::visit(BasicBlock *bb)
 376 {
 377    Instruction *i, *next;
 378
 379    for (i = bb->getEntry(); i; i = next) {
 380       next = i->next;
 381       if (i->op == OP_MOV || i->op == OP_CALL)
 382          continue;
 383
 384       ImmediateValue src0, src1, src2;
 385
 386       if (i->srcExists(2) &&
 387           i->src(0).getImmediate(src0) &&
 388           i->src(1).getImmediate(src1) &&
 389           i->src(2).getImmediate(src2))
 390          expr(i, src0, src1, src2);
 391       else
 392       if (i->srcExists(1) &&
 393           i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
 394          expr(i, src0, src1);
 395       else
 396       if (i->srcExists(0) && i->src(0).getImmediate(src0))
 397          opnd(i, src0, 0);
 398       else
 399       if (i->srcExists(1) && i->src(1).getImmediate(src1))
 400          opnd(i, src1, 1);
 401       if (i->srcExists(2) && i->src(2).getImmediate(src2))
 402          opnd3(i, src2);
 403    }
 404    return true;
 405 }
 406
 407 CmpInstruction *
 408 ConstantFolding::findOriginForTestWithZero(Value *value)
 409 {
 410    if (!value)
 411       return NULL;
 412    Instruction *insn = value->getInsn();
 413
 414    if (insn->asCmp() && insn->op != OP_SLCT)
 415       return insn->asCmp();
 416
 417    /* Sometimes mov's will sneak in as a result of other folding. This gets
 418     * cleaned up later.
 419     */
 420    if (insn->op == OP_MOV)
 421       return findOriginForTestWithZero(insn->getSrc(0));
 422
 423    /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
 424    if (insn->op == OP_AND) {
 425       int s = 0;
 426       ImmediateValue imm;
 427       if (!insn->src(s).getImmediate(imm)) {
 428          s = 1;
 429          if (!insn->src(s).getImmediate(imm))
 430             return NULL;
 431       }
 432       if (imm.reg.data.f32 != 1.0f)
 433          return NULL;
 434       /* TODO: Come up with a way to handle the condition being inverted */
 435       if (insn->src(!s).mod != Modifier(0))
 436          return NULL;
 437       return findOriginForTestWithZero(insn->getSrc(!s));
 438    }
 439
 440    return NULL;
 441 }
 442
 443 void
 444 Modifier::applyTo(ImmediateValue& imm) const
 445 {
 446    if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. b128)
 447       return;
 448    switch (imm.reg.type) {
 449    case TYPE_F32:
 450       if (bits & NV50_IR_MOD_ABS)
 451          imm.reg.data.f32 = fabsf(imm.reg.data.f32);
 452       if (bits & NV50_IR_MOD_NEG)
 453          imm.reg.data.f32 = -imm.reg.data.f32;
 454       if (bits & NV50_IR_MOD_SAT) {
 455          if (imm.reg.data.f32 < 0.0f)
 456             imm.reg.data.f32 = 0.0f;
 457          else
 458          if (imm.reg.data.f32 > 1.0f)
 459             imm.reg.data.f32 = 1.0f;
 460       }
 461       assert(!(bits & NV50_IR_MOD_NOT));
 462       break;
 463
 464    case TYPE_S8: // NOTE: will be extended
 465    case TYPE_S16:
 466    case TYPE_S32:
 467    case TYPE_U8: // NOTE: treated as signed
 468    case TYPE_U16:
 469    case TYPE_U32:
 470       if (bits & NV50_IR_MOD_ABS)
 471          imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
 472             imm.reg.data.s32 : -imm.reg.data.s32;
 473       if (bits & NV50_IR_MOD_NEG)
 474          imm.reg.data.s32 = -imm.reg.data.s32;
 475       if (bits & NV50_IR_MOD_NOT)
 476          imm.reg.data.s32 = ~imm.reg.data.s32;
 477       break;
 478
 479    case TYPE_F64:
 480       if (bits & NV50_IR_MOD_ABS)
 481          imm.reg.data.f64 = fabs(imm.reg.data.f64);
 482       if (bits & NV50_IR_MOD_NEG)
 483          imm.reg.data.f64 = -imm.reg.data.f64;
 484       if (bits & NV50_IR_MOD_SAT) {
 485          if (imm.reg.data.f64 < 0.0)
 486             imm.reg.data.f64 = 0.0;
 487          else
 488          if (imm.reg.data.f64 > 1.0)
 489             imm.reg.data.f64 = 1.0;
 490       }
 491       assert(!(bits & NV50_IR_MOD_NOT));
 492       break;
 493
 494    default:
 495       assert(!"invalid/unhandled type");
 496       imm.reg.data.u64 = 0;
 497       break;
 498    }
 499 }
 500
 501 operation
 502 Modifier::getOp() const
 503 {
 504    switch (bits) {
 505    case NV50_IR_MOD_ABS: return OP_ABS;
 506    case NV50_IR_MOD_NEG: return OP_NEG;
 507    case NV50_IR_MOD_SAT: return OP_SAT;
 508    case NV50_IR_MOD_NOT: return OP_NOT;
 509    case 0:
 510       return OP_MOV;
 511    default:
 512       return OP_CVT;
 513    }
 514 }
 515
 516 void
 517 ConstantFolding::expr(Instruction *i,
 518                       ImmediateValue &imm0, ImmediateValue &imm1)
 519 {
 520    struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
 521    struct Storage res;
 522    DataType type = i->dType;
 523
 524    memset(&res.data, 0, sizeof(res.data));
 525
 526    switch (i->op) {
 527    case OP_MAD:
 528    case OP_FMA:
 529    case OP_MUL:
 530       if (i->dnz && i->dType == TYPE_F32) {
 531          if (!isfinite(a->data.f32))
 532             a->data.f32 = 0.0f;
 533          if (!isfinite(b->data.f32))
 534             b->data.f32 = 0.0f;
 535       }
 536       switch (i->dType) {
 537       case TYPE_F32:
 538          res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor);
 539          break;
 540       case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
 541       case TYPE_S32:
 542          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
 543             res.data.s32 = ((int64_t)a->data.s32 * b->data.s32) >> 32;
 544             break;
 545          }
 546          /* fallthrough */
 547       case TYPE_U32:
 548          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
 549             res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32) >> 32;
 550             break;
 551          }
 552          res.data.u32 = a->data.u32 * b->data.u32; break;
 553       default:
 554          return;
 555       }
 556       break;
 557    case OP_DIV:
 558       if (b->data.u32 == 0)
 559          break;
 560       switch (i->dType) {
 561       case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
 562       case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
 563       case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
 564       case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
 565       default:
 566          return;
 567       }
 568       break;
 569    case OP_ADD:
 570       switch (i->dType) {
 571       case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
 572       case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
 573       case TYPE_S32:
 574       case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
 575       default:
 576          return;
 577       }
 578       break;
 579    case OP_SUB:
 580       switch (i->dType) {
 581       case TYPE_F32: res.data.f32 = a->data.f32 - b->data.f32; break;
 582       case TYPE_F64: res.data.f64 = a->data.f64 - b->data.f64; break;
 583       case TYPE_S32:
 584       case TYPE_U32: res.data.u32 = a->data.u32 - b->data.u32; break;
 585       default:
 586          return;
 587       }
 588       break;
 589    case OP_POW:
 590       switch (i->dType) {
 591       case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
 592       case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
 593       default:
 594          return;
 595       }
 596       break;
 597    case OP_MAX:
 598       switch (i->dType) {
 599       case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
 600       case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
 601       case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
 602       case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
 603       default:
 604          return;
 605       }
 606       break;
 607    case OP_MIN:
 608       switch (i->dType) {
 609       case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
 610       case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
 611       case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
 612       case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
 613       default:
 614          return;
 615       }
 616       break;
 617    case OP_AND:
 618       res.data.u64 = a->data.u64 & b->data.u64;
 619       break;
 620    case OP_OR:
 621       res.data.u64 = a->data.u64 | b->data.u64;
 622       break;
 623    case OP_XOR:
 624       res.data.u64 = a->data.u64 ^ b->data.u64;
 625       break;
 626    case OP_SHL:
 627       res.data.u32 = a->data.u32 << b->data.u32;
 628       break;
 629    case OP_SHR:
 630       switch (i->dType) {
 631       case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
 632       case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
 633       default:
 634          return;
 635       }
 636       break;
 637    case OP_SLCT:
 638       if (a->data.u32 != b->data.u32)
 639          return;
 640       res.data.u32 = a->data.u32;
 641       break;
 642    case OP_EXTBF: {
 643       int offset = b->data.u32 & 0xff;
 644       int width = (b->data.u32 >> 8) & 0xff;
 645       int rshift = offset;
 646       int lshift = 0;
 647       if (width == 0) {
 648          res.data.u32 = 0;
 649          break;
 650       }
 651       if (width + offset < 32) {
 652          rshift = 32 - width;
 653          lshift = 32 - width - offset;
 654       }
 655       if (i->subOp == NV50_IR_SUBOP_EXTBF_REV)
 656          res.data.u32 = util_bitreverse(a->data.u32);
 657       else
 658          res.data.u32 = a->data.u32;
 659       switch (i->dType) {
 660       case TYPE_S32: res.data.s32 = (res.data.s32 << lshift) >> rshift; break;
 661       case TYPE_U32: res.data.u32 = (res.data.u32 << lshift) >> rshift; break;
 662       default:
 663          return;
 664       }
 665       break;
 666    }
 667    case OP_POPCNT:
 668       res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
 669       break;
 670    case OP_PFETCH:
 671       // The two arguments to pfetch are logically added together. Normally
 672       // the second argument will not be constant, but that can happen.
 673       res.data.u32 = a->data.u32 + b->data.u32;
 674       type = TYPE_U32;
 675       break;
 676    case OP_MERGE:
 677       switch (i->dType) {
 678       case TYPE_U64:
 679       case TYPE_S64:
 680       case TYPE_F64:
 681          res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
 682          break;
 683       default:
 684          return;
 685       }
 686       break;
 687    default:
 688       return;
 689    }
 690    ++foldCount;
 691
 692    i->src(0).mod = Modifier(0);
 693    i->src(1).mod = Modifier(0);
 694    i->postFactor = 0;
 695
 696    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
 697    i->setSrc(1, NULL);
 698
 699    i->getSrc(0)->reg.data = res.data;
 700    i->getSrc(0)->reg.type = type;
 701    i->getSrc(0)->reg.size = typeSizeof(type);
 702
 703    switch (i->op) {
 704    case OP_MAD:
 705    case OP_FMA: {
 706       ImmediateValue src0, src1 = *i->getSrc(0)->asImm();
 707
 708       // Move the immediate into position 1, where we know it might be
 709       // emittable. However it might not be anyways, as there may be other
 710       // restrictions, so move it into a separate LValue.
 711       bld.setPosition(i, false);
 712       i->op = OP_ADD;
 713       i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
 714       i->setSrc(0, i->getSrc(2));
 715       i->src(0).mod = i->src(2).mod;
 716       i->setSrc(2, NULL);
 717
 718       if (i->src(0).getImmediate(src0))
 719          expr(i, src0, src1);
 720       else
 721          opnd(i, src1, 1);
 722       break;
 723    }
 724    case OP_PFETCH:
 725       // Leave PFETCH alone... we just folded its 2 args into 1.
 726       break;
 727    default:
 728       i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */
 729       break;
 730    }
 731    i->subOp = 0;
 732 }
 733
 734 void
 735 ConstantFolding::expr(Instruction *i,
 736                       ImmediateValue &imm0,
 737                       ImmediateValue &imm1,
 738                       ImmediateValue &imm2)
 739 {
 740    struct Storage *const a = &imm0.reg, *const b = &imm1.reg, *const c = &imm2.reg;
 741    struct Storage res;
 742
 743    memset(&res.data, 0, sizeof(res.data));
 744
 745    switch (i->op) {
 746    case OP_INSBF: {
 747       int offset = b->data.u32 & 0xff;
 748       int width = (b->data.u32 >> 8) & 0xff;
 749       unsigned bitmask = ((1 << width) - 1) << offset;
 750       res.data.u32 = ((a->data.u32 << offset) & bitmask) | (c->data.u32 & ~bitmask);
 751       break;
 752    }
 753    case OP_MAD:
 754    case OP_FMA: {
 755       switch (i->dType) {
 756       case TYPE_F32:
 757          res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor) +
 758             c->data.f32;
 759          break;
 760       case TYPE_F64:
 761          res.data.f64 = a->data.f64 * b->data.f64 + c->data.f64;
 762          break;
 763       case TYPE_S32:
 764          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
 765             res.data.s32 = ((int64_t)a->data.s32 * b->data.s32 >> 32) + c->data.s32;
 766             break;
 767          }
 768          /* fallthrough */
 769       case TYPE_U32:
 770          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
 771             res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32 >> 32) + c->data.u32;
 772             break;
 773          }
 774          res.data.u32 = a->data.u32 * b->data.u32 + c->data.u32;
 775          break;
 776       default:
 777          return;
 778       }
 779       break;
 780    }
 781    case OP_SHLADD:
 782       res.data.u32 = (a->data.u32 << b->data.u32) + c->data.u32;
 783       break;
 784    default:
 785       return;
 786    }
 787
 788    ++foldCount;
 789    i->src(0).mod = Modifier(0);
 790    i->src(1).mod = Modifier(0);
 791    i->src(2).mod = Modifier(0);
 792
 793    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
 794    i->setSrc(1, NULL);
 795    i->setSrc(2, NULL);
 796
 797    i->getSrc(0)->reg.data = res.data;
 798    i->getSrc(0)->reg.type = i->dType;
 799    i->getSrc(0)->reg.size = typeSizeof(i->dType);
 800
 801    i->op = OP_MOV;
 802 }
 803
 804 void
 805 ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
 806 {
 807    Storage res;
 808
 809    if (i->dType != TYPE_F32)
 810       return;
 811    switch (i->op) {
 812    case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
 813    case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
 814    case OP_SAT: res.data.f32 = CLAMP(imm.reg.data.f32, 0.0f, 1.0f); break;
 815    case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
 816    case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
 817    case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
 818    case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
 819    case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
 820    case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
 821    case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
 822    case OP_PRESIN:
 823    case OP_PREEX2:
 824       // these should be handled in subsequent OP_SIN/COS/EX2
 825       res.data.f32 = imm.reg.data.f32;
 826       break;
 827    default:
 828       return;
 829    }
 830    i->op = OP_MOV;
 831    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
 832    i->src(0).mod = Modifier(0);
 833 }
 834
 835 void
 836 ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
 837                                         const int s, ImmediateValue& imm2)
 838 {
 839    const int t = s ? 0 : 1;
 840    Instruction *insn;
 841    Instruction *mul1 = NULL; // mul1 before mul2
 842    int e = 0;
 843    float f = imm2.reg.data.f32 * exp2f(mul2->postFactor);
 844    ImmediateValue imm1;
 845
 846    assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);
 847
 848    if (mul2->getSrc(t)->refCount() == 1) {
 849       insn = mul2->getSrc(t)->getInsn();
 850       if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
 851          mul1 = insn;
 852       if (mul1 && !mul1->saturate) {
 853          int s1;
 854
 855          if (mul1->src(s1 = 0).getImmediate(imm1) ||
 856              mul1->src(s1 = 1).getImmediate(imm1)) {
 857             bld.setPosition(mul1, false);
 858             // a = mul r, imm1
 859             // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
 860             mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
 861             mul1->src(s1).mod = Modifier(0);
 862             mul2->def(0).replace(mul1->getDef(0), false);
 863             mul1->saturate = mul2->saturate;
 864          } else
 865          if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
 866             // c = mul a, b
 867             // d = mul c, imm   -> d = mul_x_imm a, b
 868             mul1->postFactor = e;
 869             mul2->def(0).replace(mul1->getDef(0), false);
 870             if (f < 0)
 871                mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
 872             mul1->saturate = mul2->saturate;
 873          }
 874          return;
 875       }
 876    }
 877    if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
 878       // b = mul a, imm
 879       // d = mul b, c   -> d = mul_x_imm a, c
 880       int s2, t2;
 881       insn = (*mul2->getDef(0)->uses.begin())->getInsn();
 882       if (!insn)
 883          return;
 884       mul1 = mul2;
 885       mul2 = NULL;
 886       s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
 887       t2 = s2 ? 0 : 1;
 888       if (insn->op == OP_MUL && insn->dType == TYPE_F32)
 889          if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
 890             mul2 = insn;
 891       if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
 892          mul2->postFactor = e;
 893          mul2->setSrc(s2, mul1->src(t));
 894          if (f < 0)
 895             mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
 896       }
 897    }
 898 }
 899
 900 void
 901 ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
 902 {
 903    switch (i->op) {
 904    case OP_MAD:
 905    case OP_FMA:
 906       if (imm2.isInteger(0)) {
 907          i->op = OP_MUL;
 908          i->setSrc(2, NULL);
 909          foldCount++;
 910          return;
 911       }
 912       break;
 913    case OP_SHLADD:
 914       if (imm2.isInteger(0)) {
 915          i->op = OP_SHL;
 916          i->setSrc(2, NULL);
 917          foldCount++;
 918          return;
 919       }
 920       break;
 921    default:
 922       return;
 923    }
 924 }
 925
 926 void
 927 ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
 928 {
 929    const Target *target = prog->getTarget();
 930    const int t = !s;
 931    const operation op = i->op;
 932    Instruction *newi = i;
 933
 934    switch (i->op) {
 935    case OP_SPLIT: {
 936       bld.setPosition(i, false);
 937
 938       uint8_t size = i->getDef(0)->reg.size;
 939       uint32_t mask = (1ULL << size) - 1;
 940       assert(size <= 32);
 941
 942       uint64_t val = imm0.reg.data.u64;
 943       for (int8_t d = 0; i->defExists(d); ++d) {
 944          Value *def = i->getDef(d);
 945          assert(def->reg.size == size);
 946
 947          newi = bld.mkMov(def, bld.mkImm((uint32_t)(val & mask)), TYPE_U32);
 948          val >>= size;
 949       }
 950       delete_Instruction(prog, i);
 951       break;
 952    }
 953    case OP_MUL:
 954       if (i->dType == TYPE_F32)
 955          tryCollapseChainedMULs(i, s, imm0);
 956
 957       if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
 958          assert(!isFloatType(i->sType));
 959          if (imm0.isInteger(1) && i->dType == TYPE_S32) {
 960             bld.setPosition(i, false);
 961             // Need to set to the sign value, which is a compare.
 962             newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0),
 963                              TYPE_S32, i->getSrc(t), bld.mkImm(0));
 964             delete_Instruction(prog, i);
 965          } else if (imm0.isInteger(0) || imm0.isInteger(1)) {
 966             // The high bits can't be set in this case (either mul by 0 or
 967             // unsigned by 1)
 968             i->op = OP_MOV;
 969             i->subOp = 0;
 970             i->setSrc(0, new_ImmediateValue(prog, 0u));
 971             i->src(0).mod = Modifier(0);
 972             i->setSrc(1, NULL);
 973          } else if (!imm0.isNegative() && imm0.isPow2()) {
 974             // Translate into a shift
 975             imm0.applyLog2();
 976             i->op = OP_SHR;
 977             i->subOp = 0;
 978             imm0.reg.data.u32 = 32 - imm0.reg.data.u32;
 979             i->setSrc(0, i->getSrc(t));
 980             i->src(0).mod = i->src(t).mod;
 981             i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
 982             i->src(1).mod = 0;
 983          }
 984       } else
 985       if (imm0.isInteger(0)) {
 986          i->op = OP_MOV;
 987          i->setSrc(0, new_ImmediateValue(prog, 0u));
 988          i->src(0).mod = Modifier(0);
 989          i->postFactor = 0;
 990          i->setSrc(1, NULL);
 991       } else
 992       if (!i->postFactor && (imm0.isInteger(1) || imm0.isInteger(-1))) {
 993          if (imm0.isNegative())
 994             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
 995          i->op = i->src(t).mod.getOp();
 996          if (s == 0) {
 997             i->setSrc(0, i->getSrc(1));
 998             i->src(0).mod = i->src(1).mod;
 999             i->src(1).mod = 0;
1000          }
1001          if (i->op != OP_CVT)
1002             i->src(0).mod = 0;
1003          i->setSrc(1, NULL);
1004       } else
1005       if (!i->postFactor && (imm0.isInteger(2) || imm0.isInteger(-2))) {
1006          if (imm0.isNegative())
1007             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
1008          i->op = OP_ADD;
1009          i->setSrc(s, i->getSrc(t));
1010          i->src(s).mod = i->src(t).mod;
1011       } else
1012       if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
1013          i->op = OP_SHL;
1014          imm0.applyLog2();
1015          i->setSrc(0, i->getSrc(t));
1016          i->src(0).mod = i->src(t).mod;
1017          i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
1018          i->src(1).mod = 0;
1019       } else
1020       if (i->postFactor && i->sType == TYPE_F32) {
1021          /* Can't emit a postfactor with an immediate, have to fold it in */
1022          i->setSrc(s, new_ImmediateValue(
1023                       prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
1024          i->postFactor = 0;
1025       }
1026       break;
1027    case OP_MAD:
1028       if (imm0.isInteger(0)) {
1029          i->setSrc(0, i->getSrc(2));
1030          i->src(0).mod = i->src(2).mod;
1031          i->setSrc(1, NULL);
1032          i->setSrc(2, NULL);
1033          i->op = i->src(0).mod.getOp();
1034          if (i->op != OP_CVT)
1035             i->src(0).mod = 0;
1036       } else
1037       if (i->subOp != NV50_IR_SUBOP_MUL_HIGH &&
1038           (imm0.isInteger(1) || imm0.isInteger(-1))) {
1039          if (imm0.isNegative())
1040             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
1041          if (s == 0) {
1042             i->setSrc(0, i->getSrc(1));
1043             i->src(0).mod = i->src(1).mod;
1044          }
1045          i->setSrc(1, i->getSrc(2));
1046          i->src(1).mod = i->src(2).mod;
1047          i->setSrc(2, NULL);
1048          i->op = OP_ADD;
1049       } else
1050       if (s == 1 && !imm0.isNegative() && imm0.isPow2() &&
1051           target->isOpSupported(OP_SHLADD, i->dType)) {
1052          i->op = OP_SHLADD;
1053          imm0.applyLog2();
1054          i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
1055       }
1056       break;
1057    case OP_ADD:
1058    case OP_SUB:
1059       if (i->usesFlags())
1060          break;
1061       if (imm0.isInteger(0)) {
1062          if (s == 0) {
1063             i->setSrc(0, i->getSrc(1));
1064             i->src(0).mod = i->src(1).mod;
1065             if (i->op == OP_SUB)
1066                i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
1067          }
1068          i->setSrc(1, NULL);
1069          i->op = i->src(0).mod.getOp();
1070          if (i->op != OP_CVT)
1071             i->src(0).mod = Modifier(0);
1072       }
1073       break;
1074
1075    case OP_DIV:
1076       if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
1077          break;
1078       bld.setPosition(i, false);
1079       if (imm0.reg.data.u32 == 0) {
1080          break;
1081       } else
1082       if (imm0.reg.data.u32 == 1) {
1083          i->op = OP_MOV;
1084          i->setSrc(1, NULL);
1085       } else
1086       if (i->dType == TYPE_U32 && imm0.isPow2()) {
1087          i->op = OP_SHR;
1088          i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
1089       } else
1090       if (i->dType == TYPE_U32) {
1091          Instruction *mul;
1092          Value *tA, *tB;
1093          const uint32_t d = imm0.reg.data.u32;
1094          uint32_t m;
1095          int r, s;
1096          uint32_t l = util_logbase2(d);
1097          if (((uint32_t)1 << l) < d)
1098             ++l;
1099          m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
1100          r = l ? 1 : 0;
1101          s = l ? (l - 1) : 0;
1102
1103          tA = bld.getSSA();
1104          tB = bld.getSSA();
1105          mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
1106                          bld.loadImm(NULL, m));
1107          mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
1108          bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
1109          tA = bld.getSSA();
1110          if (r)
1111             bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
1112          else
1113             tA = tB;
1114          tB = s ? bld.getSSA() : i->getDef(0);
1115          newi = bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
1116          if (s)
1117             bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));
1118
1119          delete_Instruction(prog, i);
1120       } else
1121       if (imm0.reg.data.s32 == -1) {
1122          i->op = OP_NEG;
1123          i->setSrc(1, NULL);
1124       } else {
1125          LValue *tA, *tB;
1126          LValue *tD;
1127          const int32_t d = imm0.reg.data.s32;
1128          int32_t m;
1129          int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
1130          if ((1 << l) < abs(d))
1131             ++l;
1132          if (!l)
1133             l = 1;
1134          m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);
1135
1136          tA = bld.getSSA();
1137          tB = bld.getSSA();
1138          bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
1139                    i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
1140          if (l > 1)
1141             bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
1142          else
1143             tB = tA;
1144          tA = bld.getSSA();
1145          bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, TYPE_S32, i->getSrc(0), bld.mkImm(0));
1146          tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
1147          newi = bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
1148          if (d < 0)
1149             bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB);
1150
1151          delete_Instruction(prog, i);
1152       }
1153       break;
1154
1155    case OP_MOD:
1156       if (i->sType == TYPE_U32 && imm0.isPow2()) {
1157          bld.setPosition(i, false);
1158          i->op = OP_AND;
1159          i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
1160       }
1161       break;
1162
1163    case OP_SET: // TODO: SET_AND,OR,XOR
1164    {
1165       /* This optimizes the case where the output of a set is being compared
1166        * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
1167        * can be a lot cleverer in our comparison.
1168        */
1169       CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
1170       CondCode cc, ccZ;
1171       if (imm0.reg.data.u32 != 0 || !si)
1172          return;
1173       cc = si->setCond;
1174       ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
1175       // We do everything assuming var (cmp) 0, reverse the condition if 0 is
1176       // first.
1177       if (s == 0)
1178          ccZ = reverseCondCode(ccZ);
1179       // If there is a negative modifier, we need to undo that, by flipping
1180       // the comparison to zero.
1181       if (i->src(t).mod.neg())
1182          ccZ = reverseCondCode(ccZ);
1183       // If this is a signed comparison, we expect the input to be a regular
1184       // boolean, i.e. 0/-1. However the rest of the logic assumes that true
1185       // is positive, so just flip the sign.
1186       if (i->sType == TYPE_S32) {
1187          assert(!isFloatType(si->dType));
1188          ccZ = reverseCondCode(ccZ);
1189       }
1190       switch (ccZ) {
1191       case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
1192       case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
1193       case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
1194       case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
1195       case CC_GT: break; // bool > 0 -- bool
1196       case CC_NE: break; // bool != 0 -- bool
1197       default:
1198          return;
1199       }
1200
1201       // Update the condition of this SET to be identical to the origin set,
1202       // but with the updated condition code. The original SET should get
1203       // DCE'd, ideally.
1204       i->op = si->op;
1205       i->asCmp()->setCond = cc;
1206       i->setSrc(0, si->src(0));
1207       i->setSrc(1, si->src(1));
1208       if (si->srcExists(2))
1209          i->setSrc(2, si->src(2));
1210       i->sType = si->sType;
1211    }
1212       break;
1213
1214    case OP_AND:
1215    {
1216       Instruction *src = i->getSrc(t)->getInsn();
1217       ImmediateValue imm1;
1218       if (imm0.reg.data.u32 == 0) {
1219          i->op = OP_MOV;
1220          i->setSrc(0, new_ImmediateValue(prog, 0u));
1221          i->src(0).mod = Modifier(0);
1222          i->setSrc(1, NULL);
1223       } else if (imm0.reg.data.u32 == ~0U) {
1224          i->op = i->src(t).mod.getOp();
1225          if (t) {
1226             i->setSrc(0, i->getSrc(t));
1227             i->src(0).mod = i->src(t).mod;
1228          }
1229          i->setSrc(1, NULL);
1230       } else if (src->asCmp()) {
1231          CmpInstruction *cmp = src->asCmp();
1232          if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
1233             return;
1234          if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
1235             return;
1236          if (imm0.reg.data.f32 != 1.0)
1237             return;
1238          if (cmp->dType != TYPE_U32)
1239             return;
1240
1241          cmp->dType = TYPE_F32;
1242          if (i->src(t).mod != Modifier(0)) {
1243             assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
1244             i->src(t).mod = Modifier(0);
1245             cmp->setCond = inverseCondCode(cmp->setCond);
1246          }
1247          i->op = OP_MOV;
1248          i->setSrc(s, NULL);
1249          if (t) {
1250             i->setSrc(0, i->getSrc(t));
1251             i->setSrc(t, NULL);
1252          }
1253       } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
1254                  src->op == OP_SHR &&
1255                  src->src(1).getImmediate(imm1) &&
1256                  i->src(t).mod == Modifier(0) &&
1257                  util_is_power_of_two(imm0.reg.data.u32 + 1)) {
1258          // low byte = offset, high byte = width
1259          uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
1260          i->op = OP_EXTBF;
1261          i->setSrc(0, src->getSrc(0));
1262          i->setSrc(1, new_ImmediateValue(prog, ext));
1263       }
1264    }
1265       break;
1266
1267    case OP_SHL:
1268    {
1269       if (s != 1 || i->src(0).mod != Modifier(0))
1270          break;
1271       // try to concatenate shifts
1272       Instruction *si = i->getSrc(0)->getInsn();
1273       if (!si)
1274          break;
1275       ImmediateValue imm1;
1276       switch (si->op) {
1277       case OP_SHL:
1278          if (si->src(1).getImmediate(imm1)) {
1279             bld.setPosition(i, false);
1280             i->setSrc(0, si->getSrc(0));
1281             i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
1282          }
1283          break;
1284       case OP_SHR:
1285          if (si->src(1).getImmediate(imm1) && imm0.reg.data.u32 == imm1.reg.data.u32) {
1286             bld.setPosition(i, false);
1287             i->op = OP_AND;
1288             i->setSrc(0, si->getSrc(0));
1289             i->setSrc(1, bld.loadImm(NULL, ~((1 << imm0.reg.data.u32) - 1)));
1290          }
1291          break;
1292       case OP_MUL:
1293          int muls;
1294          if (isFloatType(si->dType))
1295             return;
1296          if (si->src(1).getImmediate(imm1))
1297             muls = 1;
1298          else if (si->src(0).getImmediate(imm1))
1299             muls = 0;
1300          else
1301             return;
1302
1303          bld.setPosition(i, false);
1304          i->op = OP_MUL;
1305          i->setSrc(0, si->getSrc(!muls));
1306          i->setSrc(1, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
1307          break;
1308       case OP_SUB:
1309       case OP_ADD:
1310          int adds;
1311          if (isFloatType(si->dType))
1312             return;
1313          if (si->op != OP_SUB && si->src(0).getImmediate(imm1))
1314             adds = 0;
1315          else if (si->src(1).getImmediate(imm1))
1316             adds = 1;
1317          else
1318             return;
1319          if (si->src(!adds).mod != Modifier(0))
1320             return;
1321          // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))
1322
1323          // This is more operations, but if one of x, y is an immediate, then
1324          // we can get a situation where (a) we can use ISCADD, or (b)
1325          // propagate the add bit into an indirect load.
1326          bld.setPosition(i, false);
1327          i->op = si->op;
1328          i->setSrc(adds, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
1329          i->setSrc(!adds, bld.mkOp2v(OP_SHL, i->dType,
1330                                      bld.getSSA(i->def(0).getSize(), i->def(0).getFile()),
1331                                      si->getSrc(!adds),
1332                                      bld.mkImm(imm0.reg.data.u32)));
1333          break;
1334       default:
1335          return;
1336       }
1337    }
1338       break;
1339
1340    case OP_ABS:
1341    case OP_NEG:
1342    case OP_SAT:
1343    case OP_LG2:
1344    case OP_RCP:
1345    case OP_SQRT:
1346    case OP_RSQ:
1347    case OP_PRESIN:
1348    case OP_SIN:
1349    case OP_COS:
1350    case OP_PREEX2:
1351    case OP_EX2:
1352       unary(i, imm0);
1353       break;
1354    case OP_BFIND: {
1355       int32_t res;
1356       switch (i->dType) {
1357       case TYPE_S32: res = util_last_bit_signed(imm0.reg.data.s32) - 1; break;
1358       case TYPE_U32: res = util_last_bit(imm0.reg.data.u32) - 1; break;
1359       default:
1360          return;
1361       }
1362       if (i->subOp == NV50_IR_SUBOP_BFIND_SAMT && res >= 0)
1363          res = 31 - res;
1364       bld.setPosition(i, false); /* make sure bld is init'ed */
1365       i->setSrc(0, bld.mkImm(res));
1366       i->setSrc(1, NULL);
1367       i->op = OP_MOV;
1368       i->subOp = 0;
1369       break;
1370    }
1371    case OP_POPCNT: {
1372       // Only deal with 1-arg POPCNT here
1373       if (i->srcExists(1))
1374          break;
1375       uint32_t res = util_bitcount(imm0.reg.data.u32);
1376       i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
1377       i->setSrc(1, NULL);
1378       i->op = OP_MOV;
1379       break;
1380    }
1381    case OP_CVT: {
1382       Storage res;
1383
1384       // TODO: handle 64-bit values properly
1385       if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
1386          return;
1387
1388       // TODO: handle single byte/word extractions
1389       if (i->subOp)
1390          return;
1391
1392       bld.setPosition(i, true); /* make sure bld is init'ed */
1393
1394 #define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
1395    case type: \
1396       switch (i->sType) { \
1397       case TYPE_F64: \
1398          res.data.dst = util_iround(i->saturate ? \
1399                                     CLAMP(imm0.reg.data.f64, fmin, fmax) : \
1400                                     imm0.reg.data.f64); \
1401          break; \
1402       case TYPE_F32: \
1403          res.data.dst = util_iround(i->saturate ? \
1404                                     CLAMP(imm0.reg.data.f32, fmin, fmax) : \
1405                                     imm0.reg.data.f32); \
1406          break; \
1407       case TYPE_S32: \
1408          res.data.dst = i->saturate ? \
1409                         CLAMP(imm0.reg.data.s32, imin, imax) : \
1410                         imm0.reg.data.s32; \
1411          break; \
1412       case TYPE_U32: \
1413          res.data.dst = i->saturate ? \
1414                         CLAMP(imm0.reg.data.u32, umin, umax) : \
1415                         imm0.reg.data.u32; \
1416          break; \
1417       case TYPE_S16: \
1418          res.data.dst = i->saturate ? \
1419                         CLAMP(imm0.reg.data.s16, imin, imax) : \
1420                         imm0.reg.data.s16; \
1421          break; \
1422       case TYPE_U16: \
1423          res.data.dst = i->saturate ? \
1424                         CLAMP(imm0.reg.data.u16, umin, umax) : \
1425                         imm0.reg.data.u16; \
1426          break; \
1427       default: return; \
1428       } \
1429       i->setSrc(0, bld.mkImm(res.data.dst)); \
1430       break
1431
1432       switch(i->dType) {
1433       CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
1434       CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
1435       CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
1436       CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
1437       case TYPE_F32:
1438          switch (i->sType) {
1439          case TYPE_F64:
1440             res.data.f32 = i->saturate ?
1441                CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
1442                imm0.reg.data.f64;
1443             break;
1444          case TYPE_F32:
1445             res.data.f32 = i->saturate ?
1446                CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
1447                imm0.reg.data.f32;
1448             break;
1449          case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
1450          case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
1451          case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
1452          case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
1453          default:
1454             return;
1455          }
1456          i->setSrc(0, bld.mkImm(res.data.f32));
1457          break;
1458       case TYPE_F64:
1459          switch (i->sType) {
1460          case TYPE_F64:
1461             res.data.f64 = i->saturate ?
1462                CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
1463                imm0.reg.data.f64;
1464             break;
1465          case TYPE_F32:
1466             res.data.f64 = i->saturate ?
1467                CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
1468                imm0.reg.data.f32;
1469             break;
1470          case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
1471          case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
1472          case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
1473          case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
1474          default:
1475             return;
1476          }
1477          i->setSrc(0, bld.mkImm(res.data.f64));
1478          break;
1479       default:
1480          return;
1481       }
1482 #undef CASE
1483
1484       i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
1485       i->op = OP_MOV;
1486       i->saturate = 0;
1487       i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
1488       break;
1489    }
1490    default:
1491       return;
1492    }
1493    if (newi->op != op)
1494       foldCount++;
1495 }
1496
1497 // =============================================================================
1498
1499 // Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
1500 class ModifierFolding : public Pass
1501 {
1502 private:
1503    virtual bool visit(BasicBlock *);
1504 };
1505
1506 bool
1507 ModifierFolding::visit(BasicBlock *bb)
1508 {
1509    const Target *target = prog->getTarget();
1510
1511    Instruction *i, *next, *mi;
1512    Modifier mod;
1513
1514    for (i = bb->getEntry(); i; i = next) {
1515       next = i->next;
1516
1517       if (0 && i->op == OP_SUB) {
1518          // turn "sub" into "add neg" (do we really want this ?)
1519          i->op = OP_ADD;
1520          i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
1521       }
1522
1523       for (int s = 0; s < 3 && i->srcExists(s); ++s) {
1524          mi = i->getSrc(s)->getInsn();
1525          if (!mi ||
1526              mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
1527             continue;
1528          if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
1529             if ((i->op != OP_ADD &&
1530                  i->op != OP_MUL) ||
1531                 (mi->op != OP_ABS &&
1532                  mi->op != OP_NEG))
1533                continue;
1534          } else
1535          if (i->sType != mi->dType) {
1536             continue;
1537          }
1538          if ((mod = Modifier(mi->op)) == Modifier(0))
1539             continue;
1540          mod *= mi->src(0).mod;
1541
1542          if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
1543             // abs neg [abs] = abs
1544             mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
1545          } else
1546          if ((i->op == OP_NEG) && mod.neg()) {
1547             assert(s == 0);
1548             // neg as both opcode and modifier on same insn is prohibited
1549             // neg neg abs = abs, neg neg = identity
1550             mod = mod & Modifier(~NV50_IR_MOD_NEG);
1551             i->op = mod.getOp();
1552             mod = mod & Modifier(~NV50_IR_MOD_ABS);
1553             if (mod == Modifier(0))
1554                i->op = OP_MOV;
1555          }
1556
1557          if (target->isModSupported(i, s, mod)) {
1558             i->setSrc(s, mi->getSrc(0));
1559             i->src(s).mod *= mod;
1560          }
1561       }
1562
1563       if (i->op == OP_SAT) {
1564          mi = i->getSrc(0)->getInsn();
1565          if (mi &&
1566              mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
1567             mi->saturate = 1;
1568             mi->setDef(0, i->getDef(0));
1569             delete_Instruction(prog, i);
1570          }
1571       }
1572    }
1573
1574    return true;
1575 }
1576
1577 // =============================================================================
1578
1579 // MUL + ADD -> MAD/FMA
1580 // MIN/MAX(a, a) -> a, etc.
1581 // SLCT(a, b, const) -> cc(const) ? a : b
1582 // RCP(RCP(a)) -> a
1583 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
1584 class AlgebraicOpt : public Pass
1585 {
1586 private:
1587    virtual bool visit(BasicBlock *);
1588
1589    void handleABS(Instruction *);
1590    bool handleADD(Instruction *);
1591    bool tryADDToMADOrSAD(Instruction *, operation toOp);
1592    void handleMINMAX(Instruction *);
1593    void handleRCP(Instruction *);
1594    void handleSLCT(Instruction *);
1595    void handleLOGOP(Instruction *);
1596    void handleCVT_NEG(Instruction *);
1597    void handleCVT_CVT(Instruction *);
1598    void handleCVT_EXTBF(Instruction *);
1599    void handleSUCLAMP(Instruction *);
1600    void handleNEG(Instruction *);
1601
1602    BuildUtil bld;
1603 };
1604
1605 void
1606 AlgebraicOpt::handleABS(Instruction *abs)
1607 {
1608    Instruction *sub = abs->getSrc(0)->getInsn();
1609    DataType ty;
1610    if (!sub ||
1611        !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
1612       return;
1613    // expect not to have mods yet, if we do, bail
1614    if (sub->src(0).mod || sub->src(1).mod)
1615       return;
1616    // hidden conversion ?
1617    ty = intTypeToSigned(sub->dType);
1618    if (abs->dType != abs->sType || ty != abs->sType)
1619       return;
1620
1621    if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
1622        sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
1623        sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
1624          return;
1625
1626    Value *src0 = sub->getSrc(0);
1627    Value *src1 = sub->getSrc(1);
1628
1629    if (sub->op == OP_ADD) {
1630       Instruction *neg = sub->getSrc(1)->getInsn();
1631       if (neg && neg->op != OP_NEG) {
1632          neg = sub->getSrc(0)->getInsn();
1633          src0 = sub->getSrc(1);
1634       }
1635       if (!neg || neg->op != OP_NEG ||
1636           neg->dType != neg->sType || neg->sType != ty)
1637          return;
1638       src1 = neg->getSrc(0);
1639    }
1640
1641    // found ABS(SUB))
1642    abs->moveSources(1, 2); // move sources >=1 up by 2
1643    abs->op = OP_SAD;
1644    abs->setType(sub->dType);
1645    abs->setSrc(0, src0);
1646    abs->setSrc(1, src1);
1647    bld.setPosition(abs, false);
1648    abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
1649 }
1650
1651 bool
1652 AlgebraicOpt::handleADD(Instruction *add)
1653 {
1654    Value *src0 = add->getSrc(0);
1655    Value *src1 = add->getSrc(1);
1656
1657    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
1658       return false;
1659
1660    bool changed = false;
1661    if (!changed && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
1662       changed = tryADDToMADOrSAD(add, OP_MAD);
1663    if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
1664       changed = tryADDToMADOrSAD(add, OP_SAD);
1665    return changed;
1666 }
1667
1668 // ADD(SAD(a,b,0), c) -> SAD(a,b,c)
1669 // ADD(MUL(a,b), c) -> MAD(a,b,c)
1670 bool
1671 AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
1672 {
1673    Value *src0 = add->getSrc(0);
1674    Value *src1 = add->getSrc(1);
1675    Value *src;
1676    int s;
1677    const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
1678    const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
1679    Modifier mod[4];
1680
1681    if (src0->refCount() == 1 &&
1682        src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
1683       s = 0;
1684    else
1685    if (src1->refCount() == 1 &&
1686        src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
1687       s = 1;
1688    else
1689       return false;
1690
1691    src = add->getSrc(s);
1692
1693    if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb)
1694       return false;
1695
1696    if (src->getInsn()->saturate || src->getInsn()->postFactor ||
1697        src->getInsn()->dnz)
1698       return false;
1699
1700    if (toOp == OP_SAD) {
1701       ImmediateValue imm;
1702       if (!src->getInsn()->src(2).getImmediate(imm))
1703          return false;
1704       if (!imm.isInteger(0))
1705          return false;
1706    }
1707
1708    if (typeSizeof(add->dType) != typeSizeof(src->getInsn()->dType) ||
1709        isFloatType(add->dType) != isFloatType(src->getInsn()->dType))
1710       return false;
1711
1712    mod[0] = add->src(0).mod;
1713    mod[1] = add->src(1).mod;
1714    mod[2] = src->getUniqueInsn()->src(0).mod;
1715    mod[3] = src->getUniqueInsn()->src(1).mod;
1716
1717    if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
1718       return false;
1719
1720    add->op = toOp;
1721    add->subOp = src->getInsn()->subOp; // potentially mul-high
1722    add->dType = src->getInsn()->dType; // sign matters for imad hi
1723    add->sType = src->getInsn()->sType;
1724
1725    add->setSrc(2, add->src(s ? 0 : 1));
1726
1727    add->setSrc(0, src->getInsn()->getSrc(0));
1728    add->src(0).mod = mod[2] ^ mod[s];
1729    add->setSrc(1, src->getInsn()->getSrc(1));
1730    add->src(1).mod = mod[3];
1731
1732    return true;
1733 }
1734
1735 void
1736 AlgebraicOpt::handleMINMAX(Instruction *minmax)
1737 {
1738    Value *src0 = minmax->getSrc(0);
1739    Value *src1 = minmax->getSrc(1);
1740
1741    if (src0 != src1 || src0->reg.file != FILE_GPR)
1742       return;
1743    if (minmax->src(0).mod == minmax->src(1).mod) {
1744       if (minmax->def(0).mayReplace(minmax->src(0))) {
1745          minmax->def(0).replace(minmax->src(0), false);
1746          minmax->bb->remove(minmax);
1747       } else {
1748          minmax->op = OP_CVT;
1749          minmax->setSrc(1, NULL);
1750       }
1751    } else {
1752       // TODO:
1753       // min(x, -x) = -abs(x)
1754       // min(x, -abs(x)) = -abs(x)
1755       // min(x, abs(x)) = x
1756       // max(x, -abs(x)) = x
1757       // max(x, abs(x)) = abs(x)
1758       // max(x, -x) = abs(x)
1759    }
1760 }
1761
1762 void
1763 AlgebraicOpt::handleRCP(Instruction *rcp)
1764 {
1765    Instruction *si = rcp->getSrc(0)->getUniqueInsn();
1766
1767    if (si && si->op == OP_RCP) {
1768       Modifier mod = rcp->src(0).mod * si->src(0).mod;
1769       rcp->op = mod.getOp();
1770       rcp->setSrc(0, si->getSrc(0));
1771    }
1772 }
1773
1774 void
1775 AlgebraicOpt::handleSLCT(Instruction *slct)
1776 {
1777    if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
1778       if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
1779          slct->setSrc(0, slct->getSrc(1));
1780    } else
1781    if (slct->getSrc(0) != slct->getSrc(1)) {
1782       return;
1783    }
1784    slct->op = OP_MOV;
1785    slct->setSrc(1, NULL);
1786    slct->setSrc(2, NULL);
1787 }
1788
// Simplify a two-source logic op (called for OP_AND, OP_OR and OP_XOR) whose
// sources are both GPRs:
//  - identical sources: an AND or OR is replaced by its source, if the
//    definition may be replaced by it, and the instruction is deleted;
//  - different sources: try to merge two comparison results into a single
//    SET_AND / SET_OR / SET_XOR that takes the other comparison as a
//    predicate in its third source.
void
AlgebraicOpt::handleLOGOP(Instruction *logop)
{
   Value *src0 = logop->getSrc(0);
   Value *src1 = logop->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (src0 == src1) {
      // Only AND and OR are handled here; XOR with identical sources is left
      // untouched.
      if ((logop->op == OP_AND || logop->op == OP_OR) &&
          logop->def(0).mayReplace(logop->src(0))) {
         logop->def(0).replace(logop->src(0), false);
         delete_Instruction(prog, logop);
      }
   } else {
      // try AND(SET, SET) -> SET_AND(SET)
      Instruction *set0 = src0->getInsn();
      Instruction *set1 = src1->getInsn();

      // Both sources need a defining instruction that is not marked fixed.
      if (!set0 || set0->fixed || !set1 || set1->fixed)
         return;
      // Arrange for set1 to be the plain OP_SET; swap the pair if needed and
      // give up if neither of them is one.
      if (set1->op != OP_SET) {
         Instruction *xchg = set0;
         set0 = set1;
         set1 = xchg;
         if (set1->op != OP_SET)
            return;
      }
      // Combined opcode matching the logic op being replaced.
      operation redOp = (logop->op == OP_AND ? OP_SET_AND :
                         logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
      if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
         return;
      // set0 may be a plain SET or an already combined SET_*.
      if (set0->op != OP_SET &&
          set0->op != OP_SET_AND &&
          set0->op != OP_SET_OR &&
          set0->op != OP_SET_XOR)
         return;
      // Give up when both results are referenced more than once.
      if (set0->getDef(0)->refCount() > 1 &&
          set1->getDef(0)->refCount() > 1)
         return;
      if (set0->getPredicate() || set1->getPredicate())
         return;
      // check that they don't source each other
      for (int s = 0; s < 2; ++s)
         if (set0->getSrc(s) == set1->getDef(0) ||
             set1->getSrc(s) == set0->getDef(0))
            return;

      // Work on clones placed right after logop. set0 is inserted last at
      // the same position, so it ends up in front of set1.
      set0 = cloneForward(func, set0);
      set1 = cloneShallow(func, set1);
      logop->bb->insertAfter(logop, set1);
      logop->bb->insertAfter(logop, set0);

      // The cloned set0 now produces a 1-byte predicate ...
      set0->dType = TYPE_U8;
      set0->getDef(0)->reg.file = FILE_PREDICATE;
      set0->getDef(0)->reg.size = 1;
      // ... which the cloned set1 consumes as third source while taking over
      // the logic op's destination.
      set1->setSrc(2, set0->getDef(0));
      set1->op = redOp;
      set1->setDef(0, logop->getDef(0));
      delete_Instruction(prog, logop);
   }
}
1852
1853 // F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
1854 // nv50:
1855 //  F2I(NEG(I2F(ABS(SET))))
1856 void
1857 AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
1858 {
1859    Instruction *insn = cvt->getSrc(0)->getInsn();
1860    if (cvt->sType != TYPE_F32 ||
1861        cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
1862       return;
1863    if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
1864       return;
1865    if (insn->src(0).mod != Modifier(0))
1866       return;
1867    insn = insn->getSrc(0)->getInsn();
1868
1869    // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
1870    if (insn && insn->op == OP_CVT &&
1871        insn->dType == TYPE_F32 &&
1872        insn->sType == TYPE_S32) {
1873       insn = insn->getSrc(0)->getInsn();
1874       if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
1875           insn->src(0).mod)
1876          return;
1877       insn = insn->getSrc(0)->getInsn();
1878       if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
1879          return;
1880    } else
1881    if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
1882       return;
1883    }
1884
1885    Instruction *bset = cloneShallow(func, insn);
1886    bset->dType = TYPE_U32;
1887    bset->setDef(0, cvt->getDef(0));
1888    cvt->bb->insertAfter(cvt, bset);
1889    delete_Instruction(prog, cvt);
1890 }
1891
1892 // F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
1893 // does a type conversion, this becomes trickier as there might be range
1894 // changes/etc. We could handle those in theory as long as the range was being
1895 // reduced or kept the same.
1896 void
1897 AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
1898 {
1899    Instruction *insn = cvt->getSrc(0)->getInsn();
1900    RoundMode rnd = insn->rnd;
1901
1902    if (insn->saturate ||
1903        insn->subOp ||
1904        insn->dType != insn->sType ||
1905        insn->dType != cvt->sType)
1906       return;
1907
1908    switch (insn->op) {
1909    case OP_CEIL:
1910       rnd = ROUND_PI;
1911       break;
1912    case OP_FLOOR:
1913       rnd = ROUND_MI;
1914       break;
1915    case OP_TRUNC:
1916       rnd = ROUND_ZI;
1917       break;
1918    case OP_CVT:
1919       break;
1920    default:
1921       return;
1922    }
1923
1924    if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
1925       rnd = (RoundMode)(rnd & 3);
1926
1927    cvt->rnd = rnd;
1928    cvt->setSrc(0, insn->getSrc(0));
1929    cvt->src(0).mod *= insn->src(0).mod;
1930    cvt->sType = insn->sType;
1931 }
1932
// Some shaders extract packed bytes out of words and convert them to
// e.g. float. The Fermi+ CVT instruction can extract those directly, as can
// nv50 for word sizes.
//
// CVT(EXTBF(x, byte/word))
// CVT(AND(bytemask, x))
// CVT(AND(bytemask, SHR(x, 8/16/24)))
// CVT(SHR(x, 16/24))
//
// On a match the CVT reads the original word directly: its source type is
// narrowed to an 8- or 16-bit type and subOp receives the offset divided by 8.
void
AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   ImmediateValue imm;
   Value *arg = NULL;      // word to extract from; stays NULL when nothing matched
   unsigned width, offset; // only meaningful once arg has been set
   if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
      return;
   if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
      // The EXTBF immediate carries the width in bits 8..15 and the offset in
      // bits 0..7.
      width = (imm.reg.data.u32 >> 8) & 0xff;
      offset = imm.reg.data.u32 & 0xff;
      arg = insn->getSrc(0);

      // Only 8- and 16-wide fields whose offset is a multiple of the width
      // are accepted.
      if (width != 8 && width != 16)
         return;
      if (width == 8 && offset & 0x7)
         return;
      if (width == 16 && offset & 0xf)
         return;
   } else if (insn->op == OP_AND) {
      // Find which AND operand is the immediate mask; the other one is the
      // data.
      int s;
      if (insn->src(0).getImmediate(imm))
         s = 0;
      else if (insn->src(1).getImmediate(imm))
         s = 1;
      else
         return;

      if (imm.reg.data.u32 == 0xff)
         width = 8;
      else if (imm.reg.data.u32 == 0xffff)
         width = 16;
      else
         return;

      arg = insn->getSrc(!s);
      Instruction *shift = arg->getInsn();
      offset = 0;
      // Look through a right shift by a multiple of the width feeding the
      // AND; imm is reused for the shift amount from here on.
      if (shift && shift->op == OP_SHR &&
          shift->sType == cvt->sType &&
          shift->src(1).getImmediate(imm) &&
          ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
           (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
         arg = shift->getSrc(0);
         offset = imm.reg.data.u32;
      }
      // We just AND'd the high bits away, which means this is effectively an
      // unsigned value.
      cvt->sType = TYPE_U32;
   } else if (insn->op == OP_SHR &&
              insn->sType == cvt->sType &&
              insn->src(1).getImmediate(imm)) {
      // A bare right shift only matches when it shifts by 24 or by 16.
      arg = insn->getSrc(0);
      if (imm.reg.data.u32 == 24) {
         width = 8;
         offset = 24;
      } else if (imm.reg.data.u32 == 16) {
         width = 16;
         offset = 16;
      } else {
         return;
      }
   }

   // None of the patterns matched.
   if (!arg)
      return;

   // Irrespective of what came earlier, we can undo a shift on the argument
   // by adjusting the offset.
   Instruction *shift = arg->getInsn();
   if (shift && shift->op == OP_SHL &&
       shift->src(1).getImmediate(imm) &&
       ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
        (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
       imm.reg.data.u32 <= offset) {
      arg = shift->getSrc(0);
      offset -= imm.reg.data.u32;
   }

   // The unpackSnorm lowering still leaves a few shifts behind, but it's too
   // annoying to detect them.

   // Narrow the source type, keeping the signedness cvt->sType has at this
   // point.
   if (width == 8) {
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
   } else {
      assert(width == 16);
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
   }
   cvt->setSrc(0, arg);
   cvt->subOp = offset >> 3;
}
2033
2034 // SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
2035 void
2036 AlgebraicOpt::handleSUCLAMP(Instruction *insn)
2037 {
2038    ImmediateValue imm;
2039    int32_t val = insn->getSrc(2)->asImm()->reg.data.s32;
2040    int s;
2041    Instruction *add;
2042
2043    assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR);
2044
2045    // look for ADD (TODO: only count references by non-SUCLAMP)
2046    if (insn->getSrc(0)->refCount() > 1)
2047       return;
2048    add = insn->getSrc(0)->getInsn();
2049    if (!add || add->op != OP_ADD ||
2050        (add->dType != TYPE_U32 &&
2051         add->dType != TYPE_S32))
2052       return;
2053
2054    // look for immediate
2055    for (s = 0; s < 2; ++s)
2056       if (add->src(s).getImmediate(imm))
2057          break;
2058    if (s >= 2)
2059       return;
2060    s = s ? 0 : 1;
2061    // determine if immediate fits
2062    val += imm.reg.data.s32;
2063    if (val > 31 || val < -32)
2064       return;
2065    // determine if other addend fits
2066    if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0))
2067       return;
2068
2069    bld.setPosition(insn, false); // make sure bld is init'ed
2070    // replace sources
2071    insn->setSrc(2, bld.mkImm(val));
2072    insn->setSrc(0, add->getSrc(s));
2073 }
2074
2075 // NEG(AND(SET, 1)) -> SET
2076 void
2077 AlgebraicOpt::handleNEG(Instruction *i) {
2078    Instruction *src = i->getSrc(0)->getInsn();
2079    ImmediateValue imm;
2080    int b;
2081
2082    if (isFloatType(i->sType) || !src || src->op != OP_AND)
2083       return;
2084
2085    if (src->src(0).getImmediate(imm))
2086       b = 1;
2087    else if (src->src(1).getImmediate(imm))
2088       b = 0;
2089    else
2090       return;
2091
2092    if (!imm.isInteger(1))
2093       return;
2094
2095    Instruction *set = src->getSrc(b)->getInsn();
2096    if ((set->op == OP_SET || set->op == OP_SET_AND ||
2097        set->op == OP_SET_OR || set->op == OP_SET_XOR) &&
2098        !isFloatType(set->dType)) {
2099       i->def(0).replace(set->getDef(0), false);
2100    }
2101 }
2102
2103 bool
2104 AlgebraicOpt::visit(BasicBlock *bb)
2105 {
2106    Instruction *next;
2107    for (Instruction *i = bb->getEntry(); i; i = next) {
2108       next = i->next;
2109       switch (i->op) {
2110       case OP_ABS:
2111          handleABS(i);
2112          break;
2113       case OP_ADD:
2114          handleADD(i);
2115          break;
2116       case OP_RCP:
2117          handleRCP(i);
2118          break;
2119       case OP_MIN:
2120       case OP_MAX:
2121          handleMINMAX(i);
2122          break;
2123       case OP_SLCT:
2124          handleSLCT(i);
2125          break;
2126       case OP_AND:
2127       case OP_OR:
2128       case OP_XOR:
2129          handleLOGOP(i);
2130          break;
2131       case OP_CVT:
2132          handleCVT_NEG(i);
2133          handleCVT_CVT(i);
2134          if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
2135              handleCVT_EXTBF(i);
2136          break;
2137       case OP_SUCLAMP:
2138          handleSUCLAMP(i);
2139          break;
2140       case OP_NEG:
2141          handleNEG(i);
2142          break;
2143       default:
2144          break;
2145       }
2146    }
2147
2148    return true;
2149 }
2150
2151 // =============================================================================
2152
// ADD(SHL(a, b), c) -> SHLADD(a, b, c)
//
// Per-instruction pass; the only opcode it dispatches on is OP_ADD.
class LateAlgebraicOpt : public Pass
{
private:
   // Pass entry point, called with each instruction; always returns true.
   virtual bool visit(Instruction *);

   // Checks the operand files and target support before attempting the
   // SHLADD conversion.
   void handleADD(Instruction *);
   // Performs the rewrite; returns true if the ADD was turned into a SHLADD.
   bool tryADDToSHLADD(Instruction *);
};
2162
2163 void
2164 LateAlgebraicOpt::handleADD(Instruction *add)
2165 {
2166    Value *src0 = add->getSrc(0);
2167    Value *src1 = add->getSrc(1);
2168
2169    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
2170       return;
2171
2172    if (prog->getTarget()->isOpSupported(OP_SHLADD, add->dType))
2173       tryADDToSHLADD(add);
2174 }
2175
2176 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
2177 bool
2178 LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
2179 {
2180    Value *src0 = add->getSrc(0);
2181    Value *src1 = add->getSrc(1);
2182    ImmediateValue imm;
2183    Instruction *shl;
2184    Value *src;
2185    int s;
2186
2187    if (add->saturate || add->usesFlags() || typeSizeof(add->dType) == 8
2188        || isFloatType(add->dType))
2189       return false;
2190
2191    if (src0->getUniqueInsn() && src0->getUniqueInsn()->op == OP_SHL)
2192       s = 0;
2193    else
2194    if (src1->getUniqueInsn() && src1->getUniqueInsn()->op == OP_SHL)
2195       s = 1;
2196    else
2197       return false;
2198
2199    src = add->getSrc(s);
2200    shl = src->getUniqueInsn();
2201
2202    if (shl->bb != add->bb || shl->usesFlags() || shl->subOp || shl->src(0).mod)
2203       return false;
2204
2205    if (!shl->src(1).getImmediate(imm))
2206       return false;
2207
2208    add->op = OP_SHLADD;
2209    add->setSrc(2, add->src(!s));
2210    // SHL can't have any modifiers, but the ADD source may have had
2211    // one. Preserve it.
2212    add->setSrc(0, shl->getSrc(0));
2213    if (s == 1)
2214       add->src(0).mod = add->src(1).mod;
2215    add->setSrc(1, new_ImmediateValue(shl->bb->getProgram(), imm.reg.data.u32));
2216    add->src(1).mod = Modifier(0);
2217
2218    return true;
2219 }
2220
2221 bool
2222 LateAlgebraicOpt::visit(Instruction *i)
2223 {
2224    switch (i->op) {
2225    case OP_ADD:
2226       handleADD(i);
2227       break;
2228    default:
2229       break;
2230    }
2231
2232    return true;
2233 }
2234
2235 // =============================================================================
2236
2237 static inline void
2238 updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
2239 {
2240    if (offset != ldst->getSrc(0)->reg.data.offset) {
2241       if (ldst->getSrc(0)->refCount() > 1)
2242          ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
2243       ldst->getSrc(0)->reg.data.offset = offset;
2244    }
2245 }
2246
// Combine loads and stores, forward stores to loads where possible.
//
// The pass keeps, per data file, a linked list of the loads and a linked
// list of the stores it has recorded in the block being processed.
class MemoryOpt : public Pass
{
private:
   // One recorded load or store.
   class Record
   {
   public:
      Record *next;         // next record in the list
      Instruction *insn;    // the recorded load/store instruction
      const Value *rel[2];  // indirect addresses of source 0 (dimensions 0, 1)
      const Value *base;    // base of the address symbol
      int32_t offset;       // offset taken from the address symbol
      int8_t fileIndex;     // file index of the address symbol
      uint8_t size;         // size of the access, from typeSizeof()
      bool locked;          // set by lockStores() for stores a kept load overlaps
      Record *prev;         // previous record in the list; NULL for the head

      // True if @ldst may access the same location as this record.
      bool overlaps(const Instruction *ldst) const;

      // Insert at the head of / remove from the given list.
      inline void link(Record **);
      inline void unlink(Record **);
      // Fill in the address-related fields from @ldst.
      inline void set(const Instruction *ldst);
   };

public:
   MemoryOpt();

   // List heads, indexed by the data file of the access.
   Record *loads[DATA_FILE_COUNT];
   Record *stores[DATA_FILE_COUNT];

   // Allocator for Record objects.
   MemoryPool recordPool;

private:
   // Runs runOpt() on the block, and a second time if the first run
   // returned true.
   virtual bool visit(BasicBlock *);
   bool runOpt(BasicBlock *);

   // Returns the list head (load or store, per data file) @insn belongs to.
   Record **getList(const Instruction *);

   // Look for a recorded access that covers or neighbours the access of the
   // given instruction; @isAdjacent tells the two cases apart.
   Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;

   // merge @insn into load/store instruction from @rec
   bool combineLd(Record *rec, Instruction *ld);
   bool combineSt(Record *rec, Instruction *st);

   // Replace the results of @ld by values of an earlier load / store, or
   // merge the record's store into a later overlapping store @st.
   bool replaceLdFromLd(Instruction *ld, Record *ldRec);
   bool replaceLdFromSt(Instruction *ld, Record *stRec);
   bool replaceStFromSt(Instruction *restrict st, Record *stRec);

   void addRecord(Instruction *ldst);
   void purgeRecords(Instruction *const st, DataFile);
   void lockStores(Instruction *const ld);
   // Release all records and empty every list.
   void reset();

private:
   // NOTE(review): only initialized to NULL by the constructor in the code
   // visible here; no other use found in this part of the file.
   Record *prevRecord;
};
2303
2304 MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
2305 {
2306    for (int i = 0; i < DATA_FILE_COUNT; ++i) {
2307       loads[i] = NULL;
2308       stores[i] = NULL;
2309    }
2310    prevRecord = NULL;
2311 }
2312
2313 void
2314 MemoryOpt::reset()
2315 {
2316    for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
2317       Record *it, *next;
2318       for (it = loads[i]; it; it = next) {
2319          next = it->next;
2320          recordPool.release(it);
2321       }
2322       loads[i] = NULL;
2323       for (it = stores[i]; it; it = next) {
2324          next = it->next;
2325          recordPool.release(it);
2326       }
2327       stores[i] = NULL;
2328    }
2329 }
2330
// Merge the load @ld into the recorded load @rec->insn so that a single,
// wider load produces the definitions of both. The two accesses must be at
// different offsets (asserted below).
// Returns false, changing nothing, if the combined access is not supported,
// would be misaligned, or is an indirect access in a compute program.
// On success @ld is deleted and true is returned.
bool
MemoryOpt::combineLd(Record *rec, Instruction *ld)
{
   int32_t offRc = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeLd = typeSizeof(ld->dType);
   int size = sizeRc + sizeLd;
   int d, j;

   if (!prog->getTarget()->
       isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned loads
   if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
       ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
      return false;
   // for compute indirect loads are not guaranteed to be aligned
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

   assert(sizeRc + sizeLd <= 16 && offRc != offLd);

   // Count the definitions of the recorded load: j = number of defs whose
   // sizes add up to the record's size.
   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);

   if (offLd < offRc) {
      // @ld comes first: shift the record's definitions up to make room for
      // those of @ld at the bottom.
      int sz;
      for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
      // d: nr of definitions in ld
      // j: nr of definitions in rec->insn, move:
      for (d = d + j - 1; j > 0; --j, --d)
         rec->insn->setDef(d, rec->insn->getDef(j - 1));

      // The combined load starts at @ld's offset; use a private copy of the
      // address symbol if it is referenced more than once.
      if (rec->insn->getSrc(0)->refCount() > 1)
         rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
      rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;

      d = 0;
   } else {
      // @ld comes second: its definitions are appended after the record's.
      d = j;
   }
   // move definitions of @ld to @rec->insn
   for (j = 0; sizeLd; ++j, ++d) {
      sizeLd -= ld->getDef(j)->reg.size;
      rec->insn->setDef(d, ld->getDef(j));
   }

   // The record, its address symbol and the instruction type all take the
   // combined size.
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));

   delete_Instruction(prog, ld);

   return true;
}
2386
// Merge the recorded store @rec->insn into the store @st, so that @st writes
// the values of both, ordered by offset. The recorded instruction is deleted
// and the record is pointed at @st.
// Returns false, changing nothing, if the combined access is not supported,
// would be misaligned, or is an indirect access in a compute program.
bool
MemoryOpt::combineSt(Record *rec, Instruction *st)
{
   int32_t offRc = rec->offset;
   int32_t offSt = st->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeSt = typeSizeof(st->dType);
   int s = sizeSt / 4;
   int size = sizeRc + sizeSt;
   int j, k;
   Value *src[4]; // no modifiers in ValueRef allowed for st
   Value *extra[3];

   if (!prog->getTarget()->
       isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned stores
   if (size == 8 && MIN2(offRc, offSt) & 0x7)
      return false;
   // for compute indirect stores are not guaranteed to be aligned
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

   st->takeExtraSources(0, extra); // save predicate and indirect address

   if (offRc < offSt) {
      // The record's data comes first: its values become the low sources of
      // @st, the values @st already had move up behind them.
      // save values from @st
      for (s = 0; sizeSt; ++s) {
         sizeSt -= st->getSrc(s + 1)->reg.size;
         src[s] = st->getSrc(s + 1);
      }
      // set record's values as low sources of @st
      for (j = 1; sizeRc; ++j) {
         sizeRc -= rec->insn->getSrc(j)->reg.size;
         st->setSrc(j, rec->insn->getSrc(j));
      }
      // set saved values as high sources of @st
      for (k = j, j = 0; j < s; ++j)
         st->setSrc(k++, src[j]);

      // The combined store starts at the record's offset.
      updateLdStOffset(st, offRc, func);
   } else {
      // @st's data comes first: skip over its own value sources, then append
      // the record's values.
      for (j = 1; sizeSt; ++j)
         sizeSt -= st->getSrc(j)->reg.size;
      for (s = 1; sizeRc; ++j, ++s) {
         sizeRc -= rec->insn->getSrc(s)->reg.size;
         st->setSrc(j, rec->insn->getSrc(s));
      }
      rec->offset = offSt;
   }
   st->putExtraSources(0, extra); // restore pointer and predicate

   // @st replaces the recorded store; the record, the address symbol and the
   // instruction type take the combined size.
   delete_Instruction(prog, rec->insn);
   rec->insn = st;
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));
   return true;
}
2446
2447 void
2448 MemoryOpt::Record::set(const Instruction *ldst)
2449 {
2450    const Symbol *mem = ldst->getSrc(0)->asSym();
2451    fileIndex = mem->reg.fileIndex;
2452    rel[0] = ldst->getIndirect(0, 0);
2453    rel[1] = ldst->getIndirect(0, 1);
2454    offset = mem->reg.data.offset;
2455    base = mem->getBase();
2456    size = typeSizeof(ldst->sType);
2457 }
2458
2459 void
2460 MemoryOpt::Record::link(Record **list)
2461 {
2462    next = *list;
2463    if (next)
2464       next->prev = this;
2465    prev = NULL;
2466    *list = this;
2467 }
2468
2469 void
2470 MemoryOpt::Record::unlink(Record **list)
2471 {
2472    if (next)
2473       next->prev = prev;
2474    if (prev)
2475       prev->next = next;
2476    else
2477       *list = next;
2478 }
2479
2480 MemoryOpt::Record **
2481 MemoryOpt::getList(const Instruction *insn)
2482 {
2483    if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
2484       return &loads[insn->src(0).getFile()];
2485    return &stores[insn->src(0).getFile()];
2486 }
2487
2488 void
2489 MemoryOpt::addRecord(Instruction *i)
2490 {
2491    Record **list = getList(i);
2492    Record *it = reinterpret_cast<Record *>(recordPool.allocate());
2493
2494    it->link(list);
2495    it->set(i);
2496    it->insn = i;
2497    it->locked = false;
2498 }
2499
// Search the load list (@load true) or store list (@load false) of the data
// file accessed by @insn for a record that the access of @insn either falls
// on or directly neighbours.
// @isAdj is written whenever a candidate is examined: false for a record
// that is returned because it overlaps / covers the access, true for one
// that merely borders it. It is left untouched if no candidate is examined.
// Returns NULL if nothing suitable was found.
MemoryOpt::Record *
MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
{
   const Symbol *sym = insn->getSrc(0)->asSym();
   const int size = typeSizeof(insn->sType);
   Record *rec = NULL;
   Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];

   for (; it; it = it->next) {
      // Locked records are only considered when @insn is an OP_LOAD.
      if (it->locked && insn->op != OP_LOAD)
         continue;
      // Candidates must lie in the same 16-byte chunk and use the same
      // indirect addresses and file index.
      if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
          it->rel[0] != insn->getIndirect(0, 0) ||
          it->fileIndex != sym->reg.fileIndex ||
          it->rel[1] != insn->getIndirect(0, 1))
         continue;

      if (it->offset < sym->reg.data.offset) {
         // Record starts below the access.
         if (it->offset + it->size >= sym->reg.data.offset) {
            isAdj = (it->offset + it->size == sym->reg.data.offset);
            // The record reaches into the access: report it right away.
            if (!isAdj)
               return it;
            // Directly adjacent: remember it only if the record is 8-byte
            // aligned, and keep searching.
            if (!(it->offset & 0x7))
               rec = it;
         }
      } else {
         // Record starts at or above the access.
         isAdj = it->offset != sym->reg.data.offset;
         // Same offset and the record is at least as large: report it.
         if (size <= it->size && !isAdj)
            return it;
         else
         // Otherwise remember it if the access is 8-byte aligned and reaches
         // at least up to the record's start.
         if (!(sym->reg.data.offset & 0x7))
            if (it->offset - size <= sym->reg.data.offset)
               rec = it;
      }
   }
   return rec;
}
2537
2538 bool
2539 MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
2540 {
2541    Instruction *st = rec->insn;
2542    int32_t offSt = rec->offset;
2543    int32_t offLd = ld->getSrc(0)->reg.data.offset;
2544    int d, s;
2545
2546    for (s = 1; offSt != offLd && st->srcExists(s); ++s)
2547       offSt += st->getSrc(s)->reg.size;
2548    if (offSt != offLd)
2549       return false;
2550
2551    for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
2552       if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
2553          return false;
2554       if (st->getSrc(s)->reg.file != FILE_GPR)
2555          return false;
2556       ld->def(d).replace(st->src(s), false);
2557    }
2558    ld->bb->remove(ld);
2559    return true;
2560 }
2561
2562 bool
2563 MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
2564 {
2565    Instruction *ldR = rec->insn;
2566    int32_t offR = rec->offset;
2567    int32_t offE = ldE->getSrc(0)->reg.data.offset;
2568    int dR, dE;
2569
2570    assert(offR <= offE);
2571    for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
2572       offR += ldR->getDef(dR)->reg.size;
2573    if (offR != offE)
2574       return false;
2575
2576    for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
2577       if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
2578          return false;
2579       ldE->def(dE).replace(ldR->getDef(dR), false);
2580    }
2581
2582    delete_Instruction(prog, ldE);
2583    return true;
2584 }
2585
// Merge the recorded store into the later, overlapping store @st: @st keeps
// its own values and additionally takes over the recorded values that lie
// outside the range it writes itself. The recorded instruction is deleted
// and the record is updated to describe @st. Always returns true.
bool
MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
{
   const Instruction *const ri = rec->insn;
   Value *extra[3];

   // [offS, endS) is the range written by @st, [offR, endR) the one written
   // by the recorded store.
   int32_t offS = st->getSrc(0)->reg.data.offset;
   int32_t offR = rec->offset;
   int32_t endS = offS + typeSizeof(st->dType);
   int32_t endR = offR + typeSizeof(ri->dType);

   // The merged store spans both ranges.
   rec->size = MAX2(endS, endR) - MIN2(offS, offR);

   // Set the extra sources of @st aside while its value sources are
   // rearranged; they are put back below.
   st->takeExtraSources(0, extra);

   if (offR < offS) {
      // The record starts first: build the new value list in vals[].
      Value *vals[10];
      int s, n;
      int k = 0;
      // get non-replaced sources of ri
      for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      n = s;
      // get replaced sources of st
      for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
         vals[k++] = st->getSrc(s);
      // skip replaced sources of ri
      for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
      // get non-replaced sources after values covered by st
      for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      assert((unsigned int)k <= ARRAY_SIZE(vals));
      for (s = 0; s < k; ++s)
         st->setSrc(s + 1, vals[s]);
      // The merged store uses the recorded store's address operand.
      st->setSrc(0, ri->getSrc(0));
   } else
   if (endR > endS) {
      // @st starts first (or at the same offset) but the record extends
      // beyond it: append the recorded values past the end of @st.
      int j, s;
      for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
      for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
      for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
         st->setSrc(s++, ri->getSrc(j));
   }
   st->putExtraSources(0, extra);

   delete_Instruction(prog, rec->insn);

   rec->insn = st;
   rec->offset = st->getSrc(0)->reg.data.offset;

   st->setType(typeOfSize(rec->size));

   return true;
}
2640
2641 bool
2642 MemoryOpt::Record::overlaps(const Instruction *ldst) const
2643 {
2644    Record that;
2645    that.set(ldst);
2646
2647    if (this->fileIndex != that.fileIndex)
2648       return false;
2649
2650    if (this->rel[0] || that.rel[0])
2651       return this->base == that.base;
2652    return
2653       (this->offset < that.offset + that.size) &&
2654       (this->offset + this->size > that.offset);
2655 }
2656
2657 // We must not eliminate stores that affect the result of @ld if
2658 // we find later stores to the same location, and we may no longer
2659 // merge them with later stores.
2660 // The stored value can, however, still be used to determine the value
2661 // returned by future loads.
2662 void
2663 MemoryOpt::lockStores(Instruction *const ld)
2664 {
2665    for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
2666       if (!r->locked && r->overlaps(ld))
2667          r->locked = true;
2668 }
2669
2670 // Prior loads from the location of @st are no longer valid.
2671 // Stores to the location of @st may no longer be used to derive
2672 // the value at it nor be coalesced into later stores.
2673 void
2674 MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
2675 {
2676    if (st)
2677       f = st->src(0).getFile();
2678
2679    for (Record *r = loads[f]; r; r = r->next)
2680       if (!st || r->overlaps(st))
2681          r->unlink(&loads[f]);
2682
2683    for (Record *r = stores[f]; r; r = r->next)
2684       if (!st || r->overlaps(st))
2685          r->unlink(&stores[f]);
2686 }
2687
2688 bool
2689 MemoryOpt::visit(BasicBlock *bb)
2690 {
2691    bool ret = runOpt(bb);
2692    // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
2693    // where 96 bit memory operations are forbidden.
2694    if (ret)
2695       ret = runOpt(bb);
2696    return ret;
2697 }
2698
2699 bool
2700 MemoryOpt::runOpt(BasicBlock *bb)
2701 {
2702    Instruction *ldst, *next;
2703    Record *rec;
2704    bool isAdjacent = true;
2705
2706    for (ldst = bb->getEntry(); ldst; ldst = next) {
2707       bool keep = true;
2708       bool isLoad = true;
2709       next = ldst->next;
2710
2711       if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
2712          if (ldst->isDead()) {
2713             // might have been produced by earlier optimization
2714             delete_Instruction(prog, ldst);
2715             continue;
2716          }
2717       } else
2718       if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
2719          if (typeSizeof(ldst->dType) == 4 &&
2720              ldst->src(1).getFile() == FILE_GPR &&
2721              ldst->getSrc(1)->getInsn()->op == OP_NOP) {
2722             delete_Instruction(prog, ldst);
2723             continue;
2724          }
2725          isLoad = false;
2726       } else {
2727          // TODO: maybe have all fixed ops act as barrier ?
2728          if (ldst->op == OP_CALL ||
2729              ldst->op == OP_BAR ||
2730              ldst->op == OP_MEMBAR) {
2731             purgeRecords(NULL, FILE_MEMORY_LOCAL);
2732             purgeRecords(NULL, FILE_MEMORY_GLOBAL);
2733             purgeRecords(NULL, FILE_MEMORY_SHARED);
2734             purgeRecords(NULL, FILE_SHADER_OUTPUT);
2735          } else
2736          if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
2737             if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
2738                purgeRecords(NULL, FILE_MEMORY_LOCAL);
2739                purgeRecords(NULL, FILE_MEMORY_GLOBAL);
2740                purgeRecords(NULL, FILE_MEMORY_SHARED);
2741             } else {
2742                purgeRecords(NULL, ldst->src(0).getFile());
2743             }
2744          } else
2745          if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
2746             purgeRecords(NULL, FILE_SHADER_OUTPUT);
2747          }
2748          continue;
2749       }
2750       if (ldst->getPredicate()) // TODO: handle predicated ld/st
2751          continue;
2752       if (ldst->perPatch) // TODO: create separate per-patch lists
2753          continue;
2754
2755       if (isLoad) {
2756          DataFile file = ldst->src(0).getFile();
2757
2758          // if ld l[]/g[] look for previous store to eliminate the reload
2759          if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
2760             // TODO: shared memory ?
2761             rec = findRecord(ldst, false, isAdjacent);
2762             if (rec && !isAdjacent)
2763                keep = !replaceLdFromSt(ldst, rec);
2764          }
2765
2766          // or look for ld from the same location and replace this one
2767          rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
2768          if (rec) {
2769             if (!isAdjacent)
2770                keep = !replaceLdFromLd(ldst, rec);
2771             else
2772                // or combine a previous load with this one
2773                keep = !combineLd(rec, ldst);
2774          }
2775          if (keep)
2776             lockStores(ldst);
2777       } else {
2778          rec = findRecord(ldst, false, isAdjacent);
2779          if (rec) {
2780             if (!isAdjacent)
2781                keep = !replaceStFromSt(ldst, rec);
2782             else
2783                keep = !combineSt(rec, ldst);
2784          }
2785          if (keep)
2786             purgeRecords(ldst, DATA_FILE_COUNT);
2787       }
2788       if (keep)
2789          addRecord(ldst);
2790    }
2791    reset();
2792
2793    return true;
2794 }
2795
2796 // =============================================================================
2797
2798 // Turn control flow into predicated instructions (after register allocation !).
2799 // TODO:
2800 // Could move this to before register allocation on NVC0 and also handle nested
2801 // constructs.
2802 class FlatteningPass : public Pass
2803 {
2804 private:
2805    virtual bool visit(Function *);
2806    virtual bool visit(BasicBlock *);
2807
2808    bool tryPredicateConditional(BasicBlock *);
2809    void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
2810    void tryPropagateBranch(BasicBlock *);
2811    inline bool isConstantCondition(Value *pred);
2812    inline bool mayPredicate(const Instruction *, const Value *pred) const;
2813    inline void removeFlow(Instruction *);
2814
2815    uint8_t gpr_unit;
2816 };
2817
2818 bool
2819 FlatteningPass::isConstantCondition(Value *pred)
2820 {
2821    Instruction *insn = pred->getUniqueInsn();
2822    assert(insn);
2823    if (insn->op != OP_SET || insn->srcExists(2))
2824       return false;
2825
2826    for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
2827       Instruction *ld = insn->getSrc(s)->getUniqueInsn();
2828       DataFile file;
2829       if (ld) {
2830          if (ld->op != OP_MOV && ld->op != OP_LOAD)
2831             return false;
2832          if (ld->src(0).isIndirect(0))
2833             return false;
2834          file = ld->src(0).getFile();
2835       } else {
2836          file = insn->src(s).getFile();
2837          // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
2838          // in register "units", which can vary between targets.
2839          if (file == FILE_GPR) {
2840             Value *v = insn->getSrc(s);
2841             int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
2842             int units = bytes >> gpr_unit;
2843             if (units > prog->maxGPR)
2844                file = FILE_IMMEDIATE;
2845          }
2846       }
2847       if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
2848          return false;
2849    }
2850    return true;
2851 }
2852
2853 void
2854 FlatteningPass::removeFlow(Instruction *insn)
2855 {
2856    FlowInstruction *term = insn ? insn->asFlow() : NULL;
2857    if (!term)
2858       return;
2859    Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();
2860
2861    if (term->op == OP_BRA) {
2862       // TODO: this might get more difficult when we get arbitrary BRAs
2863       if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
2864          return;
2865    } else
2866    if (term->op != OP_JOIN)
2867       return;
2868
2869    Value *pred = term->getPredicate();
2870
2871    delete_Instruction(prog, term);
2872
2873    if (pred && pred->refCount() == 0) {
2874       Instruction *pSet = pred->getUniqueInsn();
2875       pred->join->reg.data.id = -1; // deallocate
2876       if (pSet->isDead())
2877          delete_Instruction(prog, pSet);
2878    }
2879 }
2880
2881 void
2882 FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
2883 {
2884    for (Instruction *i = bb->getEntry(); i; i = i->next) {
2885       if (i->isNop())
2886          continue;
2887       assert(!i->getPredicate());
2888       i->setPredicate(cc, pred);
2889    }
2890    removeFlow(bb->getExit());
2891 }
2892
2893 bool
2894 FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
2895 {
2896    if (insn->isPseudo())
2897       return true;
2898    // TODO: calls where we don't know which registers are modified
2899
2900    if (!prog->getTarget()->mayPredicate(insn, pred))
2901       return false;
2902    for (int d = 0; insn->defExists(d); ++d)
2903       if (insn->getDef(d)->equals(pred))
2904          return false;
2905    return true;
2906 }
2907
2908 // If we jump to BRA/RET/EXIT, replace the jump with it.
2909 // NOTE: We do not update the CFG anymore here !
2910 //
2911 // TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
2912 //  BB:0
2913 //   @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
2914 //  BB1:
2915 //   bra BB:3
2916 //  BB2:
2917 //   ...
2918 //  BB3:
2919 //   ...
2920 void
2921 FlatteningPass::tryPropagateBranch(BasicBlock *bb)
2922 {
2923    for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
2924       BasicBlock *bf = i->asFlow()->target.bb;
2925
2926       if (bf->getInsnCount() != 1)
2927          continue;
2928
2929       FlowInstruction *bra = i->asFlow();
2930       FlowInstruction *rep = bf->getExit()->asFlow();
2931
2932       if (!rep || rep->getPredicate())
2933          continue;
2934       if (rep->op != OP_BRA &&
2935           rep->op != OP_JOIN &&
2936           rep->op != OP_EXIT)
2937          continue;
2938
2939       // TODO: If there are multiple branches to @rep, only the first would
2940       // be replaced, so only remove them after this pass is done ?
2941       // Also, need to check all incident blocks for fall-through exits and
2942       // add the branch there.
2943       bra->op = rep->op;
2944       bra->target.bb = rep->target.bb;
2945       if (bf->cfg.incidentCount() == 1)
2946          bf->remove(rep);
2947    }
2948 }
2949
2950 bool
2951 FlatteningPass::visit(Function *fn)
2952 {
2953    gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);
2954
2955    return true;
2956 }
2957
2958 bool
2959 FlatteningPass::visit(BasicBlock *bb)
2960 {
2961    if (tryPredicateConditional(bb))
2962       return true;
2963
2964    // try to attach join to previous instruction
2965    if (prog->getTarget()->hasJoin) {
2966       Instruction *insn = bb->getExit();
2967       if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
2968          insn = insn->prev;
2969          if (insn && !insn->getPredicate() &&
2970              !insn->asFlow() &&
2971              insn->op != OP_DISCARD &&
2972              insn->op != OP_TEXBAR &&
2973              !isTextureOp(insn->op) && // probably just nve4
2974              !isSurfaceOp(insn->op) && // not confirmed
2975              insn->op != OP_LINTERP && // probably just nve4
2976              insn->op != OP_PINTERP && // probably just nve4
2977              ((insn->op != OP_LOAD && insn->op != OP_STORE && insn->op != OP_ATOM) ||
2978               (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
2979              !insn->isNop()) {
2980             insn->join = 1;
2981             bb->remove(bb->getExit());
2982             return true;
2983          }
2984       }
2985    }
2986
2987    tryPropagateBranch(bb);
2988
2989    return true;
2990 }
2991
2992 bool
2993 FlatteningPass::tryPredicateConditional(BasicBlock *bb)
2994 {
2995    BasicBlock *bL = NULL, *bR = NULL;
2996    unsigned int nL = 0, nR = 0, limit = 12;
2997    Instruction *insn;
2998    unsigned int mask;
2999
3000    mask = bb->initiatesSimpleConditional();
3001    if (!mask)
3002       return false;
3003
3004    assert(bb->getExit());
3005    Value *pred = bb->getExit()->getPredicate();
3006    assert(pred);
3007
3008    if (isConstantCondition(pred))
3009       limit = 4;
3010
3011    Graph::EdgeIterator ei = bb->cfg.outgoing();
3012
3013    if (mask & 1) {
3014       bL = BasicBlock::get(ei.getNode());
3015       for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
3016          if (!mayPredicate(insn, pred))
3017             return false;
3018       if (nL > limit)
3019          return false; // too long, do a real branch
3020    }
3021    ei.next();
3022
3023    if (mask & 2) {
3024       bR = BasicBlock::get(ei.getNode());
3025       for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
3026          if (!mayPredicate(insn, pred))
3027             return false;
3028       if (nR > limit)
3029          return false; // too long, do a real branch
3030    }
3031
3032    if (bL)
3033       predicateInstructions(bL, pred, bb->getExit()->cc);
3034    if (bR)
3035       predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));
3036
3037    if (bb->joinAt) {
3038       bb->remove(bb->joinAt);
3039       bb->joinAt = NULL;
3040    }
3041    removeFlow(bb->getExit()); // delete the branch/join at the fork point
3042
3043    // remove potential join operations at the end of the conditional
3044    if (prog->getTarget()->joinAnterior) {
3045       bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
3046       if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
3047          removeFlow(bb->getEntry());
3048    }
3049
3050    return true;
3051 }
3052
3053 // =============================================================================
3054
3055 // Fold Immediate into MAD; must be done after register allocation due to
3056 // constraint SDST == SSRC2
3057 // TODO:
3058 // Does NVC0+ have other situations where this pass makes sense?
3059 class NV50PostRaConstantFolding : public Pass
3060 {
3061 private:
3062    virtual bool visit(BasicBlock *);
3063 };
3064
3065 static bool
3066 post_ra_dead(Instruction *i)
3067 {
3068    for (int d = 0; i->defExists(d); ++d)
3069       if (i->getDef(d)->refCount())
3070          return false;
3071    return true;
3072 }
3073
3074 bool
3075 NV50PostRaConstantFolding::visit(BasicBlock *bb)
3076 {
3077    Value *vtmp;
3078    Instruction *def;
3079
3080    for (Instruction *i = bb->getFirst(); i; i = i->next) {
3081       switch (i->op) {
3082       case OP_MAD:
3083          if (i->def(0).getFile() != FILE_GPR ||
3084              i->src(0).getFile() != FILE_GPR ||
3085              i->src(1).getFile() != FILE_GPR ||
3086              i->src(2).getFile() != FILE_GPR ||
3087              i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
3088             break;
3089
3090          if (i->getDef(0)->reg.data.id >= 64 ||
3091              i->getSrc(0)->reg.data.id >= 64)
3092             break;
3093
3094          if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
3095             break;
3096
3097          if (i->getPredicate())
3098             break;
3099
3100          def = i->getSrc(1)->getInsn();
3101          if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
3102             def = def->getSrc(0)->getInsn();
3103          if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
3104             vtmp = i->getSrc(1);
3105             if (isFloatType(i->sType)) {
3106                i->setSrc(1, def->getSrc(0));
3107             } else {
3108                ImmediateValue val;
3109                bool ret = def->src(0).getImmediate(val);
3110                assert(ret);
3111                if (i->getSrc(1)->reg.data.id & 1)
3112                   val.reg.data.u32 >>= 16;
3113                val.reg.data.u32 &= 0xffff;
3114                i->setSrc(1, new_ImmediateValue(bb->getProgram(), val.reg.data.u32));
3115             }
3116
3117             /* There's no post-RA dead code elimination, so do it here
3118              * XXX: if we add more code-removing post-RA passes, we might
3119              *      want to create a post-RA dead-code elim pass */
3120             if (post_ra_dead(vtmp->getInsn())) {
3121                Value *src = vtmp->getInsn()->getSrc(0);
3122                // Careful -- splits will have already been removed from the
3123                // functions. Don't double-delete.
3124                if (vtmp->getInsn()->bb)
3125                   delete_Instruction(prog, vtmp->getInsn());
3126                if (src->getInsn() && post_ra_dead(src->getInsn()))
3127                   delete_Instruction(prog, src->getInsn());
3128             }
3129
3130             break;
3131          }
3132          break;
3133       default:
3134          break;
3135       }
3136    }
3137
3138    return true;
3139 }
3140
3141 // =============================================================================
3142
3143 // Common subexpression elimination. Stupid O^2 implementation.
3144 class LocalCSE : public Pass
3145 {
3146 private:
3147    virtual bool visit(BasicBlock *);
3148
3149    inline bool tryReplace(Instruction **, Instruction *);
3150
3151    DLList ops[OP_LAST + 1];
3152 };
3153
3154 class GlobalCSE : public Pass
3155 {
3156 private:
3157    virtual bool visit(BasicBlock *);
3158 };
3159
3160 bool
3161 Instruction::isActionEqual(const Instruction *that) const
3162 {
3163    if (this->op != that->op ||
3164        this->dType != that->dType ||
3165        this->sType != that->sType)
3166       return false;
3167    if (this->cc != that->cc)
3168       return false;
3169
3170    if (this->asTex()) {
3171       if (memcmp(&this->asTex()->tex,
3172                  &that->asTex()->tex,
3173                  sizeof(this->asTex()->tex)))
3174          return false;
3175    } else
3176    if (this->asCmp()) {
3177       if (this->asCmp()->setCond != that->asCmp()->setCond)
3178          return false;
3179    } else
3180    if (this->asFlow()) {
3181       return false;
3182    } else {
3183       if (this->ipa != that->ipa ||
3184           this->lanes != that->lanes ||
3185           this->perPatch != that->perPatch)
3186          return false;
3187       if (this->postFactor != that->postFactor)
3188          return false;
3189    }
3190
3191    if (this->subOp != that->subOp ||
3192        this->saturate != that->saturate ||
3193        this->rnd != that->rnd ||
3194        this->ftz != that->ftz ||
3195        this->dnz != that->dnz ||
3196        this->cache != that->cache ||
3197        this->mask != that->mask)
3198       return false;
3199
3200    return true;
3201 }
3202
3203 bool
3204 Instruction::isResultEqual(const Instruction *that) const
3205 {
3206    unsigned int d, s;
3207
3208    // NOTE: location of discard only affects tex with liveOnly and quadops
3209    if (!this->defExists(0) && this->op != OP_DISCARD)
3210       return false;
3211
3212    if (!isActionEqual(that))
3213       return false;
3214
3215    if (this->predSrc != that->predSrc)
3216       return false;
3217
3218    for (d = 0; this->defExists(d); ++d) {
3219       if (!that->defExists(d) ||
3220           !this->getDef(d)->equals(that->getDef(d), false))
3221          return false;
3222    }
3223    if (that->defExists(d))
3224       return false;
3225
3226    for (s = 0; this->srcExists(s); ++s) {
3227       if (!that->srcExists(s))
3228          return false;
3229       if (this->src(s).mod != that->src(s).mod)
3230          return false;
3231       if (!this->getSrc(s)->equals(that->getSrc(s), true))
3232          return false;
3233    }
3234    if (that->srcExists(s))
3235       return false;
3236
3237    if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
3238       switch (src(0).getFile()) {
3239       case FILE_MEMORY_CONST:
3240       case FILE_SHADER_INPUT:
3241          return true;
3242       case FILE_SHADER_OUTPUT:
3243          return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
3244       default:
3245          return false;
3246       }
3247    }
3248
3249    return true;
3250 }
3251
3252 // pull through common expressions from different in-blocks
3253 bool
3254 GlobalCSE::visit(BasicBlock *bb)
3255 {
3256    Instruction *phi, *next, *ik;
3257    int s;
3258
3259    // TODO: maybe do this with OP_UNION, too
3260
3261    for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
3262       next = phi->next;
3263       if (phi->getSrc(0)->refCount() > 1)
3264          continue;
3265       ik = phi->getSrc(0)->getInsn();
3266       if (!ik)
3267          continue; // probably a function input
3268       if (ik->defCount(0xff) > 1)
3269          continue; // too painful to check if we can really push this forward
3270       for (s = 1; phi->srcExists(s); ++s) {
3271          if (phi->getSrc(s)->refCount() > 1)
3272             break;
3273          if (!phi->getSrc(s)->getInsn() ||
3274              !phi->getSrc(s)->getInsn()->isResultEqual(ik))
3275             break;
3276       }
3277       if (!phi->srcExists(s)) {
3278          Instruction *entry = bb->getEntry();
3279          ik->bb->remove(ik);
3280          if (!entry || entry->op != OP_JOIN)
3281             bb->insertHead(ik);
3282          else
3283             bb->insertAfter(entry, ik);
3284          ik->setDef(0, phi->getDef(0));
3285          delete_Instruction(prog, phi);
3286       }
3287    }
3288
3289    return true;
3290 }
3291
3292 bool
3293 LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
3294 {
3295    Instruction *old = *ptr;
3296
3297    // TODO: maybe relax this later (causes trouble with OP_UNION)
3298    if (i->isPredicated())
3299       return false;
3300
3301    if (!old->isResultEqual(i))
3302       return false;
3303
3304    for (int d = 0; old->defExists(d); ++d)
3305       old->def(d).replace(i->getDef(d), false);
3306    delete_Instruction(prog, old);
3307    *ptr = NULL;
3308    return true;
3309 }
3310
3311 bool
3312 LocalCSE::visit(BasicBlock *bb)
3313 {
3314    unsigned int replaced;
3315
3316    do {
3317       Instruction *ir, *next;
3318
3319       replaced = 0;
3320
3321       // will need to know the order of instructions
3322       int serial = 0;
3323       for (ir = bb->getFirst(); ir; ir = ir->next)
3324          ir->serial = serial++;
3325
3326       for (ir = bb->getFirst(); ir; ir = next) {
3327          int s;
3328          Value *src = NULL;
3329
3330          next = ir->next;
3331
3332          if (ir->fixed) {
3333             ops[ir->op].insert(ir);
3334             continue;
3335          }
3336
3337          for (s = 0; ir->srcExists(s); ++s)
3338             if (ir->getSrc(s)->asLValue())
3339                if (!src || ir->getSrc(s)->refCount() < src->refCount())
3340                   src = ir->getSrc(s);
3341
3342          if (src) {
3343             for (Value::UseIterator it = src->uses.begin();
3344                  it != src->uses.end(); ++it) {
3345                Instruction *ik = (*it)->getInsn();
3346                if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
3347                   if (tryReplace(&ir, ik))
3348                      break;
3349             }
3350          } else {
3351             DLLIST_FOR_EACH(&ops[ir->op], iter)
3352             {
3353                Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
3354                if (tryReplace(&ir, ik))
3355                   break;
3356             }
3357          }
3358
3359          if (ir)
3360             ops[ir->op].insert(ir);
3361          else
3362             ++replaced;
3363       }
3364       for (unsigned int i = 0; i <= OP_LAST; ++i)
3365          ops[i].clear();
3366
3367    } while (replaced);
3368
3369    return true;
3370 }
3371
3372 // =============================================================================
3373
3374 // Remove computations of unused values.
3375 class DeadCodeElim : public Pass
3376 {
3377 public:
3378    bool buryAll(Program *);
3379
3380 private:
3381    virtual bool visit(BasicBlock *);
3382
3383    void checkSplitLoad(Instruction *ld); // for partially dead loads
3384
3385    unsigned int deadCount;
3386 };
3387
3388 bool
3389 DeadCodeElim::buryAll(Program *prog)
3390 {
3391    do {
3392       deadCount = 0;
3393       if (!this->run(prog, false, false))
3394          return false;
3395    } while (deadCount);
3396
3397    return true;
3398 }
3399
3400 bool
3401 DeadCodeElim::visit(BasicBlock *bb)
3402 {
3403    Instruction *prev;
3404
3405    for (Instruction *i = bb->getExit(); i; i = prev) {
3406       prev = i->prev;
3407       if (i->isDead()) {
3408          ++deadCount;
3409          delete_Instruction(prog, i);
3410       } else
3411       if (i->defExists(1) &&
3412           i->subOp == 0 &&
3413           (i->op == OP_VFETCH || i->op == OP_LOAD)) {
3414          checkSplitLoad(i);
3415       } else
3416       if (i->defExists(0) && !i->getDef(0)->refCount()) {
3417          if (i->op == OP_ATOM ||
3418              i->op == OP_SUREDP ||
3419              i->op == OP_SUREDB) {
3420             i->setDef(0, NULL);
3421          } else if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
3422             i->setDef(0, i->getDef(1));
3423             i->setDef(1, NULL);
3424          }
3425       }
3426    }
3427    return true;
3428 }
3429
3430 // Each load can go into up to 4 destinations, any of which might potentially
3431 // be dead (i.e. a hole). These can always be split into 2 loads, independent
3432 // of where the holes are. We find the first contiguous region, put it into
3433 // the first load, and then put the second contiguous region into the second
3434 // load. There can be at most 2 contiguous regions.
3435 //
3436 // Note that there are some restrictions, for example it's not possible to do
3437 // a 64-bit load that's not 64-bit aligned, so such a load has to be split
3438 // up. Also hardware doesn't support 96-bit loads, so those also have to be
3439 // split into a 64-bit and 32-bit load.
3440 void
3441 DeadCodeElim::checkSplitLoad(Instruction *ld1)
3442 {
3443    Instruction *ld2 = NULL; // can get at most 2 loads
3444    Value *def1[4];
3445    Value *def2[4];
3446    int32_t addr1, addr2;
3447    int32_t size1, size2;
3448    int d, n1, n2;
3449    uint32_t mask = 0xffffffff;
3450
3451    for (d = 0; ld1->defExists(d); ++d)
3452       if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
3453          mask &= ~(1 << d);
3454    if (mask == 0xffffffff)
3455       return;
3456
3457    addr1 = ld1->getSrc(0)->reg.data.offset;
3458    n1 = n2 = 0;
3459    size1 = size2 = 0;
3460
3461    // Compute address/width for first load
3462    for (d = 0; ld1->defExists(d); ++d) {
3463       if (mask & (1 << d)) {
3464          if (size1 && (addr1 & 0x7))
3465             break;
3466          def1[n1] = ld1->getDef(d);
3467          size1 += def1[n1++]->reg.size;
3468       } else
3469       if (!n1) {
3470          addr1 += ld1->getDef(d)->reg.size;
3471       } else {
3472          break;
3473       }
3474    }
3475
3476    // Scale back the size of the first load until it can be loaded. This
3477    // typically happens for TYPE_B96 loads.
3478    while (n1 &&
3479           !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
3480                                                 typeOfSize(size1))) {
3481       size1 -= def1[--n1]->reg.size;
3482       d--;
3483    }
3484
3485    // Compute address/width for second load
3486    for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
3487       if (mask & (1 << d)) {
3488          assert(!size2 || !(addr2 & 0x7));
3489          def2[n2] = ld1->getDef(d);
3490          size2 += def2[n2++]->reg.size;
3491       } else if (!n2) {
3492          assert(!n2);
3493          addr2 += ld1->getDef(d)->reg.size;
3494       } else {
3495          break;
3496       }
3497    }
3498
3499    // Make sure that we've processed all the values
3500    for (; ld1->defExists(d); ++d)
3501       assert(!(mask & (1 << d)));
3502
3503    updateLdStOffset(ld1, addr1, func);
3504    ld1->setType(typeOfSize(size1));
3505    for (d = 0; d < 4; ++d)
3506       ld1->setDef(d, (d < n1) ? def1[d] : NULL);
3507
3508    if (!n2)
3509       return;
3510
3511    ld2 = cloneShallow(func, ld1);
3512    updateLdStOffset(ld2, addr2, func);
3513    ld2->setType(typeOfSize(size2));
3514    for (d = 0; d < 4; ++d)
3515       ld2->setDef(d, (d < n2) ? def2[d] : NULL);
3516
3517    ld1->bb->insertAfter(ld1, ld2);
3518 }
3519
3520 // =============================================================================
3521
// RUN_PASS(l, n, f): if the enclosing function's `level` is at least @l,
// instantiate pass class @n and call its entry point @f on `this`; a failing
// pass makes the enclosing function return false. With verbose debugging
// enabled in `dbgFlags`, the pass name is logged first.
//
// The body is wrapped in do { } while (0) so that an invocation followed by
// a semicolon forms exactly one statement and can be used safely as the
// unbraced body of an if/else.
#define RUN_PASS(l, n, f)                       \
   do {                                         \
      if (level >= (l)) {                       \
         if (dbgFlags & NV50_IR_DEBUG_VERBOSE)  \
            INFO("PEEPHOLE: %s\n", #n);         \
         n pass;                                \
         if (!pass.f(this))                     \
            return false;                       \
      }                                         \
   } while (0)
3530
3531 bool
3532 Program::optimizeSSA(int level)
3533 {
3534    RUN_PASS(1, DeadCodeElim, buryAll);
3535    RUN_PASS(1, CopyPropagation, run);
3536    RUN_PASS(1, MergeSplits, run);
3537    RUN_PASS(2, GlobalCSE, run);
3538    RUN_PASS(1, LocalCSE, run);
3539    RUN_PASS(2, AlgebraicOpt, run);
3540    RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
3541    RUN_PASS(1, ConstantFolding, foldAll);
3542    RUN_PASS(2, LateAlgebraicOpt, run);
3543    RUN_PASS(1, LoadPropagation, run);
3544    RUN_PASS(1, IndirectPropagation, run);
3545    RUN_PASS(2, MemoryOpt, run);
3546    RUN_PASS(2, LocalCSE, run);
3547    RUN_PASS(0, DeadCodeElim, buryAll);
3548
3549    return true;
3550 }
3551
3552 bool
3553 Program::optimizePostRA(int level)
3554 {
3555    RUN_PASS(2, FlatteningPass, run);
3556    if (getTarget()->getChipset() < 0xc0)
3557       RUN_PASS(2, NV50PostRaConstantFolding, run);
3558
3559    return true;
3560 }
3561
3562 }