nv50/ir: optimize SHLADD(a, b, c) to MOV((a << b) + c)

[mesa.git] / src / gallium / drivers / nouveau / codegen / nv50_ir_peephole.cpp
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp

index 82e81482b5107ce83b9ead5592099cd18369b0f7..9875738593e12b01fcf2b37b1fb2c8bb2b0f17be 100644 (file)
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -155,7 +155,7 @@ private:
     void checkSwapSrc01(Instruction *);
  
     bool isCSpaceLoad(Instruction *);
-   bool isImmd32Load(Instruction *);
+   bool isImmdLoad(Instruction *);
     bool isAttribOrSharedLoad(Instruction *);
  };
  
@@ -166,11 +166,15 @@ LoadPropagation::isCSpaceLoad(Instruction *ld)
  }
  
  bool
-LoadPropagation::isImmd32Load(Instruction *ld)
+LoadPropagation::isImmdLoad(Instruction *ld)
  {
-   if (!ld || (ld->op != OP_MOV) || (typeSizeof(ld->dType) != 4))
+   // Only a MOV whose destination type is 4 or 8 bytes wide can qualify; a
+   // NULL instruction is tolerated and simply does not count as a load.
+   if (!ld || (ld->op != OP_MOV) ||
+       ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
        return false;
-   return ld->src(0).getFile() == FILE_IMMEDIATE;
+
+   // A 0 can be replaced with a register, so it doesn't count as an immediate.
+   // getImmediate() must succeed on the MOV's source and the value it yields
+   // must not be integer zero for this to report true.
+   ImmediateValue val;
+   return ld->src(0).getImmediate(val) && !val.isInteger(0);
  }
  
  bool
@@ -186,23 +190,29 @@ LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
  void
  LoadPropagation::checkSwapSrc01(Instruction *insn)
  {
-   if (!prog->getTarget()->getOpInfo(insn).commutative)
-      if (insn->op != OP_SET && insn->op != OP_SLCT)
+   const Target *targ = prog->getTarget();
+   if (!targ->getOpInfo(insn).commutative)
+      if (insn->op != OP_SET && insn->op != OP_SLCT && insn->op != OP_SUB)
           return;
     if (insn->src(1).getFile() != FILE_GPR)
        return;
+   // This is the special OP_SET used for alphatesting, we can't reverse its
+   // arguments as that will confuse the fixup code.
+   if (insn->op == OP_SET && insn->subOp)
+      return;
  
     Instruction *i0 = insn->getSrc(0)->getInsn();
     Instruction *i1 = insn->getSrc(1)->getInsn();
  
-   if (isCSpaceLoad(i0)) {
-      if (!isCSpaceLoad(i1))
-         insn->swapSources(0, 1);
-      else
-         return;
-   } else
-   if (isImmd32Load(i0)) {
-      if (!isCSpaceLoad(i1) && !isImmd32Load(i1))
+   // Swap sources to inline the less frequently used source. That way,
+   // optimistically, it will eventually be able to remove the instruction.
+   int i0refs = insn->getSrc(0)->refCount();
+   int i1refs = insn->getSrc(1)->refCount();
+
+   if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
+      if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
+          !targ->insnCanLoad(insn, 1, i1) ||
+          i0refs < i1refs)
           insn->swapSources(0, 1);
        else
           return;
@@ -222,6 +232,11 @@ LoadPropagation::checkSwapSrc01(Instruction *insn)
     else
     if (insn->op == OP_SLCT)
        insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
+   else
+   if (insn->op == OP_SUB) {
+      insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
+      insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
+   }
  }
  
  bool
@@ -236,6 +251,9 @@ LoadPropagation::visit(BasicBlock *bb)
        if (i->op == OP_CALL) // calls have args as sources, they must be in regs
           continue;
  
+      if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
+         continue;
+
        if (i->srcExists(1))
           checkSwapSrc01(i);
  
@@ -261,6 +279,60 @@ LoadPropagation::visit(BasicBlock *bb)
  
  // =============================================================================
  
+// Pass that folds constant address arithmetic into the constant offset of
+// indirectly addressed sources. Per basic block, each source that has an
+// indirect address (dimension 0) is inspected, and depending on the
+// instruction that defines that address value:
+//    ADD(addr, imm)  ->  address through addr, offset += imm
+//    SUB(addr, imm)  ->  address through addr, offset -= imm
+//    MOV(imm)        ->  indirection removed,  offset += imm
+// A rewrite only happens when the target's insnCanLoadOffset() accepts the
+// adjustment for that instruction and source.
+class IndirectPropagation : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+};
+
+// Applies the folding described on the class to every instruction of bb.
+// Always returns true.
+bool
+IndirectPropagation::visit(BasicBlock *bb)
+{
+   const Target *targ = prog->getTarget();
+   Instruction *next;
+
+   for (Instruction *i = bb->getEntry(); i; i = next) {
+      // The successor is captured before i is modified.
+      next = i->next;
+
+      for (int s = 0; i->srcExists(s); ++s) {
+         Instruction *insn;
+         ImmediateValue imm;
+         if (!i->src(s).isIndirect(0))
+            continue;
+         // insn is the producer of the indirect address; skip values that
+         // have no defining instruction.
+         insn = i->getIndirect(s, 0)->getInsn();
+         if (!insn)
+            continue;
+         // NOTE(review): source modifiers on insn's operands are not examined
+         // in any of the cases below -- confirm that address ADD/SUB/MOV
+         // never carry one at this point.
+         if (insn->op == OP_ADD && !isFloatType(insn->dType)) {
+            // Requires src0 in the target's native file for FILE_ADDRESS and
+            // an immediate src1 that the target can encode as an offset.
+            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
+                !insn->src(1).getImmediate(imm) ||
+                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
+               continue;
+            i->setIndirect(s, 0, insn->getSrc(0));
+            // The source is replaced by a shallow clone before its offset is
+            // changed (presumably so other users of the original value keep
+            // their offset -- TODO confirm).
+            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
+            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
+         } else if (insn->op == OP_SUB && !isFloatType(insn->dType)) {
+            // Same as the ADD case with the immediate negated.
+            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
+                !insn->src(1).getImmediate(imm) ||
+                !targ->insnCanLoadOffset(i, s, -imm.reg.data.s32))
+               continue;
+            i->setIndirect(s, 0, insn->getSrc(0));
+            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
+            i->src(s).get()->reg.data.offset -= imm.reg.data.u32;
+         } else if (insn->op == OP_MOV) {
+            // The address is itself a constant: drop the indirection and
+            // move the constant into the offset.
+            if (!insn->src(0).getImmediate(imm) ||
+                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
+               continue;
+            i->setIndirect(s, 0, NULL);
+            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
+            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
+         }
+      }
+   }
+   return true;
+}
+
+// =============================================================================
+
  // Evaluate constant expressions.
  class ConstantFolding : public Pass
  {
@@ -273,12 +345,12 @@ private:
     void expr(Instruction *, ImmediateValue&, ImmediateValue&);
     void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
     void opnd(Instruction *, ImmediateValue&, int s);
+   void opnd3(Instruction *, ImmediateValue&);
  
     void unary(Instruction *, const ImmediateValue&);
  
     void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);
  
-   // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET
     CmpInstruction *findOriginForTestWithZero(Value *);
  
     unsigned int foldCount;
@@ -326,6 +398,8 @@ ConstantFolding::visit(BasicBlock *bb)
        else
        if (i->srcExists(1) && i->src(1).getImmediate(src1))
           opnd(i, src1, 1);
+      if (i->srcExists(2) && i->src(2).getImmediate(src2))
+         opnd3(i, src2);
     }
     return true;
  }
@@ -337,25 +411,33 @@ ConstantFolding::findOriginForTestWithZero(Value *value)
        return NULL;
     Instruction *insn = value->getInsn();
  
-   while (insn && insn->op != OP_SET) {
-      Instruction *next = NULL;
-      switch (insn->op) {
-      case OP_NEG:
-      case OP_ABS:
-      case OP_CVT:
-         next = insn->getSrc(0)->getInsn();
-         if (insn->sType != next->dType)
+   if (insn->asCmp() && insn->op != OP_SLCT)
+      return insn->asCmp();
+
+   /* Sometimes mov's will sneak in as a result of other folding. This gets
+    * cleaned up later.
+    */
+   if (insn->op == OP_MOV)
+      return findOriginForTestWithZero(insn->getSrc(0));
+
+   /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
+   if (insn->op == OP_AND) {
+      int s = 0;
+      ImmediateValue imm;
+      if (!insn->src(s).getImmediate(imm)) {
+         s = 1;
+         if (!insn->src(s).getImmediate(imm))
              return NULL;
-         break;
-      case OP_MOV:
-         next = insn->getSrc(0)->getInsn();
-         break;
-      default:
-         return NULL;
        }
-      insn = next;
+      if (imm.reg.data.f32 != 1.0f)
+         return NULL;
+      /* TODO: Come up with a way to handle the condition being inverted */
+      if (insn->src(!s).mod != Modifier(0))
+         return NULL;
+      return findOriginForTestWithZero(insn->getSrc(!s));
     }
-   return insn ? insn->asCmp() : NULL;
+
+   return NULL;
  }
  
  void
@@ -437,6 +519,7 @@ ConstantFolding::expr(Instruction *i,
  {
     struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
     struct Storage res;
+   DataType type = i->dType;
  
     memset(&res.data, 0, sizeof(res.data));
  
@@ -493,6 +576,16 @@ ConstantFolding::expr(Instruction *i,
           return;
        }
        break;
+   case OP_SUB:
+      switch (i->dType) {
+      case TYPE_F32: res.data.f32 = a->data.f32 - b->data.f32; break;
+      case TYPE_F64: res.data.f64 = a->data.f64 - b->data.f64; break;
+      case TYPE_S32:
+      case TYPE_U32: res.data.u32 = a->data.u32 - b->data.u32; break;
+      default:
+         return;
+      }
+      break;
     case OP_POW:
        switch (i->dType) {
        case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
@@ -574,6 +667,23 @@ ConstantFolding::expr(Instruction *i,
     case OP_POPCNT:
        res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
        break;
+   case OP_PFETCH:
+      // The two arguments to pfetch are logically added together. Normally
+      // the second argument will not be constant, but that can happen.
+      res.data.u32 = a->data.u32 + b->data.u32;
+      type = TYPE_U32;
+      break;
+   case OP_MERGE:
+      switch (i->dType) {
+      case TYPE_U64:
+      case TYPE_S64:
+      case TYPE_F64:
+         res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
+         break;
+      default:
+         return;
+      }
+      break;
     default:
        return;
     }
@@ -587,24 +697,36 @@ ConstantFolding::expr(Instruction *i,
     i->setSrc(1, NULL);
  
     i->getSrc(0)->reg.data = res.data;
+   i->getSrc(0)->reg.type = type;
+   i->getSrc(0)->reg.size = typeSizeof(type);
  
-   if (i->op == OP_MAD || i->op == OP_FMA) {
-      i->op = OP_ADD;
+   switch (i->op) {
+   case OP_MAD:
+   case OP_FMA: {
+      ImmediateValue src0, src1 = *i->getSrc(0)->asImm();
  
-      i->setSrc(1, i->getSrc(0));
-      i->src(1).mod = i->src(2).mod;
+      // Move the immediate into position 1, where we know it might be
+      // emittable. However it might not be anyways, as there may be other
+      // restrictions, so move it into a separate LValue.
+      bld.setPosition(i, false);
+      i->op = OP_ADD;
+      i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
        i->setSrc(0, i->getSrc(2));
+      i->src(0).mod = i->src(2).mod;
        i->setSrc(2, NULL);
  
-      ImmediateValue src0;
        if (i->src(0).getImmediate(src0))
-         expr(i, src0, *i->getSrc(1)->asImm());
-      if (i->saturate && !prog->getTarget()->isSatSupported(i)) {
-         bld.setPosition(i, false);
-         i->setSrc(1, bld.loadImm(NULL, res.data.u32));
-      }
-   } else {
+         expr(i, src0, src1);
+      else
+         opnd(i, src1, 1);
+      break;
+   }
+   case OP_PFETCH:
+      // Leave PFETCH alone... we just folded its 2 args into 1.
+      break;
+   default:
        i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */
+      break;
     }
     i->subOp = 0;
  }
@@ -628,6 +750,37 @@ ConstantFolding::expr(Instruction *i,
        res.data.u32 = ((a->data.u32 << offset) & bitmask) | (c->data.u32 & ~bitmask);
        break;
     }
+   case OP_MAD:
+   case OP_FMA: {
+      switch (i->dType) {
+      case TYPE_F32:
+         res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor) +
+            c->data.f32;
+         break;
+      case TYPE_F64:
+         res.data.f64 = a->data.f64 * b->data.f64 + c->data.f64;
+         break;
+      case TYPE_S32:
+         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
+            res.data.s32 = ((int64_t)a->data.s32 * b->data.s32 >> 32) + c->data.s32;
+            break;
+         }
+         /* fallthrough */
+      case TYPE_U32:
+         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
+            res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32 >> 32) + c->data.u32;
+            break;
+         }
+         res.data.u32 = a->data.u32 * b->data.u32 + c->data.u32;
+         break;
+      default:
+         return;
+      }
+      break;
+   }
+   case OP_SHLADD:
+      res.data.u32 = (a->data.u32 << b->data.u32) + c->data.u32;
+      break;
     default:
        return;
     }
@@ -642,6 +795,8 @@ ConstantFolding::expr(Instruction *i,
     i->setSrc(2, NULL);
  
     i->getSrc(0)->reg.data = res.data;
+   i->getSrc(0)->reg.type = i->dType;
+   i->getSrc(0)->reg.size = typeSizeof(i->dType);
  
     i->op = OP_MOV;
  }
@@ -742,9 +897,36 @@ ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
     }
  }
  
+// Simplifies a three-source instruction i whose third source is the
+// immediate imm2. When imm2 is integer zero the addend is dropped:
+//    MAD/FMA(a, b, 0) -> MUL(a, b)
+//    SHLADD(a, b, 0)  -> SHL(a, b)
+// Only the opcode and source 2 are changed, and foldCount is incremented for
+// each rewrite. Any other opcode, or a non-zero imm2, leaves i untouched.
+void
+ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
+{
+   switch (i->op) {
+   case OP_MAD:
+   case OP_FMA:
+      if (imm2.isInteger(0)) {
+         i->op = OP_MUL;
+         i->setSrc(2, NULL);
+         foldCount++;
+         return;
+      }
+      break;
+   case OP_SHLADD:
+      if (imm2.isInteger(0)) {
+         i->op = OP_SHL;
+         i->setSrc(2, NULL);
+         foldCount++;
+         return;
+      }
+      break;
+   default:
+      return;
+   }
+}
+
  void
  ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
  {
+   const Target *target = prog->getTarget();
     const int t = !s;
     const operation op = i->op;
     Instruction *newi = i;
@@ -816,6 +998,12 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
           i->src(0).mod = i->src(t).mod;
           i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
           i->src(1).mod = 0;
+      } else
+      if (i->postFactor && i->sType == TYPE_F32) {
+         /* Can't emit a postfactor with an immediate, have to fold it in */
+         i->setSrc(s, new_ImmediateValue(
+                      prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
+         i->postFactor = 0;
        }
        break;
     case OP_MAD:
@@ -828,7 +1016,8 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
           if (i->op != OP_CVT)
              i->src(0).mod = 0;
        } else
-      if (imm0.isInteger(1) || imm0.isInteger(-1)) {
+      if (i->subOp != NV50_IR_SUBOP_MUL_HIGH &&
+          (imm0.isInteger(1) || imm0.isInteger(-1))) {
           if (imm0.isNegative())
              i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
           if (s == 0) {
@@ -839,6 +1028,12 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
           i->src(1).mod = i->src(2).mod;
           i->setSrc(2, NULL);
           i->op = OP_ADD;
+      } else
+      if (s == 1 && !imm0.isNegative() && imm0.isPow2() &&
+          target->isOpSupported(i->op, i->dType)) {
+         i->op = OP_SHLADD;
+         imm0.applyLog2();
+         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
        }
        break;
     case OP_ADD:
@@ -946,56 +1141,104 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
  
     case OP_SET: // TODO: SET_AND,OR,XOR
     {
+      /* This optimizes the case where the output of a set is being compared
+       * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
+       * can be a lot cleverer in our comparison.
+       */
        CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
        CondCode cc, ccZ;
-      if (i->src(t).mod != Modifier(0))
-         return;
-      if (imm0.reg.data.u32 != 0 || !si || si->op != OP_SET)
+      if (imm0.reg.data.u32 != 0 || !si)
           return;
        cc = si->setCond;
        ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
+      // We do everything assuming var (cmp) 0, reverse the condition if 0 is
+      // first.
        if (s == 0)
           ccZ = reverseCondCode(ccZ);
+      // If there is a negative modifier, we need to undo that, by flipping
+      // the comparison to zero.
+      if (i->src(t).mod.neg())
+         ccZ = reverseCondCode(ccZ);
+      // If this is a signed comparison, we expect the input to be a regular
+      // boolean, i.e. 0/-1. However the rest of the logic assumes that true
+      // is positive, so just flip the sign.
+      if (i->sType == TYPE_S32) {
+         assert(!isFloatType(si->dType));
+         ccZ = reverseCondCode(ccZ);
+      }
        switch (ccZ) {
-      case CC_LT: cc = CC_FL; break;
-      case CC_GE: cc = CC_TR; break;
-      case CC_EQ: cc = inverseCondCode(cc); break;
-      case CC_LE: cc = inverseCondCode(cc); break;
-      case CC_GT: break;
-      case CC_NE: break;
+      case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
+      case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
+      case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
+      case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
+      case CC_GT: break; // bool > 0 -- bool
+      case CC_NE: break; // bool != 0 -- bool
        default:
           return;
        }
+
+      // Update the condition of this SET to be identical to the origin set,
+      // but with the updated condition code. The original SET should get
+      // DCE'd, ideally.
+      i->op = si->op;
        i->asCmp()->setCond = cc;
        i->setSrc(0, si->src(0));
        i->setSrc(1, si->src(1));
+      if (si->srcExists(2))
+         i->setSrc(2, si->src(2));
        i->sType = si->sType;
     }
        break;
  
     case OP_AND:
     {
-      CmpInstruction *cmp = i->getSrc(t)->getInsn()->asCmp();
-      if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
-         return;
-      if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
-         return;
-      if (imm0.reg.data.f32 != 1.0)
-         return;
-      if (i->getSrc(t)->getInsn()->dType != TYPE_U32)
-         return;
+      Instruction *src = i->getSrc(t)->getInsn();
+      ImmediateValue imm1;
+      if (imm0.reg.data.u32 == 0) {
+         i->op = OP_MOV;
+         i->setSrc(0, new_ImmediateValue(prog, 0u));
+         i->src(0).mod = Modifier(0);
+         i->setSrc(1, NULL);
+      } else if (imm0.reg.data.u32 == ~0U) {
+         i->op = i->src(t).mod.getOp();
+         if (t) {
+            i->setSrc(0, i->getSrc(t));
+            i->src(0).mod = i->src(t).mod;
+         }
+         i->setSrc(1, NULL);
+      } else if (src->asCmp()) {
+         CmpInstruction *cmp = src->asCmp();
+         if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
+            return;
+         if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
+            return;
+         if (imm0.reg.data.f32 != 1.0)
+            return;
+         if (cmp->dType != TYPE_U32)
+            return;
  
-      i->getSrc(t)->getInsn()->dType = TYPE_F32;
-      if (i->src(t).mod != Modifier(0)) {
-         assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
-         i->src(t).mod = Modifier(0);
-         cmp->setCond = inverseCondCode(cmp->setCond);
-      }
-      i->op = OP_MOV;
-      i->setSrc(s, NULL);
-      if (t) {
-         i->setSrc(0, i->getSrc(t));
-         i->setSrc(t, NULL);
+         cmp->dType = TYPE_F32;
+         if (i->src(t).mod != Modifier(0)) {
+            assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
+            i->src(t).mod = Modifier(0);
+            cmp->setCond = inverseCondCode(cmp->setCond);
+         }
+         i->op = OP_MOV;
+         i->setSrc(s, NULL);
+         if (t) {
+            i->setSrc(0, i->getSrc(t));
+            i->setSrc(t, NULL);
+         }
+      } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
+                 src->op == OP_SHR &&
+                 src->src(1).getImmediate(imm1) &&
+                 i->src(t).mod == Modifier(0) &&
+                 util_is_power_of_two(imm0.reg.data.u32 + 1)) {
+         // low byte = offset, high byte = width
+         uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
+         i->op = OP_EXTBF;
+         i->setSrc(0, src->getSrc(0));
+         i->setSrc(1, new_ImmediateValue(prog, ext));
        }
     }
        break;
@@ -1006,13 +1249,69 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
           break;
        // try to concatenate shifts
        Instruction *si = i->getSrc(0)->getInsn();
-      if (!si || si->op != OP_SHL)
+      if (!si)
           break;
        ImmediateValue imm1;
-      if (si->src(1).getImmediate(imm1)) {
+      switch (si->op) {
+      case OP_SHL:
+         if (si->src(1).getImmediate(imm1)) {
+            bld.setPosition(i, false);
+            i->setSrc(0, si->getSrc(0));
+            i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
+         }
+         break;
+      case OP_SHR:
+         if (si->src(1).getImmediate(imm1) && imm0.reg.data.u32 == imm1.reg.data.u32) {
+            bld.setPosition(i, false);
+            i->op = OP_AND;
+            i->setSrc(0, si->getSrc(0));
+            i->setSrc(1, bld.loadImm(NULL, ~((1 << imm0.reg.data.u32) - 1)));
+         }
+         break;
+      case OP_MUL:
+         int muls;
+         if (isFloatType(si->dType))
+            return;
+         if (si->src(1).getImmediate(imm1))
+            muls = 1;
+         else if (si->src(0).getImmediate(imm1))
+            muls = 0;
+         else
+            return;
+
+         bld.setPosition(i, false);
+         i->op = OP_MUL;
+         i->setSrc(0, si->getSrc(!muls));
+         i->setSrc(1, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
+         break;
+      case OP_SUB:
+      case OP_ADD:
+         int adds;
+         if (isFloatType(si->dType))
+            return;
+         if (si->op != OP_SUB && si->src(0).getImmediate(imm1))
+            adds = 0;
+         else if (si->src(1).getImmediate(imm1))
+            adds = 1;
+         else
+            return;
+         if (si->src(!adds).mod != Modifier(0))
+            return;
+         // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))
+
+         // This is more operations, but if one of x, y is an immediate, then
+         // we can get a situation where (a) we can use ISCADD, or (b)
+         // propagate the add bit into an indirect load.
           bld.setPosition(i, false);
-         i->setSrc(0, si->getSrc(0));
-         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
+         i->op = si->op;
+         i->setSrc(adds, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
+         i->setSrc(!adds, bld.mkOp2v(OP_SHL, i->dType,
+                                     bld.getSSA(i->def(0).getSize(), i->def(0).getFile()),
+                                     si->getSrc(!adds),
+                                     bld.mkImm(imm0.reg.data.u32)));
+         break;
+      default:
+         return;
        }
     }
        break;
@@ -1058,6 +1357,115 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
        i->op = OP_MOV;
        break;
     }
+   case OP_CVT: {
+      Storage res;
+
+      // TODO: handle 64-bit values properly
+      if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
+         return;
+
+      // TODO: handle single byte/word extractions
+      if (i->subOp)
+         return;
+
+      bld.setPosition(i, true); /* make sure bld is init'ed */
+
+#define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
+   case type: \
+      switch (i->sType) { \
+      case TYPE_F64: \
+         res.data.dst = util_iround(i->saturate ? \
+                                    CLAMP(imm0.reg.data.f64, fmin, fmax) : \
+                                    imm0.reg.data.f64); \
+         break; \
+      case TYPE_F32: \
+         res.data.dst = util_iround(i->saturate ? \
+                                    CLAMP(imm0.reg.data.f32, fmin, fmax) : \
+                                    imm0.reg.data.f32); \
+         break; \
+      case TYPE_S32: \
+         res.data.dst = i->saturate ? \
+                        CLAMP(imm0.reg.data.s32, imin, imax) : \
+                        imm0.reg.data.s32; \
+         break; \
+      case TYPE_U32: \
+         res.data.dst = i->saturate ? \
+                        CLAMP(imm0.reg.data.u32, umin, umax) : \
+                        imm0.reg.data.u32; \
+         break; \
+      case TYPE_S16: \
+         res.data.dst = i->saturate ? \
+                        CLAMP(imm0.reg.data.s16, imin, imax) : \
+                        imm0.reg.data.s16; \
+         break; \
+      case TYPE_U16: \
+         res.data.dst = i->saturate ? \
+                        CLAMP(imm0.reg.data.u16, umin, umax) : \
+                        imm0.reg.data.u16; \
+         break; \
+      default: return; \
+      } \
+      i->setSrc(0, bld.mkImm(res.data.dst)); \
+      break
+
+      switch(i->dType) {
+      CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
+      CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
+      CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
+      CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
+      case TYPE_F32:
+         switch (i->sType) {
+         case TYPE_F64:
+            res.data.f32 = i->saturate ?
+               CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
+               imm0.reg.data.f64;
+            break;
+         case TYPE_F32:
+            res.data.f32 = i->saturate ?
+               CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
+               imm0.reg.data.f32;
+            break;
+         case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
+         case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
+         case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
+         case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
+         default:
+            return;
+         }
+         i->setSrc(0, bld.mkImm(res.data.f32));
+         break;
+      case TYPE_F64:
+         switch (i->sType) {
+         case TYPE_F64:
+            res.data.f64 = i->saturate ?
+               CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
+               imm0.reg.data.f64;
+            break;
+         case TYPE_F32:
+            res.data.f64 = i->saturate ?
+               CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
+               imm0.reg.data.f32;
+            break;
+         case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
+         case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
+         case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
+         case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
+         default:
+            return;
+         }
+         i->setSrc(0, bld.mkImm(res.data.f64));
+         break;
+      default:
+         return;
+      }
+#undef CASE
+
+      i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
+      i->op = OP_MOV;
+      i->saturate = 0;
+      i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
+      break;
+   }
     default:
        return;
     }
@@ -1164,8 +1572,11 @@ private:
     void handleRCP(Instruction *);
     void handleSLCT(Instruction *);
     void handleLOGOP(Instruction *);
-   void handleCVT(Instruction *);
+   void handleCVT_NEG(Instruction *);
+   void handleCVT_CVT(Instruction *);
+   void handleCVT_EXTBF(Instruction *);
     void handleSUCLAMP(Instruction *);
+   void handleNEG(Instruction *);
  
     BuildUtil bld;
  };
@@ -1256,14 +1667,15 @@ AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
     else
        return false;
  
-   if ((src0->getUniqueInsn() && src0->getUniqueInsn()->bb != add->bb) ||
-       (src1->getUniqueInsn() && src1->getUniqueInsn()->bb != add->bb))
-      return false;
-
     src = add->getSrc(s);
  
-   if (src->getInsn()->postFactor)
+   if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb)
        return false;
+
+   if (src->getInsn()->saturate || src->getInsn()->postFactor ||
+       src->getInsn()->dnz)
+      return false;
+
     if (toOp == OP_SAD) {
        ImmediateValue imm;
        if (!src->getInsn()->src(2).getImmediate(imm))
@@ -1272,6 +1684,10 @@ AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
           return false;
     }
  
+   if (typeSizeof(add->dType) != typeSizeof(src->getInsn()->dType) ||
+       isFloatType(add->dType) != isFloatType(src->getInsn()->dType))
+      return false;
+
     mod[0] = add->src(0).mod;
     mod[1] = add->src(1).mod;
     mod[2] = src->getUniqueInsn()->src(0).mod;
@@ -1282,6 +1698,8 @@ AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
  
     add->op = toOp;
     add->subOp = src->getInsn()->subOp; // potentially mul-high
+   add->dType = src->getInsn()->dType; // sign matters for imad hi
+   add->sType = src->getInsn()->sType;
  
     add->setSrc(2, add->src(s ? 0 : 1));
  
@@ -1415,12 +1833,12 @@ AlgebraicOpt::handleLOGOP(Instruction *logop)
  // nv50:
  //  F2I(NEG(I2F(ABS(SET))))
  void
-AlgebraicOpt::handleCVT(Instruction *cvt)
+AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
  {
+   Instruction *insn = cvt->getSrc(0)->getInsn();
     if (cvt->sType != TYPE_F32 ||
         cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
        return;
-   Instruction *insn = cvt->getSrc(0)->getInsn();
     if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
        return;
     if (insn->src(0).mod != Modifier(0))
@@ -1450,6 +1868,148 @@ AlgebraicOpt::handleCVT(Instruction *cvt)
     delete_Instruction(prog, cvt);
  }
  
+// F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
+// does a type conversion, this becomes trickier as there might be range
+// changes/etc. We could handle those in theory as long as the range was being
+// reduced or kept the same.
+//
+// cvt is the outer conversion. Its source's defining instruction is merged
+// into it only when that instruction is a CEIL/FLOOR/TRUNC/CVT that keeps the
+// type (dType == sType == cvt->sType) and has neither saturate nor a subOp.
+// On success cvt takes over the inner rounding mode, source, source modifier
+// and source type; the inner instruction itself is not removed here.
+void
+AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
+{
+   Instruction *insn = cvt->getSrc(0)->getInsn();
+
+   // The source may not be defined by any instruction; handleCVT_NEG and
+   // handleCVT_EXTBF guard against a NULL getInsn() result in the same way.
+   // This check must come before any field of insn is read.
+   if (!insn)
+      return;
+
+   if (insn->saturate ||
+       insn->subOp ||
+       insn->dType != insn->sType ||
+       insn->dType != cvt->sType)
+      return;
+
+   // Start from the inner instruction's own rounding mode (used as-is for
+   // an inner CVT); the rounding ops below override it.
+   RoundMode rnd = insn->rnd;
+
+   switch (insn->op) {
+   case OP_CEIL:
+      rnd = ROUND_PI;
+      break;
+   case OP_FLOOR:
+      rnd = ROUND_MI;
+      break;
+   case OP_TRUNC:
+      rnd = ROUND_ZI;
+      break;
+   case OP_CVT:
+      break;
+   default:
+      return;
+   }
+
+   // Unless both ends are float, keep only the low two bits of the mode
+   // (presumably this strips the "round to integer" variant, which is only
+   // meaningful for float -> float -- TODO confirm against RoundMode).
+   if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
+      rnd = (RoundMode)(rnd & 3);
+
+   cvt->rnd = rnd;
+   cvt->setSrc(0, insn->getSrc(0));
+   // Compose the outer source modifier with the inner one.
+   cvt->src(0).mod *= insn->src(0).mod;
+   cvt->sType = insn->sType;
+}
+
+// Some shaders extract packed bytes out of words and convert them to
+// e.g. float. The Fermi+ CVT instruction can extract those directly, as can
+// nv50 for word sizes.
+//
+// CVT(EXTBF(x, byte/word))
+// CVT(AND(bytemask, x))
+// CVT(AND(bytemask, SHR(x, 8/16/24)))
+// CVT(SHR(x, 16/24))
+//
+// On a match, cvt is rewritten in place: its source becomes x, its sType
+// becomes the 8- or 16-bit type of matching signedness, and its subOp is set
+// to the byte index (bit offset / 8) of the extracted field. Only 32-bit
+// integer source types are considered.
+// NOTE(review): source modifiers on cvt and on the matched AND/SHR/EXTBF
+// operands are not examined here -- confirm none can be present.
+void
+AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
+{
+   Instruction *insn = cvt->getSrc(0)->getInsn();
+   ImmediateValue imm;
+   Value *arg = NULL;
+   // width/offset (in bits) are only read once arg is non-NULL, and every
+   // branch below that sets arg also sets both of them.
+   unsigned width, offset;
+   if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
+      return;
+   if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
+      // EXTBF immediate: bits 8..15 hold the width, bits 0..7 the offset.
+      width = (imm.reg.data.u32 >> 8) & 0xff;
+      offset = imm.reg.data.u32 & 0xff;
+      arg = insn->getSrc(0);
+
+      // Only byte/word fields aligned to their own size are accepted.
+      if (width != 8 && width != 16)
+         return;
+      if (width == 8 && offset & 0x7)
+         return;
+      if (width == 16 && offset & 0xf)
+         return;
+   } else if (insn->op == OP_AND) {
+      // s is the index of the AND's immediate (mask) operand.
+      int s;
+      if (insn->src(0).getImmediate(imm))
+         s = 0;
+      else if (insn->src(1).getImmediate(imm))
+         s = 1;
+      else
+         return;
+
+      if (imm.reg.data.u32 == 0xff)
+         width = 8;
+      else if (imm.reg.data.u32 == 0xffff)
+         width = 16;
+      else
+         return;
+
+      arg = insn->getSrc(!s);
+      Instruction *shift = arg->getInsn();
+      offset = 0;
+      // If the masked value is a right shift by a suitably aligned constant,
+      // look through it and use the shift amount as the field offset.
+      if (shift && shift->op == OP_SHR &&
+          shift->sType == cvt->sType &&
+          shift->src(1).getImmediate(imm) &&
+          ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
+           (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
+         arg = shift->getSrc(0);
+         offset = imm.reg.data.u32;
+      }
+      // We just AND'd the high bits away, which means this is effectively an
+      // unsigned value.
+      cvt->sType = TYPE_U32;
+   } else if (insn->op == OP_SHR &&
+              insn->sType == cvt->sType &&
+              insn->src(1).getImmediate(imm)) {
+      // A bare shift only isolates a field when it leaves exactly the top
+      // byte (shift by 24) or the top word (shift by 16).
+      arg = insn->getSrc(0);
+      if (imm.reg.data.u32 == 24) {
+         width = 8;
+         offset = 24;
+      } else if (imm.reg.data.u32 == 16) {
+         width = 16;
+         offset = 16;
+      } else {
+         return;
+      }
+   }
+
+   // No pattern matched (this includes an EXTBF without an immediate).
+   if (!arg)
+      return;
+
+   // Irrespective of what came earlier, we can undo a shift on the argument
+   // by adjusting the offset.
+   Instruction *shift = arg->getInsn();
+   if (shift && shift->op == OP_SHL &&
+       shift->src(1).getImmediate(imm) &&
+       ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
+        (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
+       imm.reg.data.u32 <= offset) {
+      arg = shift->getSrc(0);
+      offset -= imm.reg.data.u32;
+   }
+
+   // The unpackSnorm lowering still leaves a few shifts behind, but it's too
+   // annoying to detect them.
+
+   // Narrow the source type to the field width, keeping the signedness that
+   // cvt->sType has at this point.
+   if (width == 8) {
+      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
+   } else {
+      assert(width == 16);
+      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
+   }
+   cvt->setSrc(0, arg);
+   // Bit offset -> byte index.
+   cvt->subOp = offset >> 3;
+}
+
  // SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
  void
  AlgebraicOpt::handleSUCLAMP(Instruction *insn)
@@ -1491,6 +2051,34 @@ AlgebraicOpt::handleSUCLAMP(Instruction *insn)
     insn->setSrc(0, add->getSrc(s));
  }
  
+// NEG(AND(SET, 1)) -> SET
+// Integer only: bails on float NEG, float-typed SET, or an AND without imm 1.
+void
+AlgebraicOpt::handleNEG(Instruction *i) {
+   Instruction *src = i->getSrc(0)->getInsn();
+   ImmediateValue imm;
+   int b; // index of the AND operand that is not the immediate
+
+   if (isFloatType(i->sType) || !src || src->op != OP_AND)
+      return;
+
+   if (src->src(0).getImmediate(imm))
+      b = 1;
+   else if (src->src(1).getImmediate(imm))
+      b = 0;
+   else
+      return;
+   if (!imm.isInteger(1))
+      return;
+
+   Instruction *set = src->getSrc(b)->getInsn(); // NULL if no defining insn
+   if (set && (set->op == OP_SET || set->op == OP_SET_AND ||
+       set->op == OP_SET_OR || set->op == OP_SET_XOR) &&
+       !isFloatType(set->dType)) {
+      i->def(0).replace(set->getDef(0), false);
+   }
+}
+
  bool
  AlgebraicOpt::visit(BasicBlock *bb)
  {
@@ -1520,11 +2108,17 @@ AlgebraicOpt::visit(BasicBlock *bb)
           handleLOGOP(i);
           break;
        case OP_CVT:
-         handleCVT(i);
+         handleCVT_NEG(i);
+         handleCVT_CVT(i);
+         if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
+             handleCVT_EXTBF(i);
           break;
        case OP_SUCLAMP:
           handleSUCLAMP(i);
           break;
+      case OP_NEG:
+         handleNEG(i);
+         break;
        default:
           break;
        }
@@ -1646,6 +2240,9 @@ MemoryOpt::combineLd(Record *rec, Instruction *ld)
     if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
         ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
        return false;
+   // for compute indirect loads are not guaranteed to be aligned
+   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
+      return false;
  
     assert(sizeRc + sizeLd <= 16 && offRc != offLd);
  
@@ -1698,8 +2295,12 @@ MemoryOpt::combineSt(Record *rec, Instruction *st)
     if (!prog->getTarget()->
         isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
        return false;
+   // no unaligned stores
     if (size == 8 && MIN2(offRc, offSt) & 0x7)
        return false;
+   // for compute indirect stores are not guaranteed to be aligned
+   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
+      return false;
  
     st->takeExtraSources(0, extra); // save predicate and indirect address
  
@@ -1908,7 +2509,7 @@ MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
        // get non-replaced sources after values covered by st
        for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
           vals[k++] = ri->getSrc(s);
-      assert((unsigned int)k <= Elements(vals));
+      assert((unsigned int)k <= ARRAY_SIZE(vals));
        for (s = 0; s < k; ++s)
           st->setSrc(s + 1, vals[s]);
        st->setSrc(0, ri->getSrc(0));
@@ -2010,6 +2611,12 @@ MemoryOpt::runOpt(BasicBlock *bb)
           }
        } else
        if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
+         if (typeSizeof(ldst->dType) == 4 &&
+             ldst->src(1).getFile() == FILE_GPR &&
+             ldst->getSrc(1)->getInsn()->op == OP_NOP) {
+            delete_Instruction(prog, ldst);
+            continue;
+         }
           isLoad = false;
        } else {
           // TODO: maybe have all fixed ops act as barrier ?
@@ -2037,6 +2644,8 @@ MemoryOpt::runOpt(BasicBlock *bb)
        }
        if (ldst->getPredicate()) // TODO: handle predicated ld/st
           continue;
+      if (ldst->perPatch) // TODO: create separate per-patch lists
+         continue;
  
        if (isLoad) {
           DataFile file = ldst->src(0).getFile();
@@ -2088,6 +2697,7 @@ MemoryOpt::runOpt(BasicBlock *bb)
  class FlatteningPass : public Pass
  {
  private:
+   virtual bool visit(Function *);
     virtual bool visit(BasicBlock *);
  
     bool tryPredicateConditional(BasicBlock *);
@@ -2096,6 +2706,8 @@ private:
     inline bool isConstantCondition(Value *pred);
     inline bool mayPredicate(const Instruction *, const Value *pred) const;
     inline void removeFlow(Instruction *);
+
+   uint8_t gpr_unit;
  };
  
  bool
@@ -2117,9 +2729,15 @@ FlatteningPass::isConstantCondition(Value *pred)
           file = ld->src(0).getFile();
        } else {
           file = insn->src(s).getFile();
-         // catch $r63 on NVC0
-         if (file == FILE_GPR && insn->getSrc(s)->reg.data.id > prog->maxGPR)
-            file = FILE_IMMEDIATE;
+         // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
+         // in register "units", which can vary between targets.
+         if (file == FILE_GPR) {
+            Value *v = insn->getSrc(s);
+            int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
+            int units = bytes >> gpr_unit;
+            if (units > prog->maxGPR)
+               file = FILE_IMMEDIATE;
+         }
        }
        if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
           return false;
@@ -2224,6 +2842,14 @@ FlatteningPass::tryPropagateBranch(BasicBlock *bb)
     }
  }
  
+bool
+FlatteningPass::visit(Function *fn) {
+   // isConstantCondition() shifts register byte offsets right by this unit.
+   const Target *const target = prog->getTarget();
+   gpr_unit = target->getFileUnit(FILE_GPR);
+   return true;
+}
+
  bool
  FlatteningPass::visit(BasicBlock *bb)
  {
@@ -2242,8 +2868,8 @@ FlatteningPass::visit(BasicBlock *bb)
               !isSurfaceOp(insn->op) && // not confirmed
               insn->op != OP_LINTERP && // probably just nve4
               insn->op != OP_PINTERP && // probably just nve4
-             ((insn->op != OP_LOAD && insn->op != OP_STORE) ||
-              typeSizeof(insn->dType) <= 4) &&
+             ((insn->op != OP_LOAD && insn->op != OP_STORE && insn->op != OP_ATOM) ||
+              (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
               !insn->isNop()) {
              insn->join = 1;
              bb->remove(bb->getExit());
@@ -2330,6 +2956,15 @@ private:
     virtual bool visit(BasicBlock *);
  };
  
+// True when refCount() is zero for every existing definition of i.
+static bool
+post_ra_dead(Instruction *i) {
+   int idx = 0;
+   while (i->defExists(idx) && i->getDef(idx)->refCount() == 0)
+      ++idx;
+   return !i->defExists(idx);
+}
+
  bool
  NV50PostRaConstantFolding::visit(BasicBlock *bb)
  {
@@ -2343,20 +2978,48 @@ NV50PostRaConstantFolding::visit(BasicBlock *bb)
               i->src(0).getFile() != FILE_GPR ||
               i->src(1).getFile() != FILE_GPR ||
               i->src(2).getFile() != FILE_GPR ||
-             i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id ||
-             !isFloatType(i->dType))
+             i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
+            break;
+
+         if (i->getDef(0)->reg.data.id >= 64 ||
+             i->getSrc(0)->reg.data.id >= 64)
+            break;
+
+         if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
+            break;
+
+         if (i->getPredicate())
              break;
  
           def = i->getSrc(1)->getInsn();
-         if (def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
+         if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
+            def = def->getSrc(0)->getInsn();
+         if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
              vtmp = i->getSrc(1);
-            i->setSrc(1, def->getSrc(0));
+            if (isFloatType(i->sType)) {
+               i->setSrc(1, def->getSrc(0));
+            } else {
+               ImmediateValue val;
+               bool ret = def->src(0).getImmediate(val);
+               assert(ret);
+               if (i->getSrc(1)->reg.data.id & 1)
+                  val.reg.data.u32 >>= 16;
+               val.reg.data.u32 &= 0xffff;
+               i->setSrc(1, new_ImmediateValue(bb->getProgram(), val.reg.data.u32));
+            }
  
              /* There's no post-RA dead code elimination, so do it here
               * XXX: if we add more code-removing post-RA passes, we might
               *      want to create a post-RA dead-code elim pass */
-            if (vtmp->refCount() == 0)
-               delete_Instruction(bb->getProgram(), def);
+            if (post_ra_dead(vtmp->getInsn())) {
+               Value *src = vtmp->getInsn()->getSrc(0);
+               // Careful -- splits will have already been removed from the
+               // functions. Don't double-delete.
+               if (vtmp->getInsn()->bb)
+                  delete_Instruction(prog, vtmp->getInsn());
+               if (src->getInsn() && post_ra_dead(src->getInsn()))
+                  delete_Instruction(prog, src->getInsn());
+            }
  
              break;
           }
@@ -2465,11 +3128,13 @@ Instruction::isResultEqual(const Instruction *that) const
     if (that->srcExists(s))
        return false;
  
-   if (op == OP_LOAD || op == OP_VFETCH) {
+   if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
        switch (src(0).getFile()) {
        case FILE_MEMORY_CONST:
        case FILE_SHADER_INPUT:
           return true;
+      case FILE_SHADER_OUTPUT:
+         return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
        default:
           return false;
        }
@@ -2494,6 +3159,8 @@ GlobalCSE::visit(BasicBlock *bb)
        ik = phi->getSrc(0)->getInsn();
        if (!ik)
           continue; // probably a function input
+      if (ik->defCount(0xff) > 1)
+         continue; // too painful to check if we can really push this forward
        for (s = 1; phi->srcExists(s); ++s) {
           if (phi->getSrc(s)->refCount() > 1)
              break;
@@ -2627,27 +3294,43 @@ DeadCodeElim::buryAll(Program *prog)
  bool
  DeadCodeElim::visit(BasicBlock *bb)
  {
-   Instruction *next;
+   Instruction *prev; // saved before i may be deleted in the loop body
  
-   for (Instruction *i = bb->getFirst(); i; i = next) {
-      next = i->next;
+   for (Instruction *i = bb->getExit(); i; i = prev) { // start at bb->getExit() and follow prev links
+      prev = i->prev;
        if (i->isDead()) {
           ++deadCount;
           delete_Instruction(prog, i);
        } else
-      if (i->defExists(1) && (i->op == OP_VFETCH || i->op == OP_LOAD)) {
+      if (i->defExists(1) &&
+          i->subOp == 0 && // loads carrying a subOp are never handed to checkSplitLoad
+          (i->op == OP_VFETCH || i->op == OP_LOAD)) {
           checkSplitLoad(i);
        } else
        if (i->defExists(0) && !i->getDef(0)->refCount()) {
           if (i->op == OP_ATOM ||
               i->op == OP_SUREDP ||
-             i->op == OP_SUREDB)
+             i->op == OP_SUREDB) {
              i->setDef(0, NULL);
+         } else if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) { // locked load whose def 0 is unreferenced
+            i->setDef(0, i->getDef(1)); // def 1 takes over slot 0
+            i->setDef(1, NULL);
+         }
        }
     }
     return true;
  }
  
+// Each load can go into up to 4 destinations, any of which might potentially
+// be dead (i.e. a hole). These can always be split into 2 loads, independent
+// of where the holes are. We find the first contiguous region, put it into
+// the first load, and then put the second contiguous region into the second
+// load. There can be at most 2 contiguous regions.
+//
+// Note that there are some restrictions, for example it's not possible to do
+// a 64-bit load that's not 64-bit aligned, so such a load has to be split
+// up. Also hardware doesn't support 96-bit loads, so those also have to be
+// split into a 64-bit and 32-bit load.
  void
  DeadCodeElim::checkSplitLoad(Instruction *ld1)
  {
@@ -2668,6 +3351,8 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1)
     addr1 = ld1->getSrc(0)->reg.data.offset;
     n1 = n2 = 0;
     size1 = size2 = 0;
+
+   // Compute address/width for first load
     for (d = 0; ld1->defExists(d); ++d) {
        if (mask & (1 << d)) {
           if (size1 && (addr1 & 0x7))
@@ -2681,16 +3366,34 @@ DeadCodeElim::checkSplitLoad(Instruction *ld1)
           break;
        }
     }
+
+   // Scale back the size of the first load until it can be loaded. This
+   // typically happens for TYPE_B96 loads.
+   while (n1 &&
+          !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
+                                                typeOfSize(size1))) {
+      size1 -= def1[--n1]->reg.size;
+      d--;
+   }
+
+   // Compute address/width for second load
     for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
        if (mask & (1 << d)) {
+         assert(!size2 || !(addr2 & 0x7));
           def2[n2] = ld1->getDef(d);
           size2 += def2[n2++]->reg.size;
-      } else {
+      } else if (!n2) {
           assert(!n2);
           addr2 += ld1->getDef(d)->reg.size;
+      } else {
+         break;
        }
     }
  
+   // Make sure that we've processed all the values
+   for (; ld1->defExists(d); ++d)
+      assert(!(mask & (1 << d)));
+
     updateLdStOffset(ld1, addr1, func);
     ld1->setType(typeOfSize(size1));
     for (d = 0; d < 4; ++d)
@@ -2731,6 +3434,7 @@ Program::optimizeSSA(int level)
     RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
     RUN_PASS(1, ConstantFolding, foldAll);
     RUN_PASS(1, LoadPropagation, run);
+   RUN_PASS(1, IndirectPropagation, run);
     RUN_PASS(2, MemoryOpt, run);
     RUN_PASS(2, LocalCSE, run);
     RUN_PASS(0, DeadCodeElim, buryAll);