From 57594065c30feec9376be9b2132659f7d87362ee Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Wed, 14 Sep 2011 16:18:23 +0200 Subject: [PATCH] nv50/ir: import new shader backend code --- src/gallium/drivers/nv50/Makefile | 2 +- src/gallium/drivers/nv50/Makefile.sources | 14 + src/gallium/drivers/nv50/codegen/nv50_ir.cpp | 1008 ++++++++ src/gallium/drivers/nv50/codegen/nv50_ir.h | 1049 ++++++++ .../drivers/nv50/codegen/nv50_ir_bb.cpp | 409 +++ .../nv50/codegen/nv50_ir_build_util.cpp | 501 ++++ .../drivers/nv50/codegen/nv50_ir_build_util.h | 245 ++ .../drivers/nv50/codegen/nv50_ir_driver.h | 149 ++ .../nv50/codegen/nv50_ir_emit_nv50.cpp | 1333 ++++++++++ .../nv50/codegen/nv50_ir_from_tgsi.cpp | 2288 +++++++++++++++++ .../drivers/nv50/codegen/nv50_ir_graph.cpp | 381 +++ .../drivers/nv50/codegen/nv50_ir_graph.h | 207 ++ .../drivers/nv50/codegen/nv50_ir_inlines.h | 328 +++ .../drivers/nv50/codegen/nv50_ir_peephole.cpp | 2192 ++++++++++++++++ .../drivers/nv50/codegen/nv50_ir_print.cpp | 558 ++++ .../drivers/nv50/codegen/nv50_ir_ra.cpp | 963 +++++++ .../drivers/nv50/codegen/nv50_ir_ssa.cpp | 463 ++++ .../drivers/nv50/codegen/nv50_ir_target.cpp | 304 +++ .../drivers/nv50/codegen/nv50_ir_target.h | 164 ++ .../drivers/nv50/codegen/nv50_ir_util.cpp | 253 ++ .../drivers/nv50/codegen/nv50_ir_util.h | 585 +++++ src/gallium/drivers/nvc0/Makefile | 2 +- src/gallium/drivers/nvc0/Makefile.sources | 5 + .../nvc0/codegen/nv50_ir_emit_nvc0.cpp | 1714 ++++++++++++ .../nvc0/codegen/nv50_ir_lowering_nvc0.cpp | 705 +++++ .../nvc0/codegen/nv50_ir_target_nvc0.cpp | 568 ++++ .../nvc0/codegen/nv50_ir_target_nvc0.h | 46 + src/gallium/targets/gbm/Makefile | 1 + 28 files changed, 16435 insertions(+), 2 deletions(-) create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir.cpp create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir.h create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_bb.cpp create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_build_util.cpp create 
mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_driver.h create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_emit_nv50.cpp create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_graph.cpp create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_graph.h create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_inlines.h create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_ssa.cpp create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_target.h create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_util.cpp create mode 100644 src/gallium/drivers/nv50/codegen/nv50_ir_util.h create mode 100644 src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp create mode 100644 src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp create mode 100644 src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp create mode 100644 src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.h diff --git a/src/gallium/drivers/nv50/Makefile b/src/gallium/drivers/nv50/Makefile index 220adf696b3..18e30b0a54f 100644 --- a/src/gallium/drivers/nv50/Makefile +++ b/src/gallium/drivers/nv50/Makefile @@ -3,7 +3,7 @@ include $(TOP)/configs/current LIBNAME = nv50 -# get C_SOURCES +# get C/CPP_SOURCES include Makefile.sources LIBRARY_INCLUDES = \ diff --git a/src/gallium/drivers/nv50/Makefile.sources b/src/gallium/drivers/nv50/Makefile.sources index 756f90be979..cc9321bef7e 100644 --- a/src/gallium/drivers/nv50/Makefile.sources +++ b/src/gallium/drivers/nv50/Makefile.sources @@ -21,3 +21,17 @@ C_SOURCES := \ nv50_pc_regalloc.c \ nv50_push.c \ 
nv50_query.c + +CPP_SOURCES := \ + codegen/nv50_ir.cpp \ + codegen/nv50_ir_bb.cpp \ + codegen/nv50_ir_build_util.cpp \ + codegen/nv50_ir_emit_nv50.cpp \ + codegen/nv50_ir_from_tgsi.cpp \ + codegen/nv50_ir_graph.cpp \ + codegen/nv50_ir_peephole.cpp \ + codegen/nv50_ir_print.cpp \ + codegen/nv50_ir_ra.cpp \ + codegen/nv50_ir_ssa.cpp \ + codegen/nv50_ir_target.cpp \ + codegen/nv50_ir_util.cpp diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir.cpp new file mode 100644 index 00000000000..ff2e6ef3401 --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir.cpp @@ -0,0 +1,1008 @@ + +#include "nv50_ir.h" +#include "nv50_ir_target.h" +#include "nv50_ir_driver.h" + +extern "C" { +#include "nv50/nv50_program.h" +#include "nv50/nv50_debug.h" +} + +namespace nv50_ir { + +Modifier::Modifier(operation op) +{ + switch (op) { + case OP_NEG: bits = NV50_IR_MOD_NEG; break; + case OP_ABS: bits = NV50_IR_MOD_ABS; break; + case OP_SAT: bits = NV50_IR_MOD_SAT; break; + case OP_NOP: bits = NV50_IR_MOD_NOT; break; + default: + bits = 0; + break; + } +} + +Modifier Modifier::operator*(const Modifier m) const +{ + unsigned int a, b, c; + + b = m.bits; + if (this->bits & NV50_IR_MOD_ABS) + b &= ~NV50_IR_MOD_NEG; + + a = (this->bits ^ b) & (NV50_IR_MOD_NOT | NV50_IR_MOD_NEG); + c = (this->bits | m.bits) & (NV50_IR_MOD_ABS | NV50_IR_MOD_SAT); + + return Modifier(a | c); +} + +ValueRef::ValueRef() : value(0), insn(0), next(this), prev(this) +{ + indirect[0] = -1; + indirect[1] = -1; + usedAsPtr = false; +} + +ValueRef::~ValueRef() +{ + this->set(NULL); +} + +ImmediateValue *ValueRef::getImmediate() const +{ + Value *src = value; + + while (src) { + if (src->reg.file == FILE_IMMEDIATE) + return src->asImm(); + + Instruction *insn = src->getUniqueInsn(); + + src = (insn && insn->op == OP_MOV) ? 
insn->getSrc(0) : NULL; + } + return NULL; +} + +ValueDef::ValueDef() : value(0), insn(0), next(this), prev(this) +{ + // nothing to do +} + +ValueDef::~ValueDef() +{ + this->set(NULL); +} + +void +ValueRef::set(const ValueRef &ref) +{ + this->set(ref.get()); + mod = ref.mod; + indirect[0] = ref.indirect[0]; + indirect[1] = ref.indirect[1]; +} + +void +ValueRef::set(Value *refVal) +{ + if (value == refVal) + return; + if (value) { + if (value->uses == this) + value->uses = (next == this) ? NULL : next; + value->unref(); + DLLIST_DEL(this); + } + + if (refVal) { + if (refVal->uses) + DLLIST_ADDTAIL(refVal->uses, this); + else + refVal->uses = this; + refVal->ref(); + } + value = refVal; +} + +void +ValueDef::set(Value *defVal) +{ + assert(next != this || prev == this); // check that SSA hack isn't active + + if (value == defVal) + return; + if (value) { + if (value->defs == this) + value->defs = (next == this) ? NULL : next; + DLLIST_DEL(this); + } + + if (defVal) { + if (defVal->defs) + DLLIST_ADDTAIL(defVal->defs, this); + else + defVal->defs = this; + } + value = defVal; +} + +// TODO: make me faster by using a safe iterator +void +ValueDef::replace(Value *repVal, bool doSet) +{ + ValueRef **refs = new ValueRef * [value->refCount()]; + int n = 0; + + if (!refs && value->refCount()) + FATAL("memory allocation failed"); + + for (ValueRef::Iterator iter = value->uses->iterator(); !iter.end(); + iter.next()) { + assert(n < value->refCount()); + refs[n++] = iter.get(); + } + while (n) + refs[--n]->set(repVal); + + if (doSet) + this->set(repVal); + + if (refs) + delete[] refs; +} + +void +ValueDef::mergeDefs(ValueDef *join) +{ + DLLIST_MERGE(this, join, ValueDef *); +} + +Value::Value() +{ + refCnt = 0; + uses = NULL; + defs = NULL; + join = this; + + memset(&reg, 0, sizeof(reg)); + reg.size = 4; +} + +bool +Value::coalesce(Value *jval, bool force) +{ + Value *repr = this->join; // new representative + Value *jrep = jval->join; + + if (reg.file != jval->reg.file || 
reg.size != jval->reg.size) { + if (!force) + return false; + ERROR("forced coalescing of values of different sizes/files"); + } + + if (!force && (repr->reg.data.id != jrep->reg.data.id)) { + if (repr->reg.data.id >= 0 && + jrep->reg.data.id >= 0) + return false; + if (jrep->reg.data.id >= 0) { + repr = jval->join; + jrep = this->join; + jval = this; + } + + // need to check all fixed register values of the program for overlap + Function *func = defs->getInsn()->bb->getFunction(); + + // TODO: put values in by register-id bins per function + ArrayList::Iterator iter = func->allLValues.iterator(); + for (; !iter.end(); iter.next()) { + Value *fixed = reinterpret_cast<Value *>(iter.get()); + assert(fixed); + if (fixed->reg.data.id == repr->reg.data.id) + if (fixed->livei.overlaps(jrep->livei)) + return false; + } + } + if (repr->livei.overlaps(jrep->livei)) { + if (!force) + return false; + // do we really want this ? if at all, only for constraint ops + INFO("NOTE: forced coalescing with live range overlap\n"); + } + + ValueDef::Iterator iter = jrep->defs->iterator(); + for (; !iter.end(); iter.next()) + iter.get()->get()->join = repr; + + repr->defs->mergeDefs(jrep->defs); + repr->livei.unify(jrep->livei); + + assert(repr->join == repr && jval->join == repr); + return true; +} + +LValue::LValue(Function *fn, DataFile file) +{ + reg.file = file; + reg.size = (file != FILE_PREDICATE) ? 
4 : 1; + reg.data.id = -1; + + affinity = -1; + + fn->add(this, this->id); +} + +LValue::LValue(Function *fn, LValue *lval) +{ + assert(lval); + + reg.file = lval->reg.file; + reg.size = lval->reg.size; + reg.data.id = -1; + + affinity = -1; + + fn->add(this, this->id); +} + +Value *LValue::clone(Function *func) const +{ + LValue *that = new_LValue(func, reg.file); + + that->reg.size = this->reg.size; + that->reg.type = this->reg.type; + that->reg.data = this->reg.data; + + return that; +} + +Symbol::Symbol(Program *prog, DataFile f, ubyte fidx) +{ + baseSym = NULL; + + reg.file = f; + reg.fileIndex = fidx; + reg.data.offset = 0; + + prog->add(this, this->id); +} + +Value * +Symbol::clone(Function *func) const +{ + Program *prog = func->getProgram(); + + Symbol *that = new_Symbol(prog, reg.file, reg.fileIndex); + + that->reg.size = this->reg.size; + that->reg.type = this->reg.type; + that->reg.data = this->reg.data; + + that->baseSym = this->baseSym; + + return that; +} + +ImmediateValue::ImmediateValue(Program *prog, uint32_t uval) +{ + memset(&reg, 0, sizeof(reg)); + + reg.file = FILE_IMMEDIATE; + reg.size = 4; + reg.type = TYPE_U32; + + reg.data.u32 = uval; + + prog->add(this, this->id); +} + +ImmediateValue::ImmediateValue(Program *prog, float fval) +{ + memset(&reg, 0, sizeof(reg)); + + reg.file = FILE_IMMEDIATE; + reg.size = 4; + reg.type = TYPE_F32; + + reg.data.f32 = fval; + + prog->add(this, this->id); +} + +ImmediateValue::ImmediateValue(Program *prog, double dval) +{ + memset(&reg, 0, sizeof(reg)); + + reg.file = FILE_IMMEDIATE; + reg.size = 8; + reg.type = TYPE_F64; + + reg.data.f64 = dval; + + prog->add(this, this->id); +} + +ImmediateValue::ImmediateValue(const ImmediateValue *proto, DataType ty) +{ + reg = proto->reg; + + reg.type = ty; + reg.size = typeSizeof(ty); +} + +bool +ImmediateValue::isInteger(const int i) const +{ + switch (reg.type) { + case TYPE_S8: + return reg.data.s8 == i; + case TYPE_U8: + return reg.data.u8 == i; + case TYPE_S16: + return 
reg.data.s16 == i; + case TYPE_U16: + return reg.data.u16 == i; + case TYPE_S32: + case TYPE_U32: + return reg.data.s32 == i; // as if ... + case TYPE_F32: + return reg.data.f32 == static_cast<float>(i); + case TYPE_F64: + return reg.data.f64 == static_cast<double>(i); + default: + return false; + } +} + +bool +ImmediateValue::isNegative() const +{ + switch (reg.type) { + case TYPE_S8: return reg.data.s8 < 0; + case TYPE_S16: return reg.data.s16 < 0; + case TYPE_S32: + case TYPE_U32: return reg.data.s32 < 0; + case TYPE_F32: return reg.data.u32 & (1 << 31); + case TYPE_F64: return reg.data.u64 & (1ULL << 63); + default: + return false; + } +} + +bool +ImmediateValue::isPow2() const +{ + switch (reg.type) { + case TYPE_U8: + case TYPE_U16: + case TYPE_U32: return util_is_power_of_two(reg.data.u32); + default: + return false; + } +} + +void +ImmediateValue::applyLog2() +{ + switch (reg.type) { + case TYPE_S8: + case TYPE_S16: + case TYPE_S32: + assert(!this->isNegative()); + // fall through + case TYPE_U8: + case TYPE_U16: + case TYPE_U32: + reg.data.u32 = util_logbase2(reg.data.u32); + break; + case TYPE_F32: + reg.data.f32 = log2f(reg.data.f32); + break; + case TYPE_F64: + reg.data.f64 = log2(reg.data.f64); + break; + default: + assert(0); + break; + } +} + +bool +ImmediateValue::compare(CondCode cc, float fval) const +{ + if (reg.type != TYPE_F32) + ERROR("immediate value is not of type f32"); + + switch (static_cast<CondCode>(cc & 7)) { + case CC_TR: return true; + case CC_FL: return false; + case CC_LT: return reg.data.f32 < fval; + case CC_LE: return reg.data.f32 <= fval; + case CC_GT: return reg.data.f32 > fval; + case CC_GE: return reg.data.f32 >= fval; + case CC_EQ: return reg.data.f32 == fval; + case CC_NE: return reg.data.f32 != fval; + default: + assert(0); + return false; + } +} + +bool +Value::interfers(const Value *that) const +{ + uint32_t idA, idB; + + if (that->reg.file != reg.file || that->reg.fileIndex != reg.fileIndex) + return false; + if (this->asImm()) + return false; 
+ + if (this->asSym()) { + idA = this->join->reg.data.offset; + idB = that->join->reg.data.offset; + } else { + idA = this->join->reg.data.id * this->reg.size; + idB = that->join->reg.data.id * that->reg.size; + } + + if (idA < idB) + return (idA + this->reg.size > idB); + else + if (idA > idB) + return (idB + that->reg.size > idA); + else + return (idA == idB); +} + +bool +Value::equals(const Value *that, bool strict) const +{ + that = that->join; + + if (strict) + return this == that; + + if (that->reg.file != reg.file || that->reg.fileIndex != reg.fileIndex) + return false; + if (that->reg.size != this->reg.size) + return false; + + if (that->reg.data.id != this->reg.data.id) + return false; + + return true; +} + +bool +ImmediateValue::equals(const Value *that, bool strict) const +{ + const ImmediateValue *imm = that->asImm(); + if (!imm) + return false; + return reg.data.u64 == imm->reg.data.u64; +} + +bool +Symbol::equals(const Value *that, bool strict) const +{ + if (this->reg.file != that->reg.file) + return false; + assert(that->asSym()); + + if (this->baseSym != that->asSym()->baseSym) + return false; + + return this->reg.data.offset == that->reg.data.offset; +} + +void Instruction::init() +{ + next = prev = 0; + + cc = CC_ALWAYS; + rnd = ROUND_N; + cache = CACHE_CA; + subOp = 0; + + saturate = 0; + join = terminator = 0; + ftz = dnz = 0; + atomic = 0; + perPatch = 0; + fixed = 0; + encSize = 0; + ipa = 0; + + lanes = 0xf; + + postFactor = 0; + + for (int p = 0; p < NV50_IR_MAX_DEFS; ++p) + def[p].setInsn(this); + for (int p = 0; p < NV50_IR_MAX_SRCS; ++p) + src[p].setInsn(this); + + predSrc = -1; + flagsDef = -1; + flagsSrc = -1; +} + +Instruction::Instruction() +{ + init(); + + op = OP_NOP; + dType = sType = TYPE_F32; + + id = -1; + bb = 0; +} + +Instruction::Instruction(Function *fn, operation opr, DataType ty) +{ + init(); + + op = opr; + dType = sType = ty; + + fn->add(this, id); +} + +Instruction::~Instruction() +{ + if (bb) { + Function *fn = 
bb->getFunction(); + bb->remove(this); + fn->allInsns.remove(id); + } + + for (int s = 0; srcExists(s); ++s) + setSrc(s, NULL); + // must unlink defs too since the list pointers will get deallocated + for (int d = 0; defExists(d); ++d) + setDef(d, NULL); +} + +void +Instruction::setSrc(int s, ValueRef& ref) +{ + setSrc(s, ref.get()); + src[s].mod = ref.mod; +} + +void +Instruction::swapSources(int a, int b) +{ + Value *value = src[a].get(); + Modifier m = src[a].mod; + + setSrc(a, src[b]); + + src[b].set(value); + src[b].mod = m; +} + +void +Instruction::takeExtraSources(int s, Value *values[3]) +{ + values[0] = getIndirect(s, 0); + if (values[0]) + setIndirect(s, 0, NULL); + + values[1] = getIndirect(s, 1); + if (values[1]) + setIndirect(s, 1, NULL); + + values[2] = getPredicate(); + if (values[2]) + setPredicate(cc, NULL); +} + +void +Instruction::putExtraSources(int s, Value *values[3]) +{ + if (values[0]) + setIndirect(s, 0, values[0]); + if (values[1]) + setIndirect(s, 1, values[1]); + if (values[2]) + setPredicate(cc, values[2]); +} + +Instruction * +Instruction::clone(bool deep) const +{ + Instruction *insn = new_Instruction(bb->getFunction(), op, dType); + assert(!asCmp() && !asFlow()); + cloneBase(insn, deep); + return insn; +} + +void +Instruction::cloneBase(Instruction *insn, bool deep) const +{ + insn->sType = this->sType; + + insn->cc = this->cc; + insn->rnd = this->rnd; + insn->cache = this->cache; + insn->subOp = this->subOp; + + insn->saturate = this->saturate; + insn->atomic = this->atomic; + insn->ftz = this->ftz; + insn->dnz = this->dnz; + insn->ipa = this->ipa; + insn->lanes = this->lanes; + insn->perPatch = this->perPatch; + + insn->postFactor = this->postFactor; + + if (deep) { + if (!bb) + return; + Function *fn = bb->getFunction(); + for (int d = 0; this->defExists(d); ++d) + insn->setDef(d, this->getDef(d)->clone(fn)); + } else { + for (int d = 0; this->defExists(d); ++d) + insn->setDef(d, this->getDef(d)); + } + + for (int s = 0; 
this->srcExists(s); ++s) + insn->src[s].set(this->src[s]); + + insn->predSrc = this->predSrc; + insn->flagsDef = this->flagsDef; + insn->flagsSrc = this->flagsSrc; +} + +unsigned int +Instruction::defCount(unsigned int mask) const +{ + unsigned int i, n; + + for (n = 0, i = 0; this->defExists(i); ++i, mask >>= 1) + n += mask & 1; + return n; +} + +unsigned int +Instruction::srcCount(unsigned int mask) const +{ + unsigned int i, n; + + for (n = 0, i = 0; this->srcExists(i); ++i, mask >>= 1) + n += mask & 1; + return n; +} + +bool +Instruction::setIndirect(int s, int dim, Value *value) +{ + int p = src[s].indirect[dim]; + + assert(this->srcExists(s)); + if (p < 0) { + if (!value) + return true; + for (p = s + 1; this->srcExists(p); ++p); + } + assert(p < NV50_IR_MAX_SRCS); + + src[p] = value; + src[p].usedAsPtr = (value != 0); + src[s].indirect[dim] = value ? p : -1; + return true; +} + +bool +Instruction::setPredicate(CondCode ccode, Value *value) +{ + cc = ccode; + + if (!value) { + if (predSrc >= 0) { + src[predSrc] = 0; + predSrc = -1; + } + return true; + } + + if (predSrc < 0) { + int s; + for (s = 0; this->srcExists(s); ++s) + assert(s < NV50_IR_MAX_SRCS); + predSrc = s; + } + src[predSrc] = value; + return true; +} + +bool +Instruction::writesPredicate() const +{ + for (int d = 0; d < 2 && def[d].exists(); ++d) + if (def[d].exists() && + (getDef(d)->inFile(FILE_PREDICATE) || getDef(d)->inFile(FILE_FLAGS))) + return true; + return false; +} + +static bool +insnCheckCommutation(const Instruction *a, const Instruction *b) +{ + for (int d = 0; a->defExists(d); ++d) + for (int s = 0; b->srcExists(s); ++s) + if (a->getDef(d)->interfers(b->getSrc(s))) + return false; + return true; +} + +bool +Instruction::isCommutationLegal(const Instruction *i) const +{ + bool ret = true; + ret = ret && insnCheckCommutation(this, i); + ret = ret && insnCheckCommutation(i, this); + return ret; +} + +TexInstruction::TexInstruction(Function *fn, operation op) + : Instruction(fn, op, 
TYPE_F32) +{ + memset(&tex, 0, sizeof(tex)); + + tex.rIndirectSrc = -1; + tex.sIndirectSrc = -1; +} + +TexInstruction::~TexInstruction() +{ + for (int c = 0; c < 3; ++c) { + dPdx[c].set(NULL); + dPdy[c].set(NULL); + } +} + +Instruction * +TexInstruction::clone(bool deep) const +{ + TexInstruction *tex = new_TexInstruction(bb->getFunction(), op); + cloneBase(tex, deep); + + tex->tex = this->tex; + + if (op == OP_TXD) { + for (unsigned int c = 0; c < tex->tex.target.getDim(); ++c) { + tex->dPdx[c].set(dPdx[c]); + tex->dPdy[c].set(dPdy[c]); + } + } + + return tex; +} + +const struct TexInstruction::Target::Desc TexInstruction::Target::descTable[] = +{ + { "1D", 1, 1, false, false, false }, + { "2D", 2, 2, false, false, false }, + { "2D_MS", 2, 2, false, false, false }, + { "3D", 3, 3, false, false, false }, + { "CUBE", 2, 3, false, true, false }, + { "1D_SHADOW", 1, 1, false, false, true }, + { "2D_SHADOW", 2, 2, false, false, true }, + { "CUBE_SHADOW", 2, 3, false, true, true }, + { "1D_ARRAY", 1, 2, true, false, false }, + { "2D_ARRAY", 2, 3, true, false, false }, + { "2D_MS_ARRAY", 2, 3, true, false, false }, + { "CUBE_ARRAY", 2, 3, true, true, false }, + { "1D_ARRAY_SHADOW", 1, 2, true, false, true }, + { "2D_ARRAY_SHADOW", 2, 3, true, false, true }, + { "RECT", 2, 2, false, false, false }, + { "RECT_SHADOW", 2, 2, false, false, true }, + { "CUBE_ARRAY_SHADOW", 2, 4, true, true, true }, + { "BUFFER", 1, 1, false, false, false }, +}; + +CmpInstruction::CmpInstruction(Function *fn, operation op) + : Instruction(fn, op, TYPE_F32) +{ + setCond = CC_ALWAYS; +} + +Instruction * +CmpInstruction::clone(bool deep) const +{ + CmpInstruction *cmp = new_CmpInstruction(bb->getFunction(), op); + cloneBase(cmp, deep); + cmp->setCond = setCond; + cmp->dType = dType; + return cmp; +} + +FlowInstruction::FlowInstruction(Function *fn, operation op, + BasicBlock *targ) + : Instruction(fn, op, TYPE_NONE) +{ + target.bb = targ; + + if (op == OP_BRA || + op == OP_CONT || op == OP_BREAK 
|| + op == OP_RET || op == OP_EXIT) + terminator = 1; + else + if (op == OP_JOIN) + terminator = targ ? 1 : 0; + + allWarp = absolute = limit = 0; +} + +Program::Program(Type type, Target *arch) + : progType(type), + target(arch), + mem_Instruction(sizeof(Instruction), 6), + mem_CmpInstruction(sizeof(CmpInstruction), 4), + mem_TexInstruction(sizeof(TexInstruction), 4), + mem_FlowInstruction(sizeof(FlowInstruction), 4), + mem_LValue(sizeof(LValue), 8), + mem_Symbol(sizeof(Symbol), 7), + mem_ImmediateValue(sizeof(ImmediateValue), 7) +{ + code = NULL; + binSize = 0; + + maxGPR = -1; + + main = new Function(this, "MAIN"); + + dbgFlags = 0; +} + +Program::~Program() +{ + if (main) + delete main; +} + +void Program::releaseInstruction(Instruction *insn) +{ + // TODO: make this not suck so much + + insn->~Instruction(); + + if (insn->asCmp()) + mem_CmpInstruction.release(insn); + else + if (insn->asTex()) + mem_TexInstruction.release(insn); + else + if (insn->asFlow()) + mem_FlowInstruction.release(insn); + else + mem_Instruction.release(insn); +} + +void Program::releaseValue(Value *value) +{ + if (value->asLValue()) + mem_LValue.release(value); + else + if (value->asImm()) + mem_ImmediateValue.release(value); + else + if (value->asSym()) + mem_Symbol.release(value); +} + + +} // namespace nv50_ir + +extern "C" { + +static void +nv50_ir_init_prog_info(struct nv50_ir_prog_info *info) +{ + info->io.clipDistance = 0xff; + info->io.pointSize = 0xff; + info->io.edgeFlagIn = 0xff; + info->io.edgeFlagOut = 0xff; + info->io.fragDepth = 0xff; + info->io.sampleMask = 0xff; + info->io.backFaceColor[0] = info->io.backFaceColor[1] = 0xff; +} + +int +nv50_ir_generate_code(struct nv50_ir_prog_info *info) +{ + int ret = 0; + + nv50_ir::Program::Type type; + + nv50_ir_init_prog_info(info); + +#define PROG_TYPE_CASE(a, b) \ + case PIPE_SHADER_##a: type = nv50_ir::Program::TYPE_##b; break + + switch (info->type) { + PROG_TYPE_CASE(VERTEX, VERTEX); +// PROG_TYPE_CASE(HULL, 
TESSELLATION_CONTROL); +// PROG_TYPE_CASE(DOMAIN, TESSELLATION_EVAL); + PROG_TYPE_CASE(GEOMETRY, GEOMETRY); + PROG_TYPE_CASE(FRAGMENT, FRAGMENT); + default: + type = nv50_ir::Program::TYPE_COMPUTE; + break; + } + INFO_DBG(info->dbgFlags, VERBOSE, "translating program of type %u\n", type); + + nv50_ir::Target *targ = nv50_ir::Target::create(info->target); + if (!targ) + return -1; + + nv50_ir::Program *prog = new nv50_ir::Program(type, targ); + if (!prog) + return -1; + prog->dbgFlags = info->dbgFlags; + + switch (info->bin.sourceRep) { +#if 0 + case PIPE_IR_LLVM: + case PIPE_IR_GLSL: + return -1; + case PIPE_IR_SM4: + ret = prog->makeFromSM4(info) ? 0 : -2; + break; + case PIPE_IR_TGSI: +#endif + default: + ret = prog->makeFromTGSI(info) ? 0 : -2; + break; + } + if (ret < 0) + goto out; + if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE) + prog->print(); + + prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_PRE_SSA); + + prog->convertToSSA(); + + if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE) + prog->print(); + + prog->optimizeSSA(info->optLevel); + prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_SSA); + + if (prog->dbgFlags & NV50_IR_DEBUG_BASIC) + prog->print(); + + if (!prog->registerAllocation()) { + ret = -4; + goto out; + } + prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_POST_RA); + + prog->optimizePostRA(info->optLevel); + + if (!prog->emitBinary(info)) { + ret = -5; + goto out; + } + +out: + INFO_DBG(prog->dbgFlags, VERBOSE, "nv50_ir_generate_code: ret = %i\n", ret); + + info->bin.maxGPR = prog->maxGPR; + info->bin.code = prog->code; + info->bin.codeSize = prog->binSize; + + delete prog; + nv50_ir::Target::destroy(targ); + + return ret; +} + +} // extern "C" diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir.h b/src/gallium/drivers/nv50/codegen/nv50_ir.h new file mode 100644 index 00000000000..6eef1abb69d --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir.h @@ -0,0 +1,1049 @@ + +#ifndef __NV50_IR_H__ +#define 
__NV50_IR_H__ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> + +#include "nv50_ir_util.h" +#include "nv50_ir_graph.h" + +#include "nv50_ir_driver.h" + +namespace nv50_ir { + +enum operation +{ + OP_NOP = 0, + OP_PHI, + OP_UNION, // unify a new definition and several source values + OP_SPLIT, // $r0d -> { $r0, $r1 } ($r0d and $r0/$r1 will be coalesced) + OP_MERGE, // opposite of split, e.g. combine 2 32 bit into a 64 bit value + OP_CONSTRAINT, // copy values into consecutive registers + OP_MOV, + OP_LOAD, + OP_STORE, + OP_ADD, + OP_SUB, + OP_MUL, + OP_DIV, + OP_MOD, + OP_MAD, + OP_FMA, + OP_SAD, // abs(src0 - src1) + src2 + OP_ABS, + OP_NEG, + OP_NOT, + OP_AND, + OP_OR, + OP_XOR, + OP_SHL, + OP_SHR, + OP_MAX, + OP_MIN, + OP_SAT, // CLAMP(f32, 0.0, 1.0) + OP_CEIL, + OP_FLOOR, + OP_TRUNC, + OP_CVT, + OP_SET_AND, // dst = (src0 CMP src1) & src2 + OP_SET_OR, + OP_SET_XOR, + OP_SET, + OP_SELP, // dst = src2 ? src0 : src1 + OP_SLCT, // dst = (src2 CMP 0) ? src0 : src1 + OP_RCP, + OP_RSQ, + OP_LG2, + OP_SIN, + OP_COS, + OP_EX2, + OP_EXP, // exponential (base M_E) + OP_LOG, // natural logarithm + OP_PRESIN, + OP_PREEX2, + OP_SQRT, + OP_POW, + OP_BRA, + OP_CALL, + OP_RET, + OP_CONT, + OP_BREAK, + OP_PRERET, + OP_PRECONT, + OP_PREBREAK, + OP_BRKPT, // breakpoint (not related to loops) + OP_JOINAT, // push control flow convergence point + OP_JOIN, // converge + OP_DISCARD, + OP_EXIT, + OP_MEMBAR, + OP_VFETCH, // indirection 0 in attribute space, indirection 1 is vertex base + OP_PFETCH, // fetch base address of vertex src0 (immediate) [+ src1] + OP_EXPORT, + OP_LINTERP, + OP_PINTERP, + OP_EMIT, // emit vertex + OP_RESTART, // restart primitive + OP_TEX, + OP_TXB, // texture bias + OP_TXL, // texure lod + OP_TXF, // texel fetch + OP_TXQ, // texture size query + OP_TXD, // texture derivatives + OP_TXG, // texture gather + OP_TEXCSAA, + OP_SULD, // surface load + OP_SUST, // surface store + OP_DFDX, + OP_DFDY, + OP_RDSV, // read system value + OP_WRSV, // write system value + OP_PIXLD, + 
OP_QUADOP, + OP_QUADON, + OP_QUADPOP, + OP_POPCNT, // bitcount(src0 & src1) + OP_INSBF, // insert first src1[8:15] bits of src0 into src2 at src1[0:7] + OP_EXTBF, + OP_LAST +}; + +#define NV50_IR_SUBOP_MUL_HIGH 1 +#define NV50_IR_SUBOP_EMIT_RESTART 1 +#define NV50_IR_SUBOP_LDC_IL 1 +#define NV50_IR_SUBOP_LDC_IS 2 +#define NV50_IR_SUBOP_LDC_ISL 3 + +enum DataType +{ + TYPE_NONE, + TYPE_U8, + TYPE_S8, + TYPE_U16, + TYPE_S16, + TYPE_U32, + TYPE_S32, + TYPE_U64, // 64 bit operations are only lowered after register allocation + TYPE_S64, + TYPE_F16, + TYPE_F32, + TYPE_F64, + TYPE_B96, + TYPE_B128 +}; + +enum CondCode +{ + CC_FL = 0, + CC_NEVER = CC_FL, // when used with FILE_FLAGS + CC_LT = 1, + CC_EQ = 2, + CC_NOT_P = CC_EQ, // when used with FILE_PREDICATE + CC_LE = 3, + CC_GT = 4, + CC_NE = 5, + CC_P = CC_NE, + CC_GE = 6, + CC_TR = 7, + CC_ALWAYS = CC_TR, + CC_U = 8, + CC_LTU = 9, + CC_EQU = 10, + CC_LEU = 11, + CC_GTU = 12, + CC_NEU = 13, + CC_GEU = 14, + CC_NO = 0x10, + CC_NC = 0x11, + CC_NS = 0x12, + CC_NA = 0x13, + CC_A = 0x14, + CC_S = 0x15, + CC_C = 0x16, + CC_O = 0x17 +}; + +enum RoundMode +{ + ROUND_N, // nearest + ROUND_M, // towards -inf + ROUND_Z, // towards 0 + ROUND_P, // towards +inf + ROUND_NI, // nearest integer + ROUND_MI, // to integer towards -inf + ROUND_ZI, // to integer towards 0 + ROUND_PI, // to integer towards +inf +}; + +enum CacheMode +{ + CACHE_CA, // cache at all levels + CACHE_WB = CACHE_CA, // cache write back + CACHE_CG, // cache at global level + CACHE_CS, // cache streaming + CACHE_CV, // cache as volatile + CACHE_WT = CACHE_CV // cache write-through +}; + +enum DataFile +{ + FILE_NULL = 0, + FILE_GPR, + FILE_PREDICATE, // boolean predicate + FILE_FLAGS, // zero/sign/carry/overflow bits + FILE_ADDRESS, + FILE_IMMEDIATE, + FILE_MEMORY_CONST, + FILE_SHADER_INPUT, + FILE_SHADER_OUTPUT, + FILE_MEMORY_GLOBAL, + FILE_MEMORY_SHARED, + FILE_MEMORY_LOCAL, + FILE_SYSTEM_VALUE, + DATA_FILE_COUNT +}; + +enum TexTarget +{ + TEX_TARGET_1D, + 
TEX_TARGET_2D, + TEX_TARGET_2D_MS, + TEX_TARGET_3D, + TEX_TARGET_CUBE, + TEX_TARGET_1D_SHADOW, + TEX_TARGET_2D_SHADOW, + TEX_TARGET_CUBE_SHADOW, + TEX_TARGET_1D_ARRAY, + TEX_TARGET_2D_ARRAY, + TEX_TARGET_2D_MS_ARRAY, + TEX_TARGET_CUBE_ARRAY, + TEX_TARGET_1D_ARRAY_SHADOW, + TEX_TARGET_2D_ARRAY_SHADOW, + TEX_TARGET_RECT, + TEX_TARGET_RECT_SHADOW, + TEX_TARGET_CUBE_ARRAY_SHADOW, + TEX_TARGET_BUFFER, + TEX_TARGET_COUNT +}; + +enum SVSemantic +{ + SV_POSITION, // WPOS + SV_VERTEX_ID, + SV_INSTANCE_ID, + SV_INVOCATION_ID, + SV_PRIMITIVE_ID, + SV_VERTEX_COUNT, // gl_PatchVerticesIn + SV_LAYER, + SV_VIEWPORT_INDEX, + SV_YDIR, + SV_FACE, + SV_POINT_SIZE, + SV_POINT_COORD, + SV_CLIP_DISTANCE, + SV_SAMPLE_INDEX, + SV_TESS_FACTOR, + SV_TESS_COORD, + SV_TID, + SV_CTAID, + SV_NTID, + SV_GRIDID, + SV_NCTAID, + SV_LANEID, + SV_PHYSID, + SV_NPHYSID, + SV_CLOCK, + SV_LBASE, + SV_SBASE, + SV_UNDEFINED, + SV_LAST +}; + +class Program; +class Function; +class BasicBlock; + +class Target; + +class Instruction; +class CmpInstruction; +class TexInstruction; +class FlowInstruction; + +class Value; +class LValue; +class Symbol; +class ImmediateValue; + +struct Storage +{ + DataFile file; + int8_t fileIndex; // signed, may be indirect for CONST[] + uint8_t size; // this should match the Instruction type's size + DataType type; // mainly for pretty printing + union { + uint64_t u64; // immediate values + uint32_t u32; + uint16_t u16; + uint8_t u8; + int64_t s64; + int32_t s32; + int16_t s16; + int8_t s8; + float f32; + double f64; + int32_t offset; // offset from 0 (base of address space) + int32_t id; // register id (< 0 if virtual/unassigned) + struct { + SVSemantic sv; + int index; + } sv; + } data; +}; + +// precedence: NOT after SAT after NEG after ABS +#define NV50_IR_MOD_ABS (1 << 0) +#define NV50_IR_MOD_NEG (1 << 1) +#define NV50_IR_MOD_SAT (1 << 2) +#define NV50_IR_MOD_NOT (1 << 3) +#define NV50_IR_MOD_NEG_ABS (NV50_IR_MOD_NEG | NV50_IR_MOD_ABS) + +#define NV50_IR_INTERP_MODE_MASK 
0x3 +#define NV50_IR_INTERP_LINEAR (0 << 0) +#define NV50_IR_INTERP_PERSPECTIVE (1 << 0) +#define NV50_IR_INTERP_FLAT (2 << 0) +#define NV50_IR_INTERP_SC (3 << 0) // what exactly is that ? +#define NV50_IR_INTERP_SAMPLE_MASK 0xc +#define NV50_IR_INTERP_DEFAULT (0 << 2) +#define NV50_IR_INTERP_CENTROID (1 << 2) +#define NV50_IR_INTERP_OFFSET (2 << 2) +#define NV50_IR_INTERP_SAMPLEID (3 << 2) + +// do we really want this to be a class ? +class Modifier +{ +public: + Modifier() : bits(0) { } + Modifier(unsigned int m) : bits(m) { } + Modifier(operation op); + + // @return new Modifier applying a after b (asserts if unrepresentable) + Modifier operator*(const Modifier) const; + Modifier operator==(const Modifier m) const { return m.bits == bits; } + Modifier operator!=(const Modifier m) const { return m.bits != bits; } + + inline Modifier operator&(const Modifier m) const { return bits & m.bits; } + inline Modifier operator|(const Modifier m) const { return bits | m.bits; } + inline Modifier operator^(const Modifier m) const { return bits ^ m.bits; } + + operation getOp() const; + + inline int neg() const { return (bits & NV50_IR_MOD_NEG) ? 1 : 0; } + inline int abs() const { return (bits & NV50_IR_MOD_ABS) ? 1 : 0; } + + inline operator bool() { return bits ? 
true : false; } + + void applyTo(ImmediateValue &imm) const; + + int print(char *buf, size_t size) const; + +private: + uint8_t bits; +}; + +class ValueRef +{ +public: + ValueRef(); + ~ValueRef(); + + inline ValueRef& operator=(Value *val) { this->set(val); return *this; } + + inline bool exists() const { return value != NULL; } + + void set(Value *); + void set(const ValueRef&); + inline Value *get() const { return value; } + inline Value *rep() const; + + inline Instruction *getInsn() const { return insn; } + inline void setInsn(Instruction *inst) { insn = inst; } + + inline bool isIndirect(int dim) const { return indirect[dim] >= 0; } + inline const ValueRef *getIndirect(int dim) const; + + inline DataFile getFile() const; + inline unsigned getSize() const; + + // SSA: return eventual (traverse MOVs) literal value, if it exists + ImmediateValue *getImmediate() const; + + class Iterator + { + public: + Iterator(ValueRef *ref) : pos(ref), ini(ref) { } + + inline ValueRef *get() const { return pos; } + inline bool end() const { return pos == NULL; } + inline void next() { pos = (pos->next != ini) ? 
pos->next : 0; } + + private: + ValueRef *pos, *ini; + }; + + inline Iterator iterator() { return Iterator(this); } + +public: + Modifier mod; + int8_t indirect[2]; // >= 0 if relative to lvalue in insn->src[indirect[i]] + uint8_t swizzle; + + bool usedAsPtr; // for printing + +private: + Value *value; + Instruction *insn; + ValueRef *next; // to link uses of the value + ValueRef *prev; +}; + +class ValueDef +{ +public: + ValueDef(); + ~ValueDef(); + + inline ValueDef& operator=(Value *val) { this->set(val); return *this; } + + inline bool exists() const { return value != NULL; } + + inline Value *get() const { return value; } + inline Value *rep() const; + void set(Value *); + void replace(Value *, bool doSet); // replace all uses of the old value + + inline Instruction *getInsn() const { return insn; } + inline void setInsn(Instruction *inst) { insn = inst; } + + inline DataFile getFile() const; + inline unsigned getSize() const; + + // HACK: save the pre-SSA value in 'prev', in SSA we don't need the def list + // but we'll use it again for coalescing in register allocation + inline void setSSA(LValue *); + inline const LValue *preSSA() const; + inline void restoreDefList(); // after having been abused for SSA hack + void mergeDefs(ValueDef *); + + class Iterator + { + public: + Iterator(ValueDef *def) : pos(def), ini(def) { } + + inline ValueDef *get() const { return pos; } + inline bool end() const { return pos == NULL; } + inline void next() { pos = (pos->next != ini) ? pos->next : NULL; } + + private: + ValueDef *pos, *ini; + }; + + inline Iterator iterator() { return Iterator(this); } + +private: + Value *value; // should make this LValue * ... 
+ Instruction *insn; + ValueDef *next; // circular list of all definitions of the same value + ValueDef *prev; +}; + +class Value +{ +public: + Value(); + + virtual Value *clone(Function *) const { return NULL; } + + virtual int print(char *, size_t, DataType ty = TYPE_NONE) const = 0; + + virtual bool equals(const Value *, bool strict = false) const; + virtual bool interfers(const Value *) const; + + inline Instruction *getUniqueInsn() const; + inline Instruction *getInsn() const; // use when uniqueness is certain + + inline int refCount() { return refCnt; } + inline int ref() { return ++refCnt; } + inline int unref() { --refCnt; assert(refCnt >= 0); return refCnt; } + + inline LValue *asLValue(); + inline Symbol *asSym(); + inline ImmediateValue *asImm(); + inline const Symbol *asSym() const; + inline const ImmediateValue *asImm() const; + + bool coalesce(Value *, bool force = false); + + inline bool inFile(DataFile f) { return reg.file == f; } + + static inline Value *get(Iterator&); + +protected: + int refCnt; + + friend class ValueDef; + friend class ValueRef; + +public: + int id; + ValueRef *uses; + ValueDef *defs; + Storage reg; + + // TODO: these should be in LValue: + Interval livei; + Value *join; +}; + +class LValue : public Value +{ +public: + LValue(Function *, DataFile file); + LValue(Function *, LValue *); + + virtual Value *clone(Function *) const; + + virtual int print(char *, size_t, DataType ty = TYPE_NONE) const; + +public: + unsigned ssa : 1; + + int affinity; +}; + +class Symbol : public Value +{ +public: + Symbol(Program *, DataFile file = FILE_MEMORY_CONST, ubyte fileIdx = 0); + + virtual Value *clone(Function *) const; + + virtual bool equals(const Value *that, bool strict) const; + + virtual int print(char *, size_t, DataType ty = TYPE_NONE) const; + + // print with indirect values + int print(char *, size_t, Value *, Value *, DataType ty = TYPE_NONE) const; + + inline void setFile(DataFile file, ubyte fileIndex = 0) + { + reg.file = file; 
+ reg.fileIndex = fileIndex; + } + + inline void setOffset(int32_t offset); + inline void setAddress(Symbol *base, int32_t offset); + inline void setSV(SVSemantic sv, uint32_t idx = 0); + + inline const Symbol *getBase() const { return baseSym; } + +private: + Symbol *baseSym; // array base for Symbols representing array elements +}; + +class ImmediateValue : public Value +{ +public: + ImmediateValue(Program *, uint32_t); + ImmediateValue(Program *, float); + ImmediateValue(Program *, double); + + // NOTE: not added to program with + ImmediateValue(const ImmediateValue *, DataType ty); + + virtual bool equals(const Value *that, bool strict) const; + + // these only work if 'type' is valid (we mostly use untyped literals): + bool isInteger(const int ival) const; // ival is cast to this' type + bool isNegative() const; + bool isPow2() const; + + void applyLog2(); + + // for constant folding: + ImmediateValue operator+(const ImmediateValue&) const; + ImmediateValue operator-(const ImmediateValue&) const; + ImmediateValue operator*(const ImmediateValue&) const; + ImmediateValue operator/(const ImmediateValue&) const; + + bool compare(CondCode cc, float fval) const; + + virtual int print(char *, size_t, DataType ty = TYPE_NONE) const; +}; + + +#define NV50_IR_MAX_DEFS 4 +#define NV50_IR_MAX_SRCS 8 + +class Instruction +{ +public: + Instruction(); + Instruction(Function *, operation, DataType); + virtual ~Instruction(); + + virtual Instruction *clone(bool deep) const; + + inline void setDef(int i, Value *val) { def[i].set(val); } + inline void setSrc(int s, Value *val) { src[s].set(val); } + void setSrc(int s, ValueRef&); + void swapSources(int a, int b); + bool setIndirect(int s, int dim, Value *); + + inline Value *getDef(int d) const { return def[d].get(); } + inline Value *getSrc(int s) const { return src[s].get(); } + inline Value *getIndirect(int s, int dim) const; + + inline bool defExists(int d) const { return d < 4 && def[d].exists(); } + inline bool 
srcExists(int s) const { return s < 8 && src[s].exists(); } + + inline bool constrainedDefs() const { return def[1].exists(); } + + bool setPredicate(CondCode ccode, Value *); + inline Value *getPredicate() const; + bool writesPredicate() const; + + unsigned int defCount(unsigned int mask) const; + unsigned int srcCount(unsigned int mask) const; + + // save & remove / set indirect[0,1] and predicate source + void takeExtraSources(int s, Value *[3]); + void putExtraSources(int s, Value *[3]); + + inline void setType(DataType type) { dType = sType = type; } + + inline void setType(DataType dtype, DataType stype) + { + dType = dtype; + sType = stype; + } + + inline bool isPseudo() const { return op < OP_MOV; } + bool isDead() const; + bool isNop() const; + bool isCommutationLegal(const Instruction *) const; // must be adjacent ! + bool isActionEqual(const Instruction *) const; + bool isResultEqual(const Instruction *) const; + + void print() const; + + inline CmpInstruction *asCmp(); + inline TexInstruction *asTex(); + inline FlowInstruction *asFlow(); + inline const TexInstruction *asTex() const; + inline const CmpInstruction *asCmp() const; + inline const FlowInstruction *asFlow() const; + +public: + Instruction *next; + Instruction *prev; + int id; + int serial; // CFG order + + operation op; + DataType dType; // destination or defining type + DataType sType; // source or secondary type + CondCode cc; + RoundMode rnd; + CacheMode cache; + + uint8_t subOp; // quadop, 1 for mul-high, etc. 
+ + unsigned encSize : 4; // encoding size in bytes + unsigned saturate : 1; // to [0.0f, 1.0f] + unsigned join : 1; // converge control flow (use OP_JOIN until end) + unsigned fixed : 1; // prevent dead code elimination + unsigned terminator : 1; // end of basic block + unsigned atomic : 1; + unsigned ftz : 1; // flush denormal to zero + unsigned dnz : 1; // denormals, NaN are zero + unsigned ipa : 4; // interpolation mode + unsigned lanes : 4; + unsigned perPatch : 1; + unsigned exit : 1; // terminate program after insn + + int8_t postFactor; // MUL/DIV(if < 0) by 1 << postFactor + + int8_t predSrc; + int8_t flagsDef; + int8_t flagsSrc; + + // NOTE: should make these pointers, saves space and work on shuffling + ValueDef def[NV50_IR_MAX_DEFS]; // no gaps ! + ValueRef src[NV50_IR_MAX_SRCS]; // no gaps ! + + BasicBlock *bb; + + // instruction specific methods: + // (don't want to subclass, would need more constructors and memory pools) +public: + inline void setInterpolate(unsigned int mode) { ipa = mode; } + + unsigned int getInterpMode() const { return ipa & 0x3; } + unsigned int getSampleMode() const { return ipa & 0xc; } + +private: + void init(); +protected: + void cloneBase(Instruction *clone, bool deep) const; +}; + +enum TexQuery +{ + TXQ_DIMS, + TXQ_TYPE, + TXQ_SAMPLE_POSITION, + TXQ_FILTER, + TXQ_LOD, + TXQ_WRAP, + TXQ_BORDER_COLOUR +}; + +class TexInstruction : public Instruction +{ +public: + class Target + { + public: + Target(TexTarget targ = TEX_TARGET_2D) : target(targ) { } + + const char *getName() const { return descTable[target].name; } + unsigned int getArgCount() const { return descTable[target].argc; } + unsigned int getDim() const { return descTable[target].dim; } + int isArray() const { return descTable[target].array ? 1 : 0; } + int isCube() const { return descTable[target].cube ? 1 : 0; } + int isShadow() const { return descTable[target].shadow ? 
1 : 0; } + + Target& operator=(TexTarget targ) + { + assert(targ < TEX_TARGET_COUNT); + return *this; + } + + inline bool operator==(TexTarget targ) const { return target == targ; } + + private: + struct Desc + { + char name[19]; + uint8_t dim; + uint8_t argc; + bool array; + bool cube; + bool shadow; + }; + + static const struct Desc descTable[TEX_TARGET_COUNT]; + + private: + enum TexTarget target; + }; + +public: + TexInstruction(Function *, operation); + virtual ~TexInstruction(); + + virtual Instruction *clone(bool deep) const; + + inline void setTexture(Target targ, uint8_t r, uint8_t s) + { + tex.r = r; + tex.s = s; + tex.target = targ; + } + + inline Value *getIndirectR() const; + inline Value *getIndirectS() const; + +public: + struct { + Target target; + + uint8_t r; + int8_t rIndirectSrc; + uint8_t s; + int8_t sIndirectSrc; + + uint8_t mask; + uint8_t gatherComp; + + bool liveOnly; // only execute on live pixels of a quad (optimization) + bool levelZero; + + int8_t useOffsets; // 0, 1, or 4 for textureGatherOffsets + int8_t offset[4][3]; + + enum TexQuery query; + } tex; + + ValueRef dPdx[3]; + ValueRef dPdy[3]; +}; + +class CmpInstruction : public Instruction +{ +public: + CmpInstruction(Function *, operation); + + virtual Instruction *clone(bool deep) const; + + void setCondition(CondCode cond) { setCond = cond; } + CondCode getCondition() const { return setCond; } + +public: + CondCode setCond; +}; + +class FlowInstruction : public Instruction +{ +public: + FlowInstruction(Function *, operation, BasicBlock *target); + +public: + unsigned allWarp : 1; + unsigned absolute : 1; + unsigned limit : 1; + unsigned builtin : 1; // true for calls to emulation code + + union { + BasicBlock *bb; + int builtin; + Function *fn; + } target; +}; + +class BasicBlock +{ +public: + BasicBlock(Function *); + ~BasicBlock(); + + inline int getId() const { return id; } + inline unsigned int getInsnCount() const { return numInsns; } + inline bool isTerminated() const { 
return exit && exit->terminator; } + + bool dominatedBy(BasicBlock *bb); + inline bool reachableBy(BasicBlock *by, BasicBlock *term); + + // returns mask of conditional out blocks + // e.g. 3 for IF { .. } ELSE { .. } ENDIF, 1 for IF { .. } ENDIF + unsigned int initiatesSimpleConditional() const; + +public: + Function *getFunction() const { return func; } + Program *getProgram() const { return program; } + + Instruction *getEntry() const { return entry; } // first non-phi instruction + Instruction *getPhi() const { return phi; } + Instruction *getFirst() const { return phi ? phi : entry; } + Instruction *getExit() const { return exit; } + + void insertHead(Instruction *); + void insertTail(Instruction *); + void insertBefore(Instruction *, Instruction *); + void insertAfter(Instruction *, Instruction *); + void remove(Instruction *); + void permuteAdjacent(Instruction *, Instruction *); + + BasicBlock *idom() const; + + DLList& getDF() { return df; } + DLList::Iterator iterDF() { return df.iterator(); } + + static inline BasicBlock *get(Iterator&); + static inline BasicBlock *get(Graph::Node *); + +public: + Graph::Node cfg; // first edge is branch *taken* (the ELSE branch) + Graph::Node dom; + + BitSet liveSet; + + uint32_t binPos; + uint32_t binSize; + + Instruction *joinAt; // for quick reference + + bool explicitCont; // loop headers: true if loop contains continue stmts + +private: + int id; + DLList df; + + Instruction *phi; + Instruction *entry; + Instruction *exit; + + unsigned int numInsns; + +private: + Function *func; + Program *program; +}; + +class Function +{ +public: + Function(Program *, const char *name); + ~Function(); + + inline Program *getProgram() const { return prog; } + inline const char *getName() const { return name; } + inline int getId() const { return id; } + + void print(); + void printLiveIntervals() const; + void printCFGraph(const char *filePath); + + bool setEntry(BasicBlock *); + bool setExit(BasicBlock *); + + unsigned int 
orderInstructions(ArrayList&); + + inline void add(BasicBlock *bb, int& id) { allBBlocks.insert(bb, id); } + inline void add(Instruction *insn, int& id) { allInsns.insert(insn, id); } + inline void add(LValue *lval, int& id) { allLValues.insert(lval, id); } + + inline LValue *getLValue(int id); + + bool convertToSSA(); + +public: + Graph cfg; + Graph::Node *cfgExit; + Graph *domTree; + Graph::Node call; // node in the call graph + + BasicBlock **bbArray; // BBs in emission order + int bbCount; + + unsigned int loopNestingBound; + int regClobberMax; + + uint32_t binPos; + uint32_t binSize; + + ArrayList allBBlocks; + ArrayList allInsns; + ArrayList allLValues; + +private: + void buildLiveSetsPreSSA(BasicBlock *, const int sequence); + +private: + int id; + const char *const name; + Program *prog; +}; + +enum CGStage +{ + CG_STAGE_PRE_SSA, + CG_STAGE_SSA, // expected directly before register allocation + CG_STAGE_POST_RA +}; + +class Program +{ +public: + enum Type + { + TYPE_VERTEX, + TYPE_TESSELLATION_CONTROL, + TYPE_TESSELLATION_EVAL, + TYPE_GEOMETRY, + TYPE_FRAGMENT, + TYPE_COMPUTE + }; + + Program(Type type, Target *targ); + ~Program(); + + void print(); + + Type getType() const { return progType; } + + inline void add(Function *fn, int& id) { allFuncs.insert(fn, id); } + inline void add(Value *rval, int& id) { allRValues.insert(rval, id); } + + bool makeFromTGSI(struct nv50_ir_prog_info *); + bool makeFromSM4(struct nv50_ir_prog_info *); + bool convertToSSA(); + bool optimizeSSA(int level); + bool optimizePostRA(int level); + bool registerAllocation(); + bool emitBinary(struct nv50_ir_prog_info *); + + const Target *getTarget() const { return target; } + +private: + Type progType; + Target *target; + +public: + Function *main; + Graph calls; + + ArrayList allFuncs; + ArrayList allRValues; + + uint32_t *code; + uint32_t binSize; + + int maxGPR; + + MemoryPool mem_Instruction; + MemoryPool mem_CmpInstruction; + MemoryPool mem_TexInstruction; + MemoryPool 
mem_FlowInstruction; + MemoryPool mem_LValue; + MemoryPool mem_Symbol; + MemoryPool mem_ImmediateValue; + + uint32_t dbgFlags; + + void releaseInstruction(Instruction *); + void releaseValue(Value *); +}; + +// TODO: add const version +class Pass +{ +public: + bool run(Program *, bool ordered = false, bool skipPhi = false); + bool run(Function *, bool ordered = false, bool skipPhi = false); + +private: + // return false to continue with next entity on next higher level + virtual bool visit(Function *) { return true; } + virtual bool visit(BasicBlock *) { return true; } + virtual bool visit(Instruction *) { return false; } + + bool doRun(Program *, bool ordered, bool skipPhi); + bool doRun(Function *, bool ordered, bool skipPhi); + +protected: + bool err; + Function *func; + Program *prog; +}; + +// ============================================================================= + +#include "nv50_ir_inlines.h" + +} // namespace nv50_ir + +#endif // __NV50_IR_H__ diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_bb.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_bb.cpp new file mode 100644 index 00000000000..5bf08b37c51 --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_bb.cpp @@ -0,0 +1,409 @@ + +#include "nv50_ir.h" + +namespace nv50_ir { + +Function::Function(Program *p, const char *fnName) + : call(this), + name(fnName), + prog(p) +{ + cfgExit = NULL; + domTree = NULL; + + bbArray = NULL; + bbCount = 0; + loopNestingBound = 0; + regClobberMax = 0; + + binPos = 0; + binSize = 0; + + prog->add(this, id); +} + +Function::~Function() +{ + if (domTree) + delete domTree; + if (bbArray) + delete[] bbArray; + + for (ArrayList::Iterator BBs = allBBlocks.iterator(); !BBs.end(); BBs.next()) + delete reinterpret_cast(BBs.get()); +} + +BasicBlock::BasicBlock(Function *fn) : cfg(this), dom(this), func(fn) +{ + program = func->getProgram(); + + joinAt = phi = entry = exit = NULL; + + numInsns = 0; + binPos = 0; + binSize = 0; + + explicitCont = false; + + 
func->add(this, this->id); +} + +BasicBlock::~BasicBlock() +{ + // nothing yet +} + +BasicBlock * +BasicBlock::idom() const +{ + Graph::Node *dn = dom.parent(); + return dn ? BasicBlock::get(dn) : NULL; +} + +void +BasicBlock::insertHead(Instruction *inst) +{ + assert(inst->next == 0 && inst->prev == 0); + + if (inst->op == OP_PHI) { + if (phi) { + insertBefore(phi, inst); + } else { + if (entry) { + insertBefore(entry, phi); + } else { + assert(!exit); + phi = exit = inst; + inst->bb = this; + ++numInsns; + } + } + } else { + if (entry) { + insertBefore(entry, inst); + } else { + if (phi) { + insertAfter(phi, inst); + } else { + assert(!exit); + entry = exit = inst; + inst->bb = this; + ++numInsns; + } + } + } +} + +void +BasicBlock::insertTail(Instruction *inst) +{ + assert(inst->next == 0 && inst->prev == 0); + + if (inst->op == OP_PHI) { + if (entry) { + insertBefore(entry, inst); + } else + if (exit) { + assert(phi); + insertAfter(exit, inst); + } else { + assert(!phi); + phi = exit = inst; + inst->bb = this; + ++numInsns; + } + } else { + if (exit) { + insertAfter(exit, inst); + } else { + assert(!phi); + entry = exit = inst; + inst->bb = this; + ++numInsns; + } + } +} + +void +BasicBlock::insertBefore(Instruction *q, Instruction *p) +{ + assert(p && q); + + assert(p->next == 0 && p->prev == 0); + + if (q == entry) { + if (p->op == OP_PHI) { + if (!phi) + phi = p; + } else { + entry = p; + } + } else + if (q == phi) { + assert(p->op == OP_PHI); + phi = p; + } + + p->next = q; + p->prev = q->prev; + if (p->prev) + p->prev->next = p; + q->prev = p; + + p->bb = this; + ++numInsns; +} + +void +BasicBlock::insertAfter(Instruction *p, Instruction *q) +{ + assert(p && q); + assert(q->op != OP_PHI || p->op == OP_PHI); + + assert(q->next == 0 && q->prev == 0); + + if (p == exit) + exit = q; + if (p->op == OP_PHI && q->op != OP_PHI) + entry = q; + + q->prev = p; + q->next = p->next; + if (q->next) + q->next->prev = q; + p->next = q; + + q->bb = this; + ++numInsns; +} + 
+void +BasicBlock::remove(Instruction *insn) +{ + assert(insn->bb == this); + + if (insn->prev) + insn->prev->next = insn->next; + + if (insn->next) + insn->next->prev = insn->prev; + else + exit = insn->prev; + + if (insn == entry) + entry = insn->next ? insn->next : insn->prev; + + if (insn == phi) + phi = (insn->next && insn->next->op == OP_PHI) ? insn->next : 0; + + --numInsns; + insn->bb = NULL; + insn->next = + insn->prev = NULL; +} + +void BasicBlock::permuteAdjacent(Instruction *a, Instruction *b) +{ + assert(a->bb == b->bb); + + if (a->next != b) { + Instruction *i = a; + a = b; + b = i; + } + assert(a->next == b); + assert(a->op != OP_PHI && b->op != OP_PHI); + + if (b == exit) + exit = a; + if (a == entry) + entry = b; + + b->prev = a->prev; + a->next = b->next; + b->next = a; + a->prev = b; + + if (b->prev) + b->prev->next = b; + if (a->prev) + a->next->prev = a; +} + +bool +BasicBlock::dominatedBy(BasicBlock *that) +{ + Graph::Node *bn = &that->dom; + Graph::Node *dn = &this->dom; + + while (dn && dn != bn) + dn = dn->parent(); + + return dn != NULL; +} + +unsigned int +BasicBlock::initiatesSimpleConditional() const +{ + Graph::Node *out[2]; + int n; + Graph::Edge::Type eR; + + if (cfg.outgoingCount() != 2) // -> if and -> else/endif + return false; + + n = 0; + for (Graph::EdgeIterator ei = cfg.outgoing(); !ei.end(); ei.next()) + out[n++] = ei.getNode(); + eR = out[1]->outgoing().getType(); + + // IF block is out edge to the right + if (eR == Graph::Edge::CROSS || eR == Graph::Edge::BACK) + return 0x2; + + if (out[1]->outgoingCount() != 1) // 0 is IF { RET; }, >1 is more divergence + return 0x0; + // do they reconverge immediately ? 
+ if (out[1]->outgoing().getNode() == out[0]) + return 0x1; + if (out[0]->outgoingCount() == 1) + if (out[0]->outgoing().getNode() == out[1]->outgoing().getNode()) + return 0x3; + + return 0x0; +} + +bool +Function::setEntry(BasicBlock *bb) +{ + if (cfg.getRoot()) + return false; + cfg.insert(&bb->cfg); + return true; +} + +bool +Function::setExit(BasicBlock *bb) +{ + if (cfgExit) + return false; + cfgExit = &bb->cfg; + return true; +} + +unsigned int +Function::orderInstructions(ArrayList &result) +{ + Iterator *iter; + for (iter = cfg.iteratorCFG(); !iter->end(); iter->next()) + for (Instruction *insn = BasicBlock::get(*iter)->getFirst(); + insn; insn = insn->next) + result.insert(insn, insn->serial); + cfg.putIterator(iter); + return result.getSize(); +} + +bool +Pass::run(Program *prog, bool ordered, bool skipPhi) +{ + this->prog = prog; + err = false; + return doRun(prog, ordered, skipPhi); +} + +bool +Pass::doRun(Program *prog, bool ordered, bool skipPhi) +{ + for (ArrayList::Iterator fi = prog->allFuncs.iterator(); + !fi.end(); fi.next()) { + Function *fn = reinterpret_cast(fi.get()); + if (!doRun(fn, ordered, skipPhi)) + return false; + } + return !err; +} + +bool +Pass::run(Function *func, bool ordered, bool skipPhi) +{ + prog = func->getProgram(); + err = false; + return doRun(func, ordered, skipPhi); +} + +bool +Pass::doRun(Function *func, bool ordered, bool skipPhi) +{ + Iterator *bbIter; + BasicBlock *bb; + Instruction *insn, *next; + + this->func = func; + if (!visit(func)) + return false; + + bbIter = ordered ? func->cfg.iteratorCFG() : func->cfg.iteratorDFS(); + + for (; !bbIter->end(); bbIter->next()) { + bb = BasicBlock::get(reinterpret_cast(bbIter->get())); + if (!visit(bb)) + break; + for (insn = skipPhi ? 
bb->getEntry() : bb->getFirst(); insn != NULL; + insn = next) { + next = insn->next; + if (!visit(insn)) + break; + } + } + func->cfg.putIterator(bbIter); + return !err; +} + +void +Function::printCFGraph(const char *filePath) +{ + FILE *out = fopen(filePath, "a"); + if (!out) { + ERROR("failed to open file: %s\n", filePath); + return; + } + INFO("printing control flow graph to: %s\n", filePath); + + fprintf(out, "digraph G {\n"); + + Iterator *iter; + for (iter = cfg.iteratorDFS(); !iter->end(); iter->next()) { + BasicBlock *bb = BasicBlock::get( + reinterpret_cast(iter->get())); + int idA = bb->getId(); + for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { + int idB = BasicBlock::get(ei.getNode())->getId(); + switch (ei.getType()) { + case Graph::Edge::TREE: + fprintf(out, "\t%i -> %i;\n", idA, idB); + break; + case Graph::Edge::FORWARD: + fprintf(out, "\t%i -> %i [color=green];\n", idA, idB); + break; + case Graph::Edge::CROSS: + fprintf(out, "\t%i -> %i [color=red];\n", idA, idB); + break; + case Graph::Edge::BACK: + fprintf(out, "\t%i -> %i;\n", idA, idB); + break; + case Graph::Edge::DUMMY: + fprintf(out, "\t%i -> %i [style=dotted];\n", idA, idB); + break; + default: + assert(0); + break; + } + } + } + cfg.putIterator(iter); + + fprintf(out, "}\n"); + fclose(out); +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.cpp new file mode 100644 index 00000000000..284736838ab --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.cpp @@ -0,0 +1,501 @@ + +#include "nv50_ir.h" +#include "nv50_ir_build_util.h" + +namespace nv50_ir { + +BuildUtil::BuildUtil() +{ + prog = NULL; + func = NULL; + bb = NULL; + pos = NULL; + + memset(imms, 0, sizeof(imms)); + immCount = 0; +} + +void +BuildUtil::addImmediate(ImmediateValue *imm) +{ + if (immCount > (NV50_IR_BUILD_IMM_HT_SIZE * 3) / 4) + return; + + unsigned int pos = 
u32Hash(imm->reg.data.u32); + + while (imms[pos]) + pos = (pos + 1) % NV50_IR_BUILD_IMM_HT_SIZE; + imms[pos] = imm; + immCount++; +} + +Instruction * +BuildUtil::mkOp1(operation op, DataType ty, Value *dst, Value *src) +{ + Instruction *insn = new_Instruction(func, op, ty); + + insn->setDef(0, dst); + insn->setSrc(0, src); + + insert(insn); + return insn; +} + +Instruction * +BuildUtil::mkOp2(operation op, DataType ty, Value *dst, + Value *src0, Value *src1) +{ + Instruction *insn = new_Instruction(func, op, ty); + + insn->setDef(0, dst); + insn->setSrc(0, src0); + insn->setSrc(1, src1); + + insert(insn); + return insn; +} + +Instruction * +BuildUtil::mkOp3(operation op, DataType ty, Value *dst, + Value *src0, Value *src1, Value *src2) +{ + Instruction *insn = new_Instruction(func, op, ty); + + insn->setDef(0, dst); + insn->setSrc(0, src0); + insn->setSrc(1, src1); + insn->setSrc(2, src2); + + insert(insn); + return insn; +} + +LValue * +BuildUtil::mkLoad(DataType ty, Symbol *mem, Value *ptr) +{ + Instruction *insn = new_Instruction(func, OP_LOAD, ty); + LValue *def = getScratch(); + + insn->setDef(0, def); + insn->setSrc(0, mem); + if (ptr) + insn->setIndirect(0, 0, ptr); + + insert(insn); + return def; +} + +Instruction * +BuildUtil::mkStore(operation op, DataType ty, Symbol *mem, Value *ptr, + Value *stVal) +{ + Instruction *insn = new_Instruction(func, op, ty); + + insn->setSrc(0, mem); + insn->setSrc(1, stVal); + if (ptr) + insn->setIndirect(0, 0, ptr); + + insert(insn); + return insn; +} + +Instruction * +BuildUtil::mkFetch(Value *dst, DataType ty, DataFile file, int32_t offset, + Value *attrRel, Value *primRel) +{ + Symbol *sym = mkSymbol(file, 0, ty, offset); + + Instruction *insn = mkOp1(OP_VFETCH, ty, dst, sym); + + insn->setIndirect(0, 0, attrRel); + insn->setIndirect(0, 1, primRel); + + // already inserted + return insn; +} + +Instruction * +BuildUtil::mkMov(Value *dst, Value *src, DataType ty) +{ + Instruction *insn = new_Instruction(func, OP_MOV, ty); 
+ + insn->setDef(0, dst); + insn->setSrc(0, src); + + insert(insn); + return insn; +} + +Instruction * +BuildUtil::mkMovToReg(int id, Value *src) +{ + Instruction *insn = new_Instruction(func, OP_MOV, typeOfSize(src->reg.size)); + + insn->setDef(0, new_LValue(func, FILE_GPR)); + insn->getDef(0)->reg.data.id = id; + insn->setSrc(0, src); + + insert(insn); + return insn; +} + +Instruction * +BuildUtil::mkMovFromReg(Value *dst, int id) +{ + Instruction *insn = new_Instruction(func, OP_MOV, typeOfSize(dst->reg.size)); + + insn->setDef(0, dst); + insn->setSrc(0, new_LValue(func, FILE_GPR)); + insn->getSrc(0)->reg.data.id = id; + + insert(insn); + return insn; +} + +Instruction * +BuildUtil::mkCvt(operation op, + DataType dstTy, Value *dst, DataType srcTy, Value *src) +{ + Instruction *insn = new_Instruction(func, op, dstTy); + + insn->setType(dstTy, srcTy); + insn->setDef(0, dst); + insn->setSrc(0, src); + + insert(insn); + return insn; +} + +Instruction * +BuildUtil::mkCmp(operation op, CondCode cc, DataType ty, Value *dst, + Value *src0, Value *src1, Value *src2) +{ + CmpInstruction *insn = new_CmpInstruction(func, op); + + insn->setType(dst->reg.file == FILE_PREDICATE ? 
TYPE_U8 : ty, ty); + insn->setCondition(cc); + insn->setDef(0, dst); + insn->setSrc(0, src0); + insn->setSrc(1, src1); + if (src2) + insn->setSrc(2, src2); + + insert(insn); + return insn; +} + +Instruction * +BuildUtil::mkTex(operation op, TexTarget targ, uint8_t tic, uint8_t tsc, + Value **def, Value **src) +{ + TexInstruction *tex = new_TexInstruction(func, op); + + for (int d = 0; d < 4 && def[d]; ++d) + tex->setDef(d, def[d]); + for (int s = 0; s < 4 && src[s]; ++s) + tex->setSrc(s, src[s]); + + tex->setTexture(targ, tic, tsc); + + return tex; +} + +Instruction * +BuildUtil::mkQuadop(uint8_t q, Value *def, uint8_t l, Value *src0, Value *src1) +{ + Instruction *quadop = mkOp2(OP_QUADOP, TYPE_F32, def, src0, src1); + quadop->subOp = q; + quadop->lanes = l; + return quadop; +} + +Instruction * +BuildUtil::mkSelect(Value *pred, Value *dst, Value *trSrc, Value *flSrc) +{ + Instruction *insn; + LValue *def0 = getSSA(); + LValue *def1 = getSSA(); + + mkMov(def0, trSrc)->setPredicate(CC_P, pred); + mkMov(def1, flSrc)->setPredicate(CC_NOT_P, pred); + + insn = mkOp2(OP_UNION, typeOfSize(dst->reg.size), dst, def0, def1); + + insert(insn); + return insn; +} + +FlowInstruction * +BuildUtil::mkFlow(operation op, BasicBlock *targ, CondCode cc, Value *pred) +{ + FlowInstruction *insn = new_FlowInstruction(func, op, targ); + + if (pred) + insn->setPredicate(cc, pred); + + insert(insn); + return insn; +} + +void +BuildUtil::mkClobber(DataFile f, uint32_t rMask, int unit) +{ + static const uint16_t baseSize2[16] = + { + 0x0000, 0x0010, 0x0011, 0x0020, 0x0012, 0x1210, 0x1211, 0x1220, + 0x0013, 0x1310, 0x1311, 0x0020, 0x1320, 0x0022, 0x2210, 0x0040, + }; + + int base = 0; + + for (; rMask; rMask >>= 4, base += 4) { + const uint32_t mask = rMask & 0xf; + if (!mask) + continue; + int base1 = (baseSize2[mask] >> 0) & 0xf; + int size1 = (baseSize2[mask] >> 4) & 0xf; + int base2 = (baseSize2[mask] >> 8) & 0xf; + int size2 = (baseSize2[mask] >> 12) & 0xf; + Instruction *insn = 
mkOp(OP_NOP, TYPE_NONE, NULL); + if (1) { // size1 can't be 0 + LValue *reg = new_LValue(func, f); + reg->reg.size = size1 << unit; + reg->reg.data.id = base + base1; + insn->setDef(0, reg); + } + if (size2) { + LValue *reg = new_LValue(func, f); + reg->reg.size = size2 << unit; + reg->reg.data.id = base + base2; + insn->setDef(1, reg); + } + } +} + +ImmediateValue * +BuildUtil::mkImm(uint32_t u) +{ + unsigned int pos = u32Hash(u); + + while (imms[pos] && imms[pos]->reg.data.u32 != u) + pos = (pos + 1) % NV50_IR_BUILD_IMM_HT_SIZE; + + ImmediateValue *imm = imms[pos]; + if (!imm) { + imm = new_ImmediateValue(prog, u); + addImmediate(imm); + } + return imm; +} + +ImmediateValue * +BuildUtil::mkImm(uint64_t u) +{ + ImmediateValue *imm = new_ImmediateValue(prog, (uint32_t)0); + + imm->reg.size = 8; + imm->reg.type = TYPE_U64; + imm->reg.data.u64 = u; + + return imm; +} + +ImmediateValue * +BuildUtil::mkImm(float f) +{ + union { + float f32; + uint32_t u32; + } u; + u.f32 = f; + return mkImm(u.u32); +} + +Value * +BuildUtil::loadImm(Value *dst, float f) +{ + return mkOp1v(OP_MOV, TYPE_F32, dst ? dst : getScratch(), mkImm(f)); +} + +Value * +BuildUtil::loadImm(Value *dst, uint32_t u) +{ + return mkOp1v(OP_MOV, TYPE_U32, dst ? dst : getScratch(), mkImm(u)); +} + +Value * +BuildUtil::loadImm(Value *dst, uint64_t u) +{ + return mkOp1v(OP_MOV, TYPE_U64, dst ? 
dst : getScratch(8), mkImm(u));
+}
+
+// Create a symbol in @file / @fileIndex of type @ty located at @baseAddr.
+Symbol *
+BuildUtil::mkSymbol(DataFile file, int8_t fileIndex, DataType ty,
+                    uint32_t baseAddr)
+{
+   Symbol *sym = new_Symbol(prog, file, fileIndex);
+
+   sym->setOffset(baseAddr);
+   sym->reg.type = ty;
+   sym->reg.size = typeSizeof(ty);
+
+   return sym;
+}
+
+// Create a FILE_SYSTEM_VALUE symbol for (@svName, @svIndex). The semantics
+// listed in the switch are typed F32, every other one U32.
+Symbol *
+BuildUtil::mkSysVal(SVSemantic svName, uint32_t svIndex)
+{
+   Symbol *sym = new_Symbol(prog, FILE_SYSTEM_VALUE, 0);
+
+   assert(svIndex < 4 ||
+          (svName == SV_CLIP_DISTANCE || svName == SV_TESS_FACTOR));
+
+   switch (svName) {
+   case SV_POSITION:
+   case SV_FACE:
+   case SV_YDIR:
+   case SV_POINT_SIZE:
+   case SV_POINT_COORD:
+   case SV_CLIP_DISTANCE:
+   case SV_TESS_FACTOR:
+      sym->reg.type = TYPE_F32;
+      break;
+   default:
+      sym->reg.type = TYPE_U32;
+      break;
+   }
+   sym->reg.size = typeSizeof(sym->reg.type);
+
+   sym->reg.data.sv.sv = svName;
+   sym->reg.data.sv.index = svIndex;
+
+   return sym;
+}
+
+// Reset to an empty, register-only array; shared by both constructors.
+void
+BuildUtil::DataArray::init()
+{
+   values = NULL;
+   baseAddr = 0;
+   arrayLen = 0;
+   baseSym = NULL; // only assigned by setup() for memory files
+
+   vecDim = 4;
+   eltSize = 2;
+
+   file = FILE_GPR;
+   regOnly = true;
+}
+
+// The parent is attached later through setParent(), which asserts that
+// 'up' is still NULL, so it must not be left uninitialised here.
+BuildUtil::DataArray::DataArray() : up(NULL)
+{
+   init();
+}
+
+BuildUtil::DataArray::DataArray(BuildUtil *bld) : up(bld)
+{
+   init();
+}
+
+BuildUtil::DataArray::~DataArray()
+{
+   if (values)
+      delete[] values;
+}
+
+// Describe the array: @len elements of @v components, each @size bytes,
+// living in file @f at byte address @base. The value cache is (re)allocated
+// and zeroed; a base symbol is created only for memory files.
+void
+BuildUtil::DataArray::setup(uint32_t base, int len, int v, int size,
+                            DataFile f, int8_t fileIndex)
+{
+   baseAddr = base;
+   arrayLen = len;
+
+   vecDim = v;
+   eltSize = size;
+
+   file = f;
+   regOnly = !isMemoryFile(f);
+
+   delete[] values; // NULL after init(); avoids a leak on repeated setup()
+   values = new Value * [arrayLen * vecDim];
+   if (values)
+      memset(values, 0, arrayLen * vecDim * sizeof(Value *));
+
+   if (!regOnly) {
+      baseSym = new_Symbol(up->getProgram(), file, fileIndex);
+      baseSym->setOffset(baseAddr);
+      baseSym->reg.size = size;
+   }
+}
+
+Value *
+BuildUtil::DataArray::acquire(int i, int c)
+{
+   const unsigned int idx = i * vecDim + c;
+
+   assert(idx < arrayLen * vecDim);
+
+   if (regOnly) {
+      const unsigned int idx = i * 4 + c; // vecDim always 4 if regOnly
+      if
(!values[idx])
+         values[idx] = new_LValue(up->getFunction(), file);
+      return values[idx];
+   } else {
+      return up->getScratch();
+   }
+}
+
+// Read component @c of element @i. Register arrays hand back the cached
+// LValue (created on first use); memory arrays emit a load through a
+// per-slot symbol that is created lazily and cached in values[].
+Value *
+BuildUtil::DataArray::load(int i, int c, Value *ptr)
+{
+   const unsigned int idx = i * vecDim + c;
+
+   assert(idx < arrayLen * vecDim);
+
+   if (regOnly) {
+      if (!values[idx])
+         values[idx] = new_LValue(up->getFunction(), file);
+      return values[idx];
+   } else {
+      // for memory files values[] caches Symbols, not LValues
+      Symbol *sym = reinterpret_cast<Symbol *>(values[idx]);
+      if (!sym)
+         values[idx] = sym = this->mkSymbol(i, c, baseSym);
+      return up->mkLoad(typeOfSize(eltSize), sym, ptr);
+   }
+}
+
+// Write @value to component @c of element @i. Register arrays just record
+// the value (indirect @ptr is not allowed); memory arrays emit an OP_STORE.
+void
+BuildUtil::DataArray::store(int i, int c, Value *ptr, Value *value)
+{
+   const unsigned int idx = i * vecDim + c;
+
+   assert(idx < arrayLen * vecDim);
+
+   if (regOnly) {
+      assert(!ptr);
+      assert(!values[idx] || values[idx] == value);
+      values[idx] = value;
+   } else {
+      Symbol *sym = reinterpret_cast<Symbol *>(values[idx]);
+      if (!sym)
+         values[idx] = sym = this->mkSymbol(i, c, baseSym);
+      up->mkStore(OP_STORE, typeOfSize(value->reg.size), sym, ptr, value);
+   }
+}
+
+// Create the symbol addressing component @c of element @i, relative to
+// @base, at byte offset baseAddr + (i * vecDim + c) * eltSize.
+Symbol *
+BuildUtil::DataArray::mkSymbol(int i, int c, Symbol *base)
+{
+   const unsigned int idx = i * vecDim + c;
+
+   Symbol *sym = new_Symbol(up->getProgram(), file, 0);
+
+   assert(base || (idx < arrayLen && c < vecDim));
+
+   sym->reg.size = eltSize;
+   sym->reg.type = typeOfSize(eltSize);
+
+   sym->setAddress(base, baseAddr + idx * eltSize);
+   return sym;
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h
new file mode 100644
index 00000000000..4c3addb27e4
--- /dev/null
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h
@@ -0,0 +1,245 @@
+
+#ifndef __NV50_IR_BUILD_UTIL__
+#define __NV50_IR_BUILD_UTIL__
+
+namespace nv50_ir {
+
+class BuildUtil
+{
+public:
+   BuildUtil();
+
+   inline void setProgram(Program *);
+   inline Program *getProgram() const { return prog; }
+   inline Function *getFunction() const { return func; }
+
+   //
keeps inserting at head/tail of block + inline void setPosition(BasicBlock *, bool tail); + // position advances only if @after is true + inline void setPosition(Instruction *, bool after); + + inline BasicBlock *getBB() { return bb; } + + inline void insert(Instruction *); + inline void remove(Instruction *i) { assert(i->bb == bb); bb->remove(i); } + + inline LValue *getScratch(int size = 4); + inline LValue *getSSA(int size = 4); // scratch value for a single assignment + + inline Instruction *mkOp(operation, DataType, Value *); + Instruction *mkOp1(operation, DataType, Value *, Value *); + Instruction *mkOp2(operation, DataType, Value *, Value *, Value *); + Instruction *mkOp3(operation, DataType, Value *, Value *, Value *, Value *); + + LValue *mkOp1v(operation, DataType, Value *, Value *); + LValue *mkOp2v(operation, DataType, Value *, Value *, Value *); + LValue *mkOp3v(operation, DataType, Value *, Value *, Value *, Value *); + + LValue *mkLoad(DataType, Symbol *, Value *ptr); + Instruction *mkStore(operation, DataType, Symbol *, Value *ptr, Value *val); + + Instruction *mkMov(Value *, Value *, DataType = TYPE_U32); + Instruction *mkMovToReg(int id, Value *); + Instruction *mkMovFromReg(Value *, int id); + + Instruction *mkFetch(Value *, DataType, DataFile, int32_t offset, + Value *attrRel, Value *primRel); + + Instruction *mkCvt(operation, DataType, Value *, DataType, Value *); + Instruction *mkCmp(operation, CondCode, DataType, + Value *, + Value *, Value *, Value * = NULL); + Instruction *mkTex(operation, TexTarget, uint8_t tic, uint8_t tsc, + Value **def, Value **src); + Instruction *mkQuadop(uint8_t qop, Value *, uint8_t l, Value *, Value *); + + FlowInstruction *mkFlow(operation, BasicBlock *target, + CondCode, Value *pred); + + Instruction *mkSelect(Value *pred, Value *dst, Value *trSrc, Value *flSrc); + + void mkClobber(DataFile file, uint32_t regMask, int regUnitLog2); + + ImmediateValue *mkImm(float); + ImmediateValue *mkImm(uint32_t); + 
ImmediateValue *mkImm(uint64_t);
+
+   ImmediateValue *mkImm(int i) { return mkImm((uint32_t)i); }
+
+   Value *loadImm(Value *dst, float);
+   Value *loadImm(Value *dst, uint32_t);
+   Value *loadImm(Value *dst, uint64_t);
+
+   Value *loadImm(Value *dst, int i) { return loadImm(dst, (uint32_t)i); }
+
+   // Array of vecDim-component elements that lives either purely in
+   // registers or in a memory file (accessed through symbols).
+   class DataArray
+   {
+   public:
+      DataArray();
+      DataArray(BuildUtil *);
+      ~DataArray();
+
+      inline void setParent(BuildUtil *bld) { assert(!up); up = bld; }
+
+      void setup(uint32_t base, int len, int vecDim, int size,
+                 DataFile, int8_t fileIndex = 0);
+
+      inline bool exists(unsigned int i, unsigned int c);
+
+      Value *load(int i, int c, Value *ptr);
+      void store(int i, int c, Value *ptr, Value *value);
+      Value *acquire(int i, int c);
+
+   private:
+      Symbol *mkSymbol(int i, int c, Symbol *base);
+
+   private:
+      Value **values;
+      uint32_t baseAddr;
+      uint32_t arrayLen;
+      Symbol *baseSym;
+
+      uint8_t vecDim;
+      uint8_t eltSize; // in bytes
+
+      DataFile file;
+      bool regOnly;
+
+      BuildUtil *up;
+
+      void init();
+   };
+
+   Symbol *mkSymbol(DataFile file, int8_t fileIndex,
+                    DataType ty, uint32_t baseAddress);
+
+   Symbol *mkSysVal(SVSemantic svName, uint32_t svIndex);
+
+private:
+   void addImmediate(ImmediateValue *);
+   inline unsigned int u32Hash(uint32_t);
+
+protected:
+   Program *prog;
+   Function *func;
+   Instruction *pos;
+   BasicBlock *bb;
+   bool tail;
+
+#define NV50_IR_BUILD_IMM_HT_SIZE 256
+
+   ImmediateValue *imms[NV50_IR_BUILD_IMM_HT_SIZE];
+   unsigned int immCount;
+};
+
+// Start slot for @u in the imms[] hash table.
+unsigned int BuildUtil::u32Hash(uint32_t u)
+{
+   return (u % 273) % NV50_IR_BUILD_IMM_HT_SIZE;
+}
+
+void BuildUtil::setProgram(Program *program)
+{
+   prog = program;
+}
+
+// Insert at the head (or the tail if @atTail) of @block from now on.
+void
+BuildUtil::setPosition(BasicBlock *block, bool atTail)
+{
+   bb = block;
+   prog = bb->getProgram();
+   func = bb->getFunction();
+   pos = NULL;
+   tail = atTail;
+}
+
+// Insert relative to instruction @i: after it if @after, else before it.
+void
+BuildUtil::setPosition(Instruction *i, bool after)
+{
+   bb = i->bb;
+   assert(bb); // must be checked before bb is dereferenced below
+   prog = bb->getProgram();
+   func = bb->getFunction();
+   pos = i;
+   tail = after;
+}
+
+LValue * +BuildUtil::getScratch(int size) +{ + LValue *lval = new_LValue(func, FILE_GPR); + if (size != 4) + lval->reg.size = size; + return lval; +} + +LValue * +BuildUtil::getSSA(int size) +{ + LValue *lval = new_LValue(func, FILE_GPR); + lval->ssa = 1; + if (size != 4) + lval->reg.size = size; + return lval; +} + +void BuildUtil::insert(Instruction *i) +{ + if (!pos) { + tail ? bb->insertTail(i) : bb->insertHead(i); + } else { + if (tail) { + bb->insertAfter(pos, i); + pos = i; + } else { + bb->insertBefore(pos, i); + } + } +} + +Instruction * +BuildUtil::mkOp(operation op, DataType ty, Value *dst) +{ + Instruction *insn = new_Instruction(func, op, ty); + insn->setDef(0, dst); + insert(insn); + if (op == OP_DISCARD || op == OP_EXIT || + op == OP_JOIN || + op == OP_QUADON || op == OP_QUADPOP || + op == OP_EMIT || op == OP_RESTART) + insn->fixed = 1; + return insn; +} + +inline LValue * +BuildUtil::mkOp1v(operation op, DataType ty, Value *dst, Value *src) +{ + mkOp1(op, ty, dst, src); + return dst->asLValue(); +} + +inline LValue * +BuildUtil::mkOp2v(operation op, DataType ty, Value *dst, + Value *src0, Value *src1) +{ + mkOp2(op, ty, dst, src0, src1); + return dst->asLValue(); +} + +inline LValue * +BuildUtil::mkOp3v(operation op, DataType ty, Value *dst, + Value *src0, Value *src1, Value *src2) +{ + mkOp3(op, ty, dst, src0, src1, src2); + return dst->asLValue(); +} + +bool +BuildUtil::DataArray::exists(unsigned int i, unsigned int c) +{ + assert(i < arrayLen && c < vecDim); + return !regOnly || values[i * vecDim + c]; +} + +} // namespace nv50_ir + +#endif // __NV50_IR_BUILD_UTIL_H__ diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_driver.h b/src/gallium/drivers/nv50/codegen/nv50_ir_driver.h new file mode 100644 index 00000000000..27e435d4ea1 --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_driver.h @@ -0,0 +1,149 @@ + +#ifndef __NV50_IR_DRIVER_H__ +#define __NV50_IR_DRIVER_H__ + +#include "pipe/p_shader_tokens.h" + +#include "tgsi/tgsi_util.h" 
+#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_scan.h" + +/* + * This struct constitutes linkage information in TGSI terminology. + * + * It is created by the code generator and handed to the pipe driver + * for input/output slot assignment. + */ +struct nv50_ir_varying +{ + uint8_t slot[4]; /* native slots for xyzw (addresses in 32-bit words) */ + + unsigned mask : 4; /* vec4 mask */ + unsigned linear : 1; /* linearly interpolated if true (and not flat) */ + unsigned flat : 1; + unsigned centroid : 1; + unsigned patch : 1; /* patch constant value */ + unsigned regular : 1; /* driver-specific meaning (e.g. input in sreg) */ + unsigned input : 1; /* indicates direction of system values */ + unsigned oread : 1; /* true if output is read from parallel TCP */ + + ubyte id; /* TGSI register index */ + ubyte sn; /* TGSI semantic name */ + ubyte si; /* TGSI semantic index */ +}; + +#define NV50_PROGRAM_IR_TGSI 0 +#define NV50_PROGRAM_IR_SM4 1 +#define NV50_PROGRAM_IR_GLSL 2 +#define NV50_PROGRAM_IR_LLVM 3 + +#ifdef DEBUG +# define NV50_IR_DEBUG_BASIC (1 << 0) +# define NV50_IR_DEBUG_VERBOSE (2 << 0) +# define NV50_IR_DEBUG_REG_ALLOC (1 << 2) +#else +# define NV50_IR_DEBUG_BASIC 0 +# define NV50_IR_DEBUG_VERBOSE 0 +# define NV50_IR_DEBUG_REG_ALLOC 0 +#endif + +struct nv50_ir_prog_info +{ + uint16_t target; /* chipset (0x50, 0x84, 0xc0, ...) 
*/ + + uint8_t type; /* PIPE_SHADER */ + + uint8_t optLevel; /* optimization level (0 to 3) */ + uint8_t dbgFlags; + + struct { + int16_t maxGPR; /* may be -1 if none used */ + int16_t maxOutput; + uint32_t tlsSpace; /* required local memory per thread */ + uint32_t *code; + uint32_t codeSize; + uint8_t sourceRep; /* NV50_PROGRAM_IR */ + const void *source; + void *relocData; + } bin; + + struct nv50_ir_varying sv[PIPE_MAX_SHADER_INPUTS]; + struct nv50_ir_varying in[PIPE_MAX_SHADER_INPUTS]; + struct nv50_ir_varying out[PIPE_MAX_SHADER_OUTPUTS]; + uint8_t numInputs; + uint8_t numOutputs; + uint8_t numPatchConstants; /* also included in numInputs/numOutputs */ + uint8_t numSysVals; + + struct { + uint32_t *buf; /* for IMMEDIATE_ARRAY */ + uint16_t bufSize; /* size of immediate array */ + uint16_t count; /* count of inline immediates */ + uint32_t *data; /* inline immediate data */ + uint8_t *type; /* for each vec4 (128 bit) */ + } immd; + + union { + struct { + uint32_t inputMask[4]; /* mask of attributes read (1 bit per scalar) */ + } vp; + struct { + uint8_t inputPatchSize; + uint8_t outputPatchSize; + uint8_t partitioning; /* PIPE_TESS_PART */ + int8_t winding; /* +1 (clockwise) / -1 (counter-clockwise) */ + uint8_t domain; /* PIPE_PRIM_{QUADS,TRIANGLES,LINES} */ + uint8_t outputPrim; /* PIPE_PRIM_{TRIANGLES,LINES,POINTS} */ + } tp; + struct { + uint8_t inputPrim; + uint8_t outputPrim; + unsigned instanceCount; + unsigned maxVertices; + } gp; + struct { + unsigned numColourResults; + boolean writesDepth; + boolean earlyFragTests; + boolean separateFragData; + boolean usesDiscard; + } fp; + } prop; + + struct { + uint8_t clipDistance; /* index of first clip distance output */ + uint8_t clipDistanceCount; + uint8_t cullDistanceMask; /* clip distance mode (1 bit per output) */ + uint8_t pointSize; /* output index for PointSize */ + uint8_t edgeFlagIn; + uint8_t edgeFlagOut; + uint8_t fragDepth; /* output index of FragDepth */ + uint8_t sampleMask; /* output index of 
SampleMask */ + uint8_t backFaceColor[2]; /* input/output indices of back face colour */ + uint8_t globalAccess; /* 1 for read, 2 for wr, 3 for rw */ + } io; + + /* driver callback to assign input/output locations */ + int (*assignSlots)(struct nv50_ir_prog_info *); +}; + +#ifdef __cplusplus +extern "C" { +#endif + +extern int nv50_ir_generate_code(struct nv50_ir_prog_info *); + +extern void nv50_ir_relocate_code(void *relocData, uint32_t *code, + uint32_t codePos, + uint32_t libPos, + uint32_t dataPos); + +/* obtain code that will be shared among programs */ +extern void nv50_ir_get_target_library(uint32_t chipset, + const uint32_t **code, uint32_t *size); + +#ifdef __cplusplus +} +#endif + +#endif // __NV50_IR_DRIVER_H__ diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_emit_nv50.cpp new file mode 100644 index 00000000000..0a61a1ddaef --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_emit_nv50.cpp @@ -0,0 +1,1333 @@ + +#include "nv50_ir.h" +#include "nv50_ir_target.h" + +namespace nv50_ir { + +class CodeEmitterNV50 : public CodeEmitter +{ +public: + CodeEmitterNV50(const Target *); + + virtual bool emitInstruction(Instruction *); + + virtual uint32_t getMinEncodingSize(const Instruction *) const; + + inline void setProgramType(Program::Type pType) { progType = pType; } + +private: + const Target *targ; + + Program::Type progType; + +private: + inline void defId(const ValueDef&, const int pos); + inline void srcId(const ValueRef&, const int pos); + inline void srcId(const ValueRef *, const int pos); + + inline void srcAddr16(const ValueRef&, const int pos); + inline void srcAddr8(const ValueRef&, const int pos); + + void emitFlagsRd(const Instruction *); + void emitFlagsWr(const Instruction *); + + void emitCondCode(CondCode cc, int pos); + + inline void setARegBits(unsigned int); + + void setAReg16(const Instruction *, int s); + void setImmediate(const Instruction *, int s); + + void 
setDst(const Value *); + void setDst(const Instruction *, int d); + void emitSrc0(const ValueRef&); + void emitSrc1(const ValueRef&); + void emitSrc2(const ValueRef&); + + void emitForm_MAD(const Instruction *); + void emitForm_ADD(const Instruction *); + void emitForm_MUL(const Instruction *); + void emitForm_IMM(const Instruction *); + + void emitLoadStoreSize(DataType ty, int pos); + + void roundMode_MAD(const Instruction *); + void roundMode_CVT(RoundMode); + + void emitMNeg12(const Instruction *); + + void emitLOAD(const Instruction *); + void emitSTORE(const Instruction *); + void emitMOV(const Instruction *); + void emitNOP(); + void emitINTERP(const Instruction *); + void emitPFETCH(const Instruction *); + void emitOUT(const Instruction *); + + void emitUADD(const Instruction *); + void emitAADD(const Instruction *); + void emitFADD(const Instruction *); + void emitUMUL(const Instruction *); + void emitFMUL(const Instruction *); + void emitFMAD(const Instruction *); + + void emitMINMAX(const Instruction *); + + void emitPreOp(const Instruction *); + void emitSFnOp(const Instruction *, uint8_t subOp); + + void emitShift(const Instruction *); + void emitARL(const Instruction *); + void emitLogicOp(const Instruction *); + + void emitCVT(const Instruction *); + void emitSET(const Instruction *); + + void emitTEX(const TexInstruction *); + + void emitQUADOP(const Instruction *, uint8_t lane, uint8_t quOp); + + void emitFlow(const Instruction *, uint8_t flowOp); +}; + +#define SDATA(a) ((a).rep()->reg.data) +#define DDATA(a) ((a).rep()->reg.data) + +void CodeEmitterNV50::srcId(const ValueRef& src, const int pos) +{ + assert(src.get()); + code[pos / 32] |= SDATA(src).id << (pos % 32); +} + +void CodeEmitterNV50::srcId(const ValueRef *src, const int pos) +{ + assert(src->get()); + code[pos / 32] |= SDATA(*src).id << (pos % 32); +} + +void CodeEmitterNV50::srcAddr16(const ValueRef& src, const int pos) +{ + assert(src.get()); + + uint32_t offset = SDATA(src).offset; 
+ + assert(offset <= 0xffff && (pos % 32) <= 16); + + code[pos / 32] |= offset << (pos % 32); +} + +void CodeEmitterNV50::srcAddr8(const ValueRef& src, const int pos) +{ + assert(src.get()); + + uint32_t offset = SDATA(src).offset; + + assert(offset <= 0x1fc && !(offset & 0x3)); + + code[pos / 32] |= (offset >> 2) << (pos % 32); +} + +void CodeEmitterNV50::defId(const ValueDef& def, const int pos) +{ + assert(def.get()); + code[pos / 32] |= DDATA(def).id << (pos % 32); +} + +void +CodeEmitterNV50::roundMode_MAD(const Instruction *insn) +{ + switch (insn->rnd) { + case ROUND_M: code[1] |= 1 << 22; break; + case ROUND_P: code[1] |= 2 << 22; break; + case ROUND_Z: code[1] |= 3 << 22; break; + default: + assert(insn->rnd == ROUND_N); + break; + } +} + +void +CodeEmitterNV50::emitMNeg12(const Instruction *i) +{ + code[1] |= i->src[0].mod.neg() << 26; + code[1] |= i->src[1].mod.neg() << 27; +} + +void CodeEmitterNV50::emitCondCode(CondCode cc, int pos) +{ + uint8_t enc; + + assert(pos >= 32 || pos <= 27); + + switch (cc) { + case CC_LT: enc = 0x1; break; + case CC_LTU: enc = 0x9; break; + case CC_EQ: enc = 0x2; break; + case CC_EQU: enc = 0xa; break; + case CC_LE: enc = 0x3; break; + case CC_LEU: enc = 0xb; break; + case CC_GT: enc = 0x4; break; + case CC_GTU: enc = 0xc; break; + case CC_NE: enc = 0x5; break; + case CC_NEU: enc = 0xd; break; + case CC_GE: enc = 0x6; break; + case CC_GEU: enc = 0xe; break; + case CC_TR: enc = 0xf; break; + case CC_FL: enc = 0x0; break; + + case CC_O: enc = 0x10; break; + case CC_C: enc = 0x11; break; + case CC_A: enc = 0x12; break; + case CC_S: enc = 0x13; break; + case CC_NS: enc = 0x1c; break; + case CC_NA: enc = 0x1d; break; + case CC_NC: enc = 0x1e; break; + case CC_NO: enc = 0x1f; break; + + default: + enc = 0; + assert(!"invalid condition code"); + break; + } + code[pos / 32] |= enc << (pos % 32); +} + +void +CodeEmitterNV50::emitFlagsRd(const Instruction *i) +{ + int s = (i->flagsSrc >= 0) ? 
i->flagsSrc : i->predSrc; + + assert(!(code[1] & 0x00003f80)); + + if (s >= 0) { + assert(i->getSrc(s)->reg.file == FILE_FLAGS); + emitCondCode(i->cc, 32 + 7); + srcId(i->src[s], 32 + 12); + } else { + code[1] |= 0x0780; + } +} + +void +CodeEmitterNV50::emitFlagsWr(const Instruction *i) +{ + assert(!(code[1] & 0x70)); + + if (i->flagsDef >= 0) + code[1] |= (DDATA(i->def[i->flagsDef]).id << 4) | 0x40; +} + +void +CodeEmitterNV50::setARegBits(unsigned int u) +{ + code[0] |= (u & 3) << 26; + code[1] |= (u & 4); +} + +void +CodeEmitterNV50::setAReg16(const Instruction *i, int s) +{ + s = i->src[s].indirect[0]; + if (s >= 0) + setARegBits(SDATA(i->src[s]).id + 1); +} + +void +CodeEmitterNV50::setImmediate(const Instruction *i, int s) +{ + const ImmediateValue *imm = i->src[s].get()->asImm(); + assert(imm); + + code[1] |= 3; + code[0] |= (imm->reg.data.u32 & 0x3f) << 16; + code[1] |= (imm->reg.data.u32 >> 6) << 2; +} + +void +CodeEmitterNV50::setDst(const Value *dst) +{ + const Storage *reg = &dst->join->reg; + + assert(reg->file != FILE_ADDRESS); + + if (reg->data.id < 0) { + code[0] |= (127 << 2) | 1; + code[1] |= 8; + } else { + if (reg->file == FILE_SHADER_OUTPUT) + code[1] |= 8; + code[0] |= reg->data.id << 2; + } +} + +void +CodeEmitterNV50::setDst(const Instruction *i, int d) +{ + if (i->defExists(d)) { + setDst(i->getDef(d)); + } else + if (!d) { + code[0] |= 0x01fc; // bit bucket + code[1] |= 0x0008; + } +} + +void +CodeEmitterNV50::emitSrc0(const ValueRef& ref) +{ + const Storage *reg = &ref.rep()->reg; + + if (reg->file == FILE_SHADER_INPUT) + code[1] |= 0x00200000; + else + if (reg->file != FILE_GPR) + ERROR("invalid src0 register file: %d\n", reg->file); + + assert(reg->data.id < 128); + code[0] |= reg->data.id << 9; +} + +void +CodeEmitterNV50::emitSrc1(const ValueRef& ref) +{ + const Storage *reg = &ref.rep()->reg; + + if (reg->file == FILE_MEMORY_CONST) { + assert(!(code[1] & 0x01800000)); + code[0] |= 1 << 23; + code[1] |= reg->fileIndex << 22; + } else 
+   if (reg->file != FILE_GPR) {
+      ERROR("invalid src1 register file: %d\n", reg->file);
+   }
+
+   assert(reg->data.id < 128);
+   code[0] |= reg->data.id << 16;
+}
+
+// Encode @ref in source slot 2 (register id at bit 14 of code[1]).
+// A constant-buffer source sets bit 24 of code[0] and places the buffer
+// index at bit 22 of code[1]; any file other than that or GPR is reported.
+void
+CodeEmitterNV50::emitSrc2(const ValueRef& ref)
+{
+   const Storage *reg = &ref.rep()->reg;
+
+   if (reg->file == FILE_MEMORY_CONST) {
+      assert(!(code[1] & 0x01800000));
+      code[0] |= 1 << 24;
+      code[1] |= reg->fileIndex << 22;
+   } else
+   if (reg->file != FILE_GPR) {
+      ERROR("invalid src2 register file: %d\n", reg->file);
+   }
+
+   assert(reg->data.id < 128);
+   code[1] |= reg->data.id << 14;
+}
+
+// the default form:
+//  - long instruction
+//  - 1 to 3 sources in slots 0, 1, 2
+//  - address & flags
+void
+CodeEmitterNV50::emitForm_MAD(const Instruction *i)
+{
+   assert(i->encSize == 8);
+   code[0] |= 1;
+
+   emitFlagsRd(i);
+   emitFlagsWr(i);
+
+   setDst(i, 0);
+
+   if (i->srcExists(0))
+      emitSrc0(i->src[0]);
+
+   if (i->srcExists(1))
+      emitSrc1(i->src[1]);
+
+   if (i->srcExists(2))
+      emitSrc2(i->src[2]);
+
+   setAReg16(i, 1);
+}
+
+// like default form, but 2nd source in slot 2, and no 3rd source
+void
+CodeEmitterNV50::emitForm_ADD(const Instruction *i)
+{
+   assert(i->encSize == 8);
+   code[0] |= 1;
+
+   emitFlagsRd(i);
+   emitFlagsWr(i);
+
+   setDst(i, 0);
+
+   if (i->srcExists(0))
+      emitSrc0(i->src[0]);
+
+   if (i->srcExists(1))
+      emitSrc2(i->src[1]);
+
+   setAReg16(i, 1);
+}
+
+// default short form
+void
+CodeEmitterNV50::emitForm_MUL(const Instruction *i)
+{
+   assert(i->encSize == 4 && !(code[0] & 1));
+   assert(i->defExists(0));
+   assert(!i->getPredicate());
+
+   setDst(i, 0);
+
+   if (i->srcExists(0))
+      emitSrc0(i->src[0]);
+
+   if (i->srcExists(1))
+      emitSrc1(i->src[1]);
+}
+
+// usual immediate form
+//  - 1 to 3 sources where last is immediate
+//  - no address or predicate possible
+void
+CodeEmitterNV50::emitForm_IMM(const Instruction *i)
+{
+   assert(i->encSize == 8);
+   code[0] |= 1;
+
+   assert(i->defExists(0) && i->srcExists(0));
+
+   setDst(i, 0);
+
+   if (i->srcExists(2)) {
+      emitSrc0(i->src[0]);
+
emitSrc1(i->src[1]); + setImmediate(i, 2); + } else + if (i->srcExists(1)) { + emitSrc0(i->src[0]); + setImmediate(i, 1); + } else { + setImmediate(i, 0); + } +} + +void +CodeEmitterNV50::emitLoadStoreSize(DataType ty, int pos) +{ + uint8_t enc; + + switch (ty) { + case TYPE_F32: // fall through + case TYPE_S32: // fall through + case TYPE_U32: enc = 0x6; break; + case TYPE_B128: enc = 0x5; break; + case TYPE_F64: enc = 0x4; break; + case TYPE_S16: enc = 0x3; break; + case TYPE_U16: enc = 0x2; break; + case TYPE_S8: enc = 0x1; break; + case TYPE_U8: enc = 0x0; break; + default: + enc = 0; + assert(!"invalid load/store type"); + break; + } + code[pos / 32] |= enc << (pos % 32); +} + +void +CodeEmitterNV50::emitLOAD(const Instruction *i) +{ + DataFile sf = i->src[0].getFile(); + + switch (sf) { + case FILE_SHADER_INPUT: + code[0] = 0x10000001; + code[1] = 0x04200000 | (i->lanes << 14); + break; + case FILE_MEMORY_CONST: + code[0] = 0x10000001; + code[1] = 0x24000000 | (i->getSrc(0)->reg.fileIndex << 22); + break; + case FILE_MEMORY_LOCAL: + code[0] = 0xd0000001; + code[1] = 0x40000000; + break; + case FILE_MEMORY_GLOBAL: + code[0] = 0xd0000001 | (i->getSrc(0)->reg.fileIndex << 16); + code[1] = 0x80000000; + break; + default: + assert(!"invalid load source file"); + break; + } + if (sf == FILE_MEMORY_LOCAL || + sf == FILE_MEMORY_GLOBAL) + emitLoadStoreSize(i->sType, 21 + 32); + + setDst(i, 0); + + emitFlagsRd(i); + emitFlagsWr(i); + + if (i->src[0].getFile() == FILE_MEMORY_GLOBAL) { + srcId(*i->src[0].getIndirect(0), 9); + } else { + setAReg16(i, 0); + srcAddr16(i->src[0], 9); + } +} + +void +CodeEmitterNV50::emitSTORE(const Instruction *i) +{ + DataFile f = i->getSrc(0)->reg.file; + int32_t offset = i->getSrc(0)->reg.data.offset; + + switch (f) { + case FILE_SHADER_OUTPUT: + code[0] = 0x00000001 | ((offset >> 2) << 2); + code[1] = 0x80c00000; + srcId(i->src[1], 32 + 15); + break; + case FILE_MEMORY_GLOBAL: + code[0] = 0xd0000000; + code[1] = 0xa0000000; + 
emitLoadStoreSize(i->dType, 21 + 32); + break; + case FILE_MEMORY_LOCAL: + code[0] = 0xd0000001; + code[1] = 0x60000000; + emitLoadStoreSize(i->dType, 21 + 32); + break; + case FILE_MEMORY_SHARED: + code[0] = 0x00000001; + code[1] = 0xe0000000; + switch (typeSizeof(i->dType)) { + case 1: + code[0] |= offset << 9; + code[1] |= 0x00400000; + break; + case 2: + code[0] |= (offset >> 1) << 9; + break; + case 4: + code[0] |= (offset >> 2) << 9; + code[1] |= 0x04000000; + break; + default: + assert(0); + break; + } + break; + default: + assert(!"invalid store destination file"); + break; + } + + if (f != FILE_SHADER_OUTPUT) { + srcId(i->src[1], 2); + if (f == FILE_MEMORY_GLOBAL) + srcId(*i->src[0].getIndirect(0), 9); + if (f == FILE_MEMORY_LOCAL) + srcAddr16(i->src[0], 9); + } + if (f != FILE_MEMORY_GLOBAL) + setAReg16(i, 0); + + emitFlagsRd(i); +} + +void +CodeEmitterNV50::emitMOV(const Instruction *i) +{ + DataFile sf = i->getSrc(0)->reg.file; + DataFile df = i->getDef(0)->reg.file; + + assert(sf == FILE_GPR || df == FILE_GPR); + + if (sf == FILE_FLAGS) { + code[0] = 0x00000001; + code[1] = 0x20000000; + defId(i->def[0], 2); + srcId(i->src[0], 12); + emitFlagsRd(i); + } else + if (sf == FILE_ADDRESS) { + code[0] = 0x00000001; + code[1] = 0x40000000; + defId(i->def[0], 2); + setARegBits(SDATA(i->src[0]).id + 1); + } else + if (df == FILE_FLAGS) { + code[0] = 0x00000001; + code[1] = 0xa0000000; + defId(i->def[0], 4); + srcId(i->src[0], 9); + emitFlagsRd(i); + } else + if (sf == FILE_IMMEDIATE) { + code[0] = 0x10008001; + code[1] = 0x00000003; + emitForm_IMM(i); + } else { + if (i->encSize == 4) { + code[0] = 0x10008000; + } else { + code[0] = 0x10000001; + code[1] = 0x04000000 | (i->lanes << 14); + } + defId(i->def[0], 2); + srcId(i->src[0], 9); + } + if (df == FILE_SHADER_OUTPUT) { + assert(i->encSize == 8); + code[1] |= 0x8; + } +} + +void +CodeEmitterNV50::emitNOP() +{ + code[0] = 0xf0000001; + code[1] = 0xe0000000; +} + +void +CodeEmitterNV50::emitQUADOP(const 
Instruction *i, uint8_t lane, uint8_t quOp)
+{
+   code[0] = 0xc0000000 | (lane << 16);
+   code[1] = 0x80000000;
+
+   // quOp is split: low 2 bits into code[0], the rest into code[1]
+   code[0] |= (quOp & 0x03) << 20;
+   code[1] |= (quOp & 0xfc) << 20;
+
+   emitForm_ADD(i);
+
+   // with a single source, src0 is also encoded in the second source slot
+   if (!i->srcExists(1))
+      srcId(i->src[0], 32 + 14);
+}
+
+void
+CodeEmitterNV50::emitPFETCH(const Instruction *i)
+{
+   code[0] = 0x11800001;
+   code[1] = 0x04200000 | (0xf << 14);
+
+   defId(i->def[0], 2);
+   srcAddr8(i->src[0], 9);
+   setAReg16(i, 0);
+}
+
+// Encode OP_LINTERP / OP_PINTERP. The mode flags are first set in their
+// short-encoding positions in code[0] (flat: bit 8, centroid: bit 24,
+// perspective: bit 25); for the long encoding they are then moved into
+// code[1] (bits 24-25 -> 16-17, bit 8 -> 18) and cleared from code[0].
+void
+CodeEmitterNV50::emitINTERP(const Instruction *i)
+{
+   code[0] = 0x80000000;
+
+   defId(i->def[0], 2);
+   srcAddr8(i->src[0], 16);
+
+   if (i->getInterpMode() == NV50_IR_INTERP_FLAT) {
+      code[0] |= 1 << 8;
+   } else {
+      if (i->op == OP_PINTERP) {
+         code[0] |= 1 << 25;
+         srcId(i->src[1], 9);
+      }
+      if (i->getSampleMode() == NV50_IR_INTERP_CENTROID)
+         code[0] |= 1 << 24;
+   }
+
+   if (i->encSize == 8) {
+      emitFlagsRd(i);
+      // bit 8 has to move *up* to bit 18; shifting it right would always
+      // produce 0 and silently drop the flat flag
+      code[1] |=
+         (code[0] & (3 << 24)) >> (24 - 16) |
+         (code[0] & (1 << 8)) << (18 - 8);
+      code[0] &= ~0x03000100;
+      code[0] |= 1;
+   }
+}
+
+void
+CodeEmitterNV50::emitMINMAX(const Instruction *i)
+{
+   if (i->dType == TYPE_F64) {
+      code[0] = 0xe0000000;
+      code[1] = (i->op == OP_MIN) ?
0xa0000000 : 0xc0000000; + } else { + code[0] = 0x30000000; + code[1] = 0x80000000; + if (i->op == OP_MIN) + code[1] |= 0x20000000; + + switch (i->dType) { + case TYPE_F32: code[0] |= 0x80000000; break; + case TYPE_S32: code[1] |= 0x8c000000; break; + case TYPE_U32: code[1] |= 0x84000000; break; + case TYPE_S16: code[1] |= 0x80000000; break; + case TYPE_U16: break; + default: + assert(0); + break; + } + code[1] |= i->src[0].mod.abs() << 20; + code[1] |= i->src[1].mod.abs() << 19; + } + emitForm_MAD(i); +} + +void +CodeEmitterNV50::emitFMAD(const Instruction *i) +{ + const int neg_mul = i->src[0].mod.neg() ^ i->src[1].mod.neg(); + const int neg_add = i->src[2].mod.neg(); + + code[0] = 0xe0000000; + + if (i->encSize == 4) { + emitForm_MUL(i); + assert(!neg_mul && !neg_add); + } else { + emitForm_MAD(i); + code[1] |= neg_mul << 26; + code[1] |= neg_add << 27; + if (i->saturate) + code[1] |= 1 << 29; + } +} + +void +CodeEmitterNV50::emitFADD(const Instruction *i) +{ + const int neg0 = i->src[0].mod.neg(); + const int neg1 = i->src[1].mod.neg() ^ ((i->op == OP_SUB) ? 
1 : 0);
+
+   code[0] = 0xb0000000;
+
+   assert(!(i->src[0].mod | i->src[1].mod).abs());
+
+   if (i->src[1].getFile() == FILE_IMMEDIATE) {
+      emitForm_IMM(i);
+      code[0] |= neg0 << 15;
+      code[0] |= neg1 << 22;
+   } else
+   if (i->encSize == 8) {
+      emitForm_ADD(i);
+      code[1] |= neg0 << 26;
+      code[1] |= neg1 << 27;
+      if (i->saturate)
+         code[1] |= 1 << 29;
+   } else {
+      emitForm_MUL(i);
+      code[0] |= neg0 << 15;
+      code[0] |= neg1 << 22;
+   }
+}
+
+// Integer add in immediate, long or short form. At most one of the two
+// sources may be negated.
+void
+CodeEmitterNV50::emitUADD(const Instruction *i)
+{
+   code[0] = 0x20008000;
+
+   // emitForm_IMM encodes the *last* source as the immediate and src0 as a
+   // register, so the immediate operand to test for is src1 (as in emitFADD)
+   if (i->src[1].getFile() == FILE_IMMEDIATE) {
+      emitForm_IMM(i);
+   } else
+   if (i->encSize == 8) {
+      code[0] = 0x20000000;
+      code[1] = 0x04000000;
+      emitForm_ADD(i);
+   } else {
+      emitForm_MUL(i);
+   }
+   assert(!(i->src[0].mod.neg() && i->src[1].mod.neg()));
+   code[0] |= i->src[0].mod.neg() << 28;
+   code[0] |= i->src[1].mod.neg() << 22;
+}
+
+// Write an address register: def = (optional address register src0) +
+// 16-bit immediate. For OP_MOV the immediate is source 0, otherwise 1.
+void
+CodeEmitterNV50::emitAADD(const Instruction *i)
+{
+   const int s = (i->op == OP_MOV) ? 0 : 1;
+
+   code[0] = 0xd0000001 | (i->getSrc(s)->reg.data.u16 << 9);
+   code[1] = 0x20000000;
+
+   code[0] |= (DDATA(i->def[0]).id + 1) << 2;
+
+   emitFlagsRd(i);
+
+   if (s && i->srcExists(0))
+      setARegBits(SDATA(i->src[0]).id + 1);
+}
+
+// Float multiply in immediate, long or short form; the negate modifiers of
+// both sources are folded into a single negate bit.
+void
+CodeEmitterNV50::emitFMUL(const Instruction *i)
+{
+   const int neg = (i->src[0].mod ^ i->src[1].mod).neg();
+
+   code[0] = 0xc0000000;
+
+   // the immediate is the last source, see emitForm_IMM
+   if (i->src[1].getFile() == FILE_IMMEDIATE) {
+      emitForm_IMM(i);
+      if (neg)
+         code[0] |= 0x8000;
+   } else
+   if (i->encSize == 8) {
+      emitForm_MAD(i);
+      if (neg)
+         code[1] |= 0x08000000;
+   } else {
+      emitForm_MUL(i);
+      if (neg)
+         code[0] |= 0x8000;
+   }
+}
+
+void
+CodeEmitterNV50::emitSET(const Instruction *i)
+{
+   code[0] = 0x30000000;
+   code[1] = 0x60000000;
+
+   emitCondCode(i->asCmp()->setCond, 32 + 14);
+
+   switch (i->sType) {
+   case TYPE_F32: code[0] |= 0x80000000; break;
+   case TYPE_S32: code[1] |= 0x0c000000; break;
+   case TYPE_U32: code[1] |= 0x04000000; break;
+   case TYPE_S16: code[1] |= 0x08000000; break;
+   case TYPE_U16: break;
+   default:
+
assert(0); + break; + } + emitForm_MAD(i); +} + +void +CodeEmitterNV50::roundMode_CVT(RoundMode rnd) +{ + switch (rnd) { + case ROUND_NI: code[1] |= 0x08000000; break; + case ROUND_M: code[1] |= 0x00020000; break; + case ROUND_MI: code[1] |= 0x08020000; break; + case ROUND_P: code[1] |= 0x00040000; break; + case ROUND_PI: code[1] |= 0x08040000; break; + case ROUND_Z: code[1] |= 0x00060000; break; + case ROUND_ZI: code[1] |= 0x08060000; break; + default: + assert(rnd == ROUND_N); + break; + } +} + +void +CodeEmitterNV50::emitCVT(const Instruction *i) +{ + const bool f2f = isFloatType(i->dType) && isFloatType(i->sType); + RoundMode rnd; + + switch (i->op) { + case OP_CEIL: rnd = f2f ? ROUND_PI : ROUND_P; break; + case OP_FLOOR: rnd = f2f ? ROUND_MI : ROUND_M; break; + case OP_TRUNC: rnd = f2f ? ROUND_ZI : ROUND_Z; break; + default: + rnd = i->rnd; + break; + } + + code[0] = 0xa0000000; + + switch (i->dType) { + case TYPE_F64: + switch (i->sType) { + case TYPE_F64: code[1] = 0xc4404000; break; + case TYPE_S64: code[1] = 0x44414000; break; + case TYPE_U64: code[1] = 0x44404000; break; + case TYPE_F32: code[1] = 0xc4400000; break; + case TYPE_S32: code[1] = 0x44410000; break; + case TYPE_U32: code[1] = 0x44400000; break; + default: + assert(0); + break; + } + break; + case TYPE_S64: + switch (i->sType) { + case TYPE_F64: code[1] = 0x8c404000; break; + case TYPE_F32: code[1] = 0x8c400000; break; + default: + assert(0); + break; + } + break; + case TYPE_U64: + switch (i->sType) { + case TYPE_F64: code[1] = 0x84404000; break; + case TYPE_F32: code[1] = 0x84400000; break; + default: + assert(0); + break; + } + break; + case TYPE_F32: + switch (i->sType) { + case TYPE_F64: code[1] = 0xc0404000; break; + case TYPE_S64: code[1] = 0x40414000; break; + case TYPE_U64: code[1] = 0x40404000; break; + case TYPE_F32: code[1] = 0xc4004000; break; + case TYPE_S32: code[1] = 0x44014000; break; + case TYPE_U32: code[1] = 0x44004000; break; + case TYPE_F16: code[1] = 0xc4000000; break; + 
default: + assert(0); + break; + } + break; + case TYPE_S32: + switch (i->sType) { + case TYPE_F64: code[1] = 0x88404000; break; + case TYPE_F32: code[1] = 0x8c004000; break; + case TYPE_S32: code[1] = 0x0c014000; break; + case TYPE_U32: code[1] = 0x0c004000; break; + case TYPE_F16: code[1] = 0x8c000000; break; + case TYPE_S16: code[1] = 0x0c010000; break; + case TYPE_U16: code[1] = 0x0c000000; break; + case TYPE_S8: code[1] = 0x0c018000; break; + case TYPE_U8: code[1] = 0x0c008000; break; + default: + assert(0); + break; + } + break; + case TYPE_U32: + switch (i->sType) { + case TYPE_F64: code[1] = 0x80404000; break; + case TYPE_F32: code[1] = 0x84004000; break; + case TYPE_S32: code[1] = 0x04014000; break; + case TYPE_U32: code[1] = 0x04004000; break; + case TYPE_F16: code[1] = 0x84000000; break; + case TYPE_S16: code[1] = 0x04010000; break; + case TYPE_U16: code[1] = 0x04000000; break; + case TYPE_S8: code[1] = 0x04018000; break; + case TYPE_U8: code[1] = 0x04008000; break; + default: + assert(0); + break; + } + break; /* valid U32 destination: do not fall into the unsupported-type assert below */ + case TYPE_S16: + case TYPE_U16: + case TYPE_S8: + case TYPE_U8: + default: + assert(0); + break; + } + if (typeSizeof(i->sType) == 1 && i->getSrc(0)->reg.size == 4) + code[1] |= 0x00004000; + + roundMode_CVT(rnd); + + switch (i->op) { + case OP_ABS: code[1] |= 1 << 20; break; + case OP_SAT: code[1] |= 1 << 19; break; + case OP_NEG: code[1] |= 1 << 29; break; + default: + break; + } + code[1] ^= i->src[0].mod.neg() << 29; + code[1] |= i->src[0].mod.abs() << 20; + if (i->saturate) + code[1] |= 1 << 19; + + assert(i->op != OP_ABS || !i->src[0].mod.neg()); + + emitForm_MAD(i); +} + +void +CodeEmitterNV50::emitPreOp(const Instruction *i) +{ + code[0] = 0xb0000000; + code[1] = (i->op == OP_PREEX2) ?
0xc0004000 : 0xc0000000; + + code[1] |= i->src[0].mod.abs() << 20; + code[1] |= i->src[0].mod.neg() << 26; + + emitForm_MAD(i); +} + +void +CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp) +{ + code[0] = 0x90000000; + + if (i->encSize == 4) { + assert(i->op == OP_RCP); + emitForm_MUL(i); + } else { + code[1] = subOp << 29; + code[1] |= i->src[0].mod.abs() << 20; + code[1] |= i->src[0].mod.neg() << 26; + emitForm_MAD(i); + } +} + +void +CodeEmitterNV50::emitLogicOp(const Instruction *i) +{ + code[0] = 0xd0000000; + + if (i->src[1].getFile() == FILE_IMMEDIATE) { + switch (i->op) { + case OP_OR: code[0] |= 0x0100; break; + case OP_XOR: code[0] |= 0x8000; break; + default: + assert(i->op == OP_AND); + break; + } + emitForm_IMM(i); + } else { + switch (i->op) { + case OP_AND: code[1] = 0x04000000; break; + case OP_OR: code[1] = 0x04004000; break; + case OP_XOR: code[1] = 0x04008000; break; + default: + assert(0); + break; + } + emitForm_MAD(i); + } +} + +void +CodeEmitterNV50::emitARL(const Instruction *i) +{ + assert(i->src[1].getFile() == FILE_IMMEDIATE); + + code[0] = 0x00000001 | (i->getSrc(1)->reg.data.u32 & 0x3f) << 16; + code[1] = 0xc0000000; + + code[0] |= (DDATA(i->def[0]).id + 1) << 2; + emitSrc0(i->src[0]); + emitFlagsRd(i); +} + +void +CodeEmitterNV50::emitShift(const Instruction *i) +{ + if (i->def[0].getFile() == FILE_ADDRESS) { + emitARL(i); + } else { + code[0] = 0x30000001; + code[1] = (i->op == OP_SHR) ? 0xe4000000 : 0xc4000000; + if (isSignedType(i->sType)) + code[1] |= 1 << 27; + + if (i->src[1].getFile() == FILE_IMMEDIATE) { + code[1] |= 1 << 20; + code[0] |= (i->getSrc(1)->reg.data.u32 & 0x7f) << 16; + emitFlagsRd(i); + } else { + emitForm_MAD(i); + } + } +} + +void +CodeEmitterNV50::emitOUT(const Instruction *i) +{ + code[0] = (i->op == OP_EMIT) ? 
0xf0000200 : 0xf0000400; + code[1] = 0xc0000001; + + emitFlagsRd(i); +} + +void +CodeEmitterNV50::emitTEX(const TexInstruction *i) +{ + code[0] = 0xf0000001; + code[1] = 0x00000000; + + switch (i->op) { + case OP_TXB: + code[1] = 0x20000000; + break; + case OP_TXL: + code[1] = 0x40000000; + break; + case OP_TXF: + code[0] = 0x01000000; + break; + case OP_TXG: + code[0] = 0x01000000; + code[1] = 0x80000000; + break; + default: + assert(i->op == OP_TEX); + break; + } + + code[0] |= i->tex.r << 9; + code[0] |= i->tex.s << 17; + + int argc = i->tex.target.getArgCount(); + + if (i->op == OP_TXB || i->op == OP_TXL) + argc += 1; + if (i->tex.target.isShadow()) + argc += 1; + assert(argc <= 4); + + code[0] |= (argc - 1) << 22; + + if (i->tex.target.isCube()) { + code[0] |= 0x08000000; + } else + if (i->tex.useOffsets) { + code[1] |= (i->tex.offset[0][0] & 0xf) << 16; + code[1] |= (i->tex.offset[0][1] & 0xf) << 20; + code[1] |= (i->tex.offset[0][2] & 0xf) << 24; + } + + code[0] |= (i->tex.mask & 0x3) << 25; + code[1] |= (i->tex.mask & 0xc) << 12; + + if (i->tex.liveOnly) + code[1] |= 4; + + defId(i->def[0], 2); + + emitFlagsRd(i); +} + +void +CodeEmitterNV50::emitFlow(const Instruction *i, uint8_t flowOp) +{ + const FlowInstruction *f = i->asFlow(); + + code[0] = 0x00000003 | (flowOp << 28); + code[1] = 0x00000000; + + emitFlagsRd(i); + + if (f && f->target.bb) { + uint32_t pos; + + if (f->op == OP_CALL) { + if (f->builtin) { + pos = 0; // XXX: TODO + } else { + pos = f->target.fn->binPos; + } + } else { + pos = f->target.bb->binPos; + } + + code[0] |= ((pos >> 2) & 0xffff) << 11; + code[1] |= ((pos >> 18) & 0x003f) << 14; + } +} + +bool +CodeEmitterNV50::emitInstruction(Instruction *insn) +{ + if (!insn->encSize) { + ERROR("skipping unencodable instruction: "); insn->print(); + return false; + } else + if (codeSize + insn->encSize > codeSizeLimit) { + ERROR("code emitter output buffer too small\n"); + return false; + } + + switch (insn->op) { + case OP_MOV: + 
emitMOV(insn); + break; + case OP_NOP: + case OP_JOIN: + emitNOP(); + break; + case OP_VFETCH: + case OP_LOAD: + emitLOAD(insn); + break; + case OP_EXPORT: + case OP_STORE: + emitSTORE(insn); + break; + case OP_PFETCH: + emitPFETCH(insn); + break; + case OP_LINTERP: + case OP_PINTERP: + emitINTERP(insn); + break; + case OP_ADD: + case OP_SUB: + if (isFloatType(insn->dType)) + emitFADD(insn); + else + emitUADD(insn); + break; + case OP_MUL: + if (isFloatType(insn->dType)) + emitFMUL(insn); + else + emitUMUL(insn); + break; + case OP_MAD: + case OP_FMA: + emitFMAD(insn); + break; + break; + case OP_AND: + case OP_OR: + case OP_XOR: + emitLogicOp(insn); + break; + case OP_MIN: + case OP_MAX: + emitMINMAX(insn); + break; + case OP_CEIL: + case OP_FLOOR: + case OP_TRUNC: + case OP_CVT: + emitCVT(insn); + break; + case OP_RCP: + emitSFnOp(insn, 0); + break; + case OP_RSQ: + emitSFnOp(insn, 2); + break; + case OP_LG2: + emitSFnOp(insn, 3); + break; + case OP_SIN: + emitSFnOp(insn, 4); + break; + case OP_COS: + emitSFnOp(insn, 5); + break; + case OP_EX2: + emitSFnOp(insn, 6); + break; + case OP_PRESIN: + case OP_PREEX2: + emitPreOp(insn); + break; + case OP_TEX: + case OP_TXB: + case OP_TXL: + emitTEX(insn->asTex()); + break; + case OP_EMIT: + case OP_RESTART: + emitOUT(insn); + break; + case OP_DISCARD: + emitFlow(insn, 0x0); + break; + case OP_BRA: + emitFlow(insn, 0x1); + break; + case OP_CALL: + emitFlow(insn, 0x2); + break; + case OP_RET: + emitFlow(insn, 0x3); + break; + case OP_PREBREAK: + emitFlow(insn, 0x4); + break; + case OP_BREAK: + emitFlow(insn, 0x5); + break; + case OP_QUADON: + emitFlow(insn, 0x6); + break; + case OP_QUADPOP: + emitFlow(insn, 0x7); + break; + case OP_JOINAT: + emitFlow(insn, 0xa); + break; + case OP_PRERET: + emitFlow(insn, 0xd); + break; + case OP_QUADOP: + emitQUADOP(insn, insn->lanes, insn->subOp); + break; + case OP_DFDX: + emitQUADOP(insn, 4, insn->src[0].mod.neg() ? 
0x66 : 0x99); + break; + case OP_DFDY: + emitQUADOP(insn, 5, insn->src[0].mod.neg() ? 0x5a : 0xa5); + break; + case OP_PHI: + case OP_UNION: + case OP_CONSTRAINT: + ERROR("operation should have been eliminated"); + return false; + case OP_EXP: + case OP_LOG: + case OP_SQRT: + case OP_POW: + case OP_SELP: + case OP_SLCT: + case OP_TXD: + case OP_PRECONT: + case OP_CONT: + case OP_POPCNT: + case OP_INSBF: + case OP_EXTBF: + ERROR("operation should have been lowered\n"); + return false; + default: + ERROR("unknow op\n"); + return false; + } + if (insn->join) + code[1] |= 0x2; + else + if (insn->exit) + code[1] |= 0x1; + + assert((insn->encSize == 8) == (code[1] & 1)); + + code += insn->encSize / 4; + codeSize += insn->encSize; + return true; +} + +uint32_t +CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const +{ + const Target::OpInfo &info = targ->getOpInfo(i); + + if (info.minEncSize == 8) + return 8; + + return 4; +} + +CodeEmitterNV50::CodeEmitterNV50(const Target *target) : targ(target) +{ + code = NULL; + codeSize = codeSizeLimit = 0; +} + +CodeEmitter * +Target::getCodeEmitter(Program::Type type) +{ + CodeEmitterNV50 *emit = new CodeEmitterNV50(this); + emit->setProgramType(type); + return emit; +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp new file mode 100644 index 00000000000..c2f464de31b --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp @@ -0,0 +1,2288 @@ + +extern "C" { +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_scan.h" +} + +#include "nv50_ir.h" +#include "nv50_ir_util.h" +#include "nv50_ir_build_util.h" + +namespace tgsi { + +class Source; + +static nv50_ir::operation translateOpcode(uint opcode); +static nv50_ir::DataFile translateFile(uint file); +static nv50_ir::TexTarget translateTexture(uint texTarg); +static nv50_ir::SVSemantic translateSysVal(uint sysval); + +class Instruction +{ +public: + 
Instruction(const struct tgsi_full_instruction *inst) : insn(inst) { } + + class SrcRegister + { + public: + SrcRegister(const struct tgsi_full_src_register *src) + : reg(src->Register), + fsr(src) + { } + + SrcRegister(const struct tgsi_src_register& src) : reg(src), fsr(NULL) { } + + struct tgsi_src_register offsetToSrc(struct tgsi_texture_offset off) + { + struct tgsi_src_register reg; + memset(®, 0, sizeof(reg)); + reg.Index = off.Index; + reg.File = off.File; + reg.SwizzleX = off.SwizzleX; + reg.SwizzleY = off.SwizzleY; + reg.SwizzleZ = off.SwizzleZ; + return reg; + } + + SrcRegister(const struct tgsi_texture_offset& off) : + reg(offsetToSrc(off)), + fsr(NULL) + { } + + uint getFile() const { return reg.File; } + + bool is2D() const { return reg.Dimension; } + + bool isIndirect(int dim) const + { + return (dim && fsr) ? fsr->Dimension.Indirect : reg.Indirect; + } + + int getIndex(int dim) const + { + return (dim && fsr) ? fsr->Dimension.Index : reg.Index; + } + + int getSwizzle(int chan) const + { + return tgsi_util_get_src_register_swizzle(®, chan); + } + + nv50_ir::Modifier getMod(int chan) const; + + SrcRegister getIndirect(int dim) const + { + assert(fsr && isIndirect(dim)); + if (dim) + return SrcRegister(fsr->DimIndirect); + return SrcRegister(fsr->Indirect); + } + + uint32_t getValueU32(int c, const struct nv50_ir_prog_info *info) const + { + assert(reg.File == TGSI_FILE_IMMEDIATE); + assert(!reg.Absolute); + assert(!reg.Negate); + return info->immd.data[reg.Index * 4 + getSwizzle(c)]; + } + + private: + const struct tgsi_src_register reg; + const struct tgsi_full_src_register *fsr; + }; + + class DstRegister + { + public: + DstRegister(const struct tgsi_full_dst_register *dst) + : reg(dst->Register), + fdr(dst) + { } + + DstRegister(const struct tgsi_dst_register& dst) : reg(dst), fdr(NULL) { } + + uint getFile() const { return reg.File; } + + bool is2D() const { return reg.Dimension; } + + bool isIndirect(int dim) const + { + return (dim && fdr) ? 
fdr->Dimension.Indirect : reg.Indirect; + } + + int getIndex(int dim) const + { + return (dim && fdr) ? fdr->Dimension.Index : reg.Index; /* same field SrcRegister::getIndex reads for dim != 0 */ + } + + unsigned int getMask() const { return reg.WriteMask; } + + bool isMasked(int chan) const { return !(getMask() & (1 << chan)); } + + SrcRegister getIndirect(int dim) const + { + assert(fdr && isIndirect(dim)); + if (dim) + return SrcRegister(fdr->DimIndirect); + return SrcRegister(fdr->Indirect); + } + + private: + const struct tgsi_dst_register reg; + const struct tgsi_full_dst_register *fdr; + }; + + inline uint getOpcode() const { return insn->Instruction.Opcode; } + + unsigned int srcCount() const { return insn->Instruction.NumSrcRegs; } + unsigned int dstCount() const { return insn->Instruction.NumDstRegs; } + + // mask of used components of source s + unsigned int srcMask(unsigned int s) const; + + SrcRegister getSrc(unsigned int s) const + { + assert(s < srcCount()); + return SrcRegister(&insn->Src[s]); + } + + DstRegister getDst(unsigned int d) const + { + assert(d < dstCount()); + return DstRegister(&insn->Dst[d]); + } + + SrcRegister getTexOffset(unsigned int i) const + { + assert(i < TGSI_FULL_MAX_TEX_OFFSETS); + return SrcRegister(insn->TexOffsets[i]); + } + + unsigned int getNumTexOffsets() const { return insn->Texture.NumOffsets; } + + bool checkDstSrcAliasing() const; + + inline nv50_ir::operation getOP() const { + return translateOpcode(getOpcode()); } + + nv50_ir::DataType inferSrcType() const; + nv50_ir::DataType inferDstType() const; + + nv50_ir::CondCode getSetCond() const; + + nv50_ir::TexInstruction::Target getTexture(const Source *, int s) const; + + inline uint getLabel() { return insn->Label.Label; } + + unsigned getSaturate() const { return insn->Instruction.Saturate; } + + void print() const + { + tgsi_dump_instruction(insn, 1); + } + +private: + const struct tgsi_full_instruction *insn; +}; + +unsigned int Instruction::srcMask(unsigned int s) const +{ + unsigned int mask =
insn->Dst[0].Register.WriteMask; + + switch (insn->Instruction.Opcode) { + case TGSI_OPCODE_COS: + case TGSI_OPCODE_SIN: + return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); + case TGSI_OPCODE_DP3: + return 0x7; + case TGSI_OPCODE_DP4: + case TGSI_OPCODE_DPH: + case TGSI_OPCODE_KIL: /* WriteMask ignored */ + return 0xf; + case TGSI_OPCODE_DST: + return mask & (s ? 0xa : 0x6); + case TGSI_OPCODE_EX2: + case TGSI_OPCODE_EXP: + case TGSI_OPCODE_LG2: + case TGSI_OPCODE_LOG: + case TGSI_OPCODE_POW: + case TGSI_OPCODE_RCP: + case TGSI_OPCODE_RSQ: + case TGSI_OPCODE_SCS: + return 0x1; + case TGSI_OPCODE_IF: + return 0x1; + case TGSI_OPCODE_LIT: + return 0xb; + case TGSI_OPCODE_TEX: + case TGSI_OPCODE_TXB: + case TGSI_OPCODE_TXD: + case TGSI_OPCODE_TXL: + case TGSI_OPCODE_TXP: + { + const struct tgsi_instruction_texture *tex = &insn->Texture; + + assert(insn->Instruction.Texture); + + mask = 0x7; + if (insn->Instruction.Opcode != TGSI_OPCODE_TEX && + insn->Instruction.Opcode != TGSI_OPCODE_TXD) + mask |= 0x8; /* bias, lod or proj */ + + switch (tex->Texture) { + case TGSI_TEXTURE_1D: + mask &= 0x9; + break; + case TGSI_TEXTURE_SHADOW1D: + mask &= 0x5; + break; + case TGSI_TEXTURE_1D_ARRAY: + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + mask &= 0xb; + break; + default: + break; + } + } + return mask; + case TGSI_OPCODE_XPD: + { + unsigned int x = 0; + if (mask & 1) x |= 0x6; + if (mask & 2) x |= 0x5; + if (mask & 4) x |= 0x3; + return x; + } + default: + break; + } + + return mask; +} + +nv50_ir::Modifier Instruction::SrcRegister::getMod(int chan) const +{ + nv50_ir::Modifier m(0); + + if (reg.Absolute) + m = m | nv50_ir::Modifier(NV50_IR_MOD_ABS); + if (reg.Negate) + m = m | nv50_ir::Modifier(NV50_IR_MOD_NEG); + return m; +} + +static nv50_ir::DataFile translateFile(uint file) +{ + switch (file) { + case TGSI_FILE_CONSTANT: return nv50_ir::FILE_MEMORY_CONST; + case TGSI_FILE_INPUT: return nv50_ir::FILE_SHADER_INPUT; + case TGSI_FILE_OUTPUT: return 
nv50_ir::FILE_SHADER_OUTPUT; + case TGSI_FILE_TEMPORARY: return nv50_ir::FILE_GPR; + case TGSI_FILE_ADDRESS: return nv50_ir::FILE_ADDRESS; + case TGSI_FILE_PREDICATE: return nv50_ir::FILE_PREDICATE; + case TGSI_FILE_IMMEDIATE: return nv50_ir::FILE_IMMEDIATE; + case TGSI_FILE_SYSTEM_VALUE: return nv50_ir::FILE_SYSTEM_VALUE; + case TGSI_FILE_IMMEDIATE_ARRAY: return nv50_ir::FILE_IMMEDIATE; + case TGSI_FILE_TEMPORARY_ARRAY: return nv50_ir::FILE_MEMORY_LOCAL; + case TGSI_FILE_RESOURCE: return nv50_ir::FILE_MEMORY_GLOBAL; + case TGSI_FILE_SAMPLER: + case TGSI_FILE_NULL: + default: + return nv50_ir::FILE_NULL; + } +} + +static nv50_ir::SVSemantic translateSysVal(uint sysval) +{ + switch (sysval) { + case TGSI_SEMANTIC_FACE: return nv50_ir::SV_FACE; + case TGSI_SEMANTIC_PSIZE: return nv50_ir::SV_POINT_SIZE; + case TGSI_SEMANTIC_PRIMID: return nv50_ir::SV_PRIMITIVE_ID; + case TGSI_SEMANTIC_INSTANCEID: return nv50_ir::SV_INSTANCE_ID; + default: + assert(0); + return nv50_ir::SV_CLOCK; + } +} + +#define NV50_IR_TEX_TARG_CASE(a, b) \ + case TGSI_TEXTURE_##a: return nv50_ir::TEX_TARGET_##b; + +static nv50_ir::TexTarget translateTexture(uint tex) +{ + switch (tex) { + NV50_IR_TEX_TARG_CASE(1D, 1D); + NV50_IR_TEX_TARG_CASE(2D, 2D); + NV50_IR_TEX_TARG_CASE(3D, 3D); + NV50_IR_TEX_TARG_CASE(CUBE, CUBE); + NV50_IR_TEX_TARG_CASE(RECT, RECT); + NV50_IR_TEX_TARG_CASE(1D_ARRAY, 1D_ARRAY); + NV50_IR_TEX_TARG_CASE(2D_ARRAY, 2D_ARRAY); + NV50_IR_TEX_TARG_CASE(SHADOW1D, 1D_SHADOW); + NV50_IR_TEX_TARG_CASE(SHADOW2D, 2D_SHADOW); + NV50_IR_TEX_TARG_CASE(SHADOW1D_ARRAY, 1D_ARRAY_SHADOW); + NV50_IR_TEX_TARG_CASE(SHADOW2D_ARRAY, 2D_ARRAY_SHADOW); + NV50_IR_TEX_TARG_CASE(SHADOWRECT, RECT_SHADOW); + + case TGSI_TEXTURE_UNKNOWN: + default: + assert(!"invalid texture target"); + return nv50_ir::TEX_TARGET_2D; + } +} + +nv50_ir::DataType Instruction::inferSrcType() const +{ + switch (getOpcode()) { + case TGSI_OPCODE_AND: + case TGSI_OPCODE_OR: + case TGSI_OPCODE_XOR: + case TGSI_OPCODE_U2F: + case 
TGSI_OPCODE_UADD: + case TGSI_OPCODE_UDIV: + case TGSI_OPCODE_UMOD: + case TGSI_OPCODE_UMAD: + case TGSI_OPCODE_UMUL: + case TGSI_OPCODE_UMAX: + case TGSI_OPCODE_UMIN: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_USNE: + case TGSI_OPCODE_USHR: + case TGSI_OPCODE_UCMP: + return nv50_ir::TYPE_U32; + case TGSI_OPCODE_I2F: + case TGSI_OPCODE_IDIV: + case TGSI_OPCODE_IMAX: + case TGSI_OPCODE_IMIN: + case TGSI_OPCODE_INEG: + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_ISHR: + case TGSI_OPCODE_ISLT: + case TGSI_OPCODE_SAD: // not sure about SAD, but no one has a float version + case TGSI_OPCODE_MOD: + case TGSI_OPCODE_UARL: + return nv50_ir::TYPE_S32; + default: + return nv50_ir::TYPE_F32; + } +} + +nv50_ir::DataType Instruction::inferDstType() const +{ + switch (getOpcode()) { + case TGSI_OPCODE_F2U: return nv50_ir::TYPE_U32; + case TGSI_OPCODE_F2I: return nv50_ir::TYPE_S32; + case TGSI_OPCODE_I2F: + case TGSI_OPCODE_U2F: + return nv50_ir::TYPE_F32; + default: + return inferSrcType(); + } +} + +nv50_ir::CondCode Instruction::getSetCond() const +{ + using namespace nv50_ir; + + switch (getOpcode()) { + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_ISLT: + case TGSI_OPCODE_USLT: + return CC_LT; + case TGSI_OPCODE_SLE: + return CC_LE; + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_USGE: + return CC_GE; + case TGSI_OPCODE_SGT: + return CC_GT; + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_USEQ: + return CC_EQ; + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_USNE: + return CC_NE; + case TGSI_OPCODE_SFL: + return CC_NEVER; + case TGSI_OPCODE_STR: + default: + return CC_ALWAYS; + } +} + +#define NV50_IR_OPCODE_CASE(a, b) case TGSI_OPCODE_##a: return nv50_ir::OP_##b + +static nv50_ir::operation translateOpcode(uint opcode) +{ + switch (opcode) { + NV50_IR_OPCODE_CASE(ARL, SHL); + NV50_IR_OPCODE_CASE(MOV, MOV); + + NV50_IR_OPCODE_CASE(RCP, RCP); + NV50_IR_OPCODE_CASE(RSQ, RSQ); + + NV50_IR_OPCODE_CASE(MUL, MUL); + 
NV50_IR_OPCODE_CASE(ADD, ADD); + + NV50_IR_OPCODE_CASE(MIN, MIN); + NV50_IR_OPCODE_CASE(MAX, MAX); + NV50_IR_OPCODE_CASE(SLT, SET); + NV50_IR_OPCODE_CASE(SGE, SET); + NV50_IR_OPCODE_CASE(MAD, MAD); + NV50_IR_OPCODE_CASE(SUB, SUB); + + NV50_IR_OPCODE_CASE(FLR, FLOOR); + NV50_IR_OPCODE_CASE(ROUND, CVT); + NV50_IR_OPCODE_CASE(EX2, EX2); + NV50_IR_OPCODE_CASE(LG2, LG2); + NV50_IR_OPCODE_CASE(POW, POW); + + NV50_IR_OPCODE_CASE(ABS, ABS); + + NV50_IR_OPCODE_CASE(COS, COS); + NV50_IR_OPCODE_CASE(DDX, DFDX); + NV50_IR_OPCODE_CASE(DDY, DFDY); + NV50_IR_OPCODE_CASE(KILP, DISCARD); + + NV50_IR_OPCODE_CASE(SEQ, SET); + NV50_IR_OPCODE_CASE(SFL, SET); + NV50_IR_OPCODE_CASE(SGT, SET); + NV50_IR_OPCODE_CASE(SIN, SIN); + NV50_IR_OPCODE_CASE(SLE, SET); + NV50_IR_OPCODE_CASE(SNE, SET); + NV50_IR_OPCODE_CASE(STR, SET); + NV50_IR_OPCODE_CASE(TEX, TEX); + NV50_IR_OPCODE_CASE(TXD, TXD); + NV50_IR_OPCODE_CASE(TXP, TEX); + + NV50_IR_OPCODE_CASE(BRA, BRA); + NV50_IR_OPCODE_CASE(CAL, CALL); + NV50_IR_OPCODE_CASE(RET, RET); + NV50_IR_OPCODE_CASE(CMP, SLCT); + + NV50_IR_OPCODE_CASE(TXB, TXB); + + NV50_IR_OPCODE_CASE(DIV, DIV); + + NV50_IR_OPCODE_CASE(TXL, TXL); + + NV50_IR_OPCODE_CASE(CEIL, CEIL); + NV50_IR_OPCODE_CASE(I2F, CVT); + NV50_IR_OPCODE_CASE(NOT, NOT); + NV50_IR_OPCODE_CASE(TRUNC, TRUNC); + NV50_IR_OPCODE_CASE(SHL, SHL); + + NV50_IR_OPCODE_CASE(AND, AND); + NV50_IR_OPCODE_CASE(OR, OR); + NV50_IR_OPCODE_CASE(MOD, MOD); + NV50_IR_OPCODE_CASE(XOR, XOR); + NV50_IR_OPCODE_CASE(SAD, SAD); + NV50_IR_OPCODE_CASE(TXF, TXF); + NV50_IR_OPCODE_CASE(TXQ, TXQ); + + NV50_IR_OPCODE_CASE(EMIT, EMIT); + NV50_IR_OPCODE_CASE(ENDPRIM, RESTART); + + NV50_IR_OPCODE_CASE(KIL, DISCARD); + + NV50_IR_OPCODE_CASE(F2I, CVT); + NV50_IR_OPCODE_CASE(IDIV, DIV); + NV50_IR_OPCODE_CASE(IMAX, MAX); + NV50_IR_OPCODE_CASE(IMIN, MIN); + NV50_IR_OPCODE_CASE(INEG, NEG); + NV50_IR_OPCODE_CASE(ISGE, SET); + NV50_IR_OPCODE_CASE(ISHR, SHR); + NV50_IR_OPCODE_CASE(ISLT, SET); + NV50_IR_OPCODE_CASE(F2U, CVT); + 
NV50_IR_OPCODE_CASE(U2F, CVT); + NV50_IR_OPCODE_CASE(UADD, ADD); + NV50_IR_OPCODE_CASE(UDIV, DIV); + NV50_IR_OPCODE_CASE(UMAD, MAD); + NV50_IR_OPCODE_CASE(UMAX, MAX); + NV50_IR_OPCODE_CASE(UMIN, MIN); + NV50_IR_OPCODE_CASE(UMOD, MOD); + NV50_IR_OPCODE_CASE(UMUL, MUL); + NV50_IR_OPCODE_CASE(USEQ, SET); + NV50_IR_OPCODE_CASE(USGE, SET); + NV50_IR_OPCODE_CASE(USHR, SHR); + NV50_IR_OPCODE_CASE(USLT, SET); + NV50_IR_OPCODE_CASE(USNE, SET); + + NV50_IR_OPCODE_CASE(LOAD, TXF); + NV50_IR_OPCODE_CASE(SAMPLE, TEX); + NV50_IR_OPCODE_CASE(SAMPLE_B, TXB); + NV50_IR_OPCODE_CASE(SAMPLE_C, TEX); + NV50_IR_OPCODE_CASE(SAMPLE_C_LZ, TEX); + NV50_IR_OPCODE_CASE(SAMPLE_D, TXD); + NV50_IR_OPCODE_CASE(SAMPLE_L, TXL); + NV50_IR_OPCODE_CASE(GATHER4, TXG); + NV50_IR_OPCODE_CASE(RESINFO, TXQ); + + NV50_IR_OPCODE_CASE(END, EXIT); + + default: + return nv50_ir::OP_NOP; + } +} + +bool Instruction::checkDstSrcAliasing() const +{ + if (insn->Dst[0].Register.Indirect) // no danger if indirect, using memory + return false; + + for (int s = 0; s < TGSI_FULL_MAX_SRC_REGISTERS; ++s) { + if (insn->Src[s].Register.File == TGSI_FILE_NULL) + break; + if (insn->Src[s].Register.File == insn->Dst[0].Register.File && + insn->Src[s].Register.Index == insn->Dst[0].Register.Index) + return true; + } + return false; +} + +class Source +{ +public: + Source(struct nv50_ir_prog_info *); + ~Source(); + + struct Subroutine + { + unsigned pc; + }; + +public: + bool scanSource(); + unsigned fileSize(unsigned file) const { return scan.file_max[file] + 1; } + +public: + struct tgsi_shader_info scan; + struct tgsi_full_instruction *insns; + const struct tgsi_token *tokens; + struct nv50_ir_prog_info *info; + + nv50_ir::DynArray tempArrays; + nv50_ir::DynArray immdArrays; + int tempArrayCount; + int immdArrayCount; + + bool mainTempsInLMem; + + uint8_t *resourceTargets; // TGSI_TEXTURE_* + unsigned resourceCount; + + Subroutine *subroutines; + unsigned subroutineCount; + +private: + int inferSysValDirection(unsigned sn) 
const; + bool scanDeclaration(const struct tgsi_full_declaration *); + bool scanInstruction(const struct tgsi_full_instruction *); + void scanProperty(const struct tgsi_full_property *); + void scanImmediate(const struct tgsi_full_immediate *); + + inline bool isEdgeFlagPassthrough(const Instruction&) const; +}; + +Source::Source(struct nv50_ir_prog_info *prog) : info(prog) +{ + tokens = (const struct tgsi_token *)info->bin.source; + + if (prog->dbgFlags & NV50_IR_DEBUG_BASIC) + tgsi_dump(tokens, 0); + + insns = NULL; /* only allocated in scanSource(); the destructor tests it before freeing */ + resourceTargets = NULL; + subroutines = NULL; + + mainTempsInLMem = FALSE; +} + +Source::~Source() +{ + if (insns) + FREE(insns); + + if (info->immd.data) + FREE(info->immd.data); + if (info->immd.type) + FREE(info->immd.type); + + if (resourceTargets) + delete[] resourceTargets; + if (subroutines) + delete[] subroutines; +} + +bool Source::scanSource() +{ + unsigned insnCount = 0; + unsigned subrCount = 0; + struct tgsi_parse_context parse; + + tgsi_scan_shader(tokens, &scan); + + insns = (struct tgsi_full_instruction *)MALLOC(scan.num_instructions * + sizeof(insns[0])); + if (!insns) + return false; + + resourceCount = scan.file_max[TGSI_FILE_RESOURCE] + 1; + resourceTargets = new uint8_t[resourceCount]; + + subroutineCount = scan.opcode_count[TGSI_OPCODE_BGNSUB] + 1; + subroutines = new Subroutine[subroutineCount]; + + info->immd.bufSize = 0; + tempArrayCount = 0; + immdArrayCount = 0; + + info->numInputs = scan.file_max[TGSI_FILE_INPUT] + 1; + info->numOutputs = scan.file_max[TGSI_FILE_OUTPUT] + 1; + info->numSysVals = scan.file_max[TGSI_FILE_SYSTEM_VALUE] + 1; + + if (info->type == PIPE_SHADER_FRAGMENT) { + info->prop.fp.writesDepth = scan.writes_z; + info->prop.fp.usesDiscard = scan.uses_kill; + } else + if (info->type == PIPE_SHADER_GEOMETRY) { + info->prop.gp.instanceCount = 1; // default value + } + + info->immd.data = (uint32_t *)MALLOC(scan.immediate_count * 16); + info->immd.type = (ubyte *)MALLOC(scan.immediate_count * sizeof(ubyte)); + +
tgsi_parse_init(&parse, tokens); + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + scanImmediate(&parse.FullToken.FullImmediate); + break; + case TGSI_TOKEN_TYPE_DECLARATION: + scanDeclaration(&parse.FullToken.FullDeclaration); + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: + insns[insnCount++] = parse.FullToken.FullInstruction; + if (insns[insnCount - 1].Instruction.Opcode == TGSI_OPCODE_BGNSUB) + subroutines[++subrCount].pc = insnCount - 1; + else + scanInstruction(&parse.FullToken.FullInstruction); + break; + case TGSI_TOKEN_TYPE_PROPERTY: + scanProperty(&parse.FullToken.FullProperty); + break; + default: + INFO("unknown TGSI token type: %d\n", parse.FullToken.Token.Type); + break; + } + } + tgsi_parse_free(&parse); + + if (mainTempsInLMem) + info->bin.tlsSpace += (scan.file_max[TGSI_FILE_TEMPORARY] + 1) * 16; + + return info->assignSlots(info) == 0; +} + +void Source::scanProperty(const struct tgsi_full_property *prop) +{ + switch (prop->Property.PropertyName) { + case TGSI_PROPERTY_GS_OUTPUT_PRIM: + info->prop.gp.outputPrim = prop->u[0].Data; + break; + case TGSI_PROPERTY_GS_INPUT_PRIM: + info->prop.gp.inputPrim = prop->u[0].Data; + break; + case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES: + info->prop.gp.maxVertices = prop->u[0].Data; + break; +#if 0 + case TGSI_PROPERTY_GS_INSTANCE_COUNT: + info->prop.gp.instanceCount = prop->u[0].Data; + break; +#endif + case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS: + info->prop.fp.separateFragData = TRUE; + break; + case TGSI_PROPERTY_FS_COORD_ORIGIN: + case TGSI_PROPERTY_FS_COORD_PIXEL_CENTER: + // we don't care + break; + default: + INFO("unhandled TGSI property %d\n", prop->Property.PropertyName); + break; + } +} + +void Source::scanImmediate(const struct tgsi_full_immediate *imm) +{ + const unsigned n = info->immd.count++; + + assert(n < scan.immediate_count); + + for (int c = 0; c < 4; ++c) + info->immd.data[n * 4 + c] = 
imm->u[c].Uint; + + info->immd.type[n] = imm->Immediate.DataType; +} + +int Source::inferSysValDirection(unsigned sn) const +{ + switch (sn) { + case TGSI_SEMANTIC_INSTANCEID: +// case TGSI_SEMANTIC_VERTEXID: + return 1; +#if 0 + case TGSI_SEMANTIC_LAYER: + case TGSI_SEMANTIC_VIEWPORTINDEX: + return 0; +#endif + case TGSI_SEMANTIC_PRIMID: + return (info->type == PIPE_SHADER_FRAGMENT) ? 1 : 0; + default: + return 0; + } +} + +bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) +{ + unsigned i; + unsigned sn = TGSI_SEMANTIC_GENERIC; + unsigned si = 0; + const unsigned first = decl->Range.First, last = decl->Range.Last; + + if (decl->Declaration.Semantic) { + sn = decl->Semantic.Name; + si = decl->Semantic.Index; + } + + switch (decl->Declaration.File) { + case TGSI_FILE_INPUT: + if (info->type == PIPE_SHADER_VERTEX) { + // all vertex attributes are equal + for (i = first; i <= last; ++i) { + info->in[i].sn = TGSI_SEMANTIC_GENERIC; + info->in[i].si = i; + } + } else { + for (i = first; i <= last; ++i, ++si) { + info->in[i].id = i; + info->in[i].sn = sn; + info->in[i].si = si; + if (info->type == PIPE_SHADER_FRAGMENT) { + // translate interpolation mode + switch (decl->Declaration.Interpolate) { + case TGSI_INTERPOLATE_CONSTANT: + info->in[i].flat = 1; + break; + case TGSI_INTERPOLATE_LINEAR: + if (sn != TGSI_SEMANTIC_COLOR) // GL_NICEST + info->in[i].linear = 1; + break; + default: + break; + } + if (decl->Declaration.Centroid) + info->in[i].centroid = 1; + } + } + } + break; + case TGSI_FILE_OUTPUT: + for (i = first; i <= last; ++i, ++si) { + switch (sn) { + case TGSI_SEMANTIC_POSITION: + if (info->type == PIPE_SHADER_FRAGMENT) + info->io.fragDepth = i; + break; + case TGSI_SEMANTIC_COLOR: + if (info->type == PIPE_SHADER_FRAGMENT) + info->prop.fp.numColourResults++; + break; + case TGSI_SEMANTIC_EDGEFLAG: + info->io.edgeFlagOut = i; + break; + default: + break; + } + info->out[i].id = i; + info->out[i].sn = sn; + info->out[i].si = si; + } + break; 
+ case TGSI_FILE_SYSTEM_VALUE: + for (i = first; i <= last; ++i, ++si) { + info->sv[i].sn = sn; + info->sv[i].si = si; + info->sv[i].input = inferSysValDirection(sn); + } + break; + case TGSI_FILE_RESOURCE: + for (i = first; i <= last; ++i) + resourceTargets[i] = decl->Resource.Resource; + break; + case TGSI_FILE_IMMEDIATE_ARRAY: + { + if (decl->Dim.Index2D >= immdArrayCount) + immdArrayCount = decl->Dim.Index2D + 1; + immdArrays[decl->Dim.Index2D].u32 = (last + 1) << 2; + int c; + uint32_t base, count; + switch (decl->Declaration.UsageMask) { + case 0x1: c = 1; break; + case 0x3: c = 2; break; + default: + c = 4; + break; + } + immdArrays[decl->Dim.Index2D].u32 |= c; + count = (last + 1) * c; + base = info->immd.bufSize / 4; + info->immd.bufSize = (info->immd.bufSize + count * 4 + 0xf) & ~0xf; + info->immd.buf = (uint32_t *)REALLOC(info->immd.buf, base * 4, + info->immd.bufSize); + // NOTE: this assumes array declarations are ordered by Dim.Index2D + for (i = 0; i < count; ++i) + info->immd.buf[base + i] = decl->ImmediateData.u[i].Uint; + } + break; + case TGSI_FILE_TEMPORARY_ARRAY: + { + if (decl->Dim.Index2D >= tempArrayCount) + tempArrayCount = decl->Dim.Index2D + 1; + tempArrays[decl->Dim.Index2D].u32 = (last + 1) << 2; + int c; + uint32_t count; + switch (decl->Declaration.UsageMask) { + case 0x1: c = 1; break; + case 0x3: c = 2; break; + default: + c = 4; + break; + } + tempArrays[decl->Dim.Index2D].u32 |= c; + count = (last + 1) * c; + info->bin.tlsSpace += (info->bin.tlsSpace + count * 4 + 0xf) & ~0xf; + } + break; + case TGSI_FILE_NULL: + case TGSI_FILE_TEMPORARY: + case TGSI_FILE_ADDRESS: + case TGSI_FILE_CONSTANT: + case TGSI_FILE_IMMEDIATE: + case TGSI_FILE_PREDICATE: + case TGSI_FILE_SAMPLER: + break; + default: + ERROR("unhandled TGSI_FILE %d\n", decl->Declaration.File); + return false; + } + return true; +} + +inline bool Source::isEdgeFlagPassthrough(const Instruction& insn) const +{ + return insn.getOpcode() == TGSI_OPCODE_MOV && + 
insn.getDst(0).getIndex(0) == info->io.edgeFlagOut && + insn.getSrc(0).getFile() == TGSI_FILE_INPUT; +} + +bool Source::scanInstruction(const struct tgsi_full_instruction *inst) +{ + Instruction insn(inst); + + if (insn.dstCount()) { + if (insn.getDst(0).getFile() == TGSI_FILE_OUTPUT) { + Instruction::DstRegister dst = insn.getDst(0); + + if (dst.isIndirect(0)) + for (unsigned i = 0; i < info->numOutputs; ++i) + info->out[i].mask = 0xf; + else + info->out[dst.getIndex(0)].mask |= dst.getMask(); + + if (isEdgeFlagPassthrough(insn)) + info->io.edgeFlagIn = insn.getSrc(0).getIndex(0); + } else + if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) { + if (insn.getDst(0).isIndirect(0)) + mainTempsInLMem = TRUE; + } + } + + for (unsigned s = 0; s < insn.srcCount(); ++s) { + Instruction::SrcRegister src = insn.getSrc(s); + if (src.getFile() == TGSI_FILE_TEMPORARY) + if (src.isIndirect(0)) + mainTempsInLMem = TRUE; + if (src.getFile() != TGSI_FILE_INPUT) + continue; + unsigned mask = insn.srcMask(s); + + if (src.isIndirect(0)) { + for (unsigned i = 0; i < info->numInputs; ++i) + info->in[i].mask = 0xf; + } else { + for (unsigned c = 0; c < 4; ++c) { + if (!(mask & (1 << c))) + continue; + int k = src.getSwizzle(c); + int i = src.getIndex(0); + if (info->in[i].sn != TGSI_SEMANTIC_FOG || k == TGSI_SWIZZLE_X) + if (k <= TGSI_SWIZZLE_W) + info->in[i].mask |= 1 << k; + } + } + } + return true; +} + +nv50_ir::TexInstruction::Target +Instruction::getTexture(const tgsi::Source *code, int s) const +{ + if (insn->Instruction.Texture) { + return translateTexture(insn->Texture.Texture); + } else { + // XXX: indirect access + unsigned int r = getSrc(s).getIndex(0); + assert(r < code->resourceCount); + return translateTexture(code->resourceTargets[r]); + } +} + +} // namespace tgsi + +namespace { + +using namespace nv50_ir; + +class Converter : public BuildUtil +{ +public: + Converter(Program *, const tgsi::Source *); + ~Converter(); + + bool run(); + +private: + Value 
*getVertexBase(int s); + Value *fetchSrc(int s, int c); + Value *acquireDst(int d, int c); + void storeDst(int d, int c, Value *); + + Value *fetchSrc(const tgsi::Instruction::SrcRegister src, int c, Value *ptr); + void storeDst(const tgsi::Instruction::DstRegister dst, int c, + Value *val, Value *ptr); + + Value *applySrcMod(Value *, int s, int c); + + Symbol *makeSym(uint file, int fileIndex, int idx, int c, uint32_t addr); + Symbol *srcToSym(tgsi::Instruction::SrcRegister, int c); + Symbol *dstToSym(tgsi::Instruction::DstRegister, int c); + + bool handleInstruction(const struct tgsi_full_instruction *); + void exportOutputs(); + inline bool isEndOfSubroutine(uint ip); + + void loadProjTexCoords(Value *dst[4], Value *src[4], unsigned int mask); + + // R,S,L,C,Dx,Dy encode TGSI sources for respective values (0xSf for auto) + void setTexRS(TexInstruction *, unsigned int& s, int R, int S); + void handleTEX(Value *dst0[4], int R, int S, int L, int C, int Dx, int Dy); + void handleTXF(Value *dst0[4], int R); + void handleTXQ(Value *dst0[4], enum TexQuery); + void handleLIT(Value *dst0[4]); + void handleUserClipPlanes(); + + Value *interpolate(tgsi::Instruction::SrcRegister, int c, Value *ptr); + + void insertConvergenceOps(BasicBlock *conv, BasicBlock *fork); + + Value *buildDot(int dim); + +private: + const struct tgsi::Source *code; + const struct nv50_ir_prog_info *info; + + uint ip; // instruction pointer + + tgsi::Instruction tgsi; + + DataType dstTy; + DataType srcTy; + + DataArray tData; // TGSI_FILE_TEMPORARY + DataArray aData; // TGSI_FILE_ADDRESS + DataArray pData; // TGSI_FILE_PREDICATE + DataArray oData; // TGSI_FILE_OUTPUT (if outputs in registers) + DataArray *lData; // TGSI_FILE_TEMPORARY_ARRAY + DataArray *iData; // TGSI_FILE_IMMEDIATE_ARRAY + + Value *zero; + Value *fragCoord[4]; + Value *clipVtx[4]; + + Value *vtxBase[5]; // base address of vertex in primitive (for TP/GP) + uint8_t vtxBaseValid; + + Stack condBBs; // fork BB, then else clause BB + 
Stack joinBBs; // fork BB, for inserting join ops on ENDIF + Stack loopBBs; // loop headers + Stack breakBBs; // end of / after loop + Stack entryBBs; // start of current (inlined) subroutine + Stack leaveBBs; // end of current (inlined) subroutine + Stack retIPs; // return instruction pointer +}; + +Symbol * +Converter::srcToSym(tgsi::Instruction::SrcRegister src, int c) +{ + const int swz = src.getSwizzle(c); + + return makeSym(src.getFile(), + src.is2D() ? src.getIndex(1) : 0, + src.isIndirect(0) ? -1 : src.getIndex(0), swz, + src.getIndex(0) * 16 + swz * 4); +} + +Symbol * +Converter::dstToSym(tgsi::Instruction::DstRegister dst, int c) +{ + return makeSym(dst.getFile(), + dst.is2D() ? dst.getIndex(1) : 0, + dst.isIndirect(0) ? -1 : dst.getIndex(0), c, + dst.getIndex(0) * 16 + c * 4); +} + +Symbol * +Converter::makeSym(uint tgsiFile, int fileIdx, int idx, int c, uint32_t address) +{ + Symbol *sym = new_Symbol(prog, tgsi::translateFile(tgsiFile)); + + sym->reg.fileIndex = fileIdx; + + if (idx >= 0) { + if (sym->reg.file == FILE_SHADER_INPUT) + sym->setOffset(info->in[idx].slot[c] * 4); + else + if (sym->reg.file == FILE_SHADER_OUTPUT) + sym->setOffset(info->out[idx].slot[c] * 4); + else + if (sym->reg.file == FILE_SYSTEM_VALUE) + sym->setSV(tgsi::translateSysVal(info->sv[idx].sn), c); + else + sym->setOffset(address); + } else { + sym->setOffset(address); + } + return sym; +} + +static inline uint8_t +translateInterpMode(const struct nv50_ir_varying *var, operation& op) +{ + uint8_t mode; + + if (var->flat) + mode = NV50_IR_INTERP_FLAT; + else + if (var->linear) + mode = NV50_IR_INTERP_LINEAR; + else + mode = NV50_IR_INTERP_PERSPECTIVE; + + op = (mode == NV50_IR_INTERP_PERSPECTIVE) ? 
OP_PINTERP : OP_LINTERP; + + if (var->centroid) + mode |= NV50_IR_INTERP_CENTROID; + + return mode; +} + +Value * +Converter::interpolate(tgsi::Instruction::SrcRegister src, int c, Value *ptr) +{ + operation op; + + // XXX: no way to know interpolation mode if we don't know what's accessed + const uint8_t mode = translateInterpMode(&info->in[ptr ? 0 : + src.getIndex(0)], op); + + Instruction *insn = new_Instruction(func, op, TYPE_F32); + + insn->setDef(0, getScratch()); + insn->setSrc(0, srcToSym(src, c)); + if (op == OP_PINTERP) + insn->setSrc(1, fragCoord[3]); + if (ptr) + insn->setIndirect(0, 0, ptr); + + insn->setInterpolate(mode); + + bb->insertTail(insn); + return insn->getDef(0); +} + +Value * +Converter::applySrcMod(Value *val, int s, int c) +{ + Modifier m = tgsi.getSrc(s).getMod(c); + DataType ty = tgsi.inferSrcType(); + + if (m & Modifier(NV50_IR_MOD_ABS)) + val = mkOp1v(OP_ABS, ty, getScratch(), val); + + if (m & Modifier(NV50_IR_MOD_NEG)) + val = mkOp1v(OP_NEG, ty, getScratch(), val); + + return val; +} + +Value * +Converter::getVertexBase(int s) +{ + assert(s < 5); + if (!(vtxBaseValid & (1 << s))) { + const int index = tgsi.getSrc(s).getIndex(1); + Value *rel = NULL; + if (tgsi.getSrc(s).isIndirect(1)) + rel = fetchSrc(tgsi.getSrc(s).getIndirect(1), 0, NULL); + vtxBaseValid |= 1 << s; + vtxBase[s] = mkOp2v(OP_PFETCH, TYPE_U32, getSSA(), mkImm(index), rel); + } + return vtxBase[s]; +} + +Value * +Converter::fetchSrc(int s, int c) +{ + Value *res; + Value *ptr = NULL, *dimRel = NULL; + + tgsi::Instruction::SrcRegister src = tgsi.getSrc(s); + + if (src.isIndirect(0)) + ptr = fetchSrc(src.getIndirect(0), 0, NULL); + + if (src.is2D()) { + switch (src.getFile()) { + case TGSI_FILE_INPUT: + dimRel = getVertexBase(s); + break; + case TGSI_FILE_CONSTANT: + // on NVC0, this is valid and c{I+J}[k] == cI[(J << 16) + k] + if (src.isIndirect(1)) + dimRel = fetchSrc(src.getIndirect(1), 0, 0); + break; + default: + break; + } + } + + res = fetchSrc(src, c, ptr); + + 
if (dimRel) + res->getInsn()->setIndirect(0, 1, dimRel); + + return applySrcMod(res, s, c); +} + +Value * +Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr) +{ + const int idx = src.getIndex(0); + const int swz = src.getSwizzle(c); + + switch (src.getFile()) { + case TGSI_FILE_TEMPORARY: + return tData.load(idx, swz, ptr); + case TGSI_FILE_PREDICATE: + return pData.load(idx, swz, ptr); + case TGSI_FILE_ADDRESS: + return aData.load(idx, swz, ptr); + + case TGSI_FILE_TEMPORARY_ARRAY: + assert(src.is2D() && src.getIndex(1) < code->tempArrayCount); + return lData[src.getIndex(1)].load(idx, swz, ptr); + case TGSI_FILE_IMMEDIATE_ARRAY: + assert(src.is2D() && src.getIndex(1) < code->immdArrayCount); + return iData[src.getIndex(1)].load(idx, swz, ptr); + + case TGSI_FILE_IMMEDIATE: + assert(!ptr); + return loadImm(NULL, info->immd.data[idx * 4 + swz]); + + case TGSI_FILE_CONSTANT: + return mkLoad(TYPE_U32, srcToSym(src, c), ptr); + + case TGSI_FILE_INPUT: + if (prog->getType() == Program::TYPE_FRAGMENT) { + // don't load masked inputs, won't be assigned a slot + if (!ptr && !(info->in[idx].mask & (1 << swz))) + return loadImm(NULL, swz == TGSI_SWIZZLE_W ? 
1.0f : 0.0f); + return interpolate(src, c, ptr); + } + return mkLoad(TYPE_U32, srcToSym(src, c), ptr); + + case TGSI_FILE_SYSTEM_VALUE: + assert(!ptr); + return mkOp1v(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c)); + + case TGSI_FILE_OUTPUT: + case TGSI_FILE_RESOURCE: + case TGSI_FILE_SAMPLER: + case TGSI_FILE_NULL: + default: + assert(!"invalid/unhandled TGSI source file"); + return NULL; + } +} + +Value * +Converter::acquireDst(int d, int c) +{ + const tgsi::Instruction::DstRegister dst = tgsi.getDst(d); + + if (dst.isMasked(c)) + return NULL; + if (dst.isIndirect(0)) + return getScratch(); + + const int idx = dst.getIndex(0); + + switch (dst.getFile()) { + case TGSI_FILE_TEMPORARY: + return tData.acquire(idx, c); + case TGSI_FILE_TEMPORARY_ARRAY: + return getScratch(); + case TGSI_FILE_PREDICATE: + return pData.acquire(idx, c); + case TGSI_FILE_ADDRESS: + return aData.acquire(idx, c); + + case TGSI_FILE_OUTPUT: + if (prog->getType() == Program::TYPE_FRAGMENT) + return oData.acquire(idx, c); + // fall through + case TGSI_FILE_SYSTEM_VALUE: + return getScratch(); + + default: + assert(!"invalid dst file"); + return NULL; + } +} + +void +Converter::storeDst(int d, int c, Value *val) +{ + const tgsi::Instruction::DstRegister dst = tgsi.getDst(d); + + switch (tgsi.getSaturate()) { + case TGSI_SAT_NONE: + break; + case TGSI_SAT_ZERO_ONE: + mkOp1(OP_SAT, dstTy, val, val); + break; + case TGSI_SAT_MINUS_PLUS_ONE: + mkOp2(OP_MAX, dstTy, val, val, mkImm(-1.0f)); + mkOp2(OP_MIN, dstTy, val, val, mkImm(+1.0f)); + break; + default: + assert(!"invalid saturation mode"); + break; + } + + Value *ptr = dst.isIndirect(0) ? 
+ fetchSrc(dst.getIndirect(0), 0, NULL) : NULL; + + if (info->io.clipDistanceCount && + dst.getFile() == TGSI_FILE_OUTPUT && + info->out[dst.getIndex(0)].sn == TGSI_SEMANTIC_POSITION) { + mkMov(clipVtx[c], val); + val = clipVtx[c]; + } + + storeDst(dst, c, val, ptr); +} + +void +Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c, + Value *val, Value *ptr) +{ + const int idx = dst.getIndex(0); + + switch (dst.getFile()) { + case TGSI_FILE_TEMPORARY: + tData.store(idx, c, ptr, val); + break; + case TGSI_FILE_TEMPORARY_ARRAY: + assert(dst.is2D() && dst.getIndex(1) < code->tempArrayCount); + lData[dst.getIndex(1)].store(idx, c, ptr, val); + break; + case TGSI_FILE_PREDICATE: + pData.store(idx, c, ptr, val); + break; + case TGSI_FILE_ADDRESS: + aData.store(idx, c, ptr, val); + break; + + case TGSI_FILE_OUTPUT: + if (prog->getType() == Program::TYPE_FRAGMENT) + oData.store(idx, c, ptr, val); + else + mkStore(OP_EXPORT, TYPE_U32, dstToSym(dst, c), ptr, val); + break; + + case TGSI_FILE_SYSTEM_VALUE: + assert(!ptr); + mkOp2(OP_WRSV, TYPE_U32, NULL, dstToSym(dst, c), val); + break; + + default: + assert(!"invalid dst file"); + break; + } +} + +#define FOR_EACH_DST_ENABLED_CHANNEL(d, chan, inst) \ + for (chan = 0; chan < 4; ++chan) \ + if (!inst.getDst(d).isMasked(chan)) + +Value * +Converter::buildDot(int dim) +{ + assert(dim > 0); + + Value *src0 = fetchSrc(0, 0), *src1 = fetchSrc(1, 0); + Value *dotp = getScratch(); + + mkOp2(OP_MUL, TYPE_F32, dotp, src0, src1); + + for (int c = 1; c < dim; ++c) { + src0 = fetchSrc(0, c); + src1 = fetchSrc(1, c); + mkOp3(OP_MAD, TYPE_F32, dotp, src0, src1, dotp); + } + return dotp; +} + +void +Converter::insertConvergenceOps(BasicBlock *conv, BasicBlock *fork) +{ + FlowInstruction *join = new_FlowInstruction(func, OP_JOIN, NULL); + join->fixed = 1; + conv->insertHead(join); + + fork->joinAt = new_FlowInstruction(func, OP_JOINAT, conv); + fork->insertBefore(fork->getExit(), fork->joinAt); +} + +void 
+Converter::setTexRS(TexInstruction *tex, unsigned int& s, int R, int S) +{ + unsigned rIdx = 0, sIdx = 0; + + if (R >= 0) + rIdx = tgsi.getSrc(R).getIndex(0); + if (S >= 0) + sIdx = tgsi.getSrc(S).getIndex(0); + + tex->setTexture(tgsi.getTexture(code, R), rIdx, sIdx); + + if (tgsi.getSrc(R).isIndirect(0)) { + tex->tex.rIndirectSrc = s; + tex->setSrc(s++, fetchSrc(tgsi.getSrc(R).getIndirect(0), 0, NULL)); + } + if (S >= 0 && tgsi.getSrc(S).isIndirect(0)) { + tex->tex.sIndirectSrc = s; + tex->setSrc(s++, fetchSrc(tgsi.getSrc(S).getIndirect(0), 0, NULL)); + } +} + +void +Converter::handleTXQ(Value *dst0[4], enum TexQuery query) +{ + TexInstruction *tex = new_TexInstruction(func, OP_TXQ); + tex->tex.query = query; + unsigned int c, d; + + for (d = 0, c = 0; c < 4; ++c) { + if (!dst0[c]) + continue; + tex->tex.mask |= 1 << c; + tex->setDef(d++, dst0[c]); + } + tex->setSrc((c = 0), fetchSrc(0, 0)); // mip level + + setTexRS(tex, c, 1, -1); + + bb->insertTail(tex); +} + +void +Converter::loadProjTexCoords(Value *dst[4], Value *src[4], unsigned int mask) +{ + Value *proj = fetchSrc(0, 3); + Instruction *insn = proj->getUniqueInsn(); + int c; + + if (insn->op == OP_PINTERP) { + bb->insertTail(insn = insn->clone(true)); + insn->op = OP_LINTERP; + insn->setInterpolate(NV50_IR_INTERP_LINEAR | insn->getSampleMode()); + insn->setSrc(1, NULL); + proj = insn->getDef(0); + } + proj = mkOp1v(OP_RCP, TYPE_F32, getSSA(), proj); + + for (c = 0; c < 4; ++c) { + if (!(mask & (1 << c))) + continue; + if ((insn = src[c]->getUniqueInsn())->op != OP_PINTERP) + continue; + mask &= ~(1 << c); + + bb->insertTail(insn = insn->clone(true)); + insn->setInterpolate(NV50_IR_INTERP_PERSPECTIVE | insn->getSampleMode()); + insn->setSrc(1, proj); + dst[c] = insn->getDef(0); + } + if (!mask) + return; + + proj = mkOp1v(OP_RCP, TYPE_F32, getSSA(), fetchSrc(0, 3)); + + for (c = 0; c < 4; ++c) + if (mask & (1 << c)) + dst[c] = mkOp2v(OP_MUL, TYPE_F32, getSSA(), src[c], proj); +} + +// order of nv50 ir 
sources: x y z layer lod/bias shadow +// order of TGSI TEX sources: x y z layer shadow lod/bias +// lowering will finally set the hw specific order (like array first on nvc0) +void +Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy) +{ + Value *val; + Value *arg[4], *src[8]; + Value *lod = NULL, *shd = NULL; + unsigned int s, c, d; + TexInstruction *texi = new_TexInstruction(func, tgsi.getOP()); + + TexInstruction::Target tgt = tgsi.getTexture(code, R); + + for (s = 0; s < tgt.getArgCount(); ++s) + arg[s] = src[s] = fetchSrc(0, s); + + if (texi->op == OP_TXL || texi->op == OP_TXB) + lod = fetchSrc(L >> 4, L & 3); + + if (C == 0x0f) + C = 0x00 | MAX2(tgt.getArgCount(), 2); // guess DC src + + if (tgt.isShadow()) + shd = fetchSrc(C >> 4, C & 3); + + if (texi->op == OP_TXD) { + for (c = 0; c < tgt.getDim(); ++c) { + texi->dPdx[c] = fetchSrc(Dx >> 4, (Dx & 3) + c); + texi->dPdy[c] = fetchSrc(Dy >> 4, (Dy & 3) + c); + } + } + + // cube textures don't care about projection value, it's divided out + if (tgsi.getOpcode() == TGSI_OPCODE_TXP && !tgt.isCube() && !tgt.isArray()) { + unsigned int n = tgt.getDim(); + if (shd) { + arg[n] = shd; + ++n; + assert(tgt.getDim() == tgt.getArgCount()); + } + loadProjTexCoords(src, arg, (1 << n) - 1); + if (shd) + shd = src[n - 1]; + } + + if (tgt.isCube()) { + for (c = 0; c < 3; ++c) + src[c] = mkOp1v(OP_ABS, TYPE_F32, getSSA(), arg[c]); + val = getScratch(); + mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); + mkOp2(OP_MAX, TYPE_F32, val, src[2], val); + mkOp1(OP_RCP, TYPE_F32, val, val); + for (c = 0; c < 3; ++c) + src[c] = mkOp2v(OP_MUL, TYPE_F32, getSSA(), arg[c], val); + } + + for (c = 0, d = 0; c < 4; ++c) { + if (dst[c]) { + texi->setDef(d++, dst[c]); + texi->tex.mask |= 1 << c; + } else { + // NOTE: maybe hook up def too, for CSE + } + } + for (s = 0; s < tgt.getArgCount(); ++s) + texi->setSrc(s, src[s]); + if (lod) + texi->setSrc(s++, lod); + if (shd) + texi->setSrc(s++, shd); + + setTexRS(texi, s, R, 
S); + + if (tgsi.getOpcode() == TGSI_OPCODE_SAMPLE_C_LZ) + texi->tex.levelZero = true; + + bb->insertTail(texi); +} + +// 1st source: xyz = coordinates, w = lod +// 2nd source: offset +void +Converter::handleTXF(Value *dst[4], int R) +{ + TexInstruction *texi = new_TexInstruction(func, tgsi.getOP()); + unsigned int c, d, s; + + texi->tex.target = tgsi.getTexture(code, R); + + for (c = 0, d = 0; c < 4; ++c) { + if (dst[c]) { + texi->setDef(d++, dst[c]); + texi->tex.mask |= 1 << c; + } + } + for (c = 0; c < texi->tex.target.getArgCount(); ++c) + texi->setSrc(c, fetchSrc(0, c)); + texi->setSrc(c++, fetchSrc(0, 3)); // lod + + setTexRS(texi, c, R, -1); + + for (s = 0; s < tgsi.getNumTexOffsets(); ++s) { + for (c = 0; c < 3; ++c) { + texi->tex.offset[s][c] = tgsi.getTexOffset(s).getValueU32(c, info); + if (texi->tex.offset[s][c]) + texi->tex.useOffsets = s + 1; + } + } + + bb->insertTail(texi); +} + +void +Converter::handleLIT(Value *dst0[4]) +{ + Value *val0 = NULL; + unsigned int mask = tgsi.getDst(0).getMask(); + + if (mask & (1 << 0)) + loadImm(dst0[0], 1.0f); + + if (mask & (1 << 3)) + loadImm(dst0[3], 1.0f); + + if (mask & (3 << 1)) { + val0 = getScratch(); + mkOp2(OP_MAX, TYPE_F32, val0, fetchSrc(0, 0), zero); + if (mask & (1 << 1)) + mkMov(dst0[1], val0); + } + + if (mask & (1 << 2)) { + Value *src1 = fetchSrc(0, 1), *src3 = fetchSrc(0, 3); + Value *val1 = getScratch(), *val3 = getScratch(); + + Value *pos128 = loadImm(NULL, +127.999999f); + Value *neg128 = loadImm(NULL, -127.999999f); + + mkOp2(OP_MAX, TYPE_F32, val1, src1, zero); + mkOp2(OP_MAX, TYPE_F32, val3, src3, neg128); + mkOp2(OP_MIN, TYPE_F32, val3, val3, pos128); + mkOp2(OP_POW, TYPE_F32, val3, val1, val3); + + mkCmp(OP_SLCT, CC_GT, TYPE_F32, dst0[2], val3, zero, val0); + } +} + +bool +Converter::isEndOfSubroutine(uint ip) +{ + assert(ip < code->scan.num_instructions); + tgsi::Instruction insn(&code->insns[ip]); + return (insn.getOpcode() == TGSI_OPCODE_END || + insn.getOpcode() == TGSI_OPCODE_ENDSUB 
|| + // does END occur at end of main or the very end ? + insn.getOpcode() == TGSI_OPCODE_BGNSUB); +} + +bool +Converter::handleInstruction(const struct tgsi_full_instruction *insn) +{ + Value *dst0[4], *rDst0[4]; + Value *src0, *src1, *src2; + Value *val0, *val1; + int c; + + tgsi = tgsi::Instruction(insn); + + bool useScratchDst = tgsi.checkDstSrcAliasing(); + + operation op = tgsi.getOP(); + dstTy = tgsi.inferDstType(); + srcTy = tgsi.inferSrcType(); + + unsigned int mask = tgsi.dstCount() ? tgsi.getDst(0).getMask() : 0; + + if (tgsi.dstCount()) { + for (c = 0; c < 4; ++c) { + rDst0[c] = acquireDst(0, c); + dst0[c] = (useScratchDst && rDst0[c]) ? getScratch() : rDst0[c]; + } + } + + switch (tgsi.getOpcode()) { + case TGSI_OPCODE_ADD: + case TGSI_OPCODE_UADD: + case TGSI_OPCODE_AND: + case TGSI_OPCODE_DIV: + case TGSI_OPCODE_IDIV: + case TGSI_OPCODE_UDIV: + case TGSI_OPCODE_MAX: + case TGSI_OPCODE_MIN: + case TGSI_OPCODE_IMAX: + case TGSI_OPCODE_IMIN: + case TGSI_OPCODE_UMAX: + case TGSI_OPCODE_UMIN: + case TGSI_OPCODE_MOD: + case TGSI_OPCODE_UMOD: + case TGSI_OPCODE_MUL: + case TGSI_OPCODE_UMUL: + case TGSI_OPCODE_OR: + case TGSI_OPCODE_POW: + case TGSI_OPCODE_SHL: + case TGSI_OPCODE_ISHR: + case TGSI_OPCODE_USHR: + case TGSI_OPCODE_SUB: + case TGSI_OPCODE_XOR: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + src1 = fetchSrc(1, c); + mkOp2(op, dstTy, dst0[c], src0, src1); + } + break; + case TGSI_OPCODE_MAD: + case TGSI_OPCODE_UMAD: + case TGSI_OPCODE_SAD: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + src1 = fetchSrc(1, c); + src2 = fetchSrc(2, c); + mkOp3(op, dstTy, dst0[c], src0, src1, src2); + } + break; + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_ABS: + case TGSI_OPCODE_CEIL: + case TGSI_OPCODE_FLR: + case TGSI_OPCODE_TRUNC: + case TGSI_OPCODE_RCP: + case TGSI_OPCODE_INEG: + case TGSI_OPCODE_NOT: + case TGSI_OPCODE_DDX: + case TGSI_OPCODE_DDY: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkOp1(op, dstTy, 
dst0[c], fetchSrc(0, c)); + break; + case TGSI_OPCODE_RSQ: + src0 = fetchSrc(0, 0); + val0 = getScratch(); + mkOp1(OP_ABS, TYPE_F32, val0, src0); + mkOp1(OP_RSQ, TYPE_F32, val0, val0); + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkMov(dst0[c], val0); + break; + case TGSI_OPCODE_ARL: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + mkCvt(OP_CVT, TYPE_S32, dst0[c], TYPE_F32, src0)->rnd = ROUND_M; + mkOp2(OP_SHL, TYPE_U32, dst0[c], dst0[c], mkImm(4)); + } + break; + case TGSI_OPCODE_UARL: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkOp2(OP_SHL, TYPE_U32, dst0[c], fetchSrc(0, c), mkImm(4)); + break; + case TGSI_OPCODE_EX2: + case TGSI_OPCODE_LG2: + val0 = mkOp1(op, TYPE_F32, getScratch(), fetchSrc(0, 0))->getDef(0); + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkOp1(OP_MOV, TYPE_F32, dst0[c], val0); + break; + case TGSI_OPCODE_COS: + case TGSI_OPCODE_SIN: + val0 = getScratch(); + if (mask & 7) { + mkOp1(OP_PRESIN, TYPE_F32, val0, fetchSrc(0, 0)); + mkOp1(op, TYPE_F32, val0, val0); + for (c = 0; c < 3; ++c) + if (dst0[c]) + mkMov(dst0[c], val0); + } + if (dst0[3]) { + mkOp1(OP_PRESIN, TYPE_F32, val0, fetchSrc(0, 3)); + mkOp1(op, TYPE_F32, dst0[3], val0); + } + break; + case TGSI_OPCODE_SCS: + if (mask & 3) { + val0 = mkOp1v(OP_PRESIN, TYPE_F32, getSSA(), fetchSrc(0, 0)); + if (dst0[0]) + mkOp1(OP_COS, TYPE_F32, dst0[0], val0); + if (dst0[1]) + mkOp1(OP_SIN, TYPE_F32, dst0[1], val0); + } + if (dst0[2]) + loadImm(dst0[2], 0.0f); + if (dst0[3]) + loadImm(dst0[3], 1.0f); + break; + case TGSI_OPCODE_EXP: + src0 = fetchSrc(0, 0); + val0 = mkOp1v(OP_FLOOR, TYPE_F32, getSSA(), src0); + if (dst0[1]) + mkOp2(OP_SUB, TYPE_F32, dst0[1], src0, val0); + if (dst0[0]) + mkOp1(OP_EX2, TYPE_F32, dst0[0], val0); + if (dst0[2]) + mkOp1(OP_EX2, TYPE_F32, dst0[2], src0); + if (dst0[3]) + loadImm(dst0[3], 1.0f); + break; + case TGSI_OPCODE_LOG: + src0 = mkOp1v(OP_ABS, TYPE_F32, getSSA(), fetchSrc(0, 0)); + val0 = mkOp1v(OP_LG2, TYPE_F32, dst0[2] ? 
dst0[2] : getSSA(), src0); + if (dst0[0] || dst0[1]) + val1 = mkOp1v(OP_FLOOR, TYPE_F32, dst0[0] ? dst0[0] : getSSA(), val0); + if (dst0[1]) { + mkOp1(OP_EX2, TYPE_F32, dst0[1], val1); + mkOp1(OP_RCP, TYPE_F32, dst0[1], dst0[1]); + mkOp2(OP_MUL, TYPE_F32, dst0[1], dst0[1], src0); + } + if (dst0[3]) + loadImm(dst0[3], 1.0f); + break; + case TGSI_OPCODE_DP2: + val0 = buildDot(2); + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkMov(dst0[c], val0); + break; + case TGSI_OPCODE_DP3: + val0 = buildDot(3); + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkMov(dst0[c], val0); + break; + case TGSI_OPCODE_DP4: + val0 = buildDot(4); + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkMov(dst0[c], val0); + break; + case TGSI_OPCODE_DPH: + val0 = buildDot(3); + src1 = fetchSrc(1, 3); + mkOp2(OP_ADD, TYPE_F32, val0, val0, src1); + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkMov(dst0[c], val0); + break; + case TGSI_OPCODE_DST: + if (dst0[0]) + loadImm(dst0[0], 1.0f); + if (dst0[1]) { + src0 = fetchSrc(0, 1); + src1 = fetchSrc(1, 1); + mkOp2(OP_MUL, TYPE_F32, dst0[1], src0, src1); + } + if (dst0[2]) + mkMov(dst0[2], fetchSrc(0, 2)); + if (dst0[3]) + mkMov(dst0[3], fetchSrc(1, 3)); + break; + case TGSI_OPCODE_LRP: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + src1 = fetchSrc(1, c); + src2 = fetchSrc(2, c); + mkOp3(OP_MAD, TYPE_F32, dst0[c], + mkOp2v(OP_SUB, TYPE_F32, getSSA(), src1, src2), src0, src2); + } + break; + case TGSI_OPCODE_LIT: + handleLIT(dst0); + break; + case TGSI_OPCODE_XPD: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + if (c < 3) { + val0 = getSSA(); + src0 = fetchSrc(1, (c + 1) % 3); + src1 = fetchSrc(0, (c + 2) % 3); + mkOp2(OP_MUL, TYPE_F32, val0, src0, src1); + mkOp1(OP_NEG, TYPE_F32, val0, val0); + + src0 = fetchSrc(0, (c + 1) % 3); + src1 = fetchSrc(1, (c + 2) % 3); + mkOp3(OP_MAD, TYPE_F32, dst0[c], src0, src1, val0); + } else { + loadImm(dst0[c], 1.0f); + } + } + break; + case TGSI_OPCODE_SSG: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + 
src0 = fetchSrc(0, c); + val0 = getScratch(); + val1 = getScratch(); + mkCmp(OP_SET, CC_GT, TYPE_F32, val0, src0, zero); + mkCmp(OP_SET, CC_LT, TYPE_F32, val1, src0, zero); + mkOp2(OP_SUB, TYPE_F32, dst0[c], val0, val1); + } + break; + case TGSI_OPCODE_UCMP: + case TGSI_OPCODE_CMP: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + src1 = fetchSrc(1, c); + src2 = fetchSrc(2, c); + if (src1 == src2) + mkMov(dst0[c], src1); + else + mkCmp(OP_SLCT, (srcTy == TYPE_F32) ? CC_LT : CC_NE, + srcTy, dst0[c], src1, src2, src0); + } + break; + case TGSI_OPCODE_FRC: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + val0 = getScratch(); + mkOp1(OP_FLOOR, TYPE_F32, val0, src0); + mkOp2(OP_SUB, TYPE_F32, dst0[c], src0, val0); + } + break; + case TGSI_OPCODE_ROUND: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkCvt(OP_CVT, TYPE_F32, dst0[c], TYPE_F32, fetchSrc(0, c)) + ->rnd = ROUND_NI; + break; + case TGSI_OPCODE_CLAMP: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + src1 = fetchSrc(1, c); + src2 = fetchSrc(2, c); + val0 = getScratch(); + mkOp2(OP_MIN, TYPE_F32, val0, src0, src1); + mkOp2(OP_MAX, TYPE_F32, dst0[c], val0, src2); + } + break; + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SFL: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_STR: + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_ISLT: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_USNE: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + src1 = fetchSrc(1, c); + mkCmp(op, tgsi.getSetCond(), dstTy, dst0[c], src0, src1); + } + break; + case TGSI_OPCODE_KIL: + val0 = new_LValue(func, FILE_PREDICATE); + for (c = 0; c < 4; ++c) { + mkCmp(OP_SET, CC_LT, TYPE_F32, val0, fetchSrc(0, c), zero); + mkOp(OP_DISCARD, TYPE_NONE, NULL)->setPredicate(CC_P, val0); + } + break; + case TGSI_OPCODE_KILP: + mkOp(OP_DISCARD, 
TYPE_NONE, NULL); + break; + case TGSI_OPCODE_TEX: + case TGSI_OPCODE_TXB: + case TGSI_OPCODE_TXL: + case TGSI_OPCODE_TXP: + // R S L C Dx Dy + handleTEX(dst0, 1, 1, 0x03, 0x0f, 0x00, 0x00); + break; + case TGSI_OPCODE_TXD: + handleTEX(dst0, 3, 3, 0x03, 0x0f, 0x10, 0x20); + break; + case TGSI_OPCODE_SAMPLE: + case TGSI_OPCODE_SAMPLE_B: + case TGSI_OPCODE_SAMPLE_D: + case TGSI_OPCODE_SAMPLE_L: + case TGSI_OPCODE_SAMPLE_C: + case TGSI_OPCODE_SAMPLE_C_LZ: + handleTEX(dst0, 1, 2, 0x30, 0x31, 0x40, 0x50); + break; + case TGSI_OPCODE_TXF: + case TGSI_OPCODE_LOAD: + handleTXF(dst0, 1); + break; + case TGSI_OPCODE_TXQ: + case TGSI_OPCODE_RESINFO: + handleTXQ(dst0, TXQ_DIMS); + break; + case TGSI_OPCODE_F2I: + case TGSI_OPCODE_F2U: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkCvt(OP_CVT, dstTy, dst0[c], srcTy, fetchSrc(0, c))->rnd = ROUND_Z; + break; + case TGSI_OPCODE_I2F: + case TGSI_OPCODE_U2F: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkCvt(OP_CVT, dstTy, dst0[c], srcTy, fetchSrc(0, c)); + break; + case TGSI_OPCODE_EMIT: + case TGSI_OPCODE_ENDPRIM: + // get vertex stream if specified (must be immediate) + src0 = tgsi.srcCount() ? 
+ mkImm(tgsi.getSrc(0).getValueU32(0, info)) : zero; + mkOp1(op, TYPE_U32, NULL, src0)->fixed = 1; + break; + case TGSI_OPCODE_IF: + { + BasicBlock *ifBB = new BasicBlock(func); + + bb->cfg.attach(&ifBB->cfg, Graph::Edge::TREE); + condBBs.push(bb); + joinBBs.push(bb); + + mkFlow(OP_BRA, NULL, CC_NOT_P, fetchSrc(0, 0)); + + setPosition(ifBB, true); + } + break; + case TGSI_OPCODE_ELSE: + { + BasicBlock *elseBB = new BasicBlock(func); + BasicBlock *forkBB = reinterpret_cast(condBBs.pop().u.p); + + forkBB->cfg.attach(&elseBB->cfg, Graph::Edge::TREE); + condBBs.push(bb); + + forkBB->getExit()->asFlow()->target.bb = elseBB; + if (!bb->isTerminated()) + mkFlow(OP_BRA, NULL, CC_ALWAYS, NULL); + + setPosition(elseBB, true); + } + break; + case TGSI_OPCODE_ENDIF: + { + BasicBlock *convBB = new BasicBlock(func); + BasicBlock *prevBB = reinterpret_cast(condBBs.pop().u.p); + BasicBlock *forkBB = reinterpret_cast(joinBBs.pop().u.p); + + if (!bb->isTerminated()) { + // we only want join if none of the clauses ended with CONT/BREAK/RET + if (prevBB->getExit()->op == OP_BRA && joinBBs.getSize() < 6) + insertConvergenceOps(convBB, forkBB); + mkFlow(OP_BRA, convBB, CC_ALWAYS, NULL); + bb->cfg.attach(&convBB->cfg, Graph::Edge::FORWARD); + } + + if (prevBB->getExit()->op == OP_BRA) { + prevBB->cfg.attach(&convBB->cfg, Graph::Edge::FORWARD); + prevBB->getExit()->asFlow()->target.bb = convBB; + } + setPosition(convBB, true); + } + break; + case TGSI_OPCODE_BGNLOOP: + { + BasicBlock *lbgnBB = new BasicBlock(func); + BasicBlock *lbrkBB = new BasicBlock(func); + + loopBBs.push(lbgnBB); + breakBBs.push(lbrkBB); + if (loopBBs.getSize() > func->loopNestingBound) + func->loopNestingBound++; + + mkFlow(OP_PREBREAK, lbrkBB, CC_ALWAYS, NULL); + + bb->cfg.attach(&lbgnBB->cfg, Graph::Edge::TREE); + setPosition(lbgnBB, true); + mkFlow(OP_PRECONT, lbgnBB, CC_ALWAYS, NULL); + } + break; + case TGSI_OPCODE_ENDLOOP: + { + BasicBlock *loopBB = reinterpret_cast(loopBBs.pop().u.p); + + if 
(!bb->isTerminated()) { + mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL); + bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK); + } + setPosition(reinterpret_cast(breakBBs.pop().u.p), true); + } + break; + case TGSI_OPCODE_BRK: + { + if (bb->isTerminated()) + break; + BasicBlock *brkBB = reinterpret_cast(breakBBs.peek().u.p); + mkFlow(OP_BREAK, brkBB, CC_ALWAYS, NULL); + bb->cfg.attach(&brkBB->cfg, Graph::Edge::CROSS); + } + break; + case TGSI_OPCODE_CONT: + { + if (bb->isTerminated()) + break; + BasicBlock *contBB = reinterpret_cast(loopBBs.peek().u.p); + mkFlow(OP_CONT, contBB, CC_ALWAYS, NULL); + contBB->explicitCont = true; + bb->cfg.attach(&contBB->cfg, Graph::Edge::BACK); + } + break; + case TGSI_OPCODE_BGNSUB: + { + if (!retIPs.getSize()) { + // end of main function + ip = code->scan.num_instructions - 2; // goto END + return true; + } + BasicBlock *entry = new BasicBlock(func); + BasicBlock *leave = new BasicBlock(func); + entryBBs.push(entry); + leaveBBs.push(leave); + bb->cfg.attach(&entry->cfg, Graph::Edge::TREE); + setPosition(entry, true); + } + return true; + case TGSI_OPCODE_ENDSUB: + { + BasicBlock *leave = reinterpret_cast(leaveBBs.pop().u.p); + entryBBs.pop(); + bb->cfg.attach(&leave->cfg, Graph::Edge::TREE); + setPosition(leave, true); + ip = retIPs.pop().u.u; + } + return true; + case TGSI_OPCODE_CAL: + // we don't have function declarations, so inline everything + retIPs.push(ip); + ip = code->subroutines[tgsi.getLabel()].pc - 1; // +1 after return + return true; + case TGSI_OPCODE_RET: + { + if (bb->isTerminated()) + return true; + BasicBlock *entry = reinterpret_cast(entryBBs.peek().u.p); + BasicBlock *leave = reinterpret_cast(leaveBBs.peek().u.p); + if (!isEndOfSubroutine(ip + 1)) { + // insert a PRERET at the entry if this is an early return + FlowInstruction *preRet = new_FlowInstruction(func, OP_PRERET, leave); + preRet->fixed = 1; + entry->insertHead(preRet); + bb->cfg.attach(&leave->cfg, Graph::Edge::CROSS); + } + // everything inlined so RET 
serves only to wrap up the stack + if (entry->getEntry() && entry->getEntry()->op == OP_PRERET) + mkFlow(OP_RET, NULL, CC_ALWAYS, NULL)->fixed = 1; + } + break; + case TGSI_OPCODE_END: + { + // attach and generate epilogue code + BasicBlock *epilogue = reinterpret_cast(leaveBBs.pop().u.p); + entryBBs.pop(); + bb->cfg.attach(&epilogue->cfg, Graph::Edge::TREE); + setPosition(epilogue, true); + if (prog->getType() == Program::TYPE_FRAGMENT) + exportOutputs(); + if (info->io.clipDistanceCount) + handleUserClipPlanes(); + mkOp(OP_EXIT, TYPE_NONE, NULL)->terminator = 1; + } + break; + case TGSI_OPCODE_SWITCH: + case TGSI_OPCODE_CASE: + ERROR("switch/case opcode encountered, should have been lowered\n"); + abort(); + break; + default: + ERROR("unhandled TGSI opcode: %u\n", tgsi.getOpcode()); + assert(0); + break; + } + + if (tgsi.dstCount()) { + for (c = 0; c < 4; ++c) { + if (!dst0[c]) + continue; + if (dst0[c] != rDst0[c]) + mkMov(rDst0[c], dst0[c]); + storeDst(0, c, rDst0[c]); + } + } + vtxBaseValid = 0; + + return true; +} + +void +Converter::handleUserClipPlanes() +{ + Value *res[8]; + int i, c; + + for (c = 0; c < 4; ++c) { + for (i = 0; i < info->io.clipDistanceCount; ++i) { + Value *ucp; + ucp = mkLoad(TYPE_F32, mkSymbol(FILE_MEMORY_CONST, 15, TYPE_F32, + i * 16 + c * 4), NULL); + if (c == 0) + res[i] = mkOp2v(OP_MUL, TYPE_F32, getScratch(), clipVtx[c], ucp); + else + mkOp3(OP_MAD, TYPE_F32, res[i], clipVtx[c], ucp, res[i]); + } + } + + for (i = 0; i < info->io.clipDistanceCount; ++i) + mkOp2(OP_WRSV, TYPE_F32, NULL, mkSysVal(SV_CLIP_DISTANCE, i), res[i]); +} + +void +Converter::exportOutputs() +{ + for (unsigned int i = 0; i < info->numOutputs; ++i) { + for (unsigned int c = 0; c < 4; ++c) { + if (!oData.exists(i, c)) + continue; + Symbol *sym = mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32, + info->out[i].slot[c] * 4); + Value *val = oData.load(i, c, NULL); + if (val) + mkStore(OP_EXPORT, TYPE_F32, sym, NULL, val); + } + } +} + +Converter::Converter(Program *ir, 
const tgsi::Source *src) + : code(src), + tgsi(NULL), + tData(this), aData(this), pData(this), oData(this) +{ + prog = ir; + info = code->info; + + DataFile tFile = code->mainTempsInLMem ? FILE_MEMORY_LOCAL : FILE_GPR; + + tData.setup(0, code->fileSize(TGSI_FILE_TEMPORARY), 4, 4, tFile); + pData.setup(0, code->fileSize(TGSI_FILE_PREDICATE), 4, 4, FILE_PREDICATE); + aData.setup(0, code->fileSize(TGSI_FILE_ADDRESS), 4, 4, FILE_ADDRESS); + oData.setup(0, code->fileSize(TGSI_FILE_OUTPUT), 4, 4, FILE_GPR); + + lData = NULL; + iData = NULL; + + zero = mkImm((uint32_t)0); + + vtxBaseValid = 0; +} + +Converter::~Converter() +{ + if (lData) + delete[] lData; + if (iData) + delete[] iData; +} + +bool +Converter::run() +{ + BasicBlock *entry = new BasicBlock(prog->main); + BasicBlock *leave = new BasicBlock(prog->main); + + if (code->tempArrayCount && !lData) { + uint32_t volume = 0; + lData = new DataArray[code->tempArrayCount]; + if (!lData) + return false; + for (int i = 0; i < code->tempArrayCount; ++i) { + int len = code->tempArrays[i].u32 >> 2; + int dim = code->tempArrays[i].u32 & 3; + lData[i].setParent(this); + lData[i].setup(volume, len, dim, 4, FILE_MEMORY_LOCAL); + volume += (len * dim * 4 + 0xf) & ~0xf; + } + } + if (code->immdArrayCount && !iData) { + uint32_t volume = 0; + iData = new DataArray[code->immdArrayCount]; + if (!iData) + return false; + for (int i = 0; i < code->immdArrayCount; ++i) { + int len = code->immdArrays[i].u32 >> 2; + int dim = code->immdArrays[i].u32 & 3; + iData[i].setParent(this); + iData[i].setup(volume, len, dim, 4, FILE_MEMORY_CONST, 14); + volume += (len * dim * 4 + 0xf) & ~0xf; + } + } + + prog->main->setEntry(entry); + prog->main->setExit(leave); + + setPosition(entry, true); + entryBBs.push(entry); + leaveBBs.push(leave); + + if (info->io.clipDistanceCount) { + for (int c = 0; c < 4; ++c) + clipVtx[c] = getScratch(); + } + + if (prog->getType() == Program::TYPE_FRAGMENT) { + Symbol *sv = mkSysVal(SV_POSITION, 3); + fragCoord[3] = 
mkOp1v(OP_RDSV, TYPE_F32, getSSA(), sv); + mkOp1(OP_RCP, TYPE_F32, fragCoord[3], fragCoord[3]); + } + + for (ip = 0; ip < code->scan.num_instructions; ++ip) { + if (!handleInstruction(&code->insns[ip])) + return false; + } + return true; +} + +} // unnamed namespace + +namespace nv50_ir { + +bool +Program::makeFromTGSI(struct nv50_ir_prog_info *info) +{ + tgsi::Source src(info); + if (!src.scanSource()) + return false; + + Converter builder(this, &src); + return builder.run(); +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_graph.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_graph.cpp new file mode 100644 index 00000000000..08075751d14 --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_graph.cpp @@ -0,0 +1,381 @@ + +#include "nv50_ir_graph.h" + +namespace nv50_ir { + +Graph::Graph() +{ + root = NULL; + size = 0; + sequence = 0; +} + +Graph::~Graph() +{ + Iterator *iter = this->safeIteratorDFS(); + + for (; !iter->end(); iter->next()) + reinterpret_cast(iter->get())->cut(); + + putIterator(iter); +} + +void Graph::insert(Node *node) +{ + if (!root) { + root = node; + size = 1; + node->graph = this; + } else { + root->attach(node, Edge::TREE); + } +} + +void Graph::Edge::unlink() +{ + if (origin) { + prev[0]->next[0] = next[0]; + next[0]->prev[0] = prev[0]; + if (origin->out == this) + origin->out = (next[0] == this) ? NULL : next[0]; + + --origin->outCount; + } + if (target) { + prev[1]->next[1] = next[1]; + next[1]->prev[1] = prev[1]; + if (target->in == this) + target->in = (next[1] == this) ? 
NULL : next[1]; + + --target->inCount; + } +} + +const char *Graph::Edge::typeStr() const +{ + switch (type) { + case TREE: return "tree"; + case FORWARD: return "forward"; + case BACK: return "back"; + case CROSS: return "cross"; + case DUMMY: return "dummy"; + case UNKNOWN: + default: + return "unk"; + } +} + +Graph::Node::Node(void *priv) : data(priv), + in(0), out(0), graph(0), + visited(0), + inCount(0), outCount(0) +{ + // nothing to do +} + +void Graph::Node::attach(Node *node, Edge::Type kind) +{ + Edge *edge = new Edge(this, node, kind); + + // insert head + if (this->out) { + edge->next[0] = this->out; + edge->prev[0] = this->out->prev[0]; + edge->prev[0]->next[0] = edge; + this->out->prev[0] = edge; + } + this->out = edge; + + if (node->in) { + edge->next[1] = node->in; + edge->prev[1] = node->in->prev[1]; + edge->prev[1]->next[1] = edge; + node->in->prev[1] = edge; + } + node->in = edge; + + ++this->outCount; + ++node->inCount; + + assert(this->graph); + if (!node->graph) { + node->graph = this->graph; + ++node->graph->size; + } + + if (kind == Edge::UNKNOWN) + graph->classifyEdges(); +} + +bool Graph::Node::detach(Graph::Node *node) +{ + EdgeIterator ei = this->outgoing(); + for (; !ei.end(); ei.next()) + if (ei.getNode() == node) + break; + if (ei.end()) { + ERROR("no such node attached\n"); + return false; + } + delete ei.getEdge(); + return true; +} + +// Cut a node from the graph, deleting all attached edges. 
+void Graph::Node::cut() +{ + if (!graph || (!in && !out)) + return; + + while (out) + delete out; + while (in) + delete in; + + if (graph->root == this) + graph->root = NULL; +} + +Graph::Edge::Edge(Node *org, Node *tgt, Type kind) +{ + target = tgt; + origin = org; + type = kind; + + next[0] = next[1] = this; + prev[0] = prev[1] = this; +} + +bool +Graph::Node::reachableBy(Node *node, Node *term) +{ + Stack stack; + Node *pos; + const int seq = graph->nextSequence(); + + stack.push(node); + + while (stack.getSize()) { + pos = reinterpret_cast(stack.pop().u.p); + + if (pos == this) + return true; + if (pos == term) + continue; + + for (EdgeIterator ei = pos->outgoing(); !ei.end(); ei.next()) { + if (ei.getType() == Edge::BACK || ei.getType() == Edge::DUMMY) + continue; + if (ei.getNode()->visit(seq)) + stack.push(ei.getNode()); + } + } + return pos == this; +} + +class DFSIterator : public Graph::GraphIterator +{ +public: + DFSIterator(Graph *graph, const bool preorder) + { + unsigned int seq = graph->nextSequence(); + + nodes = new Graph::Node * [graph->getSize() + 1]; + count = 0; + pos = 0; + nodes[graph->getSize()] = 0; + + if (graph->getRoot()) { + graph->getRoot()->visit(seq); + search(graph->getRoot(), preorder, seq); + } + } + + ~DFSIterator() + { + if (nodes) + delete[] nodes; + } + + void search(Graph::Node *node, const bool preorder, const int sequence) + { + if (preorder) + nodes[count++] = node; + + for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next()) + if (ei.getNode()->visit(sequence)) + search(ei.getNode(), preorder, sequence); + + if (!preorder) + nodes[count++] = node; + } + + virtual bool end() const { return pos >= count; } + virtual void next() { if (pos < count) ++pos; } + virtual void *get() const { return nodes[pos]; } + + void reset() { pos = 0; } + +protected: + Graph::Node **nodes; + int count; + int pos; +}; + +Graph::GraphIterator *Graph::iteratorDFS(bool preorder) +{ + return new DFSIterator(this, preorder); +} + 
+Graph::GraphIterator *Graph::safeIteratorDFS(bool preorder) +{ + return this->iteratorDFS(preorder); +} + +class CFGIterator : public Graph::GraphIterator +{ +public: + CFGIterator(Graph *graph) + { + nodes = new Graph::Node * [graph->getSize() + 1]; + count = 0; + pos = 0; + nodes[graph->getSize()] = 0; + + // TODO: argh, use graph->sequence instead of tag and just raise it by > 1 + Iterator *iter; + for (iter = graph->iteratorDFS(); !iter->end(); iter->next()) + reinterpret_cast(iter->get())->tag = 0; + graph->putIterator(iter); + + if (graph->getRoot()) + search(graph->getRoot(), graph->nextSequence()); + } + + ~CFGIterator() + { + if (nodes) + delete[] nodes; + } + + virtual void *get() const { return nodes[pos]; } + virtual bool end() const { return pos >= count; } + virtual void next() { if (pos < count) ++pos; } + +private: + void search(Graph::Node *node, const int sequence) + { + Stack bb, cross; + + bb.push(node); + + while (bb.getSize()) { + node = reinterpret_cast(bb.pop().u.p); + assert(node); + if (!node->visit(sequence)) + continue; + node->tag = 0; + + for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next()) { + switch (ei.getType()) { + case Graph::Edge::TREE: + case Graph::Edge::FORWARD: + case Graph::Edge::DUMMY: + if (++(ei.getNode()->tag) == ei.getNode()->incidentCountFwd()) + bb.push(ei.getNode()); + break; + case Graph::Edge::BACK: + continue; + case Graph::Edge::CROSS: + if (++(ei.getNode()->tag) == 1) + cross.push(ei.getNode()); + break; + default: + assert(!"unknown edge kind in CFG"); + break; + } + } + nodes[count++] = node; + + if (bb.getSize() == 0) + cross.moveTo(bb); + } + } + +private: + Graph::Node **nodes; + int count; + int pos; +}; + +Graph::GraphIterator *Graph::iteratorCFG() +{ + return new CFGIterator(this); +} + +Graph::GraphIterator *Graph::safeIteratorCFG() +{ + return this->iteratorCFG(); +} + +void Graph::classifyEdges() +{ + DFSIterator *iter; + int seq; + + for (iter = new DFSIterator(this, true); 
!iter->end(); iter->next()) { + Node *node = reinterpret_cast(iter->get()); + node->visit(0); + node->tag = 0; + } + putIterator(iter); + + classifyDFS(root, (seq = 0)); + + sequence = seq; +} + +void Graph::classifyDFS(Node *curr, int& seq) +{ + Graph::Edge *edge; + Graph::Node *node; + + curr->visit(++seq); + curr->tag = 1; + + for (edge = curr->out; edge; edge = edge->next[0]) { + node = edge->target; + if (edge->type == Edge::DUMMY) + continue; + + if (node->getSequence() == 0) { + edge->type = Edge::TREE; + classifyDFS(node, seq); + } else + if (node->getSequence() > curr->getSequence()) { + edge->type = Edge::FORWARD; + } else { + edge->type = node->tag ? Edge::BACK : Edge::CROSS; + } + } + + for (edge = curr->in; edge; edge = edge->next[1]) { + node = edge->origin; + if (edge->type == Edge::DUMMY) + continue; + + if (node->getSequence() == 0) { + edge->type = Edge::TREE; + classifyDFS(node, seq); + } else + if (node->getSequence() > curr->getSequence()) { + edge->type = Edge::FORWARD; + } else { + edge->type = node->tag ? Edge::BACK : Edge::CROSS; + } + } + + curr->tag = 0; +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_graph.h b/src/gallium/drivers/nv50/codegen/nv50_ir_graph.h new file mode 100644 index 00000000000..6407ff98ab5 --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_graph.h @@ -0,0 +1,207 @@ + +#ifndef __NV50_IR_GRAPH_H__ +#define __NV50_IR_GRAPH_H__ + +#include "nv50_ir_util.h" + +namespace nv50_ir { + +#define ITER_NODE(x) reinterpret_cast((x).get()) +#define ITER_EDGE(x) reinterpret_cast((x).get()) + +// A connected graph. +class Graph +{ +public: + class Node; + + class GraphIterator : public Iterator + { + public: + virtual ~GraphIterator() { }; + }; + + class Edge + { + public: + enum Type + { + UNKNOWN, + TREE, + FORWARD, + BACK, + CROSS, // e.g. 
loop break + DUMMY + }; + + Edge(Node *dst, Node *src, Type kind); + ~Edge() { unlink(); } + + inline Node *getOrigin() const { return origin; } + inline Node *getTarget() const { return target; } + + inline Type getType() const { return type; } + const char *typeStr() const; + + private: + Node *origin; + Node *target; + + Type type; + Edge *next[2]; // next edge outgoing/incident from/to origin/target + Edge *prev[2]; + + void unlink(); + + friend class Graph; + }; + + class EdgeIterator : public Iterator + { + public: + EdgeIterator() : e(0), t(0), d(0) { } + EdgeIterator(Graph::Edge *first, int dir) : e(first), t(first), d(dir) { } + + virtual void next() { e = (e->next[d] == t) ? 0 : e->next[d]; } + virtual bool end() const { return !e; } + virtual void *get() const { return e; } + + inline Node *getNode() const { assert(e); return d ? + e->origin : e->target; } + inline Edge *getEdge() const { return e; } + inline Edge::Type getType() { return e ? e->getType() : Edge::UNKNOWN; } + + private: + Graph::Edge *e; + Graph::Edge *t; + int d; + }; + + class Node + { + public: + Node(void *); + ~Node() { cut(); } + + void attach(Node *, Edge::Type); + bool detach(Node *); + void cut(); + + inline EdgeIterator outgoing() const; + inline EdgeIterator incident() const; + + inline Node *parent() const; // returns NULL if count(incident edges) != 1 + + bool reachableBy(Node *node, Node *term); + + inline bool visit(int); + inline int getSequence() const; + + inline int incidentCountFwd() const; // count of incident non-back edges + inline int incidentCount() const { return inCount; } + inline int outgoingCount() const { return outCount; } + + Graph *getGraph() const { return graph; } + + void *data; + + private: + Edge *in; + Edge *out; + Graph *graph; + + int visited; + + int16_t inCount; + int16_t outCount; + public: + int tag; // for temporary use + + friend class Graph; + }; + +public: + Graph(); + ~Graph(); // does *not* free the nodes (make it an option ?) 
+ + inline Node *getRoot() const { return root; } + + inline unsigned int getSize() const { return size; } + + inline int nextSequence(); + + void insert(Node *node); // attach to or set as root + + GraphIterator *iteratorDFS(bool preorder = true); + GraphIterator *iteratorCFG(); + + // safe iterators are unaffected by changes to the *edges* of the graph + GraphIterator *safeIteratorDFS(bool preorder = true); + GraphIterator *safeIteratorCFG(); + + inline void putIterator(Iterator *); // should be GraphIterator * + + void classifyEdges(); + +private: + void classifyDFS(Node *, int&); + +private: + Node *root; + unsigned int size; + int sequence; +}; + +int Graph::nextSequence() +{ + return ++sequence; +} + +Graph::Node *Graph::Node::parent() const +{ + if (inCount != 1) + return NULL; + assert(in); + return in->origin; +} + +bool Graph::Node::visit(int v) +{ + if (visited == v) + return false; + visited = v; + return true; +} + +int Graph::Node::getSequence() const +{ + return visited; +} + +void Graph::putIterator(Iterator *iter) +{ + delete reinterpret_cast(iter); +} + +Graph::EdgeIterator Graph::Node::outgoing() const +{ + return EdgeIterator(out, 0); +} + +Graph::EdgeIterator Graph::Node::incident() const +{ + return EdgeIterator(in, 1); +} + +int Graph::Node::incidentCountFwd() const +{ + int n = 0; + for (EdgeIterator ei = incident(); !ei.end(); ei.next()) + if (ei.getType() != Edge::BACK) + ++n; + return n; +} + +} // namespace nv50_ir + +#endif // __NV50_IR_GRAPH_H__ diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_inlines.h b/src/gallium/drivers/nv50/codegen/nv50_ir_inlines.h new file mode 100644 index 00000000000..8730e953482 --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_inlines.h @@ -0,0 +1,328 @@ + +#ifndef __NV50_IR_INLINES_H__ +#define __NV50_IR_INLINES_H__ + +static inline CondCode reverseCondCode(CondCode cc) +{ + static const uint8_t ccRev[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; + + return static_cast(ccRev[cc & 7] | (cc & ~7)); +} + 
+static inline CondCode inverseCondCode(CondCode cc) +{ + return static_cast(cc ^ 7); +} + +static inline bool isMemoryFile(DataFile f) +{ + return (f >= FILE_MEMORY_CONST && f <= FILE_MEMORY_LOCAL); +} + +static inline bool isTextureOp(operation op) +{ + return (op >= OP_TEX && op <= OP_TEXCSAA); +} + +static inline unsigned int typeSizeof(DataType ty) +{ + switch (ty) { + case TYPE_U8: + case TYPE_S8: + return 1; + case TYPE_F16: + case TYPE_U16: + case TYPE_S16: + return 2; + case TYPE_F32: + case TYPE_U32: + case TYPE_S32: + return 4; + case TYPE_F64: + case TYPE_U64: + case TYPE_S64: + return 8; + case TYPE_B96: + return 12; + case TYPE_B128: + return 16; + default: + return 0; + } +} + +static inline DataType typeOfSize(unsigned int size, + bool flt = false, bool sgn = false) +{ + switch (size) { + case 1: return sgn ? TYPE_S8 : TYPE_U8; + case 2: return flt ? TYPE_F16 : (sgn ? TYPE_S16 : TYPE_U16); + case 8: return flt ? TYPE_F64 : (sgn ? TYPE_S64 : TYPE_U64); + case 12: return TYPE_B96; + case 16: return TYPE_B128; + case 4: + default: + return flt ? TYPE_F32 : (sgn ? TYPE_S32 : TYPE_U32); + } +} + +static inline bool isFloatType(DataType ty) +{ + return (ty >= TYPE_F16 && ty <= TYPE_F64); +} + +static inline bool isSignedIntType(DataType ty) +{ + return (ty == TYPE_S8 || ty == TYPE_S16 || ty == TYPE_S32); +} + +static inline bool isSignedType(DataType ty) +{ + switch (ty) { + case TYPE_NONE: + case TYPE_U8: + case TYPE_U16: + case TYPE_U32: + case TYPE_B96: + case TYPE_B128: + return false; + default: + return true; + } +} + +const ValueRef *ValueRef::getIndirect(int dim) const +{ + return isIndirect(dim) ? &insn->src[indirect[dim]] : NULL; +} + +DataFile ValueRef::getFile() const +{ + return value ? value->reg.file : FILE_NULL; +} + +unsigned int ValueRef::getSize() const +{ + return value ? 
value->reg.size : 0; +} + +Value *ValueRef::rep() const +{ + assert(value); + return value->join; +} + +Value *ValueDef::rep() const +{ + assert(value); + return value->join; +} + +DataFile ValueDef::getFile() const +{ + return value ? value->reg.file : FILE_NULL; +} + +unsigned int ValueDef::getSize() const +{ + return value ? value->reg.size : 0; +} + +void ValueDef::setSSA(LValue *lval) +{ + Value *save = value; + + this->set(NULL); + prev = reinterpret_cast(save); + value = lval; + lval->defs = this; +} + +void ValueDef::restoreDefList() +{ + if (next == this) + prev = this; +} + +const LValue *ValueDef::preSSA() const +{ + return reinterpret_cast(prev); +} + +Instruction *Value::getInsn() const +{ + assert(!defs || getUniqueInsn()); + return defs ? defs->getInsn() : NULL; +} + +Instruction *Value::getUniqueInsn() const +{ + if (defs) { + if (join != this) { + ValueDef::Iterator it = defs->iterator(); + while (!it.end() && it.get()->get() != this) + it.next(); + assert(it.get()->get() == this); + return it.get()->getInsn(); + } + + // after regalloc, the definitions of coalesced values are linked + if (reg.data.id < 0) { + ValueDef::Iterator it = defs->iterator(); + int nDef; + for (nDef = 0; !it.end() && nDef < 2; it.next()) + if (it.get()->get() == this) // don't count joined values + ++nDef; + if (nDef > 1) + WARN("value %%%i not uniquely defined\n", id); // return NULL ? + } + + assert(defs->get() == this); + return defs->getInsn(); + } + return NULL; +} + +Value *Instruction::getIndirect(int s, int dim) const +{ + return src[s].isIndirect(dim) ? getSrc(src[s].indirect[dim]) : NULL; +} + +Value *Instruction::getPredicate() const +{ + return (predSrc >= 0) ? getSrc(predSrc) : NULL; +} + +Value *TexInstruction::getIndirectR() const +{ + return tex.rIndirectSrc >= 0 ? getSrc(tex.rIndirectSrc) : NULL; +} + +Value *TexInstruction::getIndirectS() const +{ + return tex.sIndirectSrc >= 0 ? 
getSrc(tex.sIndirectSrc) : NULL; /* S = sampler: read sIndirectSrc, not the resource index rIndirectSrc used by getIndirectR() */ +} + +CmpInstruction *Instruction::asCmp() +{ + if (op >= OP_SET_AND && op <= OP_SLCT && op != OP_SELP) + return static_cast<CmpInstruction *>(this); + return NULL; +} + +const CmpInstruction *Instruction::asCmp() const +{ + if (op >= OP_SET_AND && op <= OP_SLCT && op != OP_SELP) + return static_cast<const CmpInstruction *>(this); + return NULL; +} + +FlowInstruction *Instruction::asFlow() +{ + if (op >= OP_BRA && op <= OP_JOIN) + return static_cast<FlowInstruction *>(this); + return NULL; +} + +const FlowInstruction *Instruction::asFlow() const +{ + if (op >= OP_BRA && op <= OP_JOIN) /* same opcode range as the non-const overload, so OP_JOIN is classified identically */ + return static_cast<const FlowInstruction *>(this); + return NULL; +} + +TexInstruction *Instruction::asTex() +{ + if (op >= OP_TEX && op <= OP_TEXCSAA) + return static_cast<TexInstruction *>(this); + return NULL; +} + +const TexInstruction *Instruction::asTex() const +{ + if (op >= OP_TEX && op <= OP_TEXCSAA) + return static_cast<const TexInstruction *>(this); + return NULL; +} + +// XXX: use a virtual function so we're really really safe ? +LValue *Value::asLValue() +{ + if (reg.file >= FILE_GPR && reg.file <= FILE_ADDRESS) + return static_cast<LValue *>(this); + return NULL; +} + +Symbol *Value::asSym() +{ + if (reg.file >= FILE_MEMORY_CONST) + return static_cast<Symbol *>(this); + return NULL; +} + +const Symbol *Value::asSym() const +{ + if (reg.file >= FILE_MEMORY_CONST) + return static_cast<const Symbol *>(this); + return NULL; +} + +void Symbol::setOffset(int32_t offset) +{ + reg.data.offset = offset; +} + +void Symbol::setAddress(Symbol *base, int32_t offset) +{ + baseSym = base; + reg.data.offset = offset; +} + +void Symbol::setSV(SVSemantic sv, uint32_t index) +{ + reg.data.sv.sv = sv; + reg.data.sv.index = index; +} + +ImmediateValue *Value::asImm() +{ + if (reg.file == FILE_IMMEDIATE) + return static_cast<ImmediateValue *>(this); + return NULL; +} + +const ImmediateValue *Value::asImm() const +{ + if (reg.file == FILE_IMMEDIATE) + return static_cast<const ImmediateValue *>(this); + return NULL; +} + +Value *Value::get(Iterator &it) +{ + return reinterpret_cast<Value *>(it.get()); +} + +bool BasicBlock::reachableBy(BasicBlock *by, BasicBlock *term) 
+{ + return cfg.reachableBy(&by->cfg, &term->cfg); +} + +BasicBlock *BasicBlock::get(Iterator &iter) +{ + return reinterpret_cast(iter.get()); +} + +BasicBlock *BasicBlock::get(Graph::Node *node) +{ + assert(node); + return reinterpret_cast(node->data); +} + +LValue *Function::getLValue(int id) +{ + assert((unsigned int)id < (unsigned int)allLValues.getSize()); + return reinterpret_cast(allLValues.get(id)); +} + +#endif // __NV50_IR_INLINES_H__ diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp new file mode 100644 index 00000000000..bd331ea8f03 --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp @@ -0,0 +1,2192 @@ + +#include "nv50_ir.h" +#include "nv50_ir_target.h" +#include "nv50_ir_build_util.h" + +extern "C" { +#include "util/u_math.h" +} + +namespace nv50_ir { + +bool +Instruction::isNop() const +{ + if (op == OP_CONSTRAINT || op == OP_PHI) + return true; + if (terminator || join) // XXX: should terminator imply flow ? 
+ return false; + if (!fixed && op == OP_NOP) + return true; + + if (def[0].exists() && def[0].rep()->reg.data.id < 0) { + for (int d = 1; defExists(d); ++d) + if (def[d].rep()->reg.data.id >= 0) + WARN("part of vector result is unused !\n"); + return true; + } + + if (op == OP_MOV || op == OP_UNION) { + if (!def[0].rep()->equals(getSrc(0))) + return false; + if (op == OP_UNION) + if (!def[0].rep()->equals(getSrc(1))) + return false; + return true; + } + + return false; +} + +bool Instruction::isDead() const +{ + if (op == OP_STORE || + op == OP_EXPORT) + return false; + + for (int d = 0; defExists(d); ++d) + if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0) + return false; + + if (terminator || asFlow()) + return false; + if (fixed) + return false; + + return true; +}; + +// ============================================================================= + +class CopyPropagation : public Pass +{ +private: + virtual bool visit(BasicBlock *); +}; + +// Propagate all MOVs forward to make subsequent optimization easier, except if +// the sources stem from a phi, in which case we don't want to mess up potential +// swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def. 
+bool +CopyPropagation::visit(BasicBlock *bb) +{ + Instruction *mov, *si, *next; + + for (mov = bb->getEntry(); mov; mov = next) { + next = mov->next; + if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue()) + continue; + si = mov->getSrc(0)->getInsn(); + if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) { + // propagate + mov->def[0].replace(mov->getSrc(0), false); + delete_Instruction(prog, mov); + } + } + return true; +} + +// ============================================================================= + +class LoadPropagation : public Pass +{ +private: + virtual bool visit(BasicBlock *); + + void checkSwapSrc01(Instruction *); + + bool isCSpaceLoad(Instruction *); + bool isImmd32Load(Instruction *); +}; + +bool +LoadPropagation::isCSpaceLoad(Instruction *ld) +{ + return ld && ld->op == OP_LOAD && ld->src[0].getFile() == FILE_MEMORY_CONST; +} + +bool +LoadPropagation::isImmd32Load(Instruction *ld) +{ + if (!ld || (ld->op != OP_MOV) || (typeSizeof(ld->dType) != 4)) + return false; + return ld->src[0].getFile() == FILE_IMMEDIATE; +} + +void +LoadPropagation::checkSwapSrc01(Instruction *insn) +{ + if (!prog->getTarget()->getOpInfo(insn).commutative) + if (insn->op != OP_SET && insn->op != OP_SLCT) + return; + if (insn->src[1].getFile() != FILE_GPR) + return; + + Instruction *i0 = insn->getSrc(0)->getInsn(); + Instruction *i1 = insn->getSrc(1)->getInsn(); + + if (isCSpaceLoad(i0)) { + if (!isCSpaceLoad(i1)) + insn->swapSources(0, 1); + else + return; + } else + if (isImmd32Load(i0)) { + if (!isCSpaceLoad(i1) && !isImmd32Load(i1)) + insn->swapSources(0, 1); + else + return; + } else { + return; + } + + if (insn->op == OP_SET) + insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond); + else + if (insn->op == OP_SLCT) + insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond); +} + +bool +LoadPropagation::visit(BasicBlock *bb) +{ + const Target *targ = prog->getTarget(); + Instruction *next; + + for (Instruction *i = 
bb->getEntry(); i; i = next) { + next = i->next; + + if (i->srcExists(1)) + checkSwapSrc01(i); + + for (int s = 0; i->srcExists(s); ++s) { + Instruction *ld = i->getSrc(s)->getInsn(); + + if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV)) + continue; + if (!targ->insnCanLoad(i, s, ld)) + continue; + + // propagate ! + i->setSrc(s, ld->getSrc(0)); + if (ld->src[0].isIndirect(0)) + i->setIndirect(s, 0, ld->getIndirect(0, 0)); + + if (ld->getDef(0)->refCount() == 0) + delete_Instruction(prog, ld); + } + } + return true; +} + +// ============================================================================= + +// Evaluate constant expressions. +class ConstantFolding : public Pass +{ +public: + bool foldAll(Program *); + +private: + virtual bool visit(BasicBlock *); + + void expr(Instruction *, ImmediateValue *, ImmediateValue *); + void opnd(Instruction *, ImmediateValue *, int s); + + void unary(Instruction *, const ImmediateValue&); + + // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET + CmpInstruction *findOriginForTestWithZero(Value *); + + unsigned int foldCount; + + BuildUtil bld; +}; + +// TODO: remember generated immediates and only revisit these +bool +ConstantFolding::foldAll(Program *prog) +{ + unsigned int iterCount = 0; + do { + foldCount = 0; + if (!run(prog)) + return false; + } while (foldCount && ++iterCount < 2); + return true; +} + +bool +ConstantFolding::visit(BasicBlock *bb) +{ + Instruction *i, *next; + + for (i = bb->getEntry(); i; i = next) { + next = i->next; + if (i->op == OP_MOV) // continue early, MOV appears frequently + continue; + + ImmediateValue *src0 = i->src[0].getImmediate(); + ImmediateValue *src1 = i->src[1].getImmediate(); + + if (src0 && src1) + expr(i, src0, src1); + else + if (src0) + opnd(i, src0, 0); + else + if (src1) + opnd(i, src1, 1); + } + return true; +} + +CmpInstruction * +ConstantFolding::findOriginForTestWithZero(Value *value) +{ + if (!value) + return NULL; + Instruction *insn = 
value->getInsn(); + + while (insn && insn->op != OP_SET) { + Instruction *next = NULL; + switch (insn->op) { + case OP_NEG: + case OP_ABS: + case OP_CVT: + next = insn->getSrc(0)->getInsn(); + if (insn->sType != next->dType) + return NULL; + break; + case OP_MOV: + next = insn->getSrc(0)->getInsn(); + break; + default: + return NULL; + } + insn = next; + } + return insn ? insn->asCmp() : NULL; +} + +void +Modifier::applyTo(ImmediateValue& imm) const +{ + switch (imm.reg.type) { + case TYPE_F32: + if (bits & NV50_IR_MOD_ABS) + imm.reg.data.f32 = fabsf(imm.reg.data.f32); + if (bits & NV50_IR_MOD_NEG) + imm.reg.data.f32 = -imm.reg.data.f32; + if (bits & NV50_IR_MOD_SAT) { + if (imm.reg.data.f32 < 0.0f) + imm.reg.data.f32 = 0.0f; + else + if (imm.reg.data.f32 > 1.0f) + imm.reg.data.f32 = 1.0f; + } + assert(!(bits & NV50_IR_MOD_NOT)); + break; + + case TYPE_S8: // NOTE: will be extended + case TYPE_S16: + case TYPE_S32: + case TYPE_U8: // NOTE: treated as signed + case TYPE_U16: + case TYPE_U32: + if (bits & NV50_IR_MOD_ABS) + imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ? 
+ imm.reg.data.s32 : -imm.reg.data.s32; + if (bits & NV50_IR_MOD_NEG) + imm.reg.data.s32 = -imm.reg.data.s32; + if (bits & NV50_IR_MOD_NOT) + imm.reg.data.s32 = ~imm.reg.data.s32; + break; + + case TYPE_F64: + if (bits & NV50_IR_MOD_ABS) + imm.reg.data.f64 = fabs(imm.reg.data.f64); + if (bits & NV50_IR_MOD_NEG) + imm.reg.data.f64 = -imm.reg.data.f64; + if (bits & NV50_IR_MOD_SAT) { + if (imm.reg.data.f64 < 0.0) + imm.reg.data.f64 = 0.0; + else + if (imm.reg.data.f64 > 1.0) + imm.reg.data.f64 = 1.0; + } + assert(!(bits & NV50_IR_MOD_NOT)); + break; + + default: + assert(!"invalid/unhandled type"); + imm.reg.data.u64 = 0; + break; + } +} + +operation +Modifier::getOp() const +{ + switch (bits) { + case NV50_IR_MOD_ABS: return OP_ABS; + case NV50_IR_MOD_NEG: return OP_NEG; + case NV50_IR_MOD_SAT: return OP_SAT; + case NV50_IR_MOD_NOT: return OP_NOT; + case 0: + return OP_MOV; + default: + return OP_CVT; + } +} + +void +ConstantFolding::expr(Instruction *i, + ImmediateValue *src0, ImmediateValue *src1) +{ + ImmediateValue imm0(src0, i->sType); + ImmediateValue imm1(src1, i->sType); + struct Storage res; + struct Storage *const a = &imm0.reg, *const b = &imm1.reg; + + i->src[0].mod.applyTo(imm0); + i->src[1].mod.applyTo(imm1); + + switch (i->op) { + case OP_MAD: + case OP_FMA: + case OP_MUL: + if (i->dnz && i->dType == TYPE_F32) { + if (!isfinite(a->data.f32)) + a->data.f32 = 0.0f; + if (!isfinite(b->data.f32)) + b->data.f32 = 0.0f; + } + switch (i->dType) { + case TYPE_F32: res.data.f32 = a->data.f32 * b->data.f32; break; + case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break; + case TYPE_S32: + case TYPE_U32: res.data.u32 = a->data.u32 * b->data.u32; break; + default: + return; + } + break; + case OP_DIV: + if (b->data.u32 == 0) + break; + switch (i->dType) { + case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break; + case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break; + case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break; + 
case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break; + default: + return; + } + break; + case OP_ADD: + switch (i->dType) { + case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break; + case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break; + case TYPE_S32: + case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break; + default: + return; + } + break; + case OP_POW: + switch (i->dType) { + case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break; + case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break; + default: + return; + } + break; + case OP_MAX: + switch (i->dType) { + case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break; + case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break; + case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break; + case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break; + default: + return; + } + break; + case OP_MIN: + switch (i->dType) { + case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break; + case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break; + case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break; + case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break; + default: + return; + } + break; + case OP_AND: + res.data.u64 = a->data.u64 & b->data.u64; + break; + case OP_OR: + res.data.u64 = a->data.u64 | b->data.u64; + break; + case OP_XOR: + res.data.u64 = a->data.u64 ^ b->data.u64; + break; + case OP_SHL: + res.data.u32 = a->data.u32 << b->data.u32; + break; + case OP_SHR: + switch (i->dType) { + case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break; + case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break; + default: + return; + } + break; + case OP_SLCT: + if (a->data.u32 != b->data.u32) + return; + res.data.u32 = a->data.u32; + break; + default: + return; + } + ++foldCount; + + i->src[0].mod = Modifier(0); + i->src[1].mod = Modifier(0); + + 
i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32)); + i->setSrc(1, NULL); + + i->getSrc(0)->reg.data = res.data; + + if (i->op == OP_MAD || i->op == OP_FMA) { + i->op = OP_ADD; + + i->setSrc(1, i->getSrc(0)); + i->setSrc(0, i->getSrc(2)); + i->setSrc(2, NULL); + + i->src[1].mod = i->src[2].mod; + + src0 = i->src[0].getImmediate(); + if (src0) + expr(i, src0, i->getSrc(1)->asImm()); + } else { + i->op = OP_MOV; + } +} + +void +ConstantFolding::unary(Instruction *i, const ImmediateValue &imm) +{ + Storage res; + + if (i->dType != TYPE_F32) + return; + switch (i->op) { + case OP_NEG: res.data.f32 = -imm.reg.data.f32; break; + case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break; + case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break; + case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break; + case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break; + case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break; + case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break; + case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break; + case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break; + case OP_PRESIN: + case OP_PREEX2: + // these should be handled in subsequent OP_SIN/COS/EX2 + res.data.f32 = imm.reg.data.f32; + break; + default: + return; + } + i->op = OP_MOV; + i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32)); + i->src[0].mod = Modifier(0); +} + +void +ConstantFolding::opnd(Instruction *i, ImmediateValue *src, int s) +{ + const int t = !s; + const operation op = i->op; + + ImmediateValue imm(src, i->sType); + + i->src[s].mod.applyTo(imm); + + switch (i->op) { + case OP_MUL: + if (i->dType == TYPE_F32 && i->getSrc(t)->refCount() == 1) { + Instruction *si = i->getSrc(t)->getUniqueInsn(); + + if (si && si->op == OP_MUL) { + float f = imm.reg.data.f32; + + if (si->src[1].getImmediate()) { + f *= si->src[1].getImmediate()->reg.data.f32; + si->setSrc(1, new_ImmediateValue(prog, f)); + 
i->def[0].replace(i->getSrc(t), false); + break; + } else { + int fac; + if (f == 0.125f) fac = -3; + else + if (f == 0.250f) fac = -2; + else + if (f == 0.500f) fac = -1; + else + if (f == 2.000f) fac = +1; + else + if (f == 4.000f) fac = +2; + else + if (f == 8.000f) fac = +3; + else + fac = 0; + if (fac) { + // FIXME: allowed & modifier + si->postFactor = fac; + i->def[0].replace(i->getSrc(t), false); + break; + } + } + } + } + if (imm.isInteger(0)) { + i->op = OP_MOV; + i->setSrc(0, i->getSrc(s)); + i->setSrc(1, NULL); + } else + if (imm.isInteger(1) || imm.isInteger(-1)) { + if (imm.isNegative()) + i->src[t].mod = i->src[t].mod ^ Modifier(NV50_IR_MOD_NEG); + i->op = i->src[t].mod.getOp(); + if (s == 0) { + i->setSrc(0, i->getSrc(1)); + i->src[0].mod = i->src[1].mod; + i->src[1].mod = 0; + } + if (i->op != OP_CVT) + i->src[0].mod = 0; + i->setSrc(1, NULL); + } else + if (imm.isInteger(2) || imm.isInteger(-2)) { + if (imm.isNegative()) + i->src[t].mod = i->src[t].mod ^ Modifier(NV50_IR_MOD_NEG); + i->op = OP_ADD; + i->setSrc(s, i->getSrc(t)); + i->src[s].mod = i->src[t].mod; + } else + if (!isFloatType(i->sType) && !imm.isNegative() && imm.isPow2()) { + i->op = OP_SHL; + imm.applyLog2(); + i->setSrc(1, new_ImmediateValue(prog, imm.reg.data.u32)); + } + break; + case OP_ADD: + if (imm.isInteger(0)) { + if (s == 0) { + i->setSrc(0, i->getSrc(1)); + i->src[0].mod = i->src[1].mod; + } + i->setSrc(1, NULL); + i->op = i->src[0].mod.getOp(); + if (i->op != OP_CVT) + i->src[0].mod = Modifier(0); + } + break; + + case OP_DIV: + if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32)) + break; + bld.setPosition(i, false); + if (imm.reg.data.u32 == 0) { + break; + } else + if (imm.reg.data.u32 == 1) { + i->op = OP_MOV; + i->setSrc(1, NULL); + } else + if (i->dType == TYPE_U32 && imm.isPow2()) { + i->op = OP_SHL; + i->setSrc(1, bld.mkImm(util_logbase2(imm.reg.data.u32))); + } else + if (i->dType == TYPE_U32) { + Instruction *mul; + Value *tA, *tB; + const uint32_t d = 
imm.reg.data.u32; + uint32_t m; + int r, s; + uint32_t l = util_logbase2(d); + if (((uint32_t)1 << l) < d) + ++l; + m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1; + r = l ? 1 : 0; + s = l ? (l - 1) : 0; + + tA = bld.getSSA(); + tB = bld.getSSA(); + mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0), + bld.loadImm(NULL, m)); + mul->subOp = NV50_IR_SUBOP_MUL_HIGH; + bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA); + tA = bld.getSSA(); + if (r) + bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r)); + else + tA = tB; + tB = s ? bld.getSSA() : i->getDef(0); + bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA); + if (s) + bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s)); + + delete_Instruction(prog, i); + } else + if (imm.reg.data.s32 == -1) { + i->op = OP_NEG; + i->setSrc(1, NULL); + } else { + LValue *tA, *tB; + LValue *tD; + const int32_t d = imm.reg.data.s32; + int32_t m; + int32_t l = util_logbase2(static_cast(abs(d))); + if ((1 << l) < abs(d)) + ++l; + if (!l) + l = 1; + m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32); + + tA = bld.getSSA(); + tB = bld.getSSA(); + bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m), + i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH; + if (l > 1) + bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1)); + else + tB = tA; + tA = bld.getSSA(); + bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, i->getSrc(0), bld.mkImm(0)); + tD = (d < 0) ? 
bld.getSSA() : i->getDef(0)->asLValue(); + bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA); + if (d < 0) + bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB); + + delete_Instruction(prog, i); + } + break; + + case OP_SET: // TODO: SET_AND,OR,XOR + { + CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t)); + CondCode cc, ccZ; + if (i->src[t].mod != Modifier(0)) + return; + if (imm.reg.data.u32 != 0 || !si || si->op != OP_SET) + return; + cc = si->setCond; + ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U); + if (s == 0) + ccZ = reverseCondCode(ccZ); + switch (ccZ) { + case CC_LT: cc = CC_FL; break; + case CC_GE: cc = CC_TR; break; + case CC_EQ: cc = inverseCondCode(cc); break; + case CC_LE: cc = inverseCondCode(cc); break; + case CC_GT: break; + case CC_NE: break; + default: + return; + } + i->asCmp()->setCond = cc; + i->setSrc(0, si->src[0]); + i->setSrc(1, si->src[1]); + i->sType = si->sType; + } + break; + + case OP_SHL: + { + if (s != 1 || i->src[0].mod != Modifier(0)) + break; + // try to concatenate shifts + Instruction *si = i->getSrc(0)->getInsn(); + if (!si || + si->op != OP_SHL || si->src[1].mod != Modifier(0)) + break; + ImmediateValue *siImm = si->src[1].getImmediate(); + if (siImm) { + bld.setPosition(i, false); + i->setSrc(0, si->getSrc(0)); + i->setSrc(1, bld.loadImm(NULL, + imm.reg.data.u32 + siImm->reg.data.u32)); + } + } + break; + + case OP_ABS: + case OP_NEG: + case OP_LG2: + case OP_RCP: + case OP_SQRT: + case OP_RSQ: + case OP_PRESIN: + case OP_SIN: + case OP_COS: + case OP_PREEX2: + case OP_EX2: + unary(i, imm); + break; + default: + return; + } + if (i->op != op) + foldCount++; +} + +// ============================================================================= + +// Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed. 
+class ModifierFolding : public Pass +{ +private: + virtual bool visit(BasicBlock *); +}; + +bool +ModifierFolding::visit(BasicBlock *bb) +{ + const Target *target = prog->getTarget(); + + Instruction *i, *next, *mi; + Modifier mod; + + for (i = bb->getEntry(); i; i = next) { + next = i->next; + + if (0 && i->op == OP_SUB) { + // turn "sub" into "add neg" (do we really want this ?) + i->op = OP_ADD; + i->src[0].mod = i->src[0].mod ^ Modifier(NV50_IR_MOD_NEG); + } + + for (int s = 0; s < 3 && i->srcExists(s); ++s) { + mi = i->getSrc(s)->getInsn(); + if (!mi || + mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8) + continue; + if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) { + if ((i->op != OP_ADD && + i->op != OP_MUL) || + (mi->op != OP_ABS && + mi->op != OP_NEG)) + continue; + } else + if (i->sType != mi->dType) { + continue; + } + if ((mod = Modifier(mi->op)) == Modifier(0)) + continue; + mod = mod * mi->src[0].mod; + + if ((i->op == OP_ABS) || i->src[s].mod.abs()) { + // abs neg [abs] = abs + mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS)); + } else + if ((i->op == OP_NEG) && mod.neg()) { + assert(s == 0); + // neg as both opcode and modifier on same insn is prohibited + // neg neg abs = abs, neg neg = identity + mod = mod & Modifier(~NV50_IR_MOD_NEG); + i->op = mod.getOp(); + mod = mod & Modifier(~NV50_IR_MOD_ABS); + if (mod == Modifier(0)) + i->op = OP_MOV; + } + + if (target->isModSupported(i, s, mod)) { + i->setSrc(s, mi->getSrc(0)); + i->src[s].mod = i->src[s].mod * mod; + } + } + + if (i->op == OP_SAT) { + mi = i->getSrc(0)->getInsn(); + if (mi && + mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) { + mi->saturate = 1; + mi->setDef(0, i->getDef(0)); + delete_Instruction(prog, i); + } + } + } + + return true; +} + +// ============================================================================= + +// MUL + ADD -> MAD/FMA +// MIN/MAX(a, a) -> a, etc. +// SLCT(a, b, const) -> cc(const) ? 
a : b +// RCP(RCP(a)) -> a +// MUL(MUL(a, b), const) -> MUL_Xconst(a, b) +class AlgebraicOpt : public Pass +{ +private: + virtual bool visit(BasicBlock *); + + void handleADD(Instruction *); + void handleMINMAX(Instruction *); + void handleRCP(Instruction *); + void handleSLCT(Instruction *); + void handleLOGOP(Instruction *); + void handleCVT(Instruction *); +}; + +void +AlgebraicOpt::handleADD(Instruction *add) +{ + Value *src0 = add->getSrc(0); + Value *src1 = add->getSrc(1); + Value *src; + int s; + Modifier mod[4]; + + if (!prog->getTarget()->isOpSupported(OP_MAD, add->dType)) + return; + + if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR) + return; + + if (src0->refCount() == 1 && + src0->getUniqueInsn() && src0->getUniqueInsn()->op == OP_MUL) + s = 0; + else + if (src1->refCount() == 1 && + src1->getUniqueInsn() && src1->getUniqueInsn()->op == OP_MUL) + s = 1; + else + return; + + if ((src0->getUniqueInsn() && src0->getUniqueInsn()->bb != add->bb) || + (src1->getUniqueInsn() && src1->getUniqueInsn()->bb != add->bb)) + return; + + src = add->getSrc(s); + + mod[0] = add->src[0].mod; + mod[1] = add->src[1].mod; + mod[2] = src->getUniqueInsn()->src[0].mod; + mod[3] = src->getUniqueInsn()->src[1].mod; + + if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & Modifier(~NV50_IR_MOD_NEG)) + return; + + add->op = OP_MAD; + add->subOp = src->getInsn()->subOp; // potentially mul-high + + add->setSrc(2, add->src[s ? 
0 : 1]); + + add->setSrc(0, src->getInsn()->getSrc(0)); + add->src[0].mod = mod[2] ^ mod[s]; + add->setSrc(1, src->getInsn()->getSrc(1)); + add->src[1].mod = mod[3]; +} + +void +AlgebraicOpt::handleMINMAX(Instruction *minmax) +{ + Value *src0 = minmax->getSrc(0); + Value *src1 = minmax->getSrc(1); + + if (src0 != src1 || src0->reg.file != FILE_GPR) + return; + if (minmax->src[0].mod == minmax->src[1].mod) { + if (minmax->src[0].mod) { + minmax->op = OP_CVT; + minmax->setSrc(1, NULL); + } else { + minmax->def[0].replace(minmax->getSrc(0), false); + minmax->bb->remove(minmax); + } + } else { + // TODO: + // min(x, -x) = -abs(x) + // min(x, -abs(x)) = -abs(x) + // min(x, abs(x)) = x + // max(x, -abs(x)) = x + // max(x, abs(x)) = abs(x) + // max(x, -x) = abs(x) + } +} + +void +AlgebraicOpt::handleRCP(Instruction *rcp) +{ + Instruction *si = rcp->getSrc(0)->getUniqueInsn(); + + if (si && si->op == OP_RCP) { + Modifier mod = rcp->src[0].mod * si->src[0].mod; + rcp->op = mod.getOp(); + rcp->setSrc(0, si->getSrc(0)); + } +} + +void +AlgebraicOpt::handleSLCT(Instruction *slct) +{ + if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) { + if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f)) + slct->setSrc(0, slct->getSrc(1)); + } else + if (slct->getSrc(0) != slct->getSrc(1)) { + return; + } + slct->op = OP_MOV; + slct->setSrc(1, NULL); + slct->setSrc(2, NULL); +} + +void +AlgebraicOpt::handleLOGOP(Instruction *logop) +{ + Value *src0 = logop->getSrc(0); + Value *src1 = logop->getSrc(1); + + if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR) + return; + + if (src0 == src1) { + if (logop->src[0].mod != Modifier(0) || + logop->src[1].mod != Modifier(0)) + return; + if (logop->op == OP_AND || logop->op == OP_OR) { + logop->def[0].replace(logop->getSrc(0), false); + delete_Instruction(prog, logop); + } + } else { + // try AND(SET, SET) -> SET_AND(SET) + Instruction *set0 = src0->getInsn(); + Instruction *set1 = src1->getInsn(); + + if (!set0 || set0->fixed 
|| !set1 || set1->fixed) + return; + if (set1->op != OP_SET) { + Instruction *xchg = set0; + set0 = set1; + set1 = xchg; + if (set1->op != OP_SET) + return; + } + if (set0->op != OP_SET && + set0->op != OP_SET_AND && + set0->op != OP_SET_OR && + set0->op != OP_SET_XOR) + return; + if (set0->getDef(0)->refCount() > 1 && + set1->getDef(0)->refCount() > 1) + return; + if (set0->getPredicate() || set1->getPredicate()) + return; + // check that they don't source each other + for (int s = 0; s < 2; ++s) + if (set0->getSrc(s) == set1->getDef(0) || + set1->getSrc(s) == set0->getDef(0)) + return; + + set0 = set0->clone(true); + set1 = set1->clone(false); + logop->bb->insertAfter(logop, set1); + logop->bb->insertAfter(logop, set0); + + set0->dType = TYPE_U8; + set0->getDef(0)->reg.file = FILE_PREDICATE; + set0->getDef(0)->reg.size = 1; + set1->setSrc(2, set0->getDef(0)); + switch (logop->op) { + case OP_AND: set1->op = OP_SET_AND; break; + case OP_OR: set1->op = OP_SET_OR; break; + case OP_XOR: set1->op = OP_SET_XOR; break; + default: + assert(0); + break; + } + set1->setDef(0, logop->getDef(0)); + delete_Instruction(prog, logop); + } +} + +// F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0 +void +AlgebraicOpt::handleCVT(Instruction *cvt) +{ + if (cvt->sType != TYPE_F32 || + cvt->dType != TYPE_S32 || cvt->src[0].mod != Modifier(0)) + return; + Instruction *insn = cvt->getSrc(0)->getInsn(); + if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32) + return; + if (insn->src[0].mod != Modifier(0)) + return; + insn = insn->getSrc(0)->getInsn(); + if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) + return; + + Instruction *bset = insn->clone(false); + bset->dType = TYPE_U32; + bset->setDef(0, cvt->getDef(0)); + cvt->bb->insertAfter(cvt, bset); + delete_Instruction(prog, cvt); +} + +bool +AlgebraicOpt::visit(BasicBlock *bb) +{ + Instruction *next; + for (Instruction *i = bb->getEntry(); i; i = next) { + next = i->next; + switch (i->op) { + case OP_ADD: + 
handleADD(i); + break; + case OP_RCP: + handleRCP(i); + break; + case OP_MIN: + case OP_MAX: + handleMINMAX(i); + break; + case OP_SLCT: + handleSLCT(i); + break; + case OP_AND: + case OP_OR: + case OP_XOR: + handleLOGOP(i); + break; + case OP_CVT: + handleCVT(i); + break; + default: + break; + } + } + + return true; +} + +// ============================================================================= + +static inline void +updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn) +{ + if (offset != ldst->getSrc(0)->reg.data.offset) { + if (ldst->getSrc(0)->refCount() > 1) + ldst->setSrc(0, ldst->getSrc(0)->clone(fn)); + ldst->getSrc(0)->reg.data.offset = offset; + } +} + +// Combine loads and stores, forward stores to loads where possible. +class MemoryOpt : public Pass +{ +private: + class Record + { + public: + Record *next; + Instruction *insn; + const Value *rel[2]; + const Value *base; + int32_t offset; + int8_t fileIndex; + uint8_t size; + bool locked; + Record *prev; + + bool overlaps(const Instruction *ldst) const; + + inline void link(Record **); + inline void unlink(Record **); + inline void set(const Instruction *ldst); + }; + +public: + MemoryOpt(); + + Record *loads[DATA_FILE_COUNT]; + Record *stores[DATA_FILE_COUNT]; + + MemoryPool recordPool; + +private: + virtual bool visit(BasicBlock *); + bool runOpt(BasicBlock *); + + Record **getList(const Instruction *); + + Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const; + + // merge @insn into load/store instruction from @rec + bool combineLd(Record *rec, Instruction *ld); + bool combineSt(Record *rec, Instruction *st); + + bool replaceLdFromLd(Instruction *ld, Record *ldRec); + bool replaceLdFromSt(Instruction *ld, Record *stRec); + bool replaceStFromSt(Instruction *restrict st, Record *stRec); + + void addRecord(Instruction *ldst); + void purgeRecords(Instruction *const st, DataFile); + void lockStores(Instruction *const ld); + void reset(); + +private: + Record 
*prevRecord; +}; + +MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6) +{ + for (int i = 0; i < DATA_FILE_COUNT; ++i) { + loads[i] = NULL; + stores[i] = NULL; + } + prevRecord = NULL; +} + +void +MemoryOpt::reset() +{ + for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) { + Record *it, *next; + for (it = loads[i]; it; it = next) { + next = it->next; + recordPool.release(it); + } + loads[i] = NULL; + for (it = stores[i]; it; it = next) { + next = it->next; + recordPool.release(it); + } + stores[i] = NULL; + } +} + +bool +MemoryOpt::combineLd(Record *rec, Instruction *ld) +{ + int32_t offRc = rec->offset; + int32_t offLd = ld->getSrc(0)->reg.data.offset; + int sizeRc = rec->size; + int sizeLd = typeSizeof(ld->dType); + int size = sizeRc + sizeLd; + int d, j; + + // only VFETCH can do a 96 byte load + if (ld->op != OP_VFETCH && size == 12) + return false; + // no unaligned loads + if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) || + ((size == 0xc) && (MIN2(offLd, offRc) & 0xf))) + return false; + + assert(sizeRc + sizeLd <= 16 && offRc != offLd); + + for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j); + + if (offLd < offRc) { + int sz; + for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d); + // d: nr of definitions in ld + // j: nr of definitions in rec->insn, move: + for (d = d + j - 1; j > 0; --j, --d) + rec->insn->setDef(d, rec->insn->getDef(j - 1)); + + if (rec->insn->getSrc(0)->refCount() > 1) + rec->insn->setSrc(0, rec->insn->getSrc(0)->clone(func)); + rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd; + + d = 0; + } else { + d = j; + } + // move definitions of @ld to @rec->insn + for (j = 0; sizeLd; ++j, ++d) { + sizeLd -= ld->getDef(j)->reg.size; + rec->insn->setDef(d, ld->getDef(j)); + } + + rec->size = size; + rec->insn->setType(typeOfSize(size)); + + delete_Instruction(prog, ld); + + return true; +} + +bool +MemoryOpt::combineSt(Record *rec, Instruction *st) +{ + int32_t offRc = rec->offset; + 
int32_t offSt = st->getSrc(0)->reg.data.offset; + int sizeRc = rec->size; + int sizeSt = typeSizeof(st->dType); + int s = sizeSt / 4; + int size = sizeRc + sizeSt; + int j, k; + Value *src[4]; // no modifiers in ValueRef allowed for st + Value *extra[3]; + + if (size == 12) // XXX: check if EXPORT a[] can do this after all + return false; + if (size == 8 && MIN2(offRc, offSt) & 0x7) + return false; + + st->takeExtraSources(0, extra); // save predicate and indirect address + + if (offRc < offSt) { + // save values from @st + for (s = 0; sizeSt; ++s) { + sizeSt -= st->getSrc(s + 1)->reg.size; + src[s] = st->getSrc(s + 1); + } + // set record's values as low sources of @st + for (j = 1; sizeRc; ++j) { + sizeRc -= st->getSrc(j)->reg.size; + st->setSrc(j, rec->insn->getSrc(j)); + } + // set saved values as high sources of @st + for (k = j, j = 0; j < s; ++j) + st->setSrc(k++, src[j]); + + updateLdStOffset(st, offRc, func); + } else { + for (j = 1; sizeSt; ++j) + sizeSt -= st->getSrc(j)->reg.size; + for (s = 1; sizeRc; ++j, ++s) { + sizeRc -= rec->insn->getSrc(s)->reg.size; + st->setSrc(j, rec->insn->getSrc(s)); + } + rec->offset = offSt; + } + st->putExtraSources(0, extra); // restore pointer and predicate + + delete_Instruction(prog, rec->insn); + rec->insn = st; + rec->size = size; + rec->insn->setType(typeOfSize(size)); + return true; +} + +void +MemoryOpt::Record::set(const Instruction *ldst) +{ + const Symbol *mem = ldst->getSrc(0)->asSym(); + fileIndex = mem->reg.fileIndex; + rel[0] = ldst->getIndirect(0, 0); + rel[1] = ldst->getIndirect(0, 1); + offset = mem->reg.data.offset; + base = mem->getBase(); + size = typeSizeof(ldst->sType); +} + +void +MemoryOpt::Record::link(Record **list) +{ + next = *list; + if (next) + next->prev = this; + prev = NULL; + *list = this; +} + +void +MemoryOpt::Record::unlink(Record **list) +{ + if (next) + next->prev = prev; + if (prev) + prev->next = next; + else + *list = next; +} + +MemoryOpt::Record ** +MemoryOpt::getList(const 
Instruction *insn) +{ + if (insn->op == OP_LOAD || insn->op == OP_VFETCH) + return &loads[insn->src[0].getFile()]; + return &stores[insn->src[0].getFile()]; +} + +void +MemoryOpt::addRecord(Instruction *i) +{ + Record **list = getList(i); + Record *it = reinterpret_cast(recordPool.allocate()); + + it->link(list); + it->set(i); + it->insn = i; + it->locked = false; +} + +MemoryOpt::Record * +MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const +{ + const Symbol *sym = insn->getSrc(0)->asSym(); + const int size = typeSizeof(insn->sType); + Record *rec = NULL; + Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file]; + + for (; it; it = it->next) { + if (it->locked && insn->op != OP_LOAD) + continue; + if ((it->offset >> 4) != (sym->reg.data.offset >> 4) || + it->rel[0] != insn->getIndirect(0, 0) || + it->fileIndex != sym->reg.fileIndex || + it->rel[1] != insn->getIndirect(0, 1)) + continue; + + if (it->offset < sym->reg.data.offset) { + if (it->offset + it->size >= sym->reg.data.offset) { + isAdj = (it->offset + it->size == sym->reg.data.offset); + if (!isAdj) + return it; + if (!(it->offset & 0x7)) + rec = it; + } + } else { + isAdj = it->offset != sym->reg.data.offset; + if (size <= it->size && !isAdj) + return it; + else + if (!(sym->reg.data.offset & 0x7)) + if (it->offset - size <= sym->reg.data.offset) + rec = it; + } + } + return rec; +} + +bool +MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec) +{ + Instruction *st = rec->insn; + int32_t offSt = rec->offset; + int32_t offLd = ld->getSrc(0)->reg.data.offset; + int d, s; + + for (s = 1; offSt != offLd && st->srcExists(s); ++s) + offSt += st->getSrc(s)->reg.size; + if (offSt != offLd) + return false; + + for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) { + if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size) + return false; + if (st->getSrc(s)->reg.file != FILE_GPR) + return false; + ld->def[d].replace(st->getSrc(s), false); + } + ld->bb->remove(ld); + 
return true; +} + +bool +MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec) +{ + Instruction *ldR = rec->insn; + int32_t offR = rec->offset; + int32_t offE = ldE->getSrc(0)->reg.data.offset; + int dR, dE; + + assert(offR <= offE); + for (dR = 0; offR < offE && ldR->defExists(dR); ++dR) + offR += ldR->getDef(dR)->reg.size; + if (offR != offE) + return false; + + for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) { + if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size) + return false; + ldE->def[dE].replace(ldR->getDef(dR), false); + } + + delete_Instruction(prog, ldE); + return true; +} + +bool +MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec) +{ + const Instruction *const ri = rec->insn; + Value *extra[3]; + + int32_t offS = st->getSrc(0)->reg.data.offset; + int32_t offR = rec->offset; + int32_t endS = offS + typeSizeof(st->dType); + int32_t endR = offR + typeSizeof(ri->dType); + + rec->size = MAX2(endS, endR) - MIN2(offS, offR); + + st->takeExtraSources(0, extra); + + if (offR < offS) { + Value *vals[4]; + int s, n; + int k = 0; + // get non-replaced sources of ri + for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s) + vals[k++] = ri->getSrc(s); + n = s; + // get replaced sources of st + for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s) + vals[k++] = st->getSrc(s); + // skip replaced sources of ri + for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s); + // get non-replaced sources after values covered by st + for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s) + vals[k++] = ri->getSrc(s); + for (s = 0; s < k; ++s) + st->setSrc(s + 1, vals[s]); + st->setSrc(0, ri->getSrc(0)); + } else + if (endR > endS) { + int j, s; + for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size); + for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size); + for (; offR < endR; offR += ri->getSrc(j++)->reg.size) + st->setSrc(s++, ri->getSrc(j)); + } + st->putExtraSources(0, extra); + + 
delete_Instruction(prog, rec->insn); + + rec->insn = st; + rec->offset = st->getSrc(0)->reg.data.offset; + + st->setType(typeOfSize(rec->size)); + + return true; +} + +bool +MemoryOpt::Record::overlaps(const Instruction *ldst) const +{ + Record that; + that.set(ldst); + + if (this->fileIndex != that.fileIndex) + return false; + + if (this->rel[0] || that.rel[0]) + return this->base == that.base; + return + (this->offset < that.offset + that.size) && + (this->offset + this->size > that.offset); +} + +// We must not eliminate stores that affect the result of @ld if +// we find later stores to the same location, and we may no longer +// merge them with later stores. +// The stored value can, however, still be used to determine the value +// returned by future loads. +void +MemoryOpt::lockStores(Instruction *const ld) +{ + for (Record *r = stores[ld->src[0].getFile()]; r; r = r->next) + if (!r->locked && r->overlaps(ld)) + r->locked = true; +} + +// Prior loads from the location of @st are no longer valid. +// Stores to the location of @st may no longer be used to derive +// the value at it nor be coalesced into later stores. +void +MemoryOpt::purgeRecords(Instruction *const st, DataFile f) +{ + if (st) + f = st->src[0].getFile(); + + for (Record *r = loads[f]; r; r = r->next) + if (!st || r->overlaps(st)) + r->unlink(&loads[f]); + + for (Record *r = stores[f]; r; r = r->next) + if (!st || r->overlaps(st)) + r->unlink(&stores[f]); +} + +bool +MemoryOpt::visit(BasicBlock *bb) +{ + bool ret = runOpt(bb); + // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st + // where 96 bit memory operations are forbidden. 
+ if (ret) + ret = runOpt(bb); + return ret; +} + +bool +MemoryOpt::runOpt(BasicBlock *bb) +{ + Instruction *ldst, *next; + Record *rec; + bool isAdjacent = true; + + for (ldst = bb->getEntry(); ldst; ldst = next) { + bool keep = true; + bool isLoad = true; + next = ldst->next; + + if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) { + if (ldst->isDead()) { + // might have been produced by earlier optimization + delete_Instruction(prog, ldst); + continue; + } + } else + if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) { + isLoad = false; + } else { + // TODO: maybe have all fixed ops act as barrier ? + if (ldst->op == OP_CALL) { + purgeRecords(NULL, FILE_MEMORY_LOCAL); + purgeRecords(NULL, FILE_MEMORY_GLOBAL); + purgeRecords(NULL, FILE_MEMORY_SHARED); + purgeRecords(NULL, FILE_SHADER_OUTPUT); + } else + if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) { + purgeRecords(NULL, FILE_SHADER_OUTPUT); + } + continue; + } + if (ldst->getPredicate()) // TODO: handle predicated ld/st + continue; + + if (isLoad) { + DataFile file = ldst->src[0].getFile(); + + // if ld l[]/g[] look for previous store to eliminate the reload + if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) { + // TODO: shared memory ? + rec = findRecord(ldst, false, isAdjacent); + if (rec && !isAdjacent) + keep = !replaceLdFromSt(ldst, rec); + } + + // or look for ld from the same location and replace this one + rec = keep ? 
findRecord(ldst, true, isAdjacent) : NULL; + if (rec) { + if (!isAdjacent) + keep = !replaceLdFromLd(ldst, rec); + else + // or combine a previous load with this one + keep = !combineLd(rec, ldst); + } + if (keep) + lockStores(ldst); + } else { + rec = findRecord(ldst, false, isAdjacent); + if (rec) { + if (!isAdjacent) + keep = !replaceStFromSt(ldst, rec); + else + keep = !combineSt(rec, ldst); + } + if (keep) + purgeRecords(ldst, DATA_FILE_COUNT); + } + if (keep) + addRecord(ldst); + } + reset(); + + return true; +} + +// ============================================================================= + +// Turn control flow into predicated instructions (after register allocation !). +// TODO: +// Could move this to before register allocation on NVC0 and also handle nested +// constructs. +class FlatteningPass : public Pass +{ +private: + virtual bool visit(BasicBlock *); + + bool tryPredicateConditional(BasicBlock *); + void predicateInstructions(BasicBlock *, Value *pred, CondCode cc); + void tryPropagateBranch(BasicBlock *); + inline bool isConstantCondition(Value *pred); + inline bool mayPredicate(const Instruction *, const Value *pred) const; + inline void removeFlow(Instruction *); +}; + +bool +FlatteningPass::isConstantCondition(Value *pred) +{ + Instruction *insn = pred->getUniqueInsn(); + assert(insn); + if (insn->op != OP_SET || insn->srcExists(2)) + return false; + + for (int s = 0; s < 2 && insn->srcExists(s); ++s) { + Instruction *ld = insn->getSrc(s)->getUniqueInsn(); + DataFile file; + if (ld) { + if (ld->op != OP_MOV && ld->op != OP_LOAD) + return false; + if (ld->src[0].isIndirect(0)) + return false; + file = ld->src[0].getFile(); + } else { + file = insn->src[s].getFile(); + // catch $r63 on NVC0 + if (file == FILE_GPR && insn->getSrc(s)->reg.data.id > prog->maxGPR) + file = FILE_IMMEDIATE; + } + if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST) + return false; + } + return true; +} + +void +FlatteningPass::removeFlow(Instruction *insn) +{ + 
FlowInstruction *term = insn ? insn->asFlow() : NULL; + if (!term) + return; + Graph::Edge::Type ty = term->bb->cfg.outgoing().getType(); + + if (term->op == OP_BRA) { + // TODO: this might get more difficult when we get arbitrary BRAs + if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK) + return; + } else + if (term->op != OP_JOIN) + return; + + delete_Instruction(prog, term); + + Value *pred = term->getPredicate(); + + if (pred && pred->refCount() == 0) { + Instruction *pSet = pred->getUniqueInsn(); + pred->join->reg.data.id = -1; // deallocate + if (pSet->isDead()) + delete_Instruction(prog, pSet); + } +} + +void +FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc) +{ + for (Instruction *i = bb->getEntry(); i; i = i->next) { + if (i->isNop()) + continue; + assert(!i->getPredicate()); + i->setPredicate(cc, pred); + } + removeFlow(bb->getExit()); +} + +bool +FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const +{ + if (insn->isPseudo()) + return true; + // TODO: calls where we don't know which registers are modified + + if (!prog->getTarget()->mayPredicate(insn, pred)) + return false; + for (int d = 0; insn->defExists(d); ++d) + if (insn->getDef(d)->equals(pred)) + return false; + return true; +} + +// If we conditionally skip over or to a branch instruction, replace it. +// NOTE: We do not update the CFG anymore here ! 
+void
+FlatteningPass::tryPropagateBranch(BasicBlock *bb) // fold the lone flow op of a 1-instruction successor into bb's BRA
+{
+   BasicBlock *bf = NULL;
+   unsigned int i;
+
+   if (bb->cfg.outgoingCount() != 2)
+      return;
+   if (!bb->getExit() || bb->getExit()->op != OP_BRA)
+      return;
+   Graph::EdgeIterator ei = bb->cfg.outgoing();
+
+   for (i = 0; !ei.end(); ++i, ei.next()) { // i ends up as the index of the chosen out edge
+      bf = BasicBlock::get(ei.getNode());
+      if (bf->getInsnCount() == 1)
+         break;
+   }
+   if (ei.end() || !bf->getExit())
+      return;
+   FlowInstruction *bra = bb->getExit()->asFlow();
+   FlowInstruction *rep = bf->getExit()->asFlow(); // NULL if bf's only instruction is not a flow op
+
+   if (!bra || !rep || rep->getPredicate())
+      return;
+   if (rep->op != OP_BRA &&
+       rep->op != OP_JOIN &&
+       rep->op != OP_EXIT)
+      return;
+
+   bra->op = rep->op;
+   bra->target.bb = rep->target.bb;
+   if (i) // 2nd out block means branch not taken
+      bra->cc = inverseCondCode(bra->cc);
+   bf->remove(rep);
+}
+
+bool
+FlatteningPass::visit(BasicBlock *bb) // per block: predicate a simple conditional, else merge a join, else propagate a branch
+{
+   if (tryPredicateConditional(bb))
+      return true;
+
+   // try to attach join to previous instruction
+   Instruction *insn = bb->getExit();
+   if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
+      insn = insn->prev;
+      if (insn && !insn->getPredicate() && !insn->asFlow() && !insn->isNop()) {
+         insn->join = 1; // the previous instruction now carries the join
+         bb->remove(bb->getExit());
+         return true;
+      }
+   }
+
+   tryPropagateBranch(bb);
+
+   return true;
+}
+
+bool
+FlatteningPass::tryPredicateConditional(BasicBlock *bb) // returns true iff the conditional at bb was replaced by predication
+{
+   BasicBlock *bL = NULL, *bR = NULL;
+   unsigned int nL = 0, nR = 0, limit = 12; // limit: max instructions per side that get predicated
+   Instruction *insn;
+   unsigned int mask;
+
+   mask = bb->initiatesSimpleConditional(); // bit 0: first out block is predicated, bit 1: second
+   if (!mask)
+      return false;
+
+   assert(bb->getExit());
+   Value *pred = bb->getExit()->getPredicate();
+   assert(pred);
+
+   if (isConstantCondition(pred))
+      limit = 4;
+
+   Graph::EdgeIterator ei = bb->cfg.outgoing();
+
+   if (mask & 1) {
+      bL = BasicBlock::get(ei.getNode());
+      for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
+         if (!mayPredicate(insn, pred))
+            return false;
+      if (nL > limit)
+         return false; // too long, do a real branch
+   }
+   ei.next();
+
+   if (mask & 2) {
+      bR = BasicBlock::get(ei.getNode());
+      for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
+         if (!mayPredicate(insn, pred))
+            return false;
+      if (nR > limit)
+         return false; // too long, do a real branch
+   }
+
+   if (bL)
+      predicateInstructions(bL, pred, bb->getExit()->cc);
+   if (bR)
+      predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));
+
+   if (bb->joinAt) {
+      bb->remove(bb->joinAt);
+      bb->joinAt = NULL;
+   }
+   removeFlow(bb->getExit()); // delete the branch/join at the fork point
+
+   // remove potential join operations at the end of the conditional
+   if (prog->getTarget()->joinAnterior) {
+      bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
+      if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
+         removeFlow(bb->getEntry());
+   }
+
+   return true;
+}
+
+// =============================================================================
+
+// Common subexpression elimination. Stupid O^2 implementation.
+class LocalCSE : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+
+   inline bool tryReplace(Instruction **, Instruction *);
+
+   DLList ops[OP_LAST + 1]; // per-opcode lists of instructions already seen in the current block
+};
+
+class GlobalCSE : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+};
+
+bool
+Instruction::isActionEqual(const Instruction *that) const // same operation and modifiers; operands are not compared here
+{
+   if (this->op != that->op ||
+       this->dType != that->dType ||
+       this->sType != that->sType)
+      return false;
+   if (this->cc != that->cc)
+      return false;
+
+   if (this->asTex()) {
+      if (memcmp(&this->asTex()->tex,
+                 &that->asTex()->tex,
+                 sizeof(this->asTex()->tex)))
+         return false;
+   } else
+   if (this->asCmp()) {
+      if (this->asCmp()->setCond != that->asCmp()->setCond)
+         return false;
+   } else
+   if (this->asFlow()) {
+      return false; // flow instructions are never treated as equal
+   } else {
+      if (this->atomic != that->atomic ||
+          this->ipa != that->ipa ||
+          this->lanes != that->lanes ||
+          this->perPatch != that->perPatch)
+         return false;
+      if (this->postFactor != that->postFactor)
+         return false;
+   }
+
+   if (this->subOp != that->subOp ||
+       this->saturate != that->saturate ||
+       this->rnd != that->rnd ||
+       this->ftz != that->ftz ||
+       this->dnz != that->dnz ||
+       this->cache != that->cache)
+      return false;
+
+   return true;
+}
+
+bool
+Instruction::isResultEqual(const Instruction *that) const // true if both compute the same value: equal action, predicate, defs, srcs
+{
+   unsigned int d, s;
+
+   // NOTE: location of discard only affects tex with liveOnly and quadops
+   if (!this->defExists(0) && this->op != OP_DISCARD)
+      return false;
+
+   if (!isActionEqual(that))
+      return false;
+
+   if (this->predSrc != that->predSrc)
+      return false;
+
+   for (d = 0; this->defExists(d); ++d) {
+      if (!that->defExists(d) ||
+          !this->getDef(d)->equals(that->getDef(d), false))
+         return false;
+   }
+   if (that->defExists(d)) // 'that' has more defs than 'this'
+      return false;
+
+   for (s = 0; this->srcExists(s); ++s) {
+      if (!that->srcExists(s))
+         return false;
+      if (this->src[s].mod != that->src[s].mod)
+         return false;
+      if (!this->getSrc(s)->equals(that->getSrc(s), true))
+         return false;
+   }
+   if (that->srcExists(s)) // 'that' has more sources than 'this'
+      return false;
+
+   if (op == OP_LOAD || op == OP_VFETCH) { // loads only match when reading constant or shader input space
+      switch (src[0].getFile()) {
+      case FILE_MEMORY_CONST:
+      case FILE_SHADER_INPUT:
+         return true;
+      default:
+         return false;
+      }
+   }
+
+   return true;
+}
+
+// pull through common expressions from different in-blocks
+bool
+GlobalCSE::visit(BasicBlock *bb)
+{
+   Instruction *phi, *next, *ik;
+   int s;
+
+   for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
+      next = phi->next;
+      ik = phi->getSrc(0)->getInsn();
+      if (!ik || phi->getSrc(0)->refCount() > 1) // skip sources without a defining instruction or with other users
+         continue;
+      for (s = 1; phi->srcExists(s); ++s) {
+         if (phi->getSrc(s)->refCount() > 1)
+            break;
+         if (!phi->getSrc(s)->getInsn() || !phi->getSrc(s)->getInsn()->isResultEqual(ik))
+            break;
+      }
+      if (!phi->srcExists(s)) { // loop ran to the end: every source is computed identically
+         Instruction *entry = bb->getEntry();
+         ik->bb->remove(ik);
+         if (!entry || entry->op != OP_JOIN)
+            bb->insertHead(ik);
+         else
+            bb->insertAfter(entry, ik); // keep a leading JOIN first in the block
+         ik->setDef(0, phi->getDef(0));
+         delete_Instruction(prog, phi);
+      }
+   }
+
+   return true;
+}
+
+bool
+LocalCSE::tryReplace(Instruction **ptr, Instruction *i) // on success deletes *ptr, sets it to NULL and redirects its defs to i's
+{
+   Instruction *old = *ptr;
+   if (!old->isResultEqual(i))
+      return false;
+   for (int d = 0; old->defExists(d); ++d)
+      old->def[d].replace(i->getDef(d), false);
+   delete_Instruction(prog, old);
+   *ptr = NULL;
+   return true;
+}
+
+bool
+LocalCSE::visit(BasicBlock *bb) // repeats over the block until an iteration replaces nothing
+{
+   unsigned int replaced;
+
+   do {
+      Instruction *ir, *next;
+
+      replaced = 0;
+
+      // will need to know the order of instructions
+      int serial = 0;
+      for (ir = bb->getEntry(); ir; ir = ir->next)
+         ir->serial = serial++;
+
+      for (ir = bb->getEntry(); ir; ir = next) {
+         int s;
+         Value *src = NULL;
+
+         next = ir->next; // ir may be deleted by tryReplace below
+
+         if (ir->fixed) {
+            ops[ir->op].insert(ir);
+            continue;
+         }
+
+         for (s = 0; ir->srcExists(s); ++s) // pick the LValue source with the fewest references
+            if (ir->getSrc(s)->asLValue())
+               if (!src || ir->getSrc(s)->refCount() < src->refCount())
+                  src = ir->getSrc(s);
+
+         if (src) {
+            for (ValueRef::Iterator refs = src->uses->iterator(); !refs.end();
+                 refs.next()) {
+               Instruction *ik = refs.get()->getInsn();
+               if (ik->serial < ir->serial && ik->bb == ir->bb) // only earlier instructions of this block
+                  if (tryReplace(&ir, ik))
+                     break;
+            }
+         } else {
+            DLLIST_FOR_EACH(&ops[ir->op], iter)
+            {
+               Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
+               if (tryReplace(&ir, ik))
+                  break;
+            }
+         }
+
+         if (ir)
+            ops[ir->op].insert(ir);
+         else
+            ++replaced;
+      }
+      for (unsigned int i = 0; i <= OP_LAST; ++i)
+         ops[i].clear();
+
+   } while (replaced);
+
+   return true;
+}
+
+// =============================================================================
+
+// Remove computations of unused values.
+class DeadCodeElim : public Pass +{ +public: + bool buryAll(Program *); + +private: + virtual bool visit(BasicBlock *); + + void checkSplitLoad(Instruction *ld); // for partially dead loads + + unsigned int deadCount; +}; + +bool +DeadCodeElim::buryAll(Program *prog) +{ + do { + deadCount = 0; + if (!this->run(prog, false, false)) + return false; + } while (deadCount); + + return true; +} + +bool +DeadCodeElim::visit(BasicBlock *bb) +{ + Instruction *next; + + for (Instruction *i = bb->getFirst(); i; i = next) { + next = i->next; + if (i->isDead()) { + ++deadCount; + delete_Instruction(prog, i); + } else + if (i->defExists(1) && (i->op == OP_VFETCH || i->op == OP_LOAD)) { + checkSplitLoad(i); + } + } + return true; +} + +void +DeadCodeElim::checkSplitLoad(Instruction *ld1) +{ + Instruction *ld2 = NULL; // can get at most 2 loads + Value *def1[4]; + Value *def2[4]; + int32_t addr1, addr2; + int32_t size1, size2; + int d, n1, n2; + uint32_t mask = 0xffffffff; + + for (d = 0; ld1->defExists(d); ++d) + if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0) + mask &= ~(1 << d); + if (mask == 0xffffffff) + return; + + addr1 = ld1->getSrc(0)->reg.data.offset; + n1 = n2 = 0; + size1 = size2 = 0; + for (d = 0; ld1->defExists(d); ++d) { + if (mask & (1 << d)) { + if (size1 && (addr1 & 0x7)) + break; + def1[n1] = ld1->getDef(d); + size1 += def1[n1++]->reg.size; + } else + if (!n1) { + addr1 += ld1->getDef(d)->reg.size; + } else { + break; + } + } + for (addr2 = addr1 + size1; ld1->defExists(d); ++d) { + if (mask & (1 << d)) { + def2[n2] = ld1->getDef(d); + size2 += def2[n2++]->reg.size; + } else { + assert(!n2); + addr2 += ld1->getDef(d)->reg.size; + } + } + + updateLdStOffset(ld1, addr1, func); + ld1->setType(typeOfSize(size1)); + for (d = 0; d < 4; ++d) + ld1->setDef(d, (d < n1) ? 
def1[d] : NULL); + + if (!n2) + return; + + ld2 = ld1->clone(false); + updateLdStOffset(ld2, addr2, func); + ld2->setType(typeOfSize(size2)); + for (d = 0; d < 4; ++d) + ld2->setDef(d, (d < n2) ? def2[d] : NULL); + + ld1->bb->insertAfter(ld1, ld2); +} + +// ============================================================================= + +#define RUN_PASS(l, n, f) \ + if (level >= (l)) { \ + if (dbgFlags & NV50_IR_DEBUG_VERBOSE) \ + INFO("PEEPHOLE: %s\n", #n); \ + n pass; \ + if (!pass.f(this)) \ + return false; \ + } + +bool +Program::optimizeSSA(int level) +{ + RUN_PASS(1, DeadCodeElim, buryAll); + RUN_PASS(1, CopyPropagation, run); + RUN_PASS(2, GlobalCSE, run); + RUN_PASS(1, LocalCSE, run); + RUN_PASS(2, AlgebraicOpt, run); + RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks + RUN_PASS(1, ConstantFolding, foldAll); + RUN_PASS(1, LoadPropagation, run); + RUN_PASS(2, MemoryOpt, run); + RUN_PASS(2, LocalCSE, run); + RUN_PASS(0, DeadCodeElim, buryAll); + return true; +} + +bool +Program::optimizePostRA(int level) +{ + RUN_PASS(2, FlatteningPass, run); + return true; +} + +} diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp new file mode 100644 index 00000000000..b5ca3814098 --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp @@ -0,0 +1,558 @@ + +#include "nv50_ir.h" +#include "nv50_ir_target.h" + +namespace nv50_ir { + +enum TextStyle +{ + TXT_DEFAULT, + TXT_GPR, + TXT_REGISTER, + TXT_FLAGS, + TXT_MEM, + TXT_IMMD, + TXT_BRA, + TXT_INSN +}; + +static const char *colour[8] = +{ +#if 1 + "\x1b[00m", + "\x1b[34m", + "\x1b[35m", + "\x1b[35m", + "\x1b[36m", + "\x1b[33m", + "\x1b[37m", + "\x1b[32m" +#else + "", "", "", "", "", "", "", "" +#endif +}; + +const char *operationStr[OP_LAST + 1] = +{ + "nop", + "phi", + "union", + "split", + "merge", + "consec", + "mov", + "ld", + "st", + "add", + "sub", + "mul", + "div", + "mod", + "mad", + "fma", + "sad", + "abs", + 
"neg", + "not", + "and", + "or", + "xor", + "shl", + "shr", + "max", + "min", + "sat", + "ceil", + "floor", + "trunc", + "cvt", + "set and", + "set or", + "set xor", + "set", + "selp", + "slct", + "rcp", + "rsq", + "lg2", + "sin", + "cos", + "ex2", + "exp", + "log", + "presin", + "preex2", + "sqrt", + "pow", + "bra", + "call", + "ret", + "cont", + "break", + "preret", + "precont", + "prebreak", + "brkpt", + "joinat", + "join", + "discard", + "exit", + "barrier", + "vfetch", + "pfetch", + "export", + "linterp", + "pinterp", + "emit", + "restart", + "tex", + "texbias", + "texlod", + "texfetch", + "texquery", + "texgrad", + "texgather", + "texcsaa", + "suld", + "sust", + "dfdx", + "dfdy", + "rdsv", + "wrsv", + "pixld", + "quadop", + "quadon", + "quadpop", + "popcnt", + "insbf", + "extbf", + "(invalid)" +}; + +static const char *DataTypeStr[] = +{ + "-", + "u8", "s8", + "u16", "s16", + "u32", "s32", + "u64", "s64", + "f16", "f32", "f64", + "b96", "b128" +}; + +static const char *RoundModeStr[] = +{ + "", "rm", "rz", "rp", "rni", "rmi", "rzi", "rpi" +}; + +static const char *CondCodeStr[] = +{ + "never", + "lt", + "eq", + "le", + "gt", + "ne", + "ge", + "", + "(invalid)", + "ltu", + "equ", + "leu", + "gtu", + "neu", + "geu", + "", + "no", + "nc", + "ns", + "na", + "a", + "s", + "c", + "o" +}; + +static const char *SemanticStr[SV_LAST + 1] = +{ + "POSITION", + "VERTEX_ID", + "INSTANCE_ID", + "INVOCATION_ID", + "PRIMITIVE_ID", + "VERTEX_COUNT", + "LAYER", + "VIEWPORT_INDEX", + "Y_DIR", + "FACE", + "POINT_SIZE", + "POINT_COORD", + "CLIP_DISTANCE", + "SAMPLE_INDEX", + "TESS_FACTOR", + "TESS_COORD", + "TID", + "CTAID", + "NTID", + "GRIDID", + "NCTAID", + "LANEID", + "PHYSID", + "NPHYSID", + "CLOCK", + "LBASE", + "SBASE", + "?", + "(INVALID)" +}; + +#define PRINT(args...) \ + do { \ + pos += snprintf(&buf[pos], size - pos, args); \ + } while(0) + +#define SPACE_PRINT(cond, args...) 
\ + do { \ + if (cond) \ + buf[pos++] = ' '; \ + pos += snprintf(&buf[pos], size - pos, args); \ + } while(0) + +#define SPACE() \ + do { \ + if (pos < size) \ + buf[pos++] = ' '; \ + } while(0) + +int Modifier::print(char *buf, size_t size) const +{ + size_t pos = 0; + + if (bits) + PRINT("%s", colour[TXT_INSN]); + + size_t base = pos; + + if (bits & NV50_IR_MOD_NOT) + PRINT("not"); + if (bits & NV50_IR_MOD_SAT) + SPACE_PRINT(pos > base && pos < size, "sat"); + if (bits & NV50_IR_MOD_NEG) + SPACE_PRINT(pos > base && pos < size, "neg"); + if (bits & NV50_IR_MOD_ABS) + SPACE_PRINT(pos > base && pos < size, "abs"); + + return pos; +} + +int LValue::print(char *buf, size_t size, DataType ty) const +{ + const char *postFix = ""; + size_t pos = 0; + int idx = join->reg.data.id >= 0 ? join->reg.data.id : id; + char p = join->reg.data.id >= 0 ? '$' : '%'; + char r; + int col = TXT_DEFAULT; + + switch (reg.file) { + case FILE_GPR: + r = 'r'; col = TXT_GPR; + if (reg.size == 8) + postFix = "d"; + else + if (reg.size == 16) + postFix = "q"; + break; + case FILE_PREDICATE: + r = 'p'; col = TXT_REGISTER; + if (reg.size == 2) + postFix = "d"; + else + if (reg.size == 4) + postFix = "q"; + break; + case FILE_FLAGS: + r = 'c'; col = TXT_FLAGS; + break; + case FILE_ADDRESS: + r = 'a'; col = TXT_REGISTER; + break; + default: + assert(!"invalid file for lvalue"); + r = '?'; + break; + } + + PRINT("%s%c%c%i%s", colour[col], p, r, idx, postFix); + + return pos; +} + +int ImmediateValue::print(char *buf, size_t size, DataType ty) const +{ + size_t pos = 0; + + PRINT("%s", colour[TXT_IMMD]); + + switch (ty) { + case TYPE_F32: PRINT("%f", reg.data.f32); break; + case TYPE_F64: PRINT("%f", reg.data.f64); break; + case TYPE_U8: PRINT("0x%02x", reg.data.u8); break; + case TYPE_S8: PRINT("%i", reg.data.s8); break; + case TYPE_U16: PRINT("0x%04x", reg.data.u16); break; + case TYPE_S16: PRINT("%i", reg.data.s16); break; + case TYPE_U32: PRINT("0x%08x", reg.data.u32); break; + case TYPE_S32: 
PRINT("%i", reg.data.s32); break; + case TYPE_U64: + case TYPE_S64: + default: + PRINT("0x%016lx", reg.data.u64); + break; + } + return pos; +} + +int Symbol::print(char *buf, size_t size, DataType ty) const +{ + return print(buf, size, NULL, NULL, ty); +} + +int Symbol::print(char *buf, size_t size, + Value *rel, Value *dimRel, DataType ty) const +{ + size_t pos = 0; + char c; + + if (ty == TYPE_NONE) + ty = typeOfSize(reg.size); + + if (reg.file == FILE_SYSTEM_VALUE) { + PRINT("%ssv[%s%s:%i%s", colour[TXT_MEM], + colour[TXT_REGISTER], + SemanticStr[reg.data.sv.sv], reg.data.sv.index, colour[TXT_MEM]); + if (rel) { + PRINT("%s+", colour[TXT_DEFAULT]); + pos += rel->print(&buf[pos], size - pos); + } + PRINT("%s]", colour[TXT_MEM]); + return pos; + } + + switch (reg.file) { + case FILE_MEMORY_CONST: c = 'c'; break; + case FILE_SHADER_INPUT: c = 'a'; break; + case FILE_SHADER_OUTPUT: c = 'o'; break; + case FILE_MEMORY_GLOBAL: c = 'g'; break; + case FILE_MEMORY_SHARED: c = 's'; break; + case FILE_MEMORY_LOCAL: c = 'l'; break; + default: + assert(!"invalid file"); + c = '?'; + break; + } + + if (c == 'c') + PRINT("%s%c%i[", colour[TXT_MEM], c, reg.fileIndex); + else + PRINT("%s%c[", colour[TXT_MEM], c); + + if (dimRel) { + pos += dimRel->print(&buf[pos], size - pos, TYPE_S32); + PRINT("%s][", colour[TXT_MEM]); + } + + if (rel) { + pos += rel->print(&buf[pos], size - pos); + PRINT("%s%c", colour[TXT_DEFAULT], (reg.data.offset < 0) ? 
'-' : '+'); + } else { + assert(reg.data.offset >= 0); + } + PRINT("%s0x%x%s]", colour[TXT_IMMD], abs(reg.data.offset), colour[TXT_MEM]); + + return pos; +} + +void Instruction::print() const +{ + #define BUFSZ 512 + + const size_t size = BUFSZ; + + char buf[BUFSZ]; + int s, d; + size_t pos = 0; + + PRINT("%s", colour[TXT_INSN]); + + if (join) + PRINT("join "); + + if (predSrc >= 0) { + const size_t pre = pos; + if (getSrc(predSrc)->reg.file == FILE_PREDICATE) { + if (cc == CC_NOT_P) + PRINT("not"); + } else { + PRINT("%s", CondCodeStr[cc]); + } + if (pos > pre + 1) + SPACE(); + pos += src[predSrc].get()->print(&buf[pos], BUFSZ - pos); + PRINT(" %s", colour[TXT_INSN]); + } + + if (saturate) + PRINT("sat "); + + if (asFlow()) { + PRINT("%s", operationStr[op]); + if (op == OP_CALL && asFlow()->builtin) { + PRINT(" %sBUILTIN:%i", colour[TXT_BRA], asFlow()->target.builtin); + } else + if (op == OP_CALL && asFlow()->target.fn) { + PRINT(" %s%s", colour[TXT_BRA], asFlow()->target.fn->getName()); + } else + if (asFlow()->target.bb) + PRINT(" %sBB:%i", colour[TXT_BRA], asFlow()->target.bb->getId()); + } else { + PRINT("%s ", operationStr[op]); + if (perPatch) + PRINT("patch "); + if (asTex()) + PRINT("%s ", asTex()->tex.target.getName()); + if (postFactor) + PRINT("x2^%i ", postFactor); + PRINT("%s%s", dnz ? "dnz " : (ftz ? 
"ftz " : ""), DataTypeStr[dType]); + } + + if (rnd != ROUND_N) + PRINT(" %s", RoundModeStr[rnd]); + + if (def[1].exists()) + PRINT(" {"); + for (d = 0; defExists(d); ++d) { + SPACE(); + pos += def[d].get()->print(&buf[pos], size - pos); + } + if (d > 1) + PRINT(" %s}", colour[TXT_INSN]); + else + if (!d && !asFlow()) + PRINT(" %s#", colour[TXT_INSN]); + + if (asCmp()) + PRINT(" %s%s", colour[TXT_INSN], CondCodeStr[asCmp()->setCond]); + + if (sType != dType) + PRINT(" %s%s", colour[TXT_INSN], DataTypeStr[sType]); + + for (s = 0; srcExists(s); ++s) { + if (s == predSrc || src[s].usedAsPtr) + continue; + const size_t pre = pos; + SPACE(); + pos += src[s].mod.print(&buf[pos], BUFSZ - pos); + if (pos > pre + 1) + SPACE(); + if (src[s].isIndirect(0) || src[s].isIndirect(1)) + pos += src[s].get()->asSym()->print(&buf[pos], BUFSZ - pos, + getIndirect(s, 0), + getIndirect(s, 1)); + else + pos += src[s].get()->print(&buf[pos], BUFSZ - pos, sType); + } + + PRINT("%s", colour[TXT_DEFAULT]); + + buf[MIN2(pos, BUFSZ - 1)] = 0; + + INFO("%s (%u)\n", buf, encSize); +} + +class PrintPass : public Pass +{ +public: + PrintPass() : serial(0) { } + + virtual bool visit(Function *); + virtual bool visit(BasicBlock *); + virtual bool visit(Instruction *); + +private: + int serial; +}; + +bool +PrintPass::visit(Function *fn) +{ + INFO("\n%s:\n", fn->getName()); + + return true; +} + +bool +PrintPass::visit(BasicBlock *bb) +{ +#if 0 + INFO("---\n"); + for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) + INFO(" <- BB:%i (%s)\n", + BasicBlock::get(ei.getNode())->getId(), + ei.getEdge()->typeStr()); +#endif + INFO("BB:%i (%u instructions) - ", bb->getId(), bb->getInsnCount()); + + if (bb->idom()) + INFO("idom = BB:%i, ", bb->idom()->getId()); + + INFO("df = { "); + for (DLList::Iterator df = bb->getDF().iterator(); !df.end(); df.next()) + INFO("BB:%i ", BasicBlock::get(df)->getId()); + + INFO("}\n"); + + for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); 
ei.next()) + INFO(" -> BB:%i (%s)\n", + BasicBlock::get(ei.getNode())->getId(), + ei.getEdge()->typeStr()); + + return true; +} + +bool +PrintPass::visit(Instruction *insn) +{ + INFO("%3i: ", serial++); + insn->print(); + return true; +} + +void +Function::print() +{ + PrintPass pass; + pass.run(this, true, false); +} + +void +Program::print() +{ + PrintPass pass; + pass.run(this, true, false); +} + +void +Function::printLiveIntervals() const +{ + INFO("printing live intervals ...\n"); + + for (ArrayList::Iterator it = allLValues.iterator(); !it.end(); it.next()) { + const Value *lval = Value::get(it)->asLValue(); + if (lval && !lval->livei.isEmpty()) { + INFO("livei(%%%i): ", lval->id); + lval->livei.print(); + } + } +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp new file mode 100644 index 00000000000..7e3c44d3b15 --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp @@ -0,0 +1,963 @@ + +#include "nv50_ir.h" +#include "nv50_ir_target.h" + +#include "nv50/nv50_debug.h" + +namespace nv50_ir { + +#define MAX_REGISTER_FILE_SIZE 256 + +class RegisterSet +{ +public: + RegisterSet(); + RegisterSet(const Target *); + + void init(const Target *); + void reset(); // reset allocation status, but not max assigned regs + + void periodicMask(DataFile f, uint32_t lock, uint32_t unlock); + void intersect(DataFile f, const RegisterSet *); + + bool assign(Value **, int nr); + void release(const Value *); + void occupy(const Value *); + + int getMaxAssigned(DataFile f) const { return fill[f]; } + + void print() const; + +private: + uint32_t bits[FILE_ADDRESS + 1][(MAX_REGISTER_FILE_SIZE + 31) / 32]; + + int unit[FILE_ADDRESS + 1]; // log2 of allocation granularity + + int last[FILE_ADDRESS + 1]; + int fill[FILE_ADDRESS + 1]; +}; + +void +RegisterSet::reset() +{ + memset(bits, 0, sizeof(bits)); +} + +RegisterSet::RegisterSet() +{ + reset(); +} + +void +RegisterSet::init(const 
Target *targ) +{ + for (unsigned int rf = 0; rf <= FILE_ADDRESS; ++rf) { + DataFile f = static_cast(rf); + last[rf] = targ->getFileSize(f) - 1; + unit[rf] = targ->getFileUnit(f); + fill[rf] = -1; + assert(last[rf] < MAX_REGISTER_FILE_SIZE); + } +} + +RegisterSet::RegisterSet(const Target *targ) +{ + reset(); + init(targ); +} + +void +RegisterSet::periodicMask(DataFile f, uint32_t lock, uint32_t unlock) +{ + for (int i = 0; i < (last[f] + 31) / 32; ++i) + bits[f][i] = (bits[f][i] | lock) & ~unlock; +} + +void +RegisterSet::intersect(DataFile f, const RegisterSet *set) +{ + for (int i = 0; i < (last[f] + 31) / 32; ++i) + bits[f][i] |= set->bits[f][i]; +} + +void +RegisterSet::print() const +{ + INFO("GPR:"); + for (int i = 0; i < (last[FILE_GPR] + 31) / 32; ++i) + INFO(" %08x", bits[FILE_GPR][i]); + INFO("\n"); +} + +bool +RegisterSet::assign(Value **def, int nr) +{ + DataFile f = def[0]->reg.file; + int n = nr; + if (n == 3) + n = 4; + int s = (n * def[0]->reg.size) >> unit[f]; + uint32_t m = (1 << s) - 1; + + int id = last[f] + 1; + int i; + + for (i = 0; (i * 32) < last[f]; ++i) { + if (bits[f][i] == 0xffffffff) + continue; + + for (id = 0; id < 32; id += s) + if (!(bits[f][i] & (m << id))) + break; + if (id < 32) + break; + } + id += i * 32; + if (id > last[f]) + return false; + + bits[f][id / 32] |= m << (id % 32); + + if (id + (s - 1) > fill[f]) + fill[f] = id + (s - 1); + + for (i = 0; i < nr; ++i, ++id) + if (!def[i]->livei.isEmpty()) // XXX: really increased id if empty ? 
+ def[i]->reg.data.id = id; + return true; +} + +void +RegisterSet::occupy(const Value *val) +{ + int id = val->reg.data.id; + if (id < 0) + return; + unsigned int f = val->reg.file; + + uint32_t m = (1 << (val->reg.size >> unit[f])) - 1; + + INFO_DBG(0, REG_ALLOC, "reg occupy: %u[%i] %x\n", f, id, m); + + bits[f][id / 32] |= m << (id % 32); + + if (fill[f] < id) + fill[f] = id; +} + +void +RegisterSet::release(const Value *val) +{ + int id = val->reg.data.id; + if (id < 0) + return; + unsigned int f = val->reg.file; + + uint32_t m = (1 << (val->reg.size >> unit[f])) - 1; + + INFO_DBG(0, REG_ALLOC, "reg release: %u[%i] %x\n", f, id, m); + + bits[f][id / 32] &= ~(m << (id % 32)); +} + +#define JOIN_MASK_PHI (1 << 0) +#define JOIN_MASK_UNION (1 << 1) +#define JOIN_MASK_MOV (1 << 2) +#define JOIN_MASK_TEX (1 << 3) +#define JOIN_MASK_CONSTRAINT (1 << 4) + +class RegAlloc +{ +public: + RegAlloc(Program *program) : prog(program), sequence(0) { } + + bool exec(); + bool execFunc(); + +private: + bool coalesceValues(unsigned int mask); + bool linearScan(); + bool allocateConstrainedValues(); + +private: + class PhiMovesPass : public Pass { + private: + virtual bool visit(BasicBlock *); + inline bool needNewElseBlock(BasicBlock *b, BasicBlock *p); + }; + + class BuildIntervalsPass : public Pass { + private: + virtual bool visit(BasicBlock *); + void collectLiveValues(BasicBlock *); + void addLiveRange(Value *, const BasicBlock *, int end); + }; + + class InsertConstraintsPass : public Pass { + public: + bool exec(Function *func); + private: + virtual bool visit(BasicBlock *); + + bool insertConstraintMoves(); + + void addHazard(Instruction *i, const ValueRef *src); + void textureMask(TexInstruction *); + void addConstraint(Instruction *, int s, int n); + bool detectConflict(Instruction *, int s); + + DLList constrList; + }; + + bool buildLiveSets(BasicBlock *); + void collectLValues(DLList&, bool assignedOnly); + + void insertOrderedTail(DLList&, Value *); + inline 
Instruction *insnBySerial(int); + +private: + Program *prog; + Function *func; + + // instructions in control flow / chronological order + ArrayList insns; + + int sequence; // for manual passes through CFG +}; + +Instruction * +RegAlloc::insnBySerial(int serial) +{ + return reinterpret_cast(insns.get(serial)); +} + +void +RegAlloc::BuildIntervalsPass::addLiveRange(Value *val, + const BasicBlock *bb, + int end) +{ + Instruction *insn = val->getUniqueInsn(); + + if (!insn) + return; + assert(bb->getFirst()->serial <= bb->getExit()->serial); + assert(bb->getExit()->serial + 1 >= end); + + int begin = insn->serial; + if (begin < bb->getEntry()->serial || begin > bb->getExit()->serial) + begin = bb->getEntry()->serial; + + INFO_DBG(prog->dbgFlags, REG_ALLOC, "%%%i <- live range [%i(%i), %i)\n", + val->id, begin, insn->serial, end); + + if (begin != end) // empty ranges are only added as hazards for fixed regs + val->livei.extend(begin, end); +} + +bool +RegAlloc::PhiMovesPass::needNewElseBlock(BasicBlock *b, BasicBlock *p) +{ + if (b->cfg.incidentCount() <= 1) + return false; + + int n = 0; + for (Graph::EdgeIterator ei = p->cfg.outgoing(); !ei.end(); ei.next()) + if (ei.getType() == Graph::Edge::TREE || + ei.getType() == Graph::Edge::FORWARD) + ++n; + return (n == 2); +} + +// For each operand of each PHI in b, generate a new value by inserting a MOV +// at the end of the block it is coming from and replace the operand with its +// result. This eliminates liveness conflicts and enables us to let values be +// copied to the right register if such a conflict exists nonetheless. +// +// These MOVs are also crucial in making sure the live intervals of phi srces +// are extended until the end of the loop, since they are not included in the +// live-in sets. 
+bool
+RegAlloc::PhiMovesPass::visit(BasicBlock *bb) // splits a critical in-edge if needed, then feeds every phi source through a MOV
+{
+   Instruction *phi, *mov;
+   BasicBlock *pb, *pn;
+
+   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
+      pb = pn = BasicBlock::get(ei.getNode());
+      assert(pb);
+
+      if (needNewElseBlock(bb, pb)) {
+         pn = new BasicBlock(func); // new block placed on the edge pb -> bb
+
+         // deletes an edge, iterator is invalid after this:
+         pb->cfg.detach(&bb->cfg);
+         pb->cfg.attach(&pn->cfg, Graph::Edge::TREE);
+         pn->cfg.attach(&bb->cfg, Graph::Edge::FORWARD); // XXX: check order !
+
+         assert(pb->getExit()->op != OP_CALL);
+         if (pb->getExit()->asFlow()->target.bb == bb)
+            pb->getExit()->asFlow()->target.bb = pn; // retarget the branch to the new block
+         break; // NOTE(review): at most one in-edge is split per visit, because the iterator is no longer usable
+      }
+   }
+
+   // insert MOVs (phi->src[j] should stem from j-th in-BB)
+   int j = 0;
+   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
+      pb = BasicBlock::get(ei.getNode());
+      if (!pb->isTerminated())
+         pb->insertTail(new_FlowInstruction(func, OP_BRA, bb)); // ensures pb has an exit to insert before
+
+      for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next) {
+         mov = new_Instruction(func, OP_MOV, TYPE_U32);
+
+         mov->setSrc(0, phi->getSrc(j));
+         mov->setDef(0, new_LValue(func, phi->getDef(0)->asLValue()));
+         phi->setSrc(j, mov->getDef(0)); // the phi now reads the copy instead of the original value
+
+         pb->insertBefore(pb->getExit(), mov);
+      }
+      ++j;
+   }
+
+   return true;
+}
+
+// Build the set of live-in variables of bb.
+bool +RegAlloc::buildLiveSets(BasicBlock *bb) +{ + BasicBlock *bn; + Instruction *i; + unsigned int s, d; + + INFO_DBG(prog->dbgFlags, REG_ALLOC, "buildLiveSets(BB:%i)\n", bb->getId()); + + bb->liveSet.allocate(func->allLValues.getSize(), false); + + int n = 0; + for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { + bn = BasicBlock::get(ei.getNode()); + if (bn == bb) + continue; + if (bn->cfg.visit(sequence)) + if (!buildLiveSets(bn)) + return false; + if (n++ == 0) + bb->liveSet = bn->liveSet; + else + bb->liveSet |= bn->liveSet; + } + if (!n && !bb->liveSet.marker) + bb->liveSet.fill(0); + bb->liveSet.marker = true; + + if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) { + INFO("BB:%i live set of out blocks:\n", bb->getId()); + bb->liveSet.print(); + } + + // if (!bb->getEntry()) + // return true; + + for (i = bb->getExit(); i && i != bb->getEntry()->prev; i = i->prev) { + for (d = 0; i->defExists(d); ++d) + bb->liveSet.clr(i->getDef(d)->id); + for (s = 0; i->srcExists(s); ++s) + if (i->getSrc(s)->asLValue()) + bb->liveSet.set(i->getSrc(s)->id); + } + for (i = bb->getPhi(); i && i->op == OP_PHI; i = i->next) + bb->liveSet.clr(i->getDef(0)->id); + + if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) { + INFO("BB:%i live set after propagation:\n", bb->getId()); + bb->liveSet.print(); + } + + return true; +} + +void +RegAlloc::BuildIntervalsPass::collectLiveValues(BasicBlock *bb) +{ + BasicBlock *bbA = NULL, *bbB = NULL; + + assert(bb->cfg.incidentCount() || bb->liveSet.popCount() == 0); + + if (bb->cfg.outgoingCount()) { + // trickery to save a loop of OR'ing liveSets + // aliasing works fine with BitSet::setOr + for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { + if (ei.getType() == Graph::Edge::DUMMY) + continue; + if (bbA) { + bb->liveSet.setOr(&bbA->liveSet, &bbB->liveSet); + bbA = bb; + } else { + bbA = bbB; + } + bbB = BasicBlock::get(ei.getNode()); + } + bb->liveSet.setOr(&bbB->liveSet, bbA ? 
&bbA->liveSet : NULL); + } else + if (bb->cfg.incidentCount()) { + bb->liveSet.fill(0); + } +} + +bool +RegAlloc::BuildIntervalsPass::visit(BasicBlock *bb) +{ + collectLiveValues(bb); + + INFO_DBG(prog->dbgFlags, REG_ALLOC, "BuildIntervals(BB:%i)\n", bb->getId()); + + // go through out blocks and delete phi sources that do not originate from + // the current block from the live set + for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { + BasicBlock *out = BasicBlock::get(ei.getNode()); + + for (Instruction *i = out->getPhi(); i && i->op == OP_PHI; i = i->next) { + bb->liveSet.clr(i->getDef(0)->id); + + for (int s = 0; s < NV50_IR_MAX_SRCS && i->src[s].exists(); ++s) { + assert(i->src[s].getInsn()); + if (i->getSrc(s)->getUniqueInsn()->bb == bb) // XXX: reachableBy ? + bb->liveSet.set(i->getSrc(s)->id); + else + bb->liveSet.clr(i->getSrc(s)->id); + } + } + } + + // remaining live-outs are live until end + if (bb->getExit()) { + for (unsigned int j = 0; j < bb->liveSet.getSize(); ++j) + if (bb->liveSet.test(j)) + addLiveRange(func->getLValue(j), bb, bb->getExit()->serial + 1); + } + + for (Instruction *i = bb->getExit(); i && i->op != OP_PHI; i = i->prev) { + for (int d = 0; i->defExists(d); ++d) { + bb->liveSet.clr(i->getDef(d)->id); + if (i->getDef(d)->reg.data.id >= 0) // add hazard for fixed regs + i->getDef(d)->livei.extend(i->serial, i->serial); + } + + for (int s = 0; i->srcExists(s); ++s) { + if (!i->getSrc(s)->asLValue()) + continue; + if (!bb->liveSet.test(i->getSrc(s)->id)) { + bb->liveSet.set(i->getSrc(s)->id); + addLiveRange(i->getSrc(s), bb, i->serial); + } + } + } + + return true; +} + +bool +RegAlloc::coalesceValues(unsigned int mask) +{ + int c, n; + + for (n = 0; n < insns.getSize(); ++n) { + Instruction *i; + Instruction *insn = insnBySerial(n); + + switch (insn->op) { + case OP_PHI: + if (!(mask & JOIN_MASK_PHI)) + break; + for (c = 0; insn->srcExists(c); ++c) + if (!insn->getDef(0)->coalesce(insn->getSrc(c), false)) { + 
ERROR("failed to coalesce phi operands\n"); + return false; + } + break; + case OP_UNION: + if (!(mask & JOIN_MASK_UNION)) + break; + for (c = 0; insn->srcExists(c); ++c) + insn->getDef(0)->coalesce(insn->getSrc(c), true); + break; + case OP_CONSTRAINT: + if (!(mask & JOIN_MASK_CONSTRAINT)) + break; + for (c = 0; c < 4 && insn->srcExists(c); ++c) + insn->getDef(c)->coalesce(insn->getSrc(c), true); + break; + case OP_MOV: + if (!(mask & JOIN_MASK_MOV)) + break; + i = insn->getSrc(0)->getUniqueInsn(); + if (i && !i->constrainedDefs()) + insn->getDef(0)->coalesce(insn->getSrc(0), false); + break; + case OP_TEX: + case OP_TXB: + case OP_TXL: + case OP_TXF: + case OP_TXQ: + case OP_TXD: + case OP_TXG: + case OP_TEXCSAA: + if (!(mask & JOIN_MASK_TEX)) + break; + for (c = 0; c < 4 && insn->srcExists(c); ++c) + insn->getDef(c)->coalesce(insn->getSrc(c), true); + break; + default: + break; + } + } + return true; +} + +void +RegAlloc::insertOrderedTail(DLList &list, Value *val) +{ + // we insert the live intervals in order, so this should be short + DLList::Iterator iter = list.revIterator(); + const int begin = val->livei.begin(); + for (; !iter.end(); iter.next()) { + if (reinterpret_cast(iter.get())->livei.begin() <= begin) + break; + } + iter.insert(val); +} + +static void +checkList(DLList &list) +{ + Value *prev = NULL; + Value *next = NULL; + + for (DLList::Iterator iter = list.iterator(); !iter.end(); iter.next()) { + next = Value::get(iter); + assert(next); + if (prev) { + assert(prev->livei.begin() <= next->livei.begin()); + } + assert(next->join == next); + prev = next; + } +} + +void +RegAlloc::collectLValues(DLList &list, bool assignedOnly) +{ + for (int n = 0; n < insns.getSize(); ++n) { + Instruction *i = insnBySerial(n); + + for (int d = 0; i->defExists(d); ++d) + if (!i->getDef(d)->livei.isEmpty()) + if (!assignedOnly || i->getDef(d)->reg.data.id >= 0) + insertOrderedTail(list, i->getDef(d)); + } + checkList(list); +} + +bool 
// Linear scan register allocation over all values with a non-empty live
// interval, processed in order of interval start.
//
// Bookkeeping:
//   unhandled - values not yet processed (sorted by collectLValues)
//   active    - processed values whose interval contains the current start
//   inactive  - processed values that have not ended yet but whose interval
//               does not contain the current start
//   free      - register set with the registers of all active values occupied
//   f         - per-value working copy of `free`, additionally blocking
//               registers of overlapping inactive values and of overlapping
//               unhandled values that already have a register id
//
// There is no spilling: running out of registers logs an error and aborts.
// The function therefore only ever returns true.
bool
RegAlloc::linearScan()
{
   Value *cur, *val;
   DLList unhandled, active, inactive;
   RegisterSet f(prog->getTarget()), free(prog->getTarget());

   INFO_DBG(prog->dbgFlags, REG_ALLOC, "RA: linear scan\n");

   collectLValues(unhandled, false);

   // no increment expression: erase() below advances the iterator
   for (DLList::Iterator cI = unhandled.iterator(); !cI.end();) {
      cur = Value::get(cI);
      cI.erase();

      // retire or deactivate active values relative to cur's start
      for (DLList::Iterator aI = active.iterator(); !aI.end();) {
         val = Value::get(aI);
         if (val->livei.end() <= cur->livei.begin()) {
            // interval ended before cur starts: register is free for good
            free.release(val);
            aI.erase();
         } else
         if (!val->livei.contains(cur->livei.begin())) {
            // cur starts outside val's ranges: free it for now
            free.release(val);
            aI.moveToList(inactive);
         } else {
            aI.next();
         }
      }

      // drop finished inactive values, re-activate those live at cur's start
      for (DLList::Iterator iI = inactive.iterator(); !iI.end();) {
         val = Value::get(iI);
         if (val->livei.end() <= cur->livei.begin()) {
            iI.erase();
         } else
         if (val->livei.contains(cur->livei.begin())) {
            free.occupy(val);
            iI.moveToList(active);
         } else {
            iI.next();
         }
      }
      f = free;

      // inactive values still block their register if they overlap cur
      // anywhere, not just at its start
      for (DLList::Iterator iI = inactive.iterator(); !iI.end(); iI.next()) {
         val = Value::get(iI);
         if (val->livei.overlaps(cur->livei))
            f.occupy(val);
      }

      // so do not-yet-processed values that already have a register id
      for (DLList::Iterator uI = unhandled.iterator(); !uI.end(); uI.next()) {
         val = Value::get(uI);
         if (val->reg.data.id >= 0 && val->livei.overlaps(cur->livei))
            f.occupy(val);
      }

      // only values without a register id need an assignment
      if (cur->reg.data.id < 0) {
         bool spill = !f.assign(&cur, 1);
         if (spill) {
            ERROR("out of registers of file %u\n", cur->reg.file);
            abort();
         }
      }
      free.occupy(cur);
      active.insert(cur);
   }

   // record the highest GPR seen by either register set
   if (f.getMaxAssigned(FILE_GPR) > prog->maxGPR)
      prog->maxGPR = f.getMaxAssigned(FILE_GPR);
   if (free.getMaxAssigned(FILE_GPR) > prog->maxGPR)
      prog->maxGPR = free.getMaxAssigned(FILE_GPR);
   return true;
}
func->orderInstructions(this->insns); + + ret = buildIntervals.run(func); + if (!ret) + goto out; + + ret = coalesceValues(JOIN_MASK_PHI); + if (!ret) + goto out; + switch (prog->getTarget()->getChipset() & 0xf0) { + case 0x50: + ret = coalesceValues(JOIN_MASK_UNION | JOIN_MASK_TEX); + break; + case 0xc0: + ret = coalesceValues(JOIN_MASK_UNION | JOIN_MASK_CONSTRAINT); + break; + default: + break; + } + if (!ret) + goto out; + ret = coalesceValues(JOIN_MASK_MOV); + if (!ret) + goto out; + + if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) { + func->print(); + func->printLiveIntervals(); + } + + ret = allocateConstrainedValues() && linearScan(); + if (!ret) + goto out; + +out: + // TODO: should probably call destructor on LValues later instead + for (ArrayList::Iterator it = func->allLValues.iterator(); + !it.end(); it.next()) + reinterpret_cast(it.get())->livei.clear(); + + return ret; +} + +bool Program::registerAllocation() +{ + RegAlloc ra(this); + return ra.exec(); +} + +bool +RegAlloc::InsertConstraintsPass::exec(Function *ir) +{ + constrList.clear(); + + bool ret = run(ir, true, true); + if (ret) + ret = insertConstraintMoves(); + return ret; +} + +// TODO: make part of texture insn +void +RegAlloc::InsertConstraintsPass::textureMask(TexInstruction *tex) +{ + Value *def[4]; + int c, k, d; + uint8_t mask = 0; + + for (d = 0, k = 0, c = 0; c < 4; ++c) { + if (!(tex->tex.mask & (1 << c))) + continue; + if (tex->getDef(k)->refCount()) { + mask |= 1 << c; + def[d++] = tex->getDef(k); + } + ++k; + } + tex->tex.mask = mask; + +#if 0 // reorder or set the unused ones NULL ? 
+ for (c = 0; c < 4; ++c) + if (!(tex->tex.mask & (1 << c))) + def[d++] = tex->getDef(c); +#endif + for (c = 0; c < d; ++c) + tex->setDef(c, def[c]); +#if 1 + for (; c < 4; ++c) + tex->setDef(c, NULL); +#endif +} + +bool +RegAlloc::InsertConstraintsPass::detectConflict(Instruction *cst, int s) +{ + // current register allocation can't handle it if a value participates in + // multiple constraints + for (ValueRef::Iterator it = cst->src[s].iterator(); !it.end(); it.next()) { + Instruction *insn = it.get()->getInsn(); + if (insn != cst) + return true; + } + + // can start at s + 1 because detectConflict is called on all sources + for (int c = s + 1; cst->srcExists(c); ++c) + if (cst->getSrc(c) == cst->getSrc(s)) + return true; + + Instruction *defi = cst->getSrc(s)->getInsn(); + + return (!defi || defi->constrainedDefs()); +} + +void +RegAlloc::InsertConstraintsPass::addConstraint(Instruction *i, int s, int n) +{ + Instruction *cst; + int d; + + // first, look for an existing identical constraint op + for (DLList::Iterator it = constrList.iterator(); !it.end(); it.next()) { + cst = reinterpret_cast(it.get()); + if (!i->bb->dominatedBy(cst->bb)) + break; + for (d = 0; d < n; ++d) + if (cst->getSrc(d) != i->getSrc(d + s)) + break; + if (d >= n) { + for (d = 0; d < n; ++d, ++s) + i->setSrc(s, cst->getDef(d)); + return; + } + } + cst = new_Instruction(func, OP_CONSTRAINT, i->dType); + + for (d = 0; d < n; ++s, ++d) { + cst->setDef(d, new_LValue(func, FILE_GPR)); + cst->setSrc(d, i->getSrc(s)); + i->setSrc(s, cst->getDef(d)); + } + i->bb->insertBefore(i, cst); + + constrList.insert(cst); +} + +// Add a dummy use of the pointer source of >= 8 byte loads after the load +// to prevent it from being assigned a register which overlapping the load's +// destination, which would produce random corruptions. 
// Insert constraint markers for instructions whose multiple sources must be
// located in consecutive registers.
//
// Per instruction of `bb`:
//  - texture ops: trim the def mask (textureMask), then constrain the first
//    `s` sources and the remaining `n` sources as two separate groups,
//  - OP_EXPORT / OP_STORE: constrain the data sources (index 1 and up) that
//    together make up typeSizeof(dType) bytes,
//  - OP_LOAD with an indirect source 0 and a type of 8 bytes or more: add a
//    hazard marker for the indirect value (see addHazard).
// Always returns true.
bool
RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb)
{
   TexInstruction *tex;
   Instruction *next;
   int s, n, size;

   // `next` is fetched up front because addConstraint/addHazard insert
   // instructions around `i`
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if ((tex = i->asTex())) {
         textureMask(tex);

         // FIXME: this is target specific
         if (tex->op == OP_TXQ) {
            // all sources form a single group
            s = tex->srcCount(0xff);
            n = 0;
         } else {
            // first group: the target's argument count, plus one more source
            // for non-array targets with an indirect resource/sampler index
            s = tex->tex.target.getArgCount();
            if (!tex->tex.target.isArray() &&
                (tex->tex.rIndirectSrc >= 0 || tex->tex.sIndirectSrc >= 0))
               ++s;
            // second group: whatever sources remain
            n = tex->srcCount(0xff) - s;
            assert(n <= 4);
         }

         // a group of one source needs no constraint
         if (s > 1)
            addConstraint(i, 0, s);
         if (n > 1)
            addConstraint(i, s, n);
      } else
      if (i->op == OP_EXPORT || i->op == OP_STORE) {
         // walk sources from index 1 until their sizes add up to the
         // instruction's type size; s ends one past the last data source
         for (size = typeSizeof(i->dType), s = 1; size > 0; ++s) {
            assert(i->srcExists(s));
            size -= i->getSrc(s)->reg.size;
         }
         if ((s - 1) > 1)
            addConstraint(i, 1, s - 1);
      } else
      if (i->op == OP_LOAD) {
         if (i->src[0].isIndirect(0) && typeSizeof(i->dType) >= 8)
            addHazard(i, i->src[0].getIndirect(0));
      }
   }
   return true;
}
+bool +RegAlloc::InsertConstraintsPass::insertConstraintMoves() +{ + for (DLList::Iterator it = constrList.iterator(); !it.end(); it.next()) { + Instruction *cst = reinterpret_cast(it.get()); + + for (int s = 0; cst->srcExists(s); ++s) { + if (!detectConflict(cst, s)) + continue; + Instruction *mov = new_Instruction(func, OP_MOV, + typeOfSize(cst->src[s].getSize())); + mov->setSrc(0, cst->getSrc(s)); + mov->setDef(0, new_LValue(func, FILE_GPR)); + cst->setSrc(s, mov->getDef(0)); + + cst->bb->insertBefore(cst, mov); + } + } + return true; +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_ssa.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_ssa.cpp new file mode 100644 index 00000000000..841163b0ac9 --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_ssa.cpp @@ -0,0 +1,463 @@ + +#include "nv50_ir.h" +#include "nv50_ir_target.h" + +namespace nv50_ir { + +// Converts nv50 IR generated from TGSI to SSA form. + +// DominatorTree implements an algorithm for finding immediate dominators, +// as described by T. Lengauer & R. Tarjan. 
+class DominatorTree : public Graph +{ +public: + DominatorTree(Graph *cfg); + ~DominatorTree() { } + + bool dominates(BasicBlock *, BasicBlock *); + + void findDominanceFrontiers(); + +private: + void build(); + void buildDFS(Node *); + + void squash(int); + inline void link(int, int); + inline int eval(int); + + void debugPrint(); + + Graph *cfg; + + Node **vert; + int *data; + const int count; + + #define SEMI(i) (data[(i) + 0 * count]) + #define ANCESTOR(i) (data[(i) + 1 * count]) + #define PARENT(i) (data[(i) + 2 * count]) + #define LABEL(i) (data[(i) + 3 * count]) + #define DOM(i) (data[(i) + 4 * count]) +}; + +void DominatorTree::debugPrint() +{ + for (int i = 0; i < count; ++i) { + INFO("SEMI(%i) = %i\n", i, SEMI(i)); + INFO("ANCESTOR(%i) = %i\n", i, ANCESTOR(i)); + INFO("PARENT(%i) = %i\n", i, PARENT(i)); + INFO("LABEL(%i) = %i\n", i, LABEL(i)); + INFO("DOM(%i) = %i\n", i, DOM(i)); + } +} + +DominatorTree::DominatorTree(Graph *cfgraph) : cfg(cfgraph), + count(cfg->getSize()) +{ + Iterator *iter; + int i; + + vert = new Node * [count]; + data = new int[5 * count]; + + for (i = 0, iter = cfg->iteratorDFS(true); !iter->end(); iter->next(), ++i) { + vert[i] = reinterpret_cast(iter->get()); + vert[i]->tag = i; + LABEL(i) = i; + SEMI(i) = ANCESTOR(i) = -1; + } + cfg->putIterator(iter); + + build(); + + delete[] vert; + delete[] data; +} + +void DominatorTree::buildDFS(Graph::Node *node) +{ + SEMI(node->tag) = node->tag; + + for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next()) { + if (SEMI(ei.getNode()->tag) < 0) { + buildDFS(ei.getNode()); + PARENT(ei.getNode()->tag) = node->tag; + } + } +} + +void DominatorTree::squash(int v) +{ + if (ANCESTOR(ANCESTOR(v)) >= 0) { + squash(ANCESTOR(v)); + + if (SEMI(LABEL(ANCESTOR(v))) < SEMI(LABEL(v))) + LABEL(v) = LABEL(ANCESTOR(v)); + ANCESTOR(v) = ANCESTOR(ANCESTOR(v)); + } +} + +int DominatorTree::eval(int v) +{ + if (ANCESTOR(v) < 0) + return v; + squash(v); + return LABEL(v); +} + +void 
DominatorTree::link(int v, int w) +{ + ANCESTOR(w) = v; +} + +void DominatorTree::build() +{ + DLList *bucket = new DLList[count]; + Node *nv, *nw; + int p, u, v, w; + + buildDFS(cfg->getRoot()); + + for (w = count - 1; w >= 1; --w) { + nw = vert[w]; + assert(nw->tag == w); + for (Graph::EdgeIterator ei = nw->incident(); !ei.end(); ei.next()) { + nv = ei.getNode(); + v = nv->tag; + u = eval(v); + if (SEMI(u) < SEMI(w)) + SEMI(w) = SEMI(u); + } + p = PARENT(w); + bucket[SEMI(w)].insert(nw); + link(p, w); + + for (DLList::Iterator it = bucket[p].iterator(); !it.end(); it.erase()) { + v = reinterpret_cast(it.get())->tag; + u = eval(v); + DOM(v) = (SEMI(u) < SEMI(v)) ? u : p; + } + } + for (w = 1; w < count; ++w) { + if (DOM(w) != SEMI(w)) + DOM(w) = DOM(DOM(w)); + } + DOM(0) = 0; + + insert(&BasicBlock::get(cfg->getRoot())->dom); + do { + p = 0; + for (v = 1; v < count; ++v) { + nw = &BasicBlock::get(vert[DOM(v)])->dom;; + nv = &BasicBlock::get(vert[v])->dom; + if (nw->getGraph() && !nv->getGraph()) { + ++p; + nw->attach(nv, Graph::Edge::TREE); + } + } + } while (p); + + delete[] bucket; +} + +#undef SEMI +#undef ANCESTOR +#undef PARENT +#undef LABEL +#undef DOM + +void DominatorTree::findDominanceFrontiers() +{ + Iterator *dtIter; + BasicBlock *bb; + + for (dtIter = this->iteratorDFS(false); !dtIter->end(); dtIter->next()) { + EdgeIterator succIter, chldIter; + + bb = BasicBlock::get(reinterpret_cast(dtIter->get())); + bb->getDF().clear(); + + for (succIter = bb->cfg.outgoing(); !succIter.end(); succIter.next()) { + BasicBlock *dfLocal = BasicBlock::get(succIter.getNode()); + if (dfLocal->idom() != bb) + bb->getDF().insert(dfLocal); + } + + for (chldIter = bb->dom.outgoing(); !chldIter.end(); chldIter.next()) { + BasicBlock *cb = BasicBlock::get(chldIter.getNode()); + + DLList::Iterator dfIter = cb->getDF().iterator(); + for (; !dfIter.end(); dfIter.next()) { + BasicBlock *dfUp = BasicBlock::get(dfIter); + if (dfUp->idom() != bb) + bb->getDF().insert(dfUp); + } + } + 
} + this->putIterator(dtIter); +} + +// liveIn(bb) = usedBeforeAssigned(bb) U (liveOut(bb) - assigned(bb)) +void +Function::buildLiveSetsPreSSA(BasicBlock *bb, const int seq) +{ + BitSet usedBeforeAssigned(allLValues.getSize(), true); + BitSet assigned(allLValues.getSize(), true); + + bb->liveSet.allocate(allLValues.getSize(), false); + + int n = 0; + for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { + BasicBlock *out = BasicBlock::get(ei.getNode()); + if (out == bb) + continue; + if (out->cfg.visit(seq)) + buildLiveSetsPreSSA(out, seq); + if (!n++) + bb->liveSet = out->liveSet; + else + bb->liveSet |= out->liveSet; + } + if (!n && !bb->liveSet.marker) + bb->liveSet.fill(0); + bb->liveSet.marker = true; + + for (Instruction *i = bb->getEntry(); i; i = i->next) { + for (int s = 0; i->srcExists(s); ++s) + if (i->getSrc(s)->asLValue() && !assigned.test(i->getSrc(s)->id)) + usedBeforeAssigned.set(i->getSrc(s)->id); + for (int d = 0; i->defExists(d); ++d) + assigned.set(i->getDef(d)->id); + } + + bb->liveSet.andNot(assigned); + bb->liveSet |= usedBeforeAssigned; +} + +class RenamePass +{ +public: + RenamePass(Function *); + ~RenamePass(); + + bool run(); + void search(BasicBlock *); + + inline LValue *getStackTop(Value *); + +private: + Stack *stack; + Function *func; + Program *prog; + Instruction *undef; +}; + +bool +Program::convertToSSA() +{ + for (ArrayList::Iterator fi = allFuncs.iterator(); !fi.end(); fi.next()) { + Function *fn = reinterpret_cast(fi.get()); + if (!fn->convertToSSA()) + return false; + } + return true; +} + +// XXX: add edge from entry to exit ? + +// Efficiently Computing Static Single Assignment Form and +// the Control Dependence Graph, +// R. Cytron, J. Ferrante, B. K. Rosen, M. N. Wegman, F. K. Zadeck +bool +Function::convertToSSA() +{ + // 0. 
calculate live in variables (for pruned SSA) + int seq = cfg.nextSequence(); + for (unsigned i = 0; i <= loopNestingBound; seq = cfg.nextSequence(), ++i) + buildLiveSetsPreSSA(BasicBlock::get(cfg.getRoot()), seq); + + // reset liveSet marker for use in regalloc + for (ArrayList::Iterator bi = allBBlocks.iterator(); !bi.end(); bi.next()) + reinterpret_cast(bi.get())->liveSet.marker = false; + + // 1. create the dominator tree + domTree = new DominatorTree(&cfg); + reinterpret_cast(domTree)->findDominanceFrontiers(); + + // 2. insert PHI functions + DLList workList; + LValue *lval; + BasicBlock *bb; + int var; + int iterCount = 0; + int *hasAlready = new int[allBBlocks.getSize() * 2]; + int *work = &hasAlready[allBBlocks.getSize()]; + + memset(hasAlready, 0, allBBlocks.getSize() * 2 * sizeof(int)); + + // for each variable + for (var = 0; var < allLValues.getSize(); ++var) { + if (!allLValues.get(var)) + continue; + lval = reinterpret_cast(allLValues.get(var))->asLValue(); + if (!lval || !lval->defs) + continue; + ++iterCount; + + // TODO: don't add phi functions for values that aren't used outside + // the BB they're defined in + + // gather blocks with assignments to lval in workList + for (ValueDef::Iterator d = lval->defs->iterator(); !d.end(); d.next()) { + bb = d.get()->getInsn()->bb; + if (!bb) + continue; // instruction likely been removed but not XXX deleted + + if (work[bb->getId()] == iterCount) + continue; + work[bb->getId()] = iterCount; + workList.insert(bb); + } + + // for each block in workList, insert a phi for lval in the block's + // dominance frontier (if we haven't already done so) + for (DLList::Iterator wI = workList.iterator(); !wI.end(); wI.erase()) { + bb = BasicBlock::get(wI); + + DLList::Iterator dfIter = bb->getDF().iterator(); + for (; !dfIter.end(); dfIter.next()) { + Instruction *phi; + BasicBlock *dfBB = BasicBlock::get(dfIter); + + if (hasAlready[dfBB->getId()] >= iterCount) + continue; + hasAlready[dfBB->getId()] = iterCount; + + // 
pruned SSA: don't need a phi if the value is not live-in + if (!dfBB->liveSet.test(lval->id)) + continue; + + // TODO: use dedicated PhiInstruction to lift this limit + assert(dfBB->cfg.incidentCount() <= NV50_IR_MAX_SRCS); + + phi = new_Instruction(this, OP_PHI, typeOfSize(lval->reg.size)); + dfBB->insertTail(phi); + + phi->setDef(0, lval); + for (int s = 0; s < dfBB->cfg.incidentCount(); ++s) + phi->setSrc(s, lval); + + if (work[dfBB->getId()] < iterCount) { + work[dfBB->getId()] = iterCount; + wI.insert(dfBB); + } + } + } + } + delete[] hasAlready; + + RenamePass rename(this); + return rename.run(); +} + +RenamePass::RenamePass(Function *fn) : func(fn), prog(fn->getProgram()) +{ + BasicBlock *root = BasicBlock::get(func->cfg.getRoot()); + + undef = new_Instruction(func, OP_NOP, TYPE_U32); + undef->setDef(0, new_LValue(func, FILE_GPR)); + root->insertHead(undef); + + stack = new Stack[func->allLValues.getSize()]; +} + +RenamePass::~RenamePass() +{ + if (stack) + delete[] stack; +} + +LValue * +RenamePass::getStackTop(Value *val) +{ + if (!stack[val->id].getSize()) + return 0; + return reinterpret_cast(stack[val->id].peek().u.p); +} + +bool RenamePass::run() +{ + if (!stack) + return false; + search(BasicBlock::get(func->domTree->getRoot())); + + ArrayList::Iterator iter = func->allInsns.iterator(); + for (; !iter.end(); iter.next()) { + Instruction *insn = reinterpret_cast(iter.get()); + for (int d = 0; insn->defExists(d); ++d) + insn->def[d].restoreDefList(); + } + + return true; +} + +void RenamePass::search(BasicBlock *bb) +{ + LValue *lval; + int d, s; + const Target *targ = prog->getTarget(); + + for (Instruction *stmt = bb->getFirst(); stmt; stmt = stmt->next) { + if (stmt->op != OP_PHI) { + for (s = 0; stmt->srcExists(s); ++s) { + lval = stmt->getSrc(s)->asLValue(); + if (!lval) + continue; + lval = getStackTop(lval); + if (!lval) + lval = static_cast(undef->getDef(0)); + stmt->setSrc(s, lval); + } + } + for (d = 0; stmt->defExists(d); ++d) { + lval = 
stmt->def[d].get()->asLValue(); + assert(lval); + stmt->def[d].setSSA( + new_LValue(func, targ->nativeFile(lval->reg.file))); + stmt->def[d].get()->reg.data.id = lval->reg.data.id; + stack[lval->id].push(stmt->def[d].get()); + } + } + + for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { + Instruction *phi; + int p = 0; + BasicBlock *sb = BasicBlock::get(ei.getNode()); + + // which predecessor of sb is bb ? + for (Graph::EdgeIterator ei = sb->cfg.incident(); !ei.end(); ei.next()) { + if (ei.getNode() == &bb->cfg) + break; + ++p; + } + assert(p < sb->cfg.incidentCount()); + + for (phi = sb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next) { + lval = getStackTop(phi->getSrc(p)); + if (!lval) + lval = undef->getDef(0)->asLValue(); + phi->setSrc(p, lval); + } + } + + for (Graph::EdgeIterator ei = bb->dom.outgoing(); !ei.end(); ei.next()) + search(BasicBlock::get(ei.getNode())); + + for (Instruction *stmt = bb->getFirst(); stmt; stmt = stmt->next) { + for (d = 0; stmt->defExists(d); ++d) + stack[stmt->def[d].preSSA()->id].pop(); + } +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp new file mode 100644 index 00000000000..59fb0c19b0b --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp @@ -0,0 +1,304 @@ + +#include "nv50/codegen/nv50_ir.h" +#include "nv50/codegen/nv50_ir_target.h" + +namespace nv50_ir { + +const uint8_t Target::operationSrcNr[OP_LAST + 1] = +{ + 0, 0, // NOP, PHI + 0, 0, 0, 0, // UNION, SPLIT, MERGE, CONSTRAINT + 1, 1, 2, // MOV, LOAD, STORE + 2, 2, 2, 2, 2, 3, 3, 3, // ADD, SUB, MUL, DIV, MOD, MAD, FMA, SAD + 1, 1, 1, // ABS, NEG, NOT + 2, 2, 2, 2, 2, // AND, OR, XOR, SHL, SHR + 2, 2, 1, // MAX, MIN, SAT + 1, 1, 1, 1, // CEIL, FLOOR, TRUNC, CVT + 3, 3, 3, 2, 3, 3, // SET_AND,OR,XOR, SET, SELP, SLCT + 1, 1, 1, 1, 1, 1, // RCP, RSQ, LG2, SIN, COS, EX2 + 1, 1, 1, 1, 1, 2, // EXP, LOG, PRESIN, PREEX2, SQRT, POW + 0, 
0, 0, 0, 0, // BRA, CALL, RET, CONT, BREAK, + 0, 0, 0, // PRERET,CONT,BREAK + 0, 0, 0, 0, 0, 0, // BRKPT, JOINAT, JOIN, DISCARD, EXIT, MEMBAR + 1, 1, 2, 1, 2, // VFETCH, PFETCH, EXPORT, LINTERP, PINTERP + 1, 1, // EMIT, RESTART + 1, 1, 1, // TEX, TXB, TXL, + 1, 1, 1, 1, 1, // TXF, TXQ, TXD, TXG, TEXCSAA + 1, 2, // SULD, SUST + 1, 1, // DFDX, DFDY + 1, 2, 2, 2, 0, 0, // RDSV, WRSV, PIXLD, QUADOP, QUADON, QUADPOP + 2, 3, 2, // POPCNT, INSBF, EXTBF + 0 +}; + + +extern Target *getTargetNVC0(unsigned int chipset); + +Target *Target::create(unsigned int chipset) +{ + switch (chipset & 0xf0) { + case 0xc0: + return getTargetNVC0(chipset); + case 0x50: + case 0x80: + case 0x90: + case 0xa0: + default: + ERROR("unsupported target: NV%x\n", chipset); + return 0; + } +} + +void Target::destroy(Target *targ) +{ + delete targ; +} + +void +CodeEmitter::setCodeLocation(void *ptr, uint32_t size) +{ + code = reinterpret_cast(ptr); + codeSize = 0; + codeSizeLimit = size; +} + +void +CodeEmitter::printBinary() const +{ + uint32_t *bin = code - codeSize / 4; + INFO("program binary (%u bytes)", codeSize); + for (unsigned int pos = 0; pos < codeSize / 4; ++pos) { + if ((pos % 8) == 0) + INFO("\n"); + INFO("%08x ", bin[pos]); + } + INFO("\n"); +} + +void +CodeEmitter::prepareEmission(Program *prog) +{ + for (ArrayList::Iterator fi = prog->allFuncs.iterator(); + !fi.end(); fi.next()) { + Function *func = reinterpret_cast(fi.get()); + func->binPos = prog->binSize; + prepareEmission(func); + prog->binSize += func->binSize; + } +} + +void +CodeEmitter::prepareEmission(Function *func) +{ + func->bbCount = 0; + func->bbArray = new BasicBlock * [func->cfg.getSize()]; + + BasicBlock::get(func->cfg.getRoot())->binPos = func->binPos; + + Graph::GraphIterator *iter; + for (iter = func->cfg.iteratorCFG(); !iter->end(); iter->next()) + prepareEmission(BasicBlock::get(*iter)); + func->cfg.putIterator(iter); +} + +void +CodeEmitter::prepareEmission(BasicBlock *bb) +{ + Instruction *i, *next; + Function 
*func = bb->getFunction(); + int j; + unsigned int nShort; + + for (j = func->bbCount - 1; j >= 0 && !func->bbArray[j]->binSize; --j); + + for (; j >= 0; --j) { + BasicBlock *in = func->bbArray[j]; + Instruction *exit = in->getExit(); + + if (exit && exit->op == OP_BRA && exit->asFlow()->target.bb == bb) { + in->binSize -= 8; + func->binSize -= 8; + + for (++j; j < func->bbCount; ++j) + func->bbArray[j]->binPos -= 8; + + in->remove(exit); + } + bb->binPos = in->binPos + in->binSize; + if (in->binSize) // no more no-op branches to bb + break; + } + func->bbArray[func->bbCount++] = bb; + + if (!bb->getExit()) + return; + + // determine encoding size, try to group short instructions + nShort = 0; + for (i = bb->getEntry(); i; i = next) { + next = i->next; + + i->encSize = getMinEncodingSize(i); + if (next && i->encSize < 8) + ++nShort; + else + if ((nShort & 1) && next && getMinEncodingSize(next) == 4) { + if (i->isCommutationLegal(i->next)) { + bb->permuteAdjacent(i, next); + next->encSize = 4; + next = i; + i = i->prev; + ++nShort; + } else + if (i->isCommutationLegal(i->prev) && next->next) { + bb->permuteAdjacent(i->prev, i); + next->encSize = 4; + next = next->next; + bb->binSize += 4; + ++nShort; + } else { + i->encSize = 8; + i->prev->encSize = 8; + bb->binSize += 4; + nShort = 0; + } + } else { + i->encSize = 8; + if (nShort & 1) { + i->prev->encSize = 8; + bb->binSize += 4; + } + nShort = 0; + } + bb->binSize += i->encSize; + } + + if (bb->getExit()->encSize == 4) { + assert(nShort); + bb->getExit()->encSize = 8; + bb->binSize += 4; + + if ((bb->getExit()->prev->encSize == 4) && !(nShort & 1)) { + bb->binSize += 8; + bb->getExit()->prev->encSize = 8; + } + } + assert(!bb->getEntry() || (bb->getExit() && bb->getExit()->encSize == 8)); + + func->binSize += bb->binSize; +} + +bool +Program::emitBinary(struct nv50_ir_prog_info *info) +{ + CodeEmitter *emit = target->getCodeEmitter(progType); + + emit->prepareEmission(this); + + if (dbgFlags & NV50_IR_DEBUG_BASIC) 
+ this->print(); + + if (!binSize) { + code = NULL; + return false; + } + code = reinterpret_cast(MALLOC(binSize)); + if (!code) + return false; + emit->setCodeLocation(code, binSize); + + for (ArrayList::Iterator fi = allFuncs.iterator(); !fi.end(); fi.next()) { + Function *fn = reinterpret_cast(fi.get()); + + assert(emit->getCodeSize() == fn->binPos); + + for (int b = 0; b < fn->bbCount; ++b) + for (Instruction *i = fn->bbArray[b]->getEntry(); i; i = i->next) + emit->emitInstruction(i); + } + info->bin.relocData = emit->getRelocInfo(); + + delete emit; + return true; +} + +#define RELOC_ALLOC_INCREMENT 8 + +bool +CodeEmitter::addReloc(RelocEntry::Type ty, int w, uint32_t data, uint32_t m, + int s) +{ + unsigned int n = relocInfo ? relocInfo->count : 0; + + if (!(n % RELOC_ALLOC_INCREMENT)) { + size_t size = sizeof(RelocInfo) + n * sizeof(RelocEntry); + relocInfo = reinterpret_cast( + REALLOC(relocInfo, n ? size : 0, + size + RELOC_ALLOC_INCREMENT * sizeof(RelocEntry))); + if (!relocInfo) + return false; + } + ++relocInfo->count; + + relocInfo->entry[n].data = data; + relocInfo->entry[n].mask = m; + relocInfo->entry[n].offset = codeSize + w * 4; + relocInfo->entry[n].bitPos = s; + relocInfo->entry[n].type = ty; + + return true; +} + +void +RelocEntry::apply(uint32_t *binary, const RelocInfo *info) const +{ + uint32_t value = 0; + + switch (type) { + case TYPE_CODE: value = info->codePos; break; + case TYPE_BUILTIN: value = info->libPos; break; + case TYPE_DATA: value = info->dataPos; break; + default: + assert(0); + break; + } + value += data; + value = (bitPos < 0) ? 
(value >> -bitPos) : (value << bitPos); + + binary[offset / 4] &= ~mask; + binary[offset / 4] |= value & mask; +} + +} // namespace nv50_ir + + +#include "nv50/codegen/nv50_ir_driver.h" + +extern "C" { + +void +nv50_ir_relocate_code(void *relocData, uint32_t *code, + uint32_t codePos, + uint32_t libPos, + uint32_t dataPos) +{ + nv50_ir::RelocInfo *info = reinterpret_cast(relocData); + + info->codePos = codePos; + info->libPos = libPos; + info->dataPos = dataPos; + + for (unsigned int i = 0; i < info->count; ++i) + info->entry[i].apply(code, info); +} + +void +nv50_ir_get_target_library(uint32_t chipset, + const uint32_t **code, uint32_t *size) +{ + nv50_ir::Target *targ = nv50_ir::Target::create(chipset); + targ->getBuiltinCode(code, size); + nv50_ir::Target::destroy(targ); +} + +} diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_target.h b/src/gallium/drivers/nv50/codegen/nv50_ir_target.h new file mode 100644 index 00000000000..ddde5586890 --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_target.h @@ -0,0 +1,164 @@ + +#ifndef __NV50_IR_TARGET_H__ +#define __NV50_IR_TARGET_H__ + +#include "nv50_ir.h" + +namespace nv50_ir { + +struct RelocInfo; + +struct RelocEntry +{ + enum Type + { + TYPE_CODE, + TYPE_BUILTIN, + TYPE_DATA + }; + + uint32_t data; + uint32_t mask; + uint32_t offset; + int8_t bitPos; + Type type; + + inline void apply(uint32_t *binary, const RelocInfo *info) const; +}; + +struct RelocInfo +{ + uint32_t codePos; + uint32_t libPos; + uint32_t dataPos; + + uint32_t count; + + RelocEntry entry[0]; +}; + +class CodeEmitter +{ +public: + // returns whether the instruction was encodable and written + virtual bool emitInstruction(Instruction *) = 0; + + virtual uint32_t getMinEncodingSize(const Instruction *) const = 0; + + void setCodeLocation(void *, uint32_t size); + inline void *getCodeLocation() const { return code; } + inline uint32_t getCodeSize() const { return codeSize; } + + bool addReloc(RelocEntry::Type, int w, uint32_t data, 
uint32_t m, + int s); + + inline void *getRelocInfo() const { return relocInfo; } + + void prepareEmission(Program *); + void prepareEmission(Function *); + virtual void prepareEmission(BasicBlock *); + + void printBinary() const; + +protected: + uint32_t *code; + uint32_t codeSize; + uint32_t codeSizeLimit; + + RelocInfo *relocInfo; +}; + +class Target +{ +public: + static Target *create(uint32_t chipset); + static void destroy(Target *); + + // 0x50 and 0x84 to 0xaf for nv50 + // 0xc0 to 0xdf for nvc0 + inline uint32_t getChipset() const { return chipset; } + + virtual CodeEmitter *getCodeEmitter(Program::Type) = 0; + + // Drivers should upload this so we can use it from all programs. + // The address chosen is supplied to the relocation routine. + virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const = 0; + + virtual bool runLegalizePass(Program *, CGStage stage) const = 0; + +public: + struct OpInfo + { + OpInfo *variants; + operation op; + uint16_t srcTypes; + uint16_t dstTypes; + uint32_t immdBits; + uint8_t srcNr; + uint8_t srcMods[3]; + uint8_t dstMods; + uint8_t srcFiles[3]; + uint8_t dstFiles; + unsigned int minEncSize : 4; + unsigned int vector : 1; + unsigned int predicate : 1; + unsigned int commutative : 1; + unsigned int pseudo : 1; + unsigned int flow : 1; + unsigned int hasDest : 1; + unsigned int terminator : 1; + }; + + inline const OpInfo& getOpInfo(const Instruction *) const; + inline const OpInfo& getOpInfo(const operation) const; + + inline DataFile nativeFile(DataFile f) const; + + virtual bool insnCanLoad(const Instruction *insn, int s, + const Instruction *ld) const = 0; + virtual bool isOpSupported(operation, DataType) const = 0; + virtual bool isModSupported(const Instruction *, + int s, Modifier) const = 0; + virtual bool isSatSupported(const Instruction *) const = 0; + virtual bool mayPredicate(const Instruction *, + const Value *) const = 0; + + virtual int getLatency(const Instruction *) const { return 1; } + 
virtual int getThroughput(const Instruction *) const { return 1; } + + virtual unsigned int getFileSize(DataFile) const = 0; + virtual unsigned int getFileUnit(DataFile) const = 0; + + virtual uint32_t getSVAddress(DataFile, const Symbol *) const = 0; + +public: + bool joinAnterior; // true if join is executed before the op + + static const uint8_t operationSrcNr[OP_LAST + 1]; + +protected: + uint32_t chipset; + + DataFile nativeFileMap[DATA_FILE_COUNT]; + + OpInfo opInfo[OP_LAST + 1]; +}; + +const Target::OpInfo& Target::getOpInfo(const Instruction *insn) const +{ + return opInfo[MIN2(insn->op, OP_LAST)]; +} + +const Target::OpInfo& Target::getOpInfo(const operation op) const +{ + return opInfo[op]; +} + +inline DataFile Target::nativeFile(DataFile f) const +{ + return nativeFileMap[f]; +} + +} // namespace nv50_ir + +#endif // __NV50_IR_TARGET_H__ diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_util.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_util.cpp new file mode 100644 index 00000000000..97f47a3ddbc --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_util.cpp @@ -0,0 +1,253 @@ + +#include "nv50_ir_util.h" + +namespace nv50_ir { + +void DLList::clear() +{ + for (Item *next, *item = head.next; item != &head; item = next) { + next = item->next; + delete item; + } + head.next = head.prev = &head; +} + +void +DLList::Iterator::erase() +{ + Item *rem = pos; + + if (rem == term) + return; + pos = pos->next; + + DLLIST_DEL(rem); + delete rem; +} + +void DLList::Iterator::moveToList(DLList& dest) +{ + Item *item = pos; + + assert(term != &dest.head); + assert(pos != term); + + pos = pos->next; + + DLLIST_DEL(item); + DLLIST_ADDHEAD(&dest.head, item); +} + +bool +DLList::Iterator::insert(void *data) +{ + Item *ins = new Item(data); + + ins->next = pos->next; + ins->prev = pos; + pos->next->prev = ins; + pos->next = ins; + + if (pos == term) + term = ins; + + return true; +} + +void +Stack::moveTo(Stack& that) +{ + unsigned int newSize = this->size + 
that.size; + + while (newSize > that.limit) + that.resize(); + memcpy(&that.array[that.size], &array[0], this->size * sizeof(Item)); + + that.size = newSize; + this->size = 0; +} + +Interval::~Interval() +{ + clear(); +} + +void +Interval::clear() +{ + for (Range *next, *r = head; r; r = next) { + next = r->next; + delete r; + } +} + +bool +Interval::extend(int a, int b) +{ + Range *r, **nextp = &head; + + // NOTE: we need empty intervals for fixed registers + // if (a == b) + // return false; + assert(a <= b); + + for (r = head; r; r = r->next) { + if (b < r->bgn) + break; // insert before + if (a > r->end) { + // insert after + nextp = &r->next; + continue; + } + + // overlap + if (a < r->bgn) { + r->bgn = a; + if (b > r->end) + r->end = b; + r->coalesce(&tail); + return true; + } + if (b > r->end) { + r->end = b; + r->coalesce(&tail); + return true; + } + assert(a >= r->bgn); + assert(b <= r->end); + return true; + } + + (*nextp) = new Range(a, b); + (*nextp)->next = r; + + for (r = (*nextp); r->next; r = r->next); + tail = r; + return true; +} + +bool Interval::contains(int pos) +{ + for (Range *r = head; r && r->bgn <= pos; r = r->next) + if (r->end > pos) + return true; + return false; +} + +bool Interval::overlaps(const Interval &iv) const +{ + for (Range *rA = this->head; rA; rA = rA->next) + for (Range *rB = iv.head; rB; rB = rB->next) + if (rB->bgn < rA->end && + rB->end > rA->bgn) + return true; + return false; +} + +void Interval::unify(Interval &that) +{ + assert(this != &that); + for (Range *next, *r = that.head; r; r = next) { + next = r->next; + this->extend(r->bgn, r->end); + delete r; + } + that.head = NULL; +} + +void Interval::print() const +{ + if (!head) + return; + INFO("[%i %i)", head->bgn, head->end); + for (const Range *r = head->next; r; r = r->next) + INFO(" [%i %i)", r->bgn, r->end); + INFO("\n"); +} + +void +BitSet::andNot(const BitSet &set) +{ + assert(data && set.data); + assert(size >= set.size); + for (unsigned int i = 0; i < 
(set.size + 31) / 32; ++i) + data[i] &= ~set.data[i]; +} + +BitSet& BitSet::operator|=(const BitSet &set) +{ + assert(data && set.data); + assert(size >= set.size); + for (unsigned int i = 0; i < (set.size + 31) / 32; ++i) + data[i] |= set.data[i]; + return *this; +} + +bool BitSet::allocate(unsigned int nBits, bool zero) +{ + if (data && size < nBits) { + FREE(data); + data = NULL; + } + size = nBits; + + if (!data) + data = reinterpret_cast(CALLOC((size + 31) / 32, 4)); + + if (zero) + memset(data, 0, (size + 7) / 8); + else + data[(size + 31) / 32 - 1] = 0; // clear unused bits (e.g. for popCount) + + return data; +} + +unsigned int BitSet::popCount() const +{ + unsigned int count = 0; + + for (unsigned int i = 0; i < (size + 31) / 32; ++i) + if (data[i]) + count += util_bitcount(data[i]); + return count; +} + +void BitSet::fill(uint32_t val) +{ + unsigned int i; + for (i = 0; i < (size + 31) / 32; ++i) + data[i] = val; + if (val) + data[i] &= ~(0xffffffff << (size % 32)); // BE ? +} + +void BitSet::setOr(BitSet *pA, BitSet *pB) +{ + if (!pB) { + *this = *pA; + } else { + for (unsigned int i = 0; i < (size + 31) / 32; ++i) + data[i] = pA->data[i] | pB->data[i]; + } +} + +void BitSet::print() const +{ + unsigned int n = 0; + INFO("BitSet of size %u:\n", size); + for (unsigned int i = 0; i < (size + 31) / 32; ++i) { + uint32_t bits = data[i]; + while (bits) { + int pos = ffs(bits) - 1; + bits &= ~(1 << pos); + INFO(" %i", i * 32 + pos); + ++n; + if ((n % 16) == 0) + INFO("\n"); + } + } + if (n % 16) + INFO("\n"); +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_util.h b/src/gallium/drivers/nv50/codegen/nv50_ir_util.h new file mode 100644 index 00000000000..2ffdcd65568 --- /dev/null +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_util.h @@ -0,0 +1,585 @@ + +#ifndef __NV50_IR_UTIL_H__ +#define __NV50_IR_UTIL_H__ + +#include +#include +#include + +#include "util/u_inlines.h" +#include "util/u_memory.h" + +#define ERROR(args...) 
debug_printf("ERROR: " args) +#define WARN(args...) debug_printf("WARNING: " args) +#define INFO(args...) debug_printf(args) + +#define INFO_DBG(m, f, args...) \ + do { \ + if (m & NV50_IR_DEBUG_##f) \ + debug_printf(args); \ + } while(0) + +#define FATAL(args...) \ + do { \ + fprintf(stderr, args); \ + abort(); \ + } while(0) + + +#define NV50_IR_FUNC_ALLOC_OBJ_DEF(obj, f, args...) \ + new ((f)->getProgram()->mem_##obj.allocate()) obj(f, args) + +#define new_Instruction(f, args...) \ + NV50_IR_FUNC_ALLOC_OBJ_DEF(Instruction, f, args) +#define new_CmpInstruction(f, args...) \ + NV50_IR_FUNC_ALLOC_OBJ_DEF(CmpInstruction, f, args) +#define new_TexInstruction(f, args...) \ + NV50_IR_FUNC_ALLOC_OBJ_DEF(TexInstruction, f, args) +#define new_FlowInstruction(f, args...) \ + NV50_IR_FUNC_ALLOC_OBJ_DEF(FlowInstruction, f, args) + +#define new_LValue(f, args...) \ + NV50_IR_FUNC_ALLOC_OBJ_DEF(LValue, f, args) + + +#define NV50_IR_PROG_ALLOC_OBJ_DEF(obj, p, args...) \ + new ((p)->mem_##obj.allocate()) obj(p, args) + +#define new_Symbol(p, args...) \ + NV50_IR_PROG_ALLOC_OBJ_DEF(Symbol, p, args) +#define new_ImmediateValue(p, args...) 
\ + NV50_IR_PROG_ALLOC_OBJ_DEF(ImmediateValue, p, args) + + +#define delete_Instruction(p, insn) (p)->releaseInstruction(insn) +#define delete_Value(p, val) (p)->releaseValue(val) + + +namespace nv50_ir { + +class Iterator +{ +public: + virtual void next() = 0; + virtual void *get() const = 0; + virtual bool end() const = 0; // if true, get will return 0 +}; + +class ManipIterator : public Iterator +{ +public: + virtual bool insert(void *) = 0; // insert after current position + virtual void erase() = 0; +}; + +// WARNING: do not use a->prev/next for __item or __list + +#define DLLIST_DEL(__item) \ + do { \ + (__item)->prev->next = (__item)->next; \ + (__item)->next->prev = (__item)->prev; \ + (__item)->next = (__item); \ + (__item)->prev = (__item); \ + } while(0) + +#define DLLIST_ADDTAIL(__list, __item) \ + do { \ + (__item)->next = (__list); \ + (__item)->prev = (__list)->prev; \ + (__list)->prev->next = (__item); \ + (__list)->prev = (__item); \ + } while(0) + +#define DLLIST_ADDHEAD(__list, __item) \ + do { \ + (__item)->prev = (__list); \ + (__item)->next = (__list)->next; \ + (__list)->next->prev = (__item); \ + (__list)->next = (__item); \ + } while(0) + +#define DLLIST_MERGE(__listA, __listB, ty) \ + do { \ + ty prevB = (__listB)->prev; \ + (__listA)->prev->next = (__listB); \ + (__listB)->prev->next = (__listA); \ + (__listB)->prev = (__listA)->prev; \ + (__listA)->prev = prevB; \ + } while(0) + +#define DLLIST_FOR_EACH(list, it) \ + for (DLList::Iterator (it) = (list)->iterator(); !(it).end(); (it).next()) + +class DLList +{ +public: + class Item + { + public: + Item(void *priv) : next(this), prev(this), data(priv) { } + + public: + Item *next; + Item *prev; + void *data; + }; + + DLList() : head(0) { } + ~DLList() { clear(); } + + inline void insertHead(void *data) + { + Item *item = new Item(data); + + assert(data); + + item->prev = &head; + item->next = head.next; + head.next->prev = item; + head.next = item; + } + + inline void insertTail(void 
*data) + { + Item *item = new Item(data); + + assert(data); + + DLLIST_ADDTAIL(&head, item); + } + + inline void insert(void *data) { insertTail(data); } + + void clear(); + + class Iterator : public ManipIterator + { + public: + Iterator(Item *head, bool r) : rev(r), pos(r ? head->prev : head->next), + term(head) { } + + virtual void next() { if (!end()) pos = rev ? pos->prev : pos->next; } + virtual void *get() const { return pos->data; } + virtual bool end() const { return pos == term; } + + // caution: if you're at end-2 and erase it, then do next, you're at end + virtual void erase(); + virtual bool insert(void *data); + + // move item to a another list, no consistency with its iterators though + void moveToList(DLList&); + + private: + const bool rev; + Item *pos; + Item *term; + + friend class DLList; + }; + + inline void erase(Iterator& pos) + { + pos.erase(); + } + + Iterator iterator() + { + return Iterator(&head, false); + } + + Iterator revIterator() + { + return Iterator(&head, true); + } + +private: + Item head; +}; + +class Stack +{ +public: + class Item { + public: + union { + void *p; + int i; + unsigned int u; + float f; + double d; + } u; + + Item() { memset(&u, 0, sizeof(u)); } + }; + + Stack() : size(0), limit(0), array(0) { } + ~Stack() { if (array) FREE(array); } + + inline void push(int i) { Item data; data.u.i = i; push(data); } + inline void push(unsigned int u) { Item data; data.u.u = u; push(data); } + inline void push(void *p) { Item data; data.u.p = p; push(data); } + inline void push(float f) { Item data; data.u.f = f; push(data); } + + inline void push(Item data) + { + if (size == limit) + resize(); + array[size++] = data; + } + + inline Item pop() + { + if (!size) { + Item data; + assert(0); + return data; + } + return array[--size]; + } + + inline unsigned int getSize() { return size; } + + inline Item& peek() { assert(size); return array[size - 1]; } + + void clear(bool releaseStorage = false) + { + if (releaseStorage && array) + 
FREE(array); + size = limit = 0; + } + + void moveTo(Stack&); // move all items to target (not like push(pop())) + +private: + void resize() + { + unsigned int sizeOld, sizeNew; + + sizeOld = limit * sizeof(Item); + limit = MAX2(4, limit + limit); + sizeNew = limit * sizeof(Item); + + array = (Item *)REALLOC(array, sizeOld, sizeNew); + } + + unsigned int size; + unsigned int limit; + Item *array; +}; + +class DynArray +{ +public: + class Item + { + public: + union { + uint32_t u32; + void *p; + }; + }; + + DynArray() : data(NULL), size(0) { } + + ~DynArray() { if (data) FREE(data); } + + inline Item& operator[](unsigned int i) + { + if (i >= size) + resize(i); + return data[i]; + } + + inline const Item operator[](unsigned int i) const + { + return data[i]; + } + + void resize(unsigned int index) + { + const unsigned int oldSize = size * sizeof(Item); + + if (!size) + size = 8; + while (size <= index) + size <<= 1; + + data = (Item *)REALLOC(data, oldSize, size * sizeof(Item)); + } + +private: + Item *data; + unsigned int size; +}; + +class ArrayList +{ +public: + ArrayList() : size(0) { } + + void insert(void *item, int& id) + { + id = ids.getSize() ? 
ids.pop().u.i : size++;
      data[id].p = item;
   }

   // Clear slot 'id', remember it on the free-id stack and set id to -1.
   void remove(int& id)
   {
      const unsigned int uid = id;
      assert(uid < size && data[id].p);
      ids.push(uid);
      data[uid].p = NULL;
      id = -1;
   }

   // Number of slots handed out so far (removed slots included).
   inline int getSize() const { return size; }

   inline void *get(unsigned int id) { assert(id < size); return data[id].p; }

   // Iterates over the slots whose pointer is non-NULL.
   class Iterator : public nv50_ir::Iterator
   {
   public:
      Iterator(const ArrayList *array) : pos(0), data(array->data)
      {
         size = array->getSize();
         if (size)
            nextValid();
      }

      // skip forward over empty (NULL) slots
      void nextValid() { while ((pos < size) && !data[pos].p) ++pos; }

      void next() { if (pos < size) { ++pos; nextValid(); } }
      void *get() const { assert(pos < size); return data[pos].p; }
      bool end() const { return pos >= size; }

   private:
      unsigned int pos;
      unsigned int size;     // snapshot taken at construction
      const DynArray& data;

      friend class ArrayList;
   };

   Iterator iterator() const { return Iterator(this); }

private:
   DynArray data;
   Stack ids;            // ids released by remove(), reused by insert()
   unsigned int size;
};

// Sorted singly linked list of [bgn, end) ranges.
class Interval
{
public:
   Interval() : head(0), tail(0) { }
   ~Interval();

   bool extend(int, int);
   void unify(Interval&); // clears source interval
   void clear();

   // begin()/end() return -1 when the respective pointer is NULL
   inline int begin() { return head ? head->bgn : -1; }
   inline int end() { checkTail(); return tail ? 
tail->end : -1; } + inline bool isEmpty() const { return !head; } + bool overlaps(const Interval&) const; + bool contains(int pos); + + void print() const; + + inline void checkTail() const; + +private: + class Range + { + public: + Range(int a, int b) : next(0), bgn(a), end(b) { } + + Range *next; + int bgn; + int end; + + void coalesce(Range **ptail) + { + Range *rnn; + + while (next && end >= next->bgn) { + assert(bgn <= next->bgn); + rnn = next->next; + end = MAX2(end, next->end); + delete next; + next = rnn; + } + if (!next) + *ptail = this; + } + }; + + Range *head; + Range *tail; +}; + +class BitSet +{ +public: + BitSet() : marker(false), data(0), size(0) { } + BitSet(unsigned int nBits, bool zero) : marker(false), data(0), size(0) + { + allocate(nBits, zero); + } + ~BitSet() + { + if (data) + FREE(data); + } + + bool allocate(unsigned int nBits, bool zero); + + inline unsigned int getSize() const { return size; } + + void fill(uint32_t val); + + void setOr(BitSet *, BitSet *); // second BitSet may be NULL + + inline void set(unsigned int i) + { + assert(i < size); + data[i / 32] |= 1 << (i % 32); + } + + inline void clr(unsigned int i) + { + assert(i < size); + data[i / 32] &= ~(1 << (i % 32)); + } + + inline bool test(unsigned int i) const + { + assert(i < size); + return data[i / 32] & (1 << (i % 32)); + } + + BitSet& operator|=(const BitSet&); + + BitSet& operator=(const BitSet& set) + { + assert(data && set.data); + assert(size == set.size); + memcpy(data, set.data, (set.size + 7) / 8); + return *this; + } + + void andNot(const BitSet&); + + unsigned int popCount() const; + + void print() const; + +public: + bool marker; // for user + +private: + uint32_t *data; + unsigned int size; +}; + +void Interval::checkTail() const +{ +#if NV50_DEBUG & NV50_DEBUG_PROG_RA + Range *r = head; + while (r->next) + r = r->next; + assert(tail == r); +#endif +} + +class MemoryPool +{ +private: + inline bool enlargeAllocationsArray(const unsigned int id, unsigned int nr) 
{
      // grow the pointer table from 'id' entries to 'id + nr' entries
      const unsigned int size = sizeof(uint8_t *) * id;
      const unsigned int incr = sizeof(uint8_t *) * nr;

      uint8_t **alloc = (uint8_t **)REALLOC(allocArray, size, size + incr);
      if (!alloc)
         return false;
      allocArray = alloc;
      return true;
   }

   // Allocate one more chunk holding (1 << objStepLog2) objects.
   inline bool enlargeCapacity()
   {
      const unsigned int id = count >> objStepLog2;

      uint8_t *const mem = (uint8_t *)MALLOC(objSize << objStepLog2);
      if (!mem)
         return false;

      // the pointer table is extended 32 entries at a time
      if (!(id % 32)) {
         if (!enlargeAllocationsArray(id, 32)) {
            FREE(mem);
            return false;
         }
      }
      allocArray[id] = mem;
      return true;
   }

public:
   // size: bytes per object; incr: log2 of the number of objects per chunk
   MemoryPool(unsigned int size, unsigned int incr) : objSize(size),
                                                      objStepLog2(incr)
   {
      allocArray = NULL;
      released = NULL;
      count = 0;
   }

   ~MemoryPool()
   {
      unsigned int allocCount = (count + (1 << objStepLog2) - 1) >> objStepLog2;
      for (unsigned int i = 0; i < allocCount && allocArray[i]; ++i)
         FREE(allocArray[i]);
      if (allocArray)
         FREE(allocArray);
   }

   // Return storage for one object (NULL on allocation failure). Released
   // objects are reused before new chunk space is consumed.
   void *allocate()
   {
      void *ret;
      const unsigned int mask = (1 << objStepLog2) - 1;

      if (released) {
         ret = released;
         released = *(void **)released;
         return ret;
      }

      // current chunk exhausted (or none allocated yet)
      if (!(count & mask))
         if (!enlargeCapacity())
            return NULL;

      ret = allocArray[count >> objStepLog2] + (count & mask) * objSize;
      ++count;
      return ret;
   }

   // Put an object back; its first bytes are reused as the free-list link.
   void release(void *ptr)
   {
      *(void **)ptr = released;
      released = ptr;
   }

private:
   uint8_t **allocArray; // array (list) of MALLOC allocations

   void *released; // list of released objects

   unsigned int count; // highest allocated object

   const unsigned int objSize;
   const unsigned int objStepLog2;
};

} // namespace nv50_ir

#endif // __NV50_IR_UTIL_H__
diff --git a/src/gallium/drivers/nvc0/Makefile b/src/gallium/drivers/nvc0/Makefile index 3a5314625e6..c41262559cd 100644 --- a/src/gallium/drivers/nvc0/Makefile +++ b/src/gallium/drivers/nvc0/Makefile @@ -3,7 +3,7 @@ include $(TOP)/configs/current LIBNAME = nvc0 -# get C_SOURCES +# get C/CPP_SOURCES 
include Makefile.sources LIBRARY_INCLUDES = \ diff --git a/src/gallium/drivers/nvc0/Makefile.sources b/src/gallium/drivers/nvc0/Makefile.sources index a057f060130..9b1fb97f0cb 100644 --- a/src/gallium/drivers/nvc0/Makefile.sources +++ b/src/gallium/drivers/nvc0/Makefile.sources @@ -22,3 +22,8 @@ C_SOURCES := \ nvc0_push.c \ nvc0_push2.c \ nvc0_query.c + +CPP_SOURCES := \ + codegen/nv50_ir_emit_nvc0.cpp \ + codegen/nv50_ir_lowering_nvc0.cpp \ + codegen/nv50_ir_target_nvc0.cpp diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp new file mode 100644 index 00000000000..2ab06f426e5 --- /dev/null +++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp @@ -0,0 +1,1714 @@ + +#include "nv50_ir_target_nvc0.h" + +namespace nv50_ir { + +// Argh, all these assertions ... + +class CodeEmitterNVC0 : public CodeEmitter +{ +public: + CodeEmitterNVC0(const TargetNVC0 *); + + virtual bool emitInstruction(Instruction *); + virtual uint32_t getMinEncodingSize(const Instruction *) const; + + inline void setProgramType(Program::Type pType) { progType = pType; } + +private: + const TargetNVC0 *targ; + + Program::Type progType; + +private: + void emitForm_A(const Instruction *, uint64_t); + void emitForm_B(const Instruction *, uint64_t); + void emitForm_S(const Instruction *, uint32_t, bool pred); + + void emitPredicate(const Instruction *); + + void setAddress16(const ValueRef&); + void setImmediate(const Instruction *, const int s); // needs op already set + void setImmediateS8(const ValueRef&); + + void emitCondCode(CondCode cc, int pos); + void emitInterpMode(const Instruction *); + void emitLoadStoreType(DataType ty); + void emitCachingMode(CacheMode c); + + void emitShortSrc2(const ValueRef&); + + inline uint8_t getSRegEncoding(const ValueRef&); + + void roundMode_A(const Instruction *); + void roundMode_C(const Instruction *); + void roundMode_CS(const Instruction *); + + void emitNegAbs12(const 
Instruction *); + + void emitNOP(const Instruction *); + + void emitLOAD(const Instruction *); + void emitSTORE(const Instruction *); + void emitMOV(const Instruction *); + + void emitINTERP(const Instruction *); + void emitPFETCH(const Instruction *); + void emitVFETCH(const Instruction *); + void emitEXPORT(const Instruction *); + void emitOUT(const Instruction *); + + void emitUADD(const Instruction *); + void emitFADD(const Instruction *); + void emitUMUL(const Instruction *); + void emitFMUL(const Instruction *); + void emitIMAD(const Instruction *); + void emitFMAD(const Instruction *); + + void emitNOT(Instruction *); + void emitLogicOp(const Instruction *, uint8_t subOp); + void emitPOPC(const Instruction *); + void emitINSBF(const Instruction *); + void emitShift(const Instruction *); + + void emitSFnOp(const Instruction *, uint8_t subOp); + + void emitCVT(Instruction *); + void emitMINMAX(const Instruction *); + void emitPreOp(const Instruction *); + + void emitSET(const CmpInstruction *); + void emitSLCT(const CmpInstruction *); + void emitSELP(const Instruction *); + + void emitTEX(const TexInstruction *); + void emitTEXCSAA(const TexInstruction *); + void emitTXQ(const TexInstruction *); + void emitPIXLD(const TexInstruction *); + + void emitQUADOP(const Instruction *, uint8_t qOp, uint8_t laneMask); + + void emitFlow(const Instruction *); + + inline void defId(const ValueDef&, const int pos); + inline void srcId(const ValueRef&, const int pos); + + inline void srcAddr32(const ValueRef&, const int pos); // address / 4 + + inline void srcId(const ValueRef *, const int pos); + + inline bool isLIMM(const ValueRef&, DataType ty); +}; + +// for better visibility +#define HEX64(h, l) 0x##h##l##ULL + +#define SDATA(a) ((a).rep()->reg.data) +#define DDATA(a) ((a).rep()->reg.data) + +void CodeEmitterNVC0::srcId(const ValueRef& src, const int pos) +{ + code[pos / 32] |= (src.get() ? 
SDATA(src).id : 63) << (pos % 32); +} + +void CodeEmitterNVC0::srcId(const ValueRef *src, const int pos) +{ + code[pos / 32] |= (src ? SDATA(*src).id : 63) << (pos % 32); +} + +void CodeEmitterNVC0::srcAddr32(const ValueRef& src, const int pos) +{ + code[pos / 32] |= (SDATA(src).offset >> 2) << (pos % 32); +} + +void CodeEmitterNVC0::defId(const ValueDef& def, const int pos) +{ + code[pos / 32] |= (def.get() ? DDATA(def).id : 63) << (pos % 32); +} + +bool CodeEmitterNVC0::isLIMM(const ValueRef& ref, DataType ty) +{ + const ImmediateValue *imm = ref.get()->asImm(); + + return imm && (imm->reg.data.u32 & ((ty == TYPE_F32) ? 0xfff : 0xfff00000)); +} + +void +CodeEmitterNVC0::roundMode_A(const Instruction *insn) +{ + switch (insn->rnd) { + case ROUND_M: code[1] |= 1 << 23; break; + case ROUND_P: code[1] |= 2 << 23; break; + case ROUND_Z: code[1] |= 3 << 23; break; + default: + assert(insn->rnd == ROUND_N); + break; + } +} + +void +CodeEmitterNVC0::emitNegAbs12(const Instruction *i) +{ + if (i->src[1].mod.abs()) code[0] |= 1 << 6; + if (i->src[0].mod.abs()) code[0] |= 1 << 7; + if (i->src[1].mod.neg()) code[0] |= 1 << 8; + if (i->src[0].mod.neg()) code[0] |= 1 << 9; +} + +void CodeEmitterNVC0::emitCondCode(CondCode cc, int pos) +{ + uint8_t val; + + switch (cc) { + case CC_LT: val = 0x1; break; + case CC_LTU: val = 0x9; break; + case CC_EQ: val = 0x2; break; + case CC_EQU: val = 0xa; break; + case CC_LE: val = 0x3; break; + case CC_LEU: val = 0xb; break; + case CC_GT: val = 0x4; break; + case CC_GTU: val = 0xc; break; + case CC_NE: val = 0x5; break; + case CC_NEU: val = 0xd; break; + case CC_GE: val = 0x6; break; + case CC_GEU: val = 0xe; break; + case CC_TR: val = 0xf; break; + case CC_FL: val = 0x0; break; + + case CC_A: val = 0x14; break; + case CC_NA: val = 0x13; break; + case CC_S: val = 0x15; break; + case CC_NS: val = 0x12; break; + case CC_C: val = 0x16; break; + case CC_NC: val = 0x11; break; + case CC_O: val = 0x17; break; + case CC_NO: val = 0x10; break; + + 
default: + val = 0; + assert(!"invalid condition code"); + break; + } + code[pos / 32] |= val << (pos % 32); +} + +void +CodeEmitterNVC0::emitPredicate(const Instruction *i) +{ + if (i->predSrc >= 0) { + assert(i->getPredicate()->reg.file == FILE_PREDICATE); + srcId(i->src[i->predSrc], 10); + if (i->cc == CC_NOT_P) + code[0] |= 0x2000; // negate + } else { + code[0] |= 0x1c00; + } +} + +void +CodeEmitterNVC0::setAddress16(const ValueRef& src) +{ + Symbol *sym = src.get()->asSym(); + + assert(sym); + + code[0] |= (sym->reg.data.offset & 0x003f) << 26; + code[1] |= (sym->reg.data.offset & 0xffc0) >> 6; +} + +void +CodeEmitterNVC0::setImmediate(const Instruction *i, const int s) +{ + const ImmediateValue *imm = i->src[s].get()->asImm(); + uint32_t u32; + + assert(imm); + u32 = imm->reg.data.u32; + + if ((code[0] & 0xf) == 0x2) { + // LIMM + code[0] |= (u32 & 0x3f) << 26; + code[1] |= u32 >> 6; + } else + if ((code[0] & 0xf) == 0x3 || (code[0] & 0xf) == 4) { + // integer immediate + assert((u32 & 0xfff00000) == 0 || (u32 & 0xfff00000) == 0xfff00000); + assert(!(code[1] & 0xc000)); + u32 &= 0xfffff; + code[0] |= (u32 & 0x3f) << 26; + code[1] |= 0xc000 | (u32 >> 6); + } else { + // float immediate + assert(!(u32 & 0x00000fff)); + assert(!(code[1] & 0xc000)); + code[0] |= ((u32 >> 12) & 0x3f) << 26; + code[1] |= 0xc000 | (u32 >> 18); + } +} + +void CodeEmitterNVC0::setImmediateS8(const ValueRef &ref) +{ + const ImmediateValue *imm = ref.get()->asImm(); + + int8_t s8 = static_cast(imm->reg.data.s32); + + assert(s8 == imm->reg.data.s32); + + code[0] |= (s8 & 0x3f) << 26; + code[0] |= (s8 >> 6) << 8; +} + +void +CodeEmitterNVC0::emitForm_A(const Instruction *i, uint64_t opc) +{ + code[0] = opc; + code[1] = opc >> 32; + + emitPredicate(i); + + defId(i->def[0], 14); + + int s1 = 26; + if (i->srcExists(2) && i->getSrc(2)->reg.file == FILE_MEMORY_CONST) + s1 = 49; + + for (int s = 0; s < 3 && i->srcExists(s); ++s) { + switch (i->getSrc(s)->reg.file) { + case FILE_MEMORY_CONST: + 
assert(!(code[1] & 0xc000)); + code[1] |= (s == 2) ? 0x8000 : 0x4000; + code[1] |= i->getSrc(s)->reg.fileIndex << 10; + setAddress16(i->src[s]); + break; + case FILE_IMMEDIATE: + assert(s == 1 || + i->op == OP_MOV || i->op == OP_PRESIN || i->op == OP_PREEX2); + assert(!(code[1] & 0xc000)); + setImmediate(i, s); + break; + case FILE_GPR: + if ((s == 2) && ((code[0] & 0x7) == 2)) // LIMM: 3rd src == dst + break; + srcId(i->src[s], s ? ((s == 2) ? 49 : s1) : 20); + break; + default: + // ignore here, can be predicate or flags, but must not be address + break; + } + } +} + +void +CodeEmitterNVC0::emitForm_B(const Instruction *i, uint64_t opc) +{ + code[0] = opc; + code[1] = opc >> 32; + + emitPredicate(i); + + defId(i->def[0], 14); + + switch (i->src[0].getFile()) { + case FILE_MEMORY_CONST: + assert(!(code[1] & 0xc000)); + code[1] |= 0x4000 | (i->src[0].get()->reg.fileIndex << 10); + setAddress16(i->src[0]); + break; + case FILE_IMMEDIATE: + assert(!(code[1] & 0xc000)); + setImmediate(i, 0); + break; + case FILE_GPR: + srcId(i->src[0], 26); + break; + default: + // ignore here, can be predicate or flags, but must not be address + break; + } +} + +void +CodeEmitterNVC0::emitForm_S(const Instruction *i, uint32_t opc, bool pred) +{ + code[0] = opc; + + int ss2a = 0; + if (opc == 0x0d || opc == 0x0e) + ss2a = 2; + + defId(i->def[0], 14); + srcId(i->src[0], 20); + + assert(pred || (i->predSrc < 0)); + if (pred) + emitPredicate(i); + + for (int s = 1; s < 3 && i->srcExists(s); ++s) { + if (i->src[s].get()->reg.file == FILE_MEMORY_CONST) { + assert(!(code[0] & (0x300 >> ss2a))); + switch (i->src[s].get()->reg.fileIndex) { + case 0: code[0] |= 0x100 >> ss2a; break; + case 1: code[0] |= 0x200 >> ss2a; break; + case 16: code[0] |= 0x300 >> ss2a; break; + default: + ERROR("invalid c[] space for short form\n"); + break; + } + if (s == 1) + code[0] |= i->getSrc(s)->reg.data.offset << 24; + else + code[0] |= i->getSrc(s)->reg.data.offset << 6; + } else + if (i->src[s].getFile() == 
FILE_IMMEDIATE) { + assert(s == 1); + setImmediateS8(i->src[s]); + } else + if (i->src[s].getFile() == FILE_GPR) { + srcId(i->src[s], (s == 1) ? 26 : 8); + } + } +} + +void +CodeEmitterNVC0::emitShortSrc2(const ValueRef &src) +{ + if (src.getFile() == FILE_MEMORY_CONST) { + switch (src.get()->reg.fileIndex) { + case 0: code[0] |= 0x100; break; + case 1: code[0] |= 0x200; break; + case 16: code[0] |= 0x300; break; + default: + assert(!"unsupported file index for short op"); + break; + } + srcAddr32(src, 20); + } else { + srcId(src, 20); + assert(src.getFile() == FILE_GPR); + } +} + +void +CodeEmitterNVC0::emitNOP(const Instruction *i) +{ + code[0] = 0x000001e4; + code[1] = 0x40000000; + emitPredicate(i); +} + +void +CodeEmitterNVC0::emitFMAD(const Instruction *i) +{ + bool neg1 = (i->src[0].mod ^ i->src[1].mod).neg(); + + if (i->encSize == 8) { + if (isLIMM(i->src[1], TYPE_F32)) { + emitForm_A(i, HEX64(20000000, 00000002)); + } else { + emitForm_A(i, HEX64(30000000, 00000000)); + + if (i->src[2].mod.neg()) + code[0] |= 1 << 8; + } + roundMode_A(i); + + if (neg1) + code[0] |= 1 << 9; + + if (i->saturate) + code[0] |= 1 << 5; + if (i->ftz) + code[0] |= 1 << 6; + } else { + assert(!i->saturate && !i->src[2].mod.neg()); + emitForm_S(i, (i->src[2].getFile() == FILE_MEMORY_CONST) ? 0x2e : 0x0e, + false); + if (neg1) + code[0] |= 1 << 4; + } +} + +void +CodeEmitterNVC0::emitFMUL(const Instruction *i) +{ + bool neg = (i->src[0].mod ^ i->src[1].mod).neg(); + + assert(i->postFactor >= -3 && i->postFactor <= 3); + + if (i->encSize == 8) { + if (isLIMM(i->src[1], TYPE_F32)) { + assert(i->postFactor == 0); // constant folded, hopefully + emitForm_A(i, HEX64(30000000, 00000002)); + } else { + emitForm_A(i, HEX64(58000000, 00000000)); + roundMode_A(i); + code[1] |= ((i->postFactor > 0) ? 
+ (7 - i->postFactor) : (0 - i->postFactor)) << 17; + } + if (neg) + code[1] ^= 1 << 25; // aliases with LIMM sign bit + + if (i->saturate) + code[0] |= 1 << 5; + + if (i->dnz) + code[0] |= 1 << 7; + else + if (i->ftz) + code[0] |= 1 << 6; + } else { + assert(!neg && !i->saturate && !i->ftz && !i->postFactor); + emitForm_S(i, 0xa8, true); + } +} + +void +CodeEmitterNVC0::emitUMUL(const Instruction *i) +{ + if (i->encSize == 8) { + if (i->src[1].getFile() == FILE_IMMEDIATE) { + emitForm_A(i, HEX64(10000000, 00000002)); + } else { + emitForm_A(i, HEX64(50000000, 00000003)); + } + if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) + code[0] |= 1 << 6; + if (i->sType == TYPE_S32) + code[0] |= 1 << 5; + if (i->dType == TYPE_S32) + code[0] |= 1 << 7; + } else { + emitForm_S(i, i->src[1].getFile() == FILE_IMMEDIATE ? 0xaa : 0x2a, true); + + if (i->sType == TYPE_S32) + code[0] |= 1 << 6; + } +} + +void +CodeEmitterNVC0::emitFADD(const Instruction *i) +{ + if (i->encSize == 8) { + if (isLIMM(i->src[1], TYPE_F32)) { + emitForm_A(i, HEX64(28000000, 00000002)); + + assert(!i->src[1].mod.neg() && !i->src[1].mod.abs() && !i->saturate); + } else { + emitForm_A(i, HEX64(50000000, 00000000)); + + roundMode_A(i); + if (i->saturate) + code[1] |= 1 << 17; + } + emitNegAbs12(i); + + if (i->op == OP_SUB) code[0] ^= 1 << 8; + + if (i->ftz) + code[0] |= 1 << 5; + } else { + assert(!i->saturate && i->op != OP_SUB && + !i->src[0].mod.abs() && + !i->src[1].mod.neg() && !i->src[1].mod.abs()); + + emitForm_S(i, 0x49, true); + + if (i->src[0].mod.neg()) + code[0] |= 1 << 7; + } +} + +void +CodeEmitterNVC0::emitUADD(const Instruction *i) +{ + uint32_t addOp = 0; + + assert(!i->src[0].mod.abs() && !i->src[1].mod.abs()); + assert(!i->src[0].mod.neg() || !i->src[1].mod.neg()); + + if (i->src[0].mod.neg()) + addOp |= 0x200; + if (i->src[1].mod.neg()) + addOp |= 0x100; + if (i->op == OP_SUB) { + addOp ^= 0x100; + assert(addOp != 0x300); // would be add-plus-one + } + + if (i->encSize == 8) { + if 
(isLIMM(i->src[1], TYPE_U32)) { + emitForm_A(i, HEX64(08000000, 00000002)); + if (i->def[1].exists()) + code[1] |= 1 << 26; // write carry + } else { + emitForm_A(i, HEX64(48000000, 00000003)); + if (i->def[1].exists()) + code[1] |= 1 << 16; // write carry + } + code[0] |= addOp; + + if (i->saturate) + code[0] |= 1 << 5; + if (i->flagsSrc >= 0) // add carry + code[0] |= 1 << 6; + } else { + assert(!(addOp & 0x100)); + emitForm_S(i, (addOp >> 3) | + ((i->src[1].getFile() == FILE_IMMEDIATE) ? 0xac : 0x2c), true); + } +} + +// TODO: shl-add +void +CodeEmitterNVC0::emitIMAD(const Instruction *i) +{ + assert(i->encSize == 8); + emitForm_A(i, HEX64(20000000, 00000003)); + + if (isSignedType(i->dType)) + code[0] |= 1 << 7; + if (isSignedType(i->sType)) + code[0] |= 1 << 5; + + code[1] |= i->saturate << 24; + + if (i->flagsDef >= 0) code[1] |= 1 << 16; + if (i->flagsSrc >= 0) code[1] |= 1 << 23; + + if (i->src[2].mod.neg()) code[0] |= 0x10; + if (i->src[1].mod.neg() ^ + i->src[0].mod.neg()) code[0] |= 0x20; + + if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) + code[0] |= 1 << 6; +} + +void +CodeEmitterNVC0::emitNOT(Instruction *i) +{ + assert(i->encSize == 8); + i->src[1].set(i->src[0]); + emitForm_A(i, HEX64(68000000, 000001c3)); +} + +void +CodeEmitterNVC0::emitLogicOp(const Instruction *i, uint8_t subOp) +{ + if (i->encSize == 8) { + if (isLIMM(i->src[1], TYPE_U32)) { + emitForm_A(i, HEX64(38000000, 00000002)); + + if (i->src[2].exists()) + code[1] |= 1 << 26; + } else { + emitForm_A(i, HEX64(68000000, 00000003)); + + if (i->src[2].exists()) + code[1] |= 1 << 16; + } + code[0] |= subOp << 6; + + if (i->src[2].exists()) // carry + code[0] |= 1 << 5; + + if (i->src[0].mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 9; + if (i->src[1].mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 8; + } else { + emitForm_S(i, (subOp << 5) | + ((i->src[1].getFile() == FILE_IMMEDIATE) ? 
0x1d : 0x8d), true); + } +} + +void +CodeEmitterNVC0::emitPOPC(const Instruction *i) +{ + emitForm_A(i, HEX64(54000000, 00000004)); + + if (i->src[0].mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 9; + if (i->src[1].mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 8; +} + +void +CodeEmitterNVC0::emitINSBF(const Instruction *i) +{ + emitForm_A(i, HEX64(28000000, 30000000)); +} + +void +CodeEmitterNVC0::emitShift(const Instruction *i) +{ + if (i->op == OP_SHR) { + emitForm_A(i, HEX64(58000000, 00000003) + | (isSignedType(i->dType) ? 0x20 : 0x00)); + } else { + emitForm_A(i, HEX64(60000000, 00000003)); + } + + if (0) + code[0] |= 1 << 9; // clamp shift amount +} + +void +CodeEmitterNVC0::emitPreOp(const Instruction *i) +{ + if (i->encSize == 8) { + emitForm_B(i, HEX64(60000000, 00000000)); + + if (i->op == OP_PREEX2) + code[0] |= 0x20; + + if (i->src[0].mod.abs()) code[0] |= 1 << 6; + if (i->src[0].mod.neg()) code[0] |= 1 << 8; + } else { + emitForm_S(i, i->op == OP_PREEX2 ? 0x74000008 : 0x70000008, true); + } +} + +void +CodeEmitterNVC0::emitSFnOp(const Instruction *i, uint8_t subOp) +{ + if (i->encSize == 8) { + code[0] = 0x00000000 | (subOp << 26); + code[1] = 0xc8000000; + + emitPredicate(i); + + defId(i->def[0], 14); + srcId(i->src[0], 20); + + assert(i->src[0].getFile() == FILE_GPR); + + if (i->saturate) code[0] |= 1 << 5; + + if (i->src[0].mod.abs()) code[0] |= 1 << 7; + if (i->src[0].mod.neg()) code[0] |= 1 << 9; + } else { + emitForm_S(i, 0x80000008 | (subOp << 26), true); + + assert(!i->src[0].mod.neg()); + if (i->src[0].mod.abs()) code[0] |= 1 << 30; + } +} + +void +CodeEmitterNVC0::emitMINMAX(const Instruction *i) +{ + uint64_t op; + + assert(i->encSize == 8); + + op = (i->op == OP_MIN) ? 0x080e000000000000ULL : 0x081e000000000000ULL; + + if (i->ftz) + op |= 1 << 5; + else + if (!isFloatType(i->dType)) + op |= isSignedType(i->dType) ? 
0x23 : 0x03; + + emitForm_A(i, op); + emitNegAbs12(i); +} + +void +CodeEmitterNVC0::roundMode_C(const Instruction *i) +{ + switch (i->rnd) { + case ROUND_M: code[1] |= 1 << 17; break; + case ROUND_P: code[1] |= 2 << 17; break; + case ROUND_Z: code[1] |= 3 << 17; break; + case ROUND_NI: code[0] |= 1 << 7; break; + case ROUND_MI: code[0] |= 1 << 7; code[1] |= 1 << 17; break; + case ROUND_PI: code[0] |= 1 << 7; code[1] |= 2 << 17; break; + case ROUND_ZI: code[0] |= 1 << 7; code[1] |= 3 << 17; break; + case ROUND_N: break; + default: + assert(!"invalid round mode"); + break; + } +} + +void +CodeEmitterNVC0::roundMode_CS(const Instruction *i) +{ + switch (i->rnd) { + case ROUND_M: + case ROUND_MI: code[0] |= 1 << 16; break; + case ROUND_P: + case ROUND_PI: code[0] |= 2 << 16; break; + case ROUND_Z: + case ROUND_ZI: code[0] |= 3 << 16; break; + default: + break; + } +} + +void +CodeEmitterNVC0::emitCVT(Instruction *i) +{ + const bool f2f = isFloatType(i->dType) && isFloatType(i->sType); + + switch (i->op) { + case OP_CEIL: i->rnd = f2f ? ROUND_PI : ROUND_P; break; + case OP_FLOOR: i->rnd = f2f ? ROUND_MI : ROUND_M; break; + case OP_TRUNC: i->rnd = f2f ? 
ROUND_ZI : ROUND_Z; break; + default: + break; + } + + const bool sat = (i->op == OP_SAT) || i->saturate; + const bool abs = (i->op == OP_ABS) || i->src[0].mod.abs(); + const bool neg = (i->op == OP_NEG) || i->src[0].mod.neg(); + + if (i->encSize == 8) { + emitForm_B(i, HEX64(10000000, 00000004)); + + roundMode_C(i); + + code[0] |= util_logbase2(i->def[0].getSize()) << 20; + code[0] |= util_logbase2(i->src[0].getSize()) << 23; + + if (sat) + code[0] |= 0x20; + if (abs) + code[0] |= 1 << 6; + if (neg && i->op != OP_ABS) + code[0] |= 1 << 8; + + if (i->ftz) + code[1] |= 1 << 23; + + if (isSignedIntType(i->dType)) + code[0] |= 0x080; + if (isSignedIntType(i->sType)) + code[0] |= 0x200; + + if (isFloatType(i->dType)) { + if (!isFloatType(i->sType)) + code[1] |= 0x08000000; + } else { + if (isFloatType(i->sType)) + code[1] |= 0x04000000; + else + code[1] |= 0x0c000000; + } + } else { + if (i->op == OP_CEIL || i->op == OP_FLOOR || i->op == OP_TRUNC) { + code[0] = 0x298; + } else + if (isFloatType(i->dType)) { + if (isFloatType(i->sType)) + code[0] = 0x098; + else + code[0] = 0x088 | (isSignedType(i->sType) ? (1 << 8) : 0); + } else { + assert(isFloatType(i->sType)); + + code[0] = 0x288 | (isSignedType(i->sType) ? 
(1 << 8) : 0); + } + + if (neg) code[0] |= 1 << 16; + if (sat) code[0] |= 1 << 18; + if (abs) code[0] |= 1 << 19; + + roundMode_CS(i); + } +} + +void +CodeEmitterNVC0::emitSET(const CmpInstruction *i) +{ + uint32_t hi; + uint32_t lo = 0; + + if (i->sType == TYPE_F64) + lo = 0x1; + else + if (!isFloatType(i->sType)) + lo = 0x3; + + if (isFloatType(i->dType) || isSignedIntType(i->sType)) + lo |= 0x20; + + switch (i->op) { + case OP_SET_AND: hi = 0x10000000; break; + case OP_SET_OR: hi = 0x10200000; break; + case OP_SET_XOR: hi = 0x10400000; break; + default: + hi = 0x100e0000; + break; + } + emitForm_A(i, (static_cast<uint64_t>(hi) << 32) | lo); // hi is 32 bits wide: widen before shifting by 32 + + if (i->def[0].getFile() == FILE_PREDICATE) { + if (i->sType == TYPE_F32) + code[1] += 0x10000000; + else + code[1] += 0x08000000; + + code[0] &= ~0xfc000; + defId(i->def[0], 17); + if (i->defExists(1)) + defId(i->def[1], 14); + else + code[0] |= 0x1c000; + } + + if (i->ftz) + code[1] |= 1 << 27; + + emitCondCode(i->setCond, 32 + 23); + emitNegAbs12(i); +} + +void +CodeEmitterNVC0::emitSLCT(const CmpInstruction *i) +{ + uint64_t op; + + switch (i->dType) { + case TYPE_S32: + op = HEX64(30000000, 00000023); + break; + case TYPE_U32: + op = HEX64(30000000, 00000003); + break; + case TYPE_F32: + op = HEX64(38000000, 00000000); + break; + default: + assert(!"invalid type for SLCT"); + op = 0; + break; + } + emitForm_A(i, op); + + CondCode cc = i->setCond; + + if (i->src[2].mod.neg()) + cc = reverseCondCode(cc); + + emitCondCode(cc, 32 + 23); + + if (i->ftz) + code[0] |= 1 << 5; +} + +void CodeEmitterNVC0::emitSELP(const Instruction *i) +{ + emitForm_A(i, HEX64(20000000, 00000004)); + + if (i->cc == CC_NOT_P || i->src[2].mod & Modifier(NV50_IR_MOD_NOT)) + code[1] |= 1 << 20; +} + +void CodeEmitterNVC0::emitTEXCSAA(const TexInstruction *i) +{ + code[0] = 0x00000086; + code[1] = 0xd0000000; + + code[1] |= i->tex.r; + code[1] |= i->tex.s << 8; + + if (i->tex.liveOnly) + code[0] |= 1 << 9; + + defId(i->def[0], 14); + srcId(i->src[0], 20); +} 
+ +void +CodeEmitterNVC0::emitTEX(const TexInstruction *i) +{ + code[0] = 0x00000006; + + if (1) + code[0] |= 0x80; // normal/t/p mode = t, XXX: what is this ? + + if (i->tex.liveOnly) + code[0] |= 1 << 9; + + switch (i->op) { + case OP_TEX: code[1] = 0x80000000; break; + case OP_TXB: code[1] = 0x84000000; break; + case OP_TXL: code[1] = 0x86000000; break; + case OP_TXF: code[1] = 0x92000000; break; + case OP_TXG: code[1] = 0xa0000000; break; + case OP_TXD: code[1] = 0xe0000000; break; + default: + assert(!"invalid texture op"); + break; + } + defId(i->def[0], 14); + srcId(i->src[0], 20); + + emitPredicate(i); + + if (i->op == OP_TXG) code[0] |= i->tex.gatherComp << 5; + + code[1] |= i->tex.mask << 14; + + code[1] |= i->tex.r; + code[1] |= i->tex.s << 8; + if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) + code[1] |= 1 << 18; // in 1st source (with array index) + + // texture target: + code[1] |= (i->tex.target.getDim() - 1) << 20; + if (i->tex.target.isCube()) + code[1] += 2 << 20; + if (i->tex.target.isArray()) + code[1] |= 1 << 19; + if (i->tex.target.isShadow()) + code[1] |= 1 << 24; + + int src1 = i->tex.target.getArgCount(); + + if (i->src[src1].getFile() == FILE_IMMEDIATE) { // lzero + if (i->op == OP_TXL) + code[1] &= ~(1 << 26); + else + if (i->op == OP_TXF) + code[1] &= ~(1 << 25); + } + if (i->tex.target == TEX_TARGET_2D_MS || + i->tex.target == TEX_TARGET_2D_MS_ARRAY) + code[1] |= 1 << 23; + + if (i->tex.useOffsets) // in vecSrc0.w + code[1] |= 1 << 22; + + srcId(i->src[src1], 26); +} + +void +CodeEmitterNVC0::emitTXQ(const TexInstruction *i) +{ + code[0] = 0x00000086; + code[1] = 0xc0000000; + + switch (i->tex.query) { + case TXQ_DIMS: code[1] |= 0 << 22; break; + case TXQ_TYPE: code[1] |= 1 << 22; break; + case TXQ_SAMPLE_POSITION: code[1] |= 2 << 22; break; + case TXQ_FILTER: code[1] |= 3 << 22; break; + case TXQ_LOD: code[1] |= 4 << 22; break; + case TXQ_BORDER_COLOUR: code[1] |= 5 << 22; break; + default: + assert(!"invalid texture 
query"); + break; + } + + code[1] |= i->tex.mask << 14; + + code[1] |= i->tex.r; + code[1] |= i->tex.s << 8; + if (i->tex.sIndirectSrc >= 0 || i->tex.rIndirectSrc >= 0) + code[1] |= 1 << 18; + + defId(i->def[0], 14); + srcId(i->src[0], 20); + srcId(i->src[1], 26); + + emitPredicate(i); +} + +void +CodeEmitterNVC0::emitQUADOP(const Instruction *i, uint8_t qOp, uint8_t laneMask) +{ + code[0] = 0x00000000 | (laneMask << 6); + code[1] = 0x48000000 | qOp; + + defId(i->def[0], 14); + srcId(i->src[0], 20); + srcId(i->srcExists(1) ? i->src[1] : i->src[0], 26); + + emitPredicate(i); +} + +void +CodeEmitterNVC0::emitFlow(const Instruction *i) +{ + const FlowInstruction *f = i->asFlow(); + + unsigned mask; // bit 0: predicate, bit 1: target + + code[0] = 0x00000007; + + switch (i->op) { + case OP_BRA: + code[1] = f->absolute ? 0x00000000 : 0x40000000; + if (i->src[0].getFile() == FILE_MEMORY_CONST || + i->src[1].getFile() == FILE_MEMORY_CONST) + code[1] |= 0x4000; + mask = 3; + break; + case OP_CALL: + code[1] = f->absolute ? 
0x10000000 : 0x50000000; + if (i->src[0].getFile() == FILE_MEMORY_CONST) + code[1] |= 0x4000; + mask = 2; + break; + + case OP_EXIT: code[1] = 0x80000000; mask = 1; break; + case OP_RET: code[1] = 0x90000000; mask = 1; break; + case OP_DISCARD: code[1] = 0x98000000; mask = 1; break; + case OP_BREAK: code[1] = 0xa8000000; mask = 1; break; + case OP_CONT: code[1] = 0xb0000000; mask = 1; break; + + case OP_JOINAT: code[1] = 0x60000000; mask = 2; break; + case OP_PREBREAK: code[1] = 0x68000000; mask = 2; break; + case OP_PRECONT: code[1] = 0x70000000; mask = 2; break; + case OP_PRERET: code[1] = 0x78000000; mask = 2; break; + + case OP_QUADON: code[1] = 0xc0000000; mask = 0; break; + case OP_QUADPOP: code[1] = 0xc8000000; mask = 0; break; + case OP_BRKPT: code[1] = 0xd0000000; mask = 0; break; + default: + assert(!"invalid flow operation"); + return; + } + + if (mask & 1) { + emitPredicate(i); + if (i->flagsSrc < 0) + code[0] |= 0x1e0; + } + + if (!f) + return; + + if (f->allWarp) + code[0] |= 1 << 15; + if (f->limit) + code[0] |= 1 << 16; + + if (f->op == OP_CALL) { + if (f->builtin) { + assert(f->absolute); + uint32_t pcAbs = targ->getBuiltinOffset(f->target.builtin); + addReloc(RelocEntry::TYPE_BUILTIN, 0, pcAbs, 0xfc000000, 26); + addReloc(RelocEntry::TYPE_BUILTIN, 1, pcAbs, 0x03ffffff, -6); + } else { + assert(!f->absolute); + int32_t pcRel = f->target.fn->binPos - (codeSize + 8); + code[0] |= (pcRel & 0x3f) << 26; + code[1] |= (pcRel >> 6) & 0x3ffff; + } + } else + if (mask & 2) { + int32_t pcRel = f->target.bb->binPos - (codeSize + 8); + // currently we don't want absolute branches + assert(!f->absolute); + code[0] |= (pcRel & 0x3f) << 26; + code[1] |= (pcRel >> 6) & 0x3ffff; + } +} + +void +CodeEmitterNVC0::emitPFETCH(const Instruction *i) +{ + uint32_t prim = i->src[0].get()->reg.data.u32; + + code[0] = 0x00000006 | ((prim & 0x3f) << 26); + code[1] = 0x00000000 | (prim >> 6); + + emitPredicate(i); + + defId(i->def[0], 14); + srcId(i->src[1], 20); +} + +void 
+CodeEmitterNVC0::emitVFETCH(const Instruction *i) +{ + code[0] = 0x00000006; + code[1] = 0x06000000 | i->src[0].get()->reg.data.offset; + + if (i->perPatch) + code[0] |= 0x100; + if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT) + code[0] |= 0x200; // yes, TCPs can read from *outputs* of other threads + + emitPredicate(i); + + code[0] |= (i->defCount(0xf) - 1) << 5; + + defId(i->def[0], 14); + srcId(i->src[0].getIndirect(0), 20); + srcId(i->src[0].getIndirect(1), 26); // vertex address +} + +void +CodeEmitterNVC0::emitEXPORT(const Instruction *i) +{ + unsigned int size = typeSizeof(i->dType); + + code[0] = 0x00000006 | ((size / 4 - 1) << 5); + code[1] = 0x0a000000 | i->src[0].get()->reg.data.offset; + + assert(size != 12 && !(code[1] & (size - 1))); + + if (i->perPatch) + code[0] |= 0x100; + + emitPredicate(i); + + assert(i->src[1].getFile() == FILE_GPR); + + srcId(i->src[0].getIndirect(0), 20); + srcId(i->src[0].getIndirect(1), 32 + 17); // vertex base address + srcId(i->src[1], 26); +} + +void +CodeEmitterNVC0::emitOUT(const Instruction *i) +{ + code[0] = 0x00000006; + code[1] = 0x1c000000; + + emitPredicate(i); + + defId(i->def[0], 14); // new secret address + srcId(i->src[0], 20); // old secret address, should be 0 initially + + assert(i->src[0].getFile() == FILE_GPR); + + if (i->op == OP_EMIT) + code[0] |= 1 << 5; + if (i->op == OP_RESTART || i->subOp == NV50_IR_SUBOP_EMIT_RESTART) + code[0] |= 1 << 6; + + // vertex stream + if (i->src[1].getFile() == FILE_IMMEDIATE) { + code[1] |= 0xc000; + code[0] |= SDATA(i->src[1]).u32 << 26; + } else { + srcId(i->src[1], 26); + } +} + +void +CodeEmitterNVC0::emitInterpMode(const Instruction *i) +{ + if (i->encSize == 8) { + code[0] |= i->ipa << 6; // TODO: INTERP_SAMPLEID + } else { + if (i->getInterpMode() == NV50_IR_INTERP_SC) + code[0] |= 0x80; + assert(i->op == OP_PINTERP && i->getSampleMode() == 0); + } +} + +void +CodeEmitterNVC0::emitINTERP(const Instruction *i) +{ + const uint32_t base = 
i->getSrc(0)->reg.data.offset; + + if (i->encSize == 8) { + code[0] = 0x00000000; + code[1] = 0xc0000000 | (base & 0xffff); + + if (i->saturate) + code[0] |= 1 << 5; + + if (i->op == OP_PINTERP) + srcId(i->src[1], 26); + else + code[0] |= 0x3f << 26; + + srcId(i->src[0].getIndirect(0), 20); + } else { + assert(i->op == OP_PINTERP); + code[0] = 0x00000009 | ((base & 0xc) << 6) | ((base >> 4) << 26); + srcId(i->src[1], 20); + } + emitInterpMode(i); + + emitPredicate(i); + defId(i->def[0], 14); + + if (i->getSampleMode() == NV50_IR_INTERP_OFFSET) + srcId(i->src[i->op == OP_PINTERP ? 2 : 1], 17); + else + code[1] |= 0x3f << 17; +} + +void +CodeEmitterNVC0::emitLoadStoreType(DataType ty) +{ + uint8_t val; + + switch (ty) { + case TYPE_U8: + val = 0x00; + break; + case TYPE_S8: + val = 0x20; + break; + case TYPE_F16: + case TYPE_U16: + val = 0x40; + break; + case TYPE_S16: + val = 0x60; + break; + case TYPE_F32: + case TYPE_U32: + case TYPE_S32: + val = 0x80; + break; + case TYPE_F64: + case TYPE_U64: + case TYPE_S64: + val = 0xa0; + break; + case TYPE_B128: + val = 0xc0; + break; + default: + val = 0x80; + assert(!"invalid type"); + break; + } + code[0] |= val; +} + +void +CodeEmitterNVC0::emitCachingMode(CacheMode c) +{ + uint32_t val; + + switch (c) { + case CACHE_CA: +// case CACHE_WB: + val = 0x000; + break; + case CACHE_CG: + val = 0x100; + break; + case CACHE_CS: + val = 0x200; + break; + case CACHE_CV: +// case CACHE_WT: + val = 0x300; + break; + default: + val = 0; + assert(!"invalid caching mode"); + break; + } + code[0] |= val; +} + +void +CodeEmitterNVC0::emitSTORE(const Instruction *i) +{ + uint32_t opc; + + switch (i->src[0].getFile()) { + case FILE_MEMORY_GLOBAL: opc = 0x90000000; break; + case FILE_MEMORY_LOCAL: opc = 0xc8000000; break; + case FILE_MEMORY_SHARED: opc = 0xc9000000; break; + default: + assert(!"invalid memory file"); + opc = 0; + break; + } + code[0] = 0x00000005; + code[1] = opc; + + setAddress16(i->src[0]); + srcId(i->src[1], 14); + 
srcId(i->src[0].getIndirect(0), 20); + + emitPredicate(i); + + emitLoadStoreType(i->dType); + emitCachingMode(i->cache); +} + +void +CodeEmitterNVC0::emitLOAD(const Instruction *i) +{ + uint32_t opc; + + code[0] = 0x00000005; + + switch (i->src[0].getFile()) { + case FILE_MEMORY_GLOBAL: opc = 0x80000000; break; + case FILE_MEMORY_LOCAL: opc = 0xc0000000; break; + case FILE_MEMORY_SHARED: opc = 0xc1000000; break; + case FILE_MEMORY_CONST: + if (!i->src[0].isIndirect(0) && typeSizeof(i->dType) == 4) { + emitMOV(i); // not sure if this is any better + return; + } + opc = 0x14000000 | (i->src[0].get()->reg.fileIndex << 10); + code[0] = 0x00000006 | (i->subOp << 8); + break; + default: + assert(!"invalid memory file"); + opc = 0; + break; + } + code[1] = opc; + + defId(i->def[0], 14); + + setAddress16(i->src[0]); + srcId(i->src[0].getIndirect(0), 20); + + emitPredicate(i); + + emitLoadStoreType(i->dType); + emitCachingMode(i->cache); +} + +uint8_t +CodeEmitterNVC0::getSRegEncoding(const ValueRef& ref) +{ + switch (SDATA(ref).sv.sv) { + case SV_LANEID: return 0x00; + case SV_PHYSID: return 0x03; + case SV_VERTEX_COUNT: return 0x10; + case SV_INVOCATION_ID: return 0x11; + case SV_YDIR: return 0x12; + case SV_TID: return 0x21 + SDATA(ref).sv.index; + case SV_CTAID: return 0x25 + SDATA(ref).sv.index; + case SV_NTID: return 0x29 + SDATA(ref).sv.index; + case SV_GRIDID: return 0x2c; + case SV_NCTAID: return 0x2d + SDATA(ref).sv.index; + case SV_LBASE: return 0x34; + case SV_SBASE: return 0x30; + case SV_CLOCK: return 0x50 + SDATA(ref).sv.index; + default: + assert(!"no sreg for system value"); + return 0; + } +} + +void +CodeEmitterNVC0::emitMOV(const Instruction *i) +{ + if (i->src[0].getFile() == FILE_SYSTEM_VALUE) { + uint8_t sr = getSRegEncoding(i->src[0]); + + if (i->encSize == 8) { + code[0] = 0x00000004 | (sr << 26); + code[1] = 0x2c000000; + } else { + code[0] = 0x40000008 | (sr << 20); + } + defId(i->def[0], 14); + + emitPredicate(i); + } else + if (i->encSize == 8) 
{ + uint64_t opc; + + if (i->src[0].getFile() == FILE_IMMEDIATE) + opc = HEX64(18000000, 000001e2); + else + if (i->src[0].getFile() == FILE_PREDICATE) + opc = HEX64(080e0000, 1c000004); + else + opc = HEX64(28000000, 00000004); + + opc |= i->lanes << 5; + + emitForm_B(i, opc); + } else { + uint32_t imm; + + if (i->src[0].getFile() == FILE_IMMEDIATE) { + imm = SDATA(i->src[0]).u32; + if (imm & 0xfff00000) { + assert(!(imm & 0x000fffff)); + code[0] = 0x00000318 | imm; + } else { + assert(imm < 0x800 || ((int32_t)imm >= -0x800)); + code[0] = 0x00000118 | (imm << 20); + } + } else { + code[0] = 0x0028; + emitShortSrc2(i->src[0]); + } + defId(i->def[0], 14); + + emitPredicate(i); + } +} + +bool +CodeEmitterNVC0::emitInstruction(Instruction *insn) +{ + if (!insn->encSize) { + ERROR("skipping unencodable instruction: "); insn->print(); + return false; + } else + if (codeSize + insn->encSize > codeSizeLimit) { + ERROR("code emitter output buffer too small\n"); + return false; + } + + // assert that instructions with multiple defs don't corrupt registers + for (int d = 0; insn->defExists(d); ++d) + assert(insn->asTex() || insn->def[d].rep()->reg.data.id >= 0); + + switch (insn->op) { + case OP_MOV: + case OP_RDSV: + emitMOV(insn); + break; + case OP_NOP: + break; + case OP_LOAD: + emitLOAD(insn); + break; + case OP_STORE: + emitSTORE(insn); + break; + case OP_LINTERP: + case OP_PINTERP: + emitINTERP(insn); + break; + case OP_VFETCH: + emitVFETCH(insn); + break; + case OP_EXPORT: + emitEXPORT(insn); + break; + case OP_PFETCH: + emitPFETCH(insn); + break; + case OP_EMIT: + case OP_RESTART: + emitOUT(insn); + break; + case OP_ADD: + case OP_SUB: + if (isFloatType(insn->dType)) + emitFADD(insn); + else + emitUADD(insn); + break; + case OP_MUL: + if (isFloatType(insn->dType)) + emitFMUL(insn); + else + emitUMUL(insn); + break; + case OP_MAD: + case OP_FMA: + if (isFloatType(insn->dType)) + emitFMAD(insn); + else + emitIMAD(insn); + break; + case OP_NOT: + emitNOT(insn); + 
break; + case OP_AND: + emitLogicOp(insn, 0); + break; + case OP_OR: + emitLogicOp(insn, 1); + break; + case OP_XOR: + emitLogicOp(insn, 2); + break; + case OP_SHL: + case OP_SHR: + emitShift(insn); + break; + case OP_SET: + case OP_SET_AND: + case OP_SET_OR: + case OP_SET_XOR: + emitSET(insn->asCmp()); + break; + case OP_SELP: + emitSELP(insn); + break; + case OP_SLCT: + emitSLCT(insn->asCmp()); + break; + case OP_MIN: + case OP_MAX: + emitMINMAX(insn); + break; + case OP_ABS: + case OP_NEG: + case OP_CEIL: + case OP_FLOOR: + case OP_TRUNC: + case OP_CVT: + case OP_SAT: + emitCVT(insn); + break; + case OP_RSQ: + emitSFnOp(insn, 5); + break; + case OP_RCP: + emitSFnOp(insn, 4); + break; + case OP_LG2: + emitSFnOp(insn, 3); + break; + case OP_EX2: + emitSFnOp(insn, 2); + break; + case OP_SIN: + emitSFnOp(insn, 1); + break; + case OP_COS: + emitSFnOp(insn, 0); + break; + case OP_PRESIN: + case OP_PREEX2: + emitPreOp(insn); + break; + case OP_TEX: + case OP_TXB: + case OP_TXL: + case OP_TXD: + case OP_TXF: + emitTEX(insn->asTex()); + break; + case OP_TXQ: + emitTXQ(insn->asTex()); + break; + case OP_BRA: + case OP_CALL: + case OP_PRERET: + case OP_RET: + case OP_DISCARD: + case OP_EXIT: + case OP_PRECONT: + case OP_CONT: + case OP_PREBREAK: + case OP_BREAK: + case OP_JOINAT: + case OP_BRKPT: + case OP_QUADON: + case OP_QUADPOP: + emitFlow(insn); + break; + case OP_QUADOP: + emitQUADOP(insn, insn->subOp, insn->lanes); + break; + case OP_DFDX: + emitQUADOP(insn, insn->src[0].mod.neg() ? 0x66 : 0x99, 0x4); + break; + case OP_DFDY: + emitQUADOP(insn, insn->src[0].mod.neg() ? 
0x5a : 0xa5, 0x5); + break; + case OP_POPCNT: + emitPOPC(insn); + break; + case OP_JOIN: + emitNOP(insn); + insn->join = 1; + break; + case OP_PHI: + case OP_UNION: + case OP_CONSTRAINT: + ERROR("operation should have been eliminated\n"); + return false; + case OP_EXP: + case OP_LOG: + case OP_SQRT: + case OP_POW: + ERROR("operation should have been lowered\n"); + return false; + default: + ERROR("unknown op\n"); + return false; + } + + if (insn->join) { + code[0] |= 0x10; + assert(insn->encSize == 8); + } + + code += insn->encSize / 4; + codeSize += insn->encSize; + return true; +} + +uint32_t +CodeEmitterNVC0::getMinEncodingSize(const Instruction *i) const +{ + const Target::OpInfo &info = targ->getOpInfo(i); + + if (info.minEncSize == 8 || 1) + return 8; + + if (i->ftz || i->saturate || i->join) + return 8; + if (i->rnd != ROUND_N) + return 8; + if (i->predSrc >= 0 && i->op == OP_MAD) + return 8; + + if (i->op == OP_PINTERP) { + if (i->getSampleMode() || 1) // XXX: grr, short op doesn't work + return 8; + } else + if (i->op == OP_MOV && i->lanes != 0xf) { + return 8; + } + + for (int s = 0; i->srcExists(s); ++s) { + if (i->src[s].isIndirect(0)) + return 8; + + if (i->src[s].getFile() == FILE_MEMORY_CONST) { + if (SDATA(i->src[s]).offset >= 0x100) + return 8; + if (i->getSrc(s)->reg.fileIndex > 1 && + i->getSrc(s)->reg.fileIndex != 16) + return 8; + } else + if (i->src[s].getFile() == FILE_IMMEDIATE) { + if (i->dType == TYPE_F32) { + if (SDATA(i->src[s]).u32 >= 0x100) + return 8; + } else { + if (SDATA(i->src[s]).u32 > 0xff) + return 8; + } + } + + if (i->op == OP_CVT) + continue; + if (i->src[s].mod != Modifier(0)) { + if (i->src[s].mod == Modifier(NV50_IR_MOD_ABS)) + if (i->op != OP_RSQ) + return 8; + if (i->src[s].mod == Modifier(NV50_IR_MOD_NEG)) + if (i->op != OP_ADD || s != 0) + return 8; + } + } + + return 4; +} + +CodeEmitterNVC0::CodeEmitterNVC0(const TargetNVC0 *target) : targ(target) +{ + code = NULL; + codeSize = codeSizeLimit = 0; + relocInfo = NULL; +} 
+ +CodeEmitter * +TargetNVC0::getCodeEmitter(Program::Type type) +{ + CodeEmitterNVC0 *emit = new CodeEmitterNVC0(this); + emit->setProgramType(type); + return emit; +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp new file mode 100644 index 00000000000..de73efcc56a --- /dev/null +++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp @@ -0,0 +1,705 @@ + +#include "nv50/codegen/nv50_ir.h" +#include "nv50/codegen/nv50_ir_build_util.h" + +#include "nv50_ir_target_nvc0.h" + +namespace nv50_ir { + +#define QOP_ADD 0 +#define QOP_SUBR 1 +#define QOP_SUB 2 +#define QOP_MOV2 3 + +#define QUADOP(q, r, s, t) \ + ((QOP_##q << 0) | (QOP_##r << 2) | \ + (QOP_##s << 4) | (QOP_##t << 6)) + +class NVC0LegalizeSSA : public Pass +{ +private: + virtual bool visit(BasicBlock *); + virtual bool visit(Function *); + + // we want to insert calls to the builtin library only after optimization + void handleDIV(Instruction *); // integer division, modulus + void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt + +private: + BuildUtil bld; +}; + +void +NVC0LegalizeSSA::handleDIV(Instruction *i) +{ + FlowInstruction *call; + int builtin; + Value *def[2]; + + bld.setPosition(i, false); + def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0); + def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0); + switch (i->dType) { + case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break; + case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break; + default: + return; + } + call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL); + bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]); + bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2); + bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 
0xf : 0x3, 0); + + call->fixed = 1; + call->absolute = call->builtin = 1; + call->target.builtin = builtin; + delete_Instruction(prog, i); +} + +void +NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) +{ + // TODO +} + +bool +NVC0LegalizeSSA::visit(Function *fn) +{ + bld.setProgram(fn->getProgram()); + return true; +} + +bool +NVC0LegalizeSSA::visit(BasicBlock *bb) +{ + Instruction *next; + for (Instruction *i = bb->getEntry(); i; i = next) { + next = i->next; + if (i->dType == TYPE_F32) + continue; + switch (i->op) { + case OP_DIV: + case OP_MOD: + handleDIV(i); + break; + case OP_RCP: + case OP_RSQ: + if (i->dType == TYPE_F64) + handleRCPRSQ(i); + break; + default: + break; + } + } + return true; +} + +class NVC0LegalizePostRA : public Pass +{ +private: + virtual bool visit(Function *); + virtual bool visit(BasicBlock *); + + void replaceZero(Instruction *); + void split64BitOp(Instruction *); + bool tryReplaceContWithBra(BasicBlock *); + void propagateJoin(BasicBlock *); + + LValue *r63; +}; + +bool +NVC0LegalizePostRA::visit(Function *fn) +{ + r63 = new_LValue(fn, FILE_GPR); + r63->reg.data.id = 63; + return true; +} + +void +NVC0LegalizePostRA::replaceZero(Instruction *i) +{ + for (int s = 0; i->srcExists(s); ++s) { + ImmediateValue *imm = i->getSrc(s)->asImm(); + if (imm && imm->reg.data.u64 == 0) + i->setSrc(s, r63); + } +} + +void +NVC0LegalizePostRA::split64BitOp(Instruction *i) +{ + if (i->dType == TYPE_F64) { + if (i->op == OP_MAD) + i->op = OP_FMA; + if (i->op == OP_ADD || i->op == OP_MUL || i->op == OP_FMA || + i->op == OP_CVT || i->op == OP_MIN || i->op == OP_MAX || + i->op == OP_SET) + return; + i->dType = i->sType = TYPE_U32; + + i->bb->insertAfter(i, i->clone(true)); // deep cloning + } +} + +// replace CONT with BRA for single unconditional continue +bool +NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb) +{ + if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT) + return false; + Graph::EdgeIterator ei = bb->cfg.incident(); + 
if (ei.getType() != Graph::Edge::BACK) + ei.next(); + if (ei.getType() != Graph::Edge::BACK) + return false; + BasicBlock *contBB = BasicBlock::get(ei.getNode()); + + if (!contBB->getExit() || contBB->getExit()->op != OP_CONT || + contBB->getExit()->getPredicate()) + return false; + contBB->getExit()->op = OP_BRA; + bb->remove(bb->getEntry()); // delete PRECONT + + ei.next(); + assert(ei.end() || ei.getType() != Graph::Edge::BACK); + return true; +} + +// replace branches to join blocks with join ops +void +NVC0LegalizePostRA::propagateJoin(BasicBlock *bb) +{ + if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit) + return; + for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) { + BasicBlock *in = BasicBlock::get(ei.getNode()); + Instruction *exit = in->getExit(); + if (!exit) { + in->insertTail(new FlowInstruction(func, OP_JOIN, bb)); + // there should always be a terminator instruction + WARN("inserted missing terminator in BB:%i\n", in->getId()); + } else + if (exit->op == OP_BRA) { + exit->op = OP_JOIN; + exit->asFlow()->limit = 1; // must-not-propagate marker + } + } + bb->remove(bb->getEntry()); +} + +bool +NVC0LegalizePostRA::visit(BasicBlock *bb) +{ + Instruction *i, *next; + + // remove pseudo operations and non-fixed no-ops, split 64 bit operations + for (i = bb->getFirst(); i; i = next) { + next = i->next; + if (i->op == OP_EMIT || i->op == OP_RESTART) { + if (!i->getDef(0)->refCount()) + i->setDef(0, NULL); + if (i->src[0].getFile() == FILE_IMMEDIATE) + i->setSrc(0, r63); // initial value must be 0 + } else + if (i->isNop()) { + bb->remove(i); + } else { + if (i->op != OP_MOV && i->op != OP_PFETCH) + replaceZero(i); + if (typeSizeof(i->dType) == 8) + split64BitOp(i); + } + } + if (!bb->getEntry()) + return true; + + if (!tryReplaceContWithBra(bb)) + propagateJoin(bb); + + return true; +} + +class NVC0LoweringPass : public Pass +{ +public: + NVC0LoweringPass(Program *); + +private: + virtual bool visit(Function *); + 
virtual bool visit(BasicBlock *); + virtual bool visit(Instruction *); + + bool handleRDSV(Instruction *); + bool handleWRSV(Instruction *); + bool handleEXPORT(Instruction *); + bool handleOUT(Instruction *); + bool handleDIV(Instruction *); + bool handleMOD(Instruction *); + bool handleSQRT(Instruction *); + bool handlePOW(Instruction *); + bool handleTEX(TexInstruction *); + bool handleTXD(TexInstruction *); + bool handleManualTXD(TexInstruction *); + + void checkPredicate(Instruction *); + + void readTessCoord(LValue *dst, int c); + +private: + const Target *const targ; + + BuildUtil bld; + + LValue *gpEmitAddress; +}; + +NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget()) +{ + bld.setProgram(prog); +} + +bool +NVC0LoweringPass::visit(Function *fn) +{ + if (prog->getType() == Program::TYPE_GEOMETRY) { + assert(!strncmp(fn->getName(), "MAIN", 4)); + // TODO: when we generate actual functions pass this value along somehow + bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false); + gpEmitAddress = bld.loadImm(NULL, 0)->asLValue(); + } + return true; +} + +bool +NVC0LoweringPass::visit(BasicBlock *bb) +{ + return true; +} + +// move array source to first slot, convert to u16, add indirections +bool +NVC0LoweringPass::handleTEX(TexInstruction *i) +{ + const int dim = i->tex.target.getDim(); + const int arg = i->tex.target.getDim() + i->tex.target.isArray(); + + // generate and move the tsc/tic/array source to the front + if (dim != arg || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) { + LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa + + Value *arrayIndex = i->tex.target.isArray() ? 
i->getSrc(dim) : NULL; + for (int s = dim; s >= 1; --s) + i->setSrc(s, i->getSrc(s - 1)); + i->setSrc(0, arrayIndex); + + Value *ticRel = i->getIndirectR(); + Value *tscRel = i->getIndirectS(); + + if (arrayIndex) + bld.mkCvt(OP_CVT, TYPE_U16, src, TYPE_F32, arrayIndex); + else + bld.loadImm(src, 0); + + if (ticRel) { + i->setSrc(i->tex.rIndirectSrc, NULL); + bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src); + } + if (tscRel) { + i->setSrc(i->tex.sIndirectSrc, NULL); + bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src); + } + + i->setSrc(0, src); + } + + // offset is last source (lod 1st, dc 2nd) + if (i->tex.useOffsets) { + uint32_t value = 0; + int n, c; + int s = i->srcCount(0xff); + for (n = 0; n < i->tex.useOffsets; ++n) + for (c = 0; c < 3; ++c) + value |= (i->tex.offset[n][c] & 0xf) << (n * 12 + c * 4); + i->setSrc(s, bld.loadImm(NULL, value)); + } + + return true; +} + +bool +NVC0LoweringPass::handleManualTXD(TexInstruction *i) +{ + static const uint8_t qOps[4][2] = + { + { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0 + { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1 + { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2 + { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3 + }; + Value *def[4][4]; + Value *crd[3]; + Instruction *tex; + Value *zero = bld.loadImm(bld.getSSA(), 0); + int l, c; + const int dim = i->tex.target.getDim(); + + i->op = OP_TEX; // no need to clone dPdx/dPdy later + + for (c = 0; c < dim; ++c) + crd[c] = bld.getScratch(); + + bld.mkOp(OP_QUADON, TYPE_NONE, NULL); + for (l = 0; l < 4; ++l) { + // mov coordinates from lane l to all lanes + for (c = 0; c < dim; ++c) + bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero); + // add dPdx from lane l to lanes dx + for (c = 0; c < dim; ++c) + bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]); + // add dPdy from lane l to lanes dy + for (c = 0; c < dim; ++c) + 
bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]); + // texture + bld.insert(tex = i->clone(true)); + for (c = 0; c < dim; ++c) + tex->setSrc(c, crd[c]); + // save results + for (c = 0; i->defExists(c); ++c) { + Instruction *mov; + def[c][l] = bld.getSSA(); + mov = bld.mkMov(def[c][l], tex->getDef(c)); + mov->fixed = 1; + mov->lanes = 1 << l; + } + } + bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL); + + for (c = 0; i->defExists(c); ++c) { + Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c)); + for (l = 0; l < 4; ++l) + u->setSrc(l, def[c][l]); + } + + i->bb->remove(i); + return true; +} + +bool +NVC0LoweringPass::handleTXD(TexInstruction *txd) +{ + int dim = txd->tex.target.getDim(); + int arg = txd->tex.target.getDim() + txd->tex.target.isArray(); + + handleTEX(txd); + if (txd->src[arg].exists()) + ++arg; + + if (dim > 2 || txd->tex.target.isShadow()) + return handleManualTXD(txd); + + // at most s/t/array, x, y, offset + assert(arg <= 4 && !txd->src[arg].exists()); + + for (int c = 0; c < dim; ++c) { + txd->src[arg + c * 2 + 0].set(txd->dPdx[c]); + txd->src[arg + c * 2 + 1].set(txd->dPdy[c]); + txd->dPdx[c] = NULL; + txd->dPdy[c] = NULL; + } + return true; +} + +bool +NVC0LoweringPass::handleWRSV(Instruction *i) +{ + Instruction *st; + Symbol *sym; + uint32_t addr; + + // must replace, $sreg are not writeable + addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym()); + if (addr >= 0x400) + return false; + sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr); + + st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), + i->getSrc(1)); + st->perPatch = i->perPatch; + + bld.getBB()->remove(i); + return true; +} + +void +NVC0LoweringPass::readTessCoord(LValue *dst, int c) +{ + Value *laneid = bld.getSSA(); + Value *x, *y; + + bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0)); + + if (c == 0) { + x = dst; + y = NULL; + } else + if (c == 1) { + x = NULL; + y = dst; + } else { + assert(c == 2); + x = bld.getSSA(); + 
y = bld.getSSA(); + } + if (x) + bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid); + if (y) + bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid); // fetch into y: x is NULL when c == 1 + + if (c == 2) { + bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y); + bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst); + } +} + +bool +NVC0LoweringPass::handleRDSV(Instruction *i) +{ + Symbol *sym = i->getSrc(0)->asSym(); + Value *vtx = NULL; + Instruction *ld; + uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym); + + if (addr >= 0x400) // mov $sreg + return true; + + switch (i->getSrc(0)->reg.data.sv.sv) { + case SV_POSITION: + assert(prog->getType() == Program::TYPE_FRAGMENT); + ld = new_Instruction(func, OP_LINTERP, TYPE_F32); + ld->setDef(0, i->getDef(0)); + ld->setSrc(0, bld.mkSymbol(FILE_SHADER_INPUT, 0, TYPE_F32, addr)); + ld->setInterpolate(NV50_IR_INTERP_LINEAR); + bld.getBB()->insertAfter(i, ld); + break; + case SV_TESS_COORD: + assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL); + readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index); + break; + default: + if (prog->getType() == Program::TYPE_TESSELLATION_EVAL) + vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0)); + ld = bld.mkFetch(i->getDef(0), i->dType, + FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx); + ld->perPatch = i->perPatch; + break; + } + bld.getBB()->remove(i); + return true; +} + +bool +NVC0LoweringPass::handleDIV(Instruction *i) +{ + if (!isFloatType(i->dType)) + return true; + Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1)); + i->op = OP_MUL; + i->setSrc(1, rcp->getDef(0)); + return true; +} + +bool +NVC0LoweringPass::handleMOD(Instruction *i) +{ + if (i->dType != TYPE_F32) + return true; + LValue *value = bld.getScratch(); + bld.mkOp1(OP_RCP, TYPE_F32, value, i->getSrc(1)); + bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(0), value); + bld.mkOp1(OP_TRUNC, TYPE_F32, value, value); + bld.mkOp2(OP_MUL, TYPE_F32, value, 
i->getSrc(1), value); + i->op = OP_SUB; + i->setSrc(1, value); + return true; +} + +bool +NVC0LoweringPass::handleSQRT(Instruction *i) +{ + Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32, + bld.getSSA(), i->getSrc(0)); + i->op = OP_MUL; + i->setSrc(1, rsq->getDef(0)); + + return true; +} + +bool +NVC0LoweringPass::handlePOW(Instruction *i) +{ + LValue *val = bld.getScratch(); + + bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0)); + bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1; + bld.mkOp1(OP_PREEX2, TYPE_F32, val, val); + + i->op = OP_EX2; + i->setSrc(0, val); + i->setSrc(1, NULL); + + return true; +} + +bool +NVC0LoweringPass::handleEXPORT(Instruction *i) +{ + if (prog->getType() == Program::TYPE_FRAGMENT) { + int id = i->getSrc(0)->reg.data.offset / 4; + + if (i->src[0].isIndirect(0)) // TODO, ugly + return false; + i->op = OP_MOV; + i->src[0].set(i->src[1]); + i->setSrc(1, NULL); + i->setDef(0, new_LValue(func, FILE_GPR)); + i->getDef(0)->reg.data.id = id; + + prog->maxGPR = MAX2(prog->maxGPR, id); + } else + if (prog->getType() == Program::TYPE_GEOMETRY) { + i->setIndirect(0, 1, gpEmitAddress); + } + return true; +} + +bool +NVC0LoweringPass::handleOUT(Instruction *i) +{ + if (i->op == OP_RESTART && i->prev && i->prev->op == OP_EMIT) { + i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART; + delete_Instruction(prog, i); + } else { + assert(gpEmitAddress); + i->setDef(0, gpEmitAddress); + if (i->srcExists(0)) + i->setSrc(1, i->getSrc(0)); + i->setSrc(0, gpEmitAddress); + } + return true; +} + +// Generate a binary predicate if an instruction is predicated by +// e.g. an f32 value. 
+void
+NVC0LoweringPass::checkPredicate(Instruction *insn)
+{
+   Value *pred = insn->getPredicate();
+   Value *pdst;
+
+   // nothing to do without a predicate or if it already is a predicate reg
+   if (!pred || pred->reg.file == FILE_PREDICATE)
+      return;
+   pdst = new_LValue(func, FILE_PREDICATE);
+
+   // CAUTION: don't use pdst->getInsn, the definition might not be unique,
+   //  delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
+
+   // pdst = (0 != pred), compared as u32
+   bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, pdst, bld.mkImm(0), pred);
+
+   insn->setPredicate(insn->cc, pdst);
+}
+
+//
+// - add quadop dance for texturing
+// - put FP outputs in GPRs
+// - convert instruction sequences
+//
+// Returns the result of the per-opcode handler, or true for instructions
+// that are handled inline or need no lowering.
+//
+bool
+NVC0LoweringPass::visit(Instruction *i)
+{
+   // Everything emitted while lowering i has to execute before i (e.g. the
+   // PREEX2 feeding EX2 below, or the compare from checkPredicate()), so
+   // always insert directly in front of i. Deriving the position from
+   // i->prev / i->next / i->bb placed new code behind i whenever i was the
+   // first (or only) instruction of its block.
+   bld.setPosition(i, false);
+
+   if (i->cc != CC_ALWAYS)
+      checkPredicate(i);
+
+   switch (i->op) {
+   case OP_TEX:
+   case OP_TXB:
+   case OP_TXL:
+   case OP_TXF:
+   case OP_TXQ:
+   case OP_TXG:
+      return handleTEX(i->asTex());
+   case OP_TXD:
+      return handleTXD(i->asTex());
+   case OP_EX2:
+      // EX2 takes its operand through PREEX2; reuse the def as temporary
+      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
+      i->setSrc(0, i->getDef(0));
+      break;
+   case OP_POW:
+      return handlePOW(i);
+   case OP_DIV:
+      return handleDIV(i);
+   case OP_MOD:
+      return handleMOD(i);
+   case OP_SQRT:
+      return handleSQRT(i);
+   case OP_EXPORT:
+      return handleEXPORT(i);
+   case OP_EMIT:
+   case OP_RESTART:
+      return handleOUT(i);
+   case OP_RDSV:
+      return handleRDSV(i);
+   case OP_WRSV:
+      return handleWRSV(i);
+   case OP_LOAD:
+      // loads from shader inputs are vertex fetches (never in fragment progs)
+      if (i->src[0].getFile() == FILE_SHADER_INPUT) {
+         i->op = OP_VFETCH;
+         assert(prog->getType() != Program::TYPE_FRAGMENT);
+      }
+      break;
+   case OP_PINTERP:
+      // inputs in [0x280, 0x2c0) additionally get the SC interpolation flag
+      if (i->getSrc(0)->reg.data.offset >= 0x280 &&
+          i->getSrc(0)->reg.data.offset < 0x2c0)
+         i->setInterpolate(i->getSampleMode() | NV50_IR_INTERP_SC);
+      break;
+   case OP_LINTERP:
+      // 0x3fc is the address getSVAddress() reports for SV_FACE:
+      // (face << 31) ^ 0xbf800000 is 1.0f if bit 0 is set and -1.0f if not.
+      // These two instructions consume i's result, hence go after i.
+      if (i->getSrc(0)->reg.data.offset == 0x3fc) {
+         Value *face = i->getDef(0);
+         bld.setPosition(i, true);
+         bld.mkOp2(OP_SHL, TYPE_U32, face, face, bld.mkImm(31));
+         bld.mkOp2(OP_XOR, TYPE_U32, face, face, bld.mkImm(0xbf800000));
+      }
+      break;
+   default:
+      break;
+   }
+   return true;
+}
+
+// Run the legalization pass that belongs to the given codegen stage
+// (pre-SSA lowering, SSA legalization or post-RA legalization).
+// Returns the result of the pass, or false for any other stage.
+bool
+TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
+{
+   if (stage == CG_STAGE_PRE_SSA) {
+      NVC0LoweringPass pass(prog);
+      return pass.run(prog, false, true);
+   } else
+   if (stage == CG_STAGE_POST_RA) {
+      NVC0LegalizePostRA pass;
+      return pass.run(prog, false, true);
+   } else
+   if (stage == CG_STAGE_SSA) {
+      NVC0LegalizeSSA pass;
+      return pass.run(prog, false, true);
+   }
+   return false;
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
new file mode 100644
index 00000000000..60b2016878e
--- /dev/null
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
@@ -0,0 +1,568 @@
+
+#include "nv50_ir_target_nvc0.h"
+
+namespace nv50_ir {
+
+// Factory for the NVC0 target; the caller receives a heap allocated object.
+Target *getTargetNVC0(unsigned int chipset)
+{
+   return new TargetNVC0(chipset);
+}
+
+// Remember the chipset id and fill the per-operation info table.
+TargetNVC0::TargetNVC0(unsigned int card)
+{
+   chipset = card;
+   initOpInfo();
+}
+
+// BUILTINS / LIBRARY FUNCTIONS:
+
+// laziness -> will just hardcode everything for the time being
+
+// Will probably make this nicer once we support subroutines properly,
+// i.e. when we have an input IR that provides function declarations.
+
+// Pre-assembled machine code of the library routines, laid out back to back
+// in the order of the NVC0_BUILTIN_* indices. The SIZE notes below are in
+// units of 8 bytes (two 32 bit words) and are what nvc0_builtin_offsets[]
+// is derived from.
+static const uint32_t nvc0_builtin_code[] =
+{
+// DIV U32: slow unsigned integer division
+//
+// UNR recurrence (q = a / b):
+// look for z such that 2^32 - b <= b * z < 2^32
+// then q - 1 <= (a * z) / 2^32 <= q
+//
+// INPUT:   $r0: dividend, $r1: divisor
+// OUTPUT:  $r0: result, $r1: modulus
+// CLOBBER: $r2 - $r3, $p0 - $p1
+// SIZE:    22 / 14 * 8 bytes
+//
+#if 1
+   0x04009c03, 0x78000000,
+   0x7c209cdd,
+   0x0010dd18,
+   0x08309c03, 0x60000000,
+   0x05605c18,
+   0x0810dc2a,
+   0x0c209c43, 0x20040000,
+   0x0810dc03, 0x50000000,
+   0x0c209c43, 0x20040000,
+   0x0810dc03, 0x50000000,
+   0x0c209c43, 0x20040000,
+   0x0810dc03, 0x50000000,
+   0x0c209c43, 0x20040000,
+   0x0810dc03, 0x50000000,
+   0x0c209c43, 0x20040000,
+   0x0000dde4, 0x28000000,
+   0x08001c43, 0x50000000,
+   0x05609c18,
+   0x0010430d,
+   0x0811dc03, 0x1b0e0000,
+   0x08104103, 0x48000000,
+   0x04000002, 0x08000000,
+   0x0811c003, 0x1b0e0000,
+   0x08104103, 0x48000000,
+   0x040000ac,
+   0x90001dff,
+#else
+   0x0401dc03, 0x1b0e0000,
+   0x00008003, 0x78000000,
+   0x0400c003, 0x78000000,
+   0x0c20c103, 0x48000000,
+   0x0c108003, 0x60000000,
+   0x00005c28,
+   0x00001d18,
+   0x0031c023, 0x1b0ec000,
+   0xb000a1e7, 0x40000000,
+   0x04000003, 0x6000c000,
+   0x0813dc03, 0x1b000000,
+   0x0420446c,
+   0x040004bd,
+   0x04208003, 0x5800c000,
+   0x0430c103, 0x4800c000,
+   0x0ffc5dff,
+   0x90001dff,
+#endif
+
+// DIV S32: slow signed integer division
+//
+// INPUT:   $r0: dividend, $r1: divisor
+// OUTPUT:  $r0: result, $r1: modulus
+// CLOBBER: $r2 - $r3, $p0 - $p3
+// SIZE:    18 * 8 bytes
+//
+   0xfc05dc23, 0x188e0000,
+   0xfc17dc23, 0x18c40000,
+   0x03301e18,
+   0x07305e18,
+   0x0401dc03, 0x1b0e0000,
+   0x00008003, 0x78000000,
+   0x0400c003, 0x78000000,
+   0x0c20c103, 0x48000000,
+   0x0c108003, 0x60000000,
+   0x00005c28,
+   0x00001d18,
+   0x0031c023, 0x1b0ec000,
+   0xb000a1e7, 0x40000000,
+   0x04000003, 0x6000c000,
+   0x0813dc03, 0x1b000000,
+   0x0420446c,
+   0x040004bd,
+   0x04208003, 0x5800c000,
+   0x0430c103, 0x4800c000,
+   0x0ffc5dff,
+   0x01700e18,
+   0x05704a18,
+   0x90001dff,
+
+// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
+//
+// INPUT:   $r0d (x)
+// OUTPUT:  $r0d (rcp(x))
+// CLOBBER: $r2 - $r7
+// SIZE:    9 * 8 bytes
+//
+   0x9810dc08,
+   0x00009c28,
+   0x4001df18,
+   0x00019d18,
+   0x08011e01, 0x200c0000,
+   0x10209c01, 0x50000000,
+   0x08011e01, 0x200c0000,
+   0x10209c01, 0x50000000,
+   0x08011e01, 0x200c0000,
+   0x10201c01, 0x50000000,
+   0x00001de7, 0x90000000,
+
+// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
+//
+// INPUT:   $r0d (x)
+// OUTPUT:  $r0d (rsqrt(x))
+// CLOBBER: $r2 - $r7
+// SIZE:    14 * 8 bytes
+//
+   0x9c10dc08,
+   0x00009c28,
+   0x00019d18,
+   0x3fe1df18,
+   0x18001c01, 0x50000000,
+   0x0001dde2, 0x18ffe000,
+   0x08211c01, 0x50000000,
+   0x10011e01, 0x200c0000,
+   0x10209c01, 0x50000000,
+   0x08211c01, 0x50000000,
+   0x10011e01, 0x200c0000,
+   0x10209c01, 0x50000000,
+   0x08211c01, 0x50000000,
+   0x10011e01, 0x200c0000,
+   0x10201c01, 0x50000000,
+   0x00001de7, 0x90000000,
+};
+
+// Byte offset of each builtin inside nvc0_builtin_code[], indexed by
+// NVC0_BUILTIN_*. The summands are the SIZE values (in 8 byte units) of the
+// routines that precede the entry: DIV U32 (22), DIV S32 (18), RCP F64 (9).
+static const uint16_t nvc0_builtin_offsets[NVC0_BUILTIN_COUNT] =
+{
+   0,
+   8 * (22),
+   8 * (22 + 18),
+   8 * (22 + 18 + 9)
+};
+
+// Hand out the builtin library: *code points at the first word, *size is
+// the size of the whole table in bytes.
+void
+TargetNVC0::getBuiltinCode(const uint32_t **code, uint32_t *size) const
+{
+   *code = &nvc0_builtin_code[0];
+   *size = sizeof(nvc0_builtin_code);
+}
+
+// Byte offset of the given NVC0_BUILTIN_* routine inside the builtin code.
+uint32_t
+TargetNVC0::getBuiltinOffset(int builtin) const
+{
+   // builtin indexes a fixed size table, reject both ends of the range
+   assert(builtin >= 0 && builtin < NVC0_BUILTIN_COUNT);
+   return nvc0_builtin_offsets[builtin];
+}
+
+// Compact description of what an operation accepts, consumed by
+// TargetNVC0::initOpInfo(). In mNeg/mAbs/mNot/fConst/fImmd bit s refers to
+// source s (0..2). mSat only uses bit 3 (0x8): saturation of the result.
+struct opProperties
+{
+   operation op;
+   unsigned int mNeg   : 4;
+   unsigned int mAbs   : 4;
+   unsigned int mNot   : 4;
+   unsigned int mSat   : 4;
+   unsigned int fConst : 3;
+   unsigned int fImmd  : 4; // last bit indicates if full immediate is supported
+};
+
+static const struct opProperties _initProps[] =
+{
+   //             neg  abs  not  sat  c[]  imm
+   { OP_ADD,      0x3, 0x3, 0x0, 0x8, 0x2, 0x2 | 0x8 },
+   { OP_SUB,      0x3, 0x3, 0x0, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_MUL,      0x3, 0x0, 0x0, 0x8, 0x2, 0x2 | 0x8 },
+   { OP_MAX,      0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_MIN,      0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_MAD,      0x7, 0x0, 0x0, 0x8, 0x6, 0x2 | 0x8 }, // special c[] constraint
+   { OP_ABS,      0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
+   { OP_NEG,      0x0, 0x1, 0x0, 0x0, 0x1, 0x0 },
+   { OP_CVT,      0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_AND,      0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_OR,       0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_XOR,      0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_SHL,      0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SHR,      0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SET,      0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SLCT,     0x4, 0x0, 0x0, 0x0, 0x6, 0x2 }, // special c[] constraint
+   { OP_PREEX2,   0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
+   { OP_PRESIN,   0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
+   { OP_COS,      0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_SIN,      0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_EX2,      0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_LG2,      0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_RCP,      0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_RSQ,      0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_DFDX,     0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_DFDY,     0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_CALL,     0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
+   { OP_INSBF,    0x0, 0x0, 0x0, 0x0, 0x0, 0x4 },
+   { OP_SET_AND,  0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SET_OR,   0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SET_XOR,  0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   // saturate only:
+   { OP_LINTERP,  0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
+   { OP_PINTERP,  0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
+};
+
+// Fill opInfo[] for every operation: generic defaults first (f32 types,
+// GPR-only operands, no modifiers, 8 byte encoding unless flagged in
+// shortForm), then the noDest list, then the refinements from _initProps.
+void TargetNVC0::initOpInfo()
+{
+   unsigned int i, j;
+
+   // One bit per operation, bit (op % 32) of word (op / 32).
+   // NOTE(review): commutative and shortForm carry the same first word
+   // although their comments list different operations; verify both masks
+   // against the operation enum.
+   static const uint32_t commutative[(OP_LAST + 31) / 32] =
+   {
+      // ADD, MAD, MUL, AND, OR, XOR, MAX, MIN
+      0x0670ca00, 0x0000003f, 0x00000000
+   };
+
+   static const uint32_t shortForm[(OP_LAST + 31) / 32] =
+   {
+      // ADD, MAD, MUL, AND, OR, XOR, PRESIN, PREEX2, SFN, CVT, PINTERP, MOV
+      0x0670ca00, 0x00000000, 0x00000000
+   };
+
+   static const operation noDest[] =
+   {
+      OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT,
+      OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET,
+      OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART,
+      OP_QUADON, OP_QUADPOP
+   };
+
+   joinAnterior = false;
+
+   // every file maps to itself, except that addresses live in GPRs
+   for (i = 0; i < DATA_FILE_COUNT; ++i)
+      nativeFileMap[i] = (DataFile)i;
+   nativeFileMap[FILE_ADDRESS] = FILE_GPR;
+
+   for (i = 0; i < OP_LAST; ++i) {
+      opInfo[i].variants = NULL;
+      opInfo[i].op = (operation)i;
+      opInfo[i].srcTypes = 1 << (int)TYPE_F32;
+      opInfo[i].dstTypes = 1 << (int)TYPE_F32;
+      opInfo[i].immdBits = 0;
+      opInfo[i].srcNr = operationSrcNr[i];
+
+      // only the first srcNr source slots receive defaults
+      for (j = 0; j < opInfo[i].srcNr; ++j) {
+         opInfo[i].srcMods[j] = 0;
+         opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR;
+      }
+      opInfo[i].dstMods = 0;
+      opInfo[i].dstFiles = 1 << (int)FILE_GPR;
+
+      opInfo[i].hasDest = 1;
+      opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA);
+      opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1;
+      opInfo[i].pseudo = (i < OP_MOV);
+      opInfo[i].predicate = !opInfo[i].pseudo;
+      opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN);
+      opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8;
+   }
+   for (i = 0; i < sizeof(noDest) / sizeof(noDest[0]); ++i)
+      opInfo[noDest[i]].hasDest = 0;
+
+   for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) {
+      const struct opProperties *prop = &_initProps[i];
+
+      for (int s = 0; s < 3; ++s) {
+         if (prop->mNeg & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NEG;
+         if (prop->mAbs & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_ABS;
+         if (prop->mNot & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NOT;
+         if (prop->fConst & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_CONST;
+         if (prop->fImmd & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_IMMEDIATE;
+         if (prop->fImmd & 8)
+            opInfo[prop->op].immdBits = 0xffffffff;
+      }
+      if (prop->mSat & 8)
+         opInfo[prop->op].dstMods = NV50_IR_MOD_SAT;
+   }
+}
+
+// Size of a register / memory file as hard-coded for this target.
+// Unknown files assert and report 0.
+unsigned int
+TargetNVC0::getFileSize(DataFile file) const
+{
+   switch (file) {
+   case FILE_NULL:          return 0;
+   case FILE_GPR:           return 63;
+   case FILE_PREDICATE:     return 7;
+   case FILE_FLAGS:         return 1;
+   case FILE_ADDRESS:       return 0;
+   case FILE_IMMEDIATE:     return 0;
+   case FILE_MEMORY_CONST:  return 65536;
+   case FILE_SHADER_INPUT:  return 0x400;
+   case FILE_SHADER_OUTPUT: return 0x400;
+   case FILE_MEMORY_GLOBAL: return 0xffffffff;
+   case FILE_MEMORY_SHARED: return 16 << 10;
+   case FILE_MEMORY_LOCAL:  return 48 << 10;
+   case FILE_SYSTEM_VALUE:  return 32;
+   default:
+      assert(!"invalid file");
+      return 0;
+   }
+}
+
+// Returns 2 for GPR, address and system value files and 0 for all others.
+// NOTE(review): presumably log2 of the allocation unit in bytes; confirm
+// against the Target base class.
+unsigned int
+TargetNVC0::getFileUnit(DataFile file) const
+{
+   if (file == FILE_GPR || file == FILE_ADDRESS || file == FILE_SYSTEM_VALUE)
+      return 2;
+   return 0;
+}
+
+// Map a system value symbol to its address in the shader input / output
+// space. Only SV_PRIMITIVE_ID depends on shaderFile. Semantics without an
+// address yield 0xffffffff.
+uint32_t
+TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const
+{
+   const int idx = sym->reg.data.sv.index;
+   const SVSemantic sv = sym->reg.data.sv.sv;
+
+   const bool isInput = shaderFile == FILE_SHADER_INPUT;
+
+   switch (sv) {
+   case SV_POSITION:       return 0x070 + idx * 4;
+   case SV_INSTANCE_ID:    return 0x2f8;
+   case SV_VERTEX_ID:      return 0x2fc;
+   case SV_PRIMITIVE_ID:   return isInput ? 0x060 : 0x040;
+   case SV_LAYER:          return 0x064;
+   case SV_VIEWPORT_INDEX: return 0x068;
+   case SV_POINT_SIZE:     return 0x06c;
+   case SV_CLIP_DISTANCE:  return 0x2c0 + idx * 4;
+   case SV_POINT_COORD:    return 0x2e0 + idx * 4;
+   case SV_FACE:           return 0x3fc;
+   case SV_TESS_FACTOR:    return 0x000 + idx * 4;
+   case SV_TESS_COORD:     return 0x2f0 + idx * 4;
+   default:
+      return 0xffffffff;
+   }
+}
+
+// Decide whether the value loaded by ld may be used directly as source s of
+// instruction i instead of going through a register.
+//
+// Requirements checked: s is a valid source slot of the operation and that
+// slot accepts the file ld reads from, the load is not indirect, all other
+// sources of i are GPRs, predicates or immediate 0, and an immediate fits
+// the encoding (see below). An immediate 0 is always accepted except for
+// texture, export and store instructions.
+bool
+TargetNVC0::insnCanLoad(const Instruction *i, int s,
+                        const Instruction *ld) const
+{
+   DataFile sf = ld->src[0].getFile();
+
+   // immediate 0 can be represented by GPR $r63
+   if (sf == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0)
+      return (!i->asTex() && i->op != OP_EXPORT && i->op != OP_STORE);
+
+   // valid slots are 0 .. srcNr - 1; srcFiles[srcNr] was never initialized
+   if (s >= opInfo[i->op].srcNr)
+      return false;
+   if (!(opInfo[i->op].srcFiles[s] & (1 << (int)sf)))
+      return false;
+
+   // indirect loads can only be done by OP_LOAD/VFETCH/INTERP on nvc0
+   if (ld->src[0].isIndirect(0))
+      return false;
+
+   for (int k = 0; i->srcExists(k); ++k) {
+      if (i->src[k].getFile() == FILE_IMMEDIATE) {
+         if (i->getSrc(k)->reg.data.u64 != 0)
+            return false;
+      } else
+      if (i->src[k].getFile() != FILE_GPR &&
+          i->src[k].getFile() != FILE_PREDICATE) {
+         return false;
+      }
+   }
+
+   // not all instructions support full 32 bit immediates
+   if (sf == FILE_IMMEDIATE) {
+      Storage &reg = ld->getSrc(0)->asImm()->reg;
+
+      if (opInfo[i->op].immdBits != 0xffffffff) {
+         if (i->sType == TYPE_F32) {
+            // the low 12 bits of an f32 immediate must be zero
+            if (reg.data.u32 & 0xfff)
+               return false;
+         } else
+         if (i->sType == TYPE_S32 || i->sType == TYPE_U32) {
+            // with u32, 0xfffff counts as 0xffffffff as well
+            if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000)
+               return false;
+         }
+      } else
+      if (i->op == OP_MAD || i->op == OP_FMA) {
+         // requires src == dst, cannot decide before RA
+         // (except if we implement more constraints)
+         if (ld->getSrc(0)->asImm()->reg.data.u32 & 0xfff)
+            return false;
+      }
+   }
+
+   return true;
+}
+
+// Report operations that have to be lowered or are not available for the
+// given type: MAD/FMA exist for f32 only, SAD for s32 only, and POW, SQRT,
+// DIV and MOD are never supported directly.
+bool
+TargetNVC0::isOpSupported(operation op, DataType ty) const
+{
+   if ((op == OP_MAD || op == OP_FMA) && (ty != TYPE_F32))
+      return false;
+   if (op == OP_SAD && ty != TYPE_S32)
+      return false;
+   if (op == OP_POW || op == OP_SQRT || op == OP_DIV || op == OP_MOD)
+      return false;
+   return true;
+}
+
+// Check whether source s of insn may carry modifier mod.
+//
+// Non-float instructions only accept modifiers on the operations listed in
+// the switch; integer ADD/SUB additionally refuse a second negation.
+// Otherwise mod must be a subset of the modifiers recorded in opInfo.
+bool
+TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const
+{
+   if (!isFloatType(insn->dType)) {
+      switch (insn->op) {
+      case OP_ABS:
+      case OP_NEG:
+      case OP_CVT:
+      case OP_CEIL:
+      case OP_FLOOR:
+      case OP_TRUNC:
+      case OP_AND:
+      case OP_OR:
+      case OP_XOR:
+         break;
+      case OP_ADD:
+         // only one of the two sources may be negated
+         if (insn->src[s ? 0 : 1].mod.neg())
+            return false;
+         break;
+      case OP_SUB:
+         if (s == 0)
+            return insn->src[1].mod.neg() ? false : true;
+         break;
+      default:
+         return false;
+      }
+   }
+   // initOpInfo() only ever sets up source slots 0..2, so slot 3 must be
+   // rejected as well instead of being looked up in srcMods[].
+   if (s >= 3)
+      return false;
+   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
+}
+
+// An instruction can be predicated if it is not predicated already and its
+// operation is flagged as predicable in opInfo. pred is not inspected.
+bool
+TargetNVC0::mayPredicate(const Instruction *insn, const Value *pred) const
+{
+   if (insn->getPredicate())
+      return false;
+   return opInfo[insn->op].predicate;
+}
+
+// Saturation is available for CVT, for f32 results of operations flagged
+// with NV50_IR_MOD_SAT, and for u32 ADD / MAD.
+bool
+TargetNVC0::isSatSupported(const Instruction *insn) const
+{
+   if (insn->op == OP_CVT)
+      return true;
+   if (!(opInfo[insn->op].dstMods & NV50_IR_MOD_SAT))
+      return false;
+
+   if (insn->dType == TYPE_U32)
+      return (insn->op == OP_ADD) || (insn->op == OP_MAD);
+
+   return insn->dType == TYPE_F32;
+}
+
+// TODO: better values
+// Rough result latency: 700 for loads with CACHE_CV, 48 for other loads,
+// 24 for everything else.
+int TargetNVC0::getLatency(const Instruction *i) const
+{
+   if (i->op == OP_LOAD) {
+      if (i->cache == CACHE_CV)
+         return 700;
+      return 48;
+   }
+   return 24;
+}
+
+// These are "inverse" throughput values, i.e. the number of cycles required
+// to issue a specific instruction for a full warp (32 threads).
+//
+// Assuming we have more than 1 warp in flight, a higher issue latency results
+// in a lower result latency since the MP will have spent more time with other
+// warps.
+// This also helps to determine the number of cycles between instructions in
+// a single warp.
+//
+int TargetNVC0::getThroughput(const Instruction *i) const
+{
+   // TODO: better values
+   if (i->dType == TYPE_F32) {
+      switch (i->op) {
+      // basic arithmetic
+      case OP_ADD:
+      case OP_MUL:
+      case OP_MAD:
+      case OP_FMA:
+         return 1;
+      // conversions, rounding, compare / select, min / max
+      case OP_CVT:
+      case OP_CEIL:
+      case OP_FLOOR:
+      case OP_TRUNC:
+      case OP_SET:
+      case OP_SLCT:
+      case OP_MIN:
+      case OP_MAX:
+         return 2;
+      // the listed ops and every other f32 operation
+      case OP_RCP:
+      case OP_RSQ:
+      case OP_LG2:
+      case OP_SIN:
+      case OP_COS:
+      case OP_PRESIN:
+      case OP_PREEX2:
+      default:
+         return 8;
+      }
+   } else
+   if (i->dType == TYPE_U32 || i->dType == TYPE_S32) {
+      switch (i->op) {
+      // add and bitwise logic
+      case OP_ADD:
+      case OP_AND:
+      case OP_OR:
+      case OP_XOR:
+      case OP_NOT:
+         return 1;
+      // the listed ops and every other integer operation
+      case OP_MUL:
+      case OP_MAD:
+      case OP_CVT:
+      case OP_SET:
+      case OP_SLCT:
+      case OP_SHL:
+      case OP_SHR:
+      case OP_NEG:
+      case OP_ABS:
+      case OP_MIN:
+      case OP_MAX:
+      default:
+         return 2;
+      }
+   } else
+   if (i->dType == TYPE_F64) {
+      return 2;
+   } else {
+      // any other result type
+      return 1;
+   }
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.h b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.h
new file mode 100644
index 00000000000..f96bfbeaa6a
--- /dev/null
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.h
@@ -0,0 +1,46 @@
+
+// NOTE(review): this header has no include guard; including it twice in one
+// translation unit would redefine TargetNVC0.
+#include "nv50/codegen/nv50_ir_target.h"
+
+namespace nv50_ir {
+
+// Indices of the built-in library routines, as accepted by
+// TargetNVC0::getBuiltinOffset().
+#define NVC0_BUILTIN_DIV_U32 0
+#define NVC0_BUILTIN_DIV_S32 1
+#define NVC0_BUILTIN_RCP_F64 2
+#define NVC0_BUILTIN_RSQ_F64 3
+
+// number of built-in routines
+#define NVC0_BUILTIN_COUNT 4
+
+// Target description for NVC0 chipsets.
+class TargetNVC0 : public Target
+{
+public:
+   TargetNVC0(unsigned int chipset);
+
+   virtual CodeEmitter *getCodeEmitter(Program::Type);
+
+   // runs the legalization pass that belongs to the given codegen stage
+   virtual bool runLegalizePass(Program *, CGStage stage) const;
+
+   // returns the built-in library code and its size in bytes
+   virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const;
+
+   // capability queries
+   virtual bool insnCanLoad(const Instruction *insn, int s,
+                            const Instruction *ld) const;
+   virtual bool isOpSupported(operation, DataType) const;
+   virtual bool isModSupported(const Instruction *, int s, Modifier) const;
+   virtual bool isSatSupported(const Instruction *) const;
+   virtual bool mayPredicate(const Instruction *, const Value *) const;
+
+   // scheduling estimates
+   virtual int getLatency(const Instruction *) const;
+   virtual int getThroughput(const Instruction *) const;
+
+   virtual unsigned int getFileSize(DataFile) const;
+   virtual unsigned int getFileUnit(DataFile) const;
+
+   // address of a system value in the given shader input / output file
+   virtual uint32_t getSVAddress(DataFile shaderFile, const Symbol *sv) const;
+
+   // byte offset of an NVC0_BUILTIN_* routine inside the built-in code
+   uint32_t getBuiltinOffset(int builtin) const;
+
+private:
+   // fills the per-operation info table; called from the constructor
+   void initOpInfo();
+
+};
+
+} // namespace nv50_ir
diff --git a/src/gallium/targets/gbm/Makefile b/src/gallium/targets/gbm/Makefile
index 033a1acaaf9..c516588f95f 100644
--- a/src/gallium/targets/gbm/Makefile
+++ b/src/gallium/targets/gbm/Makefile
@@ -118,6 +118,7 @@ pipe_SOURCES += pipe_i965.c
 endif
 
 ifneq ($(findstring nouveau/drm,$(GALLIUM_WINSYS_DIRS)),)
+LDFLAGS += -lstdc++
 pipe_TARGETS += $(PIPE_PREFIX)nouveau.so
 pipe_SOURCES += pipe_nouveau.c
 endif
-- 
2.30.2