From 5d6952d2dec53c2660a57408395552629c380d35 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Wed, 13 Jun 2018 16:21:20 +0100 Subject: [PATCH] nv50/ir: add preliminary support for OP_XMAD v4: remove uint16_t(...) v4: don't allow immediates outside [0,65535] in insnCanLoad() Signed-off-by: Rhys Perry Reviewed-by: Karol Herbst --- src/gallium/drivers/nouveau/codegen/nv50_ir.h | 26 +++++++++++++++++++ .../nouveau/codegen/nv50_ir_peephole.cpp | 18 +++++++++++-- .../drivers/nouveau/codegen/nv50_ir_print.cpp | 19 ++++++++++++++ .../nouveau/codegen/nv50_ir_target.cpp | 7 ++--- .../nouveau/codegen/nv50_ir_target_gm107.cpp | 1 + .../nouveau/codegen/nv50_ir_target_nv50.cpp | 1 + .../nouveau/codegen/nv50_ir_target_nvc0.cpp | 18 +++++++++++++ 7 files changed, 85 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h index 0b220cc48de..d5c9570a56b 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -58,6 +58,9 @@ enum operation OP_FMA, OP_SAD, // abs(src0 - src1) + src2 OP_SHLADD, + // extended multiply-add (GM107+), does a lot of things. 
+ // see envytools for detailed documentation + OP_XMAD, OP_ABS, OP_NEG, OP_NOT, @@ -256,6 +259,29 @@ enum operation #define NV50_IR_SUBOP_MINMAX_MED 2 #define NV50_IR_SUBOP_MINMAX_HIGH 3 +// (xmad(src0, src1, 0) << 16) + src2 +#define NV50_IR_SUBOP_XMAD_PSL (1 << 0) +// (xmad(src0, src1, src2) & 0xffff) | (src1 << 16) +#define NV50_IR_SUBOP_XMAD_MRG (1 << 1) +// xmad(src0, src1, src2.lo) +#define NV50_IR_SUBOP_XMAD_CLO (1 << 2) +// xmad(src0, src1, src2.hi) +#define NV50_IR_SUBOP_XMAD_CHI (2 << 2) +// if both operands to the multiplication are non-zero, subtract 65536 for each +// negative operand +#define NV50_IR_SUBOP_XMAD_CSFU (3 << 2) +// xmad(src0, src1, src2) + (src1 << 16) +#define NV50_IR_SUBOP_XMAD_CBCC (4 << 2) +#define NV50_IR_SUBOP_XMAD_CMODE_SHIFT 2 +#define NV50_IR_SUBOP_XMAD_CMODE_MASK (0x7 << NV50_IR_SUBOP_XMAD_CMODE_SHIFT) + +// use the high 16 bits instead of the low 16 bits for the multiplication. +// if the instruction's sType is signed, sign extend the operand from 16 bits +// to 32 before multiplication. 
+#define NV50_IR_SUBOP_XMAD_H1_SHIFT 5 +#define NV50_IR_SUBOP_XMAD_H1(i) (1 << (NV50_IR_SUBOP_XMAD_H1_SHIFT + (i))) +#define NV50_IR_SUBOP_XMAD_H1_MASK (0x3 << NV50_IR_SUBOP_XMAD_H1_SHIFT) + enum DataType { TYPE_NONE, diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 2f7cc206b84..5b4a98d25cb 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -191,9 +191,17 @@ void LoadPropagation::checkSwapSrc01(Instruction *insn) { const Target *targ = prog->getTarget(); - if (!targ->getOpInfo(insn).commutative) - if (insn->op != OP_SET && insn->op != OP_SLCT && insn->op != OP_SUB) + if (!targ->getOpInfo(insn).commutative) { + if (insn->op != OP_SET && insn->op != OP_SLCT && + insn->op != OP_SUB && insn->op != OP_XMAD) return; + // XMAD is only commutative if both the CBCC and MRG flags are not set. + if (insn->op == OP_XMAD && + (insn->subOp & NV50_IR_SUBOP_XMAD_CMODE_MASK) == NV50_IR_SUBOP_XMAD_CBCC) + return; + if (insn->op == OP_XMAD && (insn->subOp & NV50_IR_SUBOP_XMAD_MRG)) + return; + } if (insn->src(1).getFile() != FILE_GPR) return; // This is the special OP_SET used for alphatesting, we can't reverse its @@ -236,6 +244,12 @@ LoadPropagation::checkSwapSrc01(Instruction *insn) if (insn->op == OP_SUB) { insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG); insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG); + } else + if (insn->op == OP_XMAD) { + // swap h1 flags + uint16_t h1 = (insn->subOp >> 1 & NV50_IR_SUBOP_XMAD_H1(0)) | + (insn->subOp << 1 & NV50_IR_SUBOP_XMAD_H1(1)); + insn->subOp = (insn->subOp & ~NV50_IR_SUBOP_XMAD_H1_MASK) | h1; } } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp index ee3506fbaee..7eab8b8d70d 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp +++ 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp @@ -86,6 +86,7 @@ const char *operationStr[OP_LAST + 1] = "fma", "sad", "shladd", + "xmad", "abs", "neg", "not", @@ -240,6 +241,11 @@ static const char *barOpStr[] = "sync", "arrive", "red and", "red or", "red popc" }; +static const char *xmadOpCModeStr[] = +{ + "clo", "chi", "csfu", "cbcc" +}; + static const char *DataTypeStr[] = { "-", @@ -625,6 +631,19 @@ void Instruction::print() const if (subOp < ARRAY_SIZE(barOpStr)) PRINT("%s ", barOpStr[subOp]); break; + case OP_XMAD: { + if (subOp & NV50_IR_SUBOP_XMAD_PSL) + PRINT("psl "); + if (subOp & NV50_IR_SUBOP_XMAD_MRG) + PRINT("mrg "); + unsigned cmode = (subOp & NV50_IR_SUBOP_XMAD_CMODE_MASK); + cmode >>= NV50_IR_SUBOP_XMAD_CMODE_SHIFT; + if (cmode && cmode <= ARRAY_SIZE(xmadOpCModeStr)) + PRINT("%s ", xmadOpCModeStr[cmode - 1]); + for (int i = 0; i < 2; i++) + PRINT("h%d ", (subOp & NV50_IR_SUBOP_XMAD_H1(i)) ? 1 : 0); + break; + } default: if (subOp) PRINT("(SUBOP:%u) ", subOp); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp index 298e7c6ef9b..9193a01f189 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp @@ -30,7 +30,8 @@ const uint8_t Target::operationSrcNr[] = 0, 0, // NOP, PHI 0, 0, 0, 0, // UNION, SPLIT, MERGE, CONSTRAINT 1, 1, 2, // MOV, LOAD, STORE - 2, 2, 2, 2, 2, 3, 3, 3, 3, // ADD, SUB, MUL, DIV, MOD, MAD, FMA, SAD, SHLADD + 2, 2, 2, 2, 2, 3, 3, 3, // ADD, SUB, MUL, DIV, MOD, MAD, FMA, SAD + 3, 3, // SHLADD, XMAD 1, 1, 1, // ABS, NEG, NOT 2, 2, 2, 2, 2, // AND, OR, XOR, SHL, SHR 2, 2, 1, // MAX, MIN, SAT @@ -70,10 +71,10 @@ const OpClass Target::operationClass[] = OPCLASS_MOVE, OPCLASS_LOAD, OPCLASS_STORE, - // ADD, SUB, MUL; DIV, MOD; MAD, FMA, SAD, SHLADD + // ADD, SUB, MUL; DIV, MOD; MAD, FMA, SAD, SHLADD, XMAD OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, - 
OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, + OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, // ABS, NEG; NOT, AND, OR, XOR; SHL, SHR OPCLASS_CONVERT, OPCLASS_CONVERT, OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp index c25e6da024d..2dd12322a89 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp @@ -59,6 +59,7 @@ TargetGM107::isOpSupported(operation op, DataType ty) const case OP_POW: case OP_DIV: case OP_MOD: + case OP_XMAD: return false; case OP_SQRT: if (ty == TYPE_F64) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp index 1ad3467337c..29814973408 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp @@ -443,6 +443,7 @@ TargetNV50::isOpSupported(operation op, DataType ty) const case OP_EXIT: // want exit modifier instead (on NOP if required) case OP_MEMBAR: case OP_SHLADD: + case OP_XMAD: return false; case OP_SAD: return ty == TYPE_S32; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp index 9304e392361..8e040695363 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp @@ -357,6 +357,18 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s, if ((i->op == OP_SHL || i->op == OP_SHR) && typeSizeof(i->sType) == 8 && sf == FILE_MEMORY_CONST) return false; + // constant buffer loads can't be used with cbcc xmads + if (i->op == OP_XMAD && sf == FILE_MEMORY_CONST && + (i->subOp & NV50_IR_SUBOP_XMAD_CMODE_MASK) == NV50_IR_SUBOP_XMAD_CBCC) + return false; 
+ // constant buffer loads for the third operand can't be used with psl/mrg xmads + if (i->op == OP_XMAD && sf == FILE_MEMORY_CONST && s == 2 && + (i->subOp & (NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_MRG))) + return false; + // for xmads, immediates can't have the h1 flag set + if (i->op == OP_XMAD && sf == FILE_IMMEDIATE && s < 2 && + i->subOp & NV50_IR_SUBOP_XMAD_H1(s)) + return false; for (int k = 0; i->srcExists(k); ++k) { if (i->src(k).getFile() == FILE_IMMEDIATE) { @@ -393,6 +405,9 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s, // with u32, 0xfffff counts as 0xffffffff as well if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000) return false; + // XMADs can only have 16-bit immediates + if (i->op == OP_XMAD && reg.data.u32 > 0xffff) + return false; break; case TYPE_U8: case TYPE_S8: @@ -449,6 +464,8 @@ TargetNVC0::isOpSupported(operation op, DataType ty) const return false; if (op == OP_POW || op == OP_SQRT || op == OP_DIV || op == OP_MOD) return false; + if (op == OP_XMAD) + return false; return true; } @@ -468,6 +485,7 @@ TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const case OP_XOR: case OP_POPCNT: case OP_BFIND: + case OP_XMAD: break; case OP_SET: if (insn->sType != TYPE_F32) -- 2.30.2