From 6f98a3065bce873652e24e3591c3d57192514473 Mon Sep 17 00:00:00 2001
From: Karol Herbst
Date: Sat, 4 Aug 2018 03:13:11 +0200
Subject: [PATCH] gm200/ir: add native OP_SQRT support

./GpuTest /test=pixmark_piano 1024x640 30sec: 301 -> 327 points

shader-db:
total instructions in shared programs : 5472103 -> 5456166 (-0.29%)
total gprs used in shared programs    : 647530 -> 647522 (-0.00%)
total shared used in shared programs  : 389120 -> 389120 (0.00%)
total local used in shared programs   : 21064 -> 21064 (0.00%)
total bytes used in shared programs   : 58459304 -> 58288696 (-0.29%)

                local     shared        gpr       inst      bytes
    helped          0          0         27       8281       8281
      hurt          0          0         21        431        431

v2: use NVISA_GM200_CHIPSET

Reviewed-by: Ilia Mirkin
Signed-off-by: Karol Herbst
---
 .../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp    | 4 +++-
 .../drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 3 +++
 .../drivers/nouveau/codegen/nv50_ir_target_gm107.cpp  | 8 +++++++-
 .../drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp   | 1 +
 4 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 1d31f181e44..5e8c22cd54b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -1409,6 +1409,7 @@ CodeEmitterGM107::emitMUFU()
    case OP_LG2: mufu = 3; break;
    case OP_RCP: mufu = 4 + 2 * insn->subOp; break;
    case OP_RSQ: mufu = 5 + 2 * insn->subOp; break;
+   case OP_SQRT: mufu = 8; break;
    default:
       assert(!"invalid mufu");
       break;
@@ -1418,7 +1419,7 @@ CodeEmitterGM107::emitMUFU()
    emitSAT  (0x32);
    emitNEG  (0x30, insn->src(0));
    emitABS  (0x2e, insn->src(0));
-   emitField(0x14, 3, mufu);
+   emitField(0x14, 4, mufu);
    emitGPR  (0x08, insn->src(0));
    emitGPR  (0x00, insn->def(0));
 }
@@ -3342,6 +3343,7 @@ CodeEmitterGM107::emitInstruction(Instruction *i)
    case OP_LG2:
    case OP_RCP:
    case OP_RSQ:
+   case OP_SQRT:
       emitMUFU();
       break;
    case OP_AND:
diff --git
a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 1410cf26c87..c47d10896ce 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -2752,6 +2752,9 @@ NVC0LoweringPass::handleMOD(Instruction *i)
 bool
 NVC0LoweringPass::handleSQRT(Instruction *i)
 {
+   if (targ->isOpSupported(OP_SQRT, i->dType))
+      return true;
+
    if (i->dType == TYPE_F64) {
       Value *pred = bld.getSSA(1, FILE_PREDICATE);
       Value *zero = bld.loadImm(NULL, 0.0);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
index adbfcc3cfec..c25e6da024d 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
@@ -57,10 +57,13 @@ TargetGM107::isOpSupported(operation op, DataType ty) const
    switch (op) {
    case OP_SAD:
    case OP_POW:
-   case OP_SQRT:
    case OP_DIV:
    case OP_MOD:
       return false;
+   case OP_SQRT:
+      if (ty == TYPE_F64)
+         return false;
+      return chipset >= NVISA_GM200_CHIPSET;
    default:
       break;
    }
@@ -125,6 +128,7 @@ TargetGM107::isBarrierRequired(const Instruction *insn) const
       case OP_RCP:
       case OP_RSQ:
       case OP_SIN:
+      case OP_SQRT:
          return true;
       default:
          break;
@@ -256,6 +260,7 @@ TargetGM107::getLatency(const Instruction *insn) const
    case OP_RCP:
    case OP_RSQ:
    case OP_SIN:
+   case OP_SQRT:
       return 13;
    default:
       break;
@@ -284,6 +289,7 @@ TargetGM107::getReadLatency(const Instruction *insn) const
    case OP_RSQ:
    case OP_SAT:
    case OP_SIN:
+   case OP_SQRT:
    case OP_SULDB:
    case OP_SULDP:
    case OP_SUREDB:
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 7e059235f4c..9304e392361 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -129,6 +129,7 @@ static const struct opProperties _initProps[] =
    { OP_LG2,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
    { OP_RCP,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
    { OP_RSQ,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_SQRT,   0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
    { OP_DFDX,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
    { OP_DFDY,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
    { OP_CALL,   0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
-- 
2.30.2