From cce495572136a606dd2a35e79f45080c3796e2cc Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sun, 1 Jul 2018 09:44:48 +0200 Subject: [PATCH] gm107/ir: add fp64 rsq Acked-by: Ilia Mirkin Cc: 19.0 --- .../drivers/nouveau/codegen/lib/gm107.asm | 78 ++++++++++++++++++- .../drivers/nouveau/codegen/lib/gm107.asm.h | 51 +++++++++++- .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 2 +- 3 files changed, 128 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm index 595d9dc5d41..faee0218d18 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm +++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm @@ -269,8 +269,84 @@ rcp_result_denorm: rcp_end: ret +// RSQ F64 +// +// INPUT: $r0d +// OUTPUT: $r0d +// CLOBBER: $r2 - $r9, $p0 - $p1 +// gm107_rsq_f64: - sched (st 0x0) (st 0x0) (st 0x0) + // Before computing the initial estimate with rsqrt64h, two special cases + // must be handled first. + // 1. NaN: set the highest bit in the mantissa so that rsqrt64h is sure to + // recognize it as NaN + sched (st 0xd wr 0x0 wt 0x3f) (st 0xd wt 0x1) (st 0xd) + dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1 + $p0 lop32i or $r1 $r1 0x00080000 + lop32i and $r2 $r1 0x7fffffff + // 2. Denorms and small normal values: using their original value would + // lose precision either in rsqrt64h or in the first of the Newton-Raphson + // steps below. Take 2 as the threshold for the exponent field, and multiply + // by 2^54 if the exponent is less than or equal to it. 
(The result is multiplied by 2^27 + // at the end to compensate.) + sched (st 0xd) (st 0xd) (st 0xd) + bfe u32 $r3 $r1 0xb14 + isetp le u32 and $p1 1 $r3 0x2 1 + lop or 1 $r2 $r0 $r2 + sched (st 0xd wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xd) + $p1 dmul $r0 $r0 0x4350000000000000 + mufu rsq64h $r5 $r1 + // rsqrt64h gives the correct result for 0/inf/nan. The following logic + // checks whether the input is one of those (exponent is 0x7ff, or all bits + // are 0 except for the sign bit) + iset ne u32 and $r6 $r3 0x7ff 1 + sched (st 0xd) (st 0xd) (st 0xd) + lop and 1 $r2 $r2 $r6 + isetp ne u32 and $p0 1 $r2 0x0 1 + $p0 bra #rsq_norm + // For 0/inf/nan, make sure the sign bit agrees with the input and return + sched (st 0xd) (st 0xd) (st 0xd wt 0x1) + lop32i and $r1 $r1 0x80000000 + mov $r0 0x0 0xf + lop or 1 $r1 $r1 $r5 + sched (st 0xd) (st 0xf) (st 0xf) + ret + nop 0 + nop 0 +rsq_norm: + // For all other inputs, do 4 Newton-Raphson steps with the formula: + // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n}) + // In the code below, each step is written as: + // tmp1 = 0.5 * x * RSQ_{n} + // tmp2 = -RSQ_{n} * tmp1 + 0.5 + // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n} + sched (st 0xd) (st 0xd wr 0x1) (st 0xd wr 0x1 rd 0x0 wt 0x3) + mov $r4 0x0 0xf + // 0x3f000000: 1/2 + f2f f32 f64 $r8 0x3f000000 + dmul $r2 $r0 $r8 + sched (st 0xd wr 0x0 wt 0x3) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + // Multiply the result by 2^27 for small inputs to undo the earlier scaling + sched (st 0xd wr 0x0 wt 0x1) (st 0xd wt 0x1) (st 0xd) + $p1 
dmul $r4 $r4 0x41a0000000000000 + mov $r1 $r5 0xf + mov $r0 $r4 0xf + sched (st 0xd) (st 0xf) (st 0xf) ret nop 0 nop 0 diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h index 206d01bde83..8eb27bbac99 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h +++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h @@ -182,7 +182,56 @@ uint64_t gm107_builtin_code[] = { /* 0x0558: rcp_end */ 0xe32000000007000f, /* 0x0560: gm107_rsq_f64 */ - 0x001f8000fc0007e0, + 0x001fb401fda1ff0d, + 0x368c03fff0070087, + 0x0420008000000101, + 0x0407fffffff70102, + 0x001fb400fda007ed, + 0x38000000b1470103, + 0x366603800027030f, + 0x5c47020000270002, + 0x001fb401e1a0070d, + 0x3880004350010000, + 0x5080000000770105, + 0x365a03807ff70306, + 0x001fb400fda007ed, + 0x5c47000000670202, + 0x5b6a03800ff70207, + 0xe24000000400000f, + 0x003fb400fda007ed, + 0x0408000000070101, + 0x5c9807800ff70000, + 0x5c47020000570101, + 0x001fbc00fde007ed, + 0xe32000000007000f, + 0x50b0000000070f00, + 0x50b0000000070f00, +/* 0x0620: rsq_norm */ + 0x0060b400e5a007ed, + 0x5c9807800ff70004, + 0x38a8003f00070b08, + 0x5c80000000870002, + 0x003c3401e1a01f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x003c3401e1a00f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x003c3401e1a00f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x003c3401e1a00f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x001fb401fda00f0d, + 0x38800041a0010404, + 0x5c98078000570001, + 0x5c98078000470000, + 0x001fbc00fde007ed, 0xe32000000007000f, 0x50b0000000070f00, 0x50b0000000070f00, diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 5d69ce2ab7c..aca3b0afb1e 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -129,7 +129,7 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) bld.mkSplit(src, 4, i->getSrc(0)); int chip = prog->getTarget()->getChipset(); - if (chip >= NVISA_GK104_CHIPSET && (i->op == OP_RCP || chip < NVISA_GM107_CHIPSET)) { + if (chip >= NVISA_GK104_CHIPSET) { handleRCPRSQLib(i, src); return; } -- 2.30.2