From 815a8e59c6d462a7008653ea9e3010d40b6ba589 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 30 Jun 2018 23:09:27 +0200 Subject: [PATCH] gm107/ir: add fp64 rcp Acked-by: Ilia Mirkin Cc: 19.0 --- .../drivers/nouveau/codegen/lib/gm107.asm | 169 +++++++++++++++++- .../drivers/nouveau/codegen/lib/gm107.asm.h | 103 ++++++++++- .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 2 +- 3 files changed, 270 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm index 7ee5f8fc65b..595d9dc5d41 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm +++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm @@ -100,8 +100,175 @@ gm107_div_s32: ret nop 0 -// STUB +// RCP F64 +// +// INPUT: $r0d +// OUTPUT: $r0d +// CLOBBER: $r2 - $r9, $p0 +// +// The core of RCP and RSQ implementation is Newton-Raphson step, which is +// used to find successively better approximation from an imprecise initial +// value (single precision rcp in RCP and rsqrt64h in RSQ). +// gm107_rcp_f64: + // Step 1: classify input according to exponent and value, and calculate + // result for 0/inf/nan. $r2 holds the exponent value, which starts at + // bit 52 (bit 20 of the upper half) and is 11 bits in length + sched (st 0x0) (st 0x0) (st 0x0) + bfe u32 $r2 $r1 0xb14 + iadd32i $r3 $r2 -1 + ssy #rcp_rejoin + // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf, + // denorm, or 0). 
Do this by subtracting 1 from the exponent, which will + // mean that it's > 0x7fd in those cases when doing unsigned comparison + sched (st 0x0) (st 0x0) (st 0x0) + isetp gt u32 and $p0 1 $r3 0x7fd 1 + // $r3: 0 for norms, 0x36 for denorms, -1 for others + mov $r3 0x0 0xf + not $p0 sync + // Process all special values: NaN, inf, denorm, 0 + sched (st 0x0) (st 0x0) (st 0x0) + mov32i $r3 0xffffffff 0xf + // A number is NaN if its abs value is greater than or unordered with inf + dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1 + not $p0 bra #rcp_inf_or_denorm_or_zero + // NaN -> NaN, the next line sets the "quiet" bit of the result. This + // behavior is both seen on the CPU and the blob + sched (st 0x0) (st 0x0) (st 0x0) + lop32i or $r1 $r1 0x80000 + sync +rcp_inf_or_denorm_or_zero: + lop32i and $r4 $r1 0x7ff00000 + sched (st 0x0) (st 0x0) (st 0x0) + // Other values with nonzero in exponent field should be inf + isetp eq and $p0 1 $r4 0x0 1 + $p0 bra #rcp_denorm_or_zero + // +/-Inf -> +/-0 + lop32i xor $r1 $r1 0x7ff00000 + sched (st 0x0) (st 0x0) (st 0x0) + mov $r0 0x0 0xf + sync +rcp_denorm_or_zero: + dsetp gtu and $p0 1 abs $r0 0x0 1 + sched (st 0x0) (st 0x0) (st 0x0) + $p0 bra #rcp_denorm + // +/-0 -> +/-Inf + lop32i or $r1 $r1 0x7ff00000 + sync +rcp_denorm: + // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms + sched (st 0x0) (st 0x0) (st 0x0) + dmul $r0 $r0 0x4350000000000000 + mov $r3 0x36 0xf + sync +rcp_rejoin: + // All numbers with -1 in $r3 have their result ready in $r0d, return them + // others need further calculation + sched (st 0x0) (st 0x0) (st 0x0) + isetp lt and $p0 1 $r3 0x0 1 + $p0 bra #rcp_end + // Step 2: Before the real calculation goes on, renormalize the values to + // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1) + // result in $r6d. The exponent will be recovered later. 
+ bfe u32 $r2 $r1 0xb14 + sched (st 0x0) (st 0x0) (st 0x0) + lop32i and $r7 $r1 0x800fffff + iadd32i $r7 $r7 0x3ff00000 + mov $r6 $r0 0xf + // Step 3: Convert new value to float (no overflow will occur due to step + // 2), calculate rcp and do newton-raphson step once + sched (st 0x0) (st 0x0) (st 0x0) + f2f ftz f64 f32 $r5 $r6 + mufu rcp $r4 $r5 + mov32i $r0 0xbf800000 0xf + sched (st 0x0) (st 0x0) (st 0x0) + ffma $r5 $r4 $r5 $r0 + ffma $r0 $r5 neg $r4 $r4 + // Step 4: convert result $r0 back to double, do newton-raphson steps + f2f f32 f64 $r0 $r0 + sched (st 0x0) (st 0x0) (st 0x0) + f2f f64 f64 $r6 neg $r6 + f2f f32 f64 $r8 0x3f800000 + // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d + // The formula used here (and above) is: + // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n} + // The following code uses 2 FMAs for each step, and it will basically + // look like: + // tmp = -src * RCP_{n} + 1 + // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n} + dfma $r4 $r6 $r0 $r8 + sched (st 0x0) (st 0x0) (st 0x0) + dfma $r0 $r0 $r4 $r0 + dfma $r4 $r6 $r0 $r8 + dfma $r0 $r0 $r4 $r0 + sched (st 0x0) (st 0x0) (st 0x0) + dfma $r4 $r6 $r0 $r8 + dfma $r0 $r0 $r4 $r0 + dfma $r4 $r6 $r0 $r8 + sched (st 0x0) (st 0x0) (st 0x0) + dfma $r0 $r0 $r4 $r0 + // Step 5: Exponent recovery and final processing + // The exponent is recovered by adding what we added to the exponent. + // Suppose we want to calculate rcp(x), but we have rcp(cx), then + // rcp(x) = c * rcp(cx) + // The delta in exponent comes from two sources: + // 1) The renormalization in step 2. The delta is: + // 0x3ff - $r2 + // 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored + // in $r3 + // These 2 sources are calculated in the first two lines below, and then + // added to the exponent extracted from the result above. + // Note that after processing, the new exponent may be >= 0x7ff (inf) + // or <= 0 (denorm). 
Those cases will be handled respectively below + iadd $r2 neg $r2 0x3ff + iadd $r4 $r2 $r3 + sched (st 0x0) (st 0x0) (st 0x0) + bfe u32 $r3 $r1 0xb14 + // New exponent in $r3 + iadd $r3 $r3 $r4 + iadd32i $r2 $r3 -1 + // (exponent-1) < 0x7fe (unsigned) means the result is in norm range + // (same logic as in step 1) + sched (st 0x0) (st 0x0) (st 0x0) + isetp lt u32 and $p0 1 $r2 0x7fe 1 + not $p0 bra #rcp_result_inf_or_denorm + // Norms: convert exponents back and return + shl $r4 $r4 0x14 + sched (st 0x0) (st 0x0) (st 0x0) + iadd $r1 $r4 $r1 + bra #rcp_end +rcp_result_inf_or_denorm: + // New exponent >= 0x7ff means that result is inf + isetp ge and $p0 1 $r3 0x7ff 1 + sched (st 0x0) (st 0x0) (st 0x0) + not $p0 bra #rcp_result_denorm + // Infinity + lop32i and $r1 $r1 0x80000000 + mov $r0 0x0 0xf + sched (st 0x0) (st 0x0) (st 0x0) + iadd32i $r1 $r1 0x7ff00000 + bra #rcp_end +rcp_result_denorm: + // Denorm result comes from huge input. The greatest possible fp64, i.e. + // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest + // normal value. Other rcp result should be greater than that. If we + // set the exponent field to 1, we can recover the result by multiplying + // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise + // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies + // the logic here. 
+ isetp ne u32 and $p0 1 $r3 0x0 1 + sched (st 0x0) (st 0x0) (st 0x0) + lop32i and $r1 $r1 0x800fffff + // 0x3e800000: 1/4 + $p0 f2f f32 f64 $r6 0x3e800000 + // 0x3f000000: 1/2 + not $p0 f2f f32 f64 $r6 0x3f000000 + sched (st 0x0) (st 0x0) (st 0x0) + iadd32i $r1 $r1 0x00100000 + dmul $r0 $r0 $r6 +rcp_end: + ret + gm107_rsq_f64: sched (st 0x0) (st 0x0) (st 0x0) ret diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h index 65c93f7ae89..206d01bde83 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h +++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h @@ -82,7 +82,106 @@ uint64_t gm107_builtin_code[] = { 0xe32000000007000f, 0x50b0000000070f00, /* 0x0280: gm107_rcp_f64 */ -/* 0x0280: gm107_rsq_f64 */ + 0x001f8000fc0007e0, + 0x38000000b1470102, + 0x1c0ffffffff70203, + 0xe29000000e000000, + 0x001f8000fc0007e0, + 0x366803807fd70307, + 0x5c9807800ff70003, + 0xf0f800000008000f, + 0x001f8000fc0007e0, + 0x010ffffffff7f003, + 0x368c03fff0070087, + 0xe24000000188000f, + 0x001f8000fc0007e0, + 0x0420008000070101, + 0xf0f800000007000f, +/* 0x02f8: rcp_inf_or_denorm_or_zero */ + 0x0407ff0000070104, + 0x001f8000fc0007e0, + 0x5b6503800ff70407, + 0xe24000000200000f, + 0x0447ff0000070101, + 0x001f8000fc0007e0, + 0x5c9807800ff70000, + 0xf0f800000007000f, +/* 0x0338: rcp_denorm_or_zero */ + 0x5b8c03800ff70087, + 0x001f8000fc0007e0, + 0xe24000000100000f, + 0x0427ff0000070101, + 0xf0f800000007000f, +/* 0x0360: rcp_denorm */ + 0x001f8000fc0007e0, + 0x3880004350070000, + 0x3898078003670003, + 0xf0f800000007000f, +/* 0x0380: rcp_rejoin */ + 0x001f8000fc0007e0, + 0x5b6303800ff70307, + 0xe24000001c00000f, + 0x38000000b1470102, + 0x001f8000fc0007e0, + 0x040800fffff70107, + 0x1c03ff0000070707, + 0x5c98078000070006, + 0x001f8000fc0007e0, + 0x5ca8100000670e05, + 0x5080000000470504, + 0x010bf8000007f000, + 0x001f8000fc0007e0, + 0x5980000000570405, + 0x5981020000470500, + 0x5ca8000000070b00, + 0x001f8000fc0007e0, + 
0x5ca8200000670f06, + 0x38a8003f80070b08, + 0x5b70040000070604, + 0x001f8000fc0007e0, + 0x5b70000000470000, + 0x5b70040000070604, + 0x5b70000000470000, + 0x001f8000fc0007e0, + 0x5b70040000070604, + 0x5b70000000470000, + 0x5b70040000070604, + 0x001f8000fc0007e0, + 0x5b70000000470000, + 0x381200003ff70202, + 0x5c10000000370204, + 0x001f8000fc0007e0, + 0x38000000b1470103, + 0x5c10000000470303, + 0x1c0ffffffff70302, + 0x001f8000fc0007e0, + 0x366203807fe70207, + 0xe24000000208000f, + 0x3848000001470404, + 0x001f8000fc0007e0, + 0x5c10000000170401, + 0xe24000000807000f, +/* 0x04d8: rcp_result_inf_or_denorm */ + 0x366d03807ff70307, + 0x001f8000fc0007e0, + 0xe24000000288000f, + 0x0408000000070101, + 0x5c9807800ff70000, + 0x001f8000fc0007e0, + 0x1c07ff0000070101, + 0xe24000000407000f, +/* 0x0518: rcp_result_denorm */ + 0x5b6a03800ff70307, + 0x001f8000fc0007e0, + 0x040800fffff70101, + 0x38a8003e80000b06, + 0x38a8003f00080b06, + 0x001f8000fc0007e0, + 0x1c00010000070101, + 0x5c80000000670000, +/* 0x0558: rcp_end */ + 0xe32000000007000f, +/* 0x0560: gm107_rsq_f64 */ 0x001f8000fc0007e0, 0xe32000000007000f, 0x50b0000000070f00, @@ -93,5 +192,5 @@ uint64_t gm107_builtin_offsets[] = { 0x0000000000000000, 0x0000000000000120, 0x0000000000000280, - 0x0000000000000280, + 0x0000000000000560, }; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 7d28c5f6e52..5d69ce2ab7c 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -129,7 +129,7 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) bld.mkSplit(src, 4, i->getSrc(0)); int chip = prog->getTarget()->getChipset(); - if (chip >= NVISA_GK104_CHIPSET && chip < NVISA_GM107_CHIPSET) { + if (chip >= NVISA_GK104_CHIPSET && (i->op == OP_RCP || chip < NVISA_GM107_CHIPSET)) { handleRCPRSQLib(i, src); return; } -- 2.30.2