gk110/ir: Add rsq f64 implementation

author Boyan Ding <boyan.j.ding@gmail.com>

Thu, 9 Mar 2017 05:55:18 +0000 (13:55 +0800)

committer Ilia Mirkin <imirkin@alum.mit.edu>

Thu, 7 Feb 2019 00:35:57 +0000 (19:35 -0500)
author Boyan Ding <boyan.j.ding@gmail.com>
Thu, 9 Mar 2017 05:55:18 +0000 (13:55 +0800)
committer Ilia Mirkin <imirkin@alum.mit.edu>
Thu, 7 Feb 2019 00:35:57 +0000 (19:35 -0500)
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm

index c33dd2158c9434e8d249c5cd66f295cf7590796f..4047a565a9fca68ce0fd562f2b7433f9ce5ab038 100644 (file)
--- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
+++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
@@ -230,7 +230,7 @@ rcp_result_denorm:
     and b32 $r1 $r1 0x800fffff
     // 0x3e800000: 1/4
     $p0 cvt f64 $r6d f32 0x3e800000
-   sched 0x2f 0x28 0x2c 0x2e 0x2e 0x00 0x00
+   sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27
     // 0x3f000000: 1/2
     (not $p0) cvt f64 $r6d f32 0x3f000000
     add b32 $r1 $r1 0x00100000
@@ -238,7 +238,74 @@ rcp_result_denorm:
  rcp_end:
     ret
  
+// RSQ F64: reciprocal square root of a 64-bit float
+//
+// INPUT:   $r0d (x)
+// OUTPUT:  $r0d (rsq of x)
+// CLOBBER: $r2 - $r9, $p0 - $p1
+//
  gk110_rsq_f64:
+   // Two special cases must be handled before rsqrt64h is used to get the
+   // initial estimate.
+   // 1. NaN: set the highest mantissa bit so that rsqrt64h, which is only
+   //    given the high word ($r1), will surely recognize the value as NaN.
+   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
+   $p0 or b32 $r1 $r1 0x00080000
+   and b32 $r2 $r1 0x7fffffff
+   sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
+   // 2. Denorms and small normal values: using them as-is would lose
+   //    precision either in rsqrt64h or in the first Newton-Raphson step
+   //    below. If the exponent field ($r3) is less than or equal to 2, the
+   //    input is multiplied by 2^54 here, and the result is multiplied by
+   //    2^27 at the end to compensate ($p1 remembers the decision).
+   ext u32 $r3 $r1 0xb14
+   set b32 $p1 0x1 le u32 $r3 0x2
+   or b32 $r2 $r0 $r2
+   $p1 mul rn f64 $r0d $r0d 0x4350000000000000
+   rsqrt64h f32 $r5 $r1
+   // rsqrt64h already gives the correct result for 0/inf/nan. The logic
+   // below detects those inputs (exponent field is 0x7ff, or every bit
+   // except the sign bit is 0); $r2 is left as zero for them.
+   set b32 $r6 ne u32 $r3 0x7ff
+   and b32 $r2 $r2 $r6
+   sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
+   set b32 $p0 0x1 ne u32 $r2 0x0
+   $p0 bra #rsq_norm
+   // 0/inf/nan: OR the input sign bit into $r5, zero the low word, return
+   and b32 $r1 $r1 0x80000000
+   mov b32 $r0 0x0
+   or b32 $r1 $r1 $r5
+   ret
+rsq_norm:
+   // For others, refine RSQ_0 = $r4d (rsqrt64h result in the high word, 0 in
+   // the low word) with 4 Newton-Raphson steps:
+   //     RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
+   // Each step is written as ($r2d = 0.5 * x, $r8d = 0.5):
+   //     tmp1 ($r0d) = 0.5 * x * RSQ_{n};  tmp2 ($r6d) = -RSQ_{n} * tmp1 + 0.5
+   //     RSQ_{n + 1} ($r4d) = RSQ_{n} * tmp2 + RSQ_{n}
+   mov b32 $r4 0x0
+   sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29
+   // 0x3f000000: 1/2 as f32, converted to f64 in $r8d
+   cvt f64 $r8d f32 0x3f000000
+   mul rn f64 $r2d $r0d $r8d
+   mul rn f64 $r0d $r2d $r4d
+   fma rn f64 $r6d neg $r4d $r0d $r8d
+   fma rn f64 $r4d $r4d $r6d $r4d
+   mul rn f64 $r0d $r2d $r4d
+   fma rn f64 $r6d neg $r4d $r0d $r8d
+   sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
+   fma rn f64 $r4d $r4d $r6d $r4d
+   mul rn f64 $r0d $r2d $r4d
+   fma rn f64 $r6d neg $r4d $r0d $r8d
+   fma rn f64 $r4d $r4d $r6d $r4d
+   mul rn f64 $r0d $r2d $r4d
+   fma rn f64 $r6d neg $r4d $r0d $r8d
+   fma rn f64 $r4d $r4d $r6d $r4d
+   sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00
+   // Multiply the result by 2^27 for small inputs ($p1) to undo the scaling
+   $p1 mul rn f64 $r4d $r4d 0x41a0000000000000
+   mov b32 $r1 $r5
+   mov b32 $r0 $r4
     ret
  
  .section #gk110_builtin_offsets
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h

index d41f135a26a21698b1d93929fa491ec63b1a08c2..3d1523f2fdd87e028f93149cf69644d4a1eee9e0 100644 (file)
--- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
+++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h
@@ -144,13 +144,53 @@ uint64_t gk110_builtin_code[] = {
         0xb3501c00001c0c1d,
         0x204007ffff9c0404,
         0xc54001f400002c19,
-       0x080000b8b8b0a0bc,
+       0x089c80a8b8b0a0bc,
         0xc54001f800202c19,
         0x40000800001c0405,
         0xe4000000031c0002,
  /* 0x0460: rcp_end */
         0x19000000001c003c,
  /* 0x0468: gk110_rsq_f64 */
+       0xb4601fff801c021d,
+       0x2100040000000404,
+       0x203fffffff9c0408,
+       0x08a0a094b0a0809c,
+       0xc00000058a1c040d,
+       0xb3301c00011c0c3d,
+       0xe2001000011c000a,
+       0xc400021a80040001,
+       0x84000000039c0416,
+       0xb2d01c03ff9c0c19,
+       0xe2000000031c080a,
+       0x08a0b8a09c80aca0,
+       0xb3501c00001c081d,
+       0x120000001000003c,
+       0x20400000001c0404,
+       0xe4c03c007f9c0002,
+       0xe2001000029c0406,
+       0x19000000001c003c,
+/* 0x04f8: rsq_norm */
+       0xe4c03c007f9c0012,
+       0x08a4a4a4a4a4a4bc,
+       0xc54001f8001c2c21,
+       0xe4000000041c000a,
+       0xe4000000021c0802,
+       0xdb882000001c101a,
+       0xdb801000031c1012,
+       0xe4000000021c0802,
+       0xdb882000001c101a,
+       0x08a4a4a4a4a4a4a4,
+       0xdb801000031c1012,
+       0xe4000000021c0802,
+       0xdb882000001c101a,
+       0xdb801000031c1012,
+       0xe4000000021c0802,
+       0xdb882000001c101a,
+       0xdb801000031c1012,
+       0x08000000b8a080a4,
+       0xc400020d00041011,
+       0xe4c03c00029c0006,
+       0xe4c03c00021c0002,
         0x19000000001c003c,
  };
author	Boyan Ding <boyan.j.ding@gmail.com>
	Thu, 9 Mar 2017 05:55:18 +0000 (13:55 +0800)
committer	Ilia Mirkin <imirkin@alum.mit.edu>
	Thu, 7 Feb 2019 00:35:57 +0000 (19:35 -0500)
src/gallium/drivers/nouveau/codegen/lib/gk110.asm		patch \| blob \| history
src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h		patch \| blob \| history