// DIV U32: unsigned 32-bit divide that yields quotient and remainder together.
// (Named "DIV U32" by the DIV S32 header that follows this routine.)
// In the formulas below: a = dividend, b = divisor, q = quotient and
// z = a 32-bit fixed-point approximation of 2^32 / b.
4 // UNR recurrence (q = a / b):
5 // look for z such that 2^32 - b <= b * z < 2^32
6 // then q - 1 <= (a * z) / 2^32 <= q
8 // INPUT: $r0: dividend, $r1: divisor
9 // OUTPUT: $r0: result, $r1: modulus
10 // CLOBBER: $r2 - $r3, $p0 - $p1
11 // SIZE: 22 / 14 * 8 bytes
// NOTE(review): the embedded numbering jumps from 11 to 16, so the
// instructions that give $r2 and $r3 their initial values are not visible
// in this listing -- presumably they build the first estimate of z; confirm
// against the complete file.
// $r2 = $r3 << $r2, using the `clamp` modifier on the shift.
16 shl b32 $r2 $r3 clamp $r2
// $r1 = -b. The negated divisor is kept in $r1 for the rest of the routine.
17 cvt u32 $r1 neg u32 $r1
// Refinement step, repeated five times with identical instructions:
//   $r3 = $r1 * $r2                 (product of -b and z; no `high` modifier)
//   $r2 = high($r2 * $r3) + $r2     (z += upper half of z * $r3)
18 mul $r3 u32 $r1 u32 $r2
19 add $r2 (mul high u32 $r2 u32 $r3) $r2
20 mul $r3 u32 $r1 u32 $r2
21 add $r2 (mul high u32 $r2 u32 $r3) $r2
22 mul $r3 u32 $r1 u32 $r2
23 add $r2 (mul high u32 $r2 u32 $r3) $r2
24 mul $r3 u32 $r1 u32 $r2
25 add $r2 (mul high u32 $r2 u32 $r3) $r2
26 mul $r3 u32 $r1 u32 $r2
27 add $r2 (mul high u32 $r2 u32 $r3) $r2
// NOTE(review): the embedded numbering skips 28. The add three lines below
// reads $r3 as if it held the original dividend, so the skipped line
// presumably saves $r0 into $r3 -- confirm against the complete file.
// Quotient estimate: $r0 = high($r0 * z), i.e. (a * z) / 2^32.
29 mul high $r0 u32 $r0 u32 $r2
// $r2 = -$r1, which turns the negated divisor back into +b for the compares.
30 cvt u32 $r2 neg u32 $r1
// $r1 = $r1 * $r0 + $r3 = (-b) * q_estimate + $r3: the remainder candidate.
31 add $r1 (mul u32 $r1 u32 $r0) $r3
// Correction: while the remainder candidate is still >= b (unsigned), take
// one more b out of it and bump the quotient.
32 set $p0 0x1 ge u32 $r1 $r2
33 $p0 sub b32 $r1 $r1 $r2
34 $p0 add b32 $r0 $r0 0x1
// The re-test is itself predicated on $p0, so the second correction can
// only run when the first one did.
35 $p0 set $p0 0x1 ge u32 $r1 $r2
36 $p0 sub b32 $r1 $r1 $r2
37 $p0 add b32 $r0 $r0 0x1
40 // DIV S32, like DIV U32 after taking ABS(inputs)
// The signs are captured in predicates before the inputs are made
// non-negative, and are re-applied to the two results at the very end.
// In the comments below: a = |dividend|, b = |divisor|, z = the reciprocal
// estimate held in $r2.
42 // INPUT: $r0: dividend, $r1: divisor
43 // OUTPUT: $r0: result, $r1: modulus
44 // CLOBBER: $r2 - $r3, $p0 - $p3
// $p2 = (dividend < 0), signed compare.
46 set $p2 0x1 lt s32 $r0 0x0
// $p3 = (divisor < 0) xor $p2, i.e. set when the operand signs differ.
47 set $p3 0x1 lt s32 $r1 0x0 xor $p2
// Replace both operands by their absolute values.
48 cvt s32 $r0 abs s32 $r0
49 cvt s32 $r1 abs s32 $r1
// NOTE(review): the embedded numbering jumps from 49 to 53, so the
// instructions that give $r2 and $r3 their initial values are not visible
// in this listing -- confirm against the complete file.
// $r2 = $r3 << $r2, using the `clamp` modifier on the shift.
53 shl b32 $r2 $r3 clamp $r2
// $r1 = -b. The negated divisor stays in $r1 until the remainder is formed.
54 cvt u32 $r1 neg u32 $r1
// Refinement step, repeated five times with identical instructions:
//   $r3 = $r1 * $r2                 (product of -b and z; no `high` modifier)
//   $r2 = high($r2 * $r3) + $r2     (z += upper half of z * $r3)
55 mul $r3 u32 $r1 u32 $r2
56 add $r2 (mul high u32 $r2 u32 $r3) $r2
57 mul $r3 u32 $r1 u32 $r2
58 add $r2 (mul high u32 $r2 u32 $r3) $r2
59 mul $r3 u32 $r1 u32 $r2
60 add $r2 (mul high u32 $r2 u32 $r3) $r2
61 mul $r3 u32 $r1 u32 $r2
62 add $r2 (mul high u32 $r2 u32 $r3) $r2
63 mul $r3 u32 $r1 u32 $r2
64 add $r2 (mul high u32 $r2 u32 $r3) $r2
// NOTE(review): the embedded numbering skips 65. The add three lines below
// reads $r3 as if it held |dividend|, so the skipped line presumably saves
// $r0 into $r3 -- confirm against the complete file.
// Quotient estimate: $r0 = high($r0 * z).
66 mul high $r0 u32 $r0 u32 $r2
// $r2 = -$r1, which turns the negated divisor back into +b for the compares.
67 cvt u32 $r2 neg u32 $r1
// $r1 = $r1 * $r0 + $r3 = (-b) * q_estimate + $r3: the remainder candidate.
68 add $r1 (mul u32 $r1 u32 $r0) $r3
// Correction: while the remainder candidate is still >= b (unsigned), take
// one more b out of it and bump the quotient. The re-test is predicated on
// $p0, so the second correction can only run when the first one did.
69 set $p0 0x1 ge u32 $r1 $r2
70 $p0 sub b32 $r1 $r1 $r2
71 $p0 add b32 $r0 $r0 0x1
72 $p0 set $p0 0x1 ge u32 $r1 $r2
73 $p0 sub b32 $r1 $r1 $r2
74 $p0 add b32 $r0 $r0 0x1
// Re-apply the signs: the quotient is negated when the operand signs
// differed ($p3); the remainder is negated when the dividend was negative
// ($p2).
75 $p3 cvt s32 $r0 neg s32 $r0
76 $p2 cvt s32 $r1 neg s32 $r1
79 // RCP F64: Newton-Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
82 // OUTPUT: $r0d (rcp(x))
88 // RSQ F64: Newton-Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
91 // OUTPUT: $r0d (rsqrt(x))