ilo: resolve Z/HiZ correctly
[mesa.git] / src / gallium / drivers / nouveau / codegen / target_lib_nvc0.asm
1 //
2 // DIV U32
3 //
4 // UNR recurrence (q = a / b):
5 // look for z such that 2^32 - b <= b * z < 2^32
6 // then q - 1 <= (a * z) / 2^32 <= q
7 //
8 // INPUT: $r0: dividend, $r1: divisor
9 // OUTPUT: $r0: result, $r1: modulus
10 // CLOBBER: $r2 - $r3, $p0 - $p1
11 // SIZE: 22 / 14 * 8 bytes
12 //
13 bfind u32 $r2 $r1 // $r2 = bit index of the divisor's highest set bit (bfind semantics per nvc0 ISA - confirm)
14 xor b32 $r2 $r2 0x1f // for an index in 0..31 this is 31 - index
15 mov b32 $r3 0x1
16 shl b32 $r2 $r3 clamp $r2 // z = 1 << $r2: power-of-two first estimate of z from the header recurrence
17 cvt u32 $r1 neg u32 $r1 // $r1 = -b; the divisor stays negated until line 30
18 mul $r3 u32 $r1 u32 $r2 // refinement 1 of 5: $r3 = low 32 bits of (-b * z)
19 add $r2 (mul high u32 $r2 u32 $r3) $r2 // z += high 32 bits of (z * $r3)
20 mul $r3 u32 $r1 u32 $r2 // refinement 2 of 5 (same two-instruction step)
21 add $r2 (mul high u32 $r2 u32 $r3) $r2
22 mul $r3 u32 $r1 u32 $r2 // refinement 3 of 5
23 add $r2 (mul high u32 $r2 u32 $r3) $r2
24 mul $r3 u32 $r1 u32 $r2 // refinement 4 of 5
25 add $r2 (mul high u32 $r2 u32 $r3) $r2
26 mul $r3 u32 $r1 u32 $r2 // refinement 5 of 5
27 add $r2 (mul high u32 $r2 u32 $r3) $r2
28 mov b32 $r3 $r0 // keep the dividend a in $r3; $r0 is overwritten by the quotient next
29 mul high $r0 u32 $r0 u32 $r2 // quotient estimate q = high 32 bits of (a * z)
30 cvt u32 $r2 neg u32 $r1 // $r2 = b again (negation of the negated divisor); z is no longer needed
31 add $r1 (mul u32 $r1 u32 $r0) $r3 // modulus r = (-b) * q + a
32 set $p0 0x1 ge u32 $r1 $r2 // $p0 = (r >= b), i.e. the estimate q is too small
33 $p0 sub b32 $r1 $r1 $r2 // first fix-up: r -= b
34 $p0 add b32 $r0 $r0 0x1 // first fix-up: q += 1
35 $p0 set $p0 0x1 ge u32 $r1 $r2 // predicated on $p0: re-tested only when the first fix-up ran
36 $p0 sub b32 $r1 $r1 $r2 // second fix-up: r -= b
37 $p0 add b32 $r0 $r0 0x1 // second fix-up: q += 1
38 ret
39 //
40 // DIV S32: like DIV U32 after taking ABS(inputs); the result is negated if the input signs differ, the modulus takes the sign of the dividend
41 //
42 // INPUT: $r0: dividend, $r1: divisor
43 // OUTPUT: $r0: result, $r1: modulus
44 // CLOBBER: $r2 - $r3, $p0 - $p3
45 //
46 set $p2 0x1 lt s32 $r0 0x0 // $p2 = dividend is negative
47 set $p3 0x1 lt s32 $r1 0x0 xor $p2 // $p3 = (divisor is negative) xor $p2, i.e. the input signs differ
48 cvt s32 $r0 abs s32 $r0 // continue with |dividend|
49 cvt s32 $r1 abs s32 $r1 // continue with |divisor|; lines 50-74 repeat the DIV U32 sequence
50 bfind u32 $r2 $r1 // $r2 = bit index of the divisor's highest set bit (bfind semantics per nvc0 ISA - confirm)
51 xor b32 $r2 $r2 0x1f // for an index in 0..31 this is 31 - index
52 mov b32 $r3 0x1
53 shl b32 $r2 $r3 clamp $r2 // z = 1 << $r2: power-of-two first estimate of z
54 cvt u32 $r1 neg u32 $r1 // $r1 = -b; the divisor stays negated until line 67
55 mul $r3 u32 $r1 u32 $r2 // refinement 1 of 5: $r3 = low 32 bits of (-b * z)
56 add $r2 (mul high u32 $r2 u32 $r3) $r2 // z += high 32 bits of (z * $r3)
57 mul $r3 u32 $r1 u32 $r2 // refinement 2 of 5 (same two-instruction step)
58 add $r2 (mul high u32 $r2 u32 $r3) $r2
59 mul $r3 u32 $r1 u32 $r2 // refinement 3 of 5
60 add $r2 (mul high u32 $r2 u32 $r3) $r2
61 mul $r3 u32 $r1 u32 $r2 // refinement 4 of 5
62 add $r2 (mul high u32 $r2 u32 $r3) $r2
63 mul $r3 u32 $r1 u32 $r2 // refinement 5 of 5
64 add $r2 (mul high u32 $r2 u32 $r3) $r2
65 mov b32 $r3 $r0 // keep the dividend a in $r3; $r0 is overwritten by the quotient next
66 mul high $r0 u32 $r0 u32 $r2 // quotient estimate q = high 32 bits of (a * z)
67 cvt u32 $r2 neg u32 $r1 // $r2 = b again (negation of the negated divisor); z is no longer needed
68 add $r1 (mul u32 $r1 u32 $r0) $r3 // modulus r = (-b) * q + a
69 set $p0 0x1 ge u32 $r1 $r2 // $p0 = (r >= b), i.e. the estimate q is too small
70 $p0 sub b32 $r1 $r1 $r2 // first fix-up: r -= b
71 $p0 add b32 $r0 $r0 0x1 // first fix-up: q += 1
72 $p0 set $p0 0x1 ge u32 $r1 $r2 // predicated on $p0: re-tested only when the first fix-up ran
73 $p0 sub b32 $r1 $r1 $r2 // second fix-up: r -= b
74 $p0 add b32 $r0 $r0 0x1 // second fix-up: q += 1
75 $p3 cvt s32 $r0 neg s32 $r0 // negate the quotient when the input signs differed
76 $p2 cvt s32 $r1 neg s32 $r1 // negate the modulus when the dividend was negative
77 ret
78 //
79 // RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
80 //
81 // INPUT: $r0d (x)
82 // OUTPUT: $r0d (rcp(x))
83 // CLOBBER: $r2 - $r7
84 // SIZE: 9 * 8 bytes
85 //
86 nop // placeholder body: no reciprocal is computed here. NOTE(review): the header's SIZE (9 * 8 bytes) and CLOBBER ($r2 - $r7) do not match these two instructions - confirm intent
87 ret // returns with $r0d untouched by this routine
88 // RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
89 //
90 // INPUT: $r0d (x)
91 // OUTPUT: $r0d (rsqrt(x))
92 // CLOBBER: $r2 - $r7
93 // SIZE: 14 * 8 bytes
94 //
95 nop // placeholder body: no reciprocal square root is computed here. NOTE(review): the header's SIZE (14 * 8 bytes) and CLOBBER ($r2 - $r7) do not match these two instructions - confirm intent
96 ret // returns with $r0d untouched by this routine