nvc0: allow for easier modification of compiler library routines
[mesa.git] / src / gallium / drivers / nouveau / codegen / lib / gk110.asm
// Section that holds the code of the builtin routines defined below.
1 .section #gk110_builtin_code
2 // DIV U32
3 //
4 // UNR recurrence (q = a / b):
5 // look for z such that 2^32 - b <= b * z < 2^32
6 // then q - 1 <= (a * z) / 2^32 <= q
7 //
8 // INPUT: $r0: dividend, $r1: divisor
9 // OUTPUT: $r0: result, $r1: modulus
10 // CLOBBER: $r2 - $r3, $p0 - $p1
11 // SIZE: 22 / 14 * 8 bytes
12 //
13 gk110_div_u32:
// Reading aid for the body (a = $r0 dividend, b = $r1 divisor):
//   1. seed      z = 1 << (31 - index of b's most significant set bit)
//   2. refine    z += mulhi(z, -b * z), five times
//   3. finish    q = mulhi(a, z); r = a - b * q; up to two "+1" fix-ups
// Only $p0 is written by the instructions of this routine.
//
// Every sched immediate is seven bytes wide, and one sched line precedes
// each run of seven instructions; the last run here continues past the ret
// into the first two instructions of the following routine. Presumably
// each byte schedules one instruction of the run, so inserting, removing
// or reordering instructions needs matching sched changes.
// NOTE(review): confirm against the assembler's sched encoding.
14 sched 0x28282804280428
// $r2 = bit index of the divisor's most significant set bit
15 bfind u32 $r2 $r1
// $r2 ^= 31, which equals 31 - index for an index in 0..31
16 xor b32 $r2 $r2 0x1f
17 mov b32 $r3 0x1
// $r2 = 1 << $r2 (shift issued with the clamp modifier): initial z
18 shl b32 $r2 $r3 clamp $r2
// $r1 = -b, so the low word of $r1 * z is 2^32 - b * z (mod 2^32)
19 cvt u32 $r1 neg u32 $r1
// refinement step 1: $r3 = low 32 bits of (-b) * z ...
20 mul $r3 u32 $r1 u32 $r2
// ... then z += high 32 bits of z * $r3
21 add $r2 (mul high u32 $r2 u32 $r3) $r2
22 sched 0x28282828282828
// refinement step 2
23 mul $r3 u32 $r1 u32 $r2
24 add $r2 (mul high u32 $r2 u32 $r3) $r2
// refinement step 3
25 mul $r3 u32 $r1 u32 $r2
26 add $r2 (mul high u32 $r2 u32 $r3) $r2
// refinement step 4
27 mul $r3 u32 $r1 u32 $r2
28 add $r2 (mul high u32 $r2 u32 $r3) $r2
// refinement step 5 (its add sits after the next sched line)
29 mul $r3 u32 $r1 u32 $r2
30 sched 0x042c2828042804
31 add $r2 (mul high u32 $r2 u32 $r3) $r2
// keep a copy of the dividend; $r0 is overwritten with the quotient next
32 mov b32 $r3 $r0
// q = high 32 bits of a * z
33 mul high $r0 u32 $r0 u32 $r2
// $r2 = -(-b) = b: recover the original divisor for the comparisons below
34 cvt u32 $r2 neg u32 $r1
// r = (-b) * q + a = a - b * q
35 add $r1 (mul u32 $r1 u32 $r0) $r3
// first fix-up: if r >= b (unsigned) then r -= b and q += 1
36 set $p0 0x1 ge u32 $r1 $r2
37 $p0 sub b32 $r1 $r1 $r2
38 sched 0x20282e20042c28
39 $p0 add b32 $r0 $r0 0x1
// second fix-up: the compare itself is predicated on $p0, so it is only
// re-evaluated (and can only fire) when the first fix-up was applied
40 $p0 set $p0 0x1 ge u32 $r1 $r2
41 $p0 sub b32 $r1 $r1 $r2
42 $p0 add b32 $r0 $r0 0x1
// result: $r0 = quotient, $r1 = remainder
43 ret
44
45 // DIV S32, like DIV U32 after taking ABS(inputs)
46 //
47 // INPUT: $r0: dividend, $r1: divisor
48 // OUTPUT: $r0: result, $r1: modulus
49 // CLOBBER: $r2 - $r3, $p0 - $p3
50 //
51 gk110_div_s32:
// Signed wrapper around the unsigned algorithm: record the signs, divide
// the absolute values, then negate the results where required.
// Predicates written here are $p0, $p2 and $p3; $p1 is not touched.
//
// These first two instructions fill the last two slots of the seven-
// instruction run opened by the final sched line of the preceding routine;
// every sched line below again precedes a run of seven instructions.
// NOTE(review): keep instruction count and order in step with the sched
// words when editing.
//
// $p2 = dividend is negative
52 set $p2 0x1 lt s32 $r0 0x0
// $p3 = (divisor is negative) xor $p2, i.e. the operand signs differ
53 set $p3 0x1 lt s32 $r1 0x0 xor $p2
54 sched 0x28042804282820
// continue with |a| and |b|
55 cvt s32 $r0 abs s32 $r0
56 cvt s32 $r1 abs s32 $r1
// From here to the second fix-up the instruction sequence is the same as
// the body of gk110_div_u32; only the sched values differ.
// $r2 = bit index of the divisor's most significant set bit
57 bfind u32 $r2 $r1
// $r2 ^= 31, which equals 31 - index for an index in 0..31
58 xor b32 $r2 $r2 0x1f
59 mov b32 $r3 0x1
// $r2 = 1 << $r2 (shift issued with the clamp modifier): initial z
60 shl b32 $r2 $r3 clamp $r2
// $r1 = -|b|
61 cvt u32 $r1 neg u32 $r1
62 sched 0x28282828282828
// five refinement steps: $r3 = low word of (-|b|) * z; z += mulhi(z, $r3)
63 mul $r3 u32 $r1 u32 $r2
64 add $r2 (mul high u32 $r2 u32 $r3) $r2
65 mul $r3 u32 $r1 u32 $r2
66 add $r2 (mul high u32 $r2 u32 $r3) $r2
67 mul $r3 u32 $r1 u32 $r2
68 add $r2 (mul high u32 $r2 u32 $r3) $r2
69 mul $r3 u32 $r1 u32 $r2
70 sched 0x28280428042828
71 add $r2 (mul high u32 $r2 u32 $r3) $r2
72 mul $r3 u32 $r1 u32 $r2
73 add $r2 (mul high u32 $r2 u32 $r3) $r2
// keep a copy of |a|; $r0 is overwritten with the quotient next
74 mov b32 $r3 $r0
// q = high 32 bits of |a| * z
75 mul high $r0 u32 $r0 u32 $r2
// $r2 = -(-|b|) = |b|
76 cvt u32 $r2 neg u32 $r1
// r = (-|b|) * q + |a| = |a| - |b| * q
77 add $r1 (mul u32 $r1 u32 $r0) $r3
78 sched 0x2028042c28042c
// first fix-up: if r >= |b| (unsigned) then r -= |b| and q += 1
79 set $p0 0x1 ge u32 $r1 $r2
80 $p0 sub b32 $r1 $r1 $r2
81 $p0 add b32 $r0 $r0 0x1
// second fix-up, only evaluated when the first one was applied
82 $p0 set $p0 0x1 ge u32 $r1 $r2
83 $p0 sub b32 $r1 $r1 $r2
84 $p0 add b32 $r0 $r0 0x1
// operand signs differed: negate the quotient
85 $p3 cvt s32 $r0 neg s32 $r0
86 sched 0x2c200428042e04
// dividend was negative: negate the remainder
87 $p2 cvt s32 $r1 neg s32 $r1
88 ret
89
90 gk110_rcp_f64:
91 gk110_rsq_f64:
// Both labels mark the same address and the only instruction there is a
// ret, so neither entry point performs any computation in the code shown.
92 ret
93
94 .section #gk110_builtin_offsets
// One 64-bit entry per builtin label defined above, listed in the order
// div_u32, div_s32, rcp_f64, rsq_f64.
// NOTE(review): presumably consumers locate routines by position in this
// table -- confirm before reordering or inserting entries.
95 .b64 #gk110_div_u32
96 .b64 #gk110_div_s32
97 .b64 #gk110_rcp_f64
98 .b64 #gk110_rsq_f64