90741b6c59fa8b58a956575162da12ddac3ef23f
[mesa.git] / src / gallium / drivers / nouveau / codegen / lib / gm107.asm
1 .section #gm107_builtin_code
2 // DIV U32: unsigned 32-bit division returning both quotient and remainder
3 //
4 // UNR recurrence (q = a / b):
5 // look for z such that 2^32 - b <= b * z < 2^32
6 // then q - 1 <= (a * z) / 2^32 <= q
7 // z is seeded with a power of two and refined by five imul / imad-hi steps; the estimate of q is then fixed up by at most two predicated corrections.
8 // INPUT: $r0: dividend, $r1: divisor
9 // OUTPUT: $r0: result, $r1: modulus
10 // CLOBBER: $r2 - $r3, $p0 - $p1
11 // SIZE: 22 / 14 * 8 bytes
12 // NOTE(review): SIZE and the $p1 clobber do not obviously match the body below (27 instruction lines, 9 sched lines, only $p0 is written) - confirm or update.
13 gm107_div_u32:
14 sched (st 0xd wr 0x0 wt 0x3f) (st 0x1 wt 0x1) (st 0x6)  // scheduling control; apparently one (...) group per following instruction - field meanings are defined by the assembler, TODO confirm
15 flo u32 $r2 $r1  // $r2 = bit index of the divisor's leading one (presumably: flo = find leading one)
16 lop xor 1 $r2 $r2 0x1f  // $r2 ^= 31, i.e. 31 - index for an index in 0..31
17 mov $r3 0x1 0xf  // $r3 = 1
18 sched (st 0x1) (st 0xf wr 0x0) (st 0x6 wr 0x0 wt 0x1)
19 shl $r2 $r3 $r2  // z = 1 << $r2: power-of-two seed for the reciprocal estimate
20 i2i u32 u32 $r1 neg $r1  // $r1 = -b, kept negated for the whole refinement
21 imul u32 u32 $r3 $r1 $r2  // step 1: $r3 = low32(-b * z)
22 sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
23 imad u32 u32 hi $r2 $r2 $r3 $r2  // step 1: z += high32(z * $r3)
24 imul u32 u32 $r3 $r1 $r2  // step 2
25 imad u32 u32 hi $r2 $r2 $r3 $r2
26 sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
27 imul u32 u32 $r3 $r1 $r2  // step 3
28 imad u32 u32 hi $r2 $r2 $r3 $r2
29 imul u32 u32 $r3 $r1 $r2  // step 4
30 sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
31 imad u32 u32 hi $r2 $r2 $r3 $r2
32 imul u32 u32 $r3 $r1 $r2  // step 5
33 imad u32 u32 hi $r2 $r2 $r3 $r2
34 sched (st 0x6) (st 0x6 wr 0x0 rd 0x1 wt 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x2)
35 mov $r3 $r0 0xf  // save the dividend a; $r0 is about to be overwritten
36 imul u32 u32 hi $r0 $r0 $r2  // q = high32(a * z): quotient estimate
37 i2i u32 u32 $r2 neg $r1  // $r2 = -(-b) = b; z is no longer needed
38 sched (st 0x6 wr 0x0 wt 0x3) (st 0xd wt 0x1) (st 0x1)
39 imad u32 u32 $r1 $r1 $r0 $r3  // r = (-b) * q + a: remainder for the estimated q
40 isetp ge u32 and $p0 1 $r1 $r2 1  // $p0 = (r >= b), unsigned: estimate was too small
41 $p0 iadd $r1 $r1 neg $r2  // first correction: r -= b
42 sched (st 0x5) (st 0xd) (st 0x1)
43 $p0 iadd $r0 $r0 0x1  // first correction: q += 1
44 $p0 isetp ge u32 and $p0 1 $r1 $r2 1  // re-test only if the first correction ran; otherwise $p0 stays false
45 $p0 iadd $r1 $r1 neg $r2  // second correction: r -= b
46 sched (st 0x1) (st 0xf) (st 0xf)
47 $p0 iadd $r0 $r0 0x1  // second correction: q += 1
48 ret  // $r0 = quotient, $r1 = remainder
49 nop 0  // fills the third slot of the last sched group
50
51 // DIV S32, like DIV U32 after taking ABS(inputs)
52 // The quotient is negated when the operand signs differ ($p3); the modulus takes the sign of the dividend ($p2).
53 // INPUT: $r0: dividend, $r1: divisor
54 // OUTPUT: $r0: result, $r1: modulus
55 // CLOBBER: $r2 - $r3, $p0 - $p3
56 // The body between the abs and the final sign fix-ups repeats the DIV U32 sequence: power-of-two seed, five refinement steps, two predicated corrections.
57 gm107_div_s32:
58 sched (st 0xd wt 0x3f) (st 0x1) (st 0x1 wr 0x0)  // scheduling control; apparently one (...) group per following instruction - TODO confirm field meanings
59 isetp lt and $p2 0x1 $r0 0 1  // $p2 = (dividend < 0); presumably a signed compare since there is no u32 tag
60 isetp lt xor $p3 1 $r1 0 $p2  // $p3 = (divisor < 0) xor $p2: true when the signs differ
61 i2i s32 s32 $r0 abs $r0  // a = |dividend|
62 sched (st 0xf wr 0x1) (st 0xd wr 0x1 wt 0x2) (st 0x1 wt 0x2)
63 i2i s32 s32 $r1 abs $r1  // b = |divisor|
64 flo u32 $r2 $r1  // $r2 = bit index of b's leading one (presumably: flo = find leading one)
65 lop xor 1 $r2 $r2 0x1f  // $r2 ^= 31, i.e. 31 - index for an index in 0..31
66 sched (st 0x6) (st 0x1) (st 0xf wr 0x1)
67 mov $r3 0x1 0xf  // $r3 = 1
68 shl $r2 $r3 $r2  // z = 1 << $r2: power-of-two seed for the reciprocal estimate
69 i2i u32 u32 $r1 neg $r1  // $r1 = -b, kept negated for the whole refinement
70 sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
71 imul u32 u32 $r3 $r1 $r2  // step 1: $r3 = low32(-b * z)
72 imad u32 u32 hi $r2 $r2 $r3 $r2  // step 1: z += high32(z * $r3)
73 imul u32 u32 $r3 $r1 $r2  // step 2
74 sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
75 imad u32 u32 hi $r2 $r2 $r3 $r2
76 imul u32 u32 $r3 $r1 $r2  // step 3
77 imad u32 u32 hi $r2 $r2 $r3 $r2
78 sched (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2) (st 0x6 wr 0x1 wt 0x2)
79 imul u32 u32 $r3 $r1 $r2  // step 4
80 imad u32 u32 hi $r2 $r2 $r3 $r2
81 imul u32 u32 $r3 $r1 $r2  // step 5
82 sched (st 0x6 wr 0x1 rd 0x2 wt 0x2) (st 0x2 wt 0x5) (st 0x6 wr 0x0 rd 0x1 wt 0x2)
83 imad u32 u32 hi $r2 $r2 $r3 $r2
84 mov $r3 $r0 0xf  // save a; $r0 is about to be overwritten
85 imul u32 u32 hi $r0 $r0 $r2  // q = high32(a * z): quotient estimate
86 sched (st 0xf wr 0x1 rd 0x2 wt 0x2) (st 0x6 wr 0x0 wt 0x5) (st 0xd wt 0x3)
87 i2i u32 u32 $r2 neg $r1  // $r2 = -(-b) = b; z is no longer needed
88 imad u32 u32 $r1 $r1 $r0 $r3  // r = (-b) * q + a: remainder for the estimated q
89 isetp ge u32 and $p0 1 $r1 $r2 1  // $p0 = (r >= b), unsigned: estimate was too small
90 sched (st 0x1) (st 0x5) (st 0xd)
91 $p0 iadd $r1 $r1 neg $r2  // first correction: r -= b
92 $p0 iadd $r0 $r0 0x1  // first correction: q += 1
93 $p0 isetp ge u32 and $p0 1 $r1 $r2 1  // re-test only if the first correction ran; otherwise $p0 stays false
94 sched (st 0x1) (st 0x2) (st 0xf wr 0x0)
95 $p0 iadd $r1 $r1 neg $r2  // second correction: r -= b
96 $p0 iadd $r0 $r0 0x1  // second correction: q += 1
97 $p3 i2i s32 s32 $r0 neg $r0  // signs differed: negate the quotient
98 sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf)
99 $p2 i2i s32 s32 $r1 neg $r1  // dividend was negative: negate the modulus
100 ret  // $r0 = quotient, $r1 = modulus
101 nop 0  // fills the third slot of the last sched group
102
103 // STUB: rcp_f64 and rsq_f64 share this single entry point, which returns immediately and performs no computation.
104 gm107_rcp_f64:
105 gm107_rsq_f64:
106 sched (st 0x0) (st 0x0) (st 0x0)
107 ret  // no register is written before returning
108 nop 0  // the two nops fill the remaining slots of the sched group
109 nop 0
110
111 .section #gm107_builtin_offsets  // table of 64-bit entries, one per builtin label defined above
112 .b64 #gm107_div_u32  // entry 0
113 .b64 #gm107_div_s32  // entry 1
114 .b64 #gm107_rcp_f64  // entry 2 (same code as entry 3: both labels mark the shared stub)
115 .b64 #gm107_rsq_f64  // entry 3; NOTE(review): presumably consumed positionally by the driver, so the order should not change - confirm