experiment expressing borrow as ~val+1 instead of -val
[libreriscv.git] / openpower / sv / bitmanip / divmnu64.c
1 /* original source code from Hackers-Delight
2 https://github.com/hcs0/Hackers-Delight
3 */
4 /* This divides an n-word dividend by an m-word divisor, giving an
5 n-m+1-word quotient and m-word remainder. The bignums are in arrays of
6 words. Here a "word" is 32 bits. This routine is designed for a 64-bit
7 machine which has a 64/64 division instruction. */
8
9 #include <stdio.h>
10 #include <stdlib.h> //To define "exit", req'd by XLC.
11 #include <stdbool.h>
12 #include <stdint.h>
13
14 #define max(x, y) ((x) > (y) ? (x) : (y))
15
16 int nlz(unsigned x) {
17 int n;
18
19 if (x == 0) return(32);
20 n = 0;
21 if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
22 if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
23 if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
24 if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
25 if (x <= 0x7FFFFFFF) {n = n + 1;}
26 return n;
27 }
28
29 void dumpit(char *msg, int n, unsigned v[]) {
30 int i;
31 printf("%s", msg);
32 for (i = n-1; i >= 0; i--) printf(" %08x", v[i]);
33 printf("\n");
34 }
35
36 /* q[0], r[0], u[0], and v[0] contain the LEAST significant words.
37 (The sequence is in little-endian order).
38
39 This is a fairly precise implementation of Knuth's Algorithm D, for a
40 binary computer with base b = 2**32. The caller supplies:
41 1. Space q for the quotient, m - n + 1 words (at least one).
42 2. Space r for the remainder (optional), n words.
43 3. The dividend u, m words, m >= 1.
44 4. The divisor v, n words, n >= 2.
45 The most significant digit of the divisor, v[n-1], must be nonzero. The
46 dividend u may have leading zeros; this just makes the algorithm take
47 longer and makes the quotient contain more leading zeros. A value of
48 NULL may be given for the address of the remainder to signify that the
49 caller does not want the remainder.
50 The program does not alter the input parameters u and v.
51 The quotient and remainder returned may have leading zeros. The
52 function itself returns a value of 0 for success and 1 for invalid
53 parameters (e.g., division by 0).
54 For now, we must have m >= n. Knuth's Algorithm D also requires
55 that the dividend be at least as long as the divisor. (In his terms,
56 m >= 0 (unstated). Therefore m+n >= n.) */
57
58 int divmnu(unsigned q[], unsigned r[],
59 const unsigned u[], const unsigned v[],
60 int m, int n) {
61
62 const unsigned long long b = 4294967296LL; // Number base (2**32).
63 unsigned *un, *vn; // Normalized form of u, v.
64 unsigned long long qhat; // Estimated quotient digit.
65 unsigned long long rhat; // A remainder.
66 unsigned long long p; // Product of two digits.
67 long long t, k;
68 int s, i, j;
69
70 if (m < n || n <= 0 || v[n-1] == 0)
71 return 1; // Return if invalid param.
72
73 if (n == 1) { // Take care of
74 k = 0; // the case of a
75 for (j = m - 1; j >= 0; j--) { // single-digit
76 q[j] = (k*b + u[j])/v[0]; // divisor here.
77 k = (k*b + u[j]) - q[j]*v[0];
78 }
79 if (r != NULL) r[0] = k;
80 return 0;
81 }
82
83 /* Normalize by shifting v left just enough so that its high-order
84 bit is on, and shift u left the same amount. We may have to append a
85 high-order digit on the dividend; we do that unconditionally. */
86
87 s = nlz(v[n-1]); // 0 <= s <= 31.
88 vn = (unsigned *)alloca(4*n);
89 for (i = n - 1; i > 0; i--)
90 vn[i] = (v[i] << s) | ((unsigned long long)v[i-1] >> (32-s));
91 vn[0] = v[0] << s;
92
93 un = (unsigned *)alloca(4*(m + 1));
94 un[m] = (unsigned long long)u[m-1] >> (32-s);
95 for (i = m - 1; i > 0; i--)
96 un[i] = (u[i] << s) | ((unsigned long long)u[i-1] >> (32-s));
97 un[0] = u[0] << s;
98
99 for (j = m - n; j >= 0; j--) { // Main loop.
100 // Compute estimate qhat of q[j].
101 qhat = (un[j+n]*b + un[j+n-1])/vn[n-1];
102 rhat = (un[j+n]*b + un[j+n-1]) - qhat*vn[n-1];
103 again:
104 if (qhat >= b || qhat*vn[n-2] > b*rhat + un[j+n-2])
105 { qhat = qhat - 1;
106 rhat = rhat + vn[n-1];
107 if (rhat < b) goto again;
108 }
109
110 #define SUB_MUL_BORROW
111 #ifdef ORIGINAL
112 // Multiply and subtract.
113 k = 0;
114 for (i = 0; i < n; i++) {
115 p = qhat*vn[i];
116 t = un[i+j] - k - (p & 0xFFFFFFFFLL);
117 un[i+j] = t;
118 k = (p >> 32) - (t >> 32);
119 }
120 t = un[j+n] - k;
121 un[j+n] = t;
122 bool need_fixup = t < 0;
123 #elif defined(SUB_MUL_BORROW)
124 (void)p; // shut up unused variable warning
125
126 // Multiply and subtract.
127 uint32_t borrow = 0;
128 uint32_t phi[2000]; // plenty space
129 uint32_t plo[2000]; // plenty space
130 // first, perform mul-and-sub and store in split hi-lo
131 // this shows the vectorised sv.msubx which stores 128-bit in
132 // two 64-bit registers
133 for(int i = 0; i <= n; i++) {
134 uint32_t vn_i = i < n ? vn[i] : 0;
135 uint64_t value = un[i + j] - (uint64_t)qhat * vn_i;
136 plo[i] = value & 0xffffffffLL;
137 phi[i] = value >> 32;
138 }
139 // second, reconstruct the 64-bit result, subtract borrow,
140 // store top-half (-ve) in new borrow and store low-half as answer
141 // this is the new (odd) instruction
142 for(int i = 0; i <= n; i++) {
143 uint64_t value = (((uint64_t)phi[i]<<32) | plo[i]) - borrow;
144 borrow = ~(value >> 32)+1; // -(uint32_t)(value >> 32);
145 un[i + j] = (uint32_t)value;
146 }
147 bool need_fixup = borrow != 0;
148 #elif defined(MUL_RSUB_CARRY)
149 (void)p; // shut up unused variable warning
150
151 // Multiply and subtract.
152 uint32_t carry = 1;
153 uint32_t phi[2000]; // plenty space
154 uint32_t plo[2000]; // plenty space
155 for(int i = 0; i <= n; i++) {
156 uint32_t vn_i = i < n ? vn[i] : 0;
157 uint64_t value = un[i + j] + ~((uint64_t)qhat * vn_i);
158 plo[i] = value & 0xffffffffLL;
159 phi[i] = value >> 32;
160 }
161 for(int i = 0; i <= n; i++) {
162 uint64_t result = (((uint64_t)phi[i]<<32) | plo[i]) + carry;
163 uint32_t result_high = result >> 32;
164 if(carry <= 1)
165 result_high++;
166 carry = result_high;
167 un[i + j] = (uint32_t)result;
168 }
169 bool need_fixup = carry != 1;
170 #elif defined(MUL_RSUB_CARRY1)
171 (void)p; // shut up unused variable warning
172
173 // Multiply and subtract.
174 uint32_t carry = 1;
175 uint32_t phi[2000]; // plenty space
176 uint32_t plo[2000]; // plenty space
177 // same mul-and-sub as SUB_MUL_BORROW but not the same
178 // mul-and-sub-minus-one as MUL_RSUB_CARRY
179 for(int i = 0; i <= n; i++) {
180 uint32_t vn_i = i < n ? vn[i] : 0;
181 uint64_t value = un[i + j] - ((uint64_t)qhat * vn_i);
182 plo[i] = value & 0xffffffffLL;
183 phi[i] = value >> 32;
184 }
185 // compensate for the +1 that was added by mul-and-sub by subtracting
186 // it here (as ~(0))
187 for(int i = 0; i <= n; i++) {
188 uint64_t result = (((uint64_t)phi[i]<<32) | plo[i]) + carry+
189 ~(0); // a way to express "-1"
190 uint32_t result_high = result >> 32;
191 if(carry <= 1)
192 result_high++;
193 carry = result_high;
194 un[i + j] = (uint32_t)result;
195 }
196 bool need_fixup = carry != 1;
197 #else
198 #error need to define one of ORIGINAL, SUB_MUL_BORROW, or MUL_RSUB_CARRY
199 #endif
200
201 q[j] = qhat; // Store quotient digit.
202 if (need_fixup) { // If we subtracted too
203 q[j] = q[j] - 1; // much, add back.
204 k = 0;
205 for (i = 0; i < n; i++) {
206 t = (unsigned long long)un[i+j] + vn[i] + k;
207 un[i+j] = t;
208 k = t >> 32;
209 }
210 un[j+n] = un[j+n] + k;
211 }
212 } // End j.
213 // If the caller wants the remainder, unnormalize
214 // it and pass it back.
215 if (r != NULL) {
216 for (i = 0; i < n-1; i++)
217 r[i] = (un[i] >> s) | ((unsigned long long)un[i+1] << (32-s));
218 r[n-1] = un[n-1] >> s;
219 }
220 return 0;
221 }
222
223 int errors;
224
225 void check(unsigned q[], unsigned r[],
226 unsigned u[], unsigned v[],
227 int m, int n,
228 unsigned cq[], unsigned cr[]) {
229 int i, szq;
230
231 szq = max(m - n + 1, 1);
232 for (i = 0; i < szq; i++) {
233 if (q[i] != cq[i]) {
234 errors = errors + 1;
235 dumpit("Error, dividend u =", m, u);
236 dumpit(" divisor v =", n, v);
237 dumpit("For quotient, got:", m-n+1, q);
238 dumpit(" Should get:", m-n+1, cq);
239 return;
240 }
241 }
242 for (i = 0; i < n; i++) {
243 if (r[i] != cr[i]) {
244 errors = errors + 1;
245 dumpit("Error, dividend u =", m, u);
246 dumpit(" divisor v =", n, v);
247 dumpit("For remainder, got:", n, r);
248 dumpit(" Should get:", n, cr);
249 return;
250 }
251 }
252 return;
253 }
254
255 int main() {
256 static unsigned test[] = {
257 // m, n, u..., v..., cq..., cr....
258 1, 1, 3, 0, 1, 1, // Error, divide by 0.
259 1, 2, 7, 1,3, 0, 7,0, // Error, n > m.
260 2, 2, 0,0, 1,0, 0, 0,0, // Error, incorrect remainder cr.
261 1, 1, 3, 2, 1, 1,
262 1, 1, 3, 3, 1, 0,
263 1, 1, 3, 4, 0, 3,
264 1, 1, 0, 0xffffffff, 0, 0,
265 1, 1, 0xffffffff, 1, 0xffffffff, 0,
266 1, 1, 0xffffffff, 0xffffffff, 1, 0,
267 1, 1, 0xffffffff, 3, 0x55555555, 0,
268 2, 1, 0xffffffff,0xffffffff, 1, 0xffffffff,0xffffffff, 0,
269 2, 1, 0xffffffff,0xffffffff, 0xffffffff, 1,1, 0,
270 2, 1, 0xffffffff,0xfffffffe, 0xffffffff, 0xffffffff,0, 0xfffffffe,
271 2, 1, 0x00005678,0x00001234, 0x00009abc, 0x1e1dba76,0, 0x6bd0,
272 2, 2, 0,0, 0,1, 0, 0,0,
273 2, 2, 0,7, 0,3, 2, 0,1,
274 2, 2, 5,7, 0,3, 2, 5,1,
275 2, 2, 0,6, 0,2, 3, 0,0,
276 1, 1, 0x80000000, 0x40000001, 0x00000001, 0x3fffffff,
277 2, 1, 0x00000000,0x80000000, 0x40000001, 0xfffffff8,0x00000001, 0x00000008,
278 2, 2, 0x00000000,0x80000000, 0x00000001,0x40000000, 0x00000001, 0xffffffff,0x3fffffff,
279 2, 2, 0x0000789a,0x0000bcde, 0x0000789a,0x0000bcde, 1, 0,0,
280 2, 2, 0x0000789b,0x0000bcde, 0x0000789a,0x0000bcde, 1, 1,0,
281 2, 2, 0x00007899,0x0000bcde, 0x0000789a,0x0000bcde, 0, 0x00007899,0x0000bcde,
282 2, 2, 0x0000ffff,0x0000ffff, 0x0000ffff,0x0000ffff, 1, 0,0,
283 2, 2, 0x0000ffff,0x0000ffff, 0x00000000,0x00000001, 0x0000ffff, 0x0000ffff,0,
284 3, 2, 0x000089ab,0x00004567,0x00000123, 0x00000000,0x00000001, 0x00004567,0x00000123, 0x000089ab,0,
285 3, 2, 0x00000000,0x0000fffe,0x00008000, 0x0000ffff,0x00008000, 0xffffffff,0x00000000, 0x0000ffff,0x00007fff, // Shows that first qhat can = b + 1.
286 3, 3, 0x00000003,0x00000000,0x80000000, 0x00000001,0x00000000,0x20000000, 0x00000003, 0,0,0x20000000, // Adding back step req'd.
287 3, 3, 0x00000003,0x00000000,0x00008000, 0x00000001,0x00000000,0x00002000, 0x00000003, 0,0,0x00002000, // Adding back step req'd.
288 4, 3, 0,0,0x00008000,0x00007fff, 1,0,0x00008000, 0xfffe0000,0, 0x00020000,0xffffffff,0x00007fff, // Add back req'd.
289 4, 3, 0,0x0000fffe,0,0x00008000, 0x0000ffff,0,0x00008000, 0xffffffff,0, 0x0000ffff,0xffffffff,0x00007fff, // Shows that mult-sub quantity cannot be treated as signed.
290 4, 3, 0,0xfffffffe,0,0x80000000, 0x0000ffff,0,0x80000000, 0x00000000,1, 0x00000000,0xfffeffff,0x00000000, // Shows that mult-sub quantity cannot be treated as signed.
291 4, 3, 0,0xfffffffe,0,0x80000000, 0xffffffff,0,0x80000000, 0xffffffff,0, 0xffffffff,0xffffffff,0x7fffffff, // Shows that mult-sub quantity cannot be treated as signed.
292 };
293 int i, n, m, ncases, f;
294 unsigned q[10], r[10];
295 unsigned *u, *v, *cq, *cr;
296
297 printf("divmnu:\n");
298 i = 0;
299 ncases = 0;
300 while (i < sizeof(test)/4) {
301 m = test[i];
302 n = test[i+1];
303 u = &test[i+2];
304 v = &test[i+2+m];
305 cq = &test[i+2+m+n];
306 cr = &test[i+2+m+n+max(m-n+1, 1)];
307
308 f = divmnu(q, r, u, v, m, n);
309 if (f) {
310 dumpit("Error return code for dividend u =", m, u);
311 dumpit(" divisor v =", n, v);
312 errors = errors + 1;
313 }
314 else
315 check(q, r, u, v, m, n, cq, cr);
316 i = i + 2 + m + n + max(m-n+1, 1) + n;
317 ncases = ncases + 1;
318 }
319
320 printf("%d errors out of %d cases; there should be 3.\n", errors, ncases);
321 return 0;
322 }