(no commit message)
[libreriscv.git] / openpower / sv / biginteger / divmnu64.c
1 /* original source code from Hackers-Delight
2 https://github.com/hcs0/Hackers-Delight
3 */
4 /* This divides an n-word dividend by an m-word divisor, giving an
5 n-m+1-word quotient and m-word remainder. The bignums are in arrays of
6 words. Here a "word" is 32 bits. This routine is designed for a 64-bit
7 machine which has a 64/64 division instruction. */
8
9 #include <stdbool.h>
10 #include <stdint.h>
11 #include <stdio.h>
12 #include <stdlib.h> //To define "exit", req'd by XLC.
13
14 #define max(x, y) ((x) > (y) ? (x) : (y))
15
16 int nlz(unsigned x)
17 {
18 int n;
19
20 if (x == 0)
21 return (32);
22 n = 0;
23 if (x <= 0x0000FFFF)
24 {
25 n = n + 16;
26 x = x << 16;
27 }
28 if (x <= 0x00FFFFFF)
29 {
30 n = n + 8;
31 x = x << 8;
32 }
33 if (x <= 0x0FFFFFFF)
34 {
35 n = n + 4;
36 x = x << 4;
37 }
38 if (x <= 0x3FFFFFFF)
39 {
40 n = n + 2;
41 x = x << 2;
42 }
43 if (x <= 0x7FFFFFFF)
44 {
45 n = n + 1;
46 }
47 return n;
48 }
49
50 void dumpit(char *msg, int n, unsigned v[])
51 {
52 int i;
53 printf("%s", msg);
54 for (i = n - 1; i >= 0; i--)
55 printf(" %08x", v[i]);
56 printf("\n");
57 }
58
59 typedef struct
60 {
61 uint32_t q, r;
62 bool overflow;
63 } divrem_t;
64
65 divrem_t divrem_64_by_32(uint64_t n, uint32_t d)
66 {
67 if ((n >> 32) >= d) // overflow
68 {
69 return (divrem_t){.q = UINT32_MAX, .r = 0, .overflow = true};
70 }
71 else
72 {
73 return (divrem_t){.q = n / d, .r = n % d, .overflow = false};
74 }
75 }
76
77 bool bigmul(uint32_t qhat, unsigned product[], unsigned vn[], int m, int n)
78 {
79 uint32_t carry = 0;
80 // VL = n + 1
81 // sv.madded product.v, vn.v, qhat.s, carry.s
82 for (int i = 0; i <= n; i++)
83 {
84 uint32_t vn_v = i < n ? vn[i] : 0;
85 uint64_t value = (uint64_t)vn_v * (uint64_t)qhat + carry;
86 carry = (uint32_t)(value >> 32);
87 product[i] = (uint32_t)value;
88 }
89 return carry != 0;
90 }
91
92 bool bigadd(unsigned result[], unsigned vn[], unsigned un[], int m, int n,
93 bool ca)
94 {
95 // VL = n + 1
96 // sv.subfe un_j.v, product.v, un_j.v
97 for (int i = 0; i <= n; i++)
98 {
99 uint64_t value = (uint64_t)vn[i] + (uint64_t)un[i] + ca;
100 ca = value >> 32 != 0;
101 result[i] = value;
102 }
103 return ca;
104 }
105
106 bool bigsub(unsigned result[], unsigned vn[], unsigned un[], int m, int n,
107 bool ca)
108 {
109 // VL = n + 1
110 // sv.subfe un_j.v, product.v, un_j.v
111 for (int i = 0; i <= n; i++)
112 {
113 uint64_t value = (uint64_t)~vn[i] + (uint64_t)un[i] + ca;
114 ca = value >> 32 != 0;
115 result[i] = value;
116 }
117 return ca;
118 }
119
120 bool bigmulsub(unsigned long long qhat, unsigned vn[], unsigned un[], int j,
121 int m, int n)
122 {
123 // Multiply and subtract.
124 uint32_t product[n + 1];
125 bigmul(qhat, product, vn, m, n);
126 bool ca = bigsub(un, product, un, m, n, true);
127 bool need_fixup = !ca;
128 return need_fixup;
129 }
130
131 /* q[0], r[0], u[0], and v[0] contain the LEAST significant words.
132 (The sequence is in little-endian order).
133
134 This is a fairly precise implementation of Knuth's Algorithm D, for a
135 binary computer with base b = 2**32. The caller supplies:
136 1. Space q for the quotient, m - n + 1 words (at least one).
137 2. Space r for the remainder (optional), n words.
138 3. The dividend u, m words, m >= 1.
139 4. The divisor v, n words, n >= 2.
140 The most significant digit of the divisor, v[n-1], must be nonzero. The
141 dividend u may have leading zeros; this just makes the algorithm take
142 longer and makes the quotient contain more leading zeros. A value of
143 NULL may be given for the address of the remainder to signify that the
144 caller does not want the remainder.
145 The program does not alter the input parameters u and v.
146 The quotient and remainder returned may have leading zeros. The
147 function itself returns a value of 0 for success and 1 for invalid
148 parameters (e.g., division by 0).
149 For now, we must have m >= n. Knuth's Algorithm D also requires
150 that the dividend be at least as long as the divisor. (In his terms,
151 m >= 0 (unstated). Therefore m+n >= n.) */
152
153 int divmnu(unsigned q[], unsigned r[], const unsigned u[], const unsigned v[],
154 int m, int n)
155 {
156
157 const unsigned long long b = 1LL << 32; // Number base (2**32).
158 unsigned *un, *vn; // Normalized form of u, v.
159 unsigned long long qhat; // Estimated quotient digit.
160 unsigned long long rhat; // A remainder.
161 unsigned long long p; // Product of two digits.
162 long long t, k;
163 int s, i, j;
164
165 if (m < n || n <= 0 || v[n - 1] == 0)
166 return 1; // Return if invalid param.
167
168 if (n == 1)
169 { // Take care of
170 k = 0; // the case of a
171 for (j = m - 1; j >= 0; j--)
172 { // single-digit
173 uint64_t dig2 = ((uint64_t)k << 32) | u[j];
174 q[j] = dig2 / v[0]; // divisor here.
175 k = dig2 % v[0]; // modulo back into next loop
176 }
177 if (r != NULL)
178 r[0] = k;
179 return 0;
180 }
181
182 /* Normalize by shifting v left just enough so that its high-order
183 bit is on, and shift u left the same amount. We may have to append a
184 high-order digit on the dividend; we do that unconditionally. */
185
186 s = nlz(v[n - 1]); // 0 <= s <= 31.
187 vn = (unsigned *)alloca(4 * n);
188 for (i = n - 1; i > 0; i--)
189 vn[i] = (v[i] << s) | ((unsigned long long)v[i - 1] >> (32 - s));
190 vn[0] = v[0] << s;
191
192 un = (unsigned *)alloca(4 * (m + 1));
193 un[m] = (unsigned long long)u[m - 1] >> (32 - s);
194 for (i = m - 1; i > 0; i--)
195 un[i] = (u[i] << s) | ((unsigned long long)u[i - 1] >> (32 - s));
196 un[0] = u[0] << s;
197
198 for (j = m - n; j >= 0; j--)
199 { // Main loop.
200 uint32_t *un_j = &un[j];
201 // Compute estimate qhat of q[j] from top 2 digits.
202 uint64_t dig2 = ((uint64_t)un[j + n] << 32) | un[j + n - 1];
203 divrem_t qr = divrem_64_by_32(dig2, vn[n - 1]);
204 qhat = qr.q;
205 rhat = qr.r;
206 if (qr.overflow)
207 {
208 // rhat can be bigger than 32-bit when the division overflows,
209 // so rhat's computation can't be folded into divrem_64_by_32
210 rhat = dig2 - (uint64_t)qr.q * vn[n - 1];
211 }
212 again:
213 // use 3rd-from-top digit to obtain better accuracy
214 if (rhat < b && qhat * vn[n - 2] > b * rhat + un[j + n - 2])
215 {
216 qhat = qhat - 1;
217 rhat = rhat + vn[n - 1];
218 if (rhat < b)
219 goto again;
220 }
221 // don't define here, allowing easily testing all options by passing -D...
222 // #define MUL_RSUB_CARRY_2_STAGE2
223 #ifdef ORIGINAL
224 // Multiply and subtract.
225 k = 0;
226 for (i = 0; i < n; i++)
227 {
228 p = qhat * vn[i];
229 t = un[i + j] - k - (p & 0xFFFFFFFFLL);
230 un[i + j] = t;
231 k = (p >> 32) - (t >> 32);
232 }
233 t = un[j + n] - k;
234 un[j + n] = t;
235 bool need_fixup = t < 0;
236 #elif defined(SUB_MUL_BORROW)
237 (void)p, (void)t; // shut up unused variable warning
238
239 // Multiply and subtract.
240 uint32_t borrow = 0;
241 for (int i = 0; i <= n; i++)
242 {
243 uint32_t vn_i = i < n ? vn[i] : 0;
244 uint64_t value = un[i + j] - (uint64_t)qhat * vn_i - borrow;
245 borrow = -(uint32_t)(value >> 32);
246 un[i + j] = (uint32_t)value;
247 }
248 bool need_fixup = borrow != 0;
249 #elif defined(MUL_RSUB_CARRY)
250 (void)p, (void)t; // shut up unused variable warning
251
252 // Multiply and subtract.
253 uint32_t carry = 1;
254 for (int i = 0; i <= n; i++)
255 {
256 uint32_t vn_i = i < n ? vn[i] : 0;
257 uint64_t result = un[i + j] + ~((uint64_t)qhat * vn_i) + carry;
258 uint32_t result_high = result >> 32;
259 if (carry <= 1)
260 result_high++;
261 carry = result_high;
262 un[i + j] = (uint32_t)result;
263 }
264 bool need_fixup = carry != 1;
265 #elif defined(SUB_MUL_BORROW_2_STAGE)
266 (void)p, (void)t; // shut up unused variable warning
267
268 // Multiply and subtract.
269 uint32_t borrow = 0;
270 uint32_t phi[2000]; // plenty space
271 uint32_t plo[2000]; // plenty space
272 // first, perform mul-and-sub and store in split hi-lo
273 // this shows the vectorised sv.msubx which stores 128-bit in
274 // two 64-bit registers
275 for (int i = 0; i <= n; i++)
276 {
277 uint32_t vn_i = i < n ? vn[i] : 0;
278 uint64_t value = un[i + j] - (uint64_t)qhat * vn_i;
279 plo[i] = value & 0xffffffffLL;
280 phi[i] = value >> 32;
281 }
282 // second, reconstruct the 64-bit result, subtract borrow,
283 // store top-half (-ve) in new borrow and store low-half as answer
284 // this is the new (odd) instruction
285 for (int i = 0; i <= n; i++)
286 {
287 uint64_t value = (((uint64_t)phi[i] << 32) | plo[i]) - borrow;
288 borrow = ~(value >> 32) + 1; // -(uint32_t)(value >> 32);
289 un[i + j] = (uint32_t)value;
290 }
291 bool need_fixup = borrow != 0;
292 #elif defined(MUL_RSUB_CARRY_2_STAGE)
293 (void)p, (void)t; // shut up unused variable warning
294
295 // Multiply and subtract.
296 uint32_t carry = 1;
297 uint32_t phi[2000]; // plenty space
298 uint32_t plo[2000]; // plenty space
299 for (int i = 0; i <= n; i++)
300 {
301 uint32_t vn_i = i < n ? vn[i] : 0;
302 uint64_t value = un[i + j] + ~((uint64_t)qhat * vn_i);
303 plo[i] = value & 0xffffffffLL;
304 phi[i] = value >> 32;
305 }
306 for (int i = 0; i <= n; i++)
307 {
308 uint64_t result = (((uint64_t)phi[i] << 32) | plo[i]) + carry;
309 uint32_t result_high = result >> 32;
310 if (carry <= 1)
311 result_high++;
312 carry = result_high;
313 un[i + j] = (uint32_t)result;
314 }
315 bool need_fixup = carry != 1;
316 #elif defined(MUL_RSUB_CARRY_2_STAGE1)
317 (void)p, (void)t; // shut up unused variable warning
318
319 // Multiply and subtract.
320 uint32_t carry = 1;
321 uint32_t phi[2000]; // plenty space
322 uint32_t plo[2000]; // plenty space
323 // same mul-and-sub as SUB_MUL_BORROW but not the same
324 // mul-and-sub-minus-one as MUL_RSUB_CARRY
325 for (int i = 0; i <= n; i++)
326 {
327 uint32_t vn_i = i < n ? vn[i] : 0;
328 uint64_t value = un[i + j] - ((uint64_t)qhat * vn_i);
329 plo[i] = value & 0xffffffffLL;
330 phi[i] = value >> 32;
331 }
332 // compensate for the +1 that was added by mul-and-sub by subtracting
333 // it here (as ~(0))
334 for (int i = 0; i <= n; i++)
335 {
336 uint64_t result = (((uint64_t)phi[i] << 32) | plo[i]) + carry +
337 ~(0); // a way to express "-1"
338 uint32_t result_high = result >> 32;
339 if (carry <= 1)
340 result_high++;
341 carry = result_high;
342 un[i + j] = (uint32_t)result;
343 }
344 bool need_fixup = carry != 1;
345 #elif defined(MUL_RSUB_CARRY_2_STAGE2)
346 (void)p, (void)t; // shut up unused variable warning
347
348 // Multiply and subtract.
349 uint32_t carry = 0;
350 uint32_t phi[2000]; // plenty space
351 uint32_t plo[2000]; // plenty space
352 // same mul-and-sub as SUB_MUL_BORROW but not the same
353 // mul-and-sub-minus-one as MUL_RSUB_CARRY
354 for (int i = 0; i <= n; i++)
355 {
356 uint32_t vn_i = i < n ? vn[i] : 0;
357 uint64_t value = un[i + j] - ((uint64_t)qhat * vn_i);
358 plo[i] = value & 0xffffffffLL;
359 phi[i] = value >> 32;
360 }
361 // NOW it starts to make sense. when no carry this time, next
362 // carry as-is. rlse next carry reduces by one.
363 // it here (as ~(0))
364 for (int i = 0; i <= n; i++)
365 {
366 uint64_t result = (((uint64_t)phi[i] << 32) | plo[i]) + carry;
367 uint32_t result_high = result >> 32;
368 if (carry == 0)
369 carry = result_high;
370 else
371 carry = result_high - 1;
372 un[i + j] = (uint32_t)result;
373 }
374 bool need_fixup = carry != 0;
375 #elif defined(MADDED_SUBFE)
376 (void)p, (void)t; // shut up unused variable warning
377
378 // Multiply and subtract.
379 uint32_t carry = 0;
380 uint32_t product[n + 1];
381 // VL = n + 1
382 // sv.madded product.v, vn.v, qhat.s, carry.s
383 for (int i = 0; i <= n; i++)
384 {
385 uint32_t vn_v = i < n ? vn[i] : 0;
386 uint64_t value = (uint64_t)vn_v * (uint64_t)qhat + carry;
387 carry = (uint32_t)(value >> 32);
388 product[i] = (uint32_t)value;
389 }
390 bool ca = true;
391 // VL = n + 1
392 // sv.subfe un_j.v, product.v, un_j.v
393 for (int i = 0; i <= n; i++)
394 {
395 uint64_t value = (uint64_t)~product[i] + (uint64_t)un_j[i] + ca;
396 ca = value >> 32 != 0;
397 un_j[i] = value;
398 }
399 bool need_fixup = !ca;
400 #else
401 #error need to choose one of the algorithm options; e.g. -DORIGINAL
402 #endif
403
404 q[j] = qhat; // Store quotient digit.
405 if (need_fixup)
406 { // If we subtracted too
407 q[j] = q[j] - 1; // much, add back.
408 bigadd(un_j, vn, un_j, m, n, 0);
409 }
410 } // End j.
411 // If the caller wants the remainder, unnormalize
412 // it and pass it back.
413 if (r != NULL)
414 {
415 for (i = 0; i < n - 1; i++)
416 r[i] = (un[i] >> s) | ((unsigned long long)un[i + 1] << (32 - s));
417 r[n - 1] = un[n - 1] >> s;
418 }
419 return 0;
420 }
421
422 int errors;
423
424 void check(unsigned q[], unsigned r[], unsigned u[], unsigned v[], int m,
425 int n, unsigned cq[], unsigned cr[])
426 {
427 int i, szq;
428
429 szq = max(m - n + 1, 1);
430 for (i = 0; i < szq; i++)
431 {
432 if (q[i] != cq[i])
433 {
434 errors = errors + 1;
435 dumpit("Error, dividend u =", m, u);
436 dumpit(" divisor v =", n, v);
437 dumpit("For quotient, got:", m - n + 1, q);
438 dumpit(" Should get:", m - n + 1, cq);
439 return;
440 }
441 }
442 for (i = 0; i < n; i++)
443 {
444 if (r[i] != cr[i])
445 {
446 errors = errors + 1;
447 dumpit("Error, dividend u =", m, u);
448 dumpit(" divisor v =", n, v);
449 dumpit("For remainder, got:", n, r);
450 dumpit(" Should get:", n, cr);
451 return;
452 }
453 }
454 return;
455 }
456
457 int main()
458 {
459 static struct
460 {
461 int m, n;
462 uint32_t u[10], v[10], cq[10], cr[10];
463 bool error;
464 } test[] = {
465 // clang-format off
466 {.m=1, .n=1, .u={3}, .v={0}, .cq={1}, .cr={1}, .error=true}, // Error, divide by 0.
467 {.m=1, .n=2, .u={7}, .v={1,3}, .cq={0}, .cr={7,0}, .error=true}, // Error, n > m.
468 {.m=2, .n=2, .u={0,0}, .v={1,0}, .cq={0}, .cr={0,0}, .error=true}, // Error, incorrect remainder cr.
469 {.m=1, .n=1, .u={3}, .v={2}, .cq={1}, .cr={1}},
470 {.m=1, .n=1, .u={3}, .v={3}, .cq={1}, .cr={0}},
471 {.m=1, .n=1, .u={3}, .v={4}, .cq={0}, .cr={3}},
472 {.m=1, .n=1, .u={0}, .v={0xffffffff}, .cq={0}, .cr={0}},
473 {.m=1, .n=1, .u={0xffffffff}, .v={1}, .cq={0xffffffff}, .cr={0}},
474 {.m=1, .n=1, .u={0xffffffff}, .v={0xffffffff}, .cq={1}, .cr={0}},
475 {.m=1, .n=1, .u={0xffffffff}, .v={3}, .cq={0x55555555}, .cr={0}},
476 {.m=2, .n=1, .u={0xffffffff,0xffffffff}, .v={1}, .cq={0xffffffff,0xffffffff}, .cr={0}},
477 {.m=2, .n=1, .u={0xffffffff,0xffffffff}, .v={0xffffffff}, .cq={1,1}, .cr={0}},
478 {.m=2, .n=1, .u={0xffffffff,0xfffffffe}, .v={0xffffffff}, .cq={0xffffffff,0}, .cr={0xfffffffe}},
479 {.m=2, .n=1, .u={0x00005678,0x00001234}, .v={0x00009abc}, .cq={0x1e1dba76,0}, .cr={0x6bd0}},
480 {.m=2, .n=2, .u={0,0}, .v={0,1}, .cq={0}, .cr={0,0}},
481 {.m=2, .n=2, .u={0,7}, .v={0,3}, .cq={2}, .cr={0,1}},
482 {.m=2, .n=2, .u={5,7}, .v={0,3}, .cq={2}, .cr={5,1}},
483 {.m=2, .n=2, .u={0,6}, .v={0,2}, .cq={3}, .cr={0,0}},
484 {.m=1, .n=1, .u={0x80000000}, .v={0x40000001}, .cq={0x00000001}, .cr={0x3fffffff}},
485 {.m=2, .n=1, .u={0x00000000,0x80000000}, .v={0x40000001}, .cq={0xfffffff8,0x00000001}, .cr={0x00000008}},
486 {.m=2, .n=2, .u={0x00000000,0x80000000}, .v={0x00000001,0x40000000}, .cq={0x00000001}, .cr={0xffffffff,0x3fffffff}},
487 {.m=2, .n=2, .u={0x0000789a,0x0000bcde}, .v={0x0000789a,0x0000bcde}, .cq={1}, .cr={0,0}},
488 {.m=2, .n=2, .u={0x0000789b,0x0000bcde}, .v={0x0000789a,0x0000bcde}, .cq={1}, .cr={1,0}},
489 {.m=2, .n=2, .u={0x00007899,0x0000bcde}, .v={0x0000789a,0x0000bcde}, .cq={0}, .cr={0x00007899,0x0000bcde}},
490 {.m=2, .n=2, .u={0x0000ffff,0x0000ffff}, .v={0x0000ffff,0x0000ffff}, .cq={1}, .cr={0,0}},
491 {.m=2, .n=2, .u={0x0000ffff,0x0000ffff}, .v={0x00000000,0x00000001}, .cq={0x0000ffff}, .cr={0x0000ffff,0}},
492 {.m=3, .n=2, .u={0x000089ab,0x00004567,0x00000123}, .v={0x00000000,0x00000001}, .cq={0x00004567,0x00000123}, .cr={0x000089ab,0}},
493 {.m=3, .n=2, .u={0x00000000,0x0000fffe,0x00008000}, .v={0x0000ffff,0x00008000}, .cq={0xffffffff,0x00000000}, .cr={0x0000ffff,0x00007fff}}, // Shows that first qhat can = b + 1.
494 {.m=3, .n=3, .u={0x00000003,0x00000000,0x80000000}, .v={0x00000001,0x00000000,0x20000000}, .cq={0x00000003}, .cr={0,0,0x20000000}}, // Adding back step req'd.
495 {.m=3, .n=3, .u={0x00000003,0x00000000,0x00008000}, .v={0x00000001,0x00000000,0x00002000}, .cq={0x00000003}, .cr={0,0,0x00002000}}, // Adding back step req'd.
496 {.m=4, .n=3, .u={0,0,0x00008000,0x00007fff}, .v={1,0,0x00008000}, .cq={0xfffe0000,0}, .cr={0x00020000,0xffffffff,0x00007fff}}, // Add back req'd.
497 {.m=4, .n=3, .u={0,0x0000fffe,0,0x00008000}, .v={0x0000ffff,0,0x00008000}, .cq={0xffffffff,0}, .cr={0x0000ffff,0xffffffff,0x00007fff}}, // Shows that mult-sub quantity cannot be treated as signed.
498 {.m=4, .n=3, .u={0,0xfffffffe,0,0x80000000}, .v={0x0000ffff,0,0x80000000}, .cq={0x00000000,1}, .cr={0x00000000,0xfffeffff,0x00000000}}, // Shows that mult-sub quantity cannot be treated as signed.
499 {.m=4, .n=3, .u={0,0xfffffffe,0,0x80000000}, .v={0xffffffff,0,0x80000000}, .cq={0xffffffff,0}, .cr={0xffffffff,0xffffffff,0x7fffffff}}, // Shows that mult-sub quantity cannot be treated as signed.
500 // clang-format on
501 };
502 const int ncases = sizeof(test) / sizeof(test[0]);
503 unsigned q[10], r[10];
504
505 printf("divmnu:\n");
506 for (int i = 0; i < ncases; i++)
507 {
508 int m = test[i].m;
509 int n = test[i].n;
510 uint32_t *u = test[i].u;
511 uint32_t *v = test[i].v;
512 uint32_t *cq = test[i].cq;
513 uint32_t *cr = test[i].cr;
514
515 int f = divmnu(q, r, u, v, m, n);
516 if (f && !test[i].error)
517 {
518 dumpit("Unexpected error return code for dividend u =", m, u);
519 dumpit(" divisor v =", n, v);
520 errors = errors + 1;
521 }
522 else if (!f && test[i].error)
523 {
524 dumpit("Unexpected success return code for dividend u =", m, u);
525 dumpit(" divisor v =", n, v);
526 errors = errors + 1;
527 }
528
529 if (!f)
530 check(q, r, u, v, m, n, cq, cr);
531 }
532
533 printf("%d test failures out of %d cases.\n", errors, ncases);
534 return 0;
535 }