(no commit message)
[libreriscv.git] / openpower / sv / biginteger / divmnu64.c
1 /* original source code from Hackers-Delight
2 https://github.com/hcs0/Hackers-Delight
3 */
4 /* This divides an n-word dividend by an m-word divisor, giving an
5 n-m+1-word quotient and m-word remainder. The bignums are in arrays of
6 words. Here a "word" is 32 bits. This routine is designed for a 64-bit
7 machine which has a 64/64 division instruction. */
8
9 #include <stdbool.h>
10 #include <stdint.h>
11 #include <stdio.h>
12 #include <stdlib.h> //To define "exit", req'd by XLC.
13
14 #define max(x, y) ((x) > (y) ? (x) : (y))
15
16 int nlz(unsigned x)
17 {
18 int n;
19
20 if (x == 0)
21 return (32);
22 n = 0;
23 if (x <= 0x0000FFFF)
24 {
25 n = n + 16;
26 x = x << 16;
27 }
28 if (x <= 0x00FFFFFF)
29 {
30 n = n + 8;
31 x = x << 8;
32 }
33 if (x <= 0x0FFFFFFF)
34 {
35 n = n + 4;
36 x = x << 4;
37 }
38 if (x <= 0x3FFFFFFF)
39 {
40 n = n + 2;
41 x = x << 2;
42 }
43 if (x <= 0x7FFFFFFF)
44 {
45 n = n + 1;
46 }
47 return n;
48 }
49
50 void dumpit(char *msg, int n, unsigned v[])
51 {
52 int i;
53 printf("%s", msg);
54 for (i = n - 1; i >= 0; i--)
55 printf(" %08x", v[i]);
56 printf("\n");
57 }
58
59 /* q[0], r[0], u[0], and v[0] contain the LEAST significant words.
60 (The sequence is in little-endian order).
61
62 This is a fairly precise implementation of Knuth's Algorithm D, for a
63 binary computer with base b = 2**32. The caller supplies:
64 1. Space q for the quotient, m - n + 1 words (at least one).
65 2. Space r for the remainder (optional), n words.
66 3. The dividend u, m words, m >= 1.
67 4. The divisor v, n words, n >= 2.
68 The most significant digit of the divisor, v[n-1], must be nonzero. The
69 dividend u may have leading zeros; this just makes the algorithm take
70 longer and makes the quotient contain more leading zeros. A value of
71 NULL may be given for the address of the remainder to signify that the
72 caller does not want the remainder.
73 The program does not alter the input parameters u and v.
74 The quotient and remainder returned may have leading zeros. The
75 function itself returns a value of 0 for success and 1 for invalid
76 parameters (e.g., division by 0).
77 For now, we must have m >= n. Knuth's Algorithm D also requires
78 that the dividend be at least as long as the divisor. (In his terms,
79 m >= 0 (unstated). Therefore m+n >= n.) */
80
81 int divmnu(unsigned q[], unsigned r[], const unsigned u[], const unsigned v[],
82 int m, int n)
83 {
84
85 const unsigned long long b = 1LL << 32; // Number base (2**32).
86 unsigned *un, *vn; // Normalized form of u, v.
87 unsigned long long qhat; // Estimated quotient digit.
88 unsigned long long rhat; // A remainder.
89 unsigned long long p; // Product of two digits.
90 long long t, k;
91 int s, i, j;
92
93 if (m < n || n <= 0 || v[n - 1] == 0)
94 return 1; // Return if invalid param.
95
96 if (n == 1)
97 { // Take care of
98 k = 0; // the case of a
99 for (j = m - 1; j >= 0; j--)
100 { // single-digit
101 q[j] = (k * b + u[j]) / v[0]; // divisor here.
102 k = (k * b + u[j]) - q[j] * v[0];
103 }
104 if (r != NULL)
105 r[0] = k;
106 return 0;
107 }
108
109 /* Normalize by shifting v left just enough so that its high-order
110 bit is on, and shift u left the same amount. We may have to append a
111 high-order digit on the dividend; we do that unconditionally. */
112
113 s = nlz(v[n - 1]); // 0 <= s <= 31.
114 vn = (unsigned *)alloca(4 * n);
115 for (i = n - 1; i > 0; i--)
116 vn[i] = (v[i] << s) | ((unsigned long long)v[i - 1] >> (32 - s));
117 vn[0] = v[0] << s;
118
119 un = (unsigned *)alloca(4 * (m + 1));
120 un[m] = (unsigned long long)u[m - 1] >> (32 - s);
121 for (i = m - 1; i > 0; i--)
122 un[i] = (u[i] << s) | ((unsigned long long)u[i - 1] >> (32 - s));
123 un[0] = u[0] << s;
124
125 for (j = m - n; j >= 0; j--)
126 { // Main loop.
127 // Compute estimate qhat of q[j] from top 2 digits
128 uint64_t dig2 = ((uint64_t)un[j + n] << 32) | un[j + n - 1];
129 qhat = dig2 / vn[n - 1];
130 rhat = dig2 % vn[n - 1];
131 again:
132 // use 3rd-from-top digit to obtain better accuracy
133 if (qhat >= b || qhat * vn[n - 2] > b * rhat + un[j + n - 2])
134 {
135 qhat = qhat - 1;
136 rhat = rhat + vn[n - 1];
137 if (rhat < b)
138 goto again;
139 }
140 // don't define here, allowing easily testing all options by passing -D...
141 // #define MUL_RSUB_CARRY_2_STAGE2
142 #ifdef ORIGINAL
143 // Multiply and subtract.
144 k = 0;
145 for (i = 0; i < n; i++)
146 {
147 p = qhat * vn[i];
148 t = un[i + j] - k - (p & 0xFFFFFFFFLL);
149 un[i + j] = t;
150 k = (p >> 32) - (t >> 32);
151 }
152 t = un[j + n] - k;
153 un[j + n] = t;
154 bool need_fixup = t < 0;
155 #elif defined(SUB_MUL_BORROW)
156 (void)p; // shut up unused variable warning
157
158 // Multiply and subtract.
159 uint32_t borrow = 0;
160 for (int i = 0; i <= n; i++)
161 {
162 uint32_t vn_i = i < n ? vn[i] : 0;
163 uint64_t value = un[i + j] - (uint64_t)qhat * vn_i - borrow;
164 borrow = -(uint32_t)(value >> 32);
165 un[i + j] = (uint32_t)value;
166 }
167 bool need_fixup = borrow != 0;
168 #elif defined(MUL_RSUB_CARRY)
169 (void)p; // shut up unused variable warning
170
171 // Multiply and subtract.
172 uint32_t carry = 1;
173 for (int i = 0; i <= n; i++)
174 {
175 uint32_t vn_i = i < n ? vn[i] : 0;
176 uint64_t result = un[i + j] + ~((uint64_t)qhat * vn_i) + carry;
177 uint32_t result_high = result >> 32;
178 if (carry <= 1)
179 result_high++;
180 carry = result_high;
181 un[i + j] = (uint32_t)result;
182 }
183 bool need_fixup = carry != 1;
184 #elif defined(SUB_MUL_BORROW_2_STAGE)
185 (void)p; // shut up unused variable warning
186
187 // Multiply and subtract.
188 uint32_t borrow = 0;
189 uint32_t phi[2000]; // plenty space
190 uint32_t plo[2000]; // plenty space
191 // first, perform mul-and-sub and store in split hi-lo
192 // this shows the vectorised sv.msubx which stores 128-bit in
193 // two 64-bit registers
194 for (int i = 0; i <= n; i++)
195 {
196 uint32_t vn_i = i < n ? vn[i] : 0;
197 uint64_t value = un[i + j] - (uint64_t)qhat * vn_i;
198 plo[i] = value & 0xffffffffLL;
199 phi[i] = value >> 32;
200 }
201 // second, reconstruct the 64-bit result, subtract borrow,
202 // store top-half (-ve) in new borrow and store low-half as answer
203 // this is the new (odd) instruction
204 for (int i = 0; i <= n; i++)
205 {
206 uint64_t value = (((uint64_t)phi[i] << 32) | plo[i]) - borrow;
207 borrow = ~(value >> 32) + 1; // -(uint32_t)(value >> 32);
208 un[i + j] = (uint32_t)value;
209 }
210 bool need_fixup = borrow != 0;
211 #elif defined(MUL_RSUB_CARRY_2_STAGE)
212 (void)p; // shut up unused variable warning
213
214 // Multiply and subtract.
215 uint32_t carry = 1;
216 uint32_t phi[2000]; // plenty space
217 uint32_t plo[2000]; // plenty space
218 for (int i = 0; i <= n; i++)
219 {
220 uint32_t vn_i = i < n ? vn[i] : 0;
221 uint64_t value = un[i + j] + ~((uint64_t)qhat * vn_i);
222 plo[i] = value & 0xffffffffLL;
223 phi[i] = value >> 32;
224 }
225 for (int i = 0; i <= n; i++)
226 {
227 uint64_t result = (((uint64_t)phi[i] << 32) | plo[i]) + carry;
228 uint32_t result_high = result >> 32;
229 if (carry <= 1)
230 result_high++;
231 carry = result_high;
232 un[i + j] = (uint32_t)result;
233 }
234 bool need_fixup = carry != 1;
235 #elif defined(MUL_RSUB_CARRY_2_STAGE1)
236 (void)p; // shut up unused variable warning
237
238 // Multiply and subtract.
239 uint32_t carry = 1;
240 uint32_t phi[2000]; // plenty space
241 uint32_t plo[2000]; // plenty space
242 // same mul-and-sub as SUB_MUL_BORROW but not the same
243 // mul-and-sub-minus-one as MUL_RSUB_CARRY
244 for (int i = 0; i <= n; i++)
245 {
246 uint32_t vn_i = i < n ? vn[i] : 0;
247 uint64_t value = un[i + j] - ((uint64_t)qhat * vn_i);
248 plo[i] = value & 0xffffffffLL;
249 phi[i] = value >> 32;
250 }
251 // compensate for the +1 that was added by mul-and-sub by subtracting
252 // it here (as ~(0))
253 for (int i = 0; i <= n; i++)
254 {
255 uint64_t result = (((uint64_t)phi[i] << 32) | plo[i]) + carry +
256 ~(0); // a way to express "-1"
257 uint32_t result_high = result >> 32;
258 if (carry <= 1)
259 result_high++;
260 carry = result_high;
261 un[i + j] = (uint32_t)result;
262 }
263 bool need_fixup = carry != 1;
264 #elif defined(MUL_RSUB_CARRY_2_STAGE2)
265 (void)p; // shut up unused variable warning
266
267 // Multiply and subtract.
268 uint32_t carry = 0;
269 uint32_t phi[2000]; // plenty space
270 uint32_t plo[2000]; // plenty space
271 // same mul-and-sub as SUB_MUL_BORROW but not the same
272 // mul-and-sub-minus-one as MUL_RSUB_CARRY
273 for (int i = 0; i <= n; i++)
274 {
275 uint32_t vn_i = i < n ? vn[i] : 0;
276 uint64_t value = un[i + j] - ((uint64_t)qhat * vn_i);
277 plo[i] = value & 0xffffffffLL;
278 phi[i] = value >> 32;
279 }
280 // NOW it starts to make sense. when no carry this time, next
281 // carry as-is. rlse next carry reduces by one.
282 // it here (as ~(0))
283 for (int i = 0; i <= n; i++)
284 {
285 uint64_t result = (((uint64_t)phi[i] << 32) | plo[i]) + carry;
286 uint32_t result_high = result >> 32;
287 if (carry == 0)
288 carry = result_high;
289 else
290 carry = result_high - 1;
291 un[i + j] = (uint32_t)result;
292 }
293 bool need_fixup = carry != 0;
294 #elif defined(MADDED_SUBFE)
295 (void)p; // shut up unused variable warning
296
297 // Multiply and subtract.
298 uint32_t carry = 0;
299 uint32_t product[n + 1];
300 // VL = n + 1
301 // sv.madded product.v, vn.v, qhat.s, carry.s
302 for (int i = 0; i <= n; i++)
303 {
304 uint32_t vn_v = i < n ? vn[i] : 0;
305 uint64_t value = (uint64_t)vn_v * (uint64_t)qhat + carry;
306 carry = (uint32_t)(value >> 32);
307 product[i] = (uint32_t)value;
308 }
309 bool ca = true;
310 uint32_t *un_j = &un[j];
311 // VL = n + 1
312 // sv.subfe un_j.v, product.v, un_j.v
313 for (int i = 0; i <= n; i++)
314 {
315 uint64_t value = (uint64_t)~product[i] + (uint64_t)un_j[i] + ca;
316 ca = value >> 32 != 0;
317 un_j[i] = value;
318 }
319 bool need_fixup = !ca;
320 #else
321 #error need to choose one of the algorithm options; e.g. -DORIGINAL
322 #endif
323
324 q[j] = qhat; // Store quotient digit.
325 if (need_fixup)
326 { // If we subtracted too
327 q[j] = q[j] - 1; // much, add back.
328 k = 0;
329 for (i = 0; i < n; i++)
330 {
331 t = (unsigned long long)un[i + j] + vn[i] + k;
332 un[i + j] = t;
333 k = t >> 32;
334 }
335 un[j + n] = un[j + n] + k;
336 }
337 } // End j.
338 // If the caller wants the remainder, unnormalize
339 // it and pass it back.
340 if (r != NULL)
341 {
342 for (i = 0; i < n - 1; i++)
343 r[i] = (un[i] >> s) | ((unsigned long long)un[i + 1] << (32 - s));
344 r[n - 1] = un[n - 1] >> s;
345 }
346 return 0;
347 }
348
349 int errors;
350
351 void check(unsigned q[], unsigned r[], unsigned u[], unsigned v[], int m,
352 int n, unsigned cq[], unsigned cr[])
353 {
354 int i, szq;
355
356 szq = max(m - n + 1, 1);
357 for (i = 0; i < szq; i++)
358 {
359 if (q[i] != cq[i])
360 {
361 errors = errors + 1;
362 dumpit("Error, dividend u =", m, u);
363 dumpit(" divisor v =", n, v);
364 dumpit("For quotient, got:", m - n + 1, q);
365 dumpit(" Should get:", m - n + 1, cq);
366 return;
367 }
368 }
369 for (i = 0; i < n; i++)
370 {
371 if (r[i] != cr[i])
372 {
373 errors = errors + 1;
374 dumpit("Error, dividend u =", m, u);
375 dumpit(" divisor v =", n, v);
376 dumpit("For remainder, got:", n, r);
377 dumpit(" Should get:", n, cr);
378 return;
379 }
380 }
381 return;
382 }
383
384 int main()
385 {
386 static struct
387 {
388 int m, n;
389 uint32_t u[10], v[10], cq[10], cr[10];
390 bool error;
391 } test[] = {
392 // clang-format off
393 {.m=1, .n=1, .u={3}, .v={0}, .cq={1}, .cr={1}, .error=true}, // Error, divide by 0.
394 {.m=1, .n=2, .u={7}, .v={1,3}, .cq={0}, .cr={7,0}, .error=true}, // Error, n > m.
395 {.m=2, .n=2, .u={0,0}, .v={1,0}, .cq={0}, .cr={0,0}, .error=true}, // Error, incorrect remainder cr.
396 {.m=1, .n=1, .u={3}, .v={2}, .cq={1}, .cr={1}},
397 {.m=1, .n=1, .u={3}, .v={3}, .cq={1}, .cr={0}},
398 {.m=1, .n=1, .u={3}, .v={4}, .cq={0}, .cr={3}},
399 {.m=1, .n=1, .u={0}, .v={0xffffffff}, .cq={0}, .cr={0}},
400 {.m=1, .n=1, .u={0xffffffff}, .v={1}, .cq={0xffffffff}, .cr={0}},
401 {.m=1, .n=1, .u={0xffffffff}, .v={0xffffffff}, .cq={1}, .cr={0}},
402 {.m=1, .n=1, .u={0xffffffff}, .v={3}, .cq={0x55555555}, .cr={0}},
403 {.m=2, .n=1, .u={0xffffffff,0xffffffff}, .v={1}, .cq={0xffffffff,0xffffffff}, .cr={0}},
404 {.m=2, .n=1, .u={0xffffffff,0xffffffff}, .v={0xffffffff}, .cq={1,1}, .cr={0}},
405 {.m=2, .n=1, .u={0xffffffff,0xfffffffe}, .v={0xffffffff}, .cq={0xffffffff,0}, .cr={0xfffffffe}},
406 {.m=2, .n=1, .u={0x00005678,0x00001234}, .v={0x00009abc}, .cq={0x1e1dba76,0}, .cr={0x6bd0}},
407 {.m=2, .n=2, .u={0,0}, .v={0,1}, .cq={0}, .cr={0,0}},
408 {.m=2, .n=2, .u={0,7}, .v={0,3}, .cq={2}, .cr={0,1}},
409 {.m=2, .n=2, .u={5,7}, .v={0,3}, .cq={2}, .cr={5,1}},
410 {.m=2, .n=2, .u={0,6}, .v={0,2}, .cq={3}, .cr={0,0}},
411 {.m=1, .n=1, .u={0x80000000}, .v={0x40000001}, .cq={0x00000001}, .cr={0x3fffffff}},
412 {.m=2, .n=1, .u={0x00000000,0x80000000}, .v={0x40000001}, .cq={0xfffffff8,0x00000001}, .cr={0x00000008}},
413 {.m=2, .n=2, .u={0x00000000,0x80000000}, .v={0x00000001,0x40000000}, .cq={0x00000001}, .cr={0xffffffff,0x3fffffff}},
414 {.m=2, .n=2, .u={0x0000789a,0x0000bcde}, .v={0x0000789a,0x0000bcde}, .cq={1}, .cr={0,0}},
415 {.m=2, .n=2, .u={0x0000789b,0x0000bcde}, .v={0x0000789a,0x0000bcde}, .cq={1}, .cr={1,0}},
416 {.m=2, .n=2, .u={0x00007899,0x0000bcde}, .v={0x0000789a,0x0000bcde}, .cq={0}, .cr={0x00007899,0x0000bcde}},
417 {.m=2, .n=2, .u={0x0000ffff,0x0000ffff}, .v={0x0000ffff,0x0000ffff}, .cq={1}, .cr={0,0}},
418 {.m=2, .n=2, .u={0x0000ffff,0x0000ffff}, .v={0x00000000,0x00000001}, .cq={0x0000ffff}, .cr={0x0000ffff,0}},
419 {.m=3, .n=2, .u={0x000089ab,0x00004567,0x00000123}, .v={0x00000000,0x00000001}, .cq={0x00004567,0x00000123}, .cr={0x000089ab,0}},
420 {.m=3, .n=2, .u={0x00000000,0x0000fffe,0x00008000}, .v={0x0000ffff,0x00008000}, .cq={0xffffffff,0x00000000}, .cr={0x0000ffff,0x00007fff}}, // Shows that first qhat can = b + 1.
421 {.m=3, .n=3, .u={0x00000003,0x00000000,0x80000000}, .v={0x00000001,0x00000000,0x20000000}, .cq={0x00000003}, .cr={0,0,0x20000000}}, // Adding back step req'd.
422 {.m=3, .n=3, .u={0x00000003,0x00000000,0x00008000}, .v={0x00000001,0x00000000,0x00002000}, .cq={0x00000003}, .cr={0,0,0x00002000}}, // Adding back step req'd.
423 {.m=4, .n=3, .u={0,0,0x00008000,0x00007fff}, .v={1,0,0x00008000}, .cq={0xfffe0000,0}, .cr={0x00020000,0xffffffff,0x00007fff}}, // Add back req'd.
424 {.m=4, .n=3, .u={0,0x0000fffe,0,0x00008000}, .v={0x0000ffff,0,0x00008000}, .cq={0xffffffff,0}, .cr={0x0000ffff,0xffffffff,0x00007fff}}, // Shows that mult-sub quantity cannot be treated as signed.
425 {.m=4, .n=3, .u={0,0xfffffffe,0,0x80000000}, .v={0x0000ffff,0,0x80000000}, .cq={0x00000000,1}, .cr={0x00000000,0xfffeffff,0x00000000}}, // Shows that mult-sub quantity cannot be treated as signed.
426 {.m=4, .n=3, .u={0,0xfffffffe,0,0x80000000}, .v={0xffffffff,0,0x80000000}, .cq={0xffffffff,0}, .cr={0xffffffff,0xffffffff,0x7fffffff}}, // Shows that mult-sub quantity cannot be treated as signed.
427 // clang-format on
428 };
429 const int ncases = sizeof(test) / sizeof(test[0]);
430 unsigned q[10], r[10];
431
432 printf("divmnu:\n");
433 for (int i = 0; i < ncases; i++)
434 {
435 int m = test[i].m;
436 int n = test[i].n;
437 uint32_t *u = test[i].u;
438 uint32_t *v = test[i].v;
439 uint32_t *cq = test[i].cq;
440 uint32_t *cr = test[i].cr;
441
442 int f = divmnu(q, r, u, v, m, n);
443 if (f && !test[i].error)
444 {
445 dumpit("Unexpected error return code for dividend u =", m, u);
446 dumpit(" divisor v =", n, v);
447 errors = errors + 1;
448 }
449 else if (!f && test[i].error)
450 {
451 dumpit("Unexpected success return code for dividend u =", m, u);
452 dumpit(" divisor v =", n, v);
453 errors = errors + 1;
454 }
455
456 if (!f)
457 check(q, r, u, v, m, n, cq, cr);
458 }
459
460 printf("%d test failures out of %d cases.\n", errors, ncases);
461 return 0;
462 }