1 /* original source code from Hackers-Delight
2 https://github.com/hcs0/Hackers-Delight
4 /* This divides an n-word dividend by an m-word divisor, giving an
5 n-m+1-word quotient and m-word remainder. The bignums are in arrays of
6 words. Here a "word" is 32 bits. This routine is designed for a 64-bit
7 machine which has a 64/64 division instruction. */
12 #include <stdlib.h> //To define "exit", req'd by XLC.
14 #define max(x, y) ((x) > (y) ? (x) : (y))
50 void dumpit(char *msg
, int n
, unsigned v
[])
54 for (i
= n
- 1; i
>= 0; i
--)
55 printf(" %08x", v
[i
]);
65 divrem_t
divrem_64_by_32(uint64_t n
, uint32_t d
)
67 if ((n
>> 32) >= d
) // overflow
69 return (divrem_t
){.q
= UINT32_MAX
, .r
= 0, .overflow
= true};
73 return (divrem_t
){.q
= n
/ d
, .r
= n
% d
, .overflow
= false};
77 bool bigmul(uint32_t qhat
, unsigned product
[], unsigned vn
[], int m
, int n
)
81 // sv.maddedu product.v, vn.v, qhat.s, carry.s
82 for (int i
= 0; i
<= n
; i
++)
84 uint32_t vn_v
= i
< n
? vn
[i
] : 0;
85 uint64_t value
= (uint64_t)vn_v
* (uint64_t)qhat
+ carry
;
86 carry
= (uint32_t)(value
>> 32);
87 product
[i
] = (uint32_t)value
;
92 bool bigadd(unsigned result
[], unsigned vn
[], unsigned un
[], int m
, int n
,
96 // sv.subfe un_j.v, product.v, un_j.v
97 for (int i
= 0; i
<= n
; i
++)
99 uint64_t value
= (uint64_t)vn
[i
] + (uint64_t)un
[i
] + ca
;
100 ca
= value
>> 32 != 0;
106 bool bigsub(unsigned result
[], unsigned vn
[], unsigned un
[], int m
, int n
,
110 // sv.subfe un_j.v, product.v, un_j.v
111 for (int i
= 0; i
<= n
; i
++)
113 uint64_t value
= (uint64_t)~vn
[i
] + (uint64_t)un
[i
] + ca
;
114 ca
= value
>> 32 != 0;
120 bool bigmulsub(unsigned long long qhat
, unsigned vn
[], unsigned un
[], int j
,
123 // Multiply and subtract.
124 uint32_t product
[n
+ 1];
125 bigmul(qhat
, product
, vn
, m
, n
);
126 bool ca
= bigsub(un
, product
, un
, m
, n
, true);
127 bool need_fixup
= !ca
;
131 /* q[0], r[0], u[0], and v[0] contain the LEAST significant words.
132 (The sequence is in little-endian order).
134 This is a fairly precise implementation of Knuth's Algorithm D, for a
135 binary computer with base b = 2**32. The caller supplies:
136 1. Space q for the quotient, m - n + 1 words (at least one).
137 2. Space r for the remainder (optional), n words.
138 3. The dividend u, m words, m >= 1.
139 4. The divisor v, n words, n >= 2.
140 The most significant digit of the divisor, v[n-1], must be nonzero. The
141 dividend u may have leading zeros; this just makes the algorithm take
142 longer and makes the quotient contain more leading zeros. A value of
143 NULL may be given for the address of the remainder to signify that the
144 caller does not want the remainder.
145 The program does not alter the input parameters u and v.
146 The quotient and remainder returned may have leading zeros. The
147 function itself returns a value of 0 for success and 1 for invalid
148 parameters (e.g., division by 0).
149 For now, we must have m >= n. Knuth's Algorithm D also requires
150 that the dividend be at least as long as the divisor. (In his terms,
151 m >= 0 (unstated). Therefore m+n >= n.) */
153 int divmnu(unsigned q
[], unsigned r
[], const unsigned u
[], const unsigned v
[],
157 const unsigned long long b
= 1LL << 32; // Number base (2**32).
158 unsigned *un
, *vn
; // Normalized form of u, v.
159 unsigned long long qhat
; // Estimated quotient digit.
160 unsigned long long rhat
; // A remainder.
161 unsigned long long p
; // Product of two digits.
165 if (m
< n
|| n
<= 0 || v
[n
- 1] == 0)
166 return 1; // Return if invalid param.
170 k
= 0; // the case of a
171 for (j
= m
- 1; j
>= 0; j
--)
173 uint64_t dig2
= ((uint64_t)k
<< 32) | u
[j
];
174 q
[j
] = dig2
/ v
[0]; // divisor here.
175 k
= dig2
% v
[0]; // modulo back into next loop
182 /* Normalize by shifting v left just enough so that its high-order
183 bit is on, and shift u left the same amount. We may have to append a
184 high-order digit on the dividend; we do that unconditionally. */
186 s
= nlz(v
[n
- 1]); // 0 <= s <= 31.
187 vn
= (unsigned *)alloca(4 * n
);
188 for (i
= n
- 1; i
> 0; i
--)
189 vn
[i
] = (v
[i
] << s
) | ((unsigned long long)v
[i
- 1] >> (32 - s
));
192 un
= (unsigned *)alloca(4 * (m
+ 1));
193 un
[m
] = (unsigned long long)u
[m
- 1] >> (32 - s
);
194 for (i
= m
- 1; i
> 0; i
--)
195 un
[i
] = (u
[i
] << s
) | ((unsigned long long)u
[i
- 1] >> (32 - s
));
198 for (j
= m
- n
; j
>= 0; j
--)
200 uint32_t *un_j
= &un
[j
];
201 // Compute estimate qhat of q[j] from top 2 digits.
202 uint64_t dig2
= ((uint64_t)un
[j
+ n
] << 32) | un
[j
+ n
- 1];
203 divrem_t qr
= divrem_64_by_32(dig2
, vn
[n
- 1]);
208 // rhat can be bigger than 32-bit when the division overflows,
209 // so rhat's computation can't be folded into divrem_64_by_32
210 rhat
= dig2
- (uint64_t)qr
.q
* vn
[n
- 1];
213 // use 3rd-from-top digit to obtain better accuracy
214 if (rhat
< b
&& qhat
* vn
[n
- 2] > b
* rhat
+ un
[j
+ n
- 2])
217 rhat
= rhat
+ vn
[n
- 1];
221 // don't define here, allowing easily testing all options by passing -D...
222 // #define MUL_RSUB_CARRY_2_STAGE2
224 // Multiply and subtract.
226 for (i
= 0; i
< n
; i
++)
229 t
= un
[i
+ j
] - k
- (p
& 0xFFFFFFFFLL
);
231 k
= (p
>> 32) - (t
>> 32);
235 bool need_fixup
= t
< 0;
236 #elif defined(SUB_MUL_BORROW)
237 (void)p
, (void)t
; // shut up unused variable warning
239 // Multiply and subtract.
241 for (int i
= 0; i
<= n
; i
++)
243 uint32_t vn_i
= i
< n
? vn
[i
] : 0;
244 uint64_t value
= un
[i
+ j
] - (uint64_t)qhat
* vn_i
- borrow
;
245 borrow
= -(uint32_t)(value
>> 32);
246 un
[i
+ j
] = (uint32_t)value
;
248 bool need_fixup
= borrow
!= 0;
249 #elif defined(MUL_RSUB_CARRY)
250 (void)p
, (void)t
; // shut up unused variable warning
252 // Multiply and subtract.
254 for (int i
= 0; i
<= n
; i
++)
256 uint32_t vn_i
= i
< n
? vn
[i
] : 0;
257 uint64_t result
= un
[i
+ j
] + ~((uint64_t)qhat
* vn_i
) + carry
;
258 uint32_t result_high
= result
>> 32;
262 un
[i
+ j
] = (uint32_t)result
;
264 bool need_fixup
= carry
!= 1;
265 #elif defined(SUB_MUL_BORROW_2_STAGE)
266 (void)p
, (void)t
; // shut up unused variable warning
268 // Multiply and subtract.
270 uint32_t phi
[2000]; // plenty space
271 uint32_t plo
[2000]; // plenty space
272 // first, perform mul-and-sub and store in split hi-lo
273 // this shows the vectorised sv.msubx which stores 128-bit in
274 // two 64-bit registers
275 for (int i
= 0; i
<= n
; i
++)
277 uint32_t vn_i
= i
< n
? vn
[i
] : 0;
278 uint64_t value
= un
[i
+ j
] - (uint64_t)qhat
* vn_i
;
279 plo
[i
] = value
& 0xffffffffLL
;
280 phi
[i
] = value
>> 32;
282 // second, reconstruct the 64-bit result, subtract borrow,
283 // store top-half (-ve) in new borrow and store low-half as answer
284 // this is the new (odd) instruction
285 for (int i
= 0; i
<= n
; i
++)
287 uint64_t value
= (((uint64_t)phi
[i
] << 32) | plo
[i
]) - borrow
;
288 borrow
= ~(value
>> 32) + 1; // -(uint32_t)(value >> 32);
289 un
[i
+ j
] = (uint32_t)value
;
291 bool need_fixup
= borrow
!= 0;
292 #elif defined(MUL_RSUB_CARRY_2_STAGE)
293 (void)p
, (void)t
; // shut up unused variable warning
295 // Multiply and subtract.
297 uint32_t phi
[2000]; // plenty space
298 uint32_t plo
[2000]; // plenty space
299 for (int i
= 0; i
<= n
; i
++)
301 uint32_t vn_i
= i
< n
? vn
[i
] : 0;
302 uint64_t value
= un
[i
+ j
] + ~((uint64_t)qhat
* vn_i
);
303 plo
[i
] = value
& 0xffffffffLL
;
304 phi
[i
] = value
>> 32;
306 for (int i
= 0; i
<= n
; i
++)
308 uint64_t result
= (((uint64_t)phi
[i
] << 32) | plo
[i
]) + carry
;
309 uint32_t result_high
= result
>> 32;
313 un
[i
+ j
] = (uint32_t)result
;
315 bool need_fixup
= carry
!= 1;
316 #elif defined(MUL_RSUB_CARRY_2_STAGE1)
317 (void)p
, (void)t
; // shut up unused variable warning
319 // Multiply and subtract.
321 uint32_t phi
[2000]; // plenty space
322 uint32_t plo
[2000]; // plenty space
323 // same mul-and-sub as SUB_MUL_BORROW but not the same
324 // mul-and-sub-minus-one as MUL_RSUB_CARRY
325 for (int i
= 0; i
<= n
; i
++)
327 uint32_t vn_i
= i
< n
? vn
[i
] : 0;
328 uint64_t value
= un
[i
+ j
] - ((uint64_t)qhat
* vn_i
);
329 plo
[i
] = value
& 0xffffffffLL
;
330 phi
[i
] = value
>> 32;
332 // compensate for the +1 that was added by mul-and-sub by subtracting
334 for (int i
= 0; i
<= n
; i
++)
336 uint64_t result
= (((uint64_t)phi
[i
] << 32) | plo
[i
]) + carry
+
337 ~(0); // a way to express "-1"
338 uint32_t result_high
= result
>> 32;
342 un
[i
+ j
] = (uint32_t)result
;
344 bool need_fixup
= carry
!= 1;
345 #elif defined(MUL_RSUB_CARRY_2_STAGE2)
346 (void)p
, (void)t
; // shut up unused variable warning
348 // Multiply and subtract.
350 uint32_t phi
[2000]; // plenty space
351 uint32_t plo
[2000]; // plenty space
352 // same mul-and-sub as SUB_MUL_BORROW but not the same
353 // mul-and-sub-minus-one as MUL_RSUB_CARRY
354 for (int i
= 0; i
<= n
; i
++)
356 uint32_t vn_i
= i
< n
? vn
[i
] : 0;
357 uint64_t value
= un
[i
+ j
] - ((uint64_t)qhat
* vn_i
);
358 plo
[i
] = value
& 0xffffffffLL
;
359 phi
[i
] = value
>> 32;
361 // NOW it starts to make sense. when no carry this time, next
362 // carry as-is. rlse next carry reduces by one.
364 for (int i
= 0; i
<= n
; i
++)
366 uint64_t result
= (((uint64_t)phi
[i
] << 32) | plo
[i
]) + carry
;
367 uint32_t result_high
= result
>> 32;
371 carry
= result_high
- 1;
372 un
[i
+ j
] = (uint32_t)result
;
374 bool need_fixup
= carry
!= 0;
375 #elif defined(MADDEDU_SUBFE)
376 (void)p
, (void)t
; // shut up unused variable warning
378 // Multiply and subtract.
380 uint32_t product
[n
+ 1];
382 // sv.maddedu product.v, vn.v, qhat.s, carry.s
383 for (int i
= 0; i
<= n
; i
++)
385 uint32_t vn_v
= i
< n
? vn
[i
] : 0;
386 uint64_t value
= (uint64_t)vn_v
* (uint64_t)qhat
+ carry
;
387 carry
= (uint32_t)(value
>> 32);
388 product
[i
] = (uint32_t)value
;
392 // sv.subfe un_j.v, product.v, un_j.v
393 for (int i
= 0; i
<= n
; i
++)
395 uint64_t value
= (uint64_t)~product
[i
] + (uint64_t)un_j
[i
] + ca
;
396 ca
= value
>> 32 != 0;
399 bool need_fixup
= !ca
;
401 #error need to choose one of the algorithm options; e.g. -DORIGINAL
404 q
[j
] = qhat
; // Store quotient digit.
406 { // If we subtracted too
407 q
[j
] = q
[j
] - 1; // much, add back.
408 bigadd(un_j
, vn
, un_j
, m
, n
, 0);
411 // If the caller wants the remainder, unnormalize
412 // it and pass it back.
415 for (i
= 0; i
< n
- 1; i
++)
416 r
[i
] = (un
[i
] >> s
) | ((unsigned long long)un
[i
+ 1] << (32 - s
));
417 r
[n
- 1] = un
[n
- 1] >> s
;
424 void check(unsigned q
[], unsigned r
[], unsigned u
[], unsigned v
[], int m
,
425 int n
, unsigned cq
[], unsigned cr
[])
429 szq
= max(m
- n
+ 1, 1);
430 for (i
= 0; i
< szq
; i
++)
435 dumpit("Error, dividend u =", m
, u
);
436 dumpit(" divisor v =", n
, v
);
437 dumpit("For quotient, got:", m
- n
+ 1, q
);
438 dumpit(" Should get:", m
- n
+ 1, cq
);
442 for (i
= 0; i
< n
; i
++)
447 dumpit("Error, dividend u =", m
, u
);
448 dumpit(" divisor v =", n
, v
);
449 dumpit("For remainder, got:", n
, r
);
450 dumpit(" Should get:", n
, cr
);
462 uint32_t u
[10], v
[10], cq
[10], cr
[10];
466 {.m
=1, .n
=1, .u
={3}, .v
={0}, .cq
={1}, .cr
={1}, .error
=true}, // Error, divide by 0.
467 {.m
=1, .n
=2, .u
={7}, .v
={1,3}, .cq
={0}, .cr
={7,0}, .error
=true}, // Error, n > m.
468 {.m
=2, .n
=2, .u
={0,0}, .v
={1,0}, .cq
={0}, .cr
={0,0}, .error
=true}, // Error, incorrect remainder cr.
469 {.m
=1, .n
=1, .u
={3}, .v
={2}, .cq
={1}, .cr
={1}},
470 {.m
=1, .n
=1, .u
={3}, .v
={3}, .cq
={1}, .cr
={0}},
471 {.m
=1, .n
=1, .u
={3}, .v
={4}, .cq
={0}, .cr
={3}},
472 {.m
=1, .n
=1, .u
={0}, .v
={0xffffffff}, .cq
={0}, .cr
={0}},
473 {.m
=1, .n
=1, .u
={0xffffffff}, .v
={1}, .cq
={0xffffffff}, .cr
={0}},
474 {.m
=1, .n
=1, .u
={0xffffffff}, .v
={0xffffffff}, .cq
={1}, .cr
={0}},
475 {.m
=1, .n
=1, .u
={0xffffffff}, .v
={3}, .cq
={0x55555555}, .cr
={0}},
476 {.m
=2, .n
=1, .u
={0xffffffff,0xffffffff}, .v
={1}, .cq
={0xffffffff,0xffffffff}, .cr
={0}},
477 {.m
=2, .n
=1, .u
={0xffffffff,0xffffffff}, .v
={0xffffffff}, .cq
={1,1}, .cr
={0}},
478 {.m
=2, .n
=1, .u
={0xffffffff,0xfffffffe}, .v
={0xffffffff}, .cq
={0xffffffff,0}, .cr
={0xfffffffe}},
479 {.m
=2, .n
=1, .u
={0x00005678,0x00001234}, .v
={0x00009abc}, .cq
={0x1e1dba76,0}, .cr
={0x6bd0}},
480 {.m
=2, .n
=2, .u
={0,0}, .v
={0,1}, .cq
={0}, .cr
={0,0}},
481 {.m
=2, .n
=2, .u
={0,7}, .v
={0,3}, .cq
={2}, .cr
={0,1}},
482 {.m
=2, .n
=2, .u
={5,7}, .v
={0,3}, .cq
={2}, .cr
={5,1}},
483 {.m
=2, .n
=2, .u
={0,6}, .v
={0,2}, .cq
={3}, .cr
={0,0}},
484 {.m
=1, .n
=1, .u
={0x80000000}, .v
={0x40000001}, .cq
={0x00000001}, .cr
={0x3fffffff}},
485 {.m
=2, .n
=1, .u
={0x00000000,0x80000000}, .v
={0x40000001}, .cq
={0xfffffff8,0x00000001}, .cr
={0x00000008}},
486 {.m
=2, .n
=2, .u
={0x00000000,0x80000000}, .v
={0x00000001,0x40000000}, .cq
={0x00000001}, .cr
={0xffffffff,0x3fffffff}},
487 {.m
=2, .n
=2, .u
={0x0000789a,0x0000bcde}, .v
={0x0000789a,0x0000bcde}, .cq
={1}, .cr
={0,0}},
488 {.m
=2, .n
=2, .u
={0x0000789b,0x0000bcde}, .v
={0x0000789a,0x0000bcde}, .cq
={1}, .cr
={1,0}},
489 {.m
=2, .n
=2, .u
={0x00007899,0x0000bcde}, .v
={0x0000789a,0x0000bcde}, .cq
={0}, .cr
={0x00007899,0x0000bcde}},
490 {.m
=2, .n
=2, .u
={0x0000ffff,0x0000ffff}, .v
={0x0000ffff,0x0000ffff}, .cq
={1}, .cr
={0,0}},
491 {.m
=2, .n
=2, .u
={0x0000ffff,0x0000ffff}, .v
={0x00000000,0x00000001}, .cq
={0x0000ffff}, .cr
={0x0000ffff,0}},
492 {.m
=3, .n
=2, .u
={0x000089ab,0x00004567,0x00000123}, .v
={0x00000000,0x00000001}, .cq
={0x00004567,0x00000123}, .cr
={0x000089ab,0}},
493 {.m
=3, .n
=2, .u
={0x00000000,0x0000fffe,0x00008000}, .v
={0x0000ffff,0x00008000}, .cq
={0xffffffff,0x00000000}, .cr
={0x0000ffff,0x00007fff}}, // Shows that first qhat can = b + 1.
494 {.m
=3, .n
=3, .u
={0x00000003,0x00000000,0x80000000}, .v
={0x00000001,0x00000000,0x20000000}, .cq
={0x00000003}, .cr
={0,0,0x20000000}}, // Adding back step req'd.
495 {.m
=3, .n
=3, .u
={0x00000003,0x00000000,0x00008000}, .v
={0x00000001,0x00000000,0x00002000}, .cq
={0x00000003}, .cr
={0,0,0x00002000}}, // Adding back step req'd.
496 {.m
=4, .n
=3, .u
={0,0,0x00008000,0x00007fff}, .v
={1,0,0x00008000}, .cq
={0xfffe0000,0}, .cr
={0x00020000,0xffffffff,0x00007fff}}, // Add back req'd.
497 {.m
=4, .n
=3, .u
={0,0x0000fffe,0,0x00008000}, .v
={0x0000ffff,0,0x00008000}, .cq
={0xffffffff,0}, .cr
={0x0000ffff,0xffffffff,0x00007fff}}, // Shows that mult-sub quantity cannot be treated as signed.
498 {.m
=4, .n
=3, .u
={0,0xfffffffe,0,0x80000000}, .v
={0x0000ffff,0,0x80000000}, .cq
={0x00000000,1}, .cr
={0x00000000,0xfffeffff,0x00000000}}, // Shows that mult-sub quantity cannot be treated as signed.
499 {.m
=4, .n
=3, .u
={0,0xfffffffe,0,0x80000000}, .v
={0xffffffff,0,0x80000000}, .cq
={0xffffffff,0}, .cr
={0xffffffff,0xffffffff,0x7fffffff}}, // Shows that mult-sub quantity cannot be treated as signed.
502 const int ncases
= sizeof(test
) / sizeof(test
[0]);
503 unsigned q
[10], r
[10];
506 for (int i
= 0; i
< ncases
; i
++)
510 uint32_t *u
= test
[i
].u
;
511 uint32_t *v
= test
[i
].v
;
512 uint32_t *cq
= test
[i
].cq
;
513 uint32_t *cr
= test
[i
].cr
;
515 int f
= divmnu(q
, r
, u
, v
, m
, n
);
516 if (f
&& !test
[i
].error
)
518 dumpit("Unexpected error return code for dividend u =", m
, u
);
519 dumpit(" divisor v =", n
, v
);
522 else if (!f
&& test
[i
].error
)
524 dumpit("Unexpected success return code for dividend u =", m
, u
);
525 dumpit(" divisor v =", n
, v
);
530 check(q
, r
, u
, v
, m
, n
, cq
, cr
);
533 printf("%d test failures out of %d cases.\n", errors
, ncases
);