1 /* original source code from Hackers-Delight
2 https://github.com/hcs0/Hackers-Delight
4 /* This divides an n-word dividend by an m-word divisor, giving an
5 n-m+1-word quotient and m-word remainder. The bignums are in arrays of
6 words. Here a "word" is 32 bits. This routine is designed for a 64-bit
7 machine which has a 64/64 division instruction. */
12 #include <stdlib.h> //To define "exit", req'd by XLC.
14 #define max(x, y) ((x) > (y) ? (x) : (y))
50 void dumpit(char *msg
, int n
, unsigned v
[])
54 for (i
= n
- 1; i
>= 0; i
--)
55 printf(" %08x", v
[i
]);
59 /* q[0], r[0], u[0], and v[0] contain the LEAST significant words.
60 (The sequence is in little-endian order).
62 This is a fairly precise implementation of Knuth's Algorithm D, for a
63 binary computer with base b = 2**32. The caller supplies:
64 1. Space q for the quotient, m - n + 1 words (at least one).
65 2. Space r for the remainder (optional), n words.
66 3. The dividend u, m words, m >= 1.
67 4. The divisor v, n words, n >= 2.
68 The most significant digit of the divisor, v[n-1], must be nonzero. The
69 dividend u may have leading zeros; this just makes the algorithm take
70 longer and makes the quotient contain more leading zeros. A value of
71 NULL may be given for the address of the remainder to signify that the
72 caller does not want the remainder.
73 The program does not alter the input parameters u and v.
74 The quotient and remainder returned may have leading zeros. The
75 function itself returns a value of 0 for success and 1 for invalid
76 parameters (e.g., division by 0).
77 For now, we must have m >= n. Knuth's Algorithm D also requires
78 that the dividend be at least as long as the divisor. (In his terms,
79 m >= 0 (unstated). Therefore m+n >= n.) */
81 int divmnu(unsigned q
[], unsigned r
[], const unsigned u
[], const unsigned v
[],
85 const unsigned long long b
= 1LL << 32; // Number base (2**32).
86 unsigned *un
, *vn
; // Normalized form of u, v.
87 unsigned long long qhat
; // Estimated quotient digit.
88 unsigned long long rhat
; // A remainder.
89 unsigned long long p
; // Product of two digits.
93 if (m
< n
|| n
<= 0 || v
[n
- 1] == 0)
94 return 1; // Return if invalid param.
98 k
= 0; // the case of a
99 for (j
= m
- 1; j
>= 0; j
--)
101 q
[j
] = (k
* b
+ u
[j
]) / v
[0]; // divisor here.
102 k
= (k
* b
+ u
[j
]) - q
[j
] * v
[0];
109 /* Normalize by shifting v left just enough so that its high-order
110 bit is on, and shift u left the same amount. We may have to append a
111 high-order digit on the dividend; we do that unconditionally. */
113 s
= nlz(v
[n
- 1]); // 0 <= s <= 31.
114 vn
= (unsigned *)alloca(4 * n
);
115 for (i
= n
- 1; i
> 0; i
--)
116 vn
[i
] = (v
[i
] << s
) | ((unsigned long long)v
[i
- 1] >> (32 - s
));
119 un
= (unsigned *)alloca(4 * (m
+ 1));
120 un
[m
] = (unsigned long long)u
[m
- 1] >> (32 - s
);
121 for (i
= m
- 1; i
> 0; i
--)
122 un
[i
] = (u
[i
] << s
) | ((unsigned long long)u
[i
- 1] >> (32 - s
));
125 for (j
= m
- n
; j
>= 0; j
--)
127 // Compute estimate qhat of q[j] from top 2 digits
128 uint64_t dig2
= ((uint64_t)un
[j
+ n
] << 32) | un
[j
+ n
- 1];
129 qhat
= dig2
/ vn
[n
- 1];
130 rhat
= dig2
% vn
[n
- 1];
132 // use 3rd-from-top digit to obtain better accuracy
133 if (qhat
>= b
|| qhat
* vn
[n
- 2] > b
* rhat
+ un
[j
+ n
- 2])
136 rhat
= rhat
+ vn
[n
- 1];
140 // don't define here, allowing easily testing all options by passing -D...
141 // #define MUL_RSUB_CARRY_2_STAGE2
143 // Multiply and subtract.
145 for (i
= 0; i
< n
; i
++)
148 t
= un
[i
+ j
] - k
- (p
& 0xFFFFFFFFLL
);
150 k
= (p
>> 32) - (t
>> 32);
154 bool need_fixup
= t
< 0;
155 #elif defined(SUB_MUL_BORROW)
156 (void)p
; // shut up unused variable warning
158 // Multiply and subtract.
160 for (int i
= 0; i
<= n
; i
++)
162 uint32_t vn_i
= i
< n
? vn
[i
] : 0;
163 uint64_t value
= un
[i
+ j
] - (uint64_t)qhat
* vn_i
- borrow
;
164 borrow
= -(uint32_t)(value
>> 32);
165 un
[i
+ j
] = (uint32_t)value
;
167 bool need_fixup
= borrow
!= 0;
168 #elif defined(MUL_RSUB_CARRY)
169 (void)p
; // shut up unused variable warning
171 // Multiply and subtract.
173 for (int i
= 0; i
<= n
; i
++)
175 uint32_t vn_i
= i
< n
? vn
[i
] : 0;
176 uint64_t result
= un
[i
+ j
] + ~((uint64_t)qhat
* vn_i
) + carry
;
177 uint32_t result_high
= result
>> 32;
181 un
[i
+ j
] = (uint32_t)result
;
183 bool need_fixup
= carry
!= 1;
184 #elif defined(SUB_MUL_BORROW_2_STAGE)
185 (void)p
; // shut up unused variable warning
187 // Multiply and subtract.
189 uint32_t phi
[2000]; // plenty space
190 uint32_t plo
[2000]; // plenty space
191 // first, perform mul-and-sub and store in split hi-lo
192 // this shows the vectorised sv.msubx which stores 128-bit in
193 // two 64-bit registers
194 for (int i
= 0; i
<= n
; i
++)
196 uint32_t vn_i
= i
< n
? vn
[i
] : 0;
197 uint64_t value
= un
[i
+ j
] - (uint64_t)qhat
* vn_i
;
198 plo
[i
] = value
& 0xffffffffLL
;
199 phi
[i
] = value
>> 32;
201 // second, reconstruct the 64-bit result, subtract borrow,
202 // store top-half (-ve) in new borrow and store low-half as answer
203 // this is the new (odd) instruction
204 for (int i
= 0; i
<= n
; i
++)
206 uint64_t value
= (((uint64_t)phi
[i
] << 32) | plo
[i
]) - borrow
;
207 borrow
= ~(value
>> 32) + 1; // -(uint32_t)(value >> 32);
208 un
[i
+ j
] = (uint32_t)value
;
210 bool need_fixup
= borrow
!= 0;
211 #elif defined(MUL_RSUB_CARRY_2_STAGE)
212 (void)p
; // shut up unused variable warning
214 // Multiply and subtract.
216 uint32_t phi
[2000]; // plenty space
217 uint32_t plo
[2000]; // plenty space
218 for (int i
= 0; i
<= n
; i
++)
220 uint32_t vn_i
= i
< n
? vn
[i
] : 0;
221 uint64_t value
= un
[i
+ j
] + ~((uint64_t)qhat
* vn_i
);
222 plo
[i
] = value
& 0xffffffffLL
;
223 phi
[i
] = value
>> 32;
225 for (int i
= 0; i
<= n
; i
++)
227 uint64_t result
= (((uint64_t)phi
[i
] << 32) | plo
[i
]) + carry
;
228 uint32_t result_high
= result
>> 32;
232 un
[i
+ j
] = (uint32_t)result
;
234 bool need_fixup
= carry
!= 1;
235 #elif defined(MUL_RSUB_CARRY_2_STAGE1)
236 (void)p
; // shut up unused variable warning
238 // Multiply and subtract.
240 uint32_t phi
[2000]; // plenty space
241 uint32_t plo
[2000]; // plenty space
242 // same mul-and-sub as SUB_MUL_BORROW but not the same
243 // mul-and-sub-minus-one as MUL_RSUB_CARRY
244 for (int i
= 0; i
<= n
; i
++)
246 uint32_t vn_i
= i
< n
? vn
[i
] : 0;
247 uint64_t value
= un
[i
+ j
] - ((uint64_t)qhat
* vn_i
);
248 plo
[i
] = value
& 0xffffffffLL
;
249 phi
[i
] = value
>> 32;
251 // compensate for the +1 that was added by mul-and-sub by subtracting
253 for (int i
= 0; i
<= n
; i
++)
255 uint64_t result
= (((uint64_t)phi
[i
] << 32) | plo
[i
]) + carry
+
256 ~(0); // a way to express "-1"
257 uint32_t result_high
= result
>> 32;
261 un
[i
+ j
] = (uint32_t)result
;
263 bool need_fixup
= carry
!= 1;
264 #elif defined(MUL_RSUB_CARRY_2_STAGE2)
265 (void)p
; // shut up unused variable warning
267 // Multiply and subtract.
269 uint32_t phi
[2000]; // plenty space
270 uint32_t plo
[2000]; // plenty space
271 // same mul-and-sub as SUB_MUL_BORROW but not the same
272 // mul-and-sub-minus-one as MUL_RSUB_CARRY
273 for (int i
= 0; i
<= n
; i
++)
275 uint32_t vn_i
= i
< n
? vn
[i
] : 0;
276 uint64_t value
= un
[i
+ j
] - ((uint64_t)qhat
* vn_i
);
277 plo
[i
] = value
& 0xffffffffLL
;
278 phi
[i
] = value
>> 32;
280 // NOW it starts to make sense. when no carry this time, next
281 // carry as-is. rlse next carry reduces by one.
283 for (int i
= 0; i
<= n
; i
++)
285 uint64_t result
= (((uint64_t)phi
[i
] << 32) | plo
[i
]) + carry
;
286 uint32_t result_high
= result
>> 32;
290 carry
= result_high
- 1;
291 un
[i
+ j
] = (uint32_t)result
;
293 bool need_fixup
= carry
!= 0;
294 #elif defined(MADDED_SUBFE)
295 (void)p
; // shut up unused variable warning
297 // Multiply and subtract.
299 uint32_t product
[n
+ 1];
301 // sv.madded product.v, vn.v, qhat.s, carry.s
302 for (int i
= 0; i
<= n
; i
++)
304 uint32_t vn_v
= i
< n
? vn
[i
] : 0;
305 uint64_t value
= (uint64_t)vn_v
* (uint64_t)qhat
+ carry
;
306 carry
= (uint32_t)(value
>> 32);
307 product
[i
] = (uint32_t)value
;
310 uint32_t *un_j
= &un
[j
];
312 // sv.subfe un_j.v, product.v, un_j.v
313 for (int i
= 0; i
<= n
; i
++)
315 uint64_t value
= (uint64_t)~product
[i
] + (uint64_t)un_j
[i
] + ca
;
316 ca
= value
>> 32 != 0;
319 bool need_fixup
= !ca
;
321 #error need to choose one of the algorithm options; e.g. -DORIGINAL
324 q
[j
] = qhat
; // Store quotient digit.
326 { // If we subtracted too
327 q
[j
] = q
[j
] - 1; // much, add back.
329 for (i
= 0; i
< n
; i
++)
331 t
= (unsigned long long)un
[i
+ j
] + vn
[i
] + k
;
335 un
[j
+ n
] = un
[j
+ n
] + k
;
338 // If the caller wants the remainder, unnormalize
339 // it and pass it back.
342 for (i
= 0; i
< n
- 1; i
++)
343 r
[i
] = (un
[i
] >> s
) | ((unsigned long long)un
[i
+ 1] << (32 - s
));
344 r
[n
- 1] = un
[n
- 1] >> s
;
351 void check(unsigned q
[], unsigned r
[], unsigned u
[], unsigned v
[], int m
,
352 int n
, unsigned cq
[], unsigned cr
[])
356 szq
= max(m
- n
+ 1, 1);
357 for (i
= 0; i
< szq
; i
++)
362 dumpit("Error, dividend u =", m
, u
);
363 dumpit(" divisor v =", n
, v
);
364 dumpit("For quotient, got:", m
- n
+ 1, q
);
365 dumpit(" Should get:", m
- n
+ 1, cq
);
369 for (i
= 0; i
< n
; i
++)
374 dumpit("Error, dividend u =", m
, u
);
375 dumpit(" divisor v =", n
, v
);
376 dumpit("For remainder, got:", n
, r
);
377 dumpit(" Should get:", n
, cr
);
389 uint32_t u
[10], v
[10], cq
[10], cr
[10];
393 {.m
=1, .n
=1, .u
={3}, .v
={0}, .cq
={1}, .cr
={1}, .error
=true}, // Error, divide by 0.
394 {.m
=1, .n
=2, .u
={7}, .v
={1,3}, .cq
={0}, .cr
={7,0}, .error
=true}, // Error, n > m.
395 {.m
=2, .n
=2, .u
={0,0}, .v
={1,0}, .cq
={0}, .cr
={0,0}, .error
=true}, // Error, incorrect remainder cr.
396 {.m
=1, .n
=1, .u
={3}, .v
={2}, .cq
={1}, .cr
={1}},
397 {.m
=1, .n
=1, .u
={3}, .v
={3}, .cq
={1}, .cr
={0}},
398 {.m
=1, .n
=1, .u
={3}, .v
={4}, .cq
={0}, .cr
={3}},
399 {.m
=1, .n
=1, .u
={0}, .v
={0xffffffff}, .cq
={0}, .cr
={0}},
400 {.m
=1, .n
=1, .u
={0xffffffff}, .v
={1}, .cq
={0xffffffff}, .cr
={0}},
401 {.m
=1, .n
=1, .u
={0xffffffff}, .v
={0xffffffff}, .cq
={1}, .cr
={0}},
402 {.m
=1, .n
=1, .u
={0xffffffff}, .v
={3}, .cq
={0x55555555}, .cr
={0}},
403 {.m
=2, .n
=1, .u
={0xffffffff,0xffffffff}, .v
={1}, .cq
={0xffffffff,0xffffffff}, .cr
={0}},
404 {.m
=2, .n
=1, .u
={0xffffffff,0xffffffff}, .v
={0xffffffff}, .cq
={1,1}, .cr
={0}},
405 {.m
=2, .n
=1, .u
={0xffffffff,0xfffffffe}, .v
={0xffffffff}, .cq
={0xffffffff,0}, .cr
={0xfffffffe}},
406 {.m
=2, .n
=1, .u
={0x00005678,0x00001234}, .v
={0x00009abc}, .cq
={0x1e1dba76,0}, .cr
={0x6bd0}},
407 {.m
=2, .n
=2, .u
={0,0}, .v
={0,1}, .cq
={0}, .cr
={0,0}},
408 {.m
=2, .n
=2, .u
={0,7}, .v
={0,3}, .cq
={2}, .cr
={0,1}},
409 {.m
=2, .n
=2, .u
={5,7}, .v
={0,3}, .cq
={2}, .cr
={5,1}},
410 {.m
=2, .n
=2, .u
={0,6}, .v
={0,2}, .cq
={3}, .cr
={0,0}},
411 {.m
=1, .n
=1, .u
={0x80000000}, .v
={0x40000001}, .cq
={0x00000001}, .cr
={0x3fffffff}},
412 {.m
=2, .n
=1, .u
={0x00000000,0x80000000}, .v
={0x40000001}, .cq
={0xfffffff8,0x00000001}, .cr
={0x00000008}},
413 {.m
=2, .n
=2, .u
={0x00000000,0x80000000}, .v
={0x00000001,0x40000000}, .cq
={0x00000001}, .cr
={0xffffffff,0x3fffffff}},
414 {.m
=2, .n
=2, .u
={0x0000789a,0x0000bcde}, .v
={0x0000789a,0x0000bcde}, .cq
={1}, .cr
={0,0}},
415 {.m
=2, .n
=2, .u
={0x0000789b,0x0000bcde}, .v
={0x0000789a,0x0000bcde}, .cq
={1}, .cr
={1,0}},
416 {.m
=2, .n
=2, .u
={0x00007899,0x0000bcde}, .v
={0x0000789a,0x0000bcde}, .cq
={0}, .cr
={0x00007899,0x0000bcde}},
417 {.m
=2, .n
=2, .u
={0x0000ffff,0x0000ffff}, .v
={0x0000ffff,0x0000ffff}, .cq
={1}, .cr
={0,0}},
418 {.m
=2, .n
=2, .u
={0x0000ffff,0x0000ffff}, .v
={0x00000000,0x00000001}, .cq
={0x0000ffff}, .cr
={0x0000ffff,0}},
419 {.m
=3, .n
=2, .u
={0x000089ab,0x00004567,0x00000123}, .v
={0x00000000,0x00000001}, .cq
={0x00004567,0x00000123}, .cr
={0x000089ab,0}},
420 {.m
=3, .n
=2, .u
={0x00000000,0x0000fffe,0x00008000}, .v
={0x0000ffff,0x00008000}, .cq
={0xffffffff,0x00000000}, .cr
={0x0000ffff,0x00007fff}}, // Shows that first qhat can = b + 1.
421 {.m
=3, .n
=3, .u
={0x00000003,0x00000000,0x80000000}, .v
={0x00000001,0x00000000,0x20000000}, .cq
={0x00000003}, .cr
={0,0,0x20000000}}, // Adding back step req'd.
422 {.m
=3, .n
=3, .u
={0x00000003,0x00000000,0x00008000}, .v
={0x00000001,0x00000000,0x00002000}, .cq
={0x00000003}, .cr
={0,0,0x00002000}}, // Adding back step req'd.
423 {.m
=4, .n
=3, .u
={0,0,0x00008000,0x00007fff}, .v
={1,0,0x00008000}, .cq
={0xfffe0000,0}, .cr
={0x00020000,0xffffffff,0x00007fff}}, // Add back req'd.
424 {.m
=4, .n
=3, .u
={0,0x0000fffe,0,0x00008000}, .v
={0x0000ffff,0,0x00008000}, .cq
={0xffffffff,0}, .cr
={0x0000ffff,0xffffffff,0x00007fff}}, // Shows that mult-sub quantity cannot be treated as signed.
425 {.m
=4, .n
=3, .u
={0,0xfffffffe,0,0x80000000}, .v
={0x0000ffff,0,0x80000000}, .cq
={0x00000000,1}, .cr
={0x00000000,0xfffeffff,0x00000000}}, // Shows that mult-sub quantity cannot be treated as signed.
426 {.m
=4, .n
=3, .u
={0,0xfffffffe,0,0x80000000}, .v
={0xffffffff,0,0x80000000}, .cq
={0xffffffff,0}, .cr
={0xffffffff,0xffffffff,0x7fffffff}}, // Shows that mult-sub quantity cannot be treated as signed.
429 const int ncases
= sizeof(test
) / sizeof(test
[0]);
430 unsigned q
[10], r
[10];
433 for (int i
= 0; i
< ncases
; i
++)
437 uint32_t *u
= test
[i
].u
;
438 uint32_t *v
= test
[i
].v
;
439 uint32_t *cq
= test
[i
].cq
;
440 uint32_t *cr
= test
[i
].cr
;
442 int f
= divmnu(q
, r
, u
, v
, m
, n
);
443 if (f
&& !test
[i
].error
)
445 dumpit("Unexpected error return code for dividend u =", m
, u
);
446 dumpit(" divisor v =", n
, v
);
449 else if (!f
&& test
[i
].error
)
451 dumpit("Unexpected success return code for dividend u =", m
, u
);
452 dumpit(" divisor v =", n
, v
);
457 check(q
, r
, u
, v
, m
, n
, cq
, cr
);
460 printf("%d test failures out of %d cases.\n", errors
, ncases
);