carry starting to make sense
[libreriscv.git] / openpower / sv / bitmanip / divmnu64.c
1 /* original source code from Hackers-Delight
2 https://github.com/hcs0/Hackers-Delight
3 */
4 /* This divides an n-word dividend by an m-word divisor, giving an
5 n-m+1-word quotient and m-word remainder. The bignums are in arrays of
6 words. Here a "word" is 32 bits. This routine is designed for a 64-bit
7 machine which has a 64/64 division instruction. */
8
9 #include <stdio.h>
10 #include <stdlib.h> //To define "exit", req'd by XLC.
11 #include <stdbool.h>
12 #include <stdint.h>
13
14 #define max(x, y) ((x) > (y) ? (x) : (y))
15
16 int nlz(unsigned x) {
17 int n;
18
19 if (x == 0) return(32);
20 n = 0;
21 if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
22 if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
23 if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
24 if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
25 if (x <= 0x7FFFFFFF) {n = n + 1;}
26 return n;
27 }
28
29 void dumpit(char *msg, int n, unsigned v[]) {
30 int i;
31 printf("%s", msg);
32 for (i = n-1; i >= 0; i--) printf(" %08x", v[i]);
33 printf("\n");
34 }
35
36 /* q[0], r[0], u[0], and v[0] contain the LEAST significant words.
37 (The sequence is in little-endian order).
38
39 This is a fairly precise implementation of Knuth's Algorithm D, for a
40 binary computer with base b = 2**32. The caller supplies:
41 1. Space q for the quotient, m - n + 1 words (at least one).
42 2. Space r for the remainder (optional), n words.
43 3. The dividend u, m words, m >= 1.
44 4. The divisor v, n words, n >= 2.
45 The most significant digit of the divisor, v[n-1], must be nonzero. The
46 dividend u may have leading zeros; this just makes the algorithm take
47 longer and makes the quotient contain more leading zeros. A value of
48 NULL may be given for the address of the remainder to signify that the
49 caller does not want the remainder.
50 The program does not alter the input parameters u and v.
51 The quotient and remainder returned may have leading zeros. The
52 function itself returns a value of 0 for success and 1 for invalid
53 parameters (e.g., division by 0).
54 For now, we must have m >= n. Knuth's Algorithm D also requires
55 that the dividend be at least as long as the divisor. (In his terms,
56 m >= 0 (unstated). Therefore m+n >= n.) */
57
58 int divmnu(unsigned q[], unsigned r[],
59 const unsigned u[], const unsigned v[],
60 int m, int n) {
61
62 const unsigned long long b = 4294967296LL; // Number base (2**32).
63 unsigned *un, *vn; // Normalized form of u, v.
64 unsigned long long qhat; // Estimated quotient digit.
65 unsigned long long rhat; // A remainder.
66 unsigned long long p; // Product of two digits.
67 long long t, k;
68 int s, i, j;
69
70 if (m < n || n <= 0 || v[n-1] == 0)
71 return 1; // Return if invalid param.
72
73 if (n == 1) { // Take care of
74 k = 0; // the case of a
75 for (j = m - 1; j >= 0; j--) { // single-digit
76 q[j] = (k*b + u[j])/v[0]; // divisor here.
77 k = (k*b + u[j]) - q[j]*v[0];
78 }
79 if (r != NULL) r[0] = k;
80 return 0;
81 }
82
83 /* Normalize by shifting v left just enough so that its high-order
84 bit is on, and shift u left the same amount. We may have to append a
85 high-order digit on the dividend; we do that unconditionally. */
86
87 s = nlz(v[n-1]); // 0 <= s <= 31.
88 vn = (unsigned *)alloca(4*n);
89 for (i = n - 1; i > 0; i--)
90 vn[i] = (v[i] << s) | ((unsigned long long)v[i-1] >> (32-s));
91 vn[0] = v[0] << s;
92
93 un = (unsigned *)alloca(4*(m + 1));
94 un[m] = (unsigned long long)u[m-1] >> (32-s);
95 for (i = m - 1; i > 0; i--)
96 un[i] = (u[i] << s) | ((unsigned long long)u[i-1] >> (32-s));
97 un[0] = u[0] << s;
98
99 for (j = m - n; j >= 0; j--) { // Main loop.
100 // Compute estimate qhat of q[j].
101 qhat = (un[j+n]*b + un[j+n-1])/vn[n-1];
102 rhat = (un[j+n]*b + un[j+n-1]) - qhat*vn[n-1];
103 again:
104 if (qhat >= b || qhat*vn[n-2] > b*rhat + un[j+n-2])
105 { qhat = qhat - 1;
106 rhat = rhat + vn[n-1];
107 if (rhat < b) goto again;
108 }
109 #define MUL_RSUB_CARRY_2_STAGE2
110 #ifdef ORIGINAL
111 // Multiply and subtract.
112 k = 0;
113 for (i = 0; i < n; i++) {
114 p = qhat*vn[i];
115 t = un[i+j] - k - (p & 0xFFFFFFFFLL);
116 un[i+j] = t;
117 k = (p >> 32) - (t >> 32);
118 }
119 t = un[j+n] - k;
120 un[j+n] = t;
121 bool need_fixup = t < 0;
122 #elif defined(SUB_MUL_BORROW)
123 (void)p; // shut up unused variable warning
124
125 // Multiply and subtract.
126 uint32_t borrow = 0;
127 for(int i = 0; i <= n; i++) {
128 uint32_t vn_i = i < n ? vn[i] : 0;
129 uint64_t value = un[i + j] - (uint64_t)qhat * vn_i - borrow;
130 borrow = -(uint32_t)(value >> 32);
131 un[i + j] = (uint32_t)value;
132 }
133 bool need_fixup = borrow != 0;
134 #elif defined(MUL_RSUB_CARRY)
135 (void)p; // shut up unused variable warning
136
137 // Multiply and subtract.
138 uint32_t carry = 1;
139 for(int i = 0; i <= n; i++) {
140 uint32_t vn_i = i < n ? vn[i] : 0;
141 uint64_t result = un[i + j] + ~((uint64_t)qhat * vn_i) + carry;
142 uint32_t result_high = result >> 32;
143 if(carry <= 1)
144 result_high++;
145 carry = result_high;
146 un[i + j] = (uint32_t)result;
147 }
148 bool need_fixup = carry != 1;
149 #elif defined(SUB_MUL_BORROW_2_STAGE)
150 (void)p; // shut up unused variable warning
151
152 // Multiply and subtract.
153 uint32_t borrow = 0;
154 uint32_t phi[2000]; // plenty space
155 uint32_t plo[2000]; // plenty space
156 // first, perform mul-and-sub and store in split hi-lo
157 // this shows the vectorised sv.msubx which stores 128-bit in
158 // two 64-bit registers
159 for(int i = 0; i <= n; i++) {
160 uint32_t vn_i = i < n ? vn[i] : 0;
161 uint64_t value = un[i + j] - (uint64_t)qhat * vn_i;
162 plo[i] = value & 0xffffffffLL;
163 phi[i] = value >> 32;
164 }
165 // second, reconstruct the 64-bit result, subtract borrow,
166 // store top-half (-ve) in new borrow and store low-half as answer
167 // this is the new (odd) instruction
168 for(int i = 0; i <= n; i++) {
169 uint64_t value = (((uint64_t)phi[i]<<32) | plo[i]) - borrow;
170 borrow = ~(value >> 32)+1; // -(uint32_t)(value >> 32);
171 un[i + j] = (uint32_t)value;
172 }
173 bool need_fixup = borrow != 0;
174 #elif defined(MUL_RSUB_CARRY_2_STAGE)
175 (void)p; // shut up unused variable warning
176
177 // Multiply and subtract.
178 uint32_t carry = 1;
179 uint32_t phi[2000]; // plenty space
180 uint32_t plo[2000]; // plenty space
181 for(int i = 0; i <= n; i++) {
182 uint32_t vn_i = i < n ? vn[i] : 0;
183 uint64_t value = un[i + j] + ~((uint64_t)qhat * vn_i);
184 plo[i] = value & 0xffffffffLL;
185 phi[i] = value >> 32;
186 }
187 for(int i = 0; i <= n; i++) {
188 uint64_t result = (((uint64_t)phi[i]<<32) | plo[i]) + carry;
189 uint32_t result_high = result >> 32;
190 if(carry <= 1)
191 result_high++;
192 carry = result_high;
193 un[i + j] = (uint32_t)result;
194 }
195 bool need_fixup = carry != 1;
196 #elif defined(MUL_RSUB_CARRY_2_STAGE1)
197 (void)p; // shut up unused variable warning
198
199 // Multiply and subtract.
200 uint32_t carry = 1;
201 uint32_t phi[2000]; // plenty space
202 uint32_t plo[2000]; // plenty space
203 // same mul-and-sub as SUB_MUL_BORROW but not the same
204 // mul-and-sub-minus-one as MUL_RSUB_CARRY
205 for(int i = 0; i <= n; i++) {
206 uint32_t vn_i = i < n ? vn[i] : 0;
207 uint64_t value = un[i + j] - ((uint64_t)qhat * vn_i);
208 plo[i] = value & 0xffffffffLL;
209 phi[i] = value >> 32;
210 }
211 // compensate for the +1 that was added by mul-and-sub by subtracting
212 // it here (as ~(0))
213 for(int i = 0; i <= n; i++) {
214 uint64_t result = (((uint64_t)phi[i]<<32) | plo[i]) + carry+
215 ~(0); // a way to express "-1"
216 uint32_t result_high = result >> 32;
217 if(carry <= 1)
218 result_high++;
219 carry = result_high;
220 un[i + j] = (uint32_t)result;
221 }
222 bool need_fixup = carry != 1;
223 #elif defined(MUL_RSUB_CARRY_2_STAGE2)
224 (void)p; // shut up unused variable warning
225
226 // Multiply and subtract.
227 uint32_t carry = 0;
228 uint32_t phi[2000]; // plenty space
229 uint32_t plo[2000]; // plenty space
230 // same mul-and-sub as SUB_MUL_BORROW but not the same
231 // mul-and-sub-minus-one as MUL_RSUB_CARRY
232 for(int i = 0; i <= n; i++) {
233 uint32_t vn_i = i < n ? vn[i] : 0;
234 uint64_t value = un[i + j] - ((uint64_t)qhat * vn_i);
235 plo[i] = value & 0xffffffffLL;
236 phi[i] = value >> 32;
237 }
238 // NOW it starts to make sense. when no carry this time, next
239 // carry as-is. rlse next carry reduces by one.
240 // it here (as ~(0))
241 for(int i = 0; i <= n; i++) {
242 uint64_t result = (((uint64_t)phi[i]<<32) | plo[i]) + carry;
243 uint32_t result_high = result >> 32;
244 if(carry == 0)
245 carry = result_high;
246 else
247 carry = result_high-1;
248 un[i + j] = (uint32_t)result;
249 }
250 bool need_fixup = carry != 0;
251 #else
252 #error need to choose one of the algorithm options; e.g. -DORIGINAL
253 #endif
254
255 q[j] = qhat; // Store quotient digit.
256 if (need_fixup) { // If we subtracted too
257 q[j] = q[j] - 1; // much, add back.
258 k = 0;
259 for (i = 0; i < n; i++) {
260 t = (unsigned long long)un[i+j] + vn[i] + k;
261 un[i+j] = t;
262 k = t >> 32;
263 }
264 un[j+n] = un[j+n] + k;
265 }
266 } // End j.
267 // If the caller wants the remainder, unnormalize
268 // it and pass it back.
269 if (r != NULL) {
270 for (i = 0; i < n-1; i++)
271 r[i] = (un[i] >> s) | ((unsigned long long)un[i+1] << (32-s));
272 r[n-1] = un[n-1] >> s;
273 }
274 return 0;
275 }
276
277 int errors;
278
279 void check(unsigned q[], unsigned r[],
280 unsigned u[], unsigned v[],
281 int m, int n,
282 unsigned cq[], unsigned cr[]) {
283 int i, szq;
284
285 szq = max(m - n + 1, 1);
286 for (i = 0; i < szq; i++) {
287 if (q[i] != cq[i]) {
288 errors = errors + 1;
289 dumpit("Error, dividend u =", m, u);
290 dumpit(" divisor v =", n, v);
291 dumpit("For quotient, got:", m-n+1, q);
292 dumpit(" Should get:", m-n+1, cq);
293 return;
294 }
295 }
296 for (i = 0; i < n; i++) {
297 if (r[i] != cr[i]) {
298 errors = errors + 1;
299 dumpit("Error, dividend u =", m, u);
300 dumpit(" divisor v =", n, v);
301 dumpit("For remainder, got:", n, r);
302 dumpit(" Should get:", n, cr);
303 return;
304 }
305 }
306 return;
307 }
308
309 int main() {
310 static unsigned test[] = {
311 // m, n, u..., v..., cq..., cr....
312 1, 1, 3, 0, 1, 1, // Error, divide by 0.
313 1, 2, 7, 1,3, 0, 7,0, // Error, n > m.
314 2, 2, 0,0, 1,0, 0, 0,0, // Error, incorrect remainder cr.
315 1, 1, 3, 2, 1, 1,
316 1, 1, 3, 3, 1, 0,
317 1, 1, 3, 4, 0, 3,
318 1, 1, 0, 0xffffffff, 0, 0,
319 1, 1, 0xffffffff, 1, 0xffffffff, 0,
320 1, 1, 0xffffffff, 0xffffffff, 1, 0,
321 1, 1, 0xffffffff, 3, 0x55555555, 0,
322 2, 1, 0xffffffff,0xffffffff, 1, 0xffffffff,0xffffffff, 0,
323 2, 1, 0xffffffff,0xffffffff, 0xffffffff, 1,1, 0,
324 2, 1, 0xffffffff,0xfffffffe, 0xffffffff, 0xffffffff,0, 0xfffffffe,
325 2, 1, 0x00005678,0x00001234, 0x00009abc, 0x1e1dba76,0, 0x6bd0,
326 2, 2, 0,0, 0,1, 0, 0,0,
327 2, 2, 0,7, 0,3, 2, 0,1,
328 2, 2, 5,7, 0,3, 2, 5,1,
329 2, 2, 0,6, 0,2, 3, 0,0,
330 1, 1, 0x80000000, 0x40000001, 0x00000001, 0x3fffffff,
331 2, 1, 0x00000000,0x80000000, 0x40000001, 0xfffffff8,0x00000001, 0x00000008,
332 2, 2, 0x00000000,0x80000000, 0x00000001,0x40000000, 0x00000001, 0xffffffff,0x3fffffff,
333 2, 2, 0x0000789a,0x0000bcde, 0x0000789a,0x0000bcde, 1, 0,0,
334 2, 2, 0x0000789b,0x0000bcde, 0x0000789a,0x0000bcde, 1, 1,0,
335 2, 2, 0x00007899,0x0000bcde, 0x0000789a,0x0000bcde, 0, 0x00007899,0x0000bcde,
336 2, 2, 0x0000ffff,0x0000ffff, 0x0000ffff,0x0000ffff, 1, 0,0,
337 2, 2, 0x0000ffff,0x0000ffff, 0x00000000,0x00000001, 0x0000ffff, 0x0000ffff,0,
338 3, 2, 0x000089ab,0x00004567,0x00000123, 0x00000000,0x00000001, 0x00004567,0x00000123, 0x000089ab,0,
339 3, 2, 0x00000000,0x0000fffe,0x00008000, 0x0000ffff,0x00008000, 0xffffffff,0x00000000, 0x0000ffff,0x00007fff, // Shows that first qhat can = b + 1.
340 3, 3, 0x00000003,0x00000000,0x80000000, 0x00000001,0x00000000,0x20000000, 0x00000003, 0,0,0x20000000, // Adding back step req'd.
341 3, 3, 0x00000003,0x00000000,0x00008000, 0x00000001,0x00000000,0x00002000, 0x00000003, 0,0,0x00002000, // Adding back step req'd.
342 4, 3, 0,0,0x00008000,0x00007fff, 1,0,0x00008000, 0xfffe0000,0, 0x00020000,0xffffffff,0x00007fff, // Add back req'd.
343 4, 3, 0,0x0000fffe,0,0x00008000, 0x0000ffff,0,0x00008000, 0xffffffff,0, 0x0000ffff,0xffffffff,0x00007fff, // Shows that mult-sub quantity cannot be treated as signed.
344 4, 3, 0,0xfffffffe,0,0x80000000, 0x0000ffff,0,0x80000000, 0x00000000,1, 0x00000000,0xfffeffff,0x00000000, // Shows that mult-sub quantity cannot be treated as signed.
345 4, 3, 0,0xfffffffe,0,0x80000000, 0xffffffff,0,0x80000000, 0xffffffff,0, 0xffffffff,0xffffffff,0x7fffffff, // Shows that mult-sub quantity cannot be treated as signed.
346 };
347 int i, n, m, ncases, f;
348 unsigned q[10], r[10];
349 unsigned *u, *v, *cq, *cr;
350
351 printf("divmnu:\n");
352 i = 0;
353 ncases = 0;
354 while (i < sizeof(test)/4) {
355 m = test[i];
356 n = test[i+1];
357 u = &test[i+2];
358 v = &test[i+2+m];
359 cq = &test[i+2+m+n];
360 cr = &test[i+2+m+n+max(m-n+1, 1)];
361
362 f = divmnu(q, r, u, v, m, n);
363 if (f) {
364 dumpit("Error return code for dividend u =", m, u);
365 dumpit(" divisor v =", n, v);
366 errors = errors + 1;
367 }
368 else
369 check(q, r, u, v, m, n, cq, cr);
370 i = i + 2 + m + n + max(m-n+1, 1) + n;
371 ncases = ncases + 1;
372 }
373
374 printf("%d errors out of %d cases; there should be 3.\n", errors, ncases);
375 return 0;
376 }