/* Copyright (C) 2007 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

In addition to the permissions in the GNU General Public License, the
Free Software Foundation gives you unlimited permission to link the
compiled version of this file into combinations with other programs,
and to distribute those combinations without any restriction coming
from the use of this file.  (The General Public License restrictions
do apply in other respects; for example, they cover modification of
the file, and distribution when not linked into a combined
executable.)

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */

#ifndef _SQRT_MACROS_H_
#define _SQRT_MACROS_H_
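
/* FENCE forces its argument to be evaluated and rounded before it is
   combined with anything else; the compensated sums below depend on
   this evaluation order (see the "Watch for compiler re-ordering"
   note in long_sqrt128).  */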
#define FENCE __fence

#if DOUBLE_EXTENDED_ON

extern BINARY80 SQRT80 (BINARY80);
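
/* short_sqrt128: square root of a 128-bit coefficient, truncated to
   a 64-bit integer.  The operand is reassembled in 80-bit extended
   precision as A10.w[1] * 2^64 + A10.w[0]; one extended-precision
   square root is then accurate enough for a 64-bit result.  */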
__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  BINARY80 lx, ly, l64;
  int_float f64;

  // 2^64
  f64.i = 0x5f800000;
  l64 = (BINARY80) f64.d;
  lx = (BINARY80) A10.w[1] * l64 + (BINARY80) A10.w[0];
  ly = SQRT80 (lx);
  return (UINT64) ly;
}
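
/* long_sqrt128: 128-bit square root of a 256-bit coefficient.  The
   four words of C256 are summed into extended precision, one square
   root gives the approximation lS, and the rounding errors of both
   the summation and the square root are recovered exactly so that a
   single Newton correction can adjust the integer result.  */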
__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT256 C4;
  UINT128 CS;
  UINT64 X;
  SINT64 SE;
  BINARY80 l64, lm64, l128, lxL, lx, ly, lS, lSH, lSL, lE, l3, l2,
    l1, l0, lp, lCl;
  int_float fx, f64, fm64;
  int *ple = (int *) &lx;

  // 2^64
  f64.i = 0x5f800000;
  l64 = (BINARY80) f64.d;

  l128 = l64 * l64;
  lx = l3 = (BINARY80) C256.w[3] * l64 * l128;
  l2 = (BINARY80) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (BINARY80) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (BINARY80) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  lS = SQRT80 (lx);

  // get coefficient
  // 2^(-64)
  fm64.i = 0x1f800000;
  lm64 = (BINARY80) fm64.d;
  CS.w[1] = (UINT64) (lS * lm64);
  CS.w[0] = (UINT64) (lS - (BINARY80) CS.w[1] * l64);

  ///////////////////////////////////////
  // CAUTION!
  // little endian code only
  // add solution for big endian
  //////////////////////////////////////
  lSH = lS;
  *((UINT64 *) & lSH) &= 0xffffffff00000000ull;
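  /* Clearing the low 32 bits of the 64-bit significand leaves lSH
     with at most 32 significant bits, so lSH * lSH, lSH * lSL and
     lSL * lSL below are all exact in BINARY80 and C256 - lS^2 is
     computed without rounding error.  */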

  // correction for C256 rounding
  lCl = FENCE (l3 - lx);
  lCl = FENCE (lCl + l2);
  lCl = FENCE (lCl + l1);
  lCl = FENCE (lCl + l0);

  lSL = lS - lSH;

  //////////////////////////////////////////
  // Watch for compiler re-ordering
  //
  /////////////////////////////////////////
  // C256-S^2
  lxL = FENCE (lx - lSH * lSH);
  lp = lSH * lSL;
  lp += lp;
  lxL = FENCE (lxL - lp);
  lSL *= lSL;
  lxL = FENCE (lxL - lSL);
  lCl += lxL;

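  /* One Newton step: the full residual lCl = C256 - lS^2 (including
     the error of rounding C256 to lx) divided by the derivative
     2 * lS gives the correction to the square root.  */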
  // correction term
  lE = lCl / (lS + lS);

  // get low part of coefficient
  X = CS.w[0];
  if (lCl >= 0) {
    SE = (SINT64) (lE);
    CS.w[0] += SE;
    if (CS.w[0] < X)	// carry out of the low word
      CS.w[1]++;
  } else {
    SE = (SINT64) (-lE);
    CS.w[0] -= SE;
    if (CS.w[0] > X)	// borrow out of the low word
      CS.w[1]--;
  }

  pCS->w[0] = CS.w[0];
  pCS->w[1] = CS.w[1];
}

#else

extern double sqrt (double);
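
/* Without extended precision, the square root is built from a
   double-precision reciprocal square root RS = 1 / sqrt (A): then
   A * RS ~= sqrt (A), and the residual eps = (A * RS^2 - 1) / 2,
   computed in integer arithmetic, refines it as
   sqrt (A) ~= A * RS * (1 - eps).  */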
__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  UINT256 ARS, ARS0, AE0, AE, S;

  UINT64 MY, ES, CY;
  double lx, l64;
  int_double f64, ly;
  int ey, k;

  // 2^64
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;
  lx = (double) A10.w[1] * l64 + (double) A10.w[0];
  ly.d = 1.0 / sqrt (lx);

  // significand (implicit bit restored) and exponent of RS,
  // so that RS = MY * 2^(-52-ey)
  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);

  // A10*RS^2
  __mul_64x128_to_192 (ARS0, MY, A10);
  __mul_64x192_to_256 (ARS, MY, ARS0);

  // shr by 2*ey+40, to get a 64-bit value
  k = (ey << 1) + 104 - 64;
  if (k >= 128) {
    if (k > 128)
      ES = (ARS.w[2] >> (k - 128)) | (ARS.w[3] << (192 - k));
    else
      ES = ARS.w[2];
  } else {
    if (k >= 64) {
      ARS.w[0] = ARS.w[1];
      ARS.w[1] = ARS.w[2];
      k -= 64;
    }
    if (k) {
      __shr_128 (ARS, ARS, k);
    }
    ES = ARS.w[0];
  }

  ES = ((SINT64) ES) >> 1;
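  /* ES = eps = (A10 * RS^2 - 1) / 2 as a signed value scaled by
     2^64; its sign tells whether RS under- or over-estimates
     1 / sqrt (A10).  */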

  if (((SINT64) ES) < 0) {
    ES = -ES;

    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    __add_carry_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] + AE.w[2] + CY;
  } else {
    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    __sub_borrow_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] - AE.w[2] - CY;
  }

  // S = A10*RS*(1 - eps) is scaled by 2^(ey+52); shift down by
  // ey+51 and fold the last bit into the rounding below
  k = ey + 51;

  if (k >= 64) {
    if (k >= 128) {
      S.w[0] = S.w[2];
      S.w[1] = 0;
      k -= 128;
    } else {
      S.w[0] = S.w[1];
      S.w[1] = S.w[2];
    }
    k -= 64;
  }
  if (k) {
    __shr_128 (S, S, k);
  }

  // round to nearest
  return (UINT64) ((S.w[0] + 1) >> 1);
}

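
/* long_sqrt128: 128-bit square root of a 256-bit coefficient,
   without extended precision.  Same reciprocal-square-root scheme as
   short_sqrt128 above, carried out in 128-bit fixed point and with a
   second-order correction: sqrt (A) ~= A * RS * (1 - eps + 3/2 * eps^2),
   which matches the expansion of 1 / (1 + e) through O(e^2) since
   eps = e + e^2 / 2 where RS = (1 + e) / sqrt (A).  */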
__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT512 ARS0, ARS;
  UINT256 ARS00, AE, AE2, S;
  UINT128 ES, ES2, ARS1;
  UINT64 ES32, CY, MY;
  double l64, l128, lx, l2, l1, l0;
  int_double f64, ly;
  int ey, k, k2;

  // 2^64
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;

  l128 = l64 * l64;
  lx = (double) C256.w[3] * l64 * l128;
  l2 = (double) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (double) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (double) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  ly.d = 1.0 / sqrt (lx);

  // significand (implicit bit restored) and exponent of RS,
  // so that RS = MY * 2^(-52-ey)
  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);

  // A10*RS^2, scaled by 2^(2*ey+104)
  __mul_64x256_to_320 (ARS0, MY, C256);
  __mul_64x320_to_384 (ARS, MY, ARS0);

  // shr by k=(2*ey+104)-128
  // expect k is in the range (192, 256) if result in [10^33, 10^34)
  // apply an additional signed shift by 1 at the same time (to get eps=eps0/2)
  k = (ey << 1) + 104 - 128 - 192;
  k2 = 64 - k;
  ES.w[0] = (ARS.w[3] >> (k + 1)) | (ARS.w[4] << (k2 - 1));
  ES.w[1] = (ARS.w[4] >> k) | (ARS.w[5] << k2);
  ES.w[1] = ((SINT64) ES.w[1]) >> 1;
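  /* ES = eps = (C256 * RS^2 - 1) / 2 as a signed 128-bit value
     scaled by 2^128.  */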

  // A*RS >> 192 (for error term computation)
  ARS1.w[0] = ARS0.w[3];
  ARS1.w[1] = ARS0.w[4];

  // A*RS>>64
  ARS00.w[0] = ARS0.w[1];
  ARS00.w[1] = ARS0.w[2];
  ARS00.w[2] = ARS0.w[3];
  ARS00.w[3] = ARS0.w[4];

  if (((SINT64) ES.w[1]) < 0) {
    // negate the 128-bit eps
    ES.w[0] = -ES.w[0];
    ES.w[1] = -ES.w[1];
    if (ES.w[0])
      ES.w[1]--;

    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    __add_carry_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __add_carry_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] + AE.w[3] + CY;
  } else {
    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    __sub_borrow_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __sub_borrow_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] - AE.w[3] - CY;
  }

  // 3/2*eps^2, scaled by 2^128 (the high word of eps carries all the
  // precision the quadratic term needs)
  ES32 = ES.w[1] + (ES.w[1] >> 1);
  __mul_64x64_to_128 (ES2, ES32, ES.w[1]);
  // A*RS*3/2*eps^2
  __mul_128x128_to_256 (AE2, ES2, ARS1);

  // result, scaled by 2^(ey+52-64)
  __add_carry_out (S.w[0], CY, S.w[0], AE2.w[0]);
  __add_carry_in_out (S.w[1], CY, S.w[1], AE2.w[1], CY);
  __add_carry_in_out (S.w[2], CY, S.w[2], AE2.w[2], CY);
  S.w[3] = S.w[3] + AE2.w[3] + CY;

  // k in (0, 64)
  k = ey + 51 - 128;
  k2 = 64 - k;
  S.w[0] = (S.w[1] >> k) | (S.w[2] << k2);
  S.w[1] = (S.w[2] >> k) | (S.w[3] << k2);

  // round to nearest
  S.w[0]++;
  if (!S.w[0])
    S.w[1]++;

  pCS->w[0] = (S.w[1] << 63) | (S.w[0] >> 1);
  pCS->w[1] = S.w[1] >> 1;
}

#endif /* DOUBLE_EXTENDED_ON */
#endif /* _SQRT_MACROS_H_ */