/* Copyright (C) 2007 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

In addition to the permissions in the GNU General Public License, the
Free Software Foundation gives you unlimited permission to link the
compiled version of this file into combinations with other programs,
and to distribute those combinations without any restriction coming
from the use of this file.  (The General Public License restrictions
do apply in other respects; for example, they cover modification of
the file, and distribution when not linked into a combined
executable.)

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */
28
#ifndef _SQRT_MACROS_H_
#define _SQRT_MACROS_H_

#include "bid_internal.h"

#define FENCE __fence
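
/* FENCE is assumed (provided via bid_internal.h) to expand to an
   operation the optimizer treats as opaque, so the fenced sums and
   differences below are evaluated exactly in the order written.  That
   ordering is what makes the recovered rounding-error terms exact;
   with reassociation or FMA contraction the correction bits would be
   silently lost.  */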

#if DOUBLE_EXTENDED_ON

extern long double sqrtl (long double);

__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  long double lx, ly, l64;
  int_float f64;

  // 2^64
  f64.i = 0x5f800000;
  l64 = (long double) f64.d;
  lx = (long double) A10.w[1] * l64 + (long double) A10.w[0];
  ly = sqrtl (lx);
  return (UINT64) ly;
}
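
/* Note: converting A10 into lx rounds it to the 64-bit significand of
   the long double, so ly is the square root of a value within about
   1 ulp of A10.  The operands passed in here are small enough that the
   truncated result fits a UINT64 with bits to spare, so a single sqrtl
   call needs no correction step.  */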

__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT256 C4;
  UINT128 CS;
  UINT64 X;
  SINT64 SE;
  long double l64, lm64, l128, lxL, lx, ly, lS, lSH, lSL, lE, l3, l2,
    l1, l0, lp, lCl;
  int_float fx, f64, fm64;
  int *ple = (int *) &lx;

  // 2^64
  f64.i = 0x5f800000;
  l64 = (long double) f64.d;

  l128 = l64 * l64;
  lx = l3 = (long double) C256.w[3] * l64 * l128;
  l2 = (long double) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (long double) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (long double) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  lS = sqrtl (lx);

  // get the coefficient
  // 2^(-64)
  fm64.i = 0x1f800000;
  lm64 = (long double) fm64.d;
  CS.w[1] = (UINT64) (lS * lm64);
  CS.w[0] = (UINT64) (lS - (long double) CS.w[1] * l64);

  ///////////////////////////////////////
  // CAUTION!
  // little endian code only
  // add solution for big endian
  //////////////////////////////////////
  lSH = lS;
  *((UINT64 *) & lSH) &= 0xffffffff00000000ull;
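  // The mask above clears the low 32 bits of the x87 significand,
  // which on little-endian x86 occupies the low 8 bytes of the long
  // double.  One possible endian-independent replacement is a
  // Veltkamp-style split, sketched here under the assumptions of
  // round-to-nearest arithmetic and FENCE blocking reassociation:
#if 0
  {
    long double lbig = FENCE ((long double) 0x100000001ull * lS); // (2^32+1)*lS
    lSH = FENCE (lbig - FENCE (lbig - lS));   // high ~32 bits of lS
  }
#endif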

  // correction for the C256 rounding
  lCl = FENCE (l3 - lx);
  lCl = FENCE (lCl + l2);
  lCl = FENCE (lCl + l1);
  lCl = FENCE (lCl + l0);

  lSL = lS - lSH;

  //////////////////////////////////////////
  // Watch for compiler re-ordering
  //
  /////////////////////////////////////////
  // C256 - S^2
  lxL = FENCE (lx - lSH * lSH);
  lp = lSH * lSL;
  lp += lp;
  lxL = FENCE (lxL - lp);
  lSL *= lSL;
  lxL = FENCE (lxL - lSL);
  lCl += lxL;

  // correction term
  lE = lCl / (lS + lS);
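  // First-order Newton correction: with x0 = lS^2 and true radicand
  // x0 + dx, sqrt(x0 + dx) ~= lS + dx/(2*lS).  At this point lCl holds
  // dx = C256 - lS^2 exactly (the C256-to-lx rounding loss plus the
  // fenced residual lx - lS^2), so lE is the signed adjustment to the
  // integer coefficient.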

  // get the low part of the coefficient
  X = CS.w[0];
  if (lCl >= 0) {
    SE = (SINT64) (lE);
    CS.w[0] += SE;
    if (CS.w[0] < X)
      CS.w[1]++;
  } else {
    SE = (SINT64) (-lE);
    CS.w[0] -= SE;
    if (CS.w[0] > X)
      CS.w[1]--;
  }

  pCS->w[0] = CS.w[0];
  pCS->w[1] = CS.w[1];
}

#else

extern double sqrt (double);

__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  UINT256 ARS, ARS0, AE0, AE, S;
  UINT64 MY, ES, CY;
  double lx, l64;
  int_double f64, ly;
  int ey, k;

  // 2^64
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;
  lx = (double) A10.w[1] * l64 + (double) A10.w[0];
  ly.d = 1.0 / sqrt (lx);

  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);
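  // Unpack the estimate: ly = MY * 2^(-52-ey) with MY in [2^52, 2^53),
  // i.e. MY is RS = 1/sqrt(A10) scaled to a 53-bit integer.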

  // A10*RS^2
  __mul_64x128_to_192 (ARS0, MY, A10);
  __mul_64x192_to_256 (ARS, MY, ARS0);

  // shr by 2*ey+40 to get a 64-bit value
  k = (ey << 1) + 104 - 64;
  if (k >= 128)
    ES = (ARS.w[2] >> (k - 128)) | (ARS.w[3] << (192 - k));
  else {
    if (k >= 64) {
      ARS.w[0] = ARS.w[1];
      ARS.w[1] = ARS.w[2];
      k -= 64;
    }
    if (k) {
      __shr_128 (ARS, ARS, k);
    }
    ES = ARS.w[0];
  }

  ES = ((SINT64) ES) >> 1;
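  // ES now holds d/2 scaled by 2^64 (signed), where A10*RS^2 = 1 + d:
  // the shifted product is (1+d)*2^64, which wraps to d*2^64 in 64
  // bits.  From sqrt(A10) = A10*RS*(1+d)^(-1/2)
  //                       = A10*RS*(1 - d/2 + (3/8)*d^2 - ...),
  // the first-order result is A10*RS -/+ A10*RS*|d/2|, formed below.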

  if (((SINT64) ES) < 0) {
    ES = -ES;

    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    __add_carry_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] + AE.w[2] + CY;
  } else {
    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);

    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];

    __sub_borrow_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] - AE.w[2] - CY;
  }

  k = ey + 51;

  if (k >= 64) {
    if (k >= 128) {
      S.w[0] = S.w[2];
      S.w[1] = 0;
      k -= 128;
    } else {
      S.w[0] = S.w[1];
      S.w[1] = S.w[2];
    }
    k -= 64;
  }
  if (k) {
    __shr_128 (S, S, k);
  }

  return (UINT64) ((S.w[0] + 1) >> 1);
}
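
/* Illustrative sketch only (not used by the library): the same scheme,
   a reciprocal-sqrt estimate in double refined by exact integer
   arithmetic, written for a single 64-bit operand.  The helper name
   isqrt64_sketch and the unsigned __int128 type (a GCC extension) are
   local to this sketch.  */
#if 0
#include <math.h>

static UINT64
isqrt64_sketch (UINT64 A)
{
  UINT64 S;
  if (A == 0)
    return 0;
  // Initial estimate, accurate to roughly 2^-52 in relative terms.
  S = (UINT64) ((double) A * (1.0 / sqrt ((double) A)));
  // Exact correction: move S onto floor(sqrt(A)).
  while ((unsigned __int128) S * S > A)
    S--;
  while ((unsigned __int128) (S + 1) * (S + 1) <= A)
    S++;
  return S;
}
#endif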

__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT512 ARS0, ARS;
  UINT256 ARS00, AE, AE2, S;
  UINT128 ES, ES2, ARS1;
  UINT64 ES32, CY, MY;
  double l64, l128, lx, l2, l1, l0;
  int_double f64, ly;
  int ey, k, k2;

  // 2^64
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;

  l128 = l64 * l64;
  lx = (double) C256.w[3] * l64 * l128;
  l2 = (double) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (double) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (double) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  ly.d = 1.0 / sqrt (lx);

  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);
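  // Same decode as in short_sqrt128 above: ly = MY * 2^(-52-ey).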

  // C256*RS^2, scaled by 2^(2*ey+104)
  __mul_64x256_to_320 (ARS0, MY, C256);
  __mul_64x320_to_384 (ARS, MY, ARS0);

  // shr by k = (2*ey+104) - 128;
  // k is expected to be in (192, 256) when the result is in [10^33, 10^34);
  // an additional signed shift by 1 is applied at the same time (to get eps = eps0/2)
  k = (ey << 1) + 104 - 128 - 192;
  k2 = 64 - k;
  ES.w[0] = (ARS.w[3] >> (k + 1)) | (ARS.w[4] << (k2 - 1));
  ES.w[1] = (ARS.w[4] >> k) | (ARS.w[5] << k2);
  ES.w[1] = ((SINT64) ES.w[1]) >> 1;
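  // 128-bit analogue of the short path: ES is d/2 scaled by 2^128
  // (signed), where C256*RS^2 = 1 + d.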

  // A*RS >> 192 (for the error-term computation)
  ARS1.w[0] = ARS0.w[3];
  ARS1.w[1] = ARS0.w[4];

  // A*RS >> 64
  ARS00.w[0] = ARS0.w[1];
  ARS00.w[1] = ARS0.w[2];
  ARS00.w[2] = ARS0.w[3];
  ARS00.w[3] = ARS0.w[4];

  if (((SINT64) ES.w[1]) < 0) {
    ES.w[0] = -ES.w[0];
    ES.w[1] = -ES.w[1];
    if (ES.w[0])
      ES.w[1]--;

    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    __add_carry_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __add_carry_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] + AE.w[3] + CY;
  } else {
    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);

    __sub_borrow_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __sub_borrow_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] - AE.w[3] - CY;
  }

  // 3/2*eps^2, scaled by 2^128
  ES32 = ES.w[1] + (ES.w[1] >> 1);
  __mul_64x64_to_128 (ES2, ES32, ES.w[1]);
  // A*RS*3/2*eps^2
  __mul_128x128_to_256 (AE2, ES2, ARS1);
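  // Second-order term of the expansion used above:
  // (1+d)^(-1/2) = 1 - d/2 + (3/8)*d^2 - ...; with eps = |d/2| the
  // quadratic term is (3/2)*eps^2.  Since eps ~ 2^-52, only the high
  // word ES.w[1] contributes; this term carries the accuracy to the
  // ~113 bits a 34-digit coefficient requires.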

  // result, scaled by 2^(ey+52-64)
  __add_carry_out (S.w[0], CY, S.w[0], AE2.w[0]);
  __add_carry_in_out (S.w[1], CY, S.w[1], AE2.w[1], CY);
  __add_carry_in_out (S.w[2], CY, S.w[2], AE2.w[2], CY);
  S.w[3] = S.w[3] + AE2.w[3] + CY;

  // k in (0, 64)
  k = ey + 51 - 128;
  k2 = 64 - k;
  S.w[0] = (S.w[1] >> k) | (S.w[2] << k2);
  S.w[1] = (S.w[2] >> k) | (S.w[3] << k2);

  // round to nearest
  S.w[0]++;
  if (!S.w[0])
    S.w[1]++;

  pCS->w[0] = (S.w[1] << 63) | (S.w[0] >> 1);
  pCS->w[1] = S.w[1] >> 1;
}

#endif
#endif