gallium/util: rewrite global constructor system for half floats (GCC/MSVC only!)
[mesa.git] / src / gallium / auxiliary / util / u_half.c
1 /*
2 * Copyright 2010 Luca Barbieri
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the
13 * next paragraph) shall be included in all copies or substantial
14 * portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
20 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 **************************************************************************/
25
26 /* The code is a reimplementation of the algorithm in
27 * www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
28 * "Fast Half Float Conversions" by Jeroen van der Zijp, Nov 2008
29 *
30 * The table contents have been slightly changed so that the exponent
31 * bias is now in the exponent table instead of the mantissa table (mostly
32 * for cosmetic reasons, and because it theoretically allows a variant
33 * that flushes denormal to zero but uses a mantissa table with 24-bit
34 * entries).
35 *
36 * The tables are also constructed slightly differently.
37 */
38
39 /* Note that using a 64K * 4 table is a terrible idea since it will not fit
40 * in the L1 cache and will massively pollute the L2 cache as well
41 *
42 * These should instead fit in the L1 cache.
43 *
44 * TODO: we could use a denormal bias table instead of the mantissa/offset
45 * tables: this would reduce the L1 cache usage from 8704 to 2304 bytes
46 * but would involve more computation
47 *
48 * Note however that if denormals are never encountered, the L1 cache usage
49 * is only about 4608 bytes anyway.
50 */
51
52 #include "util/u_half.h"
53 #include "util/u_init.h"
54
/*
 * Lookup tables for half <-> float conversion.  They start out zeroed and
 * are filled in by util_half_init_tables() in this file.
 */

/* half -> float: entries 0..1023 are written as zero/denormals, entries
 * 1024..2047 as normals. */
uint32_t util_half_to_float_mantissa_table[1024 + 1024];

/* half -> float: entries 0..31 are written as the positive values,
 * entries 32..63 as the negative values. */
uint32_t util_half_to_float_exponent_table[32 + 32];

/* half -> float: each entry is written as either 0 (zero/denormal rows) or
 * 1024 (all other rows); same 32 + 32 layout as the exponent table. */
uint32_t util_half_to_float_offset_table[32 + 32];

/* float -> half: entries 0..255 are written as the positive values and
 * entries 256..511 as the same values with 0x8000 OR-ed in. */
uint16_t util_float_to_half_base_table[256 + 256];

/* float -> half: shift amounts, laid out like the base table; entries
 * 256..511 are a copy of entries 0..255. */
uint8_t util_float_to_half_shift_table[256 + 256];
60
61 static void util_half_init_tables(void)
62 {
63 int i;
64
65 /* zero */
66 util_half_to_float_mantissa_table[0] = 0;
67
68 /* denormals */
69 for(i = 1; i < 1024; ++i) {
70 unsigned int m = i << 13;
71 unsigned int e = 0;
72
73 /* Normalize number */
74 while(!(m & 0x00800000)) {
75 e -= 0x00800000;
76 m<<=1;
77 }
78 m &= ~0x00800000;
79 e+= 0x38800000;
80 util_half_to_float_mantissa_table[i] = m | e;
81 }
82
83 /* normals */
84 for(i = 1024; i < 2048; ++i)
85 util_half_to_float_mantissa_table[i] = ((i-1024)<<13);
86
87 /* positive zero or denormals */
88 util_half_to_float_exponent_table[0] = 0;
89
90 /* positive numbers */
91 for(i = 1; i <= 30; ++i)
92 util_half_to_float_exponent_table[i] = 0x38000000 + (i << 23);
93
94 /* positive infinity/NaN */
95 util_half_to_float_exponent_table[31] = 0x7f800000;
96
97 /* negative zero or denormals */
98 util_half_to_float_exponent_table[32] = 0x80000000;
99
100 /* negative numbers */
101 for(i = 33; i <= 62; ++i)
102 util_half_to_float_exponent_table[i] = 0xb8000000 + ((i - 32) << 23);
103
104 /* negative infinity/NaN */
105 util_half_to_float_exponent_table[63] = 0xff800000;
106
107 /* positive zero or denormals */
108 util_half_to_float_offset_table[0] = 0;
109
110 /* positive normals */
111 for(i = 1; i < 32; ++i)
112 util_half_to_float_offset_table[i] = 1024;
113
114 /* negative zero or denormals */
115 util_half_to_float_offset_table[32] = 0;
116
117 /* negative normals */
118 for(i = 33; i < 64; ++i)
119 util_half_to_float_offset_table[i] = 1024;
120
121
122
123 /* very small numbers mapping to zero */
124 for(i = -127; i < -24; ++i) {
125 util_float_to_half_base_table[127 + i] = 0;
126 util_float_to_half_shift_table[127 + i] = 24;
127 }
128
129 /* small numbers mapping to denormals */
130 for(i = -24; i < -14; ++i) {
131 util_float_to_half_base_table[127 + i] = 0x0400 >> (-14 - i);
132 util_float_to_half_shift_table[127 + i] = -i - 1;
133 }
134
135 /* normal numbers */
136 for(i = -14; i < 16; ++i) {
137 util_float_to_half_base_table[127 + i] = (i + 15) << 10;
138 util_float_to_half_shift_table[127 + i] = 13;
139 }
140
141 /* large numbers mapping to infinity */
142 for(i = 16; i < 128; ++i) {
143 util_float_to_half_base_table[127 + i] = 0x7c00;
144 util_float_to_half_shift_table[127 + i] = 24;
145 }
146
147 /* infinity and NaNs */
148 util_float_to_half_base_table[255] = 0x7c00;
149 util_float_to_half_shift_table[255] = 13;
150
151 /* negative numbers */
152 for(i = 0; i < 256; ++i) {
153 util_float_to_half_base_table[256 + i] = util_float_to_half_base_table[i] | 0x8000;
154 util_float_to_half_shift_table[256 + i] = util_float_to_half_shift_table[i];
155 }
156 }
157
158 UTIL_INIT(util_half_init_tables);