/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between
 * several types accurately and efficiently.
 *
 * Conversion between types of different bit widths is quite complex, so
 * there are a few invariants to keep in mind when converting types:
 *
 * - register width must remain constant:
 *
 *     src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 * - total number of elements must remain constant:
 *
 *     src_type.length * num_srcs == dst_type.length * num_dsts
 *
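 * For example (purely illustrative numbers, not from the original comment),
 * converting four vectors of 4 x f32 into one vector of 16 x u8 satisfies
 * both invariants: 32 * 4 == 8 * 16 and 4 * 4 == 16 * 1.
 *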
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to a lack of adequate machine instructions. In
 * those cases it is important not to take shortcuts and sacrifice accuracy,
 * as these functions can be used anywhere. In the future we might have a
 * precision parameter to gauge the accuracy vs. efficiency compromise, but
 * for now, if the data conversion between two stages happens to be the
 * bottleneck, it is most likely best to avoid converting at all and run both
 * stages with the same type.
 *
 * Make sure to run the lp_test_conv unit test after any change to this file.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_intr.h"
#include "lp_bld_arit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"


/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive, but it is actually
 * paramount we do it this way for several reasons. First, there is no
 * single-precision FP to unsigned integer conversion Intel SSE instruction.
 * Second, even if there were, since the FP mantissa takes only a fraction of
 * the register bits, the typical scale-and-cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width
 * specified by dst_width, the actual result type will have the same width as
 * the source type.
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(src_type);
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(src_type.floating);

   mantissa = lp_mantissa(src_type);

   /* We cannot carry more bits than the mantissa */
   n = MIN2(mantissa, dst_width);

   /* These magic coefficients make the desired result appear in the least
    * significant bits of the mantissa.
    */
   ubound = ((unsigned long long)1 << n);
   mask = ubound - 1;
   scale = (double)mask/ubound;
   bias = (double)((unsigned long long)1 << (mantissa - n));

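   /*
    * Illustrative example (these numbers are not in the original comments):
    * with 32-bit floats and dst_width = 8, we get n = 8, ubound = 256,
    * mask = 255, scale = 255/256 and bias = 2^15.  A clamped x in [0,1]
    * becomes x*255/256 + 2^15, and since one mantissa ulp at 2^15 is 2^-8,
    * the lowest 8 mantissa bits of the sum hold round(x*255).
    */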
   res = LLVMBuildMul(builder, src, lp_build_const_scalar(src_type, scale), "");
   res = LLVMBuildAdd(builder, res, lp_build_const_scalar(src_type, bias), "");
   res = LLVMBuildBitCast(builder, res, int_vec_type, "");

   if(dst_width > n) {
      int shift = dst_width - n;
      res = LLVMBuildShl(builder, res, lp_build_int_const_scalar(src_type, shift), "");

      /* TODO: Fill in the empty lower bits for additional precision? */
#if 0
      {
         LLVMValueRef msb;
         msb = LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, dst_width - 1), "");
         msb = LLVMBuildShl(builder, msb, lp_build_int_const_scalar(src_type, shift), "");
         msb = LLVMBuildSub(builder, msb, lp_build_int_const_scalar(src_type, 1), "");
         res = LLVMBuildOr(builder, res, msb, "");
      }
#elif 0
      while(shift > 0) {
         res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, n), ""), "");
         shift -= n;
         n *= 2;
      }
#endif
   }
   else
      res = LLVMBuildAnd(builder, res, lp_build_int_const_scalar(src_type, mask), "");

   return res;
}


/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMTypeRef vec_type = lp_build_vec_type(dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   mantissa = lp_mantissa(dst_type);

   n = MIN2(mantissa, src_width);

   ubound = ((unsigned long long)1 << n);
   mask = ubound - 1;
   scale = (double)ubound/mask;
   bias = (double)((unsigned long long)1 << (mantissa - n));

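   /*
    * Illustrative example (these numbers are not in the original comments):
    * for 8-bit unsigned norm input and 32-bit floats, n = 8, scale = 256/255
    * and bias = 2^15.  OR-ing the 8-bit value v into the mantissa of 2^15
    * yields the float 2^15 + v/256, so subtracting the bias and multiplying
    * by 256/255 recovers v/255 in [0,1].
    */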
   res = src;

   if(src_width > mantissa) {
      int shift = src_width - mantissa;
      res = LLVMBuildLShr(builder, res, lp_build_int_const_scalar(dst_type, shift), "");
   }

   bias_ = lp_build_const_scalar(dst_type, bias);

   res = LLVMBuildOr(builder,
                     res,
                     LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

   res = LLVMBuildBitCast(builder, res, vec_type, "");

   res = LLVMBuildSub(builder, res, bias_, "");
   res = LLVMBuildMul(builder, res, lp_build_const_scalar(dst_type, scale), "");

   return res;
}


/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
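 *
 * Example (hypothetical usage sketch, not part of the original file):
 * converting four vectors of 4 x f32 into one vector of 16 x unorm8, where
 * f32_type is {floating=1, sign=1, width=32, length=4} and unorm8_type is
 * {norm=1, width=8, length=16}:
 *
 *    LLVMValueRef rgba_f[4];   // four vectors of 4 floats each
 *    LLVMValueRef rgba_u8;     // one vector of 16 bytes
 *    lp_build_conv(builder, f32_type, unorm8_type, rgba_f, 4, &rgba_u8, 1);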
 */
void
lp_build_conv(LLVMBuilderRef builder,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision can change. */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i)
      tmp[i] = src[i];
   num_tmps = num_srcs;

   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, builder, tmp_type);

      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_scalar(src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_scalar(src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }

   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(builder,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);
         LLVMTypeRef tmp_vec_type;

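         /*
          * Illustrative note (not in the original comments): for an 8-bit
          * signed normalized destination lp_const_scale() is nominally 127.0,
          * so floats already clamped to [-1,1] are scaled to [-127,127]
          * before the float-to-int conversion below.
          */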
         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_scalar(tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
         }

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = FALSE;
         tmp_vec_type = lp_build_vec_type(tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
            tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);

      /* FIXME: compensate different offsets too */
      if(src_shift > dst_shift) {
         LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, src_shift - dst_shift);
         for(i = 0; i < num_tmps; ++i)
            if(src_type.sign)
               tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
      }
   }

   /*
    * Truncate or expand bit width
    */

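   /*
    * For example (illustrative only, not from the original comments): two
    * 8 x i16 vectors can be packed into a single 16 x u8 vector here, or one
    * 16 x u8 vector unpacked into two 8 x i16 vectors.
    */
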
   assert(!tmp_type.floating || tmp_type.width == dst_type.width);

   if(tmp_type.width > dst_type.width) {
      assert(num_dsts == 1);
      tmp[0] = lp_build_pack(builder, tmp_type, dst_type, TRUE, tmp, num_tmps);
      tmp_type.width = dst_type.width;
      tmp_type.length = dst_type.length;
      num_tmps = 1;
   }

   if(tmp_type.width < dst_type.width) {
      assert(num_tmps == 1);
      lp_build_unpack(builder, tmp_type, dst_type, tmp[0], tmp, num_dsts);
      tmp_type.width = dst_type.width;
      tmp_type.length = dst_type.length;
      num_tmps = num_dsts;
   }

   assert(tmp_type.width == dst_type.width);
   assert(tmp_type.length == dst_type.length);
   assert(num_tmps == num_dsts);

   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(builder,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized float for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_scalar(tmp_type, 1.0/src_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);

      /* FIXME: compensate different offsets too */
      if(src_shift < dst_shift) {
         LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, dst_shift - src_shift);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
      }
   }

   for(i = 0; i < num_dsts; ++i)
      dst[i] = tmp[i];
}


/**
 * Bit mask conversion.
 *
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or
 * one. Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
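 *
 * For example (illustrative only, not from the original comment), four
 * 4 x i32 masks can be converted into a single 16 x i8 mask, keeping the
 * same invariants as lp_build_conv.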
 */
void
lp_build_conv_mask(LLVMBuilderRef builder,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
{
   /* Register width must remain constant */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);

   /* We must not lose or gain channels. Only precision can change. */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /*
    * Drop the float/fixed/norm interpretation and treat both types as plain
    * signed integers.
    *
    * We assume all values are 0 or -1.
    */

   src_type.floating = FALSE;
   src_type.fixed = FALSE;
   src_type.sign = TRUE;
   src_type.norm = FALSE;

   dst_type.floating = FALSE;
   dst_type.fixed = FALSE;
   dst_type.sign = TRUE;
   dst_type.norm = FALSE;

   /*
    * Truncate or expand bit width
    */

   if(src_type.width > dst_type.width) {
      assert(num_dsts == 1);
      dst[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs);
   }
   else if(src_type.width < dst_type.width) {
      assert(num_srcs == 1);
      lp_build_unpack(builder, src_type, dst_type, src[0], dst, num_dsts);
   }
   else {
      assert(num_srcs == num_dsts);
      memcpy(dst, src, num_dsts * sizeof *dst);
   }
}