src/gallium/auxiliary/gallivm/lp_bld_conv.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper functions for type conversions.
32 *
33 * We want to use the fastest type for a given computation whenever feasible.
34 * The other side of this is that we need to be able convert between several
35 * types accurately and efficiently.
36 *
37 * Conversion between types of different bit width is quite complex, since a
38 * change in bit width also changes the number of registers needed.
39 * There are a few invariants to keep in mind (see the example below):
40 *
41 * - register width must remain constant:
42 *
43 * src_type.width * src_type.length == dst_type.width * dst_type.length
44 *
45 * - total number of elements must remain constant:
46 *
47 * src_type.length * num_srcs == dst_type.length * num_dsts
48 *
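 *   A concrete (purely illustrative) instance of both invariants: converting
 *   8 vectors of 4 x float32 into 2 vectors of 16 x uint8 keeps the register
 *   width constant (32 * 4 == 8 * 16) and the element count constant
 *   (4 * 8 == 16 * 2).
 *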
49 * It is not always possible to do the conversion both accurately and
50 * efficiently, usually due to lack of adequate machine instructions. In these
51 * cases it is important not to take shortcuts here and sacrifice accuracy, as
52 * these functions can be used anywhere. In the future we might have a
53 * precision parameter which can gauge the accuracy vs efficiency compromise,
54 * but for now if the data conversion between two stages happens to be the
55 * bottleneck, then it is most likely best to avoid converting at all and run
56 * both stages with the same type.
57 *
58 * Make sure to run lp_test_conv unit test after any change to this file.
59 *
60 * @author Jose Fonseca <jfonseca@vmware.com>
61 */
62
63
64 #include "util/u_debug.h"
65 #include "util/u_math.h"
66 #include "util/u_half.h"
67 #include "util/u_cpu_detect.h"
68
69 #include "lp_bld_type.h"
70 #include "lp_bld_const.h"
71 #include "lp_bld_arit.h"
72 #include "lp_bld_bitarit.h"
73 #include "lp_bld_pack.h"
74 #include "lp_bld_conv.h"
75 #include "lp_bld_logic.h"
76 #include "lp_bld_intr.h"
77 #include "lp_bld_printf.h"
78
79
80
81 /**
82 * Byte swap each element. Constructs a call to the llvm.bswap intrinsic
83 * appropriate for the type.
84 *
85 * @param gallivm  the gallivm state
86 * @param res      element to byte swap.
87 * @param type     int16_t, int32_t, int64_t, float or double
88 */
89 LLVMValueRef
90 lp_build_bswap(struct gallivm_state *gallivm,
91 LLVMValueRef res,
92 struct lp_type type)
93 {
94 LLVMTypeRef int_type = LLVMIntTypeInContext(gallivm->context,
95 type.width);
96 const char *intrinsic = NULL;
97 if (type.width == 8)
98 return res;
99 if (type.width == 16)
100 intrinsic = "llvm.bswap.i16";
101 else if (type.width == 32)
102 intrinsic = "llvm.bswap.i32";
103 else if (type.width == 64)
104 intrinsic = "llvm.bswap.i64";
105
106 assert (intrinsic != NULL);
107
108 /* In case of a floating-point type, cast to an int of the same size, then
109 * cast back to the fp type.
110 */
111 if (type.floating)
112 res = LLVMBuildBitCast(gallivm->builder, res, int_type, "");
113 res = lp_build_intrinsic_unary(gallivm->builder, intrinsic, int_type, res);
114 if (type.floating)
115 res = LLVMBuildBitCast(gallivm->builder, res,
116 lp_build_elem_type(gallivm, type), "");
117 return res;
118 }
119
120
121 /**
122 * Byte swap every element in the vector.
123 *
124 * @param packed <vector> to convert
125 * @param src_type_vec <vector> type of int16_t, int32_t, int64_t, float or
126 * double
127 * @param dst_type_vec <vector> type to return
128 */
129 LLVMValueRef
130 lp_build_bswap_vec(struct gallivm_state *gallivm,
131 LLVMValueRef packed,
132 struct lp_type src_type_vec,
133 struct lp_type dst_type_vec)
134 {
135 LLVMBuilderRef builder = gallivm->builder;
136 LLVMTypeRef dst_type = lp_build_elem_type(gallivm, dst_type_vec);
137 LLVMValueRef res;
138
139 if (src_type_vec.length == 1) {
140 res = lp_build_bswap(gallivm, packed, src_type_vec);
141 res = LLVMBuildBitCast(gallivm->builder, res, dst_type, "");
142 } else {
143 unsigned i;
144 res = LLVMGetUndef(lp_build_vec_type(gallivm, dst_type_vec));
145 for (i = 0; i < src_type_vec.length; ++i) {
146 LLVMValueRef index = lp_build_const_int32(gallivm, i);
147 LLVMValueRef elem = LLVMBuildExtractElement(builder, packed, index, "");
148 elem = lp_build_bswap(gallivm, elem, src_type_vec);
149 elem = LLVMBuildBitCast(gallivm->builder, elem, dst_type, "");
150 res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, "");
151 }
152 }
153 return res;
154 }
155
156
157 /**
158 * Convert float32 to a float-like value with fewer exponent and mantissa
159 * bits. The exponent is still biased, and the mantissa still has an implied 1,
160 * but there's no sign bit.
161 *
162 * @param src (vector) float value to convert
163 * @param mantissa_bits the number of mantissa bits
164 * @param exponent_bits the number of exponent bits
165 *
166 * Unlike float_to_half, an accurate method is used here.
167 * This implements round-towards-zero (trunc), hence too large numbers get
168 * converted to the largest representable number, not infinity.
169 * Small numbers may get converted to denorms, depending on normal
170 * float denorm handling of the cpu.
171 * Note that compared to the references, below, we skip any rounding bias
172 * since we do rounding towards zero - OpenGL allows rounding towards zero
173 * (though not preferred) and DX10 even seems to require it.
174 * Note that this will not do any packing - the value will
175 * look like a "rescaled float" (except for Inf/NaN) but be returned
176 * as int32.
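 *
 * A worked example (purely illustrative, for the 5-bit exponent / 6-bit
 * mantissa case): src = 0.5f is 0x3f000000. The magic constant is
 * ((1 << 4) - 1) << 23 = 0x07800000, i.e. 2^-112 as a float, so the multiply
 * yields 2^-113, whose float32 exponent field is 127 - 113 = 14, which is
 * exactly the biased 5-bit exponent of 0.5 (bias 15). The 6 mantissa bits
 * sit at bit positions 17..22, ready for the caller to shift into place.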
177 *
178 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
179 * ref https://gist.github.com/rygorous/2156668
180 */
181 static LLVMValueRef
182 lp_build_float_to_smallfloat_nosign(struct gallivm_state *gallivm,
183 struct lp_type i32_type,
184 LLVMValueRef src,
185 unsigned mantissa_bits,
186 unsigned exponent_bits)
187 {
188 LLVMBuilderRef builder = gallivm->builder;
189 LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal;
190 LLVMValueRef clamped, tmp, i32_roundmask, small_max, src_abs;
191 LLVMValueRef is_nan, is_posinf, is_nan_or_posinf, i32_qnanbit, nan_or_posinf;
192 struct lp_type f32_type = lp_type_float_vec(32, 32 * i32_type.length);
193 struct lp_build_context f32_bld, i32_bld;
194 LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f);
195
196 lp_build_context_init(&f32_bld, gallivm, f32_type);
197 lp_build_context_init(&i32_bld, gallivm, i32_type);
198
199 i32_smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
200 ((1 << exponent_bits) - 1) << 23);
201 i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
202
203 /* "ordinary" number */
204 /* clamp to pos range (can still have sign bit if NaN or negative zero) */
205 clamped = lp_build_max(&f32_bld, src, zero);
206 clamped = LLVMBuildBitCast(builder, clamped, i32_bld.vec_type, "");
207 /* get rid of excess mantissa bits, and while here also potential sign bit */
208 i32_roundmask = lp_build_const_int_vec(gallivm, i32_type,
209 ~((1 << (23 - mantissa_bits)) - 1) |
210 0x7fffffff);
211
212 tmp = lp_build_and(&i32_bld, clamped, i32_roundmask);
213 tmp = LLVMBuildBitCast(builder, tmp, f32_bld.vec_type, "");
214 /* bias exponent (and denormalize if necessary) */
215 magic = lp_build_const_int_vec(gallivm, i32_type,
216 ((1 << (exponent_bits - 1)) - 1) << 23);
217 magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
218 normal = lp_build_mul(&f32_bld, tmp, magic);
219
220 /* clamp to max value */
221 small_max = lp_build_const_int_vec(gallivm, i32_type,
222 (((1 << exponent_bits) - 2) << 23) |
223 (((1 << mantissa_bits) - 1) << (23 - mantissa_bits)));
224 small_max = LLVMBuildBitCast(builder, small_max, f32_bld.vec_type, "");
225 normal = lp_build_min(&f32_bld, normal, small_max);
226 normal = LLVMBuildBitCast(builder, normal, i32_bld.vec_type, "");
227
228 /*
229 * handle nan/inf cases
230 * a little bit tricky since -Inf -> 0, +Inf -> +Inf, +-Nan -> +Nan
231 * Note that on a lucky day, we could simplify this a bit,
232 * by just using the max(src, zero) result - this will have -Inf
233 * clamped to 0, and MIGHT preserve the NaNs.
234 */
235 src_abs = lp_build_abs(&f32_bld, src);
236 src_abs = LLVMBuildBitCast(builder, src_abs, i32_bld.vec_type, "");
237 src = LLVMBuildBitCast(builder, src, i32_bld.vec_type, "");
238 is_nan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER,
239 src_abs, i32_floatexpmask);
240 is_posinf = lp_build_compare(gallivm, i32_type, PIPE_FUNC_EQUAL,
241 src, i32_floatexpmask);
242 is_nan_or_posinf = lp_build_or(&i32_bld, is_nan, is_posinf);
243 /* could also set more mantissa bits but need at least the highest mantissa bit */
244 i32_qnanbit = lp_build_const_vec(gallivm, i32_type, 1 << 22);
245 /* combine maxexp with qnanbit */
246 nan_or_posinf = lp_build_or(&i32_bld, i32_smallexpmask,
247 lp_build_and(&i32_bld, is_nan, i32_qnanbit));
248
249 return lp_build_select(&i32_bld, is_nan_or_posinf, nan_or_posinf, normal);
250 }
251
252
253 /**
254 * Convert rgba float SoA values to packed r11g11b10 values.
255 *
256 * @param src SoA float (vector) values to convert.
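 *
 * Assumed layout of the packed r11g11b10 result (mantissa in the low bits of
 * each component, exponent above it):
 *   bits  0..10  red   (6-bit mantissa, 5-bit exponent)
 *   bits 11..21  green (6-bit mantissa, 5-bit exponent)
 *   bits 22..31  blue  (5-bit mantissa, 5-bit exponent)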
257 */
258 LLVMValueRef
259 lp_build_float_to_r11g11b10(struct gallivm_state *gallivm,
260 LLVMValueRef *src)
261 {
262 LLVMValueRef dst, rcomp, bcomp, gcomp, shift, mask;
263 struct lp_build_context i32_bld;
264 LLVMTypeRef src_type = LLVMTypeOf(*src);
265 unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
266 LLVMGetVectorSize(src_type) : 1;
267 struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
268
269 lp_build_context_init(&i32_bld, gallivm, i32_type);
270
271 /* "rescale" - this does the actual conversion except the packing */
272 rcomp = lp_build_float_to_smallfloat_nosign(gallivm, i32_type, src[0], 6, 5);
273 gcomp = lp_build_float_to_smallfloat_nosign(gallivm, i32_type, src[1], 6, 5);
274 bcomp = lp_build_float_to_smallfloat_nosign(gallivm, i32_type, src[2], 5, 5);
275
276 /* pack rescaled SoA floats to r11g11b10 AoS values */
277 shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 6);
278 rcomp = lp_build_shr(&i32_bld, rcomp, shift);
279
280 shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 17);
281 mask = lp_build_const_int_vec(gallivm, i32_type, 0x7ff << 11);
282 gcomp = lp_build_shr(&i32_bld, gcomp, shift);
283 gcomp = lp_build_and(&i32_bld, gcomp, mask);
284
285 shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23);
286 mask = lp_build_const_int_vec(gallivm, i32_type, 0x3ff << 22);
287 bcomp = lp_build_shl(&i32_bld, bcomp, shift);
288 bcomp = lp_build_and(&i32_bld, bcomp, mask);
289
290 dst = lp_build_or(&i32_bld, rcomp, gcomp);
291 return lp_build_or(&i32_bld, dst, bcomp);
292 }
293
294
295 /**
296 * Convert a float-like value with fewer exponent and mantissa
297 * bits than a normal float32 to a float32. The mantissa of
298 * the source value is assumed to have an implied 1, and the exponent
299 * is biased. There are no negative values.
300 * The source value to extract must be in a 32bit int.
301 * While this helper is generic, it is only ever going to be useful for
302 * r11g11b10 (no other common format exists with the same properties).
303 *
304 * @param src (vector) value to convert
305 * @param mantissa_bits the number of mantissa bits
306 * @param exponent_bits the number of exponent bits
307 * @param mantissa_start the bit start position of the packed component
308 *
309 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
310 * ref https://gist.github.com/rygorous/2156668
311 */
312 static LLVMValueRef
313 lp_build_smallfloat_nosign_to_float(struct gallivm_state *gallivm,
314 struct lp_type f32_type,
315 LLVMValueRef src,
316 unsigned mantissa_bits,
317 unsigned exponent_bits,
318 unsigned mantissa_start)
319 {
320 LLVMBuilderRef builder = gallivm->builder;
321 LLVMValueRef smallexpmask, i32_floatexpmask, magic;
322 LLVMValueRef wasinfnan, tmp, res, shift, mask;
323 unsigned exponent_start = mantissa_start + mantissa_bits;
324 struct lp_type i32_type = lp_type_int_vec(32, 32 * f32_type.length);
325 struct lp_build_context f32_bld, i32_bld;
326
327 lp_build_context_init(&f32_bld, gallivm, f32_type);
328 lp_build_context_init(&i32_bld, gallivm, i32_type);
329
330 /* extract the component to "float position" */
331 if (exponent_start < 23) {
332 shift = lp_build_const_int_vec(gallivm, i32_type, 23 - exponent_start);
333 src = lp_build_shl(&i32_bld, src, shift);
334 }
335 else {
336 shift = lp_build_const_int_vec(gallivm, i32_type, exponent_start - 23);
337 src = lp_build_shr(&i32_bld, src, shift);
338 }
339 mask = lp_build_const_int_vec(gallivm, i32_type,
340 ((1 << (mantissa_bits + exponent_bits)) - 1) <<
341 (23 - mantissa_bits));
342 src = lp_build_and(&i32_bld, src, mask);
343 src = LLVMBuildBitCast(builder, src, f32_bld.vec_type, "");
344
345 /* now do the actual scaling */
346 smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
347 ((1 << exponent_bits) - 1) << 23);
348 i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
349 /*
350 * the magic number's exponent field is new exp bias + (new exp bias - old exp bias);
351 * its mantissa is 0.
352 */
353 magic = lp_build_const_int_vec(gallivm, i32_type,
354 (255 - (1 << (exponent_bits - 1))) << 23);
355 magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
356
357 /* adjust exponent and fix denorms */
358 res = lp_build_mul(&f32_bld, src, magic);
359
360 /*
361 * if exp was max (== NaN or Inf) set new exp to max (keep mantissa),
362 * so a simple "or" will do (because exp adjust will leave mantissa intact)
363 */
364 /* use a float compare (better for 8-wide AVX without AVX2; otherwise an int compare would do) */
365 smallexpmask = LLVMBuildBitCast(builder, smallexpmask, f32_bld.vec_type, "");
366 wasinfnan = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GEQUAL, src, smallexpmask);
367 res = LLVMBuildBitCast(builder, res, i32_bld.vec_type, "");
368 tmp = lp_build_and(&i32_bld, i32_floatexpmask, wasinfnan);
369 res = lp_build_or(&i32_bld, tmp, res);
370
371 return LLVMBuildBitCast(builder, res, f32_bld.vec_type, "");
372 }
373
374
375 /**
376 * Convert packed float format (r11g11b10) value(s) to rgba float SoA values.
377 *
378 * @param src packed AoS r11g11b10 values (as (vector) int32)
379 * @param dst pointer to the SoA result values
380 */
381 void
382 lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
383 LLVMValueRef src,
384 LLVMValueRef *dst)
385 {
386 LLVMTypeRef src_type = LLVMTypeOf(src);
387 unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
388 LLVMGetVectorSize(src_type) : 1;
389 struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
390
391 dst[0] = lp_build_smallfloat_nosign_to_float(gallivm, f32_type, src, 6, 5, 0);
392 dst[1] = lp_build_smallfloat_nosign_to_float(gallivm, f32_type, src, 6, 5, 11);
393 dst[2] = lp_build_smallfloat_nosign_to_float(gallivm, f32_type, src, 5, 5, 22);
394
395 /* Just set alpha to one */
396 dst[3] = lp_build_one(gallivm, f32_type);
397 }
398
399
400 /**
401 * Converts int16 half-float to float32
402 * Note this can be performed in 1 instruction if vcvtph2ps exists (F16C)
403 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
404 *
405 * @param src value to convert
406 *
407 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
408 * ref https://gist.github.com/2144712
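 *
 * Worked example (purely illustrative): half 1.0f is 0x3c00; its exp/mantissa
 * bits shifted left by 13 give 0x07800000, i.e. 2^-112 as a float32, and
 * multiplying by the magic constant (bit pattern (254 - 15) << 23, i.e. 2^112)
 * rebiases the exponent, yielding exactly 1.0f (0x3f800000).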
409 */
410 LLVMValueRef
411 lp_build_half_to_float(struct gallivm_state *gallivm,
412 LLVMValueRef src)
413 {
414 int src_length = LLVMGetVectorSize(LLVMTypeOf(src));
415
416 struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
417 struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
418
419 LLVMBuilderRef builder = gallivm->builder;
420 LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
421 LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);
422
423 /* Constants */
424 LLVMValueRef i32_13 = lp_build_const_int_vec(gallivm, i32_type, 13);
425 LLVMValueRef i32_16 = lp_build_const_int_vec(gallivm, i32_type, 16);
426 LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
427 LLVMValueRef i32_was_infnan = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
428 LLVMValueRef i32_exp_infnan = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
429 LLVMValueRef f32_magic = LLVMBuildBitCast(builder,
430 lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23),
431 float_vec_type, "");
432
433 /* Convert int16 vector to int32 vector by zero ext */
434 LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, "");
435
436 /* Exponent / mantissa bits */
437 LLVMValueRef expmant = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
438 LLVMValueRef shifted = LLVMBuildBitCast(builder, LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, "");
439
440 /* Exponent adjust */
441 LLVMValueRef scaled = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, "");
442
443 /* Make sure Inf/NaN survive */
444 LLVMValueRef b_wasinfnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan);
445 LLVMValueRef infnanexp = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");
446
447 /* Sign bit */
448 LLVMValueRef justsign = LLVMBuildXor(builder, h, expmant, "");
449 LLVMValueRef sign = LLVMBuildShl(builder, justsign, i32_16, "");
450
451 /* Combine result */
452 LLVMValueRef sign_inf = LLVMBuildOr(builder, sign, infnanexp, "");
453 LLVMValueRef final = LLVMBuildOr(builder, scaled, sign_inf, "");
454
455 /* Cast from int32 vector to float32 vector */
456 return LLVMBuildBitCast(builder, final, float_vec_type, "");
457 }
458
459
460 /**
461 * Converts float32 to int16 half-float
462 * Note this can be performed in 1 instruction if vcvtps2ph exists (F16C)
463 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
464 *
465 * @param src value to convert
466 *
467 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
468 * ref https://gist.github.com/2156668
469 *
470 * XXX: This is an approximation. It is faster but certain NaNs are converted to
471 * infinity, and rounding is not correct.
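 *
 * Worked example (purely illustrative): src = 1.0f (0x3f800000) is clamped
 * to itself, multiplied by the magic constant 2^-112 giving bits 0x07800000,
 * and shifting right by 13 yields 0x3c00, the half-float encoding of 1.0.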
472 */
473 LLVMValueRef
474 lp_build_float_to_half(struct gallivm_state *gallivm,
475 LLVMValueRef src)
476 {
477 LLVMBuilderRef builder = gallivm->builder;
478 LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
479 unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
480 ? LLVMGetVectorSize(f32_vec_type) : 1;
481 struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
482 struct lp_type u32_type = lp_type_uint_vec(32, 32 * length);
483 struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
484 LLVMTypeRef u32_vec_type = lp_build_vec_type(gallivm, u32_type);
485 LLVMTypeRef i16_vec_type = lp_build_vec_type(gallivm, i16_type);
486 struct lp_build_context f32_bld;
487 struct lp_build_context u32_bld;
488 LLVMValueRef result;
489
490 lp_build_context_init(&f32_bld, gallivm, f32_type);
491 lp_build_context_init(&u32_bld, gallivm, u32_type);
492
493 {
494 /* Constants */
495 LLVMValueRef u32_f32inf = lp_build_const_int_vec(gallivm, u32_type, 0xff << 23);
496 LLVMValueRef u32_expinf = lp_build_const_int_vec(gallivm, u32_type, 0xe0 << 23);
497 LLVMValueRef f32_f16max = lp_build_const_vec(gallivm, f32_type, 65536.0); // 0x8f << 23
498 LLVMValueRef f32_magic = lp_build_const_vec(gallivm, f32_type, 1.92592994e-34); // 0x0f << 23
499
500 /* Cast from float32 to int32 */
501 LLVMValueRef f = LLVMBuildBitCast(builder, src, u32_vec_type, "");
502
503 /* Remove sign */
504 LLVMValueRef srcabs = lp_build_abs(&f32_bld, src);
505 LLVMValueRef fabs = LLVMBuildBitCast(builder, srcabs, u32_vec_type, "");
506
507 /* Magic conversion */
508 LLVMValueRef clamped = lp_build_min(&f32_bld, f32_f16max, srcabs);
509 LLVMValueRef scaled = LLVMBuildBitCast(builder,
510 LLVMBuildFMul(builder,
511 clamped,
512 f32_magic,
513 ""),
514 u32_vec_type,
515 "");
516 /* Make sure Inf/NaN and unnormalised values survive */
517 LLVMValueRef infnancase = LLVMBuildXor(builder, u32_expinf, fabs, "");
518 LLVMValueRef b_notnormal = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GEQUAL,
519 srcabs,
520 LLVMBuildBitCast(builder, u32_f32inf, f32_vec_type, ""));
521
522 /* Merge the normal and non-normal cases */
523 LLVMValueRef merged = lp_build_select(&u32_bld, b_notnormal, infnancase, scaled);
524 LLVMValueRef shifted = lp_build_shr_imm(&u32_bld, merged, 13);
525
526 /* Sign bit */
527 LLVMValueRef justsign = LLVMBuildXor(builder, f, fabs, "");
528 LLVMValueRef signshifted = lp_build_shr_imm(&u32_bld, justsign, 16);
529
530 /* Combine result */
531 result = LLVMBuildOr(builder, shifted, signshifted, "");
532 }
533
534 result = LLVMBuildTrunc(builder, result, i16_vec_type, "");
535
536 /*
537 * Debugging code.
538 */
539 if (0) {
540 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
541 LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
542 LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
543 LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
544 unsigned i;
545
546 LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
547 LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)util_float_to_half));
548 func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "util_float_to_half");
549
550 for (i = 0; i < length; ++i) {
551 LLVMValueRef index = LLVMConstInt(i32t, i, 0);
552 LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
553 #if 0
554 /* XXX: not really supported by backends */
555 LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
556 #else
557 LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
558 #endif
559 ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
560 }
561
562 lp_build_print_value(gallivm, "src = ", src);
563 lp_build_print_value(gallivm, "llvm = ", result);
564 lp_build_print_value(gallivm, "util = ", ref_result);
565 lp_build_printf(gallivm, "\n");
566 }
567
568 return result;
569 }
570
571
572 /**
573 * Special case for converting clamped IEEE-754 floats to unsigned norms.
574 *
575 * The mathematical voodoo below may seem excessive but it is actually
576 * paramount we do it this way for several reasons. First, there is no single
577 * precision FP to unsigned integer conversion Intel SSE instruction. Second,
578 * even if there was, since the FP's mantissa takes only a fraction of the
579 * register bits, the typical scale-and-cast approach would require double
580 * precision for accurate results, and therefore half the throughput.
581 *
582 * Although the result values can be scaled to an arbitrary bit width specified
583 * by dst_width, the actual result type will have the same width as src_type.
584 *
585 * Ex: src = { float, float, float, float }
586 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
587 */
588 LLVMValueRef
589 lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
590 struct lp_type src_type,
591 unsigned dst_width,
592 LLVMValueRef src)
593 {
594 LLVMBuilderRef builder = gallivm->builder;
595 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
596 LLVMValueRef res;
597 unsigned mantissa;
598
599 assert(src_type.floating);
600 assert(dst_width <= src_type.width);
601 src_type.sign = FALSE;
602
603 mantissa = lp_mantissa(src_type);
604
605 if (dst_width <= mantissa) {
606 /*
607 * Apply magic coefficients that will make the desired result appear in
608 * the least significant bits of the mantissa, with correct rounding.
609 *
610 * This only works if the destination width fits in the mantissa.
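 *
 * Worked example (purely illustrative, dst_width = 8, float32 mantissa = 23):
 * scale = 255/256 and bias = 2^(23 - 8) = 32768.0. src = 1.0f gives
 * 32768 + 255/256; at that magnitude the float32 ULP is exactly 1/256, so
 * the low 8 mantissa bits hold 255, which the final mask extracts.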
611 */
612
613 unsigned long long ubound;
614 unsigned long long mask;
615 double scale;
616 double bias;
617
618 ubound = (1ULL << dst_width);
619 mask = ubound - 1;
620 scale = (double)mask/ubound;
621 bias = (double)(1ULL << (mantissa - dst_width));
622
623 res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
624 res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
625 res = LLVMBuildBitCast(builder, res, int_vec_type, "");
626 res = LLVMBuildAnd(builder, res,
627 lp_build_const_int_vec(gallivm, src_type, mask), "");
628 }
629 else if (dst_width == (mantissa + 1)) {
630 /*
631 * The destination width matches exactly what can be represented in
632 * floating point (i.e., mantissa + 1 bits). So do a straight
633 * multiplication followed by casting. No further rounding is necessary.
634 */
635
636 double scale;
637
638 scale = (double)((1ULL << dst_width) - 1);
639
640 res = LLVMBuildFMul(builder, src,
641 lp_build_const_vec(gallivm, src_type, scale), "");
642 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
643 }
644 else {
645 /*
646 * The destination exceeds what can be represented in floating point.
647 * So multiply by the largest power of two we can get away with, and then
648 * subtract the most significant bit to rescale to normalized values.
649 *
650 * The largest power of two factor we can get away with is
651 * (1 << (src_type.width - 1)), because we need to use a signed conversion.
652 * In theory it should be (1 << (src_type.width - 2)), but IEEE 754 rules
653 * state that INT_MIN should be returned by FPToSI, which is the correct result for
654 * values near 1.0!
655 *
656 * This means we get (src_type.width - 1) correct bits for values near 0.0,
657 * and (mantissa + 1) correct bits for values near 1.0. Equally or more
658 * important, we also get exact results for 0.0 and 1.0.
659 */
660
661 unsigned n = MIN2(src_type.width - 1, dst_width);
662
663 double scale = (double)(1ULL << n);
664 unsigned lshift = dst_width - n;
665 unsigned rshift = n;
666 LLVMValueRef lshifted;
667 LLVMValueRef rshifted;
668
669 res = LLVMBuildFMul(builder, src,
670 lp_build_const_vec(gallivm, src_type, scale), "");
671 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
672
673 /*
674 * Align the most significant bit to its final place.
675 *
676 * This will cause 1.0 to overflow to 0, but the later adjustment will
677 * get it right.
678 */
679 if (lshift) {
680 lshifted = LLVMBuildShl(builder, res,
681 lp_build_const_int_vec(gallivm, src_type,
682 lshift), "");
683 } else {
684 lshifted = res;
685 }
686
687 /*
688 * Align the most significant bit to the right.
689 */
690 rshifted = LLVMBuildLShr(builder, res,
691 lp_build_const_int_vec(gallivm, src_type, rshift),
692 "");
693
694 /*
695 * Subtract the MSB (shifted down to the LSB) from the shifted value,
696 * therefore re-scaling from (1 << dst_width) to ((1 << dst_width) - 1).
697 */
698
699 res = LLVMBuildSub(builder, lshifted, rshifted, "");
700 }
701
702 return res;
703 }
704
705
706 /**
707 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
708 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
709 * return {float, float, float, float} with values in range [0, 1].
710 */
711 LLVMValueRef
712 lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
713 unsigned src_width,
714 struct lp_type dst_type,
715 LLVMValueRef src)
716 {
717 LLVMBuilderRef builder = gallivm->builder;
718 LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
719 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
720 LLVMValueRef bias_;
721 LLVMValueRef res;
722 unsigned mantissa;
723 unsigned n;
724 unsigned long long ubound;
725 unsigned long long mask;
726 double scale;
727 double bias;
728
729 assert(dst_type.floating);
730
731 mantissa = lp_mantissa(dst_type);
732
733 if (src_width <= (mantissa + 1)) {
734 /*
735 * The source width fits what can be represented in floating
736 * point (i.e., mantissa + 1 bits). So do a straight multiplication
737 * followed by casting. No further rounding is necessary.
738 */
739
740 scale = 1.0/(double)((1ULL << src_width) - 1);
741 res = LLVMBuildSIToFP(builder, src, vec_type, "");
742 res = LLVMBuildFMul(builder, res,
743 lp_build_const_vec(gallivm, dst_type, scale), "");
744 return res;
745 }
746 else {
747 /*
748 * The source width exceeds what can be represented in floating
749 * point. So truncate the incoming values.
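 *
 * Sketch of the trick below (illustrative, src_width = 32, mantissa = 23):
 * the top 23 source bits are OR'ed into the mantissa of the bias constant
 * 2^(mantissa - n) = 1.0, giving a float in [1.0, 2.0); subtracting the bias
 * leaves the fraction, and the scale ubound/mask stretches it so that the
 * maximum source value maps back to 1.0.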
750 */
751
752 n = MIN2(mantissa, src_width);
753
754 ubound = ((unsigned long long)1 << n);
755 mask = ubound - 1;
756 scale = (double)ubound/mask;
757 bias = (double)((unsigned long long)1 << (mantissa - n));
758
759 res = src;
760
761 if (src_width > mantissa) {
762 int shift = src_width - mantissa;
763 res = LLVMBuildLShr(builder, res,
764 lp_build_const_int_vec(gallivm, dst_type, shift), "");
765 }
766
767 bias_ = lp_build_const_vec(gallivm, dst_type, bias);
768
769 res = LLVMBuildOr(builder,
770 res,
771 LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");
772
773 res = LLVMBuildBitCast(builder, res, vec_type, "");
774
775 res = LLVMBuildFSub(builder, res, bias_, "");
776 res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
777 }
778
779 return res;
780 }
781
782
783 /**
784 * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
785 *
786 * Returns the number of dsts created from src
787 */
788 int lp_build_conv_auto(struct gallivm_state *gallivm,
789 struct lp_type src_type,
790 struct lp_type* dst_type,
791 const LLVMValueRef *src,
792 unsigned num_srcs,
793 LLVMValueRef *dst)
794 {
795 int i;
796 int num_dsts = num_srcs;
797
798 if (src_type.floating == dst_type->floating &&
799 src_type.width == dst_type->width &&
800 src_type.length == dst_type->length &&
801 src_type.fixed == dst_type->fixed &&
802 src_type.norm == dst_type->norm &&
803 src_type.sign == dst_type->sign)
804 return num_dsts;
805
806 /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub
807 */
808 if (src_type.floating == 1 &&
809 src_type.fixed == 0 &&
810 src_type.sign == 1 &&
811 src_type.norm == 0 &&
812 src_type.width == 32 &&
813
814 dst_type->floating == 0 &&
815 dst_type->fixed == 0 &&
816 dst_type->sign == 0 &&
817 dst_type->norm == 1 &&
818 dst_type->width == 8)
819 {
820 /* Special case 4x4f --> 1x16ub */
821 if (src_type.length == 4 && util_cpu_caps.has_sse2)
822 {
823 assert((num_srcs % 4) == 0);
824
825 num_dsts = num_srcs / 4;
826 dst_type->length = 16;
827
828 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
829 return num_dsts;
830 }
831
832 /* Special case 2x8f --> 1x16ub */
833 if (src_type.length == 8 && util_cpu_caps.has_avx)
834 {
835 assert((num_srcs % 2) == 0);
836
837 num_dsts = num_srcs / 2;
838 dst_type->length = 16;
839
840 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
841 return num_dsts;
842 }
843 }
844
845 /* lp_build_resize does not support M:N */
846 if (src_type.width == dst_type->width) {
847 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
848 } else {
849 for (i = 0; i < num_srcs; ++i) {
850 lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1);
851 }
852 }
853
854 return num_dsts;
855 }
856
857
858 /**
859 * Generic type conversion.
860 *
861 * TODO: Take a precision argument, or even better, add a new precision member
862 * to the lp_type union.
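 *
 * Usage sketch (purely illustrative; gallivm, src[4] and dst are assumed to
 * be set up by the caller, and the types are chosen to hit the 4x4f -> 1x16ub
 * fast path):
 *
 *    struct lp_type src_type = lp_type_float_vec(32, 128); //  4 x float32
 *    struct lp_type dst_type = lp_type_int_vec(8, 128);    // 16 x int8
 *    dst_type.sign = 0;
 *    dst_type.norm = 1;                                     // 8-bit unorm
 *    lp_build_conv(gallivm, src_type, dst_type, src, 4, &dst, 1);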
863 */
864 void
865 lp_build_conv(struct gallivm_state *gallivm,
866 struct lp_type src_type,
867 struct lp_type dst_type,
868 const LLVMValueRef *src, unsigned num_srcs,
869 LLVMValueRef *dst, unsigned num_dsts)
870 {
871 LLVMBuilderRef builder = gallivm->builder;
872 struct lp_type tmp_type;
873 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
874 unsigned num_tmps;
875 unsigned i;
876
877 /* We must not lose or gain channels. Only precision may change. */
878 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
879
880 assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
881 assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
882 assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
883 assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
884
885 tmp_type = src_type;
886 for(i = 0; i < num_srcs; ++i) {
887 assert(lp_check_value(src_type, src[i]));
888 tmp[i] = src[i];
889 }
890 num_tmps = num_srcs;
891
892
893 /* Special case 4x4f --> 1x16ub
894 */
895 if (src_type.floating == 1 &&
896 src_type.fixed == 0 &&
897 src_type.sign == 1 &&
898 src_type.norm == 0 &&
899 src_type.width == 32 &&
900 src_type.length == 4 &&
901
902 dst_type.floating == 0 &&
903 dst_type.fixed == 0 &&
904 dst_type.sign == 0 &&
905 dst_type.norm == 1 &&
906 dst_type.width == 8 &&
907 dst_type.length == 16 &&
908
909 4 * num_dsts == num_srcs &&
910
911 util_cpu_caps.has_sse2)
912 {
913 struct lp_build_context bld;
914 struct lp_type int16_type = dst_type;
915 struct lp_type int32_type = dst_type;
916 LLVMValueRef const_255f;
917 unsigned i, j;
918
919 lp_build_context_init(&bld, gallivm, src_type);
920
921 int16_type.width *= 2;
922 int16_type.length /= 2;
923 int16_type.sign = 1;
924
925 int32_type.width *= 4;
926 int32_type.length /= 4;
927 int32_type.sign = 1;
928
929 const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
930
931 for (i = 0; i < num_dsts; ++i, src += 4) {
932 LLVMValueRef lo, hi;
933
934 for (j = 0; j < 4; ++j) {
935 tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
936 tmp[j] = lp_build_iround(&bld, tmp[j]);
937 }
938
939 /* relying on clamping behavior of sse2 intrinsics here */
940 lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
941 hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
942 dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
943 }
944
945 return;
946 }
947
948 /* Special case 2x8f --> 1x16ub
949 */
950 else if (src_type.floating == 1 &&
951 src_type.fixed == 0 &&
952 src_type.sign == 1 &&
953 src_type.norm == 0 &&
954 src_type.width == 32 &&
955 src_type.length == 8 &&
956
957 dst_type.floating == 0 &&
958 dst_type.fixed == 0 &&
959 dst_type.sign == 0 &&
960 dst_type.norm == 1 &&
961 dst_type.width == 8 &&
962 dst_type.length == 16 &&
963
964 2 * num_dsts == num_srcs &&
965
966 util_cpu_caps.has_avx) {
967
968 struct lp_build_context bld;
969 struct lp_type int16_type = dst_type;
970 struct lp_type int32_type = dst_type;
971 LLVMValueRef const_255f;
972 unsigned i;
973
974 lp_build_context_init(&bld, gallivm, src_type);
975
976 int16_type.width *= 2;
977 int16_type.length /= 2;
978 int16_type.sign = 1;
979
980 int32_type.width *= 4;
981 int32_type.length /= 4;
982 int32_type.sign = 1;
983
984 const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
985
986 for (i = 0; i < num_dsts; ++i, src += 2) {
987 LLVMValueRef lo, hi, a, b;
988
989 a = LLVMBuildFMul(builder, src[0], const_255f, "");
990 b = LLVMBuildFMul(builder, src[1], const_255f, "");
991
992 a = lp_build_iround(&bld, a);
993 b = lp_build_iround(&bld, b);
994
995 tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
996 tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
997 tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
998 tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
999
1000 /* relying on clamping behavior of sse2 intrinsics here */
1001 lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
1002 hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
1003 dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
1004 }
1005 return;
1006 }
1007
1008 /* Special case -> 16bit half-float
1009 */
1010 else if (dst_type.floating && dst_type.width == 16)
1011 {
1012 /* Only support src as 32bit float currently */
1013 assert(src_type.floating && src_type.width == 32);
1014
1015 for(i = 0; i < num_tmps; ++i)
1016 dst[i] = lp_build_float_to_half(gallivm, tmp[i]);
1017
1018 return;
1019 }
1020
1021 /* Pre convert half-floats to floats
1022 */
1023 else if (src_type.floating && src_type.width == 16)
1024 {
1025 for(i = 0; i < num_tmps; ++i)
1026 tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);
1027
1028 tmp_type.width = 32;
1029 }
1030
1031 /*
1032 * Clamp if necessary
1033 */
1034
1035 if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
1036 struct lp_build_context bld;
1037 double src_min = lp_const_min(src_type);
1038 double dst_min = lp_const_min(dst_type);
1039 double src_max = lp_const_max(src_type);
1040 double dst_max = lp_const_max(dst_type);
1041 LLVMValueRef thres;
1042
1043 lp_build_context_init(&bld, gallivm, tmp_type);
1044
1045 if(src_min < dst_min) {
1046 if(dst_min == 0.0)
1047 thres = bld.zero;
1048 else
1049 thres = lp_build_const_vec(gallivm, src_type, dst_min);
1050 for(i = 0; i < num_tmps; ++i)
1051 tmp[i] = lp_build_max(&bld, tmp[i], thres);
1052 }
1053
1054 if(src_max > dst_max) {
1055 if(dst_max == 1.0)
1056 thres = bld.one;
1057 else
1058 thres = lp_build_const_vec(gallivm, src_type, dst_max);
1059 for(i = 0; i < num_tmps; ++i)
1060 tmp[i] = lp_build_min(&bld, tmp[i], thres);
1061 }
1062 }
1063
1064 /*
1065 * Scale to the narrowest range
1066 */
1067
1068 if(dst_type.floating) {
1069 /* Nothing to do */
1070 }
1071 else if(tmp_type.floating) {
1072 if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
1073 for(i = 0; i < num_tmps; ++i) {
1074 tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
1075 tmp_type,
1076 dst_type.width,
1077 tmp[i]);
1078 }
1079 tmp_type.floating = FALSE;
1080 }
1081 else {
1082 double dst_scale = lp_const_scale(dst_type);
1083 LLVMTypeRef tmp_vec_type;
1084
1085 if (dst_scale != 1.0) {
1086 LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
1087 for(i = 0; i < num_tmps; ++i)
1088 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
1089 }
1090
1091 /* Use an equally sized integer for intermediate computations */
1092 tmp_type.floating = FALSE;
1093 tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
1094 for(i = 0; i < num_tmps; ++i) {
1095 #if 0
1096 if(dst_type.sign)
1097 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
1098 else
1099 tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
1100 #else
1101 /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
1102 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
1103 #endif
1104 }
1105 }
1106 }
1107 else {
1108 unsigned src_shift = lp_const_shift(src_type);
1109 unsigned dst_shift = lp_const_shift(dst_type);
1110 unsigned src_offset = lp_const_offset(src_type);
1111 unsigned dst_offset = lp_const_offset(dst_type);
1112
1113 /* Compensate for different offsets */
1114 if (dst_offset > src_offset && src_type.width > dst_type.width) {
1115 for (i = 0; i < num_tmps; ++i) {
1116 LLVMValueRef shifted;
1117 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - 1);
1118 if(src_type.sign)
1119 shifted = LLVMBuildAShr(builder, tmp[i], shift, "");
1120 else
1121 shifted = LLVMBuildLShr(builder, tmp[i], shift, "");
1122
1123 tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
1124 }
1125 }
1126
1127 if(src_shift > dst_shift) {
1128 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
1129 src_shift - dst_shift);
1130 for(i = 0; i < num_tmps; ++i)
1131 if(src_type.sign)
1132 tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
1133 else
1134 tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
1135 }
1136 }
1137
1138 /*
1139 * Truncate or expand bit width
1140 *
1141 * No data conversion should happen here, although the sign bits are
1142 * crucial to avoid bad clamping.
1143 */
1144
1145 {
1146 struct lp_type new_type;
1147
1148 new_type = tmp_type;
1149 new_type.sign = dst_type.sign;
1150 new_type.width = dst_type.width;
1151 new_type.length = dst_type.length;
1152
1153 lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);
1154
1155 tmp_type = new_type;
1156 num_tmps = num_dsts;
1157 }
1158
1159 /*
1160 * Scale to the widest range
1161 */
1162
1163 if(src_type.floating) {
1164 /* Nothing to do */
1165 }
1166 else if(!src_type.floating && dst_type.floating) {
1167 if(!src_type.fixed && !src_type.sign && src_type.norm) {
1168 for(i = 0; i < num_tmps; ++i) {
1169 tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
1170 src_type.width,
1171 dst_type,
1172 tmp[i]);
1173 }
1174 tmp_type.floating = TRUE;
1175 }
1176 else {
1177 double src_scale = lp_const_scale(src_type);
1178 LLVMTypeRef tmp_vec_type;
1179
1180 /* Use an equally sized float for intermediate computations */
1181 tmp_type.floating = TRUE;
1182 tmp_type.sign = TRUE;
1183 tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
1184 for(i = 0; i < num_tmps; ++i) {
1185 #if 0
1186 if(dst_type.sign)
1187 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
1188 else
1189 tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
1190 #else
1191 /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
1192 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
1193 #endif
1194 }
1195
1196 if (src_scale != 1.0) {
1197 LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
1198 for(i = 0; i < num_tmps; ++i)
1199 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
1200 }
1201 }
1202 }
1203 else {
1204 unsigned src_shift = lp_const_shift(src_type);
1205 unsigned dst_shift = lp_const_shift(dst_type);
1206 unsigned src_offset = lp_const_offset(src_type);
1207 unsigned dst_offset = lp_const_offset(dst_type);
1208
1209 if (src_shift < dst_shift) {
1210 LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
1211 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, dst_shift - src_shift);
1212
1213 for (i = 0; i < num_tmps; ++i) {
1214 pre_shift[i] = tmp[i];
1215 tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
1216 }
1217
1218 /* Compensate for different offsets */
1219 if (dst_offset > src_offset) {
1220 for (i = 0; i < num_tmps; ++i) {
1221 tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
1222 }
1223 }
1224 }
1225 }
1226
1227 for(i = 0; i < num_dsts; ++i) {
1228 dst[i] = tmp[i];
1229 assert(lp_check_value(dst_type, dst[i]));
1230 }
1231 }
1232
1233
1234 /**
1235 * Bit mask conversion.
1236 *
1237 * This will convert the integer masks that match the given types.
1238 *
1239 * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
1240 * Any other value will likely cause unpredictable results.
1241 *
1242 * This is basically a very trimmed down version of lp_build_conv.
1243 */
1244 void
1245 lp_build_conv_mask(struct gallivm_state *gallivm,
1246 struct lp_type src_type,
1247 struct lp_type dst_type,
1248 const LLVMValueRef *src, unsigned num_srcs,
1249 LLVMValueRef *dst, unsigned num_dsts)
1250 {
1251
1252 /* We must not lose or gain channels. Only precision may change. */
1253 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
1254
1255 /*
1256 * Drop the floating/fixed/norm flags; only the raw bits matter here.
1257 *
1258 * We assume all values are 0 or -1
1259 */
1260
1261 src_type.floating = FALSE;
1262 src_type.fixed = FALSE;
1263 src_type.sign = TRUE;
1264 src_type.norm = FALSE;
1265
1266 dst_type.floating = FALSE;
1267 dst_type.fixed = FALSE;
1268 dst_type.sign = TRUE;
1269 dst_type.norm = FALSE;
1270
1271 /*
1272 * Truncate or expand bit width
1273 */
1274
1275 lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
1276 }