/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between
 * several types accurately and efficiently.
 *
 * Conversion between types of different bit width is quite complex, since it
 * generally needs to be broken down into multiple steps.
 *
 * To remember, there are a few invariants in type conversions:
 *
 * - register width must remain constant:
 *
 *     src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 * - total number of elements must remain constant:
 *
 *     src_type.length * num_srcs == dst_type.length * num_dsts
 *
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts here and sacrifice accuracy, as
 * these functions can be used anywhere. In the future we might have a
 * precision parameter which can gauge the accuracy vs efficiency compromise,
 * but for now if the data conversion between two stages happens to be the
 * bottleneck, then most likely one should just avoid converting at all and
 * run both stages with the same type.
 *
 * Make sure to run the lp_test_conv unit test after any change to this file.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
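
/*
 * A worked example of the two invariants above (illustrative, not from the
 * original sources): converting four vectors of 4 x f32 into one vector of
 * 16 x u8 satisfies both:
 *
 *   register width:  32 * 4 == 8 * 16 == 128 bits
 *   element count:   4 * 4  == 16 * 1 == 16 elements
 *
 * This is exactly the 4x4f -> 1x16ub fast path in lp_build_conv() below.
 */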


#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_half.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
#include "lp_bld_logic.h"
#include "lp_bld_intr.h"
#include "lp_bld_printf.h"



/**
 * Byte swap one element. Constructs a call to the llvm.bswap intrinsic
 * matching the element type.
 *
 * @param res   element to byte swap
 * @param type  int16_t, int32_t, int64_t, float or double
 */
LLVMValueRef
lp_build_bswap(struct gallivm_state *gallivm,
               LLVMValueRef res,
               struct lp_type type)
{
   LLVMTypeRef int_type = LLVMIntTypeInContext(gallivm->context,
                                               type.width);
   const char *intrinsic = NULL;
   if (type.width == 8)
      return res;
   if (type.width == 16)
      intrinsic = "llvm.bswap.i16";
   else if (type.width == 32)
      intrinsic = "llvm.bswap.i32";
   else if (type.width == 64)
      intrinsic = "llvm.bswap.i64";

   assert(intrinsic != NULL);

   /* In case of a floating-point type, cast to an int of the same size,
    * then cast back to the fp type.
    */
   if (type.floating)
      res = LLVMBuildBitCast(gallivm->builder, res, int_type, "");
   res = lp_build_intrinsic_unary(gallivm->builder, intrinsic, int_type, res);
   if (type.floating)
      res = LLVMBuildBitCast(gallivm->builder, res,
                             lp_build_elem_type(gallivm, type), "");
   return res;
}


/**
 * Byte swap every element in the vector.
 *
 * @param packed    <vector> to convert
 * @param src_type  <vector> type of int16_t, int32_t, int64_t, float or
 *                  double
 * @param dst_type  <vector> type to return
 */
LLVMValueRef
lp_build_bswap_vec(struct gallivm_state *gallivm,
                   LLVMValueRef packed,
                   struct lp_type src_type_vec,
                   struct lp_type dst_type_vec)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef dst_type = lp_build_elem_type(gallivm, dst_type_vec);
   LLVMValueRef res;

   if (src_type_vec.length == 1) {
      res = lp_build_bswap(gallivm, packed, src_type_vec);
      res = LLVMBuildBitCast(gallivm->builder, res, dst_type, "");
   } else {
      unsigned i;
      res = LLVMGetUndef(lp_build_vec_type(gallivm, dst_type_vec));
      for (i = 0; i < src_type_vec.length; ++i) {
         LLVMValueRef index = lp_build_const_int32(gallivm, i);
         LLVMValueRef elem = LLVMBuildExtractElement(builder, packed, index, "");
         elem = lp_build_bswap(gallivm, elem, src_type_vec);
         elem = LLVMBuildBitCast(gallivm->builder, elem, dst_type, "");
         res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, "");
      }
   }
   return res;
}
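
/*
 * Usage sketch (illustrative, not from the original sources): byte swapping
 * a vector of 4 x f32 read from a buffer of the opposite endianness, keeping
 * the same type:
 *
 *    struct lp_type t = lp_type_float_vec(32, 128);
 *    value = lp_build_bswap_vec(gallivm, value, t, t);
 *
 * Each float element is bitcast to i32, run through llvm.bswap.i32, and
 * cast back to float.
 */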


/**
 * Convert float32 to a float-like value with less exponent and mantissa
 * bits. The exponent is still biased, and the mantissa still has an implied
 * 1, but there's no sign bit.
 *
 * @param src            (vector) float value to convert
 * @param mantissa_bits  the number of mantissa bits
 * @param exponent_bits  the number of exponent bits
 *
 * Unlike float_to_half, an accurate method is used here.
 * This implements round-towards-zero (trunc), hence too large numbers get
 * converted to the largest representable number, not infinity.
 * Small numbers may get converted to denorms, depending on the normal
 * float denorm handling of the cpu.
 * Note that compared to the references below, we skip any rounding bias
 * since we do rounding towards zero - OpenGL allows rounding towards zero
 * (though not preferred) and DX10 even seems to require it.
 * Note that this will not do any packing - the value will
 * look like a "rescaled float" (except for Inf/NaN) but be returned
 * as int32.
 *
 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
 * ref https://gist.github.com/rygorous/2156668
 */
static LLVMValueRef
lp_build_float_to_smallfloat_nosign(struct gallivm_state *gallivm,
                                    struct lp_type i32_type,
                                    LLVMValueRef src,
                                    unsigned mantissa_bits,
                                    unsigned exponent_bits)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef i32_floatexpmask, i32_smallexpmask, magic, normal;
   LLVMValueRef clamped, tmp, i32_roundmask, small_max, src_abs;
   LLVMValueRef is_nan, is_posinf, is_nan_or_posinf, i32_qnanbit, nan_or_posinf;
   struct lp_type f32_type = lp_type_float_vec(32, 32 * i32_type.length);
   struct lp_build_context f32_bld, i32_bld;
   LLVMValueRef zero = lp_build_const_vec(gallivm, f32_type, 0.0f);

   lp_build_context_init(&f32_bld, gallivm, f32_type);
   lp_build_context_init(&i32_bld, gallivm, i32_type);

   i32_smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
                                             ((1 << exponent_bits) - 1) << 23);
   i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);

   /* "ordinary" number */
   /* clamp to pos range (can still have sign bit if NaN or negative zero) */
   clamped = lp_build_max(&f32_bld, src, zero);
   clamped = LLVMBuildBitCast(builder, clamped, i32_bld.vec_type, "");
   /* get rid of excess mantissa bits, and while here also potential sign bit */
   i32_roundmask = lp_build_const_int_vec(gallivm, i32_type,
                                          ~((1 << (23 - mantissa_bits)) - 1) &
                                          0x7fffffff);

   tmp = lp_build_and(&i32_bld, clamped, i32_roundmask);
   tmp = LLVMBuildBitCast(builder, tmp, f32_bld.vec_type, "");
   /* bias exponent (and denormalize if necessary) */
   magic = lp_build_const_int_vec(gallivm, i32_type,
                                  ((1 << (exponent_bits - 1)) - 1) << 23);
   magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");
   normal = lp_build_mul(&f32_bld, tmp, magic);

   /* clamp to max value */
   small_max = lp_build_const_int_vec(gallivm, i32_type,
                                      (((1 << exponent_bits) - 2) << 23) |
                                      (((1 << mantissa_bits) - 1) << (23 - mantissa_bits)));
   small_max = LLVMBuildBitCast(builder, small_max, f32_bld.vec_type, "");
   normal = lp_build_min(&f32_bld, normal, small_max);
   normal = LLVMBuildBitCast(builder, normal, i32_bld.vec_type, "");

   /*
    * handle nan/inf cases
    * a little bit tricky since -Inf -> 0, +Inf -> +Inf, +-NaN -> +NaN
    * Note that on a lucky day, we could simplify this a bit,
    * by just using the max(src, zero) result - this will have -Inf
    * clamped to 0, and MIGHT preserve the NaNs.
    */
   src_abs = lp_build_abs(&f32_bld, src);
   src_abs = LLVMBuildBitCast(builder, src_abs, i32_bld.vec_type, "");
   src = LLVMBuildBitCast(builder, src, i32_bld.vec_type, "");
   is_nan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER,
                             src_abs, i32_floatexpmask);
   is_posinf = lp_build_compare(gallivm, i32_type, PIPE_FUNC_EQUAL,
                                src, i32_floatexpmask);
   /* a value cannot be both NaN and +Inf, hence this must be an "or" */
   is_nan_or_posinf = lp_build_or(&i32_bld, is_nan, is_posinf);
   /* could also set more mantissa bits but need at least the highest mantissa bit */
   i32_qnanbit = lp_build_const_vec(gallivm, i32_type, 1 << 22);
   /* combine maxexp with qnanbit */
   nan_or_posinf = lp_build_or(&i32_bld, i32_smallexpmask,
                               lp_build_and(&i32_bld, is_nan, i32_qnanbit));

   return lp_build_select(&i32_bld, is_nan_or_posinf, nan_or_posinf, normal);
}
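
/*
 * Worked example (illustrative, not from the original sources): for the
 * 11-bit r/g components below, mantissa_bits = 6 and exponent_bits = 5, so
 * the magic factor is ((1 << 4) - 1) << 23, i.e. a float with biased
 * exponent 15: 2^(15 - 127) = 2^-112. A float with biased f32 exponent E
 * multiplied by 2^-112 ends up with exponent field E - 112 = (E - 127) + 15,
 * which is exactly the 5-bit-biased exponent the small format wants, left
 * sitting in f32 bit positions for the packing code to shift down.
 */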


/**
 * Convert rgba float SoA values to packed r11g11b10 values.
 *
 * @param src  SoA float (vector) values to convert.
 */
LLVMValueRef
lp_build_float_to_r11g11b10(struct gallivm_state *gallivm,
                            LLVMValueRef *src)
{
   LLVMValueRef dst, rcomp, bcomp, gcomp, shift, mask;
   struct lp_build_context i32_bld;
   LLVMTypeRef src_type = LLVMTypeOf(*src);
   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
                            LLVMGetVectorSize(src_type) : 1;
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);

   lp_build_context_init(&i32_bld, gallivm, i32_type);

   /* "rescale" - this does the actual conversion except the packing */
   rcomp = lp_build_float_to_smallfloat_nosign(gallivm, i32_type, src[0], 6, 5);
   gcomp = lp_build_float_to_smallfloat_nosign(gallivm, i32_type, src[1], 6, 5);
   bcomp = lp_build_float_to_smallfloat_nosign(gallivm, i32_type, src[2], 5, 5);

   /* pack rescaled SoA floats to r11g11b10 AoS values */
   shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 6);
   rcomp = lp_build_shr(&i32_bld, rcomp, shift);

   shift = lp_build_const_int_vec(gallivm, i32_type, 23 - 17);
   mask = lp_build_const_int_vec(gallivm, i32_type, 0x7ff << 11);
   gcomp = lp_build_shr(&i32_bld, gcomp, shift);
   gcomp = lp_build_and(&i32_bld, gcomp, mask);

   shift = lp_build_const_int_vec(gallivm, i32_type, 27 - 23);
   mask = lp_build_const_int_vec(gallivm, i32_type, 0x3ff << 22);
   bcomp = lp_build_shl(&i32_bld, bcomp, shift);
   bcomp = lp_build_and(&i32_bld, bcomp, mask);

   dst = lp_build_or(&i32_bld, rcomp, gcomp);
   return lp_build_or(&i32_bld, dst, bcomp);
}
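
/*
 * Resulting r11g11b10 bit layout, as produced by the shifts and masks above:
 * r occupies bits [0,10], g bits [11,21], b bits [22,31], each component
 * packed as exponent above mantissa with the mantissa in the low bits.
 */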


/**
 * Convert a float-like value with less exponent and mantissa
 * bits than a normal float32 to a float32. The mantissa of
 * the source value is assumed to have an implied 1, and the exponent
 * is biased. There are no negative values.
 * The source value to extract must be in a 32bit int.
 * While this helper is generic, it is only ever going to be useful for
 * r11g11b10 (no other common format exists with the same properties).
 *
 * @param src             (vector) value to convert
 * @param mantissa_bits   the number of mantissa bits
 * @param exponent_bits   the number of exponent bits
 * @param mantissa_start  the bit start position of the packed component
 *
 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
 * ref https://gist.github.com/rygorous/2156668
 */
static LLVMValueRef
lp_build_smallfloat_nosign_to_float(struct gallivm_state *gallivm,
                                    struct lp_type f32_type,
                                    LLVMValueRef src,
                                    unsigned mantissa_bits,
                                    unsigned exponent_bits,
                                    unsigned mantissa_start)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef smallexpmask, i32_floatexpmask, magic;
   LLVMValueRef wasinfnan, tmp, res, shift, mask;
   unsigned exponent_start = mantissa_start + mantissa_bits;
   struct lp_type i32_type = lp_type_int_vec(32, 32 * f32_type.length);
   struct lp_build_context f32_bld, i32_bld;

   lp_build_context_init(&f32_bld, gallivm, f32_type);
   lp_build_context_init(&i32_bld, gallivm, i32_type);

   /* extract the component to "float position" */
   if (exponent_start < 23) {
      shift = lp_build_const_int_vec(gallivm, i32_type, 23 - exponent_start);
      src = lp_build_shl(&i32_bld, src, shift);
   }
   else {
      shift = lp_build_const_int_vec(gallivm, i32_type, exponent_start - 23);
      src = lp_build_shr(&i32_bld, src, shift);
   }
   mask = lp_build_const_int_vec(gallivm, i32_type,
                                 ((1 << (mantissa_bits + exponent_bits)) - 1) <<
                                 (23 - mantissa_bits));
   src = lp_build_and(&i32_bld, src, mask);
   src = LLVMBuildBitCast(builder, src, f32_bld.vec_type, "");

   /* now do the actual scaling */
   smallexpmask = lp_build_const_int_vec(gallivm, i32_type,
                                         ((1 << exponent_bits) - 1) << 23);
   i32_floatexpmask = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
   /*
    * magic number has exponent new exp bias + (new exp bias - old exp bias),
    * mantissa is 0.
    */
   magic = lp_build_const_int_vec(gallivm, i32_type,
                                  (255 - (1 << (exponent_bits - 1))) << 23);
   magic = LLVMBuildBitCast(builder, magic, f32_bld.vec_type, "");

   /* adjust exponent and fix denorms */
   res = lp_build_mul(&f32_bld, src, magic);

   /*
    * if exp was max (== NaN or Inf) set new exp to max (keep mantissa),
    * so a simple "or" will do (because exp adjust will leave mantissa intact)
    */
   /* use float compare (better for AVX 8-wide / no AVX2 though otherwise should use int) */
   smallexpmask = LLVMBuildBitCast(builder, smallexpmask, f32_bld.vec_type, "");
   wasinfnan = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GEQUAL, src, smallexpmask);
   res = LLVMBuildBitCast(builder, res, i32_bld.vec_type, "");
   tmp = lp_build_and(&i32_bld, i32_floatexpmask, wasinfnan);
   res = lp_build_or(&i32_bld, tmp, res);

   return LLVMBuildBitCast(builder, res, f32_bld.vec_type, "");
}
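
/*
 * Worked example (illustrative, not from the original sources): when
 * unpacking an r11 component (mantissa_bits = 6, exponent_bits = 5), magic
 * is (255 - 16) << 23, a float with biased exponent 239, i.e. 2^112. Once
 * the component's 5-bit exponent field e is shifted to bit 23, its bits read
 * as 1.m * 2^(e - 127); multiplying by 2^112 gives 1.m * 2^(e - 15), which
 * is exactly the value the small format encodes (bias 15). Denormal inputs
 * (e == 0, value 0.m * 2^(1 - 15)) come out right too, since the multiply
 * rescales them like any other value.
 */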


/**
 * Convert packed float format (r11g11b10) value(s) to rgba float SoA values.
 *
 * @param src  packed AoS r11g11b10 values (as (vector) int32)
 * @param dst  pointer to the SoA result values
 */
void
lp_build_r11g11b10_to_float(struct gallivm_state *gallivm,
                            LLVMValueRef src,
                            LLVMValueRef *dst)
{
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
                            LLVMGetVectorSize(src_type) : 1;
   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);

   dst[0] = lp_build_smallfloat_nosign_to_float(gallivm, f32_type, src, 6, 5, 0);
   dst[1] = lp_build_smallfloat_nosign_to_float(gallivm, f32_type, src, 6, 5, 11);
   dst[2] = lp_build_smallfloat_nosign_to_float(gallivm, f32_type, src, 5, 5, 22);

   /* Just set alpha to one */
   dst[3] = lp_build_one(gallivm, f32_type);
}
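
/*
 * Usage sketch (illustrative, not from the original sources): unpacking a
 * 4-wide vector of packed values:
 *
 *    LLVMValueRef rgba[4];
 *    lp_build_r11g11b10_to_float(gallivm, packed, rgba);
 *
 * rgba[0..2] then hold 4 x f32 SoA vectors for r, g, b and rgba[3] is all
 * ones.
 */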


static LLVMValueRef
lp_build_rgb9_to_float_helper(struct gallivm_state *gallivm,
                              struct lp_type f32_type,
                              LLVMValueRef src,
                              LLVMValueRef scale,
                              unsigned mantissa_start)
{
   LLVMValueRef shift, mask;

   struct lp_type i32_type = lp_type_int_vec(32, 32 * f32_type.length);
   struct lp_build_context i32_bld, f32_bld;

   lp_build_context_init(&i32_bld, gallivm, i32_type);
   lp_build_context_init(&f32_bld, gallivm, f32_type);

   /*
    * This is much easier than other weirdo float formats, since
    * there's no sign, no Inf/NaN, and there's nothing special
    * required for normals/denormals either (without the implied one
    * for the mantissa as in other formats, everything looks like a denormal).
    * So just do (float)comp_bits * scale
    */
   shift = lp_build_const_int_vec(gallivm, i32_type, mantissa_start);
   mask = lp_build_const_int_vec(gallivm, i32_type, 0x1ff);
   src = lp_build_shr(&i32_bld, src, shift);
   src = lp_build_and(&i32_bld, src, mask);
   src = lp_build_int_to_float(&f32_bld, src);
   return lp_build_mul(&f32_bld, src, scale);
}


/**
 * Convert shared exponent format (rgb9e5) value(s) to rgba float SoA values.
 *
 * @param src  packed AoS rgb9e5 values (as (vector) int32)
 * @param dst  pointer to the SoA result values
 */
void
lp_build_rgb9e5_to_float(struct gallivm_state *gallivm,
                         LLVMValueRef src,
                         LLVMValueRef *dst)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef src_type = LLVMTypeOf(src);
   LLVMValueRef shift, scale, bias, exp;
   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
                            LLVMGetVectorSize(src_type) : 1;
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
   struct lp_type u32_type = lp_type_uint_vec(32, 32 * src_length);
   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
   struct lp_build_context i32_bld, u32_bld, f32_bld;

   lp_build_context_init(&i32_bld, gallivm, i32_type);
   lp_build_context_init(&u32_bld, gallivm, u32_type);
   lp_build_context_init(&f32_bld, gallivm, f32_type);

   /* extract exponent */
   shift = lp_build_const_int_vec(gallivm, i32_type, 27);
   /* this shift needs to be unsigned, otherwise we'd need a mask */
   exp = lp_build_shr(&u32_bld, src, shift);

   /*
    * The scale factor is 2 ^ (exp - bias)
    * (additionally corrected here for the mantissa bits).
    * We don't use a shift because
    * a) we don't have vector shifts in a lot of cases
    * b) the shift direction changes, hence we'd need 2 shifts + a conditional
    *    (or a rotate instruction, which is even more rare (for instance XOP))
    * so use the whacky "float 2 ^ x" trick of building the exponent directly
    * instead (this saves us the float conversion at the end too).
    */
   bias = lp_build_const_int_vec(gallivm, i32_type, 127 - (15 + 9));
   scale = lp_build_add(&i32_bld, exp, bias);
   shift = lp_build_const_int_vec(gallivm, i32_type, 23);
   scale = lp_build_shl(&i32_bld, scale, shift);
   scale = LLVMBuildBitCast(builder, scale, f32_bld.vec_type, "");

   dst[0] = lp_build_rgb9_to_float_helper(gallivm, f32_type, src, scale, 0);
   dst[1] = lp_build_rgb9_to_float_helper(gallivm, f32_type, src, scale, 9);
   dst[2] = lp_build_rgb9_to_float_helper(gallivm, f32_type, src, scale, 18);

   /* Just set alpha to one */
   dst[3] = f32_bld.one;
}
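
/*
 * Worked example (illustrative, not from the original sources): a packed
 * rgb9e5 value with exponent field 20 and a red mantissa of 256 gives
 * scale = 2^(20 - 15 - 9) = 2^-4, so red = 256 * 2^-4 = 16.0 - matching
 * the format's definition r = m * 2^(e - 15) / 512.
 */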


/**
 * Converts int16 half-float to float32.
 * Note this can be performed in 1 instruction if vcvtph2ps exists (F16C,
 * formerly part of the proposed SSE5 / CVT16 extension)
 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
 *
 * @param src  value to convert
 *
 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
 * ref https://gist.github.com/2144712
 */
LLVMValueRef
lp_build_half_to_float(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   int src_length = LLVMGetVectorSize(LLVMTypeOf(src));

   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);

   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
   LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);

   /* Constants */
   LLVMValueRef i32_13 = lp_build_const_int_vec(gallivm, i32_type, 13);
   LLVMValueRef i32_16 = lp_build_const_int_vec(gallivm, i32_type, 16);
   LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
   LLVMValueRef i32_was_infnan = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
   LLVMValueRef i32_exp_infnan = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
   LLVMValueRef f32_magic = LLVMBuildBitCast(builder,
                                             lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23),
                                             float_vec_type, "");

   /* Convert int16 vector to int32 vector by zero ext */
   LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, "");

   /* Exponent / mantissa bits */
   LLVMValueRef expmant = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
   LLVMValueRef shifted = LLVMBuildBitCast(builder, LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, "");

   /* Exponent adjust */
   LLVMValueRef scaled = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, "");

   /* Make sure Inf/NaN survive */
   LLVMValueRef b_wasinfnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan);
   LLVMValueRef infnanexp = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");

   /* Sign bit */
   LLVMValueRef justsign = LLVMBuildXor(builder, h, expmant, "");
   LLVMValueRef sign = LLVMBuildShl(builder, justsign, i32_16, "");

   /* Combine result */
   LLVMValueRef sign_inf = LLVMBuildOr(builder, sign, infnanexp, "");
   LLVMValueRef final = LLVMBuildOr(builder, scaled, sign_inf, "");

   /* Cast from int32 vector to float32 vector */
   return LLVMBuildBitCast(builder, final, float_vec_type, "");
}
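
/*
 * Worked example (illustrative, not from the original sources): half 0x3c00
 * (1.0). expmant = 0x3c00; shifted left by 13 that is 0x07800000, which as
 * a float is 2^(15 - 127) = 2^-112. f32_magic has biased exponent
 * 254 - 15 = 239, i.e. 2^112, so the multiply yields exactly 1.0f; the sign
 * and Inf/NaN fixups are no-ops for this input.
 */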


/**
 * Converts float32 to int16 half-float.
 * Note this can be performed in 1 instruction if vcvtps2ph exists (F16C,
 * formerly part of the proposed SSE5 / CVT16 extension)
 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
 *
 * @param src  value to convert
 *
 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
 * ref https://gist.github.com/2156668
 *
 * XXX: This is an approximation. It is faster but certain NaNs are converted to
 * infinity, and rounding is not correct.
 */
LLVMValueRef
lp_build_float_to_half(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
   unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
                   ? LLVMGetVectorSize(f32_vec_type) : 1;
   struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
   struct lp_type u32_type = lp_type_uint_vec(32, 32 * length);
   struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
   LLVMTypeRef u32_vec_type = lp_build_vec_type(gallivm, u32_type);
   LLVMTypeRef i16_vec_type = lp_build_vec_type(gallivm, i16_type);
   struct lp_build_context f32_bld;
   struct lp_build_context u32_bld;
   LLVMValueRef result;

   lp_build_context_init(&f32_bld, gallivm, f32_type);
   lp_build_context_init(&u32_bld, gallivm, u32_type);

   {
      /* Constants */
      LLVMValueRef u32_f32inf = lp_build_const_int_vec(gallivm, u32_type, 0xff << 23);
      LLVMValueRef u32_expinf = lp_build_const_int_vec(gallivm, u32_type, 0xe0 << 23);
      LLVMValueRef f32_f16max = lp_build_const_vec(gallivm, f32_type, 65536.0); // 0x8f << 23
      LLVMValueRef f32_magic = lp_build_const_vec(gallivm, f32_type, 1.92592994e-34); // 0x0f << 23

      /* Cast from float32 to int32 */
      LLVMValueRef f = LLVMBuildBitCast(builder, src, u32_vec_type, "");

      /* Remove sign */
      LLVMValueRef srcabs = lp_build_abs(&f32_bld, src);
      LLVMValueRef fabs = LLVMBuildBitCast(builder, srcabs, u32_vec_type, "");

      /* Magic conversion */
      LLVMValueRef clamped = lp_build_min(&f32_bld, f32_f16max, srcabs);
      LLVMValueRef scaled = LLVMBuildBitCast(builder,
                                             LLVMBuildFMul(builder,
                                                           clamped,
                                                           f32_magic,
                                                           ""),
                                             u32_vec_type,
                                             "");
      /* Make sure Inf/NaN and unnormalised values survive */
      LLVMValueRef infnancase = LLVMBuildXor(builder, u32_expinf, fabs, "");
      LLVMValueRef b_notnormal = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GEQUAL,
                                                  srcabs,
                                                  LLVMBuildBitCast(builder, u32_f32inf, f32_vec_type, ""));

      /* Merge normal / non-normal cases */
      LLVMValueRef merged = lp_build_select(&u32_bld, b_notnormal, infnancase, scaled);
      LLVMValueRef shifted = lp_build_shr_imm(&u32_bld, merged, 13);

      /* Sign bit */
      LLVMValueRef justsign = LLVMBuildXor(builder, f, fabs, "");
      LLVMValueRef signshifted = lp_build_shr_imm(&u32_bld, justsign, 16);

      /* Combine result */
      result = LLVMBuildOr(builder, shifted, signshifted, "");
   }

   result = LLVMBuildTrunc(builder, result, i16_vec_type, "");

   /*
    * Debugging code.
    */
   if (0) {
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
      LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
      unsigned i;

      LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
      LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)util_float_to_half));
      func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "util_float_to_half");

      for (i = 0; i < length; ++i) {
         LLVMValueRef index = LLVMConstInt(i32t, i, 0);
         LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
#if 0
         /* XXX: not really supported by backends */
         LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
#else
         LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
#endif
         ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
      }

      lp_build_print_value(gallivm, "src = ", src);
      lp_build_print_value(gallivm, "llvm = ", result);
      lp_build_print_value(gallivm, "util = ", ref_result);
      lp_build_printf(gallivm, "\n");
   }

   return result;
}
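
/*
 * Worked example (illustrative, not from the original sources): 1.0f.
 * srcabs = clamped = 1.0, and the multiply by f32_magic (2^-112) gives
 * bits 0x07800000; shifted right by 13 that is 0x3c00, the half-float
 * encoding of 1.0. The sign and not-normal paths contribute nothing here.
 */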


/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no single
 * precision FP to unsigned integer conversion Intel SSE instruction. Second,
 * even if there were, since the FP's mantissa takes only a fraction of
 * register bits the typical scale-and-cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width specified
 * by dst_width, the actual result type will have the same width as the input.
 *
 * Ex: src = { float, float, float, float }
 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
   LLVMValueRef res;
   unsigned mantissa;

   assert(src_type.floating);
   assert(dst_width <= src_type.width);
   src_type.sign = FALSE;

   mantissa = lp_mantissa(src_type);

   if (dst_width <= mantissa) {
      /*
       * Apply magic coefficients that will make the desired result appear
       * in the least significant bits of the mantissa, with correct rounding.
       *
       * This only works if the destination width fits in the mantissa.
       */

      unsigned long long ubound;
      unsigned long long mask;
      double scale;
      double bias;

      ubound = (1ULL << dst_width);
      mask = ubound - 1;
      scale = (double)mask/ubound;
      bias = (double)(1ULL << (mantissa - dst_width));

      res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(gallivm, src_type, mask), "");
   }
   else if (dst_width == (mantissa + 1)) {
      /*
       * The destination width matches exactly what can be represented in
       * floating point (i.e., mantissa + 1 bits). So do a straight
       * multiplication followed by casting. No further rounding is necessary.
       */

      double scale;

      scale = (double)((1ULL << dst_width) - 1);

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
   }
   else {
      /*
       * The destination exceeds what can be represented in the floating point.
       * So multiply by the largest power of two we can get away with, and then
       * subtract the most significant bit to rescale to normalized values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use a signed
       * conversion. In theory it should be (1 << (src_type.width - 2)), but
       * IEEE 754 rules state INT_MIN should be returned from FPToSI on
       * overflow, which is the correct result for values near 1.0!
       *
       * This means we get (src_type.width - 1) correct bits for values near 0.0,
       * and (mantissa + 1) correct bits for values near 1.0. Equally or more
       * important, we also get exact results for 0.0 and 1.0.
       */

      unsigned n = MIN2(src_type.width - 1, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
      unsigned rshift = n;
      LLVMValueRef lshifted;
      LLVMValueRef rshifted;

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

      /*
       * Align the most significant bit to its final place.
       *
       * This will cause 1.0 to overflow to 0, but the later adjustment will
       * get it right.
       */
      if (lshift) {
         lshifted = LLVMBuildShl(builder, res,
                                 lp_build_const_int_vec(gallivm, src_type,
                                                        lshift), "");
      } else {
         lshifted = res;
      }

      /*
       * Align the most significant bit to the right.
       */
      rshifted = LLVMBuildLShr(builder, res,
                               lp_build_const_int_vec(gallivm, src_type, rshift),
                               "");

      /*
       * Subtract the MSB (shifted down to the LSB), thereby re-scaling from
       * (1 << dst_width) to ((1 << dst_width) - 1).
       */

      res = LLVMBuildSub(builder, lshifted, rshifted, "");
   }

   return res;
}
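
/*
 * Worked example (illustrative, not from the original sources): dst_width =
 * 8 takes the first branch (8 <= 23): scale = 255/256 and bias =
 * 2^(23 - 8) = 2^15. For x in [0,1], x * 255/256 + 2^15 is a float around
 * 2^15 whose mantissa LSB has weight 2^15 * 2^-23 = 2^-8, so the low 8
 * mantissa bits end up holding round(x * 255); the final mask extracts
 * exactly those bits.
 */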


/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
 * return {float, float, float, float} with values in range [0, 1].
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(dst_type.floating);

   mantissa = lp_mantissa(dst_type);

   if (src_width <= (mantissa + 1)) {
      /*
       * The source width fits what can be represented in floating
       * point (i.e., mantissa + 1 bits). So do a straight multiplication
       * followed by casting. No further rounding is necessary.
       */

      scale = 1.0/(double)((1ULL << src_width) - 1);
      res = LLVMBuildSIToFP(builder, src, vec_type, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
      return res;
   }
   else {
      /*
       * The source width exceeds what can be represented in floating
       * point. So truncate the incoming values.
       */

      n = MIN2(mantissa, src_width);

      ubound = ((unsigned long long)1 << n);
      mask = ubound - 1;
      scale = (double)ubound/mask;
      bias = (double)((unsigned long long)1 << (mantissa - n));

      res = src;

      if (src_width > mantissa) {
         int shift = src_width - mantissa;
         res = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
      }

      bias_ = lp_build_const_vec(gallivm, dst_type, bias);

      res = LLVMBuildOr(builder,
                        res,
                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

      res = LLVMBuildBitCast(builder, res, vec_type, "");

      res = LLVMBuildFSub(builder, res, bias_, "");
      res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
   }

   return res;
}
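
/*
 * Worked example (illustrative, not from the original sources): src_width =
 * 8 takes the first branch (8 <= 24): res = SIToFP(src) * 1/255, so 0 maps
 * to 0.0 and 255 maps to 1.0 exactly. A 32-bit source takes the second
 * branch instead: it is shifted right so only the top 23 bits remain before
 * the bias trick reconstructs a float in [0, 1].
 */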


/**
 * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
 *
 * Returns the number of dsts created from src
 */
int lp_build_conv_auto(struct gallivm_state *gallivm,
                       struct lp_type src_type,
                       struct lp_type* dst_type,
                       const LLVMValueRef *src,
                       unsigned num_srcs,
                       LLVMValueRef *dst)
{
   int i;
   int num_dsts = num_srcs;

   if (src_type.floating == dst_type->floating &&
       src_type.width == dst_type->width &&
       src_type.length == dst_type->length &&
       src_type.fixed == dst_type->fixed &&
       src_type.norm == dst_type->norm &&
       src_type.sign == dst_type->sign)
      return num_dsts;

   /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub
    */
   if (src_type.floating == 1 &&
       src_type.fixed == 0 &&
       src_type.sign == 1 &&
       src_type.norm == 0 &&
       src_type.width == 32 &&

       dst_type->floating == 0 &&
       dst_type->fixed == 0 &&
       dst_type->sign == 0 &&
       dst_type->norm == 1 &&
       dst_type->width == 8)
   {
      /* Special case 4x4f --> 1x16ub */
      if (src_type.length == 4 && util_cpu_caps.has_sse2)
      {
         assert((num_srcs % 4) == 0);

         num_dsts = num_srcs / 4;
         dst_type->length = 16;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }

      /* Special case 2x8f --> 1x16ub */
      if (src_type.length == 8 && util_cpu_caps.has_avx)
      {
         assert((num_srcs % 2) == 0);

         num_dsts = num_srcs / 2;
         dst_type->length = 16;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }
   }

   /* lp_build_resize does not support M:N */
   if (src_type.width == dst_type->width) {
      lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
   } else {
      for (i = 0; i < num_srcs; ++i) {
         lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1);
      }
   }

   return num_dsts;
}
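
/*
 * Usage sketch (illustrative, not from the original sources): with
 * src_type = 4 x f32 and *dst_type = 4 x unorm8, eight input vectors
 * become two 16 x unorm8 outputs on an sse2-capable cpu:
 * lp_build_conv_auto() rewrites dst_type->length to 16 and returns 2.
 */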


/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;


   /* Special case 4x4f --> 1x16ub
    */
   if (src_type.floating == 1 &&
       src_type.fixed == 0 &&
       src_type.sign == 1 &&
       src_type.norm == 0 &&
       src_type.width == 32 &&
       src_type.length == 4 &&

       dst_type.floating == 0 &&
       dst_type.fixed == 0 &&
       dst_type.sign == 0 &&
       dst_type.norm == 1 &&
       dst_type.width == 8 &&
       dst_type.length == 16 &&

       4 * num_dsts == num_srcs &&

       util_cpu_caps.has_sse2)
   {
      struct lp_build_context bld;
      struct lp_type int16_type = dst_type;
      struct lp_type int32_type = dst_type;
      LLVMValueRef const_255f;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;

         for (j = 0; j < 4; ++j) {
            tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
            tmp[j] = lp_build_iround(&bld, tmp[j]);
         }

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }

      return;
   }

   /* Special case 2x8f --> 1x16ub
    */
   else if (src_type.floating == 1 &&
            src_type.fixed == 0 &&
            src_type.sign == 1 &&
            src_type.norm == 0 &&
            src_type.width == 32 &&
            src_type.length == 8 &&

            dst_type.floating == 0 &&
            dst_type.fixed == 0 &&
            dst_type.sign == 0 &&
            dst_type.norm == 1 &&
            dst_type.width == 8 &&
            dst_type.length == 16 &&

            2 * num_dsts == num_srcs &&

            util_cpu_caps.has_avx) {

      struct lp_build_context bld;
      struct lp_type int16_type = dst_type;
      struct lp_type int32_type = dst_type;
      LLVMValueRef const_255f;
      unsigned i;

      lp_build_context_init(&bld, gallivm, src_type);

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

      for (i = 0; i < num_dsts; ++i, src += 2) {
         LLVMValueRef lo, hi, a, b;

         a = LLVMBuildFMul(builder, src[0], const_255f, "");
         b = LLVMBuildFMul(builder, src[1], const_255f, "");

         a = lp_build_iround(&bld, a);
         b = lp_build_iround(&bld, b);

         tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
         tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
         tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
         tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }
      return;
   }

   /* Special case -> 16bit half-float
    */
   else if (dst_type.floating && dst_type.width == 16)
   {
      /* Only support src as 32bit float currently */
      assert(src_type.floating && src_type.width == 32);

      for(i = 0; i < num_tmps; ++i)
         dst[i] = lp_build_float_to_half(gallivm, tmp[i]);

      return;
   }

   /* Pre-convert half-floats to floats
    */
   else if (src_type.floating && src_type.width == 16)
   {
      for(i = 0; i < num_tmps; ++i)
         tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);

      tmp_type.width = 32;
   }

   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }

   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);
         LLVMTypeRef tmp_vec_type;

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = FALSE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
            tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      /* Compensate for different offsets */
      if (dst_offset > src_offset && src_type.width > dst_type.width) {
         for (i = 0; i < num_tmps; ++i) {
            LLVMValueRef shifted;
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - 1);
            if(src_type.sign)
               shifted = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               shifted = LLVMBuildLShr(builder, tmp[i], shift, "");

            tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
         }
      }

      if(src_shift > dst_shift) {
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
                                                     src_shift - dst_shift);
         for(i = 0; i < num_tmps; ++i)
            if(src_type.sign)
               tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
      }
   }

   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign = dst_type.sign;
      new_type.width = dst_type.width;
      new_type.length = dst_type.length;

      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }

   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized float for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      if (src_shift < dst_shift) {
         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, dst_shift - src_shift);

         for (i = 0; i < num_tmps; ++i) {
            pre_shift[i] = tmp[i];
            tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
         }

         /* Compensate for different offsets */
         if (dst_offset > src_offset) {
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
            }
         }
      }
   }

   for(i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}
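
/*
 * Minimal usage sketch (not part of the original sources; assumes a valid
 * gallivm context): converting four 4 x f32 SoA vectors with values in [0,1]
 * into a single 16 x unorm8 vector, i.e. the 4x4f -> 1x16ub fast path above.
 * Kept compiled out, like the other reference snippets in this file.
 */
#if 0
static LLVMValueRef
example_f32_to_unorm8(struct gallivm_state *gallivm,
                      LLVMValueRef rgba[4])
{
   struct lp_type src_type = lp_type_float_vec(32, 128);  /* 4 x f32 */
   struct lp_type dst_type = lp_type_uint_vec(8, 128);    /* 16 x u8 */
   LLVMValueRef dst;

   dst_type.norm = TRUE;   /* unorm8: [0.0, 1.0] maps to [0, 255] */

   /* 4 srcs of length 4 -> 1 dst of length 16 keeps both invariants */
   lp_build_conv(gallivm, src_type, dst_type, rgba, 4, &dst, 1);
   return dst;
}
#endif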


/**
 * Bit mask conversion.
 *
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or
 * one. Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
 */
void
lp_build_conv_mask(struct gallivm_state *gallivm,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
{

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /*
    * Drop the float/fixed/norm attributes and treat the masks as plain
    * signed integers.
    *
    * We assume all values are 0 or -1.
    */

   src_type.floating = FALSE;
   src_type.fixed = FALSE;
   src_type.sign = TRUE;
   src_type.norm = FALSE;

   dst_type.floating = FALSE;
   dst_type.fixed = FALSE;
   dst_type.sign = TRUE;
   dst_type.norm = FALSE;

   /*
    * Truncate or expand bit width
    */

   lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}
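
/*
 * Usage sketch (illustrative, not from the original sources): widening four
 * 4 x i32 masks into two 8 x i16 masks:
 *
 *    lp_build_conv_mask(gallivm, lp_type_int_vec(32, 128),
 *                       lp_type_int_vec(16, 128), src, 4, dst, 2);
 *
 * Only the bit width changes; the all-zeros/all-ones pattern of each mask
 * is preserved by lp_build_resize().
 */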