src/gallium/auxiliary/gallivm/lp_bld_conv.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper functions for type conversions.
32 *
33 * We want to use the fastest type for a given computation whenever feasible.
34 * The other side of this is that we need to be able to convert between several
35 * types accurately and efficiently.
36 *
37 * Conversion between types of different bit width is quite complex, since a
38 * change in width also changes the number of vectors needed to hold the data.
39 * Keep in mind a few invariants of type conversions:
40 *
41 * - register width must remain constant:
42 *
43 * src_type.width * src_type.length == dst_type.width * dst_type.length
44 *
45 * - total number of elements must remain constant:
46 *
47 * src_type.length * num_srcs == dst_type.length * num_dsts
48 *
49 * It is not always possible to do the conversion both accurately and
50 * efficiently, usually due to lack of adequate machine instructions. In these
51 * cases it is important not to take shortcuts here and sacrifice accuracy, as
52 * these functions can be used anywhere. In the future we might have a
53 * precision parameter which can gauge the accuracy vs efficiency compromise,
54 * but for now if the data conversion between two stages happens to be the
55 * bottleneck, then most likely one should just avoid converting at all and run
56 * both stages with the same type.
57 *
58 * Make sure to run lp_test_conv unit test after any change to this file.
59 *
60 * @author Jose Fonseca <jfonseca@vmware.com>
61 */
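/*
 * An illustrative example of the invariants above (numbers chosen to match
 * the 4x4f -> 1x16ub special case handled in lp_build_conv below):
 * converting num_srcs = 4 vectors of 4 x float32 into unorm8 values must
 * produce num_dsts = 1 vector of 16 x uint8:
 *
 *   register width:  32 * 4 == 8 * 16   (128 bits per vector)
 *   element count:    4 * 4 == 16 * 1   (16 values in total)
 */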
62
63
64 #include "util/u_debug.h"
65 #include "util/u_math.h"
66 #include "util/u_half.h"
67 #include "util/u_cpu_detect.h"
68
69 #include "lp_bld_type.h"
70 #include "lp_bld_const.h"
71 #include "lp_bld_arit.h"
72 #include "lp_bld_bitarit.h"
73 #include "lp_bld_pack.h"
74 #include "lp_bld_conv.h"
75 #include "lp_bld_logic.h"
76 #include "lp_bld_intr.h"
77 #include "lp_bld_printf.h"
78 #include "lp_bld_format.h"
79
80
81
82 /**
83 * Converts int16 half-float to float32
84 * Note this can be performed in 1 instruction if vcvtph2ps exists (f16c/cvt16)
85 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
86 *
87 * @param src value to convert
88 *
89 */
90 LLVMValueRef
91 lp_build_half_to_float(struct gallivm_state *gallivm,
92 LLVMValueRef src)
93 {
94 LLVMBuilderRef builder = gallivm->builder;
95 LLVMTypeRef src_type = LLVMTypeOf(src);
96 unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
97 LLVMGetVectorSize(src_type) : 1;
98
99 struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
100 struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
101 LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
102 LLVMValueRef h;
103
104 if (util_cpu_caps.has_f16c &&
105 (src_length == 4 || src_length == 8)) {
106 const char *intrinsic = NULL;
107 if (src_length == 4) {
108 src = lp_build_pad_vector(gallivm, src, 8);
109 intrinsic = "llvm.x86.vcvtph2ps.128";
110 }
111 else {
112 intrinsic = "llvm.x86.vcvtph2ps.256";
113 }
114 return lp_build_intrinsic_unary(builder, intrinsic,
115 lp_build_vec_type(gallivm, f32_type), src);
116 }
117
118 /* Convert int16 vector to int32 vector by zero ext (might generate bad code) */
119 h = LLVMBuildZExt(builder, src, int_vec_type, "");
120 return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
121 }
122
123
124 /**
125 * Converts float32 to int16 half-float
126 * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
127 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
128 *
129 * @param src value to convert
130 *
131 * Convert float32 to half floats, preserving Infs and NaNs,
132 * with rounding towards zero (trunc).
133 * XXX: For GL, would prefer rounding towards nearest(-even).
134 */
135 LLVMValueRef
136 lp_build_float_to_half(struct gallivm_state *gallivm,
137 LLVMValueRef src)
138 {
139 LLVMBuilderRef builder = gallivm->builder;
140 LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
141 unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
142 ? LLVMGetVectorSize(f32_vec_type) : 1;
143 struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
144 struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
145 LLVMValueRef result;
146
147 /*
148 * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
149 * directly, without any (x86 or generic) intrinsics.
150 * However, the rounding mode cannot be specified (and is undefined;
151 * in practice on x86 it seems to do nearest-even, though this may
152 * depend on instruction set support), so it is essentially
153 * useless here.
154 */
155
156 if (util_cpu_caps.has_f16c &&
157 (length == 4 || length == 8)) {
158 struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
159 unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
160 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
161 const char *intrinsic = NULL;
162 if (length == 4) {
163 intrinsic = "llvm.x86.vcvtps2ph.128";
164 }
165 else {
166 intrinsic = "llvm.x86.vcvtps2ph.256";
167 }
168 result = lp_build_intrinsic_binary(builder, intrinsic,
169 lp_build_vec_type(gallivm, i168_type),
170 src, LLVMConstInt(i32t, mode, 0));
171 if (length == 4) {
172 result = lp_build_extract_range(gallivm, result, 0, 4);
173 }
174 }
175
176 else {
177 result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
178 /* Convert int32 vector to int16 vector by trunc (might generate bad code) */
179 result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
180 }
181
182 /*
183 * Debugging code.
184 */
185 if (0) {
186 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
187 LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
188 LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
189 LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
190 unsigned i;
191
192 LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
193 LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)util_float_to_half));
194 func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "util_float_to_half");
195
196 for (i = 0; i < length; ++i) {
197 LLVMValueRef index = LLVMConstInt(i32t, i, 0);
198 LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
199 #if 0
200 /*
201 * XXX: not really supported by backends.
202 * Even if they would now, rounding mode cannot be specified and
203 * is undefined.
204 */
205 LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
206 #else
207 LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
208 #endif
209 ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
210 }
211
212 lp_build_print_value(gallivm, "src = ", src);
213 lp_build_print_value(gallivm, "llvm = ", result);
214 lp_build_print_value(gallivm, "util = ", ref_result);
215 lp_build_printf(gallivm, "\n");
216 }
217
218 return result;
219 }
220
221
222 /**
223 * Special case for converting clamped IEEE-754 floats to unsigned norms.
224 *
225 * The mathematical voodoo below may seem excessive but it is actually
226 * paramount that we do it this way, for several reasons. First, there is no
227 * single-precision FP to unsigned integer conversion instruction in Intel SSE.
228 * Second, even if there was, since the FP's mantissa takes only a fraction of
229 * the register bits, the typical scale-and-cast approach would require double
230 * precision for accurate results, and therefore half the throughput.
231 *
232 * Although the result values can be scaled to an arbitrary bit width specified
233 * by dst_width, the actual result type will have the same width as the source type.
234 *
235 * Ex: src = { float, float, float, float }
236 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
237 */
238 LLVMValueRef
239 lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
240 struct lp_type src_type,
241 unsigned dst_width,
242 LLVMValueRef src)
243 {
244 LLVMBuilderRef builder = gallivm->builder;
245 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
246 LLVMValueRef res;
247 unsigned mantissa;
248
249 assert(src_type.floating);
250 assert(dst_width <= src_type.width);
251 src_type.sign = FALSE;
252
253 mantissa = lp_mantissa(src_type);
254
255 if (dst_width <= mantissa) {
256 /*
257 * Apply magic coefficients that will make the desired result appear
258 * in the least significant bits of the mantissa, with correct rounding.
259 *
260 * This only works if the destination width fits in the mantissa.
261 */
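      /*
       * Worked example (illustrative numbers only): for float32 sources
       * (mantissa = 23) and dst_width = 8 this gives
       *
       *   scale = 255/256,  bias = 2^(23 - 8) = 32768.0
       *
       * A clamped input of 1.0 becomes 1.0 * 255/256 + 32768.0 =
       * 32768.99609375. At that magnitude one mantissa ulp is 2^-8, so the
       * addition rounds src * 255 to nearest and leaves it in the low 8
       * mantissa bits, which the bitcast and mask below extract directly.
       */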
262
263 unsigned long long ubound;
264 unsigned long long mask;
265 double scale;
266 double bias;
267
268 ubound = (1ULL << dst_width);
269 mask = ubound - 1;
270 scale = (double)mask/ubound;
271 bias = (double)(1ULL << (mantissa - dst_width));
272
273 res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
274 /* instead of fadd/and could (with sse2) just use lp_build_iround */
275 res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
276 res = LLVMBuildBitCast(builder, res, int_vec_type, "");
277 res = LLVMBuildAnd(builder, res,
278 lp_build_const_int_vec(gallivm, src_type, mask), "");
279 }
280 else if (dst_width == (mantissa + 1)) {
281 /*
282 * The destination width matches exactly what can be represented in
283 * floating point (i.e., mantissa + 1 bits). Even so correct rounding
284 * still needs to be applied (only for numbers in [0.5, 1.0] would
285 * conversion using truncation after scaling be sufficient).
286 */
287 double scale;
288 struct lp_build_context uf32_bld;
289
290 lp_build_context_init(&uf32_bld, gallivm, src_type);
291 scale = (double)((1ULL << dst_width) - 1);
292
293 res = LLVMBuildFMul(builder, src,
294 lp_build_const_vec(gallivm, src_type, scale), "");
295 res = lp_build_iround(&uf32_bld, res);
296 }
297 else {
298 /*
299 * The destination exceeds what can be represented in the floating point.
300 * So multiply by the largest power of two we can get away with, and then
301 * subtract the most significant bit to rescale to normalized values.
302 *
303 * The largest power of two factor we can get away with is
304 * (1 << (src_type.width - 1)), because we need to use a signed conversion
305 * (FPToSI). In theory it should be (1 << (src_type.width - 2)), but IEEE 754
306 * rules state INT_MIN should be returned by FPToSI, which is the correct
307 * result for values near 1.0!
307 * values near 1.0!
308 *
309 * This means we get (src_type.width - 1) correct bits for values near 0.0,
310 * and (mantissa + 1) correct bits for values near 1.0. Equally or more
311 * important, we also get exact results for 0.0 and 1.0.
312 */
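      /*
       * Worked example (illustrative, assuming the x86-style FPToSI overflow
       * behavior mentioned above): float32 source and dst_width = 32, so
       * n = 31, scale = 2^31, lshift = 1, rshift = 31.
       *
       *   src = 1.0:  fptosi(2^31) = INT_MIN = 0x80000000
       *               lshifted = 0x00000000, rshifted = 0x00000001
       *               res = 0 - 1 = 0xffffffff   (exact)
       *   src = 0.0:  res = 0 - 0 = 0x00000000   (exact)
       */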
313
314 unsigned n = MIN2(src_type.width - 1u, dst_width);
315
316 double scale = (double)(1ULL << n);
317 unsigned lshift = dst_width - n;
318 unsigned rshift = n;
319 LLVMValueRef lshifted;
320 LLVMValueRef rshifted;
321
322 res = LLVMBuildFMul(builder, src,
323 lp_build_const_vec(gallivm, src_type, scale), "");
324 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
325
326 /*
327 * Align the most significant bit to its final place.
328 *
329 * This will cause 1.0 to overflow to 0, but the later adjustment will
330 * get it right.
331 */
332 if (lshift) {
333 lshifted = LLVMBuildShl(builder, res,
334 lp_build_const_int_vec(gallivm, src_type,
335 lshift), "");
336 } else {
337 lshifted = res;
338 }
339
340 /*
341 * Align the most significant bit to the right.
342 */
343 rshifted = LLVMBuildLShr(builder, res,
344 lp_build_const_int_vec(gallivm, src_type, rshift),
345 "");
346
347 /*
348 * Subtract the MSB (shifted down to the LSB position), thereby re-scaling
349 * from (1 << dst_width) to ((1 << dst_width) - 1).
350 */
351
352 res = LLVMBuildSub(builder, lshifted, rshifted, "");
353 }
354
355 return res;
356 }
357
358
359 /**
360 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
361 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
362 * return {float, float, float, float} with values in range [0, 1].
363 */
364 LLVMValueRef
365 lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
366 unsigned src_width,
367 struct lp_type dst_type,
368 LLVMValueRef src)
369 {
370 LLVMBuilderRef builder = gallivm->builder;
371 LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
372 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
373 LLVMValueRef bias_;
374 LLVMValueRef res;
375 unsigned mantissa;
376 unsigned n;
377 unsigned long long ubound;
378 unsigned long long mask;
379 double scale;
380 double bias;
381
382 assert(dst_type.floating);
383
384 mantissa = lp_mantissa(dst_type);
385
386 if (src_width <= (mantissa + 1)) {
387 /*
388 * The source width fits what can be represented in floating
389 * point (i.e., mantissa + 1 bits). So a straight cast followed by a
390 * multiplication by the scale factor suffices. No further rounding is necessary.
391 */
392
393 scale = 1.0/(double)((1ULL << src_width) - 1);
394 res = LLVMBuildSIToFP(builder, src, vec_type, "");
395 res = LLVMBuildFMul(builder, res,
396 lp_build_const_vec(gallivm, dst_type, scale), "");
397 return res;
398 }
399 else {
400 /*
401 * The source width exceeds what can be represented in floating
402 * point. So truncate the incoming values.
403 */
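      /*
       * Worked example (illustrative numbers only): float32 destination
       * (mantissa = 23) and src_width = 32, so n = 23,
       * scale = 2^23 / (2^23 - 1) and bias = 1.0 (0x3f800000).
       *
       * The input is shifted right by 32 - 23 = 9 bits and OR'ed into the
       * mantissa of 1.0, giving a float in [1.0, 2.0). Subtracting the bias
       * yields (x >> 9) / 2^23 (roughly x / 2^32), and the final multiply by
       * scale maps the maximum input (2^32 - 1) to exactly 1.0.
       */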
404
405 n = MIN2(mantissa, src_width);
406
407 ubound = ((unsigned long long)1 << n);
408 mask = ubound - 1;
409 scale = (double)ubound/mask;
410 bias = (double)((unsigned long long)1 << (mantissa - n));
411
412 res = src;
413
414 if (src_width > mantissa) {
415 int shift = src_width - mantissa;
416 res = LLVMBuildLShr(builder, res,
417 lp_build_const_int_vec(gallivm, dst_type, shift), "");
418 }
419
420 bias_ = lp_build_const_vec(gallivm, dst_type, bias);
421
422 res = LLVMBuildOr(builder,
423 res,
424 LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");
425
426 res = LLVMBuildBitCast(builder, res, vec_type, "");
427
428 res = LLVMBuildFSub(builder, res, bias_, "");
429 res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
430 }
431
432 return res;
433 }
434
435
436 /**
437 * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
438 *
439 * Returns the number of dsts created from src
440 */
441 int lp_build_conv_auto(struct gallivm_state *gallivm,
442 struct lp_type src_type,
443 struct lp_type* dst_type,
444 const LLVMValueRef *src,
445 unsigned num_srcs,
446 LLVMValueRef *dst)
447 {
448 unsigned i;
449 int num_dsts = num_srcs;
450
451 if (src_type.floating == dst_type->floating &&
452 src_type.width == dst_type->width &&
453 src_type.length == dst_type->length &&
454 src_type.fixed == dst_type->fixed &&
455 src_type.norm == dst_type->norm &&
456 src_type.sign == dst_type->sign)
457 return num_dsts;
458
459 /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub
460 */
461 if (src_type.floating == 1 &&
462 src_type.fixed == 0 &&
463 src_type.sign == 1 &&
464 src_type.norm == 0 &&
465 src_type.width == 32 &&
466
467 dst_type->floating == 0 &&
468 dst_type->fixed == 0 &&
469 dst_type->sign == 0 &&
470 dst_type->norm == 1 &&
471 dst_type->width == 8)
472 {
473 /* Special case 4x4f --> 1x16ub */
474 if (src_type.length == 4 &&
475 (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
476 {
477 num_dsts = (num_srcs + 3) / 4;
478 dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;
479
480 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
481 return num_dsts;
482 }
483
484 /* Special case 2x8f --> 1x16ub */
485 if (src_type.length == 8 &&
486 util_cpu_caps.has_avx)
487 {
488 num_dsts = (num_srcs + 1) / 2;
489 dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;
490
491 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
492 return num_dsts;
493 }
494 }
495
496 /* lp_build_resize does not support M:N */
497 if (src_type.width == dst_type->width) {
498 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
499 } else {
500 for (i = 0; i < num_srcs; ++i) {
501 lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1);
502 }
503 }
504
505 return num_dsts;
506 }
507
508
509 /**
510 * Generic type conversion.
511 *
512 * TODO: Take a precision argument, or even better, add a new precision member
513 * to the lp_type union.
514 */
515 void
516 lp_build_conv(struct gallivm_state *gallivm,
517 struct lp_type src_type,
518 struct lp_type dst_type,
519 const LLVMValueRef *src, unsigned num_srcs,
520 LLVMValueRef *dst, unsigned num_dsts)
521 {
522 LLVMBuilderRef builder = gallivm->builder;
523 struct lp_type tmp_type;
524 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
525 unsigned num_tmps;
526 unsigned i;
527
528 /* We must not lose or gain channels, only precision. */
529 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
530
531 assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
532 assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
533 assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
534 assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
535
536 tmp_type = src_type;
537 for(i = 0; i < num_srcs; ++i) {
538 assert(lp_check_value(src_type, src[i]));
539 tmp[i] = src[i];
540 }
541 num_tmps = num_srcs;
542
543
544 /* Special case 4x4f --> 1x16ub, 2x4f -> 1x8ub, 1x4f -> 1x4ub
545 */
546 if (src_type.floating == 1 &&
547 src_type.fixed == 0 &&
548 src_type.sign == 1 &&
549 src_type.norm == 0 &&
550 src_type.width == 32 &&
551 src_type.length == 4 &&
552
553 dst_type.floating == 0 &&
554 dst_type.fixed == 0 &&
555 dst_type.sign == 0 &&
556 dst_type.norm == 1 &&
557 dst_type.width == 8 &&
558
559 ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
560 (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
561
562 (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
563 {
564 struct lp_build_context bld;
565 struct lp_type int16_type, int32_type;
566 struct lp_type dst_type_ext = dst_type;
567 LLVMValueRef const_255f;
568 unsigned i, j;
569
570 lp_build_context_init(&bld, gallivm, src_type);
571
572 dst_type_ext.length = 16;
573 int16_type = int32_type = dst_type_ext;
574
575 int16_type.width *= 2;
576 int16_type.length /= 2;
577 int16_type.sign = 1;
578
579 int32_type.width *= 4;
580 int32_type.length /= 4;
581 int32_type.sign = 1;
582
583 const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
584
585 for (i = 0; i < num_dsts; ++i, src += 4) {
586 LLVMValueRef lo, hi;
587
588 for (j = 0; j < dst_type.length / 4; ++j) {
589 tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
590 tmp[j] = lp_build_iround(&bld, tmp[j]);
591 }
592
593 if (num_srcs == 1) {
594 tmp[1] = tmp[0];
595 }
596
597 /* relying on clamping behavior of sse2 intrinsics here */
598 lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
599
600 if (num_srcs < 4) {
601 hi = lo;
602 }
603 else {
604 hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
605 }
606 dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
607 }
608 if (num_srcs < 4) {
609 dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
610 }
611
612 return;
613 }
614
615 /* Special case 2x8f --> 1x16ub, 1x8f -> 1x8ub
616 */
617 else if (src_type.floating == 1 &&
618 src_type.fixed == 0 &&
619 src_type.sign == 1 &&
620 src_type.norm == 0 &&
621 src_type.width == 32 &&
622 src_type.length == 8 &&
623
624 dst_type.floating == 0 &&
625 dst_type.fixed == 0 &&
626 dst_type.sign == 0 &&
627 dst_type.norm == 1 &&
628 dst_type.width == 8 &&
629
630 ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
631 (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&
632
633 util_cpu_caps.has_avx) {
634
635 struct lp_build_context bld;
636 struct lp_type int16_type, int32_type;
637 struct lp_type dst_type_ext = dst_type;
638 LLVMValueRef const_255f;
639 unsigned i;
640
641 lp_build_context_init(&bld, gallivm, src_type);
642
643 dst_type_ext.length = 16;
644 int16_type = int32_type = dst_type_ext;
645
646 int16_type.width *= 2;
647 int16_type.length /= 2;
648 int16_type.sign = 1;
649
650 int32_type.width *= 4;
651 int32_type.length /= 4;
652 int32_type.sign = 1;
653
654 const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
655
656 for (i = 0; i < num_dsts; ++i, src += 2) {
657 LLVMValueRef lo, hi, a, b;
658
659 a = LLVMBuildFMul(builder, src[0], const_255f, "");
660 a = lp_build_iround(&bld, a);
661 tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
662 tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
663 /* relying on clamping behavior of sse2 intrinsics here */
664 lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
665
666 if (num_srcs == 1) {
667 hi = lo;
668 }
669 else {
670 b = LLVMBuildFMul(builder, src[1], const_255f, "");
671 b = lp_build_iround(&bld, b);
672 tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
673 tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
674 hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
675
676 }
677 dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
678 }
679
680 if (num_srcs == 1) {
681 dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
682 }
683
684 return;
685 }
686
687 /* Special case -> 16bit half-float
688 */
689 else if (dst_type.floating && dst_type.width == 16)
690 {
691 /* Only support src as 32bit float currently */
692 assert(src_type.floating && src_type.width == 32);
693
694 for(i = 0; i < num_tmps; ++i)
695 dst[i] = lp_build_float_to_half(gallivm, tmp[i]);
696
697 return;
698 }
699
700 /* Pre-convert half-floats to floats
701 */
702 else if (src_type.floating && src_type.width == 16)
703 {
704 for(i = 0; i < num_tmps; ++i)
705 tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);
706
707 tmp_type.width = 32;
708 }
709
710 /*
711 * Clamp if necessary
712 */
713
714 if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
715 struct lp_build_context bld;
716 double src_min = lp_const_min(src_type);
717 double dst_min = lp_const_min(dst_type);
718 double src_max = lp_const_max(src_type);
719 double dst_max = lp_const_max(dst_type);
720 LLVMValueRef thres;
721
722 lp_build_context_init(&bld, gallivm, tmp_type);
723
724 if(src_min < dst_min) {
725 if(dst_min == 0.0)
726 thres = bld.zero;
727 else
728 thres = lp_build_const_vec(gallivm, src_type, dst_min);
729 for(i = 0; i < num_tmps; ++i)
730 tmp[i] = lp_build_max(&bld, tmp[i], thres);
731 }
732
733 if(src_max > dst_max) {
734 if(dst_max == 1.0)
735 thres = bld.one;
736 else
737 thres = lp_build_const_vec(gallivm, src_type, dst_max);
738 for(i = 0; i < num_tmps; ++i)
739 tmp[i] = lp_build_min(&bld, tmp[i], thres);
740 }
741 }
742
743 /*
744 * Scale to the narrowest range
745 */
746
747 if(dst_type.floating) {
748 /* Nothing to do */
749 }
750 else if(tmp_type.floating) {
751 if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
752 for(i = 0; i < num_tmps; ++i) {
753 tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
754 tmp_type,
755 dst_type.width,
756 tmp[i]);
757 }
758 tmp_type.floating = FALSE;
759 }
760 else {
761 double dst_scale = lp_const_scale(dst_type);
762
763 if (dst_scale != 1.0) {
764 LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
765 for(i = 0; i < num_tmps; ++i)
766 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
767 }
768
769 /*
770 * these functions will use fptosi in some form which won't work
771 * with 32bit uint dst. Causes lp_test_conv failures though.
772 */
773 if (0)
774 assert(dst_type.sign || dst_type.width < 32);
775
776 if (dst_type.sign && dst_type.norm && !dst_type.fixed) {
777 struct lp_build_context bld;
778
779 lp_build_context_init(&bld, gallivm, tmp_type);
780 for(i = 0; i < num_tmps; ++i) {
781 tmp[i] = lp_build_iround(&bld, tmp[i]);
782 }
783 tmp_type.floating = FALSE;
784 }
785 else {
786 LLVMTypeRef tmp_vec_type;
787
788 tmp_type.floating = FALSE;
789 tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
790 for(i = 0; i < num_tmps; ++i) {
791 #if 0
792 if(dst_type.sign)
793 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
794 else
795 tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
796 #else
797 /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
798 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
799 #endif
800 }
801 }
802 }
803 }
804 else {
805 unsigned src_shift = lp_const_shift(src_type);
806 unsigned dst_shift = lp_const_shift(dst_type);
807 unsigned src_offset = lp_const_offset(src_type);
808 unsigned dst_offset = lp_const_offset(dst_type);
809 struct lp_build_context bld;
810 lp_build_context_init(&bld, gallivm, tmp_type);
811
812 /* Compensate for different offsets */
813 /* sscaled -> unorm and similar would cause negative shift count, skip */
814 if (dst_offset > src_offset && src_type.width > dst_type.width && src_shift > 0) {
815 for (i = 0; i < num_tmps; ++i) {
816 LLVMValueRef shifted;
817
818 shifted = lp_build_shr_imm(&bld, tmp[i], src_shift - 1);
819 tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
820 }
821 }
822
823 if(src_shift > dst_shift) {
824 for(i = 0; i < num_tmps; ++i)
825 tmp[i] = lp_build_shr_imm(&bld, tmp[i], src_shift - dst_shift);
826 }
827 }
828
829 /*
830 * Truncate or expand bit width
831 *
832 * No data conversion should happen here, although the sign bits are
833 * crucial to avoid bad clamping.
834 */
835
836 {
837 struct lp_type new_type;
838
839 new_type = tmp_type;
840 new_type.sign = dst_type.sign;
841 new_type.width = dst_type.width;
842 new_type.length = dst_type.length;
843
844 lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);
845
846 tmp_type = new_type;
847 num_tmps = num_dsts;
848 }
849
850 /*
851 * Scale to the widest range
852 */
853
854 if(src_type.floating) {
855 /* Nothing to do */
856 }
857 else if(!src_type.floating && dst_type.floating) {
858 if(!src_type.fixed && !src_type.sign && src_type.norm) {
859 for(i = 0; i < num_tmps; ++i) {
860 tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
861 src_type.width,
862 dst_type,
863 tmp[i]);
864 }
865 tmp_type.floating = TRUE;
866 }
867 else {
868 double src_scale = lp_const_scale(src_type);
869 LLVMTypeRef tmp_vec_type;
870
871 /* Use an equally sized floating point type for intermediate computations */
872 tmp_type.floating = TRUE;
873 tmp_type.sign = TRUE;
874 tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
875 for(i = 0; i < num_tmps; ++i) {
876 #if 0
877 if(dst_type.sign)
878 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
879 else
880 tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
881 #else
882 /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
883 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
884 #endif
885 }
886
887 if (src_scale != 1.0) {
888 LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
889 for(i = 0; i < num_tmps; ++i)
890 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
891 }
892
893 /* The formula above will produce values below -1.0 for the most negative
894 * value, but everything seems happy with that, hence this is disabled for now. */
895 if (0 && !src_type.fixed && src_type.norm && src_type.sign) {
896 struct lp_build_context bld;
897
898 lp_build_context_init(&bld, gallivm, dst_type);
899 for(i = 0; i < num_tmps; ++i) {
900 tmp[i] = lp_build_max(&bld, tmp[i],
901 lp_build_const_vec(gallivm, dst_type, -1.0f));
902 }
903 }
904 }
905 }
906 else {
907 unsigned src_shift = lp_const_shift(src_type);
908 unsigned dst_shift = lp_const_shift(dst_type);
909 unsigned src_offset = lp_const_offset(src_type);
910 unsigned dst_offset = lp_const_offset(dst_type);
911 struct lp_build_context bld;
912 lp_build_context_init(&bld, gallivm, tmp_type);
913
914 if (src_shift < dst_shift) {
915 LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
916
917 if (dst_shift - src_shift < dst_type.width) {
918 for (i = 0; i < num_tmps; ++i) {
919 pre_shift[i] = tmp[i];
920 tmp[i] = lp_build_shl_imm(&bld, tmp[i], dst_shift - src_shift);
921 }
922 }
923 else {
924 /*
925 * This happens for things like sscaled -> unorm conversions. Shift
926 * counts equal to bit width cause undefined results, so hack around it.
927 */
928 for (i = 0; i < num_tmps; ++i) {
929 pre_shift[i] = tmp[i];
930 tmp[i] = lp_build_zero(gallivm, dst_type);
931 }
932 }
933
934 /* Compensate for different offsets */
935 if (dst_offset > src_offset) {
936 for (i = 0; i < num_tmps; ++i) {
937 tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
938 }
939 }
940 }
941 }
942
943 for(i = 0; i < num_dsts; ++i) {
944 dst[i] = tmp[i];
945 assert(lp_check_value(dst_type, dst[i]));
946 }
947 }
948
949
950 /**
951 * Bit mask conversion.
952 *
953 * This will convert the integer masks that match the given types.
954 *
955 * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
956 * Any other value will likely cause unpredictable results.
957 *
958 * This is basically a very trimmed down version of lp_build_conv.
959 */
960 void
961 lp_build_conv_mask(struct gallivm_state *gallivm,
962 struct lp_type src_type,
963 struct lp_type dst_type,
964 const LLVMValueRef *src, unsigned num_srcs,
965 LLVMValueRef *dst, unsigned num_dsts)
966 {
967
968 /* We must not lose or gain channels, only precision. */
969 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
970
971 /*
972 * Drop the floating/fixed/norm flags and treat the masks as plain signed integers.
973 *
974 * We assume all values are 0 or -1.
975 */
976
977 src_type.floating = FALSE;
978 src_type.fixed = FALSE;
979 src_type.sign = TRUE;
980 src_type.norm = FALSE;
981
982 dst_type.floating = FALSE;
983 dst_type.fixed = FALSE;
984 dst_type.sign = TRUE;
985 dst_type.norm = FALSE;
986
987 /*
988 * Truncate or expand bit width
989 */
990
991 lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
992 }
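/*
 * A minimal usage sketch (hypothetical values, for illustration only):
 * converting a coverage mask held in 4 vectors of 4 x i32 (each element
 * 0 or ~0) into 1 vector of 16 x i8, e.g. to match an 8-bit per channel
 * destination layout:
 *
 *   struct lp_type mask32 = lp_type_int_vec(32, 128);   (4 x i32)
 *   struct lp_type mask8  = lp_type_int_vec(8, 128);    (16 x i8)
 *   LLVMValueRef src_mask[4];   (each element 0 or 0xffffffff)
 *   LLVMValueRef dst_mask[1];
 *
 *   lp_build_conv_mask(gallivm, mask32, mask8, src_mask, 4, dst_mask, 1);
 */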