/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
/**
 * @file
 * Helper functions for type conversions.
 *
 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between
 * several types accurately and efficiently.
 *
 * Conversion between types of different bit width is quite complex, since a
 * change in bit width also changes how many elements fit in a register.
 *
 * To remember, there are a few invariants in type conversions:
 *
 * - register width must remain constant:
 *
 *     src_type.width * src_type.length == dst_type.width * dst_type.length
 *
 * - total number of elements must remain constant:
 *
 *     src_type.length * num_srcs == dst_type.length * num_dsts
 *
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts here and sacrifice accuracy, as
 * these functions can be used anywhere. In the future we might have a
 * precision parameter which can gauge the accuracy vs efficiency compromise,
 * but for now, if the data conversion between two stages happens to be the
 * bottleneck, then most likely one should just avoid converting at all and
 * run both stages with the same type.
 *
 * Make sure to run the lp_test_conv unit test after any change to this file.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
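/*
 * Editor's note, not part of the original file: a concrete instance of the
 * two invariants above, assuming 128-bit registers. Converting 4 vectors of
 * 4 x float32 into unorm8 yields 1 vector of 16 x uint8: the register width
 * is preserved (32 * 4 == 8 * 16) and so is the total element count
 * (4 * 4 == 16 * 1). This is exactly the 4x4x32 --> 1x16x8 special case
 * handled in lp_build_conv() below.
 */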
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_half.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_pack.h"
#include "lp_bld_conv.h"
#include "lp_bld_logic.h"
#include "lp_bld_intr.h"
#include "lp_bld_printf.h"
#include "lp_bld_format.h"
/**
 * Converts int16 half-float to float32
 * Note this can be performed in 1 instruction if vcvtph2ps exists (f16c/cvt16)
 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
 *
 * @param src  value to convert
 */
LLVMValueRef
lp_build_half_to_float(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef src_type = LLVMTypeOf(src);
   unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
                            LLVMGetVectorSize(src_type) : 1;

   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
   LLVMValueRef h;

   if (util_cpu_caps.has_f16c &&
       (src_length == 4 || src_length == 8)) {
      if (LLVM_VERSION_MAJOR < 11) {
         const char *intrinsic = NULL;
         if (src_length == 4) {
            src = lp_build_pad_vector(gallivm, src, 8);
            intrinsic = "llvm.x86.vcvtph2ps.128";
         }
         else {
            intrinsic = "llvm.x86.vcvtph2ps.256";
         }
         return lp_build_intrinsic_unary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, f32_type), src);
      }
      else {
         /*
          * XXX: could probably use on other archs as well.
          * But if the cpu doesn't support it natively it looks like the
          * backends still can't lower it and will try to call out to external
          * libraries, which will crash.
          */
         /*
          * XXX: lp_build_vec_type() would use an int16 vector. Probably need
          * to revisit this at some point.
          */
         src = LLVMBuildBitCast(builder, src,
                                LLVMVectorType(LLVMHalfTypeInContext(gallivm->context), src_length), "");
         return LLVMBuildFPExt(builder, src, lp_build_vec_type(gallivm, f32_type), "");
      }
   }

   h = LLVMBuildZExt(builder, src, int_vec_type, "");
   return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
}
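/*
 * Editor's usage sketch, not part of the original file ("packed" is a
 * hypothetical <8 x i16> value holding IEEE half-floats):
 *
 *    LLVMValueRef f32 = lp_build_half_to_float(gallivm, packed);
 *
 * "f32" then has type <8 x float>. On an F16C-capable CPU this lowers to a
 * single vcvtph2ps; otherwise it takes the generic
 * lp_build_smallfloat_to_float() path with 10 mantissa and 5 exponent bits.
 */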
/**
 * Converts float32 to int16 half-float
 * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
 *
 * @param src  value to convert
 *
 * Convert float32 to half floats, preserving Infs and NaNs,
 * with rounding towards zero (trunc).
 * XXX: For GL, would prefer rounding towards nearest(-even).
 */
LLVMValueRef
lp_build_float_to_half(struct gallivm_state *gallivm,
                       LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
   unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
                   ? LLVMGetVectorSize(f32_vec_type) : 1;
   struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
   struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
   LLVMValueRef result;

   /*
    * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
    * directly, without any (x86 or generic) intrinsics.
    * Albeit the rounding mode cannot be specified (and is undefined,
    * though in practice on x86 it seems to do nearest-even, but it may
    * be dependent on instruction set support), so it is essentially
    * useless here.
    */

   if (util_cpu_caps.has_f16c &&
       (length == 4 || length == 8)) {
      struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
      unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      const char *intrinsic = NULL;
      if (length == 4) {
         intrinsic = "llvm.x86.vcvtps2ph.128";
      }
      else {
         intrinsic = "llvm.x86.vcvtps2ph.256";
      }
      result = lp_build_intrinsic_binary(builder, intrinsic,
                                         lp_build_vec_type(gallivm, i168_type),
                                         src, LLVMConstInt(i32t, mode, 0));
      if (length == 4) {
         result = lp_build_extract_range(gallivm, result, 0, 4);
      }
   }
   else {
      result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
      /* Convert int32 vector to int16 vector by trunc (might generate bad code) */
      result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
   }

   /*
    * Debugging code.
    */
   if (0) {
      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
      LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
      LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
      LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
      unsigned i;

      LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
      LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)util_float_to_half));
      func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "util_float_to_half");

      for (i = 0; i < length; ++i) {
         LLVMValueRef index = LLVMConstInt(i32t, i, 0);
         LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
#if 0
         /*
          * XXX: not really supported by backends.
          * Even if it were now, the rounding mode cannot be specified and
          * is undefined.
          */
         LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
#else
         LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
#endif
         ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
      }

      lp_build_print_value(gallivm, "src  = ", src);
      lp_build_print_value(gallivm, "llvm = ", result);
      lp_build_print_value(gallivm, "util = ", ref_result);
      lp_build_printf(gallivm, "\n");
   }

   return result;
}
/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no single
 * precision FP to unsigned integer conversion Intel SSE instruction. Second,
 * even if there were, since the FP's mantissa takes only a fraction of
 * register bits the typical scale-and-cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width specified
 * by dst_width, the actual result type will have the same width as the input.
 *
 * Ex: src = { float, float, float, float }
 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
   LLVMValueRef res;
   unsigned mantissa;

   assert(src_type.floating);
   assert(dst_width <= src_type.width);
   src_type.sign = FALSE;

   mantissa = lp_mantissa(src_type);

   if (dst_width <= mantissa) {
      /*
       * Apply magic coefficients that will make the desired result appear
       * in the least significant bits of the mantissa, with correct rounding.
       *
       * This only works if the destination width fits in the mantissa.
       */

      unsigned long long ubound;
      unsigned long long mask;
      double scale;
      double bias;

      ubound = (1ULL << dst_width);
      mask = ubound - 1;
      scale = (double)mask/ubound;
      bias = (double)(1ULL << (mantissa - dst_width));

      res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
      /* instead of fadd/and could (with sse2) just use lp_build_iround */
      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(gallivm, src_type, mask), "");
   }
   else if (dst_width == (mantissa + 1)) {
      /*
       * The destination width matches exactly what can be represented in
       * floating point (i.e., mantissa + 1 bits). Even so, correct rounding
       * still needs to be applied (only for numbers in [0.5, 1.0] would
       * conversion using truncation after scaling be sufficient).
       */

      double scale;
      struct lp_build_context uf32_bld;

      lp_build_context_init(&uf32_bld, gallivm, src_type);
      scale = (double)((1ULL << dst_width) - 1);

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = lp_build_iround(&uf32_bld, res);
   }
   else {
      /*
       * The destination exceeds what can be represented in the floating point.
       * So multiply by the largest power of two we can get away with, and then
       * subtract the most significant bit to rescale to normalized values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use a signed
       * conversion. In theory it should be (1 << (src_type.width - 2)), but
       * IEEE 754 rules state INT_MIN should be returned by FPToSI on overflow,
       * which is the correct result for values near 1.0!
       *
       * This means we get (src_type.width - 1) correct bits for values near 0.0,
       * and (mantissa + 1) correct bits for values near 1.0. Equally or more
       * important, we also get exact results for 0.0 and 1.0.
       */

      unsigned n = MIN2(src_type.width - 1u, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
      unsigned rshift = n;
      LLVMValueRef lshifted;
      LLVMValueRef rshifted;

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      if (!src_type.sign && src_type.width == 32)
         res = LLVMBuildFPToUI(builder, res, int_vec_type, "");
      else
         res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

      /*
       * Align the most significant bit to its final place.
       *
       * This will cause 1.0 to overflow to 0, but the later adjustment will
       * get it right.
       */
      if (lshift) {
         lshifted = LLVMBuildShl(builder, res,
                                 lp_build_const_int_vec(gallivm, src_type,
                                                        lshift), "");
      }
      else {
         lshifted = res;
      }

      /*
       * Align the most significant bit to the right.
       */
      rshifted = LLVMBuildLShr(builder, res,
                               lp_build_const_int_vec(gallivm, src_type, rshift),
                               "");

      /*
       * Subtract the MSB from the LSB-aligned copy, thereby re-scaling from
       * (1 << dst_width) to ((1 << dst_width) - 1).
       */
      res = LLVMBuildSub(builder, lshifted, rshifted, "");
   }

   return res;
}
/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
 * return {float, float, float, float} with values in range [0, 1].
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(dst_type.floating);

   mantissa = lp_mantissa(dst_type);

   if (src_width <= (mantissa + 1)) {
      /*
       * The source width fits what can be represented in floating
       * point (i.e., mantissa + 1 bits). So do a straight multiplication
       * followed by casting. No further rounding is necessary.
       */

      scale = 1.0/(double)((1ULL << src_width) - 1);
      res = LLVMBuildSIToFP(builder, src, vec_type, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
      return res;
   }
   else {
      /*
       * The source width exceeds what can be represented in floating
       * point. So truncate the incoming values.
       */

      n = MIN2(mantissa, src_width);

      ubound = ((unsigned long long)1 << n);
      mask = ubound - 1;
      scale = (double)ubound/mask;
      bias = (double)((unsigned long long)1 << (mantissa - n));

      res = src;

      if (src_width > mantissa) {
         int shift = src_width - mantissa;
         res = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
      }

      bias_ = lp_build_const_vec(gallivm, dst_type, bias);

      res = LLVMBuildOr(builder,
                        res,
                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

      res = LLVMBuildBitCast(builder, res, vec_type, "");

      res = LLVMBuildFSub(builder, res, bias_, "");
      res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
   }

   return res;
}
/**
 * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
 *
 * Returns the number of dsts created from src.
 */
int lp_build_conv_auto(struct gallivm_state *gallivm,
                       struct lp_type src_type,
                       struct lp_type* dst_type,
                       const LLVMValueRef *src,
                       unsigned num_srcs,
                       LLVMValueRef *dst)
{
   unsigned i;
   int num_dsts = num_srcs;

   if (src_type.floating == dst_type->floating &&
       src_type.width == dst_type->width &&
       src_type.length == dst_type->length &&
       src_type.fixed == dst_type->fixed &&
       src_type.norm == dst_type->norm &&
       src_type.sign == dst_type->sign)
      return num_dsts;

   /* Special case 4x4x32 -> 1x16x8 or 2x8x32 -> 1x16x8
    */
   if (src_type.norm     == 0 &&
       src_type.width    == 32 &&
       src_type.fixed    == 0 &&

       dst_type->floating == 0 &&
       dst_type->fixed    == 0 &&
       dst_type->width    == 8 &&

       ((src_type.floating == 1 && src_type.sign == 1 && dst_type->norm == 1) ||
        (src_type.floating == 0 && dst_type->floating == 0 &&
         src_type.sign == dst_type->sign && dst_type->norm == 0))) {

      /* Special case 4x4x32 --> 1x16x8 */
      if (src_type.length == 4 &&
          (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
      {
         num_dsts = (num_srcs + 3) / 4;
         dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }

      /* Special case 2x8x32 --> 1x16x8 */
      if (src_type.length == 8 &&
          util_cpu_caps.has_avx)
      {
         num_dsts = (num_srcs + 1) / 2;
         dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;

         lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
         return num_dsts;
      }
   }

   /* lp_build_resize does not support M:N */
   if (src_type.width == dst_type->width) {
      lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
   }
   else {
      /*
       * If dst_width is 16 bits and src_width 32 and the dst vector size is
       * 64bit, try feeding 2 vectors at once so pack intrinsics can be used.
       * (For AVX, this isn't needed, since we usually get 256bit src and
       * 128bit dst vectors which works ok. If we do AVX2 pack this should
       * be extended, but we need to be able to tell the conversion code about
       * pack ordering first.)
       */
      unsigned ratio = 1;
      if (src_type.width == 2 * dst_type->width &&
          src_type.length == dst_type->length &&
          dst_type->floating == 0 && (num_srcs % 2 == 0) &&
          dst_type->width * dst_type->length == 64) {
         ratio = 2;
         num_dsts /= 2;
         dst_type->length *= 2;
      }
      for (i = 0; i < num_dsts; i++) {
         lp_build_conv(gallivm, src_type, *dst_type, &src[i*ratio], ratio, &dst[i], 1);
      }
   }

   return num_dsts;
}
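/*
 * Editor's usage sketch, not part of the original file (hypothetical
 * values): with src_type = 4 x float32, *dst_type = 4 x unorm8 and
 * num_srcs = 4, the function widens dst_type->length to 16 and returns
 * num_dsts = 1, so the optimal 4x4x32 --> 1x16x8 path of lp_build_conv()
 * is taken:
 *
 *    int n = lp_build_conv_auto(gallivm, src_type, &dst_type, src, 4, dst);
 */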
/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;
   /*
    * Special case 4x4x32 --> 1x16x8, 2x4x32 -> 1x8x8, 1x4x32 -> 1x4x8
    * Only float -> s/unorm8 and (u)int32->(u)int8.
    * XXX: This should cover all interesting backend cases for 8 bit,
    * but should use the same strategy if dst is 16 bit.
    */
   if (src_type.norm     == 0 &&
       src_type.width    == 32 &&
       src_type.length   == 4 &&
       src_type.fixed    == 0 &&

       dst_type.floating == 0 &&
       dst_type.fixed    == 0 &&
       dst_type.width    == 8 &&

       ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
        (src_type.floating == 0 && dst_type.floating == 0 &&
         src_type.sign == dst_type.sign && dst_type.norm == 0)) &&

       ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
        (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&

       (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
   {
      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_scale;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));

      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;

         if (src_type.floating) {
            for (j = 0; j < dst_type.length / 4; ++j) {
               /*
                * XXX This is not actually fully correct. The float to int
                * conversion will produce the value 0x80000000 for everything
                * out of range and NaNs (on x86, llvm.x86.sse2.cvtps2dq).
                * Hence, NaNs and negatives will get clamped just fine to zero
                * (relying on clamping pack behavior) when converting to unorm,
                * however too large values (both finite and infinite) will also
                * end up as zero, not 255.
                * For snorm, for now we'll keep bug compatibility with the
                * generic conversion path (meaning too large values are fine,
                * but NaNs get converted to -128 (purely by luck, as we don't
                * specify nan behavior for the max there) instead of 0).
                *
                * dEQP has GLES31 tests that expect +inf -> 255.0.
                */
               if (dst_type.sign) {
                  tmp[j] = lp_build_min(&bld, bld.one, src[j]);
               }
               else {
                  tmp[j] = lp_build_min_ext(&bld, bld.one, src[j],
                                            GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
               }
               tmp[j] = LLVMBuildFMul(builder, tmp[j], const_scale, "");
               tmp[j] = lp_build_iround(&bld, tmp[j]);
            }
         }
         else {
            for (j = 0; j < dst_type.length / 4; ++j) {
               if (!dst_type.sign) {
                  /*
                   * Pack clamp is always signed->unsigned (or signed->signed).
                   * Hence need min.
                   */
                  LLVMValueRef const_max;
                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
                  tmp[j] = lp_build_min(&bld, src[j], const_max);
               }
               else {
                  tmp[j] = src[j];
               }
            }
         }

         if (num_srcs == 1) {
            tmp[1] = tmp[0];
         }

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         if (num_srcs < 4) {
            hi = lo;
         }
         else {
            hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
      }
      if (num_srcs < 4) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }
   /* Special case 2x8x32 --> 1x16x8, 1x8x32 -> 1x8x8
    */
   else if (src_type.norm     == 0 &&
            src_type.width    == 32 &&
            src_type.length   == 8 &&
            src_type.fixed    == 0 &&

            dst_type.floating == 0 &&
            dst_type.fixed    == 0 &&
            dst_type.width    == 8 &&

            ((src_type.floating == 1 && src_type.sign == 1 && dst_type.norm == 1) ||
             (src_type.floating == 0 && dst_type.floating == 0 &&
              src_type.sign == dst_type.sign && dst_type.norm == 0)) &&

            ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
             (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&

            util_cpu_caps.has_avx) {

      struct lp_build_context bld;
      struct lp_type int16_type, int32_type;
      struct lp_type dst_type_ext = dst_type;
      LLVMValueRef const_scale;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      dst_type_ext.length = 16;
      int16_type = int32_type = dst_type_ext;

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_scale = lp_build_const_vec(gallivm, src_type, lp_const_scale(dst_type));

      for (i = 0; i < num_dsts; ++i, src += 2) {
         for (j = 0; j < (num_srcs == 1 ? 1 : 2); j++) {
            LLVMValueRef lo, hi, a;

            a = src[j];
            if (src_type.floating) {
               if (dst_type.sign) {
                  a = lp_build_min(&bld, bld.one, a);
               }
               else {
                  a = lp_build_min_ext(&bld, bld.one, a,
                                       GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
               }
               a = LLVMBuildFMul(builder, a, const_scale, "");
               a = lp_build_iround(&bld, a);
            }
            else {
               if (!dst_type.sign) {
                  LLVMValueRef const_max;
                  const_max = lp_build_const_int_vec(gallivm, src_type, 255);
                  a = lp_build_min(&bld, a, const_max);
               }
            }
            lo = lp_build_extract_range(gallivm, a, 0, 4);
            hi = lp_build_extract_range(gallivm, a, 4, 4);
            /* relying on clamping behavior of sse2 intrinsics here */
            tmp[j] = lp_build_pack2(gallivm, int32_type, int16_type, lo, hi);
         }

         if (num_srcs == 1) {
            tmp[1] = tmp[0];
         }
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, tmp[0], tmp[1]);
      }

      if (num_srcs == 1) {
         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
      }

      return;
   }
   /* Special case -> 16bit half-float
    */
   else if (dst_type.floating && dst_type.width == 16)
   {
      /* Only support src as 32bit float currently */
      assert(src_type.floating && src_type.width == 32);

      for(i = 0; i < num_tmps; ++i)
         dst[i] = lp_build_float_to_half(gallivm, tmp[i]);

      return;
   }
   /* Pre convert half-floats to floats
    */
   else if (src_type.floating && src_type.width == 16)
   {
      for(i = 0; i < num_tmps; ++i)
         tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);

      tmp_type.width = 32;
   }
   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }
   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /*
          * these functions will use fptosi in some form which won't work
          * with 32bit uint dst. Causes lp_test_conv failures though.
          */
         if (0)
            assert(dst_type.sign || dst_type.width < 32);

         if (dst_type.sign && dst_type.norm && !dst_type.fixed) {
            struct lp_build_context bld;

            lp_build_context_init(&bld, gallivm, tmp_type);
            for(i = 0; i < num_tmps; ++i) {
               tmp[i] = lp_build_iround(&bld, tmp[i]);
            }
            tmp_type.floating = FALSE;
         }
         else {
            LLVMTypeRef tmp_vec_type;

            tmp_type.floating = FALSE;
            tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
            for(i = 0; i < num_tmps; ++i) {
#if 0
               if(dst_type.sign)
                  tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
               else
                  tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
               /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
            }
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);
      struct lp_build_context bld;
      lp_build_context_init(&bld, gallivm, tmp_type);

      /* Compensate for different offsets */
      /* sscaled -> unorm and similar would cause negative shift count, skip */
      if (dst_offset > src_offset && src_type.width > dst_type.width && src_shift > 0) {
         for (i = 0; i < num_tmps; ++i) {
            LLVMValueRef shifted;

            shifted = lp_build_shr_imm(&bld, tmp[i], src_shift - 1);
            tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
         }
      }

      if(src_shift > dst_shift) {
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_shr_imm(&bld, tmp[i], src_shift - dst_shift);
      }
   }
   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign   = dst_type.sign;
      new_type.width  = dst_type.width;
      new_type.length = dst_type.length;

      /*
       * Note that resize when using packs can sometimes get min/max
       * clamping for free. Should be able to exploit this...
       */
      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }
   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* the formula above will produce values below -1.0 for the most
          * negative values, but everything seems happy with that, hence
          * disable for now */
         if (0 && !src_type.fixed && src_type.norm && src_type.sign) {
            struct lp_build_context bld;

            lp_build_context_init(&bld, gallivm, dst_type);
            for(i = 0; i < num_tmps; ++i) {
               tmp[i] = lp_build_max(&bld, tmp[i],
                                     lp_build_const_vec(gallivm, dst_type, -1.0f));
            }
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);
      struct lp_build_context bld;
      lp_build_context_init(&bld, gallivm, tmp_type);

      if (src_shift < dst_shift) {
         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];

         if (dst_shift - src_shift < dst_type.width) {
            for (i = 0; i < num_tmps; ++i) {
               pre_shift[i] = tmp[i];
               tmp[i] = lp_build_shl_imm(&bld, tmp[i], dst_shift - src_shift);
            }
         }
         else {
            /*
             * This happens for things like sscaled -> unorm conversions. Shift
             * counts equal to bit width cause undefined results, so hack around it.
             */
            for (i = 0; i < num_tmps; ++i) {
               pre_shift[i] = tmp[i];
               tmp[i] = lp_build_zero(gallivm, dst_type);
            }
         }

         /* Compensate for different offsets */
         if (dst_offset > src_offset) {
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
            }
         }
      }
   }
   for(i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}
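/*
 * Editor's usage sketch, not part of the original file: converting four
 * vectors of 4 x float32 colors in [0,1] into a single 16 x unorm8 vector,
 * which hits the 4x4x32 --> 1x16x8 fast path above on SSE2/AltiVec
 * (lp_type_unorm() is assumed to come from lp_bld_type.h):
 *
 *    struct lp_type f32_type = lp_type_float_vec(32, 128);  // 4 x float
 *    struct lp_type u8n_type = lp_type_unorm(8, 128);       // 16 x unorm8
 *    lp_build_conv(gallivm, f32_type, u8n_type, src, 4, dst, 1);
 */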
/**
 * Bit mask conversion.
 *
 * This will convert the integer masks that match the given types.
 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
 * Any other value will likely cause unpredictable results.
 *
 * This is basically a very trimmed down version of lp_build_conv.
 */
void
lp_build_conv_mask(struct gallivm_state *gallivm,
                   struct lp_type src_type,
                   struct lp_type dst_type,
                   const LLVMValueRef *src, unsigned num_srcs,
                   LLVMValueRef *dst, unsigned num_dsts)
{
   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   /*
    * We assume all values are 0 or -1
    */

   src_type.floating = FALSE;
   src_type.fixed = FALSE;
   src_type.sign = TRUE;
   src_type.norm = FALSE;

   dst_type.floating = FALSE;
   dst_type.fixed = FALSE;
   dst_type.sign = TRUE;
   dst_type.norm = FALSE;

   /*
    * Truncate or expand bit width
    */

   lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
}
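/*
 * Editor's usage sketch, not part of the original file: converting four
 * <4 x i32> comparison masks into one <16 x i8> mask. Only a resize is
 * needed because the all-zeros/all-ones patterns survive truncation:
 *
 *    lp_build_conv_mask(gallivm,
 *                       lp_type_int_vec(32, 128), lp_type_int_vec(8, 128),
 *                       src, 4, dst, 1);
 */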