1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper functions for type conversions.
32 *
33 * We want to use the fastest type for a given computation whenever feasible.
 * The other side of this is that we need to be able to convert between several
35 * types accurately and efficiently.
36 *
 * Conversion between types of different bit width is quite complex.
 *
 * Remember that there are a few invariants in type conversions:
40 *
41 * - register width must remain constant:
42 *
43 * src_type.width * src_type.length == dst_type.width * dst_type.length
44 *
45 * - total number of elements must remain constant:
46 *
47 * src_type.length * num_srcs == dst_type.length * num_dsts
48 *
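 * For example (purely illustrative): converting four float32x4 vectors to a
 * single unorm8x16 vector satisfies both invariants, since
 * 32 * 4 == 8 * 16 and 4 * 4 == 16 * 1.
 *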
 * It is not always possible to do the conversion both accurately and
 * efficiently, usually due to lack of adequate machine instructions. In these
 * cases it is important not to take shortcuts here and sacrifice accuracy, as
 * these functions can be used anywhere. In the future we might have a
 * precision parameter which can gauge the accuracy vs efficiency compromise,
 * but for now, if the data conversion between two stages happens to be the
 * bottleneck, then most likely you should just avoid converting at all and run
 * both stages with the same type.
57 *
58 * Make sure to run lp_test_conv unit test after any change to this file.
59 *
60 * @author Jose Fonseca <jfonseca@vmware.com>
61 */
62
63
64 #include "util/u_debug.h"
65 #include "util/u_math.h"
66 #include "util/u_half.h"
67 #include "util/u_cpu_detect.h"
68
69 #include "lp_bld_type.h"
70 #include "lp_bld_const.h"
71 #include "lp_bld_arit.h"
72 #include "lp_bld_bitarit.h"
73 #include "lp_bld_pack.h"
74 #include "lp_bld_conv.h"
75 #include "lp_bld_logic.h"
76 #include "lp_bld_intr.h"
77 #include "lp_bld_printf.h"
78 #include "lp_bld_format.h"
79
80
81
82 /**
83 * Converts int16 half-float to float32
84 * Note this can be performed in 1 instruction if vcvtph2ps exists (f16c/cvt16)
85 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
86 *
87 * @param src value to convert
88 *
89 */
90 LLVMValueRef
91 lp_build_half_to_float(struct gallivm_state *gallivm,
92 LLVMValueRef src)
93 {
94 LLVMBuilderRef builder = gallivm->builder;
95 LLVMTypeRef src_type = LLVMTypeOf(src);
96 unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
97 LLVMGetVectorSize(src_type) : 1;
98
99 struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
100 struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
101 LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
102 LLVMValueRef h;
103
104 if (util_cpu_caps.has_f16c && HAVE_LLVM >= 0x0301 &&
105 (src_length == 4 || src_length == 8)) {
106 const char *intrinsic = NULL;
107 if (src_length == 4) {
108 src = lp_build_pad_vector(gallivm, src, 8);
109 intrinsic = "llvm.x86.vcvtph2ps.128";
110 }
111 else {
112 intrinsic = "llvm.x86.vcvtph2ps.256";
113 }
114 return lp_build_intrinsic_unary(builder, intrinsic,
115 lp_build_vec_type(gallivm, f32_type), src);
116 }
117
118 /* Convert int16 vector to int32 vector by zero ext (might generate bad code) */
119 h = LLVMBuildZExt(builder, src, int_vec_type, "");
120 return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
121 }
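
/*
 * Purely illustrative scalar reference for the generic (non-F16C) path above,
 * kept under #if 0 so it is never built -- a sketch, not an existing gallivm
 * helper.  It only handles normal numbers (no denormals, Inf or NaN),
 * expanding a binary16 value by rebiasing the exponent (15 -> 127) and
 * widening the mantissa from 10 to 23 bits; the real code generated through
 * lp_build_smallfloat_to_float is what handles the general case.
 */
#if 0
static float
half_to_float_ref(uint16_t h)
{
   uint32_t sign     = (uint32_t)(h >> 15) << 31;
   uint32_t exponent = (h >> 10) & 0x1f;
   uint32_t mantissa = h & 0x3ff;
   union { uint32_t u; float f; } bits;

   bits.u = sign | ((exponent - 15 + 127) << 23) | (mantissa << 13);
   return bits.f;
}
#endif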
122
123
124 /**
125 * Converts float32 to int16 half-float
126 * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
127 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
128 *
129 * @param src value to convert
130 *
131 * Convert float32 to half floats, preserving Infs and NaNs,
132 * with rounding towards zero (trunc).
133 */
134 LLVMValueRef
135 lp_build_float_to_half(struct gallivm_state *gallivm,
136 LLVMValueRef src)
137 {
138 LLVMBuilderRef builder = gallivm->builder;
139 LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
140 unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
141 ? LLVMGetVectorSize(f32_vec_type) : 1;
142 struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
143 struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
144 LLVMValueRef result;
145
146 if (util_cpu_caps.has_f16c && HAVE_LLVM >= 0x0301 &&
147 (length == 4 || length == 8)) {
148 struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
149 unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
150 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
151 const char *intrinsic = NULL;
152 if (length == 4) {
153 intrinsic = "llvm.x86.vcvtps2ph.128";
154 }
155 else {
156 intrinsic = "llvm.x86.vcvtps2ph.256";
157 }
158 result = lp_build_intrinsic_binary(builder, intrinsic,
159 lp_build_vec_type(gallivm, i168_type),
160 src, LLVMConstInt(i32t, mode, 0));
161 if (length == 4) {
162 result = lp_build_extract_range(gallivm, result, 0, 4);
163 }
164 }
165
166 else {
167 result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
168 /* Convert int32 vector to int16 vector by trunc (might generate bad code) */
169 result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
170 }
171
172 /*
173 * Debugging code.
174 */
175 if (0) {
176 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
177 LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
178 LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
179 LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
180 unsigned i;
181
182 LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
183 LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)util_float_to_half));
184 func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "util_float_to_half");
185
186 for (i = 0; i < length; ++i) {
187 LLVMValueRef index = LLVMConstInt(i32t, i, 0);
188 LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
189 #if 0
190 /* XXX: not really supported by backends */
191 LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
192 #else
193 LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
194 #endif
195 ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
196 }
197
198 lp_build_print_value(gallivm, "src = ", src);
199 lp_build_print_value(gallivm, "llvm = ", result);
200 lp_build_print_value(gallivm, "util = ", ref_result);
201 lp_build_printf(gallivm, "\n");
202 }
203
204 return result;
205 }
206
207
208 /**
209 * Special case for converting clamped IEEE-754 floats to unsigned norms.
210 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no
 * single-precision FP to unsigned integer conversion Intel SSE instruction.
 * Second, even if there was, since the FP's mantissa takes only a fraction of
 * the register bits, the typical scale-and-cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width specified
 * by dst_width, the actual result type will have the same width as the source
 * type.
220 *
221 * Ex: src = { float, float, float, float }
222 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
223 */
224 LLVMValueRef
225 lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
226 struct lp_type src_type,
227 unsigned dst_width,
228 LLVMValueRef src)
229 {
230 LLVMBuilderRef builder = gallivm->builder;
231 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
232 LLVMValueRef res;
233 unsigned mantissa;
234
235 assert(src_type.floating);
236 assert(dst_width <= src_type.width);
237 src_type.sign = FALSE;
238
239 mantissa = lp_mantissa(src_type);
240
241 if (dst_width <= mantissa) {
242 /*
       * Apply magic coefficients that will make the desired result appear in
       * the least significant bits of the mantissa, with correct rounding.
245 *
246 * This only works if the destination width fits in the mantissa.
247 */
248
249 unsigned long long ubound;
250 unsigned long long mask;
251 double scale;
252 double bias;
253
254 ubound = (1ULL << dst_width);
255 mask = ubound - 1;
256 scale = (double)mask/ubound;
257 bias = (double)(1ULL << (mantissa - dst_width));
258
259 res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
260 /* instead of fadd/and could (with sse2) just use lp_build_iround */
261 res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
262 res = LLVMBuildBitCast(builder, res, int_vec_type, "");
263 res = LLVMBuildAnd(builder, res,
264 lp_build_const_int_vec(gallivm, src_type, mask), "");
265 }
266 else if (dst_width == (mantissa + 1)) {
267 /*
268 * The destination width matches exactly what can be represented in
269 * floating point (i.e., mantissa + 1 bits). So do a straight
270 * multiplication followed by casting. No further rounding is necessary.
271 */
272
273 double scale;
274
275 scale = (double)((1ULL << dst_width) - 1);
276
277 res = LLVMBuildFMul(builder, src,
278 lp_build_const_vec(gallivm, src_type, scale), "");
279 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
280 }
281 else {
282 /*
       * The destination exceeds what can be represented in floating point.
       * So multiply by the largest power of two we can get away with, and then
       * subtract the most significant bit to rescale to normalized values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use a signed
       * conversion. In theory it should be (1 << (src_type.width - 2)), but
       * IEEE 754 rules state INT_MIN should be returned by FPToSI, which is
       * the correct result for values near 1.0!
292 *
293 * This means we get (src_type.width - 1) correct bits for values near 0.0,
294 * and (mantissa + 1) correct bits for values near 1.0. Equally or more
295 * important, we also get exact results for 0.0 and 1.0.
296 */
297
298 unsigned n = MIN2(src_type.width - 1, dst_width);
299
300 double scale = (double)(1ULL << n);
301 unsigned lshift = dst_width - n;
302 unsigned rshift = n;
303 LLVMValueRef lshifted;
304 LLVMValueRef rshifted;
305
306 res = LLVMBuildFMul(builder, src,
307 lp_build_const_vec(gallivm, src_type, scale), "");
308 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
309
310 /*
311 * Align the most significant bit to its final place.
312 *
313 * This will cause 1.0 to overflow to 0, but the later adjustment will
314 * get it right.
315 */
316 if (lshift) {
317 lshifted = LLVMBuildShl(builder, res,
318 lp_build_const_int_vec(gallivm, src_type,
319 lshift), "");
320 } else {
321 lshifted = res;
322 }
323
324 /*
325 * Align the most significant bit to the right.
326 */
327 rshifted = LLVMBuildLShr(builder, res,
328 lp_build_const_int_vec(gallivm, src_type, rshift),
329 "");
330
331 /*
       * Subtract the MSB, shifted down to the LSB, thereby rescaling from
       * (1 << dst_width) to ((1 << dst_width) - 1).
334 */
335
336 res = LLVMBuildSub(builder, lshifted, rshifted, "");
337 }
338
339 return res;
340 }
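
/*
 * Purely illustrative scalar model of the "magic coefficient" path above,
 * kept under #if 0 so it is never built -- a sketch, not an existing helper.
 * Assuming dst_width = 8 and IEEE-754 binary32 (mantissa = 23 bits): scaling
 * by 255/256 and adding 2^(23 - 8) makes the FPU round the wanted 8 bits
 * straight into the least significant mantissa bits.
 */
#if 0
static uint32_t
clamped_float_to_unorm8_ref(float x)   /* x assumed already clamped to [0,1] */
{
   union { float f; uint32_t u; } bits;

   bits.f = x * (255.0f / 256.0f) + (float)(1 << (23 - 8));
   return bits.u & 0xff;
}
#endif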
341
342
343 /**
344 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
345 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
346 * return {float, float, float, float} with values in range [0, 1].
347 */
348 LLVMValueRef
349 lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
350 unsigned src_width,
351 struct lp_type dst_type,
352 LLVMValueRef src)
353 {
354 LLVMBuilderRef builder = gallivm->builder;
355 LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
356 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
357 LLVMValueRef bias_;
358 LLVMValueRef res;
359 unsigned mantissa;
360 unsigned n;
361 unsigned long long ubound;
362 unsigned long long mask;
363 double scale;
364 double bias;
365
366 assert(dst_type.floating);
367
368 mantissa = lp_mantissa(dst_type);
369
370 if (src_width <= (mantissa + 1)) {
371 /*
       * The source width fits what can be represented in floating
373 * point (i.e., mantissa + 1 bits). So do a straight multiplication
374 * followed by casting. No further rounding is necessary.
375 */
376
377 scale = 1.0/(double)((1ULL << src_width) - 1);
378 res = LLVMBuildSIToFP(builder, src, vec_type, "");
379 res = LLVMBuildFMul(builder, res,
380 lp_build_const_vec(gallivm, dst_type, scale), "");
381 return res;
382 }
383 else {
384 /*
385 * The source width exceeds what can be represented in floating
386 * point. So truncate the incoming values.
387 */
388
389 n = MIN2(mantissa, src_width);
390
391 ubound = ((unsigned long long)1 << n);
392 mask = ubound - 1;
393 scale = (double)ubound/mask;
394 bias = (double)((unsigned long long)1 << (mantissa - n));
395
396 res = src;
397
398 if (src_width > mantissa) {
399 int shift = src_width - mantissa;
400 res = LLVMBuildLShr(builder, res,
401 lp_build_const_int_vec(gallivm, dst_type, shift), "");
402 }
403
404 bias_ = lp_build_const_vec(gallivm, dst_type, bias);
405
406 res = LLVMBuildOr(builder,
407 res,
408 LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");
409
410 res = LLVMBuildBitCast(builder, res, vec_type, "");
411
412 res = LLVMBuildFSub(builder, res, bias_, "");
413 res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
414 }
415
416 return res;
417 }
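
/*
 * Purely illustrative scalar model of the wide-source path above (src_width
 * exceeding mantissa + 1), kept under #if 0 so it is never built -- a sketch,
 * not an existing helper.  Assuming src_width = 32 and IEEE-754 binary32:
 * the top 23 bits are spliced into the mantissa of 1.0f, giving a value in
 * [1.0, 2.0), which is then shifted back down and rescaled to [0.0, 1.0].
 */
#if 0
static float
unorm32_to_float_ref(uint32_t x)
{
   union { uint32_t u; float f; } bits;

   bits.u = (x >> 9) | 0x3f800000;                      /* 1.0 <= bits.f < 2.0 */
   return (bits.f - 1.0f) * (8388608.0f / 8388607.0f);  /* rescale to [0, 1] */
}
#endif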
418
419
420 /**
421 * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
422 *
423 * Returns the number of dsts created from src
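 *
 * For example (illustrative): eight float32x4 srcs destined for an 8-bit
 * unorm type on an SSE2-capable machine come back as two unorm8x16 dsts;
 * dst_type->length is rewritten to 16 and 2 is returned.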
424 */
425 int lp_build_conv_auto(struct gallivm_state *gallivm,
426 struct lp_type src_type,
427 struct lp_type* dst_type,
428 const LLVMValueRef *src,
429 unsigned num_srcs,
430 LLVMValueRef *dst)
431 {
432 int i;
433 int num_dsts = num_srcs;
434
435 if (src_type.floating == dst_type->floating &&
436 src_type.width == dst_type->width &&
437 src_type.length == dst_type->length &&
438 src_type.fixed == dst_type->fixed &&
439 src_type.norm == dst_type->norm &&
440 src_type.sign == dst_type->sign)
441 return num_dsts;
442
443 /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub
444 */
445 if (src_type.floating == 1 &&
446 src_type.fixed == 0 &&
447 src_type.sign == 1 &&
448 src_type.norm == 0 &&
449 src_type.width == 32 &&
450
451 dst_type->floating == 0 &&
452 dst_type->fixed == 0 &&
453 dst_type->sign == 0 &&
454 dst_type->norm == 1 &&
455 dst_type->width == 8)
456 {
457 /* Special case 4x4f --> 1x16ub */
458 if (src_type.length == 4 &&
459 util_cpu_caps.has_sse2)
460 {
461 num_dsts = (num_srcs + 3) / 4;
462 dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;
463
464 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
465 return num_dsts;
466 }
467
468 /* Special case 2x8f --> 1x16ub */
469 if (src_type.length == 8 &&
470 util_cpu_caps.has_avx)
471 {
472 num_dsts = (num_srcs + 1) / 2;
473 dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;
474
475 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
476 return num_dsts;
477 }
478 }
479
480 /* lp_build_resize does not support M:N */
481 if (src_type.width == dst_type->width) {
482 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
483 } else {
484 for (i = 0; i < num_srcs; ++i) {
485 lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1);
486 }
487 }
488
489 return num_dsts;
490 }
491
492
493 /**
494 * Generic type conversion.
495 *
496 * TODO: Take a precision argument, or even better, add a new precision member
497 * to the lp_type union.
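 *
 * A typical use (illustrative only; variable names are made up) is packing
 * four float32x4 color vectors into a single unorm8x16 vector:
 *
 *    struct lp_type f32x4 = lp_type_float_vec(32, 128);
 *    struct lp_type un8x16 = lp_type_int_vec(8, 128);
 *    un8x16.sign = FALSE;
 *    un8x16.norm = TRUE;
 *    LLVMValueRef dst;
 *    lp_build_conv(gallivm, f32x4, un8x16, src, 4, &dst, 1);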
498 */
499 void
500 lp_build_conv(struct gallivm_state *gallivm,
501 struct lp_type src_type,
502 struct lp_type dst_type,
503 const LLVMValueRef *src, unsigned num_srcs,
504 LLVMValueRef *dst, unsigned num_dsts)
505 {
506 LLVMBuilderRef builder = gallivm->builder;
507 struct lp_type tmp_type;
508 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
509 unsigned num_tmps;
510 unsigned i;
511
   /* We must not lose or gain channels, only precision */
513 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
514
515 assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
516 assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
517 assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
518 assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
519
520 tmp_type = src_type;
521 for(i = 0; i < num_srcs; ++i) {
522 assert(lp_check_value(src_type, src[i]));
523 tmp[i] = src[i];
524 }
525 num_tmps = num_srcs;
526
527
528 /* Special case 4x4f --> 1x16ub, 2x4f -> 1x8ub, 1x4f -> 1x4ub
529 */
530 if (src_type.floating == 1 &&
531 src_type.fixed == 0 &&
532 src_type.sign == 1 &&
533 src_type.norm == 0 &&
534 src_type.width == 32 &&
535 src_type.length == 4 &&
536
537 dst_type.floating == 0 &&
538 dst_type.fixed == 0 &&
539 dst_type.sign == 0 &&
540 dst_type.norm == 1 &&
541 dst_type.width == 8 &&
542
543 ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
544 (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
545
546 util_cpu_caps.has_sse2)
547 {
548 struct lp_build_context bld;
549 struct lp_type int16_type, int32_type;
550 struct lp_type dst_type_ext = dst_type;
551 LLVMValueRef const_255f;
552 unsigned i, j;
553
554 lp_build_context_init(&bld, gallivm, src_type);
555
556 dst_type_ext.length = 16;
557 int16_type = int32_type = dst_type_ext;
558
559 int16_type.width *= 2;
560 int16_type.length /= 2;
561 int16_type.sign = 1;
562
563 int32_type.width *= 4;
564 int32_type.length /= 4;
565 int32_type.sign = 1;
566
567 const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
568
569 for (i = 0; i < num_dsts; ++i, src += 4) {
570 LLVMValueRef lo, hi;
571
572 for (j = 0; j < dst_type.length / 4; ++j) {
573 tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
574 tmp[j] = lp_build_iround(&bld, tmp[j]);
575 }
576
577 if (num_srcs == 1) {
578 tmp[1] = tmp[0];
579 }
580
581 /* relying on clamping behavior of sse2 intrinsics here */
582 lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
583
584 if (num_srcs < 4) {
585 hi = lo;
586 }
587 else {
588 hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
589 }
590 dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
591 }
592 if (num_srcs < 4) {
593 dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
594 }
595
596 return;
597 }
598
   /* Special case 2x8f --> 1x16ub, 1x8f -> 1x8ub
600 */
601 else if (src_type.floating == 1 &&
602 src_type.fixed == 0 &&
603 src_type.sign == 1 &&
604 src_type.norm == 0 &&
605 src_type.width == 32 &&
606 src_type.length == 8 &&
607
608 dst_type.floating == 0 &&
609 dst_type.fixed == 0 &&
610 dst_type.sign == 0 &&
611 dst_type.norm == 1 &&
612 dst_type.width == 8 &&
613
614 ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
615 (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&
616
617 util_cpu_caps.has_avx) {
618
619 struct lp_build_context bld;
620 struct lp_type int16_type, int32_type;
621 struct lp_type dst_type_ext = dst_type;
622 LLVMValueRef const_255f;
623 unsigned i;
624
625 lp_build_context_init(&bld, gallivm, src_type);
626
627 dst_type_ext.length = 16;
628 int16_type = int32_type = dst_type_ext;
629
630 int16_type.width *= 2;
631 int16_type.length /= 2;
632 int16_type.sign = 1;
633
634 int32_type.width *= 4;
635 int32_type.length /= 4;
636 int32_type.sign = 1;
637
638 const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
639
640 for (i = 0; i < num_dsts; ++i, src += 2) {
641 LLVMValueRef lo, hi, a, b;
642
643 a = LLVMBuildFMul(builder, src[0], const_255f, "");
644 a = lp_build_iround(&bld, a);
645 tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
646 tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
647 /* relying on clamping behavior of sse2 intrinsics here */
648 lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
649
650 if (num_srcs == 1) {
651 hi = lo;
652 }
653 else {
654 b = LLVMBuildFMul(builder, src[1], const_255f, "");
655 b = lp_build_iround(&bld, b);
656 tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
657 tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
658 hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
659
660 }
661 dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
662 }
663
664 if (num_srcs == 1) {
665 dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
666 }
667
668 return;
669 }
670
671 /* Special case -> 16bit half-float
672 */
673 else if (dst_type.floating && dst_type.width == 16)
674 {
675 /* Only support src as 32bit float currently */
676 assert(src_type.floating && src_type.width == 32);
677
678 for(i = 0; i < num_tmps; ++i)
679 dst[i] = lp_build_float_to_half(gallivm, tmp[i]);
680
681 return;
682 }
683
   /* Pre-convert half-floats to floats
685 */
686 else if (src_type.floating && src_type.width == 16)
687 {
688 for(i = 0; i < num_tmps; ++i)
689 tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);
690
691 tmp_type.width = 32;
692 }
693
694 /*
695 * Clamp if necessary
696 */
697
698 if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
699 struct lp_build_context bld;
700 double src_min = lp_const_min(src_type);
701 double dst_min = lp_const_min(dst_type);
702 double src_max = lp_const_max(src_type);
703 double dst_max = lp_const_max(dst_type);
704 LLVMValueRef thres;
705
706 lp_build_context_init(&bld, gallivm, tmp_type);
707
708 if(src_min < dst_min) {
709 if(dst_min == 0.0)
710 thres = bld.zero;
711 else
712 thres = lp_build_const_vec(gallivm, src_type, dst_min);
713 for(i = 0; i < num_tmps; ++i)
714 tmp[i] = lp_build_max(&bld, tmp[i], thres);
715 }
716
717 if(src_max > dst_max) {
718 if(dst_max == 1.0)
719 thres = bld.one;
720 else
721 thres = lp_build_const_vec(gallivm, src_type, dst_max);
722 for(i = 0; i < num_tmps; ++i)
723 tmp[i] = lp_build_min(&bld, tmp[i], thres);
724 }
725 }
726
727 /*
728 * Scale to the narrowest range
729 */
730
731 if(dst_type.floating) {
732 /* Nothing to do */
733 }
734 else if(tmp_type.floating) {
735 if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
736 for(i = 0; i < num_tmps; ++i) {
737 tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
738 tmp_type,
739 dst_type.width,
740 tmp[i]);
741 }
742 tmp_type.floating = FALSE;
743 }
744 else {
745 double dst_scale = lp_const_scale(dst_type);
746
747 if (dst_scale != 1.0) {
748 LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
749 for(i = 0; i < num_tmps; ++i)
750 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
751 }
752
753 /*
          * These functions will use fptosi in some form, which won't work
          * with a 32-bit uint dst. Enabling this assert causes lp_test_conv
          * failures, though, hence it stays disabled.
756 */
757 if (0)
758 assert(dst_type.sign || dst_type.width < 32);
759
760 if (dst_type.sign && dst_type.norm && !dst_type.fixed) {
761 struct lp_build_context bld;
762
763 lp_build_context_init(&bld, gallivm, tmp_type);
764 for(i = 0; i < num_tmps; ++i) {
765 tmp[i] = lp_build_iround(&bld, tmp[i]);
766 }
767 tmp_type.floating = FALSE;
768 }
769 else {
770 LLVMTypeRef tmp_vec_type;
771
772 tmp_type.floating = FALSE;
773 tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
774 for(i = 0; i < num_tmps; ++i) {
775 #if 0
776 if(dst_type.sign)
777 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
778 else
779 tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
780 #else
781 /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
782 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
783 #endif
784 }
785 }
786 }
787 }
788 else {
789 unsigned src_shift = lp_const_shift(src_type);
790 unsigned dst_shift = lp_const_shift(dst_type);
791 unsigned src_offset = lp_const_offset(src_type);
792 unsigned dst_offset = lp_const_offset(dst_type);
793
794 /* Compensate for different offsets */
795 if (dst_offset > src_offset && src_type.width > dst_type.width) {
796 for (i = 0; i < num_tmps; ++i) {
797 LLVMValueRef shifted;
798 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - 1);
799 if(src_type.sign)
800 shifted = LLVMBuildAShr(builder, tmp[i], shift, "");
801 else
802 shifted = LLVMBuildLShr(builder, tmp[i], shift, "");
803
804 tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
805 }
806 }
807
808 if(src_shift > dst_shift) {
809 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
810 src_shift - dst_shift);
811 for(i = 0; i < num_tmps; ++i)
812 if(src_type.sign)
813 tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
814 else
815 tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
816 }
817 }
818
819 /*
820 * Truncate or expand bit width
821 *
822 * No data conversion should happen here, although the sign bits are
823 * crucial to avoid bad clamping.
824 */
825
826 {
827 struct lp_type new_type;
828
829 new_type = tmp_type;
830 new_type.sign = dst_type.sign;
831 new_type.width = dst_type.width;
832 new_type.length = dst_type.length;
833
834 lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);
835
836 tmp_type = new_type;
837 num_tmps = num_dsts;
838 }
839
840 /*
841 * Scale to the widest range
842 */
843
844 if(src_type.floating) {
845 /* Nothing to do */
846 }
847 else if(!src_type.floating && dst_type.floating) {
848 if(!src_type.fixed && !src_type.sign && src_type.norm) {
849 for(i = 0; i < num_tmps; ++i) {
850 tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
851 src_type.width,
852 dst_type,
853 tmp[i]);
854 }
855 tmp_type.floating = TRUE;
856 }
857 else {
858 double src_scale = lp_const_scale(src_type);
859 LLVMTypeRef tmp_vec_type;
860
         /* Use an equally sized signed float type for intermediate computations */
862 tmp_type.floating = TRUE;
863 tmp_type.sign = TRUE;
864 tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
865 for(i = 0; i < num_tmps; ++i) {
866 #if 0
867 if(dst_type.sign)
868 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
869 else
870 tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
871 #else
872 /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
873 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
874 #endif
875 }
876
877 if (src_scale != 1.0) {
878 LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
879 for(i = 0; i < num_tmps; ++i)
880 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
881 }
882
         /* The formula above will produce values below -1.0 for the most
          * negative values, but everything seems happy with that, hence this
          * is disabled for now. */
885 if (0 && !src_type.fixed && src_type.norm && src_type.sign) {
886 struct lp_build_context bld;
887
888 lp_build_context_init(&bld, gallivm, dst_type);
889 for(i = 0; i < num_tmps; ++i) {
890 tmp[i] = lp_build_max(&bld, tmp[i],
891 lp_build_const_vec(gallivm, dst_type, -1.0f));
892 }
893 }
894 }
895 }
896 else {
897 unsigned src_shift = lp_const_shift(src_type);
898 unsigned dst_shift = lp_const_shift(dst_type);
899 unsigned src_offset = lp_const_offset(src_type);
900 unsigned dst_offset = lp_const_offset(dst_type);
901
902 if (src_shift < dst_shift) {
903 LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
904 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, dst_shift - src_shift);
905
906 for (i = 0; i < num_tmps; ++i) {
907 pre_shift[i] = tmp[i];
908 tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
909 }
910
911 /* Compensate for different offsets */
912 if (dst_offset > src_offset) {
913 for (i = 0; i < num_tmps; ++i) {
914 tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
915 }
916 }
917 }
918 }
919
920 for(i = 0; i < num_dsts; ++i) {
921 dst[i] = tmp[i];
922 assert(lp_check_value(dst_type, dst[i]));
923 }
924 }
925
926
927 /**
928 * Bit mask conversion.
929 *
930 * This will convert the integer masks that match the given types.
931 *
 * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
933 * Any other value will likely cause unpredictable results.
934 *
935 * This is basically a very trimmed down version of lp_build_conv.
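 *
 * For example (illustrative): two int32x4 masks become one int16x8 mask; only
 * the bit width is resized, the 0 / -1 values pass through unchanged.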
936 */
937 void
938 lp_build_conv_mask(struct gallivm_state *gallivm,
939 struct lp_type src_type,
940 struct lp_type dst_type,
941 const LLVMValueRef *src, unsigned num_srcs,
942 LLVMValueRef *dst, unsigned num_dsts)
943 {
944
   /* We must not lose or gain channels, only precision */
946 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
947
948 /*
    * Drop the interpretation flags and treat both types as plain signed
    * integers.
    *
    * We assume all values are 0 or -1.
952 */
953
954 src_type.floating = FALSE;
955 src_type.fixed = FALSE;
956 src_type.sign = TRUE;
957 src_type.norm = FALSE;
958
959 dst_type.floating = FALSE;
960 dst_type.fixed = FALSE;
961 dst_type.sign = TRUE;
962 dst_type.norm = FALSE;
963
964 /*
965 * Truncate or expand bit width
966 */
967
968 lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
969 }