mesa.git: src/gallium/auxiliary/gallivm/lp_bld_conv.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper functions for type conversions.
32 *
33 * We want to use the fastest type for a given computation whenever feasible.
 34  * The other side of this is that we need to be able to convert between several
35 * types accurately and efficiently.
36 *
 37  * Conversion between types of different bit widths is quite complex.
 38  *
 39  * To remember, there are a few invariants in type conversions (a worked
 40  * example follows this comment):
41 * - register width must remain constant:
42 *
43 * src_type.width * src_type.length == dst_type.width * dst_type.length
44 *
45 * - total number of elements must remain constant:
46 *
47 * src_type.length * num_srcs == dst_type.length * num_dsts
48 *
49 * It is not always possible to do the conversion both accurately and
50 * efficiently, usually due to lack of adequate machine instructions. In these
 51  * cases it is important not to take shortcuts and sacrifice accuracy, as
 52  * these functions can be used anywhere. In the future we might have a
 53  * precision parameter which can gauge the accuracy vs efficiency compromise,
 54  * but for now, if the data conversion between two stages happens to be the
 55  * bottleneck, it is most likely best to avoid converting at all and run
56 * both stages with the same type.
57 *
58 * Make sure to run lp_test_conv unit test after any change to this file.
59 *
60 * @author Jose Fonseca <jfonseca@vmware.com>
61 */
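/*
 * A minimal worked example of the invariants above (illustrative only, never
 * compiled; the lp_type_* helpers are assumed from lp_bld_type.h): four
 * <4 x float> sources converted into one <16 x unorm8> destination keep both
 * the register width and the total element count constant.
 */
#if 0
static void
lp_conv_invariants_example(void)
{
   struct lp_type src_type = lp_type_float_vec(32, 128); /*  4 x f32, 128 bits */
   struct lp_type dst_type = lp_type_unorm(8, 128);      /* 16 x u8,  128 bits */
   unsigned num_srcs = 4, num_dsts = 1;

   /* register width: 32 * 4 == 8 * 16 == 128 */
   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
   /* total elements: 4 * 4 == 16 * 1 == 16 */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
}
#endif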
62
63
64 #include "util/u_debug.h"
65 #include "util/u_math.h"
66 #include "util/u_half.h"
67 #include "util/u_cpu_detect.h"
68
69 #include "lp_bld_type.h"
70 #include "lp_bld_const.h"
71 #include "lp_bld_arit.h"
72 #include "lp_bld_bitarit.h"
73 #include "lp_bld_pack.h"
74 #include "lp_bld_conv.h"
75 #include "lp_bld_logic.h"
76 #include "lp_bld_intr.h"
77 #include "lp_bld_printf.h"
78 #include "lp_bld_format.h"
79
80
81
82 /**
83 * Converts int16 half-float to float32
84 * Note this can be performed in 1 instruction if vcvtph2ps exists (f16c/cvt16)
85 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
86 *
87 * @param src value to convert
88 *
89 */
90 LLVMValueRef
91 lp_build_half_to_float(struct gallivm_state *gallivm,
92 LLVMValueRef src)
93 {
94 LLVMBuilderRef builder = gallivm->builder;
95 LLVMTypeRef src_type = LLVMTypeOf(src);
96 unsigned src_length = LLVMGetTypeKind(src_type) == LLVMVectorTypeKind ?
97 LLVMGetVectorSize(src_type) : 1;
98
99 struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
100 struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
101 LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
102 LLVMValueRef h;
103
104 if (util_cpu_caps.has_f16c &&
105 (src_length == 4 || src_length == 8)) {
106 const char *intrinsic = NULL;
107 if (src_length == 4) {
108 src = lp_build_pad_vector(gallivm, src, 8);
109 intrinsic = "llvm.x86.vcvtph2ps.128";
110 }
111 else {
112 intrinsic = "llvm.x86.vcvtph2ps.256";
113 }
114 return lp_build_intrinsic_unary(builder, intrinsic,
115 lp_build_vec_type(gallivm, f32_type), src);
116 }
117
118 /* Convert int16 vector to int32 vector by zero ext (might generate bad code) */
119 h = LLVMBuildZExt(builder, src, int_vec_type, "");
120 return lp_build_smallfloat_to_float(gallivm, f32_type, h, 10, 5, 0, true);
121 }
122
123
124 /**
125 * Converts float32 to int16 half-float
126 * Note this can be performed in 1 instruction if vcvtps2ph exists (f16c/cvt16)
127 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
128 *
129 * @param src value to convert
130 *
131 * Convert float32 to half floats, preserving Infs and NaNs,
132 * with rounding towards zero (trunc).
133 */
134 LLVMValueRef
135 lp_build_float_to_half(struct gallivm_state *gallivm,
136 LLVMValueRef src)
137 {
138 LLVMBuilderRef builder = gallivm->builder;
139 LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
140 unsigned length = LLVMGetTypeKind(f32_vec_type) == LLVMVectorTypeKind
141 ? LLVMGetVectorSize(f32_vec_type) : 1;
142 struct lp_type i32_type = lp_type_int_vec(32, 32 * length);
143 struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
144 LLVMValueRef result;
145
146 if (util_cpu_caps.has_f16c &&
147 (length == 4 || length == 8)) {
148 struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
149 unsigned mode = 3; /* same as LP_BUILD_ROUND_TRUNCATE */
150 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
151 const char *intrinsic = NULL;
152 if (length == 4) {
153 intrinsic = "llvm.x86.vcvtps2ph.128";
154 }
155 else {
156 intrinsic = "llvm.x86.vcvtps2ph.256";
157 }
158 result = lp_build_intrinsic_binary(builder, intrinsic,
159 lp_build_vec_type(gallivm, i168_type),
160 src, LLVMConstInt(i32t, mode, 0));
161 if (length == 4) {
162 result = lp_build_extract_range(gallivm, result, 0, 4);
163 }
164 }
165
166 else {
167 result = lp_build_float_to_smallfloat(gallivm, i32_type, src, 10, 5, 0, true);
168 /* Convert int32 vector to int16 vector by trunc (might generate bad code) */
169 result = LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i16_type), "");
170 }
171
172 /*
173 * Debugging code.
174 */
175 if (0) {
176 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
177 LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
178 LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
179 LLVMValueRef ref_result = LLVMGetUndef(LLVMVectorType(i16t, length));
180 unsigned i;
181
182 LLVMTypeRef func_type = LLVMFunctionType(i16t, &f32t, 1, 0);
183 LLVMValueRef func = lp_build_const_int_pointer(gallivm, func_to_pointer((func_pointer)util_float_to_half));
184 func = LLVMBuildBitCast(builder, func, LLVMPointerType(func_type, 0), "util_float_to_half");
185
186 for (i = 0; i < length; ++i) {
187 LLVMValueRef index = LLVMConstInt(i32t, i, 0);
188 LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
189 #if 0
190 /* XXX: not really supported by backends */
191 LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
192 #else
193 LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
194 #endif
195 ref_result = LLVMBuildInsertElement(builder, ref_result, f16, index, "");
196 }
197
198 lp_build_print_value(gallivm, "src = ", src);
199 lp_build_print_value(gallivm, "llvm = ", result);
200 lp_build_print_value(gallivm, "util = ", ref_result);
201 lp_build_printf(gallivm, "\n");
202 }
203
204 return result;
205 }
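/*
 * Hypothetical usage sketch (illustrative only, not part of this file):
 * round-trip an <8 x float> value through half-float precision.  `gallivm`
 * and `src` are assumed to be provided by the surrounding code.
 */
#if 0
   LLVMValueRef halfs  = lp_build_float_to_half(gallivm, src);   /* <8 x i16>   */
   LLVMValueRef floats = lp_build_half_to_float(gallivm, halfs); /* <8 x float> */
#endif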
206
207
208 /**
209 * Special case for converting clamped IEEE-754 floats to unsigned norms.
210 *
211 * The mathematical voodoo below may seem excessive but it is actually
 212  * paramount we do it this way for several reasons. First, there is no single
 213  * precision FP to unsigned integer conversion instruction in Intel SSE.
 214  * Second, even if there were, since the FP mantissa takes only a fraction
 215  * of the register bits, the typical scale-and-cast approach would require
 216  * double precision for accurate results, and therefore half the throughput.
 217  *
 218  * Although the result values can be scaled to an arbitrary bit width specified
 219  * by dst_width, the actual result type will have the same width as the source.
220 *
221 * Ex: src = { float, float, float, float }
222 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
223 */
224 LLVMValueRef
225 lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
226 struct lp_type src_type,
227 unsigned dst_width,
228 LLVMValueRef src)
229 {
230 LLVMBuilderRef builder = gallivm->builder;
231 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
232 LLVMValueRef res;
233 unsigned mantissa;
234
235 assert(src_type.floating);
236 assert(dst_width <= src_type.width);
237 src_type.sign = FALSE;
238
239 mantissa = lp_mantissa(src_type);
240
241 if (dst_width <= mantissa) {
242 /*
 243        * Apply magic coefficients that will make the desired result appear
 244        * in the least significant bits of the mantissa, with correct rounding.
245 *
246 * This only works if the destination width fits in the mantissa.
247 */
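#if 0
      /*
       * Scalar sketch of the trick below (illustrative only), assuming an
       * IEEE-754 float32 source (mantissa == 23) and dst_width == 8:
       * after x * (255/256) + 2^(23 - 8) the sum lies in [32768.0, 32769.0),
       * where one mantissa ULP is exactly 1/256, so the float add rounds
       * x * 255 to the nearest integer and leaves it in the 8 least
       * significant mantissa bits.
       */
      float x = 0.75f;                          /* any value in [0.0, 1.0]     */
      union { float f; unsigned u; } v;
      v.f = x * (255.0f / 256.0f) + 32768.0f;   /* scale, then add magic bias  */
      unsigned unorm8 = v.u & 0xff;             /* == 191 == round(0.75 * 255) */
#endif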
248
249 unsigned long long ubound;
250 unsigned long long mask;
251 double scale;
252 double bias;
253
254 ubound = (1ULL << dst_width);
255 mask = ubound - 1;
256 scale = (double)mask/ubound;
257 bias = (double)(1ULL << (mantissa - dst_width));
258
259 res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
260 /* instead of fadd/and could (with sse2) just use lp_build_iround */
261 res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
262 res = LLVMBuildBitCast(builder, res, int_vec_type, "");
263 res = LLVMBuildAnd(builder, res,
264 lp_build_const_int_vec(gallivm, src_type, mask), "");
265 }
266 else if (dst_width == (mantissa + 1)) {
267 /*
268 * The destination width matches exactly what can be represented in
 269        * floating point (i.e., mantissa + 1 bits). Even so, correct rounding
 270        * still needs to be applied (only for numbers in [0.5, 1.0] would
271 * conversion using truncation after scaling be sufficient).
272 */
273 double scale;
274 struct lp_build_context uf32_bld;
275
276 lp_build_context_init(&uf32_bld, gallivm, src_type);
277 scale = (double)((1ULL << dst_width) - 1);
278
279 res = LLVMBuildFMul(builder, src,
280 lp_build_const_vec(gallivm, src_type, scale), "");
281 res = lp_build_iround(&uf32_bld, res);
282 }
283 else {
284 /*
 285        * The destination exceeds what can be represented in floating point.
 286        * So multiply by the largest power of two we can get away with, and then
 287        * subtract the most significant bit to rescale to normalized values.
 288        *
 289        * The largest power of two factor we can get away with is
 290        * (1 << (src_type.width - 1)), because we need to use a signed conversion.
 291        * In theory it should be (1 << (src_type.width - 2)), but the IEEE 754 rules
 292        * state that INT_MIN should be returned by FPToSI, which is the correct
 293        * result for values near 1.0!
294 *
295 * This means we get (src_type.width - 1) correct bits for values near 0.0,
296 * and (mantissa + 1) correct bits for values near 1.0. Equally or more
297 * important, we also get exact results for 0.0 and 1.0.
298 */
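#if 0
      /*
       * Scalar sketch (illustrative only) for a float32 source and
       * dst_width == 32, i.e. n == 31, lshift == 1, rshift == 31.  Like the
       * vector code it relies on the conversion returning 0x80000000 for
       * out-of-range inputs (the x86 cvttps2dq behaviour):
       *
       *   x == 1.0:  si == 0x80000000, (si << 1) - (si >> 31) == 0 - 1
       *              == 0xffffffff, the exact unorm32 for 1.0
       *   x == 0.0:  0 - 0 == 0
       */
      float x = 1.0f;                                /* any value in [0.0, 1.0] */
      int si            = (int)(x * 2147483648.0f);  /* x * 2^31, signed        */
      unsigned msb_up   = (unsigned)si << 1;         /* align MSB, 1.0 wraps    */
      unsigned msb_down = (unsigned)si >> 31;        /* MSB moved to the LSB    */
      unsigned unorm32  = msb_up - msb_down;         /* rescale to 2^32 - 1     */
#endif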
299
300 unsigned n = MIN2(src_type.width - 1, dst_width);
301
302 double scale = (double)(1ULL << n);
303 unsigned lshift = dst_width - n;
304 unsigned rshift = n;
305 LLVMValueRef lshifted;
306 LLVMValueRef rshifted;
307
308 res = LLVMBuildFMul(builder, src,
309 lp_build_const_vec(gallivm, src_type, scale), "");
310 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
311
312 /*
313 * Align the most significant bit to its final place.
314 *
315 * This will cause 1.0 to overflow to 0, but the later adjustment will
316 * get it right.
317 */
318 if (lshift) {
319 lshifted = LLVMBuildShl(builder, res,
320 lp_build_const_int_vec(gallivm, src_type,
321 lshift), "");
322 } else {
323 lshifted = res;
324 }
325
326 /*
327 * Align the most significant bit to the right.
328 */
329 rshifted = LLVMBuildLShr(builder, res,
330 lp_build_const_int_vec(gallivm, src_type, rshift),
331 "");
332
333 /*
 334        * Subtract the right-shifted MSB from the left-shifted value, thereby
 335        * re-scaling from (1 << dst_width) to ((1 << dst_width) - 1).
336 */
337
338 res = LLVMBuildSub(builder, lshifted, rshifted, "");
339 }
340
341 return res;
342 }
343
344
345 /**
346 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
347 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
348 * return {float, float, float, float} with values in range [0, 1].
349 */
350 LLVMValueRef
351 lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
352 unsigned src_width,
353 struct lp_type dst_type,
354 LLVMValueRef src)
355 {
356 LLVMBuilderRef builder = gallivm->builder;
357 LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
358 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
359 LLVMValueRef bias_;
360 LLVMValueRef res;
361 unsigned mantissa;
362 unsigned n;
363 unsigned long long ubound;
364 unsigned long long mask;
365 double scale;
366 double bias;
367
368 assert(dst_type.floating);
369
370 mantissa = lp_mantissa(dst_type);
371
372 if (src_width <= (mantissa + 1)) {
373 /*
 374        * The source width fits within what can be represented in floating
 375        * point (i.e., mantissa + 1 bits). So do a straight cast followed by
 376        * a scaling multiplication. No further rounding is necessary.
377 */
378
379 scale = 1.0/(double)((1ULL << src_width) - 1);
380 res = LLVMBuildSIToFP(builder, src, vec_type, "");
381 res = LLVMBuildFMul(builder, res,
382 lp_build_const_vec(gallivm, dst_type, scale), "");
383 return res;
384 }
385 else {
386 /*
387 * The source width exceeds what can be represented in floating
388 * point. So truncate the incoming values.
389 */
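#if 0
      /*
       * Scalar sketch (illustrative only) for a 32-bit unorm source and a
       * float32 destination (mantissa == 23), i.e. n == 23, shift == 9,
       * bias == 1.0 (bit pattern 0x3f800000) and scale == 2^23 / (2^23 - 1):
       * the 23 most significant source bits are OR-ed into the mantissa of
       * 1.0f, giving 1.0 + frac, then the bias is subtracted and the result
       * rescaled, yielding approximately x / (2^32 - 1).
       */
      unsigned x = 0xffffffffu;                    /* any unorm32 value     */
      union { float f; unsigned u; } v;
      v.u = (x >> 9) | 0x3f800000u;                /* truncate, OR in bias  */
      float result = (v.f - 1.0f) *
                     (8388608.0f / 8388607.0f);    /* x == ~0u gives 1.0f   */
#endif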
390
391 n = MIN2(mantissa, src_width);
392
393 ubound = ((unsigned long long)1 << n);
394 mask = ubound - 1;
395 scale = (double)ubound/mask;
396 bias = (double)((unsigned long long)1 << (mantissa - n));
397
398 res = src;
399
400 if (src_width > mantissa) {
401 int shift = src_width - mantissa;
402 res = LLVMBuildLShr(builder, res,
403 lp_build_const_int_vec(gallivm, dst_type, shift), "");
404 }
405
406 bias_ = lp_build_const_vec(gallivm, dst_type, bias);
407
408 res = LLVMBuildOr(builder,
409 res,
410 LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");
411
412 res = LLVMBuildBitCast(builder, res, vec_type, "");
413
414 res = LLVMBuildFSub(builder, res, bias_, "");
415 res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
416 }
417
418 return res;
419 }
420
421
422 /**
423 * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
424 *
425 * Returns the number of dsts created from src
426 */
427 int lp_build_conv_auto(struct gallivm_state *gallivm,
428 struct lp_type src_type,
429 struct lp_type* dst_type,
430 const LLVMValueRef *src,
431 unsigned num_srcs,
432 LLVMValueRef *dst)
433 {
434 int i;
435 int num_dsts = num_srcs;
436
437 if (src_type.floating == dst_type->floating &&
438 src_type.width == dst_type->width &&
439 src_type.length == dst_type->length &&
440 src_type.fixed == dst_type->fixed &&
441 src_type.norm == dst_type->norm &&
442 src_type.sign == dst_type->sign)
443 return num_dsts;
444
445 /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub
446 */
447 if (src_type.floating == 1 &&
448 src_type.fixed == 0 &&
449 src_type.sign == 1 &&
450 src_type.norm == 0 &&
451 src_type.width == 32 &&
452
453 dst_type->floating == 0 &&
454 dst_type->fixed == 0 &&
455 dst_type->sign == 0 &&
456 dst_type->norm == 1 &&
457 dst_type->width == 8)
458 {
459 /* Special case 4x4f --> 1x16ub */
460 if (src_type.length == 4 &&
461 (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
462 {
463 num_dsts = (num_srcs + 3) / 4;
464 dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;
465
466 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
467 return num_dsts;
468 }
469
470 /* Special case 2x8f --> 1x16ub */
471 if (src_type.length == 8 &&
472 util_cpu_caps.has_avx)
473 {
474 num_dsts = (num_srcs + 1) / 2;
475 dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;
476
477 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
478 return num_dsts;
479 }
480 }
481
482 /* lp_build_resize does not support M:N */
483 if (src_type.width == dst_type->width) {
484 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
485 } else {
486 for (i = 0; i < num_srcs; ++i) {
487 lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1);
488 }
489 }
490
491 return num_dsts;
492 }
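/*
 * Hypothetical usage sketch (illustrative only, not part of this file):
 * convert four <4 x float> vectors to 8-bit unorm; on SSE2/AltiVec
 * lp_build_conv_auto will merge them into a single <16 x i8> destination.
 * `gallivm` and `src[4]` are assumed to come from the surrounding code, and
 * lp_type_unorm() is assumed from lp_bld_type.h.
 */
#if 0
   struct lp_type src_type = lp_type_float_vec(32, 128);  /* 4 x f32          */
   struct lp_type dst_type = lp_type_unorm(8, 32);        /* starts as 4 x u8 */
   LLVMValueRef dst[4];
   int num_dsts = lp_build_conv_auto(gallivm, src_type, &dst_type,
                                     src, 4, dst);
   /* with SSE2: num_dsts == 1 and dst_type.length == 16 */
#endif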
493
494
495 /**
496 * Generic type conversion.
497 *
498 * TODO: Take a precision argument, or even better, add a new precision member
499 * to the lp_type union.
500 */
501 void
502 lp_build_conv(struct gallivm_state *gallivm,
503 struct lp_type src_type,
504 struct lp_type dst_type,
505 const LLVMValueRef *src, unsigned num_srcs,
506 LLVMValueRef *dst, unsigned num_dsts)
507 {
508 LLVMBuilderRef builder = gallivm->builder;
509 struct lp_type tmp_type;
510 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
511 unsigned num_tmps;
512 unsigned i;
513
 514    /* We must not lose or gain channels, only precision */
515 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
516
517 assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
518 assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
519 assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
520 assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
521
522 tmp_type = src_type;
523 for(i = 0; i < num_srcs; ++i) {
524 assert(lp_check_value(src_type, src[i]));
525 tmp[i] = src[i];
526 }
527 num_tmps = num_srcs;
528
529
530 /* Special case 4x4f --> 1x16ub, 2x4f -> 1x8ub, 1x4f -> 1x4ub
531 */
532 if (src_type.floating == 1 &&
533 src_type.fixed == 0 &&
534 src_type.sign == 1 &&
535 src_type.norm == 0 &&
536 src_type.width == 32 &&
537 src_type.length == 4 &&
538
539 dst_type.floating == 0 &&
540 dst_type.fixed == 0 &&
541 dst_type.sign == 0 &&
542 dst_type.norm == 1 &&
543 dst_type.width == 8 &&
544
545 ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
546 (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
547
548 (util_cpu_caps.has_sse2 || util_cpu_caps.has_altivec))
549 {
550 struct lp_build_context bld;
551 struct lp_type int16_type, int32_type;
552 struct lp_type dst_type_ext = dst_type;
553 LLVMValueRef const_255f;
554 unsigned i, j;
555
556 lp_build_context_init(&bld, gallivm, src_type);
557
558 dst_type_ext.length = 16;
559 int16_type = int32_type = dst_type_ext;
560
561 int16_type.width *= 2;
562 int16_type.length /= 2;
563 int16_type.sign = 1;
564
565 int32_type.width *= 4;
566 int32_type.length /= 4;
567 int32_type.sign = 1;
568
569 const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
570
571 for (i = 0; i < num_dsts; ++i, src += 4) {
572 LLVMValueRef lo, hi;
573
574 for (j = 0; j < dst_type.length / 4; ++j) {
575 tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
576 tmp[j] = lp_build_iround(&bld, tmp[j]);
577 }
578
579 if (num_srcs == 1) {
580 tmp[1] = tmp[0];
581 }
582
583 /* relying on clamping behavior of sse2 intrinsics here */
584 lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
585
586 if (num_srcs < 4) {
587 hi = lo;
588 }
589 else {
590 hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
591 }
592 dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
593 }
594 if (num_srcs < 4) {
595 dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
596 }
597
598 return;
599 }
600
 601    /* Special case 2x8f --> 1x16ub, 1x8f -> 1x8ub
602 */
603 else if (src_type.floating == 1 &&
604 src_type.fixed == 0 &&
605 src_type.sign == 1 &&
606 src_type.norm == 0 &&
607 src_type.width == 32 &&
608 src_type.length == 8 &&
609
610 dst_type.floating == 0 &&
611 dst_type.fixed == 0 &&
612 dst_type.sign == 0 &&
613 dst_type.norm == 1 &&
614 dst_type.width == 8 &&
615
616 ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
617 (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&
618
619 util_cpu_caps.has_avx) {
620
621 struct lp_build_context bld;
622 struct lp_type int16_type, int32_type;
623 struct lp_type dst_type_ext = dst_type;
624 LLVMValueRef const_255f;
625 unsigned i;
626
627 lp_build_context_init(&bld, gallivm, src_type);
628
629 dst_type_ext.length = 16;
630 int16_type = int32_type = dst_type_ext;
631
632 int16_type.width *= 2;
633 int16_type.length /= 2;
634 int16_type.sign = 1;
635
636 int32_type.width *= 4;
637 int32_type.length /= 4;
638 int32_type.sign = 1;
639
640 const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
641
642 for (i = 0; i < num_dsts; ++i, src += 2) {
643 LLVMValueRef lo, hi, a, b;
644
645 a = LLVMBuildFMul(builder, src[0], const_255f, "");
646 a = lp_build_iround(&bld, a);
647 tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
648 tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
649 /* relying on clamping behavior of sse2 intrinsics here */
650 lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
651
652 if (num_srcs == 1) {
653 hi = lo;
654 }
655 else {
656 b = LLVMBuildFMul(builder, src[1], const_255f, "");
657 b = lp_build_iround(&bld, b);
658 tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
659 tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
660 hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
661
662 }
663 dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
664 }
665
666 if (num_srcs == 1) {
667 dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
668 }
669
670 return;
671 }
672
673 /* Special case -> 16bit half-float
674 */
675 else if (dst_type.floating && dst_type.width == 16)
676 {
677 /* Only support src as 32bit float currently */
678 assert(src_type.floating && src_type.width == 32);
679
680 for(i = 0; i < num_tmps; ++i)
681 dst[i] = lp_build_float_to_half(gallivm, tmp[i]);
682
683 return;
684 }
685
 686    /* Pre-convert half-floats to floats
687 */
688 else if (src_type.floating && src_type.width == 16)
689 {
690 for(i = 0; i < num_tmps; ++i)
691 tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);
692
693 tmp_type.width = 32;
694 }
695
696 /*
697 * Clamp if necessary
698 */
699
700 if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
701 struct lp_build_context bld;
702 double src_min = lp_const_min(src_type);
703 double dst_min = lp_const_min(dst_type);
704 double src_max = lp_const_max(src_type);
705 double dst_max = lp_const_max(dst_type);
706 LLVMValueRef thres;
707
708 lp_build_context_init(&bld, gallivm, tmp_type);
709
710 if(src_min < dst_min) {
711 if(dst_min == 0.0)
712 thres = bld.zero;
713 else
714 thres = lp_build_const_vec(gallivm, src_type, dst_min);
715 for(i = 0; i < num_tmps; ++i)
716 tmp[i] = lp_build_max(&bld, tmp[i], thres);
717 }
718
719 if(src_max > dst_max) {
720 if(dst_max == 1.0)
721 thres = bld.one;
722 else
723 thres = lp_build_const_vec(gallivm, src_type, dst_max);
724 for(i = 0; i < num_tmps; ++i)
725 tmp[i] = lp_build_min(&bld, tmp[i], thres);
726 }
727 }
728
729 /*
730 * Scale to the narrowest range
731 */
732
733 if(dst_type.floating) {
734 /* Nothing to do */
735 }
736 else if(tmp_type.floating) {
737 if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
738 for(i = 0; i < num_tmps; ++i) {
739 tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
740 tmp_type,
741 dst_type.width,
742 tmp[i]);
743 }
744 tmp_type.floating = FALSE;
745 }
746 else {
747 double dst_scale = lp_const_scale(dst_type);
748
749 if (dst_scale != 1.0) {
750 LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
751 for(i = 0; i < num_tmps; ++i)
752 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
753 }
754
755 /*
 756           * These functions will use fptosi in some form, which won't work with
 757           * a 32-bit uint dst. Enabling the assert below causes lp_test_conv failures, though.
758 */
759 if (0)
760 assert(dst_type.sign || dst_type.width < 32);
761
762 if (dst_type.sign && dst_type.norm && !dst_type.fixed) {
763 struct lp_build_context bld;
764
765 lp_build_context_init(&bld, gallivm, tmp_type);
766 for(i = 0; i < num_tmps; ++i) {
767 tmp[i] = lp_build_iround(&bld, tmp[i]);
768 }
769 tmp_type.floating = FALSE;
770 }
771 else {
772 LLVMTypeRef tmp_vec_type;
773
774 tmp_type.floating = FALSE;
775 tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
776 for(i = 0; i < num_tmps; ++i) {
777 #if 0
778 if(dst_type.sign)
779 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
780 else
781 tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
782 #else
783 /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
784 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
785 #endif
786 }
787 }
788 }
789 }
790 else {
791 unsigned src_shift = lp_const_shift(src_type);
792 unsigned dst_shift = lp_const_shift(dst_type);
793 unsigned src_offset = lp_const_offset(src_type);
794 unsigned dst_offset = lp_const_offset(dst_type);
795 struct lp_build_context bld;
796 lp_build_context_init(&bld, gallivm, tmp_type);
797
798 /* Compensate for different offsets */
799 /* sscaled -> unorm and similar would cause negative shift count, skip */
800 if (dst_offset > src_offset && src_type.width > dst_type.width && src_shift > 0) {
801 for (i = 0; i < num_tmps; ++i) {
802 LLVMValueRef shifted;
803
804 shifted = lp_build_shr_imm(&bld, tmp[i], src_shift - 1);
805 tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
806 }
807 }
808
809 if(src_shift > dst_shift) {
810 for(i = 0; i < num_tmps; ++i)
811 tmp[i] = lp_build_shr_imm(&bld, tmp[i], src_shift - dst_shift);
812 }
813 }
814
815 /*
816 * Truncate or expand bit width
817 *
818 * No data conversion should happen here, although the sign bits are
819 * crucial to avoid bad clamping.
820 */
821
822 {
823 struct lp_type new_type;
824
825 new_type = tmp_type;
826 new_type.sign = dst_type.sign;
827 new_type.width = dst_type.width;
828 new_type.length = dst_type.length;
829
830 lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);
831
832 tmp_type = new_type;
833 num_tmps = num_dsts;
834 }
835
836 /*
837 * Scale to the widest range
838 */
839
840 if(src_type.floating) {
841 /* Nothing to do */
842 }
843 else if(!src_type.floating && dst_type.floating) {
844 if(!src_type.fixed && !src_type.sign && src_type.norm) {
845 for(i = 0; i < num_tmps; ++i) {
846 tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
847 src_type.width,
848 dst_type,
849 tmp[i]);
850 }
851 tmp_type.floating = TRUE;
852 }
853 else {
854 double src_scale = lp_const_scale(src_type);
855 LLVMTypeRef tmp_vec_type;
856
 857          /* Use an equally sized float type for intermediate computations */
858 tmp_type.floating = TRUE;
859 tmp_type.sign = TRUE;
860 tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
861 for(i = 0; i < num_tmps; ++i) {
862 #if 0
863 if(dst_type.sign)
864 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
865 else
866 tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
867 #else
868 /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
869 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
870 #endif
871 }
872
873 if (src_scale != 1.0) {
874 LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
875 for(i = 0; i < num_tmps; ++i)
876 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
877 }
878
 879          /* The formula above will produce values below -1.0 for the most negative
 880           * input, but everything seems happy with that, hence this is disabled for now. */
881 if (0 && !src_type.fixed && src_type.norm && src_type.sign) {
882 struct lp_build_context bld;
883
884 lp_build_context_init(&bld, gallivm, dst_type);
885 for(i = 0; i < num_tmps; ++i) {
886 tmp[i] = lp_build_max(&bld, tmp[i],
887 lp_build_const_vec(gallivm, dst_type, -1.0f));
888 }
889 }
890 }
891 }
892 else {
893 unsigned src_shift = lp_const_shift(src_type);
894 unsigned dst_shift = lp_const_shift(dst_type);
895 unsigned src_offset = lp_const_offset(src_type);
896 unsigned dst_offset = lp_const_offset(dst_type);
897 struct lp_build_context bld;
898 lp_build_context_init(&bld, gallivm, tmp_type);
899
900 if (src_shift < dst_shift) {
901 LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
902
903 if (dst_shift - src_shift < dst_type.width) {
904 for (i = 0; i < num_tmps; ++i) {
905 pre_shift[i] = tmp[i];
906 tmp[i] = lp_build_shl_imm(&bld, tmp[i], dst_shift - src_shift);
907 }
908 }
909 else {
910 /*
911 * This happens for things like sscaled -> unorm conversions. Shift
912 * counts equal to bit width cause undefined results, so hack around it.
913 */
914 for (i = 0; i < num_tmps; ++i) {
915 pre_shift[i] = tmp[i];
916 tmp[i] = lp_build_zero(gallivm, dst_type);
917 }
918 }
919
920 /* Compensate for different offsets */
921 if (dst_offset > src_offset) {
922 for (i = 0; i < num_tmps; ++i) {
923 tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
924 }
925 }
926 }
927 }
928
929 for(i = 0; i < num_dsts; ++i) {
930 dst[i] = tmp[i];
931 assert(lp_check_value(dst_type, dst[i]));
932 }
933 }
934
935
936 /**
937 * Bit mask conversion.
938 *
939 * This will convert the integer masks that match the given types.
940 *
 941  * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
942 * Any other value will likely cause unpredictable results.
943 *
944 * This is basically a very trimmed down version of lp_build_conv.
945 */
946 void
947 lp_build_conv_mask(struct gallivm_state *gallivm,
948 struct lp_type src_type,
949 struct lp_type dst_type,
950 const LLVMValueRef *src, unsigned num_srcs,
951 LLVMValueRef *dst, unsigned num_dsts)
952 {
953
 954    /* We must not lose or gain channels, only precision */
955 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
956
 957    /*
 958     * Drop the floating, fixed and norm flags; treat both types as plain
 959     * signed integers.
 960     * We assume all values are 0 or -1.
 961     */
962
963 src_type.floating = FALSE;
964 src_type.fixed = FALSE;
965 src_type.sign = TRUE;
966 src_type.norm = FALSE;
967
968 dst_type.floating = FALSE;
969 dst_type.fixed = FALSE;
970 dst_type.sign = TRUE;
971 dst_type.norm = FALSE;
972
973 /*
974 * Truncate or expand bit width
975 */
976
977 lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
978 }
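/*
 * Hypothetical usage sketch (illustrative only, not part of this file):
 * narrow four <4 x i32> masks (each lane 0 or ~0) into a single <16 x i8>
 * mask, e.g. to match an 8-bit unorm color type.  `gallivm` and `mask[4]`
 * are assumed to come from the surrounding code.
 */
#if 0
   struct lp_type mask32_type = lp_type_int_vec(32, 128);  /*  4 x i32 */
   struct lp_type mask8_type  = lp_type_int_vec(8, 128);   /* 16 x i8  */
   LLVMValueRef mask8;
   lp_build_conv_mask(gallivm, mask32_type, mask8_type, mask, 4, &mask8, 1);
#endif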