[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_conv.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper functions for type conversions.
32 *
33 * We want to use the fastest type for a given computation whenever feasible.
34 * The other side of this is that we need to be able to convert between
35 * several types accurately and efficiently.
36 *
37 * Conversion between types of different bit width is quite complex, since the
38 * number of elements that fit in a register changes with the bit width.
39 * Keep in mind a few invariants of type conversions:
40 *
41 * - register width must remain constant:
42 *
43 * src_type.width * src_type.length == dst_type.width * dst_type.length
44 *
45 * - total number of elements must remain constant:
46 *
47 * src_type.length * num_srcs == dst_type.length * num_dsts
48 *
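 * For example (illustrative): converting four vectors of 4 x float32 into a
 * single vector of 16 x uint8 satisfies both invariants, since
 * 32 * 4 == 8 * 16 and 4 * 4 == 16 * 1.
 *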
49 * It is not always possible to do the conversion both accurately and
50 * efficiently, usually due to lack of adequate machine instructions. In these
51 * cases it is important not to take shortcuts here and sacrifice accuracy, as
52 * these functions can be used anywhere. In the future we might have a
53 * precision parameter which can gauge the accuracy vs efficiency compromise,
54 * but for now, if the data conversion between two stages happens to be the
55 * bottleneck, it is most likely better to avoid converting at all and run
56 * both stages with the same type.
57 *
58 * Make sure to run lp_test_conv unit test after any change to this file.
59 *
60 * @author Jose Fonseca <jfonseca@vmware.com>
61 */
62
63
64 #include "util/u_debug.h"
65 #include "util/u_math.h"
66 #include "util/u_cpu_detect.h"
67
68 #include "lp_bld_type.h"
69 #include "lp_bld_const.h"
70 #include "lp_bld_arit.h"
71 #include "lp_bld_pack.h"
72 #include "lp_bld_conv.h"
73 #include "lp_bld_logic.h"
74
75
76 /**
77 * Converts int16 half-float to float32
78 * Note this can be performed in 1 instruction if vcvtph2ps is available (F16C extension)
79 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
80 *
81 * @param src value to convert
82 *
83 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
84 * ref https://gist.github.com/2144712
85 */
86 LLVMValueRef
87 lp_build_half_to_float(struct gallivm_state *gallivm,
88 LLVMValueRef src)
89 {
90 int src_length = LLVMGetVectorSize(LLVMTypeOf(src));
91
92 struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
93 struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
94
95 LLVMBuilderRef builder = gallivm->builder;
96 LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
97 LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);
98
99 /* Constants */
100 LLVMValueRef i32_13 = lp_build_const_int_vec(gallivm, i32_type, 13);
101 LLVMValueRef i32_16 = lp_build_const_int_vec(gallivm, i32_type, 16);
102 LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
103 LLVMValueRef i32_was_infnan = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
104 LLVMValueRef i32_exp_infnan = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
105 LLVMValueRef f32_magic = LLVMBuildBitCast(builder,
106 lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23),
107 float_vec_type, "");
108
109 /* Convert int16 vector to int32 vector by zero ext */
110 LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, "");
111
112 /* Exponent / mantissa bits */
113 LLVMValueRef expmant = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
114 LLVMValueRef shifted = LLVMBuildBitCast(builder, LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, "");
115
116 /* Exponent adjust */
117 LLVMValueRef scaled = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, "");
118
119 /* Make sure Inf/NaN survive */
120 LLVMValueRef b_wasinfnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan);
121 LLVMValueRef infnanexp = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");
122
123 /* Sign bit */
124 LLVMValueRef justsign = LLVMBuildXor(builder, h, expmant, "");
125 LLVMValueRef sign = LLVMBuildShl(builder, justsign, i32_16, "");
126
127 /* Combine result */
128 LLVMValueRef sign_inf = LLVMBuildOr(builder, sign, infnanexp, "");
129 LLVMValueRef final = LLVMBuildOr(builder, scaled, sign_inf, "");
130
131 /* Cast from int32 vector to float32 vector */
132 return LLVMBuildBitCast(builder, final, float_vec_type, "");
133 }
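
/*
 * Worked example (illustrative, not part of the original code): the half-float
 * 1.0 has the bit pattern 0x3c00.  Then expmant = 0x3c00, shifting left by 13
 * gives the float bit pattern 0x07800000 (the value 2^-112), and multiplying
 * by f32_magic (2^112) yields exactly 1.0f (0x3f800000).  Since 0x3c00 is not
 * greater than 0x7bff the Inf/NaN fixup contributes nothing, the sign bit is
 * zero, and the final result is 1.0f.
 */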
134
135
136 /**
137 * Converts float32 to int16 half-float
138 * Note this can be performed in 1 instruction if vcvtps2ph is available (F16C extension)
139 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
140 *
141 * @param src value to convert
142 *
143 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
144 * ref https://gist.github.com/2156668
145 */
146 LLVMValueRef
147 lp_build_float_to_half(struct gallivm_state *gallivm,
148 LLVMValueRef src)
149 {
150 struct lp_type i32_type = lp_type_int_vec(32, 32 * LLVMGetVectorSize(LLVMTypeOf(src)));
151
152 LLVMBuilderRef builder = gallivm->builder;
153 LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
154
155 struct lp_build_context bld;
156
157 LLVMValueRef result;
158
159 lp_build_context_init(&bld, gallivm, i32_type);
160
161 /* Extra scope because lp_build_min needs a build context, le sigh */
162 {
163 /* Constants */
164 LLVMValueRef i32_13 = lp_build_const_int_vec(gallivm, i32_type, 13);
165 LLVMValueRef i32_16 = lp_build_const_int_vec(gallivm, i32_type, 16);
166 LLVMValueRef i32_mask_fabs = lp_build_const_int_vec(gallivm, i32_type, 0x7fffffff);
167 LLVMValueRef i32_f32infty = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
168 LLVMValueRef i32_expinf = lp_build_const_int_vec(gallivm, i32_type, 0xe0 << 23);
169 LLVMValueRef i32_f16max = lp_build_const_int_vec(gallivm, i32_type, 0x8f << 23);
170 LLVMValueRef i32_magic = lp_build_const_int_vec(gallivm, i32_type, 0x0f << 23);
171
172 /* Cast from float32 to int32 */
173 LLVMValueRef f = LLVMBuildBitCast(builder, src, int_vec_type, "");
174
175 /* Remove sign */
176 LLVMValueRef fabs = LLVMBuildAnd(builder, i32_mask_fabs, f, "");
177
178 /* Magic conversion */
179 LLVMValueRef clamped = lp_build_min(&bld, i32_f16max, fabs);
180 LLVMValueRef scaled = LLVMBuildMul(builder, clamped, i32_magic, "");
181
182 /* Make sure Inf/NaN and unnormalised values survive */
183 LLVMValueRef infnancase = LLVMBuildXor(builder, i32_expinf, fabs, "");
184 LLVMValueRef b_notnormal = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, fabs, i32_f32infty);
185
186 /* Merge normal / non-normal cases */
187 LLVMValueRef merge1 = LLVMBuildAnd(builder, infnancase, b_notnormal, "");
188 LLVMValueRef merge2 = LLVMBuildNot(builder, LLVMBuildAnd(builder, b_notnormal, scaled, ""), "");
189 LLVMValueRef merged = LLVMBuildOr(builder, merge1, merge2, "");
190 LLVMValueRef shifted = LLVMBuildLShr(builder, merged, i32_13, "");
191
192 /* Sign bit */
193 LLVMValueRef justsign = LLVMBuildXor(builder, f, fabs, "");
194 LLVMValueRef signshifted = LLVMBuildLShr(builder, justsign, i32_16, "");
195
196 /* Combine result */
197 result = LLVMBuildOr(builder, shifted, signshifted, "");
198 }
199
200 /* Truncate from 32 bit to 16 bit */
201 i32_type.width = 16;
202 return LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i32_type), "");
203 }
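
/*
 * Usage sketch (illustrative; variable names are assumptions): round-tripping
 * a vector of packed half-floats through the two helpers above.
 *
 *    LLVMValueRef h16;                                          // <N x i16> halves
 *    LLVMValueRef f32  = lp_build_half_to_float(gallivm, h16);  // <N x float>
 *    LLVMValueRef back = lp_build_float_to_half(gallivm, f32);  // <N x i16>
 */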
204
205
206 /**
207 * Special case for converting clamped IEEE-754 floats to unsigned norms.
208 *
209 * The mathematical voodoo below may seem excessive but it is actually
210 * paramount we do it this way for several reasons. First, there is no
211 * single-precision FP to unsigned integer conversion instruction in Intel SSE.
212 * Second, even if there were, since the FP's mantissa takes only a fraction
213 * of the register bits, the typical scale-and-cast approach would require
214 * double precision for accurate results, and therefore half the throughput.
215 *
216 * Although the result values can be scaled to an arbitrary bit width specified
217 * by dst_width, the actual result type will have the same width as the source.
218 *
219 * Ex: src = { float, float, float, float }
220 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
221 */
222 LLVMValueRef
223 lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
224 struct lp_type src_type,
225 unsigned dst_width,
226 LLVMValueRef src)
227 {
228 LLVMBuilderRef builder = gallivm->builder;
229 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
230 LLVMValueRef res;
231 unsigned mantissa;
232
233 assert(src_type.floating);
234 assert(dst_width <= src_type.width);
235 src_type.sign = FALSE;
236
237 mantissa = lp_mantissa(src_type);
238
239 if (dst_width <= mantissa) {
240 /*
241 * Apply magic coefficients that will make the desired result appear
242 * in the least significant bits of the mantissa, with correct rounding.
243 *
244 * This only works if the destination width fits in the mantissa.
245 */
246
247 unsigned long long ubound;
248 unsigned long long mask;
249 double scale;
250 double bias;
251
252 ubound = (1ULL << dst_width);
253 mask = ubound - 1;
254 scale = (double)mask/ubound;
255 bias = (double)(1ULL << (mantissa - dst_width));
256
257 res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
258 res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
259 res = LLVMBuildBitCast(builder, res, int_vec_type, "");
260 res = LLVMBuildAnd(builder, res,
261 lp_build_const_int_vec(gallivm, src_type, mask), "");
262 }
263 else if (dst_width == (mantissa + 1)) {
264 /*
265 * The destination width matches exactly what can be represented in
266 * floating point (i.e., mantissa + 1 bits). So do a straight
267 * multiplication followed by casting. No further rounding is necessary.
268 */
269
270 double scale;
271
272 scale = (double)((1ULL << dst_width) - 1);
273
274 res = LLVMBuildFMul(builder, src,
275 lp_build_const_vec(gallivm, src_type, scale), "");
276 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
277 }
278 else {
279 /*
280 * The destination exceeds what can be represented in floating point.
281 * So multiply by the largest power of two we can get away with, and then
282 * subtract the most significant bit to rescale to normalized values.
283 *
284 * The largest power of two factor we can get away with is
285 * (1 << (src_type.width - 1)), because we need to use a signed conversion. In
286 * theory it should be (1 << (src_type.width - 2)), but IEEE 754 rules state
287 * that INT_MIN should be returned in FPToSI, which is the correct result for
288 * values near 1.0!
289 *
290 * This means we get (src_type.width - 1) correct bits for values near 0.0,
291 * and (mantissa + 1) correct bits for values near 1.0. Equally or more
292 * important, we also get exact results for 0.0 and 1.0.
293 */
294
295 unsigned n = MIN2(src_type.width - 1, dst_width);
296
297 double scale = (double)(1ULL << n);
298 unsigned lshift = dst_width - n;
299 unsigned rshift = n;
300 LLVMValueRef lshifted;
301 LLVMValueRef rshifted;
302
303 res = LLVMBuildFMul(builder, src,
304 lp_build_const_vec(gallivm, src_type, scale), "");
305 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
306
307 /*
308 * Align the most significant bit to its final place.
309 *
310 * This will cause 1.0 to overflow to 0, but the later adjustment will
311 * get it right.
312 */
313 if (lshift) {
314 lshifted = LLVMBuildShl(builder, res,
315 lp_build_const_int_vec(gallivm, src_type,
316 lshift), "");
317 } else {
318 lshifted = res;
319 }
320
321 /*
322 * Align the most significant bit to the right.
323 */
324 rshifted = LLVMBuildLShr(builder, res,
325 lp_build_const_int_vec(gallivm, src_type, rshift),
326 "");
327
328 /*
329 * Subtract the MSB, shifted down to the LSB position, thereby re-scaling from
330 * (1 << dst_width) to ((1 << dst_width) - 1).
331 */
332
333 res = LLVMBuildSub(builder, lshifted, rshifted, "");
334 }
335
336 return res;
337 }
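
/*
 * Worked example (illustrative): for a float32 source and dst_width = 8 the
 * first branch applies, with scale = 255/256 and bias = 2^15.  An input of
 * 1.0 becomes 1.0 * 255/256 + 32768 = 32768 + 255 * 2^-8, whose float bit
 * pattern carries 255 in its low 8 mantissa bits, so masking with 0xff yields
 * 255; an input of 0.0 yields 0, as expected for an unorm8 result.
 */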
338
339
340 /**
341 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
342 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
343 * return {float, float, float, float} with values in range [0, 1].
344 */
345 LLVMValueRef
346 lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
347 unsigned src_width,
348 struct lp_type dst_type,
349 LLVMValueRef src)
350 {
351 LLVMBuilderRef builder = gallivm->builder;
352 LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
353 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
354 LLVMValueRef bias_;
355 LLVMValueRef res;
356 unsigned mantissa;
357 unsigned n;
358 unsigned long long ubound;
359 unsigned long long mask;
360 double scale;
361 double bias;
362
363 assert(dst_type.floating);
364
365 mantissa = lp_mantissa(dst_type);
366
367 if (src_width <= (mantissa + 1)) {
368 /*
369 * The source width fits what can be represented in floating
370 * point (i.e., mantissa + 1 bits). So do a straight multiplication
371 * followed by casting. No further rounding is necessary.
372 */
373
374 scale = 1.0/(double)((1ULL << src_width) - 1);
375 res = LLVMBuildSIToFP(builder, src, vec_type, "");
376 res = LLVMBuildFMul(builder, res,
377 lp_build_const_vec(gallivm, dst_type, scale), "");
378 return res;
379 }
380 else {
381 /*
382 * The source width exceeds what can be represented in floating
383 * point. So truncate the incoming values.
384 */
385
386 n = MIN2(mantissa, src_width);
387
388 ubound = ((unsigned long long)1 << n);
389 mask = ubound - 1;
390 scale = (double)ubound/mask;
391 bias = (double)((unsigned long long)1 << (mantissa - n));
392
393 res = src;
394
395 if (src_width > mantissa) {
396 int shift = src_width - mantissa;
397 res = LLVMBuildLShr(builder, res,
398 lp_build_const_int_vec(gallivm, dst_type, shift), "");
399 }
400
401 bias_ = lp_build_const_vec(gallivm, dst_type, bias);
402
403 res = LLVMBuildOr(builder,
404 res,
405 LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");
406
407 res = LLVMBuildBitCast(builder, res, vec_type, "");
408
409 res = LLVMBuildFSub(builder, res, bias_, "");
410 res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
411 }
412
413 return res;
414 }
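
/*
 * Worked example (illustrative): for src_width = 8 and a float32 destination,
 * 8 <= mantissa + 1, so the first branch is taken with scale = 1/255.  An
 * input of 255 converts to 255 * (1/255) = 1.0 and an input of 0 converts to
 * 0.0, matching the expected unorm8 --> float mapping.
 */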
415
416
417 /**
418 * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
419 *
420 * Returns the number of dsts created from src
421 */
422 int lp_build_conv_auto(struct gallivm_state *gallivm,
423 struct lp_type src_type,
424 struct lp_type* dst_type,
425 const LLVMValueRef *src,
426 unsigned num_srcs,
427 LLVMValueRef *dst)
428 {
429 int i;
430 int num_dsts = num_srcs;
431
432 if (src_type.floating == dst_type->floating &&
433 src_type.width == dst_type->width &&
434 src_type.length == dst_type->length &&
435 src_type.fixed == dst_type->fixed &&
436 src_type.norm == dst_type->norm &&
437 src_type.sign == dst_type->sign)
438 return num_dsts;
439
440 /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub
441 */
442 if (src_type.floating == 1 &&
443 src_type.fixed == 0 &&
444 src_type.sign == 1 &&
445 src_type.norm == 0 &&
446 src_type.width == 32 &&
447
448 dst_type->floating == 0 &&
449 dst_type->fixed == 0 &&
450 dst_type->sign == 0 &&
451 dst_type->norm == 1 &&
452 dst_type->width == 8)
453 {
454 /* Special case 4x4f --> 1x16ub */
455 if (src_type.length == 4 && util_cpu_caps.has_sse2)
456 {
457 assert((num_srcs % 4) == 0);
458
459 num_dsts = num_srcs / 4;
460 dst_type->length = 16;
461
462 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
463 return num_dsts;
464 }
465
466 /* Special case 2x8f --> 1x16ub */
467 if (src_type.length == 8 && util_cpu_caps.has_avx)
468 {
469 assert((num_srcs % 2) == 0);
470
471 num_dsts = num_srcs / 2;
472 dst_type->length = 16;
473
474 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
475 return num_dsts;
476 }
477 }
478
479 /* lp_build_resize does not support M:N */
480 if (src_type.width == dst_type->width) {
481 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
482 } else {
483 for (i = 0; i < num_srcs; ++i) {
484 lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1);
485 }
486 }
487
488 return num_dsts;
489 }
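
/*
 * Usage sketch (illustrative; the lp_type initialization is an assumption):
 * converting four 4 x float32 vectors to unorm8 lets lp_build_conv_auto pick
 * the 4x4f --> 1x16ub fast path on SSE2, widening dst_type.length to 16 and
 * producing a single destination vector.
 *
 *    struct lp_type dst_type = ...;   // unorm8, initially length 4
 *    LLVMValueRef src[4], dst[4];
 *    int n = lp_build_conv_auto(gallivm, src_type, &dst_type, src, 4, dst);
 *    // n == 1 and dst_type.length == 16 when the fast path is taken
 */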
490
491
492 /**
493 * Generic type conversion.
494 *
495 * TODO: Take a precision argument, or even better, add a new precision member
496 * to the lp_type union.
497 */
498 void
499 lp_build_conv(struct gallivm_state *gallivm,
500 struct lp_type src_type,
501 struct lp_type dst_type,
502 const LLVMValueRef *src, unsigned num_srcs,
503 LLVMValueRef *dst, unsigned num_dsts)
504 {
505 LLVMBuilderRef builder = gallivm->builder;
506 struct lp_type tmp_type;
507 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
508 unsigned num_tmps;
509 unsigned i;
510
511 /* We must not lose or gain channels, only precision. */
512 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
513
514 assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
515 assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
516 assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
517 assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
518
519 tmp_type = src_type;
520 for(i = 0; i < num_srcs; ++i) {
521 assert(lp_check_value(src_type, src[i]));
522 tmp[i] = src[i];
523 }
524 num_tmps = num_srcs;
525
526
527 /* Special case 4x4f --> 1x16ub
528 */
529 if (src_type.floating == 1 &&
530 src_type.fixed == 0 &&
531 src_type.sign == 1 &&
532 src_type.norm == 0 &&
533 src_type.width == 32 &&
534 src_type.length == 4 &&
535
536 dst_type.floating == 0 &&
537 dst_type.fixed == 0 &&
538 dst_type.sign == 0 &&
539 dst_type.norm == 1 &&
540 dst_type.width == 8 &&
541 dst_type.length == 16 &&
542
543 4 * num_dsts == num_srcs &&
544
545 util_cpu_caps.has_sse2)
546 {
547 struct lp_build_context bld;
548 struct lp_type int16_type = dst_type;
549 struct lp_type int32_type = dst_type;
550 LLVMValueRef const_255f;
551 unsigned i, j;
552
553 lp_build_context_init(&bld, gallivm, src_type);
554
555 int16_type.width *= 2;
556 int16_type.length /= 2;
557 int16_type.sign = 1;
558
559 int32_type.width *= 4;
560 int32_type.length /= 4;
561 int32_type.sign = 1;
562
563 const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
564
565 for (i = 0; i < num_dsts; ++i, src += 4) {
566 LLVMValueRef lo, hi;
567
568 for (j = 0; j < 4; ++j) {
569 tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
570 tmp[j] = lp_build_iround(&bld, tmp[j]);
571 }
572
573 /* relying on clamping behavior of sse2 intrinsics here */
574 lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
575 hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
576 dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
577 }
578
579 return;
580 }
581
582 /* Special case 2x8f --> 1x16ub
583 */
584 else if (src_type.floating == 1 &&
585 src_type.fixed == 0 &&
586 src_type.sign == 1 &&
587 src_type.norm == 0 &&
588 src_type.width == 32 &&
589 src_type.length == 8 &&
590
591 dst_type.floating == 0 &&
592 dst_type.fixed == 0 &&
593 dst_type.sign == 0 &&
594 dst_type.norm == 1 &&
595 dst_type.width == 8 &&
596 dst_type.length == 16 &&
597
598 2 * num_dsts == num_srcs &&
599
600 util_cpu_caps.has_avx) {
601
602 struct lp_build_context bld;
603 struct lp_type int16_type = dst_type;
604 struct lp_type int32_type = dst_type;
605 LLVMValueRef const_255f;
606 unsigned i;
607
608 lp_build_context_init(&bld, gallivm, src_type);
609
610 int16_type.width *= 2;
611 int16_type.length /= 2;
612 int16_type.sign = 1;
613
614 int32_type.width *= 4;
615 int32_type.length /= 4;
616 int32_type.sign = 1;
617
618 const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
619
620 for (i = 0; i < num_dsts; ++i, src += 2) {
621 LLVMValueRef lo, hi, a, b;
622
623 a = LLVMBuildFMul(builder, src[0], const_255f, "");
624 b = LLVMBuildFMul(builder, src[1], const_255f, "");
625
626 a = lp_build_iround(&bld, a);
627 b = lp_build_iround(&bld, b);
628
629 tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
630 tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
631 tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
632 tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
633
634 /* relying on clamping behavior of sse2 intrinsics here */
635 lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
636 hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
637 dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
638 }
639 return;
640 }
641
642 /* Special case -> 16bit half-float
643 */
644 else if (dst_type.floating && dst_type.width == 16)
645 {
646 /* Only support src as 32bit float currently */
647 assert(src_type.floating && src_type.width == 32);
648
649 for(i = 0; i < num_tmps; ++i)
650 dst[i] = lp_build_float_to_half(gallivm, tmp[i]);
651
652 return;
653 }
654
655 /* Pre-convert half-floats to floats
656 */
657 else if (src_type.floating && src_type.width == 16)
658 {
659 for(i = 0; i < num_tmps; ++i)
660 tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);
661
662 tmp_type.width = 32;
663 }
664
665 /*
666 * Clamp if necessary
667 */
668
669 if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
670 struct lp_build_context bld;
671 double src_min = lp_const_min(src_type);
672 double dst_min = lp_const_min(dst_type);
673 double src_max = lp_const_max(src_type);
674 double dst_max = lp_const_max(dst_type);
675 LLVMValueRef thres;
676
677 lp_build_context_init(&bld, gallivm, tmp_type);
678
679 if(src_min < dst_min) {
680 if(dst_min == 0.0)
681 thres = bld.zero;
682 else
683 thres = lp_build_const_vec(gallivm, tmp_type, dst_min);
684 for(i = 0; i < num_tmps; ++i)
685 tmp[i] = lp_build_max(&bld, tmp[i], thres);
686 }
687
688 if(src_max > dst_max) {
689 if(dst_max == 1.0)
690 thres = bld.one;
691 else
692 thres = lp_build_const_vec(gallivm, tmp_type, dst_max);
693 for(i = 0; i < num_tmps; ++i)
694 tmp[i] = lp_build_min(&bld, tmp[i], thres);
695 }
696 }
697
698 /*
699 * Scale to the narrowest range
700 */
701
702 if(dst_type.floating) {
703 /* Nothing to do */
704 }
705 else if(tmp_type.floating) {
706 if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
707 for(i = 0; i < num_tmps; ++i) {
708 tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
709 tmp_type,
710 dst_type.width,
711 tmp[i]);
712 }
713 tmp_type.floating = FALSE;
714 }
715 else {
716 double dst_scale = lp_const_scale(dst_type);
717 LLVMTypeRef tmp_vec_type;
718
719 if (dst_scale != 1.0) {
720 LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
721 for(i = 0; i < num_tmps; ++i)
722 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
723 }
724
725 /* Use an equally sized integer for intermediate computations */
726 tmp_type.floating = FALSE;
727 tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
728 for(i = 0; i < num_tmps; ++i) {
729 #if 0
730 if(dst_type.sign)
731 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
732 else
733 tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
734 #else
735 /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
736 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
737 #endif
738 }
739 }
740 }
741 else {
742 unsigned src_shift = lp_const_shift(src_type);
743 unsigned dst_shift = lp_const_shift(dst_type);
744 unsigned src_offset = lp_const_offset(src_type);
745 unsigned dst_offset = lp_const_offset(dst_type);
746
747 /* Compensate for different offsets */
748 if (dst_offset > src_offset && src_type.width > dst_type.width) {
749 for (i = 0; i < num_tmps; ++i) {
750 LLVMValueRef shifted;
751 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - 1);
752 if(src_type.sign)
753 shifted = LLVMBuildAShr(builder, tmp[i], shift, "");
754 else
755 shifted = LLVMBuildLShr(builder, tmp[i], shift, "");
756
757 tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
758 }
759 }
760
761 if(src_shift > dst_shift) {
762 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
763 src_shift - dst_shift);
764 for(i = 0; i < num_tmps; ++i)
765 if(src_type.sign)
766 tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
767 else
768 tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
769 }
770 }
771
772 /*
773 * Truncate or expand bit width
774 *
775 * No data conversion should happen here, although the sign bits are
776 * crucial to avoid bad clamping.
777 */
778
779 {
780 struct lp_type new_type;
781
782 new_type = tmp_type;
783 new_type.sign = dst_type.sign;
784 new_type.width = dst_type.width;
785 new_type.length = dst_type.length;
786
787 lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);
788
789 tmp_type = new_type;
790 num_tmps = num_dsts;
791 }
792
793 /*
794 * Scale to the widest range
795 */
796
797 if(src_type.floating) {
798 /* Nothing to do */
799 }
800 else if(!src_type.floating && dst_type.floating) {
801 if(!src_type.fixed && !src_type.sign && src_type.norm) {
802 for(i = 0; i < num_tmps; ++i) {
803 tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
804 src_type.width,
805 dst_type,
806 tmp[i]);
807 }
808 tmp_type.floating = TRUE;
809 }
810 else {
811 double src_scale = lp_const_scale(src_type);
812 LLVMTypeRef tmp_vec_type;
813
814 /* Use an equally sized float for intermediate computations */
815 tmp_type.floating = TRUE;
816 tmp_type.sign = TRUE;
817 tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
818 for(i = 0; i < num_tmps; ++i) {
819 #if 0
820 if(dst_type.sign)
821 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
822 else
823 tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
824 #else
825 /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
826 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
827 #endif
828 }
829
830 if (src_scale != 1.0) {
831 LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
832 for(i = 0; i < num_tmps; ++i)
833 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
834 }
835 }
836 }
837 else {
838 unsigned src_shift = lp_const_shift(src_type);
839 unsigned dst_shift = lp_const_shift(dst_type);
840 unsigned src_offset = lp_const_offset(src_type);
841 unsigned dst_offset = lp_const_offset(dst_type);
842
843 if (src_shift < dst_shift) {
844 LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
845 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, dst_shift - src_shift);
846
847 for (i = 0; i < num_tmps; ++i) {
848 pre_shift[i] = tmp[i];
849 tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
850 }
851
852 /* Compensate for different offsets */
853 if (dst_offset > src_offset) {
854 for (i = 0; i < num_tmps; ++i) {
855 tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
856 }
857 }
858 }
859 }
860
861 for(i = 0; i < num_dsts; ++i) {
862 dst[i] = tmp[i];
863 assert(lp_check_value(dst_type, dst[i]));
864 }
865 }
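
/*
 * Worked example (illustrative): the reverse direction, one 16 x unorm8
 * source into four 4 x float32 destinations, takes the generic path above:
 * no clamping or pre-scaling is needed, lp_build_resize() widens the unsigned
 * bytes into four i32 vectors, and lp_build_unsigned_norm_to_float() then
 * rescales them by 1/255 into [0, 1].
 */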
866
867
868 /**
869 * Bit mask conversion.
870 *
871 * This will convert integer masks between the given types.
872 *
873 * The mask values should be 0 or -1, i.e., all bits either cleared or set.
874 * Any other value will likely cause unpredictable results.
875 *
876 * This is basically a very trimmed down version of lp_build_conv.
877 */
878 void
879 lp_build_conv_mask(struct gallivm_state *gallivm,
880 struct lp_type src_type,
881 struct lp_type dst_type,
882 const LLVMValueRef *src, unsigned num_srcs,
883 LLVMValueRef *dst, unsigned num_dsts)
884 {
885
886 /* We must not lose or gain channels, only precision. */
887 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
888
889 /*
890 * Drop the float/fixed/norm attributes and treat the masks as plain integers.
891 *
892 * We assume all values are 0 or -1.
893 */
894
895 src_type.floating = FALSE;
896 src_type.fixed = FALSE;
897 src_type.sign = TRUE;
898 src_type.norm = FALSE;
899
900 dst_type.floating = FALSE;
901 dst_type.fixed = FALSE;
902 dst_type.sign = TRUE;
903 dst_type.norm = FALSE;
904
905 /*
906 * Truncate or expand bit width
907 */
908
909 lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
910 }
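
/*
 * Usage sketch (illustrative; the mask type variables are assumptions):
 * converting four <4 x i32> masks, each lane 0 or -1, into one <16 x i8> mask
 * to match a packed unorm8 destination.
 *
 *    LLVMValueRef mask32[4];   // four <4 x i32> masks
 *    LLVMValueRef mask8;
 *    lp_build_conv_mask(gallivm, mask32_type, mask8_type, mask32, 4, &mask8, 1);
 */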