cd18b0c652045cf1035cf8ab0fd4d2ef25ac5f50
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_conv.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Helper functions for type conversions.
32 *
33 * We want to use the fastest type for a given computation whenever feasible.
34  * The other side of this is that we need to be able to convert between several
35 * types accurately and efficiently.
36 *
37  * Conversion between types of different bit width is quite complex, since a
38  * change in bit width also implies a change in vector length or register count.
39  * There are a few invariants to remember in type conversions:
40 *
41 * - register width must remain constant:
42 *
43 * src_type.width * src_type.length == dst_type.width * dst_type.length
44 *
45 * - total number of elements must remain constant:
46 *
47 * src_type.length * num_srcs == dst_type.length * num_dsts
48 *
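48  *
49  * For example, converting four vectors of 4 x f32 into a single vector of
50  * 16 x unorm8 (the 4x4f --> 1x16ub fast path in lp_build_conv below) keeps
51  * both invariants:
52  *
53  *     32 * 4 == 8 * 16   (register width)   and   4 * 4 == 16 * 1   (elements)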
49 * It is not always possible to do the conversion both accurately and
50 * efficiently, usually due to lack of adequate machine instructions. In these
51  * cases it is important not to take shortcuts and sacrifice accuracy, as
52  * these functions can be used anywhere. In the future we might have a
53  * precision parameter which can gauge the accuracy vs. efficiency compromise,
54  * but for now, if the data conversion between two stages happens to be the
55  * bottleneck, then one should most likely just avoid converting at all and run
56 * both stages with the same type.
57 *
58 * Make sure to run lp_test_conv unit test after any change to this file.
59 *
60 * @author Jose Fonseca <jfonseca@vmware.com>
61 */
62
63
64 #include "util/u_debug.h"
65 #include "util/u_math.h"
66 #include "util/u_cpu_detect.h"
67
68 #include "lp_bld_type.h"
69 #include "lp_bld_const.h"
70 #include "lp_bld_arit.h"
71 #include "lp_bld_pack.h"
72 #include "lp_bld_conv.h"
73 #include "lp_bld_logic.h"
74 #include "lp_bld_intr.h"
75
76
77
78 /**
79  * Byte swap an element. It will construct a call to the llvm.bswap intrinsic
80 * based on the type.
81 *
82 * @param res element to byte swap.
83 * @param type int16_t, int32_t, int64_t, float or double
84  *
85 */
86 LLVMValueRef
87 lp_build_bswap(struct gallivm_state *gallivm,
88 LLVMValueRef res,
89 struct lp_type type)
90 {
91 LLVMTypeRef int_type = LLVMIntTypeInContext(gallivm->context,
92 type.width);
93 const char *intrinsic = NULL;
94 if (type.width == 8)
95 return res;
96 if (type.width == 16)
97 intrinsic = "llvm.bswap.i16";
98 else if (type.width == 32)
99 intrinsic = "llvm.bswap.i32";
100 else if (type.width == 64)
101 intrinsic = "llvm.bswap.i64";
102
103 assert (intrinsic != NULL);
104
105    /* In case of a floating-point type, cast to an integer of the same size,
106     * then cast back to the fp type.
107 */
108 if (type.floating)
109 res = LLVMBuildBitCast(gallivm->builder, res, int_type, "");
110 res = lp_build_intrinsic_unary(gallivm->builder, intrinsic, int_type, res);
111 if (type.floating)
112 res = LLVMBuildBitCast(gallivm->builder, res,
113 lp_build_elem_type(gallivm, type), "");
114 return res;
115 }
116
117
118 /**
119 * Byte swap every element in the vector.
120 *
121 * @param packed <vector> to convert
122 * @param src_type <vector> type of int16_t, int32_t, int64_t, float or
123 * double
124 * @param dst_type <vector> type to return
125 */
126 LLVMValueRef
127 lp_build_bswap_vec(struct gallivm_state *gallivm,
128 LLVMValueRef packed,
129 struct lp_type src_type_vec,
130 struct lp_type dst_type_vec)
131 {
132 LLVMBuilderRef builder = gallivm->builder;
133 LLVMTypeRef dst_type = lp_build_elem_type(gallivm, dst_type_vec);
134 LLVMValueRef res;
135
136 if (src_type_vec.length == 1) {
137 res = lp_build_bswap(gallivm, packed, src_type_vec);
138 res = LLVMBuildBitCast(gallivm->builder, res, dst_type, "");
139 } else {
140 unsigned i;
141 res = LLVMGetUndef(lp_build_vec_type(gallivm, dst_type_vec));
142 for (i = 0; i < src_type_vec.length; ++i) {
143 LLVMValueRef index = lp_build_const_int32(gallivm, i);
144 LLVMValueRef elem = LLVMBuildExtractElement(builder, packed, index, "");
145 elem = lp_build_bswap(gallivm, elem, src_type_vec);
146 elem = LLVMBuildBitCast(gallivm->builder, elem, dst_type, "");
147 res = LLVMBuildInsertElement(gallivm->builder, res, elem, index, "");
148 }
149 }
150 return res;
151 }
152
153
154 /**
155 * Converts int16 half-float to float32
156  * Note this can be performed in one instruction where vcvtph2ps is available (F16C)
157 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
158 *
159 * @param src value to convert
160 *
161 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
162 * ref https://gist.github.com/2144712
163 */
164 LLVMValueRef
165 lp_build_half_to_float(struct gallivm_state *gallivm,
166 LLVMValueRef src)
167 {
168 int src_length = LLVMGetVectorSize(LLVMTypeOf(src));
169
170 struct lp_type f32_type = lp_type_float_vec(32, 32 * src_length);
171 struct lp_type i32_type = lp_type_int_vec(32, 32 * src_length);
172
173 LLVMBuilderRef builder = gallivm->builder;
174 LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
175 LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);
176
177 /* Constants */
178 LLVMValueRef i32_13 = lp_build_const_int_vec(gallivm, i32_type, 13);
179 LLVMValueRef i32_16 = lp_build_const_int_vec(gallivm, i32_type, 16);
180 LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
181 LLVMValueRef i32_was_infnan = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
182 LLVMValueRef i32_exp_infnan = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
183 LLVMValueRef f32_magic = LLVMBuildBitCast(builder,
184 lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23),
185 float_vec_type, "");
186
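   /*
    * A sketch of why the magic multiply below works: shifting the half's
    * exponent/mantissa bits left by 13 puts the 5-bit exponent in the f32
    * exponent field, but still biased by 15 instead of 127, i.e. the value is
    * too small by a factor of 2^(127 - 15) = 2^112.  f32_magic is the bit
    * pattern (254 - 15) << 23, which as a float is exactly 2^112, so a single
    * FMul corrects the bias (and maps half denormals to their float values).
    */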
187 /* Convert int16 vector to int32 vector by zero ext */
188 LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, "");
189
190 /* Exponent / mantissa bits */
191 LLVMValueRef expmant = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
192 LLVMValueRef shifted = LLVMBuildBitCast(builder, LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, "");
193
194 /* Exponent adjust */
195 LLVMValueRef scaled = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, "");
196
197 /* Make sure Inf/NaN survive */
198 LLVMValueRef b_wasinfnan = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan);
199 LLVMValueRef infnanexp = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");
200
201 /* Sign bit */
202 LLVMValueRef justsign = LLVMBuildXor(builder, h, expmant, "");
203 LLVMValueRef sign = LLVMBuildShl(builder, justsign, i32_16, "");
204
205 /* Combine result */
206 LLVMValueRef sign_inf = LLVMBuildOr(builder, sign, infnanexp, "");
207 LLVMValueRef final = LLVMBuildOr(builder, scaled, sign_inf, "");
208
209 /* Cast from int32 vector to float32 vector */
210 return LLVMBuildBitCast(builder, final, float_vec_type, "");
211 }
212
213
214 /**
215 * Converts float32 to int16 half-float
216  * Note this can be performed in one instruction where vcvtps2ph is available (F16C)
217 * [llvm.x86.vcvtps2ph / _mm_cvtps_ph]
218 *
219 * @param src value to convert
220 *
221 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
222 * ref https://gist.github.com/2156668
223 */
224 LLVMValueRef
225 lp_build_float_to_half(struct gallivm_state *gallivm,
226 LLVMValueRef src)
227 {
228 struct lp_type i32_type = lp_type_int_vec(32, 32 * LLVMGetVectorSize(LLVMTypeOf(src)));
229
230 LLVMBuilderRef builder = gallivm->builder;
231 LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
232
233 struct lp_build_context bld;
234
235 LLVMValueRef result;
236
237 lp_build_context_init(&bld, gallivm, i32_type);
238
239 /* Extra scope because lp_build_min needs a build context, le sigh */
240 {
241 /* Constants */
242 LLVMValueRef i32_13 = lp_build_const_int_vec(gallivm, i32_type, 13);
243 LLVMValueRef i32_16 = lp_build_const_int_vec(gallivm, i32_type, 16);
244 LLVMValueRef i32_mask_fabs = lp_build_const_int_vec(gallivm, i32_type, 0x7fffffff);
245 LLVMValueRef i32_f32infty = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
246 LLVMValueRef i32_expinf = lp_build_const_int_vec(gallivm, i32_type, 0xe0 << 23);
247 LLVMValueRef i32_f16max = lp_build_const_int_vec(gallivm, i32_type, 0x8f << 23);
248 LLVMValueRef i32_magic = lp_build_const_int_vec(gallivm, i32_type, 0x0f << 23);
249
250 /* Cast from float32 to int32 */
251 LLVMValueRef f = LLVMBuildBitCast(builder, src, int_vec_type, "");
252
253 /* Remove sign */
254 LLVMValueRef fabs = LLVMBuildAnd(builder, i32_mask_fabs, f, "");
255
256 /* Magic conversion */
257 LLVMValueRef clamped = lp_build_min(&bld, i32_f16max, fabs);
258 LLVMValueRef scaled = LLVMBuildMul(builder, clamped, i32_magic, "");
259
260       /* Make sure Inf/NaN and unnormalised values survive */
261 LLVMValueRef infnancase = LLVMBuildXor(builder, i32_expinf, fabs, "");
262 LLVMValueRef b_notnormal = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, fabs, i32_f32infty);
263
264       /* Merge the normal / non-normal cases */
265 LLVMValueRef merge1 = LLVMBuildAnd(builder, infnancase, b_notnormal, "");
266 LLVMValueRef merge2 = LLVMBuildNot(builder, LLVMBuildAnd(builder, b_notnormal, scaled, ""), "");
267 LLVMValueRef merged = LLVMBuildOr(builder, merge1, merge2, "");
268 LLVMValueRef shifted = LLVMBuildLShr(builder, merged, i32_13, "");
269
270 /* Sign bit */
271 LLVMValueRef justsign = LLVMBuildXor(builder, f, fabs, "");
272 LLVMValueRef signshifted = LLVMBuildLShr(builder, justsign, i32_16, "");
273
274 /* Combine result */
275 result = LLVMBuildOr(builder, shifted, signshifted, "");
276 }
277
278 /* Truncate from 32 bit to 16 bit */
279 i32_type.width = 16;
280 return LLVMBuildTrunc(builder, result, lp_build_vec_type(gallivm, i32_type), "");
281 }
282
283
284 /**
285 * Special case for converting clamped IEEE-754 floats to unsigned norms.
286 *
287 * The mathematical voodoo below may seem excessive but it is actually
288  * paramount we do it this way for several reasons. First, there is no
289  * single-precision FP to unsigned integer conversion instruction in Intel SSE.
290  * Second, even if there were, since the FP mantissa takes only a fraction of
291  * the register bits, the typical scale-and-cast approach would require double
292  * precision for accurate results, and therefore half the throughput.
293  *
294  * Although the result values can be scaled to an arbitrary bit width specified
295  * by dst_width, the actual result type will have the same width as the source.
296 *
297 * Ex: src = { float, float, float, float }
298 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
299 */
300 LLVMValueRef
301 lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
302 struct lp_type src_type,
303 unsigned dst_width,
304 LLVMValueRef src)
305 {
306 LLVMBuilderRef builder = gallivm->builder;
307 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
308 LLVMValueRef res;
309 unsigned mantissa;
310
311 assert(src_type.floating);
312 assert(dst_width <= src_type.width);
313 src_type.sign = FALSE;
314
315 mantissa = lp_mantissa(src_type);
316
317 if (dst_width <= mantissa) {
318 /*
319        * Apply magic coefficients that will make the desired result appear
320        * in the least significant bits of the mantissa, with correct rounding.
321 *
322 * This only works if the destination width fits in the mantissa.
323 */
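      /*
       * For illustration, with a 23-bit mantissa (f32) and dst_width = 8:
       * scale = 255/256 and bias = 2^(23 - 8) = 2^15.  For x in [0, 1],
       * x*scale + bias lies in [2^15, 2^15 + 255/256], where one mantissa ulp
       * is 2^(15 - 23) = 2^-8, so the 8 least significant mantissa bits end up
       * holding round(x * 255); the final mask then extracts exactly that.
       */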
324
325 unsigned long long ubound;
326 unsigned long long mask;
327 double scale;
328 double bias;
329
330 ubound = (1ULL << dst_width);
331 mask = ubound - 1;
332 scale = (double)mask/ubound;
333 bias = (double)(1ULL << (mantissa - dst_width));
334
335 res = LLVMBuildFMul(builder, src, lp_build_const_vec(gallivm, src_type, scale), "");
336 res = LLVMBuildFAdd(builder, res, lp_build_const_vec(gallivm, src_type, bias), "");
337 res = LLVMBuildBitCast(builder, res, int_vec_type, "");
338 res = LLVMBuildAnd(builder, res,
339 lp_build_const_int_vec(gallivm, src_type, mask), "");
340 }
341 else if (dst_width == (mantissa + 1)) {
342 /*
343 * The destination width matches exactly what can be represented in
344 * floating point (i.e., mantissa + 1 bits). So do a straight
345 * multiplication followed by casting. No further rounding is necessary.
346 */
347
348 double scale;
349
350 scale = (double)((1ULL << dst_width) - 1);
351
352 res = LLVMBuildFMul(builder, src,
353 lp_build_const_vec(gallivm, src_type, scale), "");
354 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
355 }
356 else {
357 /*
358        * The destination exceeds what can be represented in floating point.
359        * So multiply by the largest power of two we can get away with, and then
360        * subtract the most significant bit to rescale to normalized values.
361        *
362        * The largest power of two factor we can get away with is
363        * (1 << (src_type.width - 1)), because we need to use a signed conversion.
364        * In theory it should be (1 << (src_type.width - 2)), but IEEE 754 rules
365        * state that INT_MIN should be returned by FPToSI, which is the correct
366        * result for values near 1.0!
367 *
368 * This means we get (src_type.width - 1) correct bits for values near 0.0,
369 * and (mantissa + 1) correct bits for values near 1.0. Equally or more
370 * important, we also get exact results for 0.0 and 1.0.
371 */
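      /*
       * A worked instance, assuming 32-bit floats and dst_width = 32:
       * n = 31, scale = 2^31, lshift = 1, rshift = 31.  For src = 1.0 the
       * FPToSI yields INT_MIN = 0x80000000 (see above); the left shift then
       * wraps to 0 while the logical right shift gives 1, so the subtraction
       * below produces 0 - 1 = 0xffffffff, the exact unorm32 value for 1.0.
       * For src = 0.0 both shifts give 0 and the result is exactly 0.
       */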
372
373 unsigned n = MIN2(src_type.width - 1, dst_width);
374
375 double scale = (double)(1ULL << n);
376 unsigned lshift = dst_width - n;
377 unsigned rshift = n;
378 LLVMValueRef lshifted;
379 LLVMValueRef rshifted;
380
381 res = LLVMBuildFMul(builder, src,
382 lp_build_const_vec(gallivm, src_type, scale), "");
383 res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
384
385 /*
386 * Align the most significant bit to its final place.
387 *
388 * This will cause 1.0 to overflow to 0, but the later adjustment will
389 * get it right.
390 */
391 if (lshift) {
392 lshifted = LLVMBuildShl(builder, res,
393 lp_build_const_int_vec(gallivm, src_type,
394 lshift), "");
395 } else {
396 lshifted = res;
397 }
398
399 /*
400 * Align the most significant bit to the right.
401 */
402 rshifted = LLVMBuildLShr(builder, res,
403 lp_build_const_int_vec(gallivm, src_type, rshift),
404 "");
405
406 /*
407         * Subtract the MSB, shifted down to the LSB position, thereby rescaling
408         * from (1 << dst_width) to ((1 << dst_width) - 1).
409 */
410
411 res = LLVMBuildSub(builder, lshifted, rshifted, "");
412 }
413
414 return res;
415 }
416
417
418 /**
419 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
420 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
421 * return {float, float, float, float} with values in range [0, 1].
422 */
423 LLVMValueRef
424 lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
425 unsigned src_width,
426 struct lp_type dst_type,
427 LLVMValueRef src)
428 {
429 LLVMBuilderRef builder = gallivm->builder;
430 LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
431 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
432 LLVMValueRef bias_;
433 LLVMValueRef res;
434 unsigned mantissa;
435 unsigned n;
436 unsigned long long ubound;
437 unsigned long long mask;
438 double scale;
439 double bias;
440
441 assert(dst_type.floating);
442
443 mantissa = lp_mantissa(dst_type);
444
445 if (src_width <= (mantissa + 1)) {
446 /*
447      * The source width fits what can be represented in floating
448      * point (i.e., mantissa + 1 bits). So do a straight multiplication
449 * followed by casting. No further rounding is necessary.
450 */
451
452 scale = 1.0/(double)((1ULL << src_width) - 1);
453 res = LLVMBuildSIToFP(builder, src, vec_type, "");
454 res = LLVMBuildFMul(builder, res,
455 lp_build_const_vec(gallivm, dst_type, scale), "");
456 return res;
457 }
458 else {
459 /*
460 * The source width exceeds what can be represented in floating
461 * point. So truncate the incoming values.
462 */
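      /*
       * For example, for unorm32 sources and a f32 destination: n = 23, so
       * the incoming values are first shifted right by 32 - 23 = 9 bits.
       * OR-ing in the bias bit pattern (1.0f here, since mantissa - n = 0)
       * and bitcasting yields 1.0 + res/2^23; subtracting the bias leaves
       * just that fraction, and the final multiply by 2^23/(2^23 - 1)
       * stretches the maximum value back up to exactly 1.0.
       */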
463
464 n = MIN2(mantissa, src_width);
465
466 ubound = ((unsigned long long)1 << n);
467 mask = ubound - 1;
468 scale = (double)ubound/mask;
469 bias = (double)((unsigned long long)1 << (mantissa - n));
470
471 res = src;
472
473 if (src_width > mantissa) {
474 int shift = src_width - mantissa;
475 res = LLVMBuildLShr(builder, res,
476 lp_build_const_int_vec(gallivm, dst_type, shift), "");
477 }
478
479 bias_ = lp_build_const_vec(gallivm, dst_type, bias);
480
481 res = LLVMBuildOr(builder,
482 res,
483 LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");
484
485 res = LLVMBuildBitCast(builder, res, vec_type, "");
486
487 res = LLVMBuildFSub(builder, res, bias_, "");
488 res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
489 }
490
491 return res;
492 }
493
494
495 /**
496 * Pick a suitable num_dsts for lp_build_conv to ensure optimal cases are used.
497 *
498 * Returns the number of dsts created from src
499 */
500 int lp_build_conv_auto(struct gallivm_state *gallivm,
501 struct lp_type src_type,
502 struct lp_type* dst_type,
503 const LLVMValueRef *src,
504 unsigned num_srcs,
505 LLVMValueRef *dst)
506 {
507 int i;
508 int num_dsts = num_srcs;
509
510 if (src_type.floating == dst_type->floating &&
511 src_type.width == dst_type->width &&
512 src_type.length == dst_type->length &&
513 src_type.fixed == dst_type->fixed &&
514 src_type.norm == dst_type->norm &&
515 src_type.sign == dst_type->sign)
516 return num_dsts;
517
518 /* Special case 4x4f -> 1x16ub or 2x8f -> 1x16ub
519 */
520 if (src_type.floating == 1 &&
521 src_type.fixed == 0 &&
522 src_type.sign == 1 &&
523 src_type.norm == 0 &&
524 src_type.width == 32 &&
525
526 dst_type->floating == 0 &&
527 dst_type->fixed == 0 &&
528 dst_type->sign == 0 &&
529 dst_type->norm == 1 &&
530 dst_type->width == 8)
531 {
532 /* Special case 4x4f --> 1x16ub */
533 if (src_type.length == 4 && util_cpu_caps.has_sse2)
534 {
535 assert((num_srcs % 4) == 0);
536
537 num_dsts = num_srcs / 4;
538 dst_type->length = 16;
539
540 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
541 return num_dsts;
542 }
543
544 /* Special case 2x8f --> 1x16ub */
545 if (src_type.length == 8 && util_cpu_caps.has_avx)
546 {
547 assert((num_srcs % 2) == 0);
548
549 num_dsts = num_srcs / 2;
550 dst_type->length = 16;
551
552 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
553 return num_dsts;
554 }
555 }
556
557 /* lp_build_resize does not support M:N */
558 if (src_type.width == dst_type->width) {
559 lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
560 } else {
561 for (i = 0; i < num_srcs; ++i) {
562 lp_build_conv(gallivm, src_type, *dst_type, &src[i], 1, &dst[i], 1);
563 }
564 }
565
566 return num_dsts;
567 }
568
569
570 /**
571 * Generic type conversion.
572 *
573 * TODO: Take a precision argument, or even better, add a new precision member
574 * to the lp_type union.
575 */
576 void
577 lp_build_conv(struct gallivm_state *gallivm,
578 struct lp_type src_type,
579 struct lp_type dst_type,
580 const LLVMValueRef *src, unsigned num_srcs,
581 LLVMValueRef *dst, unsigned num_dsts)
582 {
583 LLVMBuilderRef builder = gallivm->builder;
584 struct lp_type tmp_type;
585 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
586 unsigned num_tmps;
587 unsigned i;
588
589    /* We must not lose or gain channels, only precision */
590 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
591
592 assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
593 assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
594 assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
595 assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
596
597 tmp_type = src_type;
598 for(i = 0; i < num_srcs; ++i) {
599 assert(lp_check_value(src_type, src[i]));
600 tmp[i] = src[i];
601 }
602 num_tmps = num_srcs;
603
604
605 /* Special case 4x4f --> 1x16ub
606 */
607 if (src_type.floating == 1 &&
608 src_type.fixed == 0 &&
609 src_type.sign == 1 &&
610 src_type.norm == 0 &&
611 src_type.width == 32 &&
612 src_type.length == 4 &&
613
614 dst_type.floating == 0 &&
615 dst_type.fixed == 0 &&
616 dst_type.sign == 0 &&
617 dst_type.norm == 1 &&
618 dst_type.width == 8 &&
619 dst_type.length == 16 &&
620
621 4 * num_dsts == num_srcs &&
622
623 util_cpu_caps.has_sse2)
624 {
625 struct lp_build_context bld;
626 struct lp_type int16_type = dst_type;
627 struct lp_type int32_type = dst_type;
628 LLVMValueRef const_255f;
629 unsigned i, j;
630
631 lp_build_context_init(&bld, gallivm, src_type);
632
633 int16_type.width *= 2;
634 int16_type.length /= 2;
635 int16_type.sign = 1;
636
637 int32_type.width *= 4;
638 int32_type.length /= 4;
639 int32_type.sign = 1;
640
641 const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
642
643 for (i = 0; i < num_dsts; ++i, src += 4) {
644 LLVMValueRef lo, hi;
645
646 for (j = 0; j < 4; ++j) {
647 tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
648 tmp[j] = lp_build_iround(&bld, tmp[j]);
649 }
650
651 /* relying on clamping behavior of sse2 intrinsics here */
652 lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
653 hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
654 dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
655 }
656
657 return;
658 }
659
660 /* Special case 2x8f --> 1x16ub
661 */
662 else if (src_type.floating == 1 &&
663 src_type.fixed == 0 &&
664 src_type.sign == 1 &&
665 src_type.norm == 0 &&
666 src_type.width == 32 &&
667 src_type.length == 8 &&
668
669 dst_type.floating == 0 &&
670 dst_type.fixed == 0 &&
671 dst_type.sign == 0 &&
672 dst_type.norm == 1 &&
673 dst_type.width == 8 &&
674 dst_type.length == 16 &&
675
676 2 * num_dsts == num_srcs &&
677
678 util_cpu_caps.has_avx) {
679
680 struct lp_build_context bld;
681 struct lp_type int16_type = dst_type;
682 struct lp_type int32_type = dst_type;
683 LLVMValueRef const_255f;
684 unsigned i;
685
686 lp_build_context_init(&bld, gallivm, src_type);
687
688 int16_type.width *= 2;
689 int16_type.length /= 2;
690 int16_type.sign = 1;
691
692 int32_type.width *= 4;
693 int32_type.length /= 4;
694 int32_type.sign = 1;
695
696 const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);
697
698 for (i = 0; i < num_dsts; ++i, src += 2) {
699 LLVMValueRef lo, hi, a, b;
700
701 a = LLVMBuildFMul(builder, src[0], const_255f, "");
702 b = LLVMBuildFMul(builder, src[1], const_255f, "");
703
704 a = lp_build_iround(&bld, a);
705 b = lp_build_iround(&bld, b);
706
707 tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
708 tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
709 tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
710 tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
711
712 /* relying on clamping behavior of sse2 intrinsics here */
713 lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
714 hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
715 dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
716 }
717 return;
718 }
719
720 /* Special case -> 16bit half-float
721 */
722 else if (dst_type.floating && dst_type.width == 16)
723 {
724 /* Only support src as 32bit float currently */
725 assert(src_type.floating && src_type.width == 32);
726
727 for(i = 0; i < num_tmps; ++i)
728 dst[i] = lp_build_float_to_half(gallivm, tmp[i]);
729
730 return;
731 }
732
733    /* Pre-convert half-floats to floats
734 */
735 else if (src_type.floating && src_type.width == 16)
736 {
737 for(i = 0; i < num_tmps; ++i)
738 tmp[i] = lp_build_half_to_float(gallivm, tmp[i]);
739
740 tmp_type.width = 32;
741 }
742
743 /*
744 * Clamp if necessary
745 */
746
747 if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
748 struct lp_build_context bld;
749 double src_min = lp_const_min(src_type);
750 double dst_min = lp_const_min(dst_type);
751 double src_max = lp_const_max(src_type);
752 double dst_max = lp_const_max(dst_type);
753 LLVMValueRef thres;
754
755 lp_build_context_init(&bld, gallivm, tmp_type);
756
757 if(src_min < dst_min) {
758 if(dst_min == 0.0)
759 thres = bld.zero;
760 else
761 thres = lp_build_const_vec(gallivm, src_type, dst_min);
762 for(i = 0; i < num_tmps; ++i)
763 tmp[i] = lp_build_max(&bld, tmp[i], thres);
764 }
765
766 if(src_max > dst_max) {
767 if(dst_max == 1.0)
768 thres = bld.one;
769 else
770 thres = lp_build_const_vec(gallivm, src_type, dst_max);
771 for(i = 0; i < num_tmps; ++i)
772 tmp[i] = lp_build_min(&bld, tmp[i], thres);
773 }
774 }
775
776 /*
777 * Scale to the narrowest range
778 */
779
780 if(dst_type.floating) {
781 /* Nothing to do */
782 }
783 else if(tmp_type.floating) {
784 if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
785 for(i = 0; i < num_tmps; ++i) {
786 tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
787 tmp_type,
788 dst_type.width,
789 tmp[i]);
790 }
791 tmp_type.floating = FALSE;
792 }
793 else {
794 double dst_scale = lp_const_scale(dst_type);
795 LLVMTypeRef tmp_vec_type;
796
797 if (dst_scale != 1.0) {
798 LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
799 for(i = 0; i < num_tmps; ++i)
800 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
801 }
802
803 /* Use an equally sized integer for intermediate computations */
804 tmp_type.floating = FALSE;
805 tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
806 for(i = 0; i < num_tmps; ++i) {
807 #if 0
808 if(dst_type.sign)
809 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
810 else
811 tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
812 #else
813 /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
814 tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
815 #endif
816 }
817 }
818 }
819 else {
820 unsigned src_shift = lp_const_shift(src_type);
821 unsigned dst_shift = lp_const_shift(dst_type);
822 unsigned src_offset = lp_const_offset(src_type);
823 unsigned dst_offset = lp_const_offset(dst_type);
824
825 /* Compensate for different offsets */
826 if (dst_offset > src_offset && src_type.width > dst_type.width) {
827 for (i = 0; i < num_tmps; ++i) {
828 LLVMValueRef shifted;
829 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - 1);
830 if(src_type.sign)
831 shifted = LLVMBuildAShr(builder, tmp[i], shift, "");
832 else
833 shifted = LLVMBuildLShr(builder, tmp[i], shift, "");
834
835 tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
836 }
837 }
838
839 if(src_shift > dst_shift) {
840 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
841 src_shift - dst_shift);
842 for(i = 0; i < num_tmps; ++i)
843 if(src_type.sign)
844 tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
845 else
846 tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
847 }
848 }
849
850 /*
851 * Truncate or expand bit width
852 *
853 * No data conversion should happen here, although the sign bits are
854 * crucial to avoid bad clamping.
855 */
856
857 {
858 struct lp_type new_type;
859
860 new_type = tmp_type;
861 new_type.sign = dst_type.sign;
862 new_type.width = dst_type.width;
863 new_type.length = dst_type.length;
864
865 lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);
866
867 tmp_type = new_type;
868 num_tmps = num_dsts;
869 }
870
871 /*
872 * Scale to the widest range
873 */
874
875 if(src_type.floating) {
876 /* Nothing to do */
877 }
878 else if(!src_type.floating && dst_type.floating) {
879 if(!src_type.fixed && !src_type.sign && src_type.norm) {
880 for(i = 0; i < num_tmps; ++i) {
881 tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
882 src_type.width,
883 dst_type,
884 tmp[i]);
885 }
886 tmp_type.floating = TRUE;
887 }
888 else {
889 double src_scale = lp_const_scale(src_type);
890 LLVMTypeRef tmp_vec_type;
891
892           /* Use an equally sized floating point type for intermediate computations */
893 tmp_type.floating = TRUE;
894 tmp_type.sign = TRUE;
895 tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
896 for(i = 0; i < num_tmps; ++i) {
897 #if 0
898 if(dst_type.sign)
899 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
900 else
901 tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
902 #else
903 /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
904 tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
905 #endif
906 }
907
908 if (src_scale != 1.0) {
909 LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
910 for(i = 0; i < num_tmps; ++i)
911 tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
912 }
913 }
914 }
915 else {
916 unsigned src_shift = lp_const_shift(src_type);
917 unsigned dst_shift = lp_const_shift(dst_type);
918 unsigned src_offset = lp_const_offset(src_type);
919 unsigned dst_offset = lp_const_offset(dst_type);
920
921 if (src_shift < dst_shift) {
922 LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
923 LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, dst_shift - src_shift);
924
925 for (i = 0; i < num_tmps; ++i) {
926 pre_shift[i] = tmp[i];
927 tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
928 }
929
930 /* Compensate for different offsets */
931 if (dst_offset > src_offset) {
932 for (i = 0; i < num_tmps; ++i) {
933 tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
934 }
935 }
936 }
937 }
938
939 for(i = 0; i < num_dsts; ++i) {
940 dst[i] = tmp[i];
941 assert(lp_check_value(dst_type, dst[i]));
942 }
943 }
944
945
946 /**
947 * Bit mask conversion.
948 *
949  * This will convert integer masks between the given types.
950  *
951  * The mask values should be 0 or -1, i.e., all bits either set to zero or one.
952 * Any other value will likely cause unpredictable results.
953 *
954 * This is basically a very trimmed down version of lp_build_conv.
955 */
956 void
957 lp_build_conv_mask(struct gallivm_state *gallivm,
958 struct lp_type src_type,
959 struct lp_type dst_type,
960 const LLVMValueRef *src, unsigned num_srcs,
961 LLVMValueRef *dst, unsigned num_dsts)
962 {
963
964    /* We must not lose or gain channels, only precision */
965 assert(src_type.length * num_srcs == dst_type.length * num_dsts);
966
967 /*
968     * Drop the floating/fixed/norm flags and treat the masks as plain
969     * signed integers.
970     *
971     * We assume all values are 0 or -1.
971 */
972
973 src_type.floating = FALSE;
974 src_type.fixed = FALSE;
975 src_type.sign = TRUE;
976 src_type.norm = FALSE;
977
978 dst_type.floating = FALSE;
979 dst_type.fixed = FALSE;
980 dst_type.sign = TRUE;
981 dst_type.norm = FALSE;
982
983 /*
984 * Truncate or expand bit width
985 */
986
987 lp_build_resize(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts);
988 }