/**************************************************************************
 *
 * Copyright 2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 **************************************************************************/

#include "util/u_debug.h"
#include "util/u_cpu_detect.h"
#include "util/u_math.h"
#include "lp_bld_debug.h"
#include "lp_bld_const.h"
#include "lp_bld_format.h"
#include "lp_bld_gather.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_type.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_pack.h"


/**
 * Get the pointer to one element from scatter positions in memory.
 *
 * @sa lp_build_gather()
 */
LLVMValueRef
lp_build_gather_elem_ptr(struct gallivm_state *gallivm,
                         unsigned length,
                         LLVMValueRef base_ptr,
                         LLVMValueRef offsets,
                         unsigned i)
{
   LLVMValueRef offset;
   LLVMValueRef ptr;

   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));

   if (length == 1) {
      assert(i == 0);
      offset = offsets;
   } else {
      LLVMValueRef index = lp_build_const_int32(gallivm, i);
      offset = LLVMBuildExtractElement(gallivm->builder, offsets, index, "");
   }

   ptr = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");

   return ptr;
}
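
/*
 * Note on lp_build_gather_elem_ptr: the offsets are byte offsets from
 * base_ptr (an i8 pointer), not element indices, so e.g. for i == 2 the
 * resulting pointer is simply base_ptr + offsets[2].
 */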


/**
 * Gather one element from scatter positions in memory.
 *
 * @sa lp_build_gather()
 */
LLVMValueRef
lp_build_gather_elem(struct gallivm_state *gallivm,
                     unsigned length,
                     unsigned src_width,
                     unsigned dst_width,
                     boolean aligned,
                     LLVMValueRef base_ptr,
                     LLVMValueRef offsets,
                     unsigned i,
                     boolean vector_justify)
{
   LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
   LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);
   LLVMValueRef ptr;
   LLVMValueRef res;

   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));

   ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
   ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
   res = LLVMBuildLoad(gallivm->builder, ptr, "");

   /*
    * On some archs we probably really want to avoid having to deal
    * with alignments lower than 4 bytes (if fetch size is a power of
    * two >= 32). On x86 it doesn't matter, however.
    * We should be able to guarantee full alignment for any kind of texture
    * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
    * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
    * but I don't think that's quite what we wanted).
    * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
    * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
    * enforcing what we want (which is what d3d10 does, the offset needs to
    * be aligned to element size, but GL has bytes regardless of element
    * size which would only leave us with minimum alignment restriction of 16
    * which doesn't make much sense if the type isn't 4x32bit). Due to
    * translation of offsets to first_elem in sampler_views it actually seems
    * gallium could not do anything else except 16 no matter what...
    */
   if (!aligned) {
      LLVMSetAlignment(res, 1);
   } else if (!util_is_power_of_two_or_zero(src_width)) {
      /*
       * Full alignment is impossible, assume the caller really meant
       * the individual elements were aligned (e.g. 3x32bit format).
       * And yes the generated code may otherwise crash, llvm will
       * really assume 128bit alignment with a 96bit fetch (I suppose
       * that makes sense as it can just assume the upper 32bit to be
       * whatever).
       * Maybe the caller should be able to explicitly set this, but
       * this should cover all the 3-channel formats.
       */
      if (((src_width / 24) * 24 == src_width) &&
           util_is_power_of_two_or_zero(src_width / 24)) {
         LLVMSetAlignment(res, src_width / 24);
      } else {
         LLVMSetAlignment(res, 1);
      }
   }
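
   /*
    * Illustration of the clause above: a 96bit (3x32bit) fetch gets
    * src_width / 24 = 4 byte alignment and a 48bit (3x16bit) fetch gets
    * 2 byte alignment, i.e. the per-element alignment rather than the
    * full-vector alignment llvm would otherwise assume.
    */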

   assert(src_width <= dst_width);
   if (src_width < dst_width) {
      res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
      if (vector_justify) {
#ifdef PIPE_ARCH_BIG_ENDIAN
         res = LLVMBuildShl(gallivm->builder, res,
                            LLVMConstInt(dst_elem_type, dst_width - src_width, 0), "");
#endif
      }
   }

   return res;
}
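
/*
 * Example (illustrative): gathering an 8bit value into a 32bit destination
 * zero-extends it into the low bits. With vector_justify on a big-endian
 * target, the extra shl moves it into the high byte instead, so a later
 * bitcast to a byte vector sees the value in element 0, matching the
 * little-endian layout.
 */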


/**
 * Gather one element from scatter positions in memory.
 * Nearly the same as above, however the individual elements
 * may be vectors themselves, and fetches may be float type.
 * Can also do pad vector instead of ZExt.
 *
 * @sa lp_build_gather()
 */
static LLVMValueRef
lp_build_gather_elem_vec(struct gallivm_state *gallivm,
                         unsigned length,
                         unsigned src_width,
                         LLVMTypeRef src_type,
                         struct lp_type dst_type,
                         boolean aligned,
                         LLVMValueRef base_ptr,
                         LLVMValueRef offsets,
                         unsigned i,
                         boolean vector_justify)
{
   LLVMValueRef ptr, res;
   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));

   ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
   ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, "");
   res = LLVMBuildLoad(gallivm->builder, ptr, "");

   /*
    * On some archs we probably really want to avoid having to deal
    * with alignments lower than 4 bytes (if fetch size is a power of
    * two >= 32). On x86 it doesn't matter, however.
    * We should be able to guarantee full alignment for any kind of texture
    * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
    * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
    * but I don't think that's quite what we wanted).
    * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
    * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
    * enforcing what we want (which is what d3d10 does, the offset needs to
    * be aligned to element size, but GL has bytes regardless of element
    * size which would only leave us with minimum alignment restriction of 16
    * which doesn't make much sense if the type isn't 4x32bit). Due to
    * translation of offsets to first_elem in sampler_views it actually seems
    * gallium could not do anything else except 16 no matter what...
    */
   if (!aligned) {
      LLVMSetAlignment(res, 1);
   } else if (!util_is_power_of_two_or_zero(src_width)) {
      /*
       * Full alignment is impossible, assume the caller really meant
       * the individual elements were aligned (e.g. 3x32bit format).
       * And yes the generated code may otherwise crash, llvm will
       * really assume 128bit alignment with a 96bit fetch (I suppose
       * that makes sense as it can just assume the upper 32bit to be
       * whatever).
       * Maybe the caller should be able to explicitly set this, but
       * this should cover all the 3-channel formats.
       */
      if (((src_width / 24) * 24 == src_width) &&
           util_is_power_of_two_or_zero(src_width / 24)) {
         LLVMSetAlignment(res, src_width / 24);
      } else {
         LLVMSetAlignment(res, 1);
      }
   }

   assert(src_width <= dst_type.width * dst_type.length);
   if (src_width < dst_type.width * dst_type.length) {
      if (dst_type.length > 1) {
         res = lp_build_pad_vector(gallivm, res, dst_type.length);
         /*
          * vector_justify hopefully a non-issue since we only deal
          * with src_width >= 32 here?
          */
      } else {
         LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type);

         /*
          * Only valid if src_ptr_type is int type...
          */
         res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");

#ifdef PIPE_ARCH_BIG_ENDIAN
         if (vector_justify) {
            res = LLVMBuildShl(gallivm->builder, res,
                               LLVMConstInt(dst_elem_type,
                                            dst_type.width - src_width, 0), "");
         }
         if (src_width == 48) {
            /* Load 3x16 bit vector.
             * The sequence of loads on big-endian hardware proceeds as follows.
             * 16-bit fields are denoted by X, Y, Z, and 0. In memory, the sequence
             * of three fields appears in the order X, Y, Z.
             *
             * Load 32-bit word: 0.0.X.Y
             * Load 16-bit halfword: 0.0.0.Z
             * Rotate left: 0.X.Y.0
             * Bitwise OR: 0.X.Y.Z
             *
             * The order in which we need the fields in the result is 0.Z.Y.X,
             * the same as on little-endian; permute 16-bit fields accordingly
             * within 64-bit register:
             */
            LLVMValueRef shuffles[4] = {
               lp_build_const_int32(gallivm, 2),
               lp_build_const_int32(gallivm, 1),
               lp_build_const_int32(gallivm, 0),
               lp_build_const_int32(gallivm, 3),
            };
            res = LLVMBuildBitCast(gallivm->builder, res,
                                   lp_build_vec_type(gallivm, lp_type_uint_vec(16, 4*16)), "");
            res = LLVMBuildShuffleVector(gallivm->builder, res, res,
                                         LLVMConstVector(shuffles, 4), "");
            res = LLVMBuildBitCast(gallivm->builder, res, dst_elem_type, "");
         }
#endif
      }
   }
   return res;
}


static LLVMValueRef
lp_build_gather_avx2(struct gallivm_state *gallivm,
                     unsigned length,
                     unsigned src_width,
                     struct lp_type dst_type,
                     LLVMValueRef base_ptr,
                     LLVMValueRef offsets)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef src_type, src_vec_type;
   LLVMValueRef res;
   struct lp_type res_type = dst_type;
   res_type.length *= length;

   if (dst_type.floating) {
      src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
                                   LLVMFloatTypeInContext(gallivm->context);
   } else {
      src_type = LLVMIntTypeInContext(gallivm->context, src_width);
   }
   src_vec_type = LLVMVectorType(src_type, length);

   /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));

   if (0) {
      /*
       * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
       * will not use the AVX2 gather intrinsics (even with llvm 4.0), at
       * least with Haswell. See
       * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
       * And the generated code doing the emulation is quite a bit worse
       * than what we get by doing it ourselves too.
       */
      LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
      LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
      LLVMTypeRef i1_type = LLVMIntTypeInContext(gallivm->context, 1);
      LLVMTypeRef i1_vec_type = LLVMVectorType(i1_type, length);
      LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
      LLVMValueRef src_ptr;

      base_ptr = LLVMBuildBitCast(builder, base_ptr, src_ptr_type, "");

      /* Rescale offsets from bytes to elements */
      LLVMValueRef scale = LLVMConstInt(i32_type, src_width/8, 0);
      scale = lp_build_broadcast(gallivm, i32_vec_type, scale);
      assert(LLVMTypeOf(offsets) == i32_vec_type);
      offsets = LLVMBuildSDiv(builder, offsets, scale, "");

      src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep");

      char intrinsic[64];
      snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
               length, dst_type.floating ? "f" : "i", src_width);
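      /* E.g. length == 4, floating, src_width == 32 gives "llvm.masked.gather.v4f32". */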
      LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
      LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
      LLVMValueRef passthru = LLVMGetUndef(src_vec_type);

      LLVMValueRef args[] = { src_ptr, alignment, mask, passthru };

      res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
   } else {
      LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
      const char *intrinsic = NULL;
      unsigned l_idx = 0;

      assert(src_width == 32 || src_width == 64);
      if (src_width == 32) {
         assert(length == 4 || length == 8);
      } else {
         assert(length == 2 || length == 4);
      }

      static const char *intrinsics[2][2][2] = {

         {{"llvm.x86.avx2.gather.d.d",
           "llvm.x86.avx2.gather.d.d.256"},
          {"llvm.x86.avx2.gather.d.q",
           "llvm.x86.avx2.gather.d.q.256"}},

         {{"llvm.x86.avx2.gather.d.ps",
           "llvm.x86.avx2.gather.d.ps.256"},
          {"llvm.x86.avx2.gather.d.pd",
           "llvm.x86.avx2.gather.d.pd.256"}},
      };

      if ((src_width == 32 && length == 8) ||
          (src_width == 64 && length == 4)) {
         l_idx = 1;
      }
      intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];
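      /*
       * E.g. a gather of 8 32bit floats selects
       * intrinsics[1][0][1] == "llvm.x86.avx2.gather.d.ps.256".
       */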

      LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
      LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
      mask = LLVMConstBitCast(mask, src_vec_type);
      LLVMValueRef scale = LLVMConstInt(i8_type, 1, 0);

      LLVMValueRef args[] = { passthru, base_ptr, offsets, mask, scale };

      res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
   }

   res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");

   return res;
}


/**
 * Gather elements from scatter positions in memory into a single vector.
 * Use for fetching texels from a texture.
 * For SSE, typical values are length=4, src_width=32, dst_width=32.
 *
 * When src_width < dst_width, the return value can be justified in
 * two ways:
 * "integer justification" is used when the caller treats the destination
 * as a packed integer bitmask, as described by the channels' "shift" and
 * "width" fields;
 * "vector justification" is used when the caller casts the destination
 * to a vector and needs channel X to be in vector element 0.
 *
 * @param length length of the offsets
 * @param src_width src element width in bits
 * @param dst_type result element type (src will be expanded to fit,
 *        but truncation is not allowed)
 *        (this may be a vector, must be pot sized)
 * @param aligned whether the data is guaranteed to be aligned (to src_width)
 * @param base_ptr base pointer, needs to be a i8 pointer type.
 * @param offsets vector with offsets
 * @param vector_justify select vector rather than integer justification
 */
LLVMValueRef
lp_build_gather(struct gallivm_state *gallivm,
                unsigned length,
                unsigned src_width,
                struct lp_type dst_type,
                boolean aligned,
                LLVMValueRef base_ptr,
                LLVMValueRef offsets,
                boolean vector_justify)
{
   LLVMValueRef res;
   boolean need_expansion = src_width < dst_type.width * dst_type.length;
   boolean vec_fetch;
   struct lp_type fetch_type, fetch_dst_type;
   LLVMTypeRef src_type;

   assert(src_width <= dst_type.width * dst_type.length);

   /*
    * This is quite a mess...
    * Figure out if the fetch should be done as:
    * a) scalar or vector
    * b) float or int
    *
    * As an example, for a 96bit fetch expanded into 4x32bit, it is better
    * to use (3x32bit) vector type (then pad the vector). Otherwise, the
    * zext will cause extra instructions.
    * However, the same isn't true for 3x16bit (the codegen for that is
    * completely worthless on x86 simd, and for 3x8bit it is way worse
    * still, don't try that... (To get really good code out of llvm for
    * these cases, the only way is to decompose the fetches manually
    * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter
    * case requires sse41, otherwise simple scalar zext is way better.
    * But probably not important enough, so don't bother.)
    * Also, we try to honor the floating bit of destination (but isn't
    * possible if caller asks for instance for 2x32bit dst_type with
    * 48bit fetch - the idea would be to use 3x16bit fetch, pad and
    * cast to 2x32f type, so the fetch is always int and on top of that
    * we avoid the vec pad and use scalar zext due to the above mentioned
    * problems).
    * Note this is optimized for x86 sse2 and up backend. Could be tweaked
    * for other archs if necessary...
    */
   if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
       (dst_type.length > 1)) {
      /* use vector fetch (if dst_type is vector) */
      vec_fetch = TRUE;
      if (dst_type.floating) {
         fetch_type = lp_type_float_vec(dst_type.width, src_width);
      } else {
         fetch_type = lp_type_int_vec(dst_type.width, src_width);
      }
      /* intentionally not using lp_build_vec_type here */
      src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
                                fetch_type.length);
      fetch_dst_type = fetch_type;
      fetch_dst_type.length = dst_type.length;
   } else {
      /* use scalar fetch */
      vec_fetch = FALSE;
      if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
         fetch_type = lp_type_float(src_width);
      } else {
         fetch_type = lp_type_int(src_width);
      }
      src_type = lp_build_vec_type(gallivm, fetch_type);
      fetch_dst_type = fetch_type;
      fetch_dst_type.width = dst_type.width * dst_type.length;
   }
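
   /*
    * Two illustrative cases of the selection above:
    * - src_width == 96, dst_type == 4 x float32: vector path, fetch_type is
    *   a 3 x float32 vector which later gets padded to 4 elements.
    * - src_width == 16, dst_type == 1 x uint32: scalar path, fetch_type is
    *   a 16bit int and fetch_dst_type is widened to 32bit for the zext.
    */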

   if (length == 1) {
      res = lp_build_gather_elem_vec(gallivm, length,
                                     src_width, src_type, fetch_dst_type,
                                     aligned, base_ptr, offsets, 0,
                                     vector_justify);
      return LLVMBuildBitCast(gallivm->builder, res,
                              lp_build_vec_type(gallivm, dst_type), "");
   /*
    * Excluding expansion from these paths because if you need it for
    * 32bit/64bit fetches you're doing it wrong (this is gather, not
    * conversion) and it would be awkward for floats.
    */
   } else if (util_cpu_caps.has_avx2 && !need_expansion &&
              src_width == 32 && (length == 4 || length == 8)) {
      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
                                  base_ptr, offsets);
   /*
    * This looks bad on paper wrt throughput/latency on Haswell.
    * Even on Broadwell it doesn't look stellar.
    * Albeit no measurements were done (but tested to work).
    * Should definitely enable on Skylake.
    * (In general, should be more of a win if the fetch is 256bit wide -
    * this is true for the 32bit case above too.)
    */
   } else if (0 && util_cpu_caps.has_avx2 && !need_expansion &&
              src_width == 64 && (length == 2 || length == 4)) {
      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
                                  base_ptr, offsets);
   } else {
      LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
      unsigned i;
      boolean vec_zext = FALSE;
      struct lp_type res_type, gather_res_type;
      LLVMTypeRef res_t, gather_res_t;

      res_type = fetch_dst_type;
      res_type.length *= length;
      gather_res_type = res_type;

      if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
         /*
          * Note that llvm is never able to optimize zext/insert combos
          * directly (i.e. zero the simd reg, then place the elements into
          * the appropriate place directly). (I think this has to do with
          * scalar/vector transition.) And scalar 16->32bit zext simd loads
          * aren't possible (instead loading to scalar reg first).
          * No idea about other archs...
          * We could do this manually, but instead we just use a vector
          * zext, which is simple enough (and, in fact, llvm might optimize
          * this away).
          * (We're not trying that with other bit widths as that might not be
          * easier, in particular with 8 bit values at least with only sse2.)
          */
         assert(vec_fetch == FALSE);
         gather_res_type.width /= 2;
         fetch_dst_type = fetch_type;
         src_type = lp_build_vec_type(gallivm, fetch_type);
         vec_zext = TRUE;
      }
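      /*
       * E.g. for length == 4 this means the four 16bit fetches are first
       * collected into a 4 x i16 vector and then widened to 4 x i32 with a
       * single vector zext below, instead of four scalar zexts.
       */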
      res_t = lp_build_vec_type(gallivm, res_type);
      gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
      res = LLVMGetUndef(gather_res_t);
      for (i = 0; i < length; ++i) {
         LLVMValueRef index = lp_build_const_int32(gallivm, i);
         elems[i] = lp_build_gather_elem_vec(gallivm, length,
                                             src_width, src_type, fetch_dst_type,
                                             aligned, base_ptr, offsets, i,
                                             vector_justify);
         if (!vec_fetch) {
            res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
         }
      }

      if (vec_zext) {
         res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
         if (vector_justify) {
#ifdef PIPE_ARCH_BIG_ENDIAN
            unsigned sv = dst_type.width - src_width;
            res = LLVMBuildShl(gallivm->builder, res,
                               lp_build_const_int_vec(gallivm, res_type, sv), "");
#endif
         }
      }

      if (vec_fetch) {
         /*
          * Do bitcast now otherwise llvm might get some funny ideas wrt
          * float/int types.
          */
         for (i = 0; i < length; i++) {
            elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
                                        lp_build_vec_type(gallivm, dst_type), "");
         }
         res = lp_build_concat(gallivm, elems, dst_type, length);
      } else {
         struct lp_type really_final_type = dst_type;
         assert(res_type.length * res_type.width ==
                dst_type.length * dst_type.width * length);
         really_final_type.length *= length;
         res = LLVMBuildBitCast(gallivm->builder, res,
                                lp_build_vec_type(gallivm, really_final_type), "");
      }
   }

   return res;
}
LLVMValueRef
lp_build_gather_values(struct gallivm_state * gallivm,
                       LLVMValueRef * values,
                       unsigned value_count)
{
   LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[0]), value_count);
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef vec = LLVMGetUndef(vec_type);
   unsigned i;

   for (i = 0; i < value_count; i++) {
      LLVMValueRef index = lp_build_const_int32(gallivm, i);
      vec = LLVMBuildInsertElement(builder, vec, values[i], index, "");
   }
   return vec;
}