1 /**************************************************************************
3 * Copyright 2010-2018 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 * The above copyright notice and this permission notice (including the
23 * next paragraph) shall be included in all copies or substantial portions
26 **************************************************************************/
31 * s3tc pixel format manipulation.
33 * @author Roland Scheidegger <sroland@vmware.com>
37 #include "util/u_format.h"
38 #include "util/u_math.h"
39 #include "util/u_string.h"
40 #include "util/u_cpu_detect.h"
41 #include "util/u_debug.h"
43 #include "lp_bld_arit.h"
44 #include "lp_bld_type.h"
45 #include "lp_bld_const.h"
46 #include "lp_bld_conv.h"
47 #include "lp_bld_gather.h"
48 #include "lp_bld_format.h"
49 #include "lp_bld_logic.h"
50 #include "lp_bld_pack.h"
51 #include "lp_bld_flow.h"
52 #include "lp_bld_printf.h"
53 #include "lp_bld_struct.h"
54 #include "lp_bld_swizzle.h"
55 #include "lp_bld_init.h"
56 #include "lp_bld_debug.h"
57 #include "lp_bld_intr.h"
61 * Reverse an interleave2_half
62 * (ie. pick every second element, independent lower/upper halfs)
63 * sse2 can only do that with 32bit (shufps) or larger elements
64 * natively. (Otherwise, and/pack (even) or shift/pack (odd)
65 * could be used, ideally llvm would do that for us.)
66 * XXX: Unfortunately, this does NOT translate to a shufps if those
67 * are int vectors (and casting will not help, llvm needs to recognize it
68 * as "real" float). Instead, llvm will use a pshufd/pshufd/punpcklqdq
69 * sequence which I'm pretty sure is a lot worse despite domain transition
70 * penalties with shufps (except maybe on Nehalem).
73 lp_build_uninterleave2_half(struct gallivm_state
*gallivm
,
79 LLVMValueRef shuffle
, elems
[LP_MAX_VECTOR_LENGTH
];
82 assert(type
.length
<= LP_MAX_VECTOR_LENGTH
);
85 if (type
.length
* type
.width
== 256) {
86 assert(type
.length
>= 4);
87 for (i
= 0, j
= 0; i
< type
.length
; ++i
) {
88 if (i
== type
.length
/ 4) {
90 } else if (i
== type
.length
/ 2) {
92 } else if (i
== 3 * type
.length
/ 4) {
93 j
= 3 * type
.length
/ 4;
97 elems
[i
] = lp_build_const_int32(gallivm
, j
+ lo_hi
);
100 for (i
= 0; i
< type
.length
; ++i
) {
101 elems
[i
] = lp_build_const_int32(gallivm
, 2*i
+ lo_hi
);
105 shuffle
= LLVMConstVector(elems
, type
.length
);
107 return LLVMBuildShuffleVector(gallivm
->builder
, a
, b
, shuffle
, "");
113 * Build shuffle for extending vectors.
116 lp_build_const_extend_shuffle(struct gallivm_state
*gallivm
,
117 unsigned n
, unsigned length
)
119 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
123 assert(length
<= LP_MAX_VECTOR_LENGTH
);
125 /* TODO: cache results in a static table */
127 for(i
= 0; i
< n
; i
++) {
128 elems
[i
] = lp_build_const_int32(gallivm
, i
);
130 for (i
= n
; i
< length
; i
++) {
131 elems
[i
] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm
->context
));
134 return LLVMConstVector(elems
, length
);
138 lp_build_const_unpackx2_shuffle(struct gallivm_state
*gallivm
, unsigned n
)
140 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
143 assert(n
<= LP_MAX_VECTOR_LENGTH
);
145 /* TODO: cache results in a static table */
147 for(i
= 0, j
= 0; i
< n
; i
+= 2, ++j
) {
148 elems
[i
+ 0] = lp_build_const_int32(gallivm
, 0 + j
);
149 elems
[i
+ 1] = lp_build_const_int32(gallivm
, n
+ j
);
150 elems
[n
+ i
+ 0] = lp_build_const_int32(gallivm
, 0 + n
/2 + j
);
151 elems
[n
+ i
+ 1] = lp_build_const_int32(gallivm
, n
+ n
/2 + j
);
154 return LLVMConstVector(elems
, n
* 2);
158 * broadcast 1 element to all elements
161 lp_build_const_shuffle1(struct gallivm_state
*gallivm
,
162 unsigned index
, unsigned n
)
164 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
167 assert(n
<= LP_MAX_VECTOR_LENGTH
);
169 /* TODO: cache results in a static table */
171 for (i
= 0; i
< n
; i
++) {
172 elems
[i
] = lp_build_const_int32(gallivm
, index
);
175 return LLVMConstVector(elems
, n
);
179 * move 1 element to pos 0, rest undef
182 lp_build_shuffle1undef(struct gallivm_state
*gallivm
,
183 LLVMValueRef a
, unsigned index
, unsigned n
)
185 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
], shuf
;
188 assert(n
<= LP_MAX_VECTOR_LENGTH
);
190 elems
[0] = lp_build_const_int32(gallivm
, index
);
192 for (i
= 1; i
< n
; i
++) {
193 elems
[i
] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm
->context
));
195 shuf
= LLVMConstVector(elems
, n
);
197 return LLVMBuildShuffleVector(gallivm
->builder
, a
, a
, shuf
, "");
201 format_dxt1_variant(enum pipe_format format
)
203 return format
== PIPE_FORMAT_DXT1_RGB
||
204 format
== PIPE_FORMAT_DXT1_RGBA
||
205 format
== PIPE_FORMAT_DXT1_SRGB
||
206 format
== PIPE_FORMAT_DXT1_SRGBA
;
211 * Gather elements from scatter positions in memory into vectors.
212 * This is customised for fetching texels from s3tc textures.
213 * For SSE, typical value is length=4.
215 * @param length length of the offsets
216 * @param colors the stored colors of the blocks will be extracted into this.
217 * @param codewords the codewords of the blocks will be extracted into this.
218 * @param alpha_lo used for storing lower 32bit of alpha components for dxt3/5
219 * @param alpha_hi used for storing higher 32bit of alpha components for dxt3/5
220 * @param base_ptr base pointer, should be a i8 pointer type.
221 * @param offsets vector with offsets
224 lp_build_gather_s3tc(struct gallivm_state
*gallivm
,
226 const struct util_format_description
*format_desc
,
227 LLVMValueRef
*colors
,
228 LLVMValueRef
*codewords
,
229 LLVMValueRef
*alpha_lo
,
230 LLVMValueRef
*alpha_hi
,
231 LLVMValueRef base_ptr
,
232 LLVMValueRef offsets
)
234 LLVMBuilderRef builder
= gallivm
->builder
;
235 unsigned block_bits
= format_desc
->block
.bits
;
237 LLVMValueRef elems
[8];
238 LLVMTypeRef type32
= LLVMInt32TypeInContext(gallivm
->context
);
239 LLVMTypeRef type64
= LLVMInt64TypeInContext(gallivm
->context
);
240 LLVMTypeRef type32dxt
;
241 struct lp_type lp_type32dxt
;
243 memset(&lp_type32dxt
, 0, sizeof lp_type32dxt
);
244 lp_type32dxt
.width
= 32;
245 lp_type32dxt
.length
= block_bits
/ 32;
246 type32dxt
= lp_build_vec_type(gallivm
, lp_type32dxt
);
248 assert(block_bits
== 64 || block_bits
== 128);
249 assert(length
== 1 || length
== 4 || length
== 8);
251 for (i
= 0; i
< length
; ++i
) {
252 elems
[i
] = lp_build_gather_elem(gallivm
, length
,
253 block_bits
, block_bits
, TRUE
,
254 base_ptr
, offsets
, i
, FALSE
);
255 elems
[i
] = LLVMBuildBitCast(builder
, elems
[i
], type32dxt
, "");
258 LLVMValueRef elem
= elems
[0];
259 if (block_bits
== 128) {
260 *alpha_lo
= LLVMBuildExtractElement(builder
, elem
,
261 lp_build_const_int32(gallivm
, 0), "");
262 *alpha_hi
= LLVMBuildExtractElement(builder
, elem
,
263 lp_build_const_int32(gallivm
, 1), "");
264 *colors
= LLVMBuildExtractElement(builder
, elem
,
265 lp_build_const_int32(gallivm
, 2), "");
266 *codewords
= LLVMBuildExtractElement(builder
, elem
,
267 lp_build_const_int32(gallivm
, 3), "");
270 *alpha_lo
= LLVMGetUndef(type32
);
271 *alpha_hi
= LLVMGetUndef(type32
);
272 *colors
= LLVMBuildExtractElement(builder
, elem
,
273 lp_build_const_int32(gallivm
, 0), "");
274 *codewords
= LLVMBuildExtractElement(builder
, elem
,
275 lp_build_const_int32(gallivm
, 1), "");
279 LLVMValueRef tmp
[4], cc01
, cc23
;
280 struct lp_type lp_type32
, lp_type64
, lp_type32dxt
;
281 memset(&lp_type32
, 0, sizeof lp_type32
);
282 lp_type32
.width
= 32;
283 lp_type32
.length
= length
;
284 memset(&lp_type64
, 0, sizeof lp_type64
);
285 lp_type64
.width
= 64;
286 lp_type64
.length
= length
/2;
288 if (block_bits
== 128) {
290 for (i
= 0; i
< 4; ++i
) {
293 elems
[i
] = lp_build_concat(gallivm
, tmp
, lp_type32dxt
, 2);
296 lp_build_transpose_aos(gallivm
, lp_type32
, elems
, tmp
);
302 LLVMTypeRef type64_vec
= LLVMVectorType(type64
, length
/2);
303 LLVMTypeRef type32_vec
= LLVMVectorType(type32
, length
);
305 for (i
= 0; i
< length
; ++i
) {
307 elems
[i
] = LLVMBuildShuffleVector(builder
, elems
[i
],
308 LLVMGetUndef(type32dxt
),
309 lp_build_const_extend_shuffle(gallivm
, 2, 4), "");
312 for (i
= 0; i
< 4; ++i
) {
315 elems
[i
] = lp_build_concat(gallivm
, tmp
, lp_type32
, 2);
318 cc01
= lp_build_interleave2_half(gallivm
, lp_type32
, elems
[0], elems
[1], 0);
319 cc23
= lp_build_interleave2_half(gallivm
, lp_type32
, elems
[2], elems
[3], 0);
320 cc01
= LLVMBuildBitCast(builder
, cc01
, type64_vec
, "");
321 cc23
= LLVMBuildBitCast(builder
, cc23
, type64_vec
, "");
322 *colors
= lp_build_interleave2_half(gallivm
, lp_type64
, cc01
, cc23
, 0);
323 *codewords
= lp_build_interleave2_half(gallivm
, lp_type64
, cc01
, cc23
, 1);
324 *colors
= LLVMBuildBitCast(builder
, *colors
, type32_vec
, "");
325 *codewords
= LLVMBuildBitCast(builder
, *codewords
, type32_vec
, "");
330 /** Convert from <n x i32> containing 2 x n rgb565 colors
331 * to 2 <n x i32> rgba8888 colors
332 * This is the most optimized version I can think of
333 * should be nearly as fast as decoding only one color
334 * NOTE: alpha channel will be set to 0
335 * @param colors is a <n x i32> vector containing the rgb565 colors
338 color_expand2_565_to_8888(struct gallivm_state
*gallivm
,
341 LLVMValueRef
*color0
,
342 LLVMValueRef
*color1
)
344 LLVMBuilderRef builder
= gallivm
->builder
;
345 LLVMValueRef r
, g
, b
, rblo
, glo
;
346 LLVMValueRef rgblomask
, rb
, rgb0
, rgb1
;
347 struct lp_type type
, type16
, type8
;
351 memset(&type
, 0, sizeof type
);
355 memset(&type16
, 0, sizeof type16
);
357 type16
.length
= 2 * n
;
359 memset(&type8
, 0, sizeof type8
);
361 type8
.length
= 4 * n
;
363 rgblomask
= lp_build_const_int_vec(gallivm
, type16
, 0x0707);
364 colors
= LLVMBuildBitCast(builder
, colors
,
365 lp_build_vec_type(gallivm
, type16
), "");
366 /* move r into low 8 bits, b into high 8 bits, g into another reg (low bits)
367 * make sure low bits of r are zero - could use AND but requires constant */
368 r
= LLVMBuildLShr(builder
, colors
, lp_build_const_int_vec(gallivm
, type16
, 11), "");
369 r
= LLVMBuildShl(builder
, r
, lp_build_const_int_vec(gallivm
, type16
, 3), "");
370 b
= LLVMBuildShl(builder
, colors
, lp_build_const_int_vec(gallivm
, type16
, 11), "");
371 rb
= LLVMBuildOr(builder
, r
, b
, "");
372 rblo
= LLVMBuildLShr(builder
, rb
, lp_build_const_int_vec(gallivm
, type16
, 5), "");
373 /* don't have byte shift hence need mask */
374 rblo
= LLVMBuildAnd(builder
, rblo
, rgblomask
, "");
375 rb
= LLVMBuildOr(builder
, rb
, rblo
, "");
377 /* make sure low bits of g are zero */
378 g
= LLVMBuildAnd(builder
, colors
, lp_build_const_int_vec(gallivm
, type16
, 0x07e0), "");
379 g
= LLVMBuildLShr(builder
, g
, lp_build_const_int_vec(gallivm
, type16
, 3), "");
380 glo
= LLVMBuildLShr(builder
, g
, lp_build_const_int_vec(gallivm
, type16
, 6), "");
381 g
= LLVMBuildOr(builder
, g
, glo
, "");
383 rb
= LLVMBuildBitCast(builder
, rb
, lp_build_vec_type(gallivm
, type8
), "");
384 g
= LLVMBuildBitCast(builder
, g
, lp_build_vec_type(gallivm
, type8
), "");
385 rgb0
= lp_build_interleave2_half(gallivm
, type8
, rb
, g
, 0);
386 rgb1
= lp_build_interleave2_half(gallivm
, type8
, rb
, g
, 1);
388 rgb0
= LLVMBuildBitCast(builder
, rgb0
, lp_build_vec_type(gallivm
, type
), "");
389 rgb1
= LLVMBuildBitCast(builder
, rgb1
, lp_build_vec_type(gallivm
, type
), "");
391 /* rgb0 is rgb00, rgb01, rgb10, rgb11
392 * instead of rgb00, rgb10, rgb20, rgb30 hence need reshuffle
393 * on x86 this _should_ just generate one shufps...
395 *color0
= lp_build_uninterleave2_half(gallivm
, type
, rgb0
, rgb1
, 0);
396 *color1
= lp_build_uninterleave2_half(gallivm
, type
, rgb0
, rgb1
, 1);
400 /** Convert from <n x i32> containing rgb565 colors
401 * (in first 16 bits) to <n x i32> rgba8888 colors
403 * NOTE: alpha channel will be set to 0
404 * @param colors is a <n x i32> vector containing the rgb565 colors
407 color_expand_565_to_8888(struct gallivm_state
*gallivm
,
411 LLVMBuilderRef builder
= gallivm
->builder
;
412 LLVMValueRef rgba
, r
, g
, b
, rgblo
, glo
;
413 LLVMValueRef rbhimask
, g6mask
, rgblomask
;
415 memset(&type
, 0, sizeof type
);
420 * first extract and shift colors into their final locations
421 * (high bits - low bits zero at this point)
422 * then replicate highest bits to the lowest bits
423 * note rb replication can be done in parallel but not g
425 * r5mask = 0xf800, g6mask = 0x07e0, b5mask = 0x001f
426 * rhigh = 8, ghigh = 5, bhigh = 19
427 * rblow = 5, glow = 6
428 * rgblowmask = 0x00070307
429 * r = colors >> rhigh
430 * b = colors << bhigh
431 * g = (colors & g6mask) << ghigh
432 * rb = (r | b) rbhimask
433 * rbtmp = rb >> rblow
435 * rbtmp = rbtmp | gtmp
436 * rbtmp = rbtmp & rgblowmask
437 * rgb = rb | g | rbtmp
439 g6mask
= lp_build_const_int_vec(gallivm
, type
, 0x07e0);
440 rbhimask
= lp_build_const_int_vec(gallivm
, type
, 0x00f800f8);
441 rgblomask
= lp_build_const_int_vec(gallivm
, type
, 0x00070307);
443 r
= LLVMBuildLShr(builder
, colors
, lp_build_const_int_vec(gallivm
, type
, 8), "");
444 b
= LLVMBuildShl(builder
, colors
, lp_build_const_int_vec(gallivm
, type
, 19), "");
445 g
= LLVMBuildAnd(builder
, colors
, g6mask
, "");
446 g
= LLVMBuildShl(builder
, g
, lp_build_const_int_vec(gallivm
, type
, 5), "");
447 rgba
= LLVMBuildOr(builder
, r
, b
, "");
448 rgba
= LLVMBuildAnd(builder
, rgba
, rbhimask
, "");
449 rgblo
= LLVMBuildLShr(builder
, rgba
, lp_build_const_int_vec(gallivm
, type
, 5), "");
450 glo
= LLVMBuildLShr(builder
, g
, lp_build_const_int_vec(gallivm
, type
, 6), "");
451 rgblo
= LLVMBuildOr(builder
, rgblo
, glo
, "");
452 rgblo
= LLVMBuildAnd(builder
, rgblo
, rgblomask
, "");
453 rgba
= LLVMBuildOr(builder
, rgba
, g
, "");
454 rgba
= LLVMBuildOr(builder
, rgba
, rgblo
, "");
461 * Average two byte vectors. (Will always round up.)
464 lp_build_pavgb(struct lp_build_context
*bld8
,
468 struct gallivm_state
*gallivm
= bld8
->gallivm
;
469 LLVMBuilderRef builder
= gallivm
->builder
;
470 assert(bld8
->type
.width
== 8);
471 assert(bld8
->type
.length
== 16 || bld8
->type
.length
== 32);
472 if (HAVE_LLVM
< 0x0600) {
473 LLVMValueRef intrargs
[2];
474 char *intr_name
= bld8
->type
.length
== 32 ? "llvm.x86.avx2.pavg.b" :
475 "llvm.x86.sse2.pavg.b";
478 return lp_build_intrinsic(builder
, intr_name
,
479 bld8
->vec_type
, intrargs
, 2, 0);
482 * Must match llvm's autoupgrade of pavg.b intrinsic to be useful.
483 * You better hope the backend code manages to detect the pattern, and
484 * the pattern doesn't change there...
486 struct lp_type type_ext
= bld8
->type
;
487 LLVMTypeRef vec_type_ext
;
489 LLVMValueRef ext_one
;
491 vec_type_ext
= lp_build_vec_type(gallivm
, type_ext
);
492 ext_one
= lp_build_const_vec(gallivm
, type_ext
, 1);
494 v0
= LLVMBuildZExt(builder
, v0
, vec_type_ext
, "");
495 v1
= LLVMBuildZExt(builder
, v1
, vec_type_ext
, "");
496 res
= LLVMBuildAdd(builder
, v0
, v1
, "");
497 res
= LLVMBuildAdd(builder
, res
, ext_one
, "");
498 res
= LLVMBuildLShr(builder
, res
, ext_one
, "");
499 res
= LLVMBuildTrunc(builder
, res
, bld8
->vec_type
, "");
505 * Calculate 1/3(v1-v0) + v0
506 * and 2*1/3(v1-v0) + v0
509 lp_build_lerp23(struct lp_build_context
*bld
,
515 struct gallivm_state
*gallivm
= bld
->gallivm
;
516 LLVMValueRef x
, x_lo
, x_hi
, delta_lo
, delta_hi
;
517 LLVMValueRef mul_lo
, mul_hi
, v0_lo
, v0_hi
, v1_lo
, v1_hi
, tmp
;
518 const struct lp_type type
= bld
->type
;
519 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
520 struct lp_type i16_type
= lp_wider_type(type
);
521 struct lp_build_context bld2
;
523 assert(lp_check_value(type
, v0
));
524 assert(lp_check_value(type
, v1
));
525 assert(!type
.floating
&& !type
.fixed
&& !type
.norm
&& type
.width
== 8);
527 lp_build_context_init(&bld2
, gallivm
, i16_type
);
528 bld2
.type
.sign
= TRUE
;
529 x
= lp_build_const_int_vec(gallivm
, bld
->type
, 255*1/3);
531 /* FIXME: use native avx256 unpack/pack */
532 lp_build_unpack2(gallivm
, type
, i16_type
, x
, &x_lo
, &x_hi
);
533 lp_build_unpack2(gallivm
, type
, i16_type
, v0
, &v0_lo
, &v0_hi
);
534 lp_build_unpack2(gallivm
, type
, i16_type
, v1
, &v1_lo
, &v1_hi
);
535 delta_lo
= lp_build_sub(&bld2
, v1_lo
, v0_lo
);
536 delta_hi
= lp_build_sub(&bld2
, v1_hi
, v0_hi
);
538 mul_lo
= LLVMBuildMul(builder
, x_lo
, delta_lo
, "");
539 mul_hi
= LLVMBuildMul(builder
, x_hi
, delta_hi
, "");
541 x_lo
= LLVMBuildLShr(builder
, mul_lo
, lp_build_const_int_vec(gallivm
, i16_type
, 8), "");
542 x_hi
= LLVMBuildLShr(builder
, mul_hi
, lp_build_const_int_vec(gallivm
, i16_type
, 8), "");
543 /* lerp optimization: pack now, do add afterwards */
544 tmp
= lp_build_pack2(gallivm
, i16_type
, type
, x_lo
, x_hi
);
545 *res0
= lp_build_add(bld
, tmp
, v0
);
547 x_lo
= LLVMBuildLShr(builder
, mul_lo
, lp_build_const_int_vec(gallivm
, i16_type
, 7), "");
548 x_hi
= LLVMBuildLShr(builder
, mul_hi
, lp_build_const_int_vec(gallivm
, i16_type
, 7), "");
549 /* unlike above still need mask (but add still afterwards). */
550 x_lo
= LLVMBuildAnd(builder
, x_lo
, lp_build_const_int_vec(gallivm
, i16_type
, 0xff), "");
551 x_hi
= LLVMBuildAnd(builder
, x_hi
, lp_build_const_int_vec(gallivm
, i16_type
, 0xff), "");
552 tmp
= lp_build_pack2(gallivm
, i16_type
, type
, x_lo
, x_hi
);
553 *res1
= lp_build_add(bld
, tmp
, v0
);
557 * Convert from <n x i64> s3tc dxt1 to <4n x i8> RGBA AoS
558 * @param colors is a <n x i32> vector with n x 2x16bit colors
559 * @param codewords is a <n x i32> vector containing the codewords
560 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
561 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
564 s3tc_dxt1_full_to_rgba_aos(struct gallivm_state
*gallivm
,
566 enum pipe_format format
,
568 LLVMValueRef codewords
,
572 LLVMBuilderRef builder
= gallivm
->builder
;
573 LLVMValueRef color0
, color1
, color2
, color3
, color2_2
, color3_2
;
574 LLVMValueRef rgba
, a
, colors0
, colors1
, col0
, col1
, const2
;
575 LLVMValueRef bit_pos
, sel_mask
, sel_lo
, sel_hi
, indices
;
576 struct lp_type type
, type8
;
577 struct lp_build_context bld8
, bld32
;
578 boolean is_dxt1_variant
= format_dxt1_variant(format
);
580 memset(&type
, 0, sizeof type
);
584 memset(&type8
, 0, sizeof type8
);
588 assert(lp_check_value(type
, i
));
589 assert(lp_check_value(type
, j
));
591 a
= lp_build_const_int_vec(gallivm
, type
, 0xff000000);
593 lp_build_context_init(&bld32
, gallivm
, type
);
594 lp_build_context_init(&bld8
, gallivm
, type8
);
598 * - expand color0/color1 to rgba8888
599 * - calculate color2/3 (interpolation) according to color0 < color1 rules
600 * - calculate color2/3 according to color0 >= color1 rules
601 * - do selection of color2/3 according to comparison of color0/1
602 * - extract indices (vector shift).
603 * - use compare/select to select the correct color. Since we have 2bit
604 * indices (and 4 colors), needs at least three compare/selects.
607 * expand the two colors
609 col0
= LLVMBuildAnd(builder
, colors
, lp_build_const_int_vec(gallivm
, type
, 0x0000ffff), "");
610 col1
= LLVMBuildLShr(builder
, colors
, lp_build_const_int_vec(gallivm
, type
, 16), "");
612 color_expand2_565_to_8888(gallivm
, n
, colors
, &color0
, &color1
);
615 color0
= color_expand_565_to_8888(gallivm
, n
, col0
);
616 color1
= color_expand_565_to_8888(gallivm
, n
, col1
);
621 * color2_1 is 2/3 color0 + 1/3 color1
622 * color3_1 is 1/3 color0 + 2/3 color1
623 * color2_2 is 1/2 color0 + 1/2 color1
627 colors0
= LLVMBuildBitCast(builder
, color0
, bld8
.vec_type
, "");
628 colors1
= LLVMBuildBitCast(builder
, color1
, bld8
.vec_type
, "");
629 /* can combine 2 lerps into one mostly - still looks expensive enough. */
630 lp_build_lerp23(&bld8
, colors0
, colors1
, &color2
, &color3
);
631 color2
= LLVMBuildBitCast(builder
, color2
, bld32
.vec_type
, "");
632 color3
= LLVMBuildBitCast(builder
, color3
, bld32
.vec_type
, "");
634 /* dxt3/5 always use 4-color encoding */
635 if (is_dxt1_variant
) {
637 if (format
== PIPE_FORMAT_DXT1_RGBA
||
638 format
== PIPE_FORMAT_DXT1_SRGBA
) {
639 color0
= LLVMBuildOr(builder
, color0
, a
, "");
640 color1
= LLVMBuildOr(builder
, color1
, a
, "");
641 color3
= LLVMBuildOr(builder
, color3
, a
, "");
644 * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1.
645 * Much cheaper (but we don't care that much if n == 1).
647 if ((util_cpu_caps
.has_sse2
&& n
== 4) ||
648 (util_cpu_caps
.has_avx2
&& n
== 8)) {
649 color2_2
= lp_build_pavgb(&bld8
, colors0
, colors1
);
650 color2_2
= LLVMBuildBitCast(builder
, color2_2
, bld32
.vec_type
, "");
653 struct lp_type i16_type
= lp_wider_type(type8
);
654 struct lp_build_context bld2
;
655 LLVMValueRef v0_lo
, v0_hi
, v1_lo
, v1_hi
, addlo
, addhi
;
657 lp_build_context_init(&bld2
, gallivm
, i16_type
);
658 bld2
.type
.sign
= TRUE
;
661 * This isn't as expensive as it looks (the unpack is the same as
662 * for lerp23), with correct rounding.
663 * (Note that while rounding is correct, this will always round down,
664 * whereas pavgb will always round up.)
666 /* FIXME: use native avx256 unpack/pack */
667 lp_build_unpack2(gallivm
, type8
, i16_type
, colors0
, &v0_lo
, &v0_hi
);
668 lp_build_unpack2(gallivm
, type8
, i16_type
, colors1
, &v1_lo
, &v1_hi
);
670 addlo
= lp_build_add(&bld2
, v0_lo
, v1_lo
);
671 addhi
= lp_build_add(&bld2
, v0_hi
, v1_hi
);
672 addlo
= LLVMBuildLShr(builder
, addlo
,
673 lp_build_const_int_vec(gallivm
, i16_type
, 1), "");
674 addhi
= LLVMBuildLShr(builder
, addhi
,
675 lp_build_const_int_vec(gallivm
, i16_type
, 1), "");
676 color2_2
= lp_build_pack2(gallivm
, i16_type
, type8
, addlo
, addhi
);
677 color2_2
= LLVMBuildBitCast(builder
, color2_2
, bld32
.vec_type
, "");
679 color3_2
= lp_build_const_int_vec(gallivm
, type
, 0);
681 /* select between colors2/3 */
682 /* signed compare is faster saves some xors */
684 sel_mask
= lp_build_compare(gallivm
, type
, PIPE_FUNC_GREATER
, col0
, col1
);
685 color2
= lp_build_select(&bld32
, sel_mask
, color2
, color2_2
);
686 color3
= lp_build_select(&bld32
, sel_mask
, color3
, color3_2
);
689 if (format
== PIPE_FORMAT_DXT1_RGBA
||
690 format
== PIPE_FORMAT_DXT1_SRGBA
) {
691 color2
= LLVMBuildOr(builder
, color2
, a
, "");
695 const2
= lp_build_const_int_vec(gallivm
, type
, 2);
696 /* extract 2-bit index values */
697 bit_pos
= LLVMBuildShl(builder
, j
, const2
, "");
698 bit_pos
= LLVMBuildAdd(builder
, bit_pos
, i
, "");
699 bit_pos
= LLVMBuildAdd(builder
, bit_pos
, bit_pos
, "");
701 * NOTE: This innocent looking shift is very expensive with x86/ssex.
702 * Shifts with per-elemnent shift count get roughly translated to
703 * extract (count), extract (value), shift, move (back to xmm), unpack
705 * So about 20 instructions here for 4xi32.
706 * Newer llvm versions (3.7+) will not do extract/insert but use a
707 * a couple constant count vector shifts plus shuffles. About same
708 * amount of instructions unfortunately...
709 * Would get much worse with 8xi16 even...
710 * We could actually do better here:
711 * - subtract bit_pos from 128+30, shl 23, convert float to int...
712 * - now do mul with codewords followed by shr 30...
713 * But requires 32bit->32bit mul, sse41 only (well that's emulatable
714 * with 2 32bit->64bit muls...) and not exactly cheap
715 * AVX2, of course, fixes this nonsense.
717 indices
= LLVMBuildLShr(builder
, codewords
, bit_pos
, "");
719 /* finally select the colors */
720 sel_lo
= LLVMBuildAnd(builder
, indices
, bld32
.one
, "");
721 sel_lo
= lp_build_compare(gallivm
, type
, PIPE_FUNC_EQUAL
, sel_lo
, bld32
.one
);
722 color0
= lp_build_select(&bld32
, sel_lo
, color1
, color0
);
723 color2
= lp_build_select(&bld32
, sel_lo
, color3
, color2
);
724 sel_hi
= LLVMBuildAnd(builder
, indices
, const2
, "");
725 sel_hi
= lp_build_compare(gallivm
, type
, PIPE_FUNC_EQUAL
, sel_hi
, const2
);
726 rgba
= lp_build_select(&bld32
, sel_hi
, color2
, color0
);
729 if (format
== PIPE_FORMAT_DXT1_RGB
||
730 format
== PIPE_FORMAT_DXT1_SRGB
) {
731 rgba
= LLVMBuildOr(builder
, rgba
, a
, "");
733 return LLVMBuildBitCast(builder
, rgba
, bld8
.vec_type
, "");
738 s3tc_dxt1_to_rgba_aos(struct gallivm_state
*gallivm
,
740 enum pipe_format format
,
742 LLVMValueRef codewords
,
746 return s3tc_dxt1_full_to_rgba_aos(gallivm
, n
, format
,
747 colors
, codewords
, i
, j
);
752 * Convert from <n x i128> s3tc dxt3 to <4n x i8> RGBA AoS
753 * @param colors is a <n x i32> vector with n x 2x16bit colors
754 * @param codewords is a <n x i32> vector containing the codewords
755 * @param alphas is a <n x i64> vector containing the alpha values
756 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
757 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
760 s3tc_dxt3_to_rgba_aos(struct gallivm_state
*gallivm
,
762 enum pipe_format format
,
764 LLVMValueRef codewords
,
765 LLVMValueRef alpha_low
,
766 LLVMValueRef alpha_hi
,
770 LLVMBuilderRef builder
= gallivm
->builder
;
771 LLVMValueRef rgba
, tmp
, tmp2
;
772 LLVMValueRef bit_pos
, sel_mask
;
773 struct lp_type type
, type8
;
774 struct lp_build_context bld
;
776 memset(&type
, 0, sizeof type
);
780 memset(&type8
, 0, sizeof type8
);
784 assert(lp_check_value(type
, i
));
785 assert(lp_check_value(type
, j
));
787 lp_build_context_init(&bld
, gallivm
, type
);
789 rgba
= s3tc_dxt1_to_rgba_aos(gallivm
, n
, format
,
790 colors
, codewords
, i
, j
);
792 rgba
= LLVMBuildBitCast(builder
, rgba
, bld
.vec_type
, "");
795 * Extract alpha values. Since we now need to select from
796 * which 32bit vector values are fetched, construct selection
797 * mask from highest bit of bit_pos, and use select, then shift
798 * according to the bit_pos (without the highest bit).
799 * Note this is pointless for n == 1 case. Could just
800 * directly use 64bit arithmetic if we'd extract 64bit
801 * alpha value instead of 2x32...
804 bit_pos
= LLVMBuildShl(builder
, j
, lp_build_const_int_vec(gallivm
, type
, 2), "");
805 bit_pos
= LLVMBuildAdd(builder
, bit_pos
, i
, "");
806 bit_pos
= LLVMBuildShl(builder
, bit_pos
,
807 lp_build_const_int_vec(gallivm
, type
, 2), "");
808 sel_mask
= LLVMBuildLShr(builder
, bit_pos
,
809 lp_build_const_int_vec(gallivm
, type
, 5), "");
810 sel_mask
= LLVMBuildSub(builder
, sel_mask
, bld
.one
, "");
811 tmp
= lp_build_select(&bld
, sel_mask
, alpha_low
, alpha_hi
);
812 bit_pos
= LLVMBuildAnd(builder
, bit_pos
,
813 lp_build_const_int_vec(gallivm
, type
, 0xffffffdf), "");
814 /* Warning: slow shift with per element count */
816 * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
817 * to select the right byte with pshufb. Then for the remaining one bit
818 * just do shift/select.
820 tmp
= LLVMBuildLShr(builder
, tmp
, bit_pos
, "");
822 /* combined expand from a4 to a8 and shift into position */
823 tmp
= LLVMBuildShl(builder
, tmp
, lp_build_const_int_vec(gallivm
, type
, 28), "");
824 tmp2
= LLVMBuildLShr(builder
, tmp
, lp_build_const_int_vec(gallivm
, type
, 4), "");
825 tmp
= LLVMBuildOr(builder
, tmp
, tmp2
, "");
827 rgba
= LLVMBuildOr(builder
, tmp
, rgba
, "");
829 return LLVMBuildBitCast(builder
, rgba
, lp_build_vec_type(gallivm
, type8
), "");
833 lp_build_lerpdxta(struct gallivm_state
*gallivm
,
837 LLVMValueRef sel_mask
,
841 * note we're doing lerp in 16bit since 32bit pmulld is only available in sse41
842 * (plus pmullw is actually faster...)
843 * we just pretend our 32bit values (which are really only 8bit) are 16bits.
844 * Note that this is obviously a disaster for the scalar case.
846 LLVMBuilderRef builder
= gallivm
->builder
;
847 LLVMValueRef delta
, ainterp
;
848 LLVMValueRef weight5
, weight7
, weight
;
849 struct lp_type type32
, type16
, type8
;
850 struct lp_build_context bld16
;
852 memset(&type32
, 0, sizeof type32
);
855 memset(&type16
, 0, sizeof type16
);
859 memset(&type8
, 0, sizeof type8
);
863 lp_build_context_init(&bld16
, gallivm
, type16
);
864 /* 255/7 is a bit off - increase accuracy at the expense of shift later */
865 sel_mask
= LLVMBuildBitCast(builder
, sel_mask
, bld16
.vec_type
, "");
866 weight5
= lp_build_const_int_vec(gallivm
, type16
, 255*64/5);
867 weight7
= lp_build_const_int_vec(gallivm
, type16
, 255*64/7);
868 weight
= lp_build_select(&bld16
, sel_mask
, weight7
, weight5
);
870 alpha0
= LLVMBuildBitCast(builder
, alpha0
, bld16
.vec_type
, "");
871 alpha1
= LLVMBuildBitCast(builder
, alpha1
, bld16
.vec_type
, "");
872 code
= LLVMBuildBitCast(builder
, code
, bld16
.vec_type
, "");
873 /* we'll get garbage in the elements which had code 0 (or larger than 5 or 7)
875 code
= LLVMBuildSub(builder
, code
, bld16
.one
, "");
877 weight
= LLVMBuildMul(builder
, weight
, code
, "");
878 weight
= LLVMBuildLShr(builder
, weight
,
879 lp_build_const_int_vec(gallivm
, type16
, 6), "");
881 delta
= LLVMBuildSub(builder
, alpha1
, alpha0
, "");
883 ainterp
= LLVMBuildMul(builder
, delta
, weight
, "");
884 ainterp
= LLVMBuildLShr(builder
, ainterp
,
885 lp_build_const_int_vec(gallivm
, type16
, 8), "");
887 ainterp
= LLVMBuildBitCast(builder
, ainterp
, lp_build_vec_type(gallivm
, type8
), "");
888 alpha0
= LLVMBuildBitCast(builder
, alpha0
, lp_build_vec_type(gallivm
, type8
), "");
889 ainterp
= LLVMBuildAdd(builder
, alpha0
, ainterp
, "");
890 ainterp
= LLVMBuildBitCast(builder
, ainterp
, lp_build_vec_type(gallivm
, type32
), "");
896 * Convert from <n x i128> s3tc dxt5 to <4n x i8> RGBA AoS
897 * @param colors is a <n x i32> vector with n x 2x16bit colors
898 * @param codewords is a <n x i32> vector containing the codewords
899 * @param alphas is a <n x i64> vector containing the alpha values
900 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
901 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
/*
 * Emit IR that decodes texels from dxt5 data: the colour part is delegated
 * to s3tc_dxt1_to_rgba_aos() and the alpha part is decoded here.
 *
 * Alpha decode as built below:
 *  - alpha0 is the lowest byte of alpha_lo, alpha1 the next byte.
 *  - the 3bit code of texel (i, j) is read at bit 16 + 3 * (4 * j + i) of
 *    the 64bit value alpha_hi:alpha_lo.  Two variants of this extraction
 *    are built (combine via zext, shl and or; or shuffle + bitcast to a
 *    vector of n 64bit elements).
 *    NOTE(review): presumably selected on n == 1 - confirm.
 *  - sel_mask comes from a PIPE_FUNC_GREATER compare (alpha0 > alpha1
 *    according to the remark at that site) and is handed to
 *    lp_build_lerpdxta() together with alpha0, alpha1 and the code.
 *  - code 0 selects alpha0, any other code alpha1; a second GREATER
 *    compare (c > 1 according to the remark above it) replaces that with
 *    the interpolated value.
 *  - code_s = alphac AND NOT sel_mask, so the code 6 and code 7 fix-ups
 *    only trigger in lanes where sel_mask is clear: code 6 clears the
 *    value, code 7 sets all bits.
 *
 * The alpha is finally shifted left by 24, OR'ed into the colour and the
 * result is returned bitcast to the vector type of type8.
 */
904 s3tc_dxt5_full_to_rgba_aos(struct gallivm_state
*gallivm
,
906 enum pipe_format format
,
908 LLVMValueRef codewords
,
909 LLVMValueRef alpha_lo
,
910 LLVMValueRef alpha_hi
,
914 LLVMBuilderRef builder
= gallivm
->builder
;
915 LLVMValueRef rgba
, tmp
, alpha0
, alpha1
, alphac
, alphac0
, bit_pos
, shift
;
916 LLVMValueRef sel_mask
, tmp_mask
, alpha
, alpha64
, code_s
;
917 LLVMValueRef mask6
, mask7
, ainterp
;
918 LLVMTypeRef i64t
= LLVMInt64TypeInContext(gallivm
->context
);
919 LLVMTypeRef i32t
= LLVMInt32TypeInContext(gallivm
->context
);
920 struct lp_type type
, type8
;
921 struct lp_build_context bld32
;
923 memset(&type
, 0, sizeof type
);
927 memset(&type8
, 0, sizeof type8
);
931 assert(lp_check_value(type
, i
));
932 assert(lp_check_value(type
, j
));
934 lp_build_context_init(&bld32
, gallivm
, type
);
936 assert(lp_check_value(type
, i
));
937 assert(lp_check_value(type
, j
));
939 rgba
= s3tc_dxt1_to_rgba_aos(gallivm
, n
, format
,
940 colors
, codewords
, i
, j
);
942 rgba
= LLVMBuildBitCast(builder
, rgba
, bld32
.vec_type
, "");
944 /* this looks pretty complex for vectorization:
945 * extract a0/a1 values
947 * select weights for interpolation depending on a0 > a1
948 * mul weights by code - 1
950 * use selects for getting either a0, a1, interp a, interp a/0.0, interp a/1.0
953 alpha0
= LLVMBuildAnd(builder
, alpha_lo
,
954 lp_build_const_int_vec(gallivm
, type
, 0xff), "");
955 alpha1
= LLVMBuildLShr(builder
, alpha_lo
,
956 lp_build_const_int_vec(gallivm
, type
, 8), "");
957 alpha1
= LLVMBuildAnd(builder
, alpha1
,
958 lp_build_const_int_vec(gallivm
, type
, 0xff), "");
961 bit_pos
= LLVMBuildShl(builder
, j
, lp_build_const_int_vec(gallivm
, type
, 2), "");
962 bit_pos
= LLVMBuildAdd(builder
, bit_pos
, i
, "");
963 tmp
= LLVMBuildAdd(builder
, bit_pos
, bit_pos
, "");
964 bit_pos
= LLVMBuildAdd(builder
, bit_pos
, tmp
, "");
965 /* get rid of first 2 bytes - saves shifts of alpha_lo/hi */
966 bit_pos
= LLVMBuildAdd(builder
, bit_pos
,
967 lp_build_const_int_vec(gallivm
, type
, 16), "");
970 struct lp_type type64
;
971 memset(&type64
, 0, sizeof type64
);
974 /* This is pretty pointless could avoid by just directly extracting
975 64bit in the first place but makes it more complicated elsewhere */
976 alpha_lo
= LLVMBuildZExt(builder
, alpha_lo
, i64t
, "");
977 alpha_hi
= LLVMBuildZExt(builder
, alpha_hi
, i64t
, "");
978 alphac0
= LLVMBuildShl(builder
, alpha_hi
,
979 lp_build_const_int_vec(gallivm
, type64
, 32), "");
980 alphac0
= LLVMBuildOr(builder
, alpha_lo
, alphac0
, "");
982 shift
= LLVMBuildZExt(builder
, bit_pos
, i64t
, "");
983 alphac0
= LLVMBuildLShr(builder
, alphac0
, shift
, "");
984 alphac0
= LLVMBuildTrunc(builder
, alphac0
, i32t
, "");
985 alphac
= LLVMBuildAnd(builder
, alphac0
,
986 lp_build_const_int_vec(gallivm
, type
, 0x7), "");
990 * Using non-native vector length here (actually, with avx2 and
991 * n == 4 llvm will indeed expand to ymm regs...)
992 * At least newer llvm versions handle that ok.
993 * llvm 3.7+ will even handle the emulated 64bit shift with variable
994 * shift count without extraction (and it's actually easier to
995 * emulate than the 32bit one).
997 alpha64
= LLVMBuildShuffleVector(builder
, alpha_lo
, alpha_hi
,
998 lp_build_const_unpackx2_shuffle(gallivm
, n
), "");
1000 alpha64
= LLVMBuildBitCast(builder
, alpha64
, LLVMVectorType(i64t
, n
), "");
1001 shift
= LLVMBuildZExt(builder
, bit_pos
, LLVMVectorType(i64t
, n
), "");
1002 alphac
= LLVMBuildLShr(builder
, alpha64
, shift
, "");
1003 alphac
= LLVMBuildTrunc(builder
, alphac
, bld32
.vec_type
, "");
1005 alphac
= LLVMBuildAnd(builder
, alphac
,
1006 lp_build_const_int_vec(gallivm
, type
, 0x7), "");
1009 /* signed compare is faster saves some xors */
1011 /* alpha0 > alpha1 selection */
1012 sel_mask
= lp_build_compare(gallivm
, type
, PIPE_FUNC_GREATER
,
1014 ainterp
= lp_build_lerpdxta(gallivm
, alpha0
, alpha1
, alphac
, sel_mask
, n
);
1017 * if a0 > a1 then we select a0 for case 0, a1 for case 1, interp otherwise.
1018 * else we select a0 for case 0, a1 for case 1,
1019 * interp for case 2-5, 00 for 6 and 0xff(ffffff) for 7
1020 * a = (c == 0) ? a0 : a1
1021 * a = (c > 1) ? ainterp : a
1022 * Finally handle case 6/7 for !(a0 > a1)
1023 * a = (!(a0 > a1) && c == 6) ? 0 : a (andnot with mask)
1024 * a = (!(a0 > a1) && c == 7) ? 0xffffffff : a (or with mask)
1026 tmp_mask
= lp_build_compare(gallivm
, type
, PIPE_FUNC_EQUAL
,
1027 alphac
, bld32
.zero
);
1028 alpha
= lp_build_select(&bld32
, tmp_mask
, alpha0
, alpha1
);
1029 tmp_mask
= lp_build_compare(gallivm
, type
, PIPE_FUNC_GREATER
,
1031 alpha
= lp_build_select(&bld32
, tmp_mask
, ainterp
, alpha
);
1033 code_s
= LLVMBuildAnd(builder
, alphac
,
1034 LLVMBuildNot(builder
, sel_mask
, ""), "");
1035 mask6
= lp_build_compare(gallivm
, type
, PIPE_FUNC_EQUAL
,
1036 code_s
, lp_build_const_int_vec(gallivm
, type
, 6));
1037 mask7
= lp_build_compare(gallivm
, type
, PIPE_FUNC_EQUAL
,
1038 code_s
, lp_build_const_int_vec(gallivm
, type
, 7));
1039 alpha
= LLVMBuildAnd(builder
, alpha
, LLVMBuildNot(builder
, mask6
, ""), "");
1040 alpha
= LLVMBuildOr(builder
, alpha
, mask7
, "");
1042 alpha
= LLVMBuildShl(builder
, alpha
, lp_build_const_int_vec(gallivm
, type
, 24), "");
1043 rgba
= LLVMBuildOr(builder
, alpha
, rgba
, "");
1045 return LLVMBuildBitCast(builder
, rgba
, lp_build_vec_type(gallivm
, type8
), "");
/*
 * Load one compressed block from ptr and hand it back through *dxt_block
 * as a vector of four 32bit elements.
 *
 * The block size is read from format_desc->block.bits and must be 64 or
 * 128 (asserted).  A 128bit block is only bitcast.  A 64bit block is
 * bitcast to two 32bit elements and widened to four by shuffling against
 * an undef vector with the mask from
 * lp_build_const_extend_shuffle(gallivm, 2, 4).
 */
1050 lp_build_gather_s3tc_simple_scalar(struct gallivm_state
*gallivm
,
1051 const struct util_format_description
*format_desc
,
1052 LLVMValueRef
*dxt_block
,
1055 LLVMBuilderRef builder
= gallivm
->builder
;
1056 unsigned block_bits
= format_desc
->block
.bits
;
1057 LLVMValueRef elem
, shuf
;
1058 LLVMTypeRef type32
= LLVMIntTypeInContext(gallivm
->context
, 32);
1059 LLVMTypeRef src_type
= LLVMIntTypeInContext(gallivm
->context
, block_bits
);
1060 LLVMTypeRef src_ptr_type
= LLVMPointerType(src_type
, 0);
1061 LLVMTypeRef type32_4
= LLVMVectorType(type32
, 4);
1063 assert(block_bits
== 64 || block_bits
== 128);
1065 ptr
= LLVMBuildBitCast(builder
, ptr
, src_ptr_type
, "");
1066 elem
= LLVMBuildLoad(builder
, ptr
, "");
1068 if (block_bits
== 128) {
1069 /* just return block as is */
1070 *dxt_block
= LLVMBuildBitCast(builder
, elem
, type32_4
, "");
1073 LLVMTypeRef type32_2
= LLVMVectorType(type32
, 2);
1074 shuf
= lp_build_const_extend_shuffle(gallivm
, 2, 4);
1075 elem
= LLVMBuildBitCast(builder
, elem
, type32_2
, "");
1076 *dxt_block
= LLVMBuildShuffleVector(builder
, elem
,
1077 LLVMGetUndef(type32_2
), shuf
, "");
/*
 * Write one block (its tag plus the four col[] vectors) into the cache
 * struct.
 *
 * tag_value is stored into the LP_BUILD_FORMAT_CACHE_MEMBER_TAGS member at
 * hash_index.  The four col[] vectors are then stored into the
 * LP_BUILD_FORMAT_CACHE_MEMBER_DATA member through a pointer to a vector
 * of four 32bit elements, starting at element hash_index * 16 and
 * advancing by 4 elements per store.
 */
1083 s3tc_store_cached_block(struct gallivm_state
*gallivm
,
1085 LLVMValueRef tag_value
,
1086 LLVMValueRef hash_index
,
1089 LLVMBuilderRef builder
= gallivm
->builder
;
1090 LLVMValueRef ptr
, indices
[3];
1091 LLVMTypeRef type_ptr4x32
;
1094 type_ptr4x32
= LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm
->context
), 4), 0);
1095 indices
[0] = lp_build_const_int32(gallivm
, 0);
1096 indices
[1] = lp_build_const_int32(gallivm
, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS
);
1097 indices
[2] = hash_index
;
1098 ptr
= LLVMBuildGEP(builder
, cache
, indices
, ARRAY_SIZE(indices
), "");
1099 LLVMBuildStore(builder
, tag_value
, ptr
);
1101 indices
[1] = lp_build_const_int32(gallivm
, LP_BUILD_FORMAT_CACHE_MEMBER_DATA
);
1102 hash_index
= LLVMBuildMul(builder
, hash_index
,
1103 lp_build_const_int32(gallivm
, 16), "");
1104 for (count
= 0; count
< 4; count
++) {
1105 indices
[2] = hash_index
;
1106 ptr
= LLVMBuildGEP(builder
, cache
, indices
, ARRAY_SIZE(indices
), "");
1107 ptr
= LLVMBuildBitCast(builder
, ptr
, type_ptr4x32
, "");
1108 LLVMBuildStore(builder
, col
[count
], ptr
);
1109 hash_index
= LLVMBuildAdd(builder
, hash_index
,
1110 lp_build_const_int32(gallivm
, 4), "");
/*
 * Load one value from the LP_BUILD_FORMAT_CACHE_MEMBER_DATA member of the
 * cache struct at ptr and return it (named "cache_data" in the IR).
 * NOTE(review): all three slots of indices[] are passed to the GEP; the
 * third is presumably the element index supplied by the caller - confirm.
 */
1115 s3tc_lookup_cached_pixel(struct gallivm_state
*gallivm
,
1119 LLVMBuilderRef builder
= gallivm
->builder
;
1120 LLVMValueRef member_ptr
, indices
[3];
1122 indices
[0] = lp_build_const_int32(gallivm
, 0);
1123 indices
[1] = lp_build_const_int32(gallivm
, LP_BUILD_FORMAT_CACHE_MEMBER_DATA
);
1125 member_ptr
= LLVMBuildGEP(builder
, ptr
, indices
, ARRAY_SIZE(indices
), "");
1126 return LLVMBuildLoad(builder
, member_ptr
, "cache_data");
/*
 * Load one value from the LP_BUILD_FORMAT_CACHE_MEMBER_TAGS member of the
 * cache struct at ptr and return it (named "tag_data" in the IR).
 * NOTE(review): all three slots of indices[] are passed to the GEP; the
 * third is presumably the slot index supplied by the caller - confirm.
 */
1130 s3tc_lookup_tag_data(struct gallivm_state
*gallivm
,
1134 LLVMBuilderRef builder
= gallivm
->builder
;
1135 LLVMValueRef member_ptr
, indices
[3];
1137 indices
[0] = lp_build_const_int32(gallivm
, 0);
1138 indices
[1] = lp_build_const_int32(gallivm
, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS
);
1140 member_ptr
= LLVMBuildGEP(builder
, ptr
, indices
, ARRAY_SIZE(indices
), "");
1141 return LLVMBuildLoad(builder
, member_ptr
, "tag_data");
1144 #if LP_BUILD_FORMAT_CACHE_DEBUG
/*
 * Statistics helper, only compiled under LP_BUILD_FORMAT_CACHE_DEBUG:
 * load the struct member of ptr selected by index, add a 64bit constant to
 * it and store the sum back.
 * index must be LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL or
 * LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS (asserted).
 */
1146 s3tc_update_cache_access(struct gallivm_state
*gallivm
,
1151 LLVMBuilderRef builder
= gallivm
->builder
;
1152 LLVMValueRef member_ptr
, cache_access
;
1154 assert(index
== LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL
||
1155 index
== LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS
);
1157 member_ptr
= lp_build_struct_get_ptr(gallivm
, ptr
, index
, "");
1158 cache_access
= LLVMBuildLoad(builder
, member_ptr
, "cache_access");
1159 cache_access
= LLVMBuildAdd(builder
, cache_access
,
1160 LLVMConstInt(LLVMInt64TypeInContext(gallivm
->context
),
1162 LLVMBuildStore(builder
, cache_access
, member_ptr
);
1167 * Calculate 1/3(v1-v0) + v0 and 2*1/3(v1-v0) + v0.
1168 * The lerp is performed between the first 2 32bit colors
1169 * in the source vector, both results are returned packed in result vector.
/*
 * Steps, as built below:
 *  - x holds the 16bit weights: 255*1/3 (= 85) in elems 0-3 and 171 in
 *    elems 4-7.
 *  - v01 is interleaved with itself at 32bit granularity, bitcast back to
 *    the 8bit vector type and unpacked to 16 bits into v0 and v1.
 *  - delta = v1 - v0 is computed with a context whose type is marked
 *    signed, multiplied by the weights and logically shifted right by 8.
 *  - the product is packed back to 8 bits first and v01 is added last.
 * Only plain 8bit integer types are accepted (asserted: not floating, not
 * fixed, not norm, width 8).
 */
1172 lp_build_lerp23_single(struct lp_build_context
*bld
,
1175 struct gallivm_state
*gallivm
= bld
->gallivm
;
1176 LLVMValueRef x
, mul
, delta
, res
, v0
, v1
, elems
[8];
1177 const struct lp_type type
= bld
->type
;
1178 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1179 struct lp_type i16_type
= lp_wider_type(type
);
1180 struct lp_type i32_type
= lp_wider_type(i16_type
);
1181 struct lp_build_context bld2
;
1183 assert(!type
.floating
&& !type
.fixed
&& !type
.norm
&& type
.width
== 8);
1185 lp_build_context_init(&bld2
, gallivm
, i16_type
);
1186 bld2
.type
.sign
= TRUE
;
1188 /* weights 256/3, 256*2/3, with correct rounding */
1189 elems
[0] = elems
[1] = elems
[2] = elems
[3] =
1190 lp_build_const_elem(gallivm
, i16_type
, 255*1/3);
1191 elems
[4] = elems
[5] = elems
[6] = elems
[7] =
1192 lp_build_const_elem(gallivm
, i16_type
, 171);
1193 x
= LLVMConstVector(elems
, 8);
1196 * v01 has col0 in 32bit elem 0, col1 in elem 1.
1197 * Interleave/unpack will give us separate v0/v1 vectors.
1199 v01
= lp_build_interleave2(gallivm
, i32_type
, v01
, v01
, 0);
1200 v01
= LLVMBuildBitCast(builder
, v01
, bld
->vec_type
, "");
1202 lp_build_unpack2(gallivm
, type
, i16_type
, v01
, &v0
, &v1
);
1203 delta
= lp_build_sub(&bld2
, v1
, v0
);
1205 mul
= LLVMBuildMul(builder
, x
, delta
, "");
1207 mul
= LLVMBuildLShr(builder
, mul
, lp_build_const_int_vec(gallivm
, i16_type
, 8), "");
1208 /* lerp optimization: pack now, do add afterwards */
1209 res
= lp_build_pack2(gallivm
, i16_type
, type
, mul
, bld2
.undef
);
1210 /* only lower 2 elems are valid - for these v0 is really v0 */
1211 return lp_build_add(bld
, res
, v01
);
1215 * decode one dxt1 block.
/*
 * Decode the colour part of one block into col[].
 *
 * Outline of the IR built below:
 *  - the colours and the code are taken from elems 0 and 1 of dxt_block,
 *    or from elems 2 and 3 in the other variant (NOTE(review): presumably
 *    the non-dxt1 side of the is_dxt1_variant test - confirm).
 *  - the code bytes are expanded to dwords with two byte interleaves.
 *  - the two 565 colours are expanded with color_expand_565_to_8888();
 *    for DXT1_RGBA and DXT1_SRGBA 0xff000000 is OR'ed in before the
 *    interpolation.
 *  - the interpolated colours of the 4-colour mode come from
 *    lp_build_lerp23_single().  For dxt1 variants the half-way colour is
 *    computed as well (lp_build_pavgb() with sse2, otherwise unpack, add,
 *    shift right by one, pack), moved into place by a 64bit right shift
 *    by 32, and sel_mask selects between the two results.
 *  - with ssse3 the four colours are rearranged per channel and
 *    llvm.x86.ssse3.pshuf.b.128 serves as a lookup table indexed by the
 *    low 2 bits of each code byte (plus a per-channel offset); otherwise
 *    two compare and three select operations pick the colour.  In both
 *    paths the code is shifted right by 2 in every loop iteration.
 *  - for DXT1_RGB and DXT1_SRGB 0xff000000 is OR'ed into the colours
 *    before the lookup.
 */
1218 s3tc_decode_block_dxt1(struct gallivm_state
*gallivm
,
1219 enum pipe_format format
,
1220 LLVMValueRef dxt_block
,
1223 LLVMBuilderRef builder
= gallivm
->builder
;
1224 LLVMValueRef color01
, color23
, color01_16
, color0123
;
1225 LLVMValueRef rgba
, tmp
, a
, sel_mask
, indices
, code
, const2
;
1226 struct lp_type type8
, type32
, type16
, type64
;
1227 struct lp_build_context bld8
, bld32
, bld16
, bld64
;
1229 boolean is_dxt1_variant
= format_dxt1_variant(format
);
1231 memset(&type32
, 0, sizeof type32
);
1236 memset(&type8
, 0, sizeof type8
);
1240 memset(&type16
, 0, sizeof type16
);
1244 memset(&type64
, 0, sizeof type64
);
1248 a
= lp_build_const_int_vec(gallivm
, type32
, 0xff000000);
1249 const2
= lp_build_const_int_vec(gallivm
, type32
, 2);
1251 lp_build_context_init(&bld32
, gallivm
, type32
);
1252 lp_build_context_init(&bld16
, gallivm
, type16
);
1253 lp_build_context_init(&bld8
, gallivm
, type8
);
1254 lp_build_context_init(&bld64
, gallivm
, type64
);
1256 if (is_dxt1_variant
) {
1257 color01
= lp_build_shuffle1undef(gallivm
, dxt_block
, 0, 4);
1258 code
= lp_build_shuffle1undef(gallivm
, dxt_block
, 1, 4);
1260 color01
= lp_build_shuffle1undef(gallivm
, dxt_block
, 2, 4);
1261 code
= lp_build_shuffle1undef(gallivm
, dxt_block
, 3, 4);
1263 code
= LLVMBuildBitCast(builder
, code
, bld8
.vec_type
, "");
1264 /* expand bytes to dwords */
1265 code
= lp_build_interleave2(gallivm
, type8
, code
, code
, 0);
1266 code
= lp_build_interleave2(gallivm
, type8
, code
, code
, 0);
1271 * - expand color0/color1 to rgba8888
1272 * - calculate color2/3 (interpolation) according to color0 < color1 rules
1273 * - calculate color2/3 according to color0 >= color1 rules
1274 * - do selection of color2/3 according to comparison of color0/1
1275 * - extract indices.
1276 * - use compare/select to select the correct color. Since we have 2bit
1277 * indices (and 4 colors), needs at least three compare/selects.
1281 * expand the two colors
1283 color01
= LLVMBuildBitCast(builder
, color01
, bld16
.vec_type
, "");
1284 color01
= lp_build_interleave2(gallivm
, type16
, color01
,
1286 color01_16
= LLVMBuildBitCast(builder
, color01
, bld32
.vec_type
, "");
1287 color01
= color_expand_565_to_8888(gallivm
, 4, color01_16
);
1290 * interpolate colors
1291 * color2_1 is 2/3 color0 + 1/3 color1
1292 * color3_1 is 1/3 color0 + 2/3 color1
1293 * color2_2 is 1/2 color0 + 1/2 color1
1297 /* TODO: since this is now always scalar, should
1298 * probably just use control flow here instead of calculating
1299 * both cases and then selection
1301 if (format
== PIPE_FORMAT_DXT1_RGBA
||
1302 format
== PIPE_FORMAT_DXT1_SRGBA
) {
1303 color01
= LLVMBuildOr(builder
, color01
, a
, "");
1305 /* can combine 2 lerps into one mostly */
1306 color23
= lp_build_lerp23_single(&bld8
, color01
);
1307 color23
= LLVMBuildBitCast(builder
, color23
, bld32
.vec_type
, "");
1309 /* dxt3/5 always use 4-color encoding */
1310 if (is_dxt1_variant
) {
1311 LLVMValueRef color23_2
, color2_2
;
1313 if (util_cpu_caps
.has_sse2
) {
1314 LLVMValueRef intrargs
[2];
1315 intrargs
[0] = LLVMBuildBitCast(builder
, color01
, bld8
.vec_type
, "");
1316 /* same interleave as for lerp23 - correct result in 2nd element */
1317 intrargs
[1] = lp_build_interleave2(gallivm
, type32
, color01
, color01
, 0);
1318 intrargs
[1] = LLVMBuildBitCast(builder
, intrargs
[1], bld8
.vec_type
, "");
1319 color2_2
= lp_build_pavgb(&bld8
, intrargs
[0], intrargs
[1]);
1322 LLVMValueRef v01
, v0
, v1
, vhalf
;
1324 * This isn't as expensive as it looks (the unpack is the same as
1325 * for lerp23, which is the reason why we do the pointless
1326 * interleave2 too), with correct rounding (the two lower elements
1327 * will be the same).
1329 v01
= lp_build_interleave2(gallivm
, type32
, color01
, color01
, 0);
1330 v01
= LLVMBuildBitCast(builder
, v01
, bld8
.vec_type
, "");
1331 lp_build_unpack2(gallivm
, type8
, type16
, v01
, &v0
, &v1
);
1332 vhalf
= lp_build_add(&bld16
, v0
, v1
);
1333 vhalf
= LLVMBuildLShr(builder
, vhalf
, bld16
.one
, "");
1334 color2_2
= lp_build_pack2(gallivm
, type16
, type8
, vhalf
, bld16
.undef
);
1336 /* shuffle in color 3 as elem 2 zero, color 2 elem 1 */
1337 color23_2
= LLVMBuildBitCast(builder
, color2_2
, bld64
.vec_type
, "");
1338 color23_2
= LLVMBuildLShr(builder
, color23_2
,
1339 lp_build_const_int_vec(gallivm
, type64
, 32), "");
1340 color23_2
= LLVMBuildBitCast(builder
, color23_2
, bld32
.vec_type
, "");
1342 tmp
= LLVMBuildBitCast(builder
, color01_16
, bld64
.vec_type
, "");
1343 tmp
= LLVMBuildLShr(builder
, tmp
,
1344 lp_build_const_int_vec(gallivm
, type64
, 32), "");
1345 tmp
= LLVMBuildBitCast(builder
, tmp
, bld32
.vec_type
, "");
1346 sel_mask
= lp_build_compare(gallivm
, type32
, PIPE_FUNC_GREATER
,
1348 sel_mask
= lp_build_interleave2(gallivm
, type32
, sel_mask
, sel_mask
, 0);
1349 color23
= lp_build_select(&bld32
, sel_mask
, color23
, color23_2
);
1352 if (util_cpu_caps
.has_ssse3
) {
1354 * Use pshufb as mini-lut. (Only doable with intrinsics as the
1355 * final shuffles are non-constant. pshufb is awesome!)
1357 LLVMValueRef shuf
[16], low2mask
;
1358 LLVMValueRef intrargs
[2], lut_ind
, lut_adj
;
1360 color01
= LLVMBuildBitCast(builder
, color01
, bld64
.vec_type
, "");
1361 color23
= LLVMBuildBitCast(builder
, color23
, bld64
.vec_type
, "");
1362 color0123
= lp_build_interleave2(gallivm
, type64
, color01
, color23
, 0);
1363 color0123
= LLVMBuildBitCast(builder
, color0123
, bld32
.vec_type
, "");
1365 if (format
== PIPE_FORMAT_DXT1_RGB
||
1366 format
== PIPE_FORMAT_DXT1_SRGB
) {
1367 color0123
= LLVMBuildOr(builder
, color0123
, a
, "");
1370 /* shuffle as r0r1r2r3g0g1... */
1371 for (i
= 0; i
< 4; i
++) {
1372 shuf
[4*i
] = lp_build_const_int32(gallivm
, 0 + i
);
1373 shuf
[4*i
+1] = lp_build_const_int32(gallivm
, 4 + i
);
1374 shuf
[4*i
+2] = lp_build_const_int32(gallivm
, 8 + i
);
1375 shuf
[4*i
+3] = lp_build_const_int32(gallivm
, 12 + i
);
1377 color0123
= LLVMBuildBitCast(builder
, color0123
, bld8
.vec_type
, "");
1378 color0123
= LLVMBuildShuffleVector(builder
, color0123
, bld8
.undef
,
1379 LLVMConstVector(shuf
, 16), "");
1381 /* lowest 2 bits of each 8 bit value contain index into "LUT" */
1382 low2mask
= lp_build_const_int_vec(gallivm
, type8
, 3);
1383 /* add 0/4/8/12 for r/g/b/a */
1384 lut_adj
= lp_build_const_int_vec(gallivm
, type32
, 0x0c080400);
1385 lut_adj
= LLVMBuildBitCast(builder
, lut_adj
, bld8
.vec_type
, "");
1386 intrargs
[0] = color0123
;
1387 for (i
= 0; i
< 4; i
++) {
1388 lut_ind
= LLVMBuildAnd(builder
, code
, low2mask
, "");
1389 lut_ind
= LLVMBuildOr(builder
, lut_ind
, lut_adj
, "");
1390 intrargs
[1] = lut_ind
;
1391 col
[i
] = lp_build_intrinsic(builder
, "llvm.x86.ssse3.pshuf.b.128",
1392 bld8
.vec_type
, intrargs
, 2, 0);
1393 col
[i
] = LLVMBuildBitCast(builder
, col
[i
], bld32
.vec_type
, "");
1394 code
= LLVMBuildBitCast(builder
, code
, bld32
.vec_type
, "");
1395 code
= LLVMBuildLShr(builder
, code
, const2
, "");
1396 code
= LLVMBuildBitCast(builder
, code
, bld8
.vec_type
, "");
1400 /* Thanks to vectorization can do 4 texels in parallel */
1401 LLVMValueRef color0
, color1
, color2
, color3
;
1402 if (format
== PIPE_FORMAT_DXT1_RGB
||
1403 format
== PIPE_FORMAT_DXT1_SRGB
) {
1404 color01
= LLVMBuildOr(builder
, color01
, a
, "");
1405 color23
= LLVMBuildOr(builder
, color23
, a
, "");
1407 color0
= LLVMBuildShuffleVector(builder
, color01
, bld32
.undef
,
1408 lp_build_const_shuffle1(gallivm
, 0, 4), "");
1409 color1
= LLVMBuildShuffleVector(builder
, color01
, bld32
.undef
,
1410 lp_build_const_shuffle1(gallivm
, 1, 4), "");
1411 color2
= LLVMBuildShuffleVector(builder
, color23
, bld32
.undef
,
1412 lp_build_const_shuffle1(gallivm
, 0, 4), "");
1413 color3
= LLVMBuildShuffleVector(builder
, color23
, bld32
.undef
,
1414 lp_build_const_shuffle1(gallivm
, 1, 4), "");
1415 code
= LLVMBuildBitCast(builder
, code
, bld32
.vec_type
, "");
1417 for (i
= 0; i
< 4; i
++) {
1418 /* select the colors */
1419 LLVMValueRef selmasklo
, rgba01
, rgba23
, bitlo
;
1421 indices
= LLVMBuildAnd(builder
, code
, bitlo
, "");
1422 selmasklo
= lp_build_compare(gallivm
, type32
, PIPE_FUNC_EQUAL
,
1424 rgba01
= lp_build_select(&bld32
, selmasklo
, color1
, color0
);
1426 LLVMValueRef selmaskhi
;
1427 indices
= LLVMBuildAnd(builder
, code
, const2
, "");
1428 selmaskhi
= lp_build_compare(gallivm
, type32
, PIPE_FUNC_EQUAL
,
1430 rgba23
= lp_build_select(&bld32
, selmasklo
, color3
, color2
);
1431 rgba
= lp_build_select(&bld32
, selmaskhi
, rgba23
, rgba01
);
1434 * Note that this will give "wrong" order.
1435 * col0 will be rgba0, rgba4, rgba8, rgba12, col1 rgba1, rgba5, ...
1436 * This would be easily fixable by using different shuffle, bitlo/hi
1437 * vectors above (and different shift), but seems slightly easier to
1438 * deal with for dxt3/dxt5 alpha too. So instead change lookup.
1441 code
= LLVMBuildLShr(builder
, code
, const2
, "");
1447 * decode one dxt3 block.
/*
 * The colour is decoded by s3tc_decode_block_dxt1(); this function adds
 * the explicit 4bit alpha on top.
 *
 * Every alpha byte is duplicated into a 16bit lane (byte interleave of the
 * block with itself), masked with 0xf00f and OR'ed with copies of itself
 * shifted right and left by 4, so that each byte of the lane ends up with
 * its nibble replicated (4bit to 8bit expansion).
 * The four bytes of every dword are then moved in turn to bits 24-31
 * (a[0] to a[3], masked with 0xff000000 where needed) and OR'ed into
 * col[0] to col[3].
 */
1450 s3tc_decode_block_dxt3(struct gallivm_state
*gallivm
,
1451 enum pipe_format format
,
1452 LLVMValueRef dxt_block
,
1455 LLVMBuilderRef builder
= gallivm
->builder
;
1456 LLVMValueRef alpha
, alphas0
, alphas1
, shift4_16
, a
[4], mask8hi
;
1457 struct lp_type type32
, type8
, type16
;
1460 memset(&type32
, 0, sizeof type32
);
1464 memset(&type8
, 0, sizeof type8
);
1468 memset(&type16
, 0, sizeof type16
);
1472 s3tc_decode_block_dxt1(gallivm
, format
, dxt_block
, col
);
1474 shift4_16
= lp_build_const_int_vec(gallivm
, type16
, 4);
1475 mask8hi
= lp_build_const_int_vec(gallivm
, type32
, 0xff000000);
1477 alpha
= LLVMBuildBitCast(builder
, dxt_block
,
1478 lp_build_vec_type(gallivm
, type8
), "");
1479 alpha
= lp_build_interleave2(gallivm
, type8
, alpha
, alpha
, 0);
1480 alpha
= LLVMBuildBitCast(builder
, alpha
,
1481 lp_build_vec_type(gallivm
, type16
), "");
1482 alpha
= LLVMBuildAnd(builder
, alpha
,
1483 lp_build_const_int_vec(gallivm
, type16
, 0xf00f), "");
1484 alphas0
= LLVMBuildLShr(builder
, alpha
, shift4_16
, "");
1485 alphas1
= LLVMBuildShl(builder
, alpha
, shift4_16
, "");
1486 alpha
= LLVMBuildOr(builder
, alphas0
, alpha
, "");
1487 alpha
= LLVMBuildOr(builder
, alphas1
, alpha
, "");
1488 alpha
= LLVMBuildBitCast(builder
, alpha
,
1489 lp_build_vec_type(gallivm
, type32
), "");
1491 * alpha now contains elems 0,1,2,3,... (ubytes)
1492 * we need 0,4,8,12, 1,5,9,13 etc. in dwords to match color (which
1493 * is just as easy as "natural" order - 3 shift/and instead of 6 unpack).
1495 a
[0] = LLVMBuildShl(builder
, alpha
,
1496 lp_build_const_int_vec(gallivm
, type32
, 24), "");
1497 a
[1] = LLVMBuildShl(builder
, alpha
,
1498 lp_build_const_int_vec(gallivm
, type32
, 16), "");
1499 a
[1] = LLVMBuildAnd(builder
, a
[1], mask8hi
, "");
1500 a
[2] = LLVMBuildShl(builder
, alpha
,
1501 lp_build_const_int_vec(gallivm
, type32
, 8), "");
1502 a
[2] = LLVMBuildAnd(builder
, a
[2], mask8hi
, "");
1503 a
[3] = LLVMBuildAnd(builder
, alpha
, mask8hi
, "");
1505 for (i
= 0; i
< 4; i
++) {
1506 col
[i
] = LLVMBuildOr(builder
, col
[i
], a
[i
], "");
/*
 * Compute the interpolation term of the dxt5 alpha lerp on the type16
 * vector type.
 *
 * sel_mask picks per lane between the weights 256*64/7 and 256*64/5.  The
 * weight is multiplied by (code - 1) and shifted right by 6, then
 * multiplied by (alpha1 - alpha0) and shifted right by 8.  alpha0 is not
 * added in this function; the closing remark says the lerp is completed
 * later on packed values.
 */
1512 lp_build_lerpdxta_block(struct gallivm_state
*gallivm
,
1513 LLVMValueRef alpha0
,
1514 LLVMValueRef alpha1
,
1516 LLVMValueRef sel_mask
)
1518 LLVMBuilderRef builder
= gallivm
->builder
;
1519 LLVMValueRef delta
, ainterp
;
1520 LLVMValueRef weight5
, weight7
, weight
;
1521 struct lp_type type16
;
1522 struct lp_build_context bld
;
1524 memset(&type16
, 0, sizeof type16
);
1529 lp_build_context_init(&bld
, gallivm
, type16
);
1531 * 256/7 is only 36.57 so we'd lose quite some precision. Since it would
1532 * actually be desirable to do this here with even higher accuracy than
1533 * even 8 bit (more or less required for rgtc, albeit that's not handled
1534 * here right now), shift the weights after multiplication by code.
1536 weight5
= lp_build_const_int_vec(gallivm
, type16
, 256*64/5);
1537 weight7
= lp_build_const_int_vec(gallivm
, type16
, 256*64/7);
1538 weight
= lp_build_select(&bld
, sel_mask
, weight7
, weight5
);
1541 * we'll get garbage in the elements which had code 0 (or larger than
1542 * 5 or 7) but we don't care (or rather, need to fix up anyway).
1544 code
= LLVMBuildSub(builder
, code
, bld
.one
, "");
1546 weight
= LLVMBuildMul(builder
, weight
, code
, "");
1547 weight
= LLVMBuildLShr(builder
, weight
,
1548 lp_build_const_int_vec(gallivm
, type16
, 6), "");
1550 delta
= LLVMBuildSub(builder
, alpha1
, alpha0
, "");
1552 ainterp
= LLVMBuildMul(builder
, delta
, weight
, "");
1553 ainterp
= LLVMBuildLShr(builder
, ainterp
,
1554 lp_build_const_int_vec(gallivm
, type16
, 8), "");
1556 /* lerp is done later (with packed values) */
1563 * decode one dxt5 block.
1566 s3tc_decode_block_dxt5(struct gallivm_state
*gallivm
,
1567 enum pipe_format format
,
1568 LLVMValueRef dxt_block
,
1571 LLVMBuilderRef builder
= gallivm
->builder
;
1572 LLVMValueRef alpha
, alpha0
, alpha1
, ares
;
1573 LLVMValueRef ainterp
, ainterp0
, ainterp1
, shuffle1
, sel_mask
, sel_mask2
;
1574 LLVMValueRef a
[4], acode
, tmp0
, tmp1
;
1575 LLVMTypeRef i64t
, i32t
;
1576 struct lp_type type32
, type64
, type8
, type16
;
1577 struct lp_build_context bld16
, bld8
;
1580 memset(&type32
, 0, sizeof type32
);
1584 memset(&type64
, 0, sizeof type64
);
1588 memset(&type8
, 0, sizeof type8
);
1592 memset(&type16
, 0, sizeof type16
);
1596 lp_build_context_init(&bld16
, gallivm
, type16
);
1597 lp_build_context_init(&bld8
, gallivm
, type8
);
1599 i64t
= lp_build_vec_type(gallivm
, type64
);
1600 i32t
= lp_build_vec_type(gallivm
, type32
);
1602 s3tc_decode_block_dxt1(gallivm
, format
, dxt_block
, col
);
1605 * three possible strategies for vectorizing alpha:
1606 * 1) compute all 8 values then use scalar extraction
1607 * (i.e. have all 8 alpha values packed in one 64bit scalar
1608 * and do something like ax = vals >> (codex * 8) followed
1609 * by inserting these values back into color)
1610 * 2) same as 1) but just use pshufb as a mini-LUT for selection.
1611 * (without pshufb would need boatloads of cmp/selects trying to
1612 * keep things vectorized for essentially scalar selection).
1613 * 3) do something similar to the uncached case
1614 * needs more calculations (need to calc 16 values instead of 8 though
1615 * that's only an issue for the lerp which we need to do twice otherwise
1616 * everything still fits into 128bit) but keeps things vectorized mostly.
1617 * Trying 3) here though not sure it's really faster...
1618 * With pshufb, we try 2) (cheaper and more accurate)
1622 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1623 * help since code crosses 8bit boundaries). But variable shifts are
1624 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1625 * shifts!). Instead, emulate by 16bit muls.
1626 * Also, the required byte shuffles are essentially non-emulatable, so
1627 * require ssse3 (albeit other archs might do them fine).
1628 * This is not directly tied to ssse3 - just need sane byte shuffles.
1629 * But ordering is going to be different below so use same condition.
1633 /* vectorize alpha */
1634 alpha
= LLVMBuildBitCast(builder
, dxt_block
, i64t
, "");
1635 alpha0
= LLVMBuildAnd(builder
, alpha
,
1636 lp_build_const_int_vec(gallivm
, type64
, 0xff), "");
1637 alpha0
= LLVMBuildBitCast(builder
, alpha0
, bld16
.vec_type
, "");
1638 alpha
= LLVMBuildBitCast(builder
, alpha
, bld16
.vec_type
, "");
1639 alpha1
= LLVMBuildLShr(builder
, alpha
,
1640 lp_build_const_int_vec(gallivm
, type16
, 8), "");
1641 alpha
= LLVMBuildBitCast(builder
, alpha
, i64t
, "");
1642 shuffle1
= lp_build_const_shuffle1(gallivm
, 0, 8);
1643 /* XXX this shuffle broken with LLVM 2.8 */
1644 alpha0
= LLVMBuildShuffleVector(builder
, alpha0
, alpha0
, shuffle1
, "");
1645 alpha1
= LLVMBuildShuffleVector(builder
, alpha1
, alpha1
, shuffle1
, "");
1648 sel_mask
= lp_build_compare(gallivm
, type16
, PIPE_FUNC_GREATER
,
1650 type16
.sign
= FALSE
;
1651 sel_mask
= LLVMBuildBitCast(builder
, sel_mask
, bld8
.vec_type
, "");
1653 if (!util_cpu_caps
.has_ssse3
) {
1654 LLVMValueRef acodeg
, mask1
, acode0
, acode1
;
1656 /* extraction of the 3 bit values into something more useful is HARD */
1657 /* first steps are actually scalar */
1658 acode
= LLVMBuildLShr(builder
, alpha
,
1659 lp_build_const_int_vec(gallivm
, type64
, 16), "");
1660 tmp0
= LLVMBuildAnd(builder
, acode
,
1661 lp_build_const_int_vec(gallivm
, type64
, 0xffffff), "");
1662 tmp1
= LLVMBuildLShr(builder
, acode
,
1663 lp_build_const_int_vec(gallivm
, type64
, 24), "");
1664 tmp0
= LLVMBuildBitCast(builder
, tmp0
, i32t
, "");
1665 tmp1
= LLVMBuildBitCast(builder
, tmp1
, i32t
, "");
1666 acode
= lp_build_interleave2(gallivm
, type32
, tmp0
, tmp1
, 0);
1667 /* now have 2x24bit in 4x32bit, order 01234567, 89..., undef, undef */
1668 tmp0
= LLVMBuildAnd(builder
, acode
,
1669 lp_build_const_int_vec(gallivm
, type32
, 0xfff), "");
1670 tmp1
= LLVMBuildLShr(builder
, acode
,
1671 lp_build_const_int_vec(gallivm
, type32
, 12), "");
1672 acode
= lp_build_interleave2(gallivm
, type32
, tmp0
, tmp1
, 0);
1673 /* now have 4x12bit in 4x32bit, order 0123, 4567, ,,, */
1674 tmp0
= LLVMBuildAnd(builder
, acode
,
1675 lp_build_const_int_vec(gallivm
, type32
, 0x3f), "");
1676 tmp1
= LLVMBuildLShr(builder
, acode
,
1677 lp_build_const_int_vec(gallivm
, type32
, 6), "");
1678 /* use signed pack doesn't matter and otherwise need sse41 */
1679 type32
.sign
= type16
.sign
= TRUE
;
1680 acode
= lp_build_pack2(gallivm
, type32
, type16
, tmp0
, tmp1
);
1681 type32
.sign
= type16
.sign
= FALSE
;
1682 /* now have 8x6bit in 8x16bit, 01, 45, 89, ..., 23, 67, ... */
1683 acode0
= LLVMBuildAnd(builder
, acode
,
1684 lp_build_const_int_vec(gallivm
, type16
, 0x7), "");
1685 acode1
= LLVMBuildLShr(builder
, acode
,
1686 lp_build_const_int_vec(gallivm
, type16
, 3), "");
1687 acode
= lp_build_pack2(gallivm
, type16
, type8
, acode0
, acode1
);
1688 /* acode0 contains elems 0,4,8,12,2,6,10,14, acode1 1,5,9,... */
1690 acodeg
= LLVMBuildAnd(builder
, acode
,
1691 LLVMBuildNot(builder
, sel_mask
, ""), "");
1692 mask1
= lp_build_compare(gallivm
, type8
, PIPE_FUNC_EQUAL
,
1695 sel_mask
= LLVMBuildBitCast(builder
, sel_mask
, bld16
.vec_type
, "");
1696 ainterp0
= lp_build_lerpdxta_block(gallivm
, alpha0
, alpha1
, acode0
, sel_mask
);
1697 ainterp1
= lp_build_lerpdxta_block(gallivm
, alpha0
, alpha1
, acode1
, sel_mask
);
1698 sel_mask
= LLVMBuildBitCast(builder
, sel_mask
, bld8
.vec_type
, "");
1699 ainterp
= lp_build_pack2(gallivm
, type16
, type8
, ainterp0
, ainterp1
);
1700 alpha0
= lp_build_pack2(gallivm
, type16
, type8
, alpha0
, alpha0
);
1701 alpha1
= lp_build_pack2(gallivm
, type16
, type8
, alpha1
, alpha1
);
1702 ainterp
= LLVMBuildAdd(builder
, ainterp
, alpha0
, "");
1704 sel_mask2
= lp_build_compare(gallivm
, type8
, PIPE_FUNC_EQUAL
,
1706 ainterp
= lp_build_select(&bld8
, sel_mask2
, alpha0
, ainterp
);
1707 ainterp
= lp_build_select(&bld8
, mask1
, alpha1
, ainterp
);
1709 /* fix up val67 if a0 <= a1 */
1710 sel_mask2
= lp_build_compare(gallivm
, type8
, PIPE_FUNC_EQUAL
,
1711 acodeg
, lp_build_const_int_vec(gallivm
, type8
, 6));
1712 ares
= LLVMBuildAnd(builder
, ainterp
, LLVMBuildNot(builder
, sel_mask2
, ""), "");
1713 sel_mask2
= lp_build_compare(gallivm
, type8
, PIPE_FUNC_EQUAL
,
1714 acodeg
, lp_build_const_int_vec(gallivm
, type8
, 7));
1715 ares
= LLVMBuildOr(builder
, ares
, sel_mask2
, "");
1717 /* unpack in right order (0,4,8,12,1,5,..) */
1718 /* this gives us zero, a0, zero, a4, zero, a8, ... for tmp0 */
1719 tmp0
= lp_build_interleave2(gallivm
, type8
, bld8
.zero
, ares
, 0);
1720 tmp1
= lp_build_interleave2(gallivm
, type8
, bld8
.zero
, ares
, 1);
1721 tmp0
= LLVMBuildBitCast(builder
, tmp0
, bld16
.vec_type
, "");
1722 tmp1
= LLVMBuildBitCast(builder
, tmp1
, bld16
.vec_type
, "");
1724 a
[0] = lp_build_interleave2(gallivm
, type16
, bld16
.zero
, tmp0
, 0);
1725 a
[1] = lp_build_interleave2(gallivm
, type16
, bld16
.zero
, tmp1
, 0);
1726 a
[2] = lp_build_interleave2(gallivm
, type16
, bld16
.zero
, tmp0
, 1);
1727 a
[3] = lp_build_interleave2(gallivm
, type16
, bld16
.zero
, tmp1
, 1);
1730 LLVMValueRef elems
[16], intrargs
[2], shufa
, mulclo
, mulchi
, mask8hi
;
1731 LLVMTypeRef type16s
= LLVMInt16TypeInContext(gallivm
->context
);
1732 LLVMTypeRef type8s
= LLVMInt8TypeInContext(gallivm
->context
);
1735 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1736 * help since code crosses 8bit boundaries). But variable shifts are
1737 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1738 * shifts!). Instead, emulate by 16bit muls.
1739 * Also, the required byte shuffles are essentially non-emulatable, so
1740 * require ssse3 (albeit other archs might do them fine, but the
1741 * complete path is ssse3 only for now).
1743 for (i
= 0, j
= 0; i
< 16; i
+= 8, j
+= 3) {
1744 elems
[i
+0] = elems
[i
+1] = elems
[i
+2] = lp_build_const_int32(gallivm
, j
+2);
1745 elems
[i
+3] = elems
[i
+4] = lp_build_const_int32(gallivm
, j
+3);
1746 elems
[i
+5] = elems
[i
+6] = elems
[i
+7] = lp_build_const_int32(gallivm
, j
+4);
1748 shufa
= LLVMConstVector(elems
, 16);
1749 alpha
= LLVMBuildBitCast(builder
, alpha
, bld8
.vec_type
, "");
1750 acode
= LLVMBuildShuffleVector(builder
, alpha
, bld8
.undef
, shufa
, "");
1751 acode
= LLVMBuildBitCast(builder
, acode
, bld16
.vec_type
, "");
1753 * Put 0/2/4/6 into high 3 bits of 16 bits (save AND mask)
1754 * Do the same for 1/3/5/7 (albeit still need mask there - ideally
1755 * we'd place them into bits 4-7 so could save shift but impossible.)
1757 for (i
= 0; i
< 8; i
+= 4) {
1758 elems
[i
+0] = LLVMConstInt(type16s
, 1 << (13-0), 0);
1759 elems
[i
+1] = LLVMConstInt(type16s
, 1 << (13-6), 0);
1760 elems
[i
+2] = LLVMConstInt(type16s
, 1 << (13-4), 0);
1761 elems
[i
+3] = LLVMConstInt(type16s
, 1 << (13-2), 0);
1763 mulclo
= LLVMConstVector(elems
, 8);
1764 for (i
= 0; i
< 8; i
+= 4) {
1765 elems
[i
+0] = LLVMConstInt(type16s
, 1 << (13-3), 0);
1766 elems
[i
+1] = LLVMConstInt(type16s
, 1 << (13-9), 0);
1767 elems
[i
+2] = LLVMConstInt(type16s
, 1 << (13-7), 0);
1768 elems
[i
+3] = LLVMConstInt(type16s
, 1 << (13-5), 0);
1770 mulchi
= LLVMConstVector(elems
, 8);
1772 tmp0
= LLVMBuildMul(builder
, acode
, mulclo
, "");
1773 tmp1
= LLVMBuildMul(builder
, acode
, mulchi
, "");
1774 tmp0
= LLVMBuildLShr(builder
, tmp0
,
1775 lp_build_const_int_vec(gallivm
, type16
, 13), "");
1776 tmp1
= LLVMBuildLShr(builder
, tmp1
,
1777 lp_build_const_int_vec(gallivm
, type16
, 5), "");
1778 tmp1
= LLVMBuildAnd(builder
, tmp1
,
1779 lp_build_const_int_vec(gallivm
, type16
, 0x700), "");
1780 acode
= LLVMBuildOr(builder
, tmp0
, tmp1
, "");
1781 acode
= LLVMBuildBitCast(builder
, acode
, bld8
.vec_type
, "");
1784 * Note that ordering is different here to non-ssse3 path:
1788 LLVMValueRef weight0
, weight1
, weight
, delta
;
1789 LLVMValueRef constff_elem7
, const0_elem6
;
1790 /* weights, correctly rounded (round(256*x/7)) */
1791 elems
[0] = LLVMConstInt(type16s
, 256, 0);
1792 elems
[1] = LLVMConstInt(type16s
, 0, 0);
1793 elems
[2] = LLVMConstInt(type16s
, 219, 0);
1794 elems
[3] = LLVMConstInt(type16s
, 183, 0);
1795 elems
[4] = LLVMConstInt(type16s
, 146, 0);
1796 elems
[5] = LLVMConstInt(type16s
, 110, 0);
1797 elems
[6] = LLVMConstInt(type16s
, 73, 0);
1798 elems
[7] = LLVMConstInt(type16s
, 37, 0);
1799 weight0
= LLVMConstVector(elems
, 8);
1801 elems
[0] = LLVMConstInt(type16s
, 256, 0);
1802 elems
[1] = LLVMConstInt(type16s
, 0, 0);
1803 elems
[2] = LLVMConstInt(type16s
, 205, 0);
1804 elems
[3] = LLVMConstInt(type16s
, 154, 0);
1805 elems
[4] = LLVMConstInt(type16s
, 102, 0);
1806 elems
[5] = LLVMConstInt(type16s
, 51, 0);
1807 elems
[6] = LLVMConstInt(type16s
, 0, 0);
1808 elems
[7] = LLVMConstInt(type16s
, 0, 0);
1809 weight1
= LLVMConstVector(elems
, 8);
1811 weight0
= LLVMBuildBitCast(builder
, weight0
, bld8
.vec_type
, "");
1812 weight1
= LLVMBuildBitCast(builder
, weight1
, bld8
.vec_type
, "");
1813 weight
= lp_build_select(&bld8
, sel_mask
, weight0
, weight1
);
1814 weight
= LLVMBuildBitCast(builder
, weight
, bld16
.vec_type
, "");
1816 for (i
= 0; i
< 16; i
++) {
1817 elems
[i
] = LLVMConstNull(type8s
);
1819 elems
[7] = LLVMConstInt(type8s
, 255, 0);
1820 constff_elem7
= LLVMConstVector(elems
, 16);
1822 for (i
= 0; i
< 16; i
++) {
1823 elems
[i
] = LLVMConstInt(type8s
, 255, 0);
1825 elems
[6] = LLVMConstInt(type8s
, 0, 0);
1826 const0_elem6
= LLVMConstVector(elems
, 16);
1828 /* standard simple lerp - but the version we need isn't available */
1829 delta
= LLVMBuildSub(builder
, alpha0
, alpha1
, "");
1830 ainterp
= LLVMBuildMul(builder
, delta
, weight
, "");
1831 ainterp
= LLVMBuildLShr(builder
, ainterp
,
1832 lp_build_const_int_vec(gallivm
, type16
, 8), "");
1833 ainterp
= LLVMBuildBitCast(builder
, ainterp
, bld8
.vec_type
, "");
1834 alpha1
= LLVMBuildBitCast(builder
, alpha1
, bld8
.vec_type
, "");
1835 ainterp
= LLVMBuildAdd(builder
, ainterp
, alpha1
, "");
1836 ainterp
= LLVMBuildBitCast(builder
, ainterp
, bld16
.vec_type
, "");
1837 ainterp
= lp_build_pack2(gallivm
, type16
, type8
, ainterp
, bld16
.undef
);
1839 /* fixing 0/0xff case is slightly more complex */
1840 constff_elem7
= LLVMBuildAnd(builder
, constff_elem7
,
1841 LLVMBuildNot(builder
, sel_mask
, ""), "");
1842 const0_elem6
= LLVMBuildOr(builder
, const0_elem6
, sel_mask
, "");
1843 ainterp
= LLVMBuildOr(builder
, ainterp
, constff_elem7
, "");
1844 ainterp
= LLVMBuildAnd(builder
, ainterp
, const0_elem6
, "");
1846 /* now pick all 16 elements at once! */
1847 intrargs
[0] = ainterp
;
1848 intrargs
[1] = acode
;
1849 ares
= lp_build_intrinsic(builder
, "llvm.x86.ssse3.pshuf.b.128",
1850 bld8
.vec_type
, intrargs
, 2, 0);
1852 ares
= LLVMBuildBitCast(builder
, ares
, i32t
, "");
1853 mask8hi
= lp_build_const_int_vec(gallivm
, type32
, 0xff000000);
1854 a
[0] = LLVMBuildShl(builder
, ares
,
1855 lp_build_const_int_vec(gallivm
, type32
, 24), "");
1856 a
[1] = LLVMBuildShl(builder
, ares
,
1857 lp_build_const_int_vec(gallivm
, type32
, 16), "");
1858 a
[1] = LLVMBuildAnd(builder
, a
[1], mask8hi
, "");
1859 a
[2] = LLVMBuildShl(builder
, ares
,
1860 lp_build_const_int_vec(gallivm
, type32
, 8), "");
1861 a
[2] = LLVMBuildAnd(builder
, a
[2], mask8hi
, "");
1862 a
[3] = LLVMBuildAnd(builder
, ares
, mask8hi
, "");
1865 for (i
= 0; i
< 4; i
++) {
1866 a
[i
] = LLVMBuildBitCast(builder
, a
[i
], i32t
, "");
1867 col
[i
] = LLVMBuildOr(builder
, col
[i
], a
[i
], "");
1873 generate_update_cache_one_block(struct gallivm_state
*gallivm
,
1874 LLVMValueRef function
,
1875 const struct util_format_description
*format_desc
)
1877 LLVMBasicBlockRef block
;
1878 LLVMBuilderRef old_builder
;
1879 LLVMValueRef ptr_addr
;
1880 LLVMValueRef hash_index
;
1882 LLVMValueRef dxt_block
, tag_value
;
1883 LLVMValueRef col
[LP_MAX_VECTOR_LENGTH
];
1885 ptr_addr
= LLVMGetParam(function
, 0);
1886 hash_index
= LLVMGetParam(function
, 1);
1887 cache
= LLVMGetParam(function
, 2);
1889 lp_build_name(ptr_addr
, "ptr_addr" );
1890 lp_build_name(hash_index
, "hash_index");
1891 lp_build_name(cache
, "cache_addr");
1897 old_builder
= gallivm
->builder
;
1898 block
= LLVMAppendBasicBlockInContext(gallivm
->context
, function
, "entry");
1899 gallivm
->builder
= LLVMCreateBuilderInContext(gallivm
->context
);
1900 LLVMPositionBuilderAtEnd(gallivm
->builder
, block
);
1902 lp_build_gather_s3tc_simple_scalar(gallivm
, format_desc
, &dxt_block
,
1905 switch (format_desc
->format
) {
1906 case PIPE_FORMAT_DXT1_RGB
:
1907 case PIPE_FORMAT_DXT1_RGBA
:
1908 case PIPE_FORMAT_DXT1_SRGB
:
1909 case PIPE_FORMAT_DXT1_SRGBA
:
1910 s3tc_decode_block_dxt1(gallivm
, format_desc
->format
, dxt_block
, col
);
1912 case PIPE_FORMAT_DXT3_RGBA
:
1913 case PIPE_FORMAT_DXT3_SRGBA
:
1914 s3tc_decode_block_dxt3(gallivm
, format_desc
->format
, dxt_block
, col
);
1916 case PIPE_FORMAT_DXT5_RGBA
:
1917 case PIPE_FORMAT_DXT5_SRGBA
:
1918 s3tc_decode_block_dxt5(gallivm
, format_desc
->format
, dxt_block
, col
);
1922 s3tc_decode_block_dxt1(gallivm
, format_desc
->format
, dxt_block
, col
);
1926 tag_value
= LLVMBuildPtrToInt(gallivm
->builder
, ptr_addr
,
1927 LLVMInt64TypeInContext(gallivm
->context
), "");
1928 s3tc_store_cached_block(gallivm
, col
, tag_value
, hash_index
, cache
);
1930 LLVMBuildRetVoid(gallivm
->builder
);
1932 LLVMDisposeBuilder(gallivm
->builder
);
1933 gallivm
->builder
= old_builder
;
1935 gallivm_verify_function(gallivm
, function
);
1940 update_cached_block(struct gallivm_state
*gallivm
,
1941 const struct util_format_description
*format_desc
,
1942 LLVMValueRef ptr_addr
,
1943 LLVMValueRef hash_index
,
1947 LLVMBuilderRef builder
= gallivm
->builder
;
1948 LLVMModuleRef module
= gallivm
->module
;
1950 LLVMTypeRef i8t
= LLVMInt8TypeInContext(gallivm
->context
);
1951 LLVMTypeRef pi8t
= LLVMPointerType(i8t
, 0);
1952 LLVMValueRef function
, inst
;
1953 LLVMBasicBlockRef bb
;
1954 LLVMValueRef args
[3];
1956 util_snprintf(name
, sizeof name
, "%s_update_cache_one_block",
1957 format_desc
->short_name
);
1958 function
= LLVMGetNamedFunction(module
, name
);
1961 LLVMTypeRef ret_type
;
1962 LLVMTypeRef arg_types
[3];
1963 LLVMTypeRef function_type
;
1967 * Generate the function prototype.
1970 ret_type
= LLVMVoidTypeInContext(gallivm
->context
);
1971 arg_types
[0] = pi8t
;
1972 arg_types
[1] = LLVMInt32TypeInContext(gallivm
->context
);
1973 arg_types
[2] = LLVMTypeOf(cache
); // XXX: put right type here
1974 function_type
= LLVMFunctionType(ret_type
, arg_types
, ARRAY_SIZE(arg_types
), 0);
1975 function
= LLVMAddFunction(module
, name
, function_type
);
1977 for (arg
= 0; arg
< ARRAY_SIZE(arg_types
); ++arg
)
1978 if (LLVMGetTypeKind(arg_types
[arg
]) == LLVMPointerTypeKind
)
1979 lp_add_function_attr(function
, arg
+ 1, LP_FUNC_ATTR_NOALIAS
);
1981 LLVMSetFunctionCallConv(function
, LLVMFastCallConv
);
1982 LLVMSetVisibility(function
, LLVMHiddenVisibility
);
1983 generate_update_cache_one_block(gallivm
, function
, format_desc
);
1987 args
[1] = hash_index
;
1990 LLVMBuildCall(builder
, function
, args
, ARRAY_SIZE(args
), "");
1991 bb
= LLVMGetInsertBlock(builder
);
1992 inst
= LLVMGetLastInstruction(bb
);
1993 LLVMSetInstructionCallConv(inst
, LLVMFastCallConv
);
2000 compressed_fetch_cached(struct gallivm_state
*gallivm
,
2001 const struct util_format_description
*format_desc
,
2003 LLVMValueRef base_ptr
,
2004 LLVMValueRef offset
,
2010 LLVMBuilderRef builder
= gallivm
->builder
;
2011 unsigned count
, low_bit
, log2size
;
2012 LLVMValueRef color
, offset_stored
, addr
, ptr_addrtrunc
, tmp
;
2013 LLVMValueRef ij_index
, hash_index
, hash_mask
, block_index
;
2014 LLVMTypeRef i8t
= LLVMInt8TypeInContext(gallivm
->context
);
2015 LLVMTypeRef i32t
= LLVMInt32TypeInContext(gallivm
->context
);
2016 LLVMTypeRef i64t
= LLVMInt64TypeInContext(gallivm
->context
);
2017 struct lp_type type
;
2018 struct lp_build_context bld32
;
2019 memset(&type
, 0, sizeof type
);
2023 lp_build_context_init(&bld32
, gallivm
, type
);
2026 * compute hash - we use direct mapped cache, the hash function could
2027 * be better but it needs to be simple
2029 * compare offset with offset stored at tag (hash)
2030 * if not equal extract block, store block, update tag
2031 * extract color from cache
2035 low_bit
= util_logbase2(format_desc
->block
.bits
/ 8);
2036 log2size
= util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE
);
2037 addr
= LLVMBuildPtrToInt(builder
, base_ptr
, i64t
, "");
2038 ptr_addrtrunc
= LLVMBuildPtrToInt(builder
, base_ptr
, i32t
, "");
2039 ptr_addrtrunc
= lp_build_broadcast_scalar(&bld32
, ptr_addrtrunc
);
2040 /* For the hash function, first mask off the unused lowest bits. Then just
2041 do some xor with address bits - only use lower 32bits */
2042 ptr_addrtrunc
= LLVMBuildAdd(builder
, offset
, ptr_addrtrunc
, "");
2043 ptr_addrtrunc
= LLVMBuildLShr(builder
, ptr_addrtrunc
,
2044 lp_build_const_int_vec(gallivm
, type
, low_bit
), "");
2045 /* This only really makes sense for size 64,128,256 */
2046 hash_index
= ptr_addrtrunc
;
2047 ptr_addrtrunc
= LLVMBuildLShr(builder
, ptr_addrtrunc
,
2048 lp_build_const_int_vec(gallivm
, type
, 2*log2size
), "");
2049 hash_index
= LLVMBuildXor(builder
, ptr_addrtrunc
, hash_index
, "");
2050 tmp
= LLVMBuildLShr(builder
, hash_index
,
2051 lp_build_const_int_vec(gallivm
, type
, log2size
), "");
2052 hash_index
= LLVMBuildXor(builder
, hash_index
, tmp
, "");
2054 hash_mask
= lp_build_const_int_vec(gallivm
, type
, LP_BUILD_FORMAT_CACHE_SIZE
- 1);
2055 hash_index
= LLVMBuildAnd(builder
, hash_index
, hash_mask
, "");
2056 ij_index
= LLVMBuildShl(builder
, i
, lp_build_const_int_vec(gallivm
, type
, 2), "");
2057 ij_index
= LLVMBuildAdd(builder
, ij_index
, j
, "");
2058 block_index
= LLVMBuildShl(builder
, hash_index
,
2059 lp_build_const_int_vec(gallivm
, type
, 4), "");
2060 block_index
= LLVMBuildAdd(builder
, ij_index
, block_index
, "");
2063 color
= bld32
.undef
;
2064 for (count
= 0; count
< n
; count
++) {
2065 LLVMValueRef index
, cond
, colorx
;
2066 LLVMValueRef block_indexx
, hash_indexx
, addrx
, offsetx
, ptr_addrx
;
2067 struct lp_build_if_state if_ctx
;
2069 index
= lp_build_const_int32(gallivm
, count
);
2070 offsetx
= LLVMBuildExtractElement(builder
, offset
, index
, "");
2071 addrx
= LLVMBuildZExt(builder
, offsetx
, i64t
, "");
2072 addrx
= LLVMBuildAdd(builder
, addrx
, addr
, "");
2073 block_indexx
= LLVMBuildExtractElement(builder
, block_index
, index
, "");
2074 hash_indexx
= LLVMBuildLShr(builder
, block_indexx
,
2075 lp_build_const_int32(gallivm
, 4), "");
2076 offset_stored
= s3tc_lookup_tag_data(gallivm
, cache
, hash_indexx
);
2077 cond
= LLVMBuildICmp(builder
, LLVMIntNE
, offset_stored
, addrx
, "");
2079 lp_build_if(&if_ctx
, gallivm
, cond
);
2081 ptr_addrx
= LLVMBuildIntToPtr(builder
, addrx
,
2082 LLVMPointerType(i8t
, 0), "");
2083 update_cached_block(gallivm
, format_desc
, ptr_addrx
, hash_indexx
, cache
);
2084 #if LP_BUILD_FORMAT_CACHE_DEBUG
2085 s3tc_update_cache_access(gallivm
, cache
, 1,
2086 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS
);
2089 lp_build_endif(&if_ctx
);
2091 colorx
= s3tc_lookup_cached_pixel(gallivm
, cache
, block_indexx
);
2093 color
= LLVMBuildInsertElement(builder
, color
, colorx
,
2094 lp_build_const_int32(gallivm
, count
), "");
2099 struct lp_build_if_state if_ctx
;
2101 tmp
= LLVMBuildZExt(builder
, offset
, i64t
, "");
2102 addr
= LLVMBuildAdd(builder
, tmp
, addr
, "");
2103 offset_stored
= s3tc_lookup_tag_data(gallivm
, cache
, hash_index
);
2104 cond
= LLVMBuildICmp(builder
, LLVMIntNE
, offset_stored
, addr
, "");
2106 lp_build_if(&if_ctx
, gallivm
, cond
);
2108 tmp
= LLVMBuildIntToPtr(builder
, addr
, LLVMPointerType(i8t
, 0), "");
2109 update_cached_block(gallivm
, format_desc
, tmp
, hash_index
, cache
);
2110 #if LP_BUILD_FORMAT_CACHE_DEBUG
2111 s3tc_update_cache_access(gallivm
, cache
, 1,
2112 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS
);
2115 lp_build_endif(&if_ctx
);
2117 color
= s3tc_lookup_cached_pixel(gallivm
, cache
, block_index
);
2119 #if LP_BUILD_FORMAT_CACHE_DEBUG
2120 s3tc_update_cache_access(gallivm
, cache
, n
,
2121 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL
);
2123 return LLVMBuildBitCast(builder
, color
, LLVMVectorType(i8t
, n
* 4), "");
2128 s3tc_dxt5_to_rgba_aos(struct gallivm_state
*gallivm
,
2130 enum pipe_format format
,
2131 LLVMValueRef colors
,
2132 LLVMValueRef codewords
,
2133 LLVMValueRef alpha_lo
,
2134 LLVMValueRef alpha_hi
,
2138 return s3tc_dxt5_full_to_rgba_aos(gallivm
, n
, format
, colors
,
2139 codewords
, alpha_lo
, alpha_hi
, i
, j
);
2144 * @param n number of pixels processed (usually n=4, but it should also work with n=1
2145 * and multiples of 4)
2146 * @param base_ptr base pointer (32bit or 64bit pointer depending on the architecture)
2147 * @param offset <n x i32> vector with the relative offsets of the S3TC blocks
2148 * @param i is a <n x i32> vector with the x subpixel coordinate (0..3)
2149 * @param j is a <n x i32> vector with the y subpixel coordinate (0..3)
2150 * @return a <4*n x i8> vector with the pixel RGBA values in AoS
2153 lp_build_fetch_s3tc_rgba_aos(struct gallivm_state
*gallivm
,
2154 const struct util_format_description
*format_desc
,
2156 LLVMValueRef base_ptr
,
2157 LLVMValueRef offset
,
2163 LLVMTypeRef i8t
= LLVMInt8TypeInContext(gallivm
->context
);
2164 LLVMBuilderRef builder
= gallivm
->builder
;
2166 assert(format_desc
->layout
== UTIL_FORMAT_LAYOUT_S3TC
);
2167 assert(format_desc
->block
.width
== 4);
2168 assert(format_desc
->block
.height
== 4);
2170 assert((n
== 1) || (n
% 4 == 0));
2172 /* debug_printf("format = %d\n", format_desc->format);*/
2174 rgba
= compressed_fetch_cached(gallivm
, format_desc
, n
,
2175 base_ptr
, offset
, i
, j
, cache
);
2181 LLVMTypeRef i8_vectype
= LLVMVectorType(i8t
, 4 * n
);
2182 LLVMTypeRef i128_type
= LLVMIntTypeInContext(gallivm
->context
, 128);
2183 LLVMTypeRef i128_vectype
= LLVMVectorType(i128_type
, n
/ 4);
2184 LLVMTypeRef i324_vectype
= LLVMVectorType(LLVMInt32TypeInContext(
2185 gallivm
->context
), 4);
2186 LLVMValueRef offset4
, i4
, j4
, rgba4
[LP_MAX_VECTOR_LENGTH
/16];
2187 struct lp_type lp_324_vectype
= lp_type_uint_vec(32, 128);
2189 assert(n
/ 4 <= ARRAY_SIZE(rgba4
));
2191 rgba
= LLVMGetUndef(i128_vectype
);
2193 for (count
= 0; count
< n
/ 4; count
++) {
2194 LLVMValueRef colors
, codewords
, alpha_lo
= NULL
, alpha_hi
= NULL
;
2196 i4
= lp_build_extract_range(gallivm
, i
, count
* 4, 4);
2197 j4
= lp_build_extract_range(gallivm
, j
, count
* 4, 4);
2198 offset4
= lp_build_extract_range(gallivm
, offset
, count
* 4, 4);
2200 lp_build_gather_s3tc(gallivm
, 4, format_desc
, &colors
, &codewords
,
2201 &alpha_lo
, &alpha_hi
, base_ptr
, offset4
);
2203 switch (format_desc
->format
) {
2204 case PIPE_FORMAT_DXT1_RGB
:
2205 case PIPE_FORMAT_DXT1_RGBA
:
2206 case PIPE_FORMAT_DXT1_SRGB
:
2207 case PIPE_FORMAT_DXT1_SRGBA
:
2208 rgba4
[count
] = s3tc_dxt1_to_rgba_aos(gallivm
, 4, format_desc
->format
,
2209 colors
, codewords
, i4
, j4
);
2211 case PIPE_FORMAT_DXT3_RGBA
:
2212 case PIPE_FORMAT_DXT3_SRGBA
:
2213 rgba4
[count
] = s3tc_dxt3_to_rgba_aos(gallivm
, 4, format_desc
->format
, colors
,
2214 codewords
, alpha_lo
, alpha_hi
, i4
, j4
);
2216 case PIPE_FORMAT_DXT5_RGBA
:
2217 case PIPE_FORMAT_DXT5_SRGBA
:
2218 rgba4
[count
] = s3tc_dxt5_to_rgba_aos(gallivm
, 4, format_desc
->format
, colors
,
2219 codewords
, alpha_lo
, alpha_hi
, i4
, j4
);
2223 rgba4
[count
] = LLVMGetUndef(LLVMVectorType(i8t
, 4));
2226 /* shuffles typically give best results with dword elements...*/
2227 rgba4
[count
] = LLVMBuildBitCast(builder
, rgba4
[count
], i324_vectype
, "");
2229 rgba
= lp_build_concat(gallivm
, rgba4
, lp_324_vectype
, n
/ 4);
2230 rgba
= LLVMBuildBitCast(builder
, rgba
, i8_vectype
, "");
2233 LLVMValueRef colors
, codewords
, alpha_lo
= NULL
, alpha_hi
= NULL
;
2235 lp_build_gather_s3tc(gallivm
, n
, format_desc
, &colors
, &codewords
,
2236 &alpha_lo
, &alpha_hi
, base_ptr
, offset
);
2238 switch (format_desc
->format
) {
2239 case PIPE_FORMAT_DXT1_RGB
:
2240 case PIPE_FORMAT_DXT1_RGBA
:
2241 case PIPE_FORMAT_DXT1_SRGB
:
2242 case PIPE_FORMAT_DXT1_SRGBA
:
2243 rgba
= s3tc_dxt1_to_rgba_aos(gallivm
, n
, format_desc
->format
,
2244 colors
, codewords
, i
, j
);
2246 case PIPE_FORMAT_DXT3_RGBA
:
2247 case PIPE_FORMAT_DXT3_SRGBA
:
2248 rgba
= s3tc_dxt3_to_rgba_aos(gallivm
, n
, format_desc
->format
, colors
,
2249 codewords
, alpha_lo
, alpha_hi
, i
, j
);
2251 case PIPE_FORMAT_DXT5_RGBA
:
2252 case PIPE_FORMAT_DXT5_SRGBA
:
2253 rgba
= s3tc_dxt5_to_rgba_aos(gallivm
, n
, format_desc
->format
, colors
,
2254 codewords
, alpha_lo
, alpha_hi
, i
, j
);
2258 rgba
= LLVMGetUndef(LLVMVectorType(i8t
, 4*n
));
2263 /* always return just decompressed values - srgb conversion is done later */