1 /**************************************************************************
3 * Copyright 2010-2018 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 * The above copyright notice and this permission notice (including the
23 * next paragraph) shall be included in all copies or substantial portions
26 **************************************************************************/
31 * s3tc pixel format manipulation.
33 * @author Roland Scheidegger <sroland@vmware.com>
37 #include <llvm/Config/llvm-config.h>
39 #include "util/format/u_format.h"
40 #include "util/u_math.h"
41 #include "util/u_string.h"
42 #include "util/u_cpu_detect.h"
43 #include "util/u_debug.h"
45 #include "lp_bld_arit.h"
46 #include "lp_bld_type.h"
47 #include "lp_bld_const.h"
48 #include "lp_bld_conv.h"
49 #include "lp_bld_gather.h"
50 #include "lp_bld_format.h"
51 #include "lp_bld_logic.h"
52 #include "lp_bld_pack.h"
53 #include "lp_bld_flow.h"
54 #include "lp_bld_printf.h"
55 #include "lp_bld_struct.h"
56 #include "lp_bld_swizzle.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_debug.h"
59 #include "lp_bld_intr.h"
63 * Reverse an interleave2_half
64 * (i.e. pick every second element, independent lower/upper halves)
65 * sse2 can only do that with 32bit (shufps) or larger elements
66 * natively. (Otherwise, and/pack (even) or shift/pack (odd)
67 * could be used, ideally llvm would do that for us.)
68 * XXX: Unfortunately, this does NOT translate to a shufps if those
69 * are int vectors (and casting will not help, llvm needs to recognize it
70 * as "real" float). Instead, llvm will use a pshufd/pshufd/punpcklqdq
71 * sequence which I'm pretty sure is a lot worse despite domain transition
72 * penalties with shufps (except maybe on Nehalem).
75 lp_build_uninterleave2_half(struct gallivm_state
*gallivm
,
81 LLVMValueRef shuffle
, elems
[LP_MAX_VECTOR_LENGTH
];
84 assert(type
.length
<= LP_MAX_VECTOR_LENGTH
);
87 if (type
.length
* type
.width
== 256) {
88 assert(type
.length
== 8);
89 assert(type
.width
== 32);
90 static const unsigned shufvals
[8] = {0, 2, 8, 10, 4, 6, 12, 14};
91 for (i
= 0; i
< type
.length
; ++i
) {
92 elems
[i
] = lp_build_const_int32(gallivm
, shufvals
[i
] + lo_hi
);
95 for (i
= 0; i
< type
.length
; ++i
) {
96 elems
[i
] = lp_build_const_int32(gallivm
, 2*i
+ lo_hi
);
100 shuffle
= LLVMConstVector(elems
, type
.length
);
102 return LLVMBuildShuffleVector(gallivm
->builder
, a
, b
, shuffle
, "");
108 * Build shuffle for extending vectors.
111 lp_build_const_extend_shuffle(struct gallivm_state
*gallivm
,
112 unsigned n
, unsigned length
)
114 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
118 assert(length
<= LP_MAX_VECTOR_LENGTH
);
120 /* TODO: cache results in a static table */
122 for(i
= 0; i
< n
; i
++) {
123 elems
[i
] = lp_build_const_int32(gallivm
, i
);
125 for (i
= n
; i
< length
; i
++) {
126 elems
[i
] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm
->context
));
129 return LLVMConstVector(elems
, length
);
133 lp_build_const_unpackx2_shuffle(struct gallivm_state
*gallivm
, unsigned n
)
135 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
138 assert(n
<= LP_MAX_VECTOR_LENGTH
);
140 /* TODO: cache results in a static table */
142 for(i
= 0, j
= 0; i
< n
; i
+= 2, ++j
) {
143 elems
[i
+ 0] = lp_build_const_int32(gallivm
, 0 + j
);
144 elems
[i
+ 1] = lp_build_const_int32(gallivm
, n
+ j
);
145 elems
[n
+ i
+ 0] = lp_build_const_int32(gallivm
, 0 + n
/2 + j
);
146 elems
[n
+ i
+ 1] = lp_build_const_int32(gallivm
, n
+ n
/2 + j
);
149 return LLVMConstVector(elems
, n
* 2);
153 * broadcast 1 element to all elements
156 lp_build_const_shuffle1(struct gallivm_state
*gallivm
,
157 unsigned index
, unsigned n
)
159 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
];
162 assert(n
<= LP_MAX_VECTOR_LENGTH
);
164 /* TODO: cache results in a static table */
166 for (i
= 0; i
< n
; i
++) {
167 elems
[i
] = lp_build_const_int32(gallivm
, index
);
170 return LLVMConstVector(elems
, n
);
174 * move 1 element to pos 0, rest undef
177 lp_build_shuffle1undef(struct gallivm_state
*gallivm
,
178 LLVMValueRef a
, unsigned index
, unsigned n
)
180 LLVMValueRef elems
[LP_MAX_VECTOR_LENGTH
], shuf
;
183 assert(n
<= LP_MAX_VECTOR_LENGTH
);
185 elems
[0] = lp_build_const_int32(gallivm
, index
);
187 for (i
= 1; i
< n
; i
++) {
188 elems
[i
] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm
->context
));
190 shuf
= LLVMConstVector(elems
, n
);
192 return LLVMBuildShuffleVector(gallivm
->builder
, a
, a
, shuf
, "");
196 format_dxt1_variant(enum pipe_format format
)
198 return format
== PIPE_FORMAT_DXT1_RGB
||
199 format
== PIPE_FORMAT_DXT1_RGBA
||
200 format
== PIPE_FORMAT_DXT1_SRGB
||
201 format
== PIPE_FORMAT_DXT1_SRGBA
;
206 * Gather elements from scatter positions in memory into vectors.
207 * This is customised for fetching texels from s3tc textures.
208 * For SSE, typical value is length=4.
210 * @param length length of the offsets
211 * @param colors the stored colors of the blocks will be extracted into this.
212 * @param codewords the codewords of the blocks will be extracted into this.
213 * @param alpha_lo used for storing lower 32bit of alpha components for dxt3/5
214 * @param alpha_hi used for storing higher 32bit of alpha components for dxt3/5
215 * @param base_ptr base pointer, should be a i8 pointer type.
216 * @param offsets vector with offsets
219 lp_build_gather_s3tc(struct gallivm_state
*gallivm
,
221 const struct util_format_description
*format_desc
,
222 LLVMValueRef
*colors
,
223 LLVMValueRef
*codewords
,
224 LLVMValueRef
*alpha_lo
,
225 LLVMValueRef
*alpha_hi
,
226 LLVMValueRef base_ptr
,
227 LLVMValueRef offsets
)
229 LLVMBuilderRef builder
= gallivm
->builder
;
230 unsigned block_bits
= format_desc
->block
.bits
;
232 LLVMValueRef elems
[8];
233 LLVMTypeRef type32
= LLVMInt32TypeInContext(gallivm
->context
);
234 LLVMTypeRef type64
= LLVMInt64TypeInContext(gallivm
->context
);
235 LLVMTypeRef type32dxt
;
236 struct lp_type lp_type32dxt
;
238 memset(&lp_type32dxt
, 0, sizeof lp_type32dxt
);
239 lp_type32dxt
.width
= 32;
240 lp_type32dxt
.length
= block_bits
/ 32;
241 type32dxt
= lp_build_vec_type(gallivm
, lp_type32dxt
);
243 assert(block_bits
== 64 || block_bits
== 128);
244 assert(length
== 1 || length
== 4 || length
== 8);
246 for (i
= 0; i
< length
; ++i
) {
247 elems
[i
] = lp_build_gather_elem(gallivm
, length
,
248 block_bits
, block_bits
, TRUE
,
249 base_ptr
, offsets
, i
, FALSE
);
250 elems
[i
] = LLVMBuildBitCast(builder
, elems
[i
], type32dxt
, "");
253 LLVMValueRef elem
= elems
[0];
254 if (block_bits
== 128) {
255 *alpha_lo
= LLVMBuildExtractElement(builder
, elem
,
256 lp_build_const_int32(gallivm
, 0), "");
257 *alpha_hi
= LLVMBuildExtractElement(builder
, elem
,
258 lp_build_const_int32(gallivm
, 1), "");
259 *colors
= LLVMBuildExtractElement(builder
, elem
,
260 lp_build_const_int32(gallivm
, 2), "");
261 *codewords
= LLVMBuildExtractElement(builder
, elem
,
262 lp_build_const_int32(gallivm
, 3), "");
265 *alpha_lo
= LLVMGetUndef(type32
);
266 *alpha_hi
= LLVMGetUndef(type32
);
267 *colors
= LLVMBuildExtractElement(builder
, elem
,
268 lp_build_const_int32(gallivm
, 0), "");
269 *codewords
= LLVMBuildExtractElement(builder
, elem
,
270 lp_build_const_int32(gallivm
, 1), "");
274 LLVMValueRef tmp
[4], cc01
, cc23
;
275 struct lp_type lp_type32
, lp_type64
;
276 memset(&lp_type32
, 0, sizeof lp_type32
);
277 lp_type32
.width
= 32;
278 lp_type32
.length
= length
;
279 memset(&lp_type64
, 0, sizeof lp_type64
);
280 lp_type64
.width
= 64;
281 lp_type64
.length
= length
/2;
283 if (block_bits
== 128) {
285 for (i
= 0; i
< 4; ++i
) {
288 elems
[i
] = lp_build_concat(gallivm
, tmp
, lp_type32dxt
, 2);
291 lp_build_transpose_aos(gallivm
, lp_type32
, elems
, tmp
);
297 LLVMTypeRef type64_vec
= LLVMVectorType(type64
, length
/2);
298 LLVMTypeRef type32_vec
= LLVMVectorType(type32
, length
);
300 for (i
= 0; i
< length
; ++i
) {
302 elems
[i
] = LLVMBuildShuffleVector(builder
, elems
[i
],
303 LLVMGetUndef(type32dxt
),
304 lp_build_const_extend_shuffle(gallivm
, 2, 4), "");
307 struct lp_type lp_type32_4
= {0};
308 lp_type32_4
.width
= 32;
309 lp_type32_4
.length
= 4;
310 for (i
= 0; i
< 4; ++i
) {
313 elems
[i
] = lp_build_concat(gallivm
, tmp
, lp_type32_4
, 2);
316 cc01
= lp_build_interleave2_half(gallivm
, lp_type32
, elems
[0], elems
[1], 0);
317 cc23
= lp_build_interleave2_half(gallivm
, lp_type32
, elems
[2], elems
[3], 0);
318 cc01
= LLVMBuildBitCast(builder
, cc01
, type64_vec
, "");
319 cc23
= LLVMBuildBitCast(builder
, cc23
, type64_vec
, "");
320 *colors
= lp_build_interleave2_half(gallivm
, lp_type64
, cc01
, cc23
, 0);
321 *codewords
= lp_build_interleave2_half(gallivm
, lp_type64
, cc01
, cc23
, 1);
322 *colors
= LLVMBuildBitCast(builder
, *colors
, type32_vec
, "");
323 *codewords
= LLVMBuildBitCast(builder
, *codewords
, type32_vec
, "");
328 /** Convert from <n x i32> containing 2 x n rgb565 colors
329 * to 2 <n x i32> rgba8888 colors
330 * This is the most optimized version I can think of
331 * should be nearly as fast as decoding only one color
332 * NOTE: alpha channel will be set to 0
333 * @param colors is a <n x i32> vector containing the rgb565 colors
336 color_expand2_565_to_8888(struct gallivm_state
*gallivm
,
339 LLVMValueRef
*color0
,
340 LLVMValueRef
*color1
)
342 LLVMBuilderRef builder
= gallivm
->builder
;
343 LLVMValueRef r
, g
, b
, rblo
, glo
;
344 LLVMValueRef rgblomask
, rb
, rgb0
, rgb1
;
345 struct lp_type type
, type16
, type8
;
349 memset(&type
, 0, sizeof type
);
353 memset(&type16
, 0, sizeof type16
);
355 type16
.length
= 2 * n
;
357 memset(&type8
, 0, sizeof type8
);
359 type8
.length
= 4 * n
;
361 rgblomask
= lp_build_const_int_vec(gallivm
, type16
, 0x0707);
362 colors
= LLVMBuildBitCast(builder
, colors
,
363 lp_build_vec_type(gallivm
, type16
), "");
364 /* move r into low 8 bits, b into high 8 bits, g into another reg (low bits)
365 * make sure low bits of r are zero - could use AND but requires constant */
366 r
= LLVMBuildLShr(builder
, colors
, lp_build_const_int_vec(gallivm
, type16
, 11), "");
367 r
= LLVMBuildShl(builder
, r
, lp_build_const_int_vec(gallivm
, type16
, 3), "");
368 b
= LLVMBuildShl(builder
, colors
, lp_build_const_int_vec(gallivm
, type16
, 11), "");
369 rb
= LLVMBuildOr(builder
, r
, b
, "");
370 rblo
= LLVMBuildLShr(builder
, rb
, lp_build_const_int_vec(gallivm
, type16
, 5), "");
371 /* don't have byte shift hence need mask */
372 rblo
= LLVMBuildAnd(builder
, rblo
, rgblomask
, "");
373 rb
= LLVMBuildOr(builder
, rb
, rblo
, "");
375 /* make sure low bits of g are zero */
376 g
= LLVMBuildAnd(builder
, colors
, lp_build_const_int_vec(gallivm
, type16
, 0x07e0), "");
377 g
= LLVMBuildLShr(builder
, g
, lp_build_const_int_vec(gallivm
, type16
, 3), "");
378 glo
= LLVMBuildLShr(builder
, g
, lp_build_const_int_vec(gallivm
, type16
, 6), "");
379 g
= LLVMBuildOr(builder
, g
, glo
, "");
381 rb
= LLVMBuildBitCast(builder
, rb
, lp_build_vec_type(gallivm
, type8
), "");
382 g
= LLVMBuildBitCast(builder
, g
, lp_build_vec_type(gallivm
, type8
), "");
383 rgb0
= lp_build_interleave2_half(gallivm
, type8
, rb
, g
, 0);
384 rgb1
= lp_build_interleave2_half(gallivm
, type8
, rb
, g
, 1);
386 rgb0
= LLVMBuildBitCast(builder
, rgb0
, lp_build_vec_type(gallivm
, type
), "");
387 rgb1
= LLVMBuildBitCast(builder
, rgb1
, lp_build_vec_type(gallivm
, type
), "");
389 /* rgb0 is rgb00, rgb01, rgb10, rgb11
390 * instead of rgb00, rgb10, rgb20, rgb30 hence need reshuffle
391 * on x86 this _should_ just generate one shufps...
393 *color0
= lp_build_uninterleave2_half(gallivm
, type
, rgb0
, rgb1
, 0);
394 *color1
= lp_build_uninterleave2_half(gallivm
, type
, rgb0
, rgb1
, 1);
398 /** Convert from <n x i32> containing rgb565 colors
399 * (in first 16 bits) to <n x i32> rgba8888 colors
401 * NOTE: alpha channel will be set to 0
402 * @param colors is a <n x i32> vector containing the rgb565 colors
405 color_expand_565_to_8888(struct gallivm_state
*gallivm
,
409 LLVMBuilderRef builder
= gallivm
->builder
;
410 LLVMValueRef rgba
, r
, g
, b
, rgblo
, glo
;
411 LLVMValueRef rbhimask
, g6mask
, rgblomask
;
413 memset(&type
, 0, sizeof type
);
418 * first extract and shift colors into their final locations
419 * (high bits - low bits zero at this point)
420 * then replicate highest bits to the lowest bits
421 * note rb replication can be done in parallel but not g
423 * r5mask = 0xf800, g6mask = 0x07e0, b5mask = 0x001f
424 * rhigh = 8, ghigh = 5, bhigh = 19
425 * rblow = 5, glow = 6
426 * rgblowmask = 0x00070307
427 * r = colors >> rhigh
428 * b = colors << bhigh
429 * g = (colors & g6mask) << ghigh
430 * rb = (r | b) & rbhimask
431 * rbtmp = rb >> rblow
433 * rbtmp = rbtmp | gtmp
434 * rbtmp = rbtmp & rgblowmask
435 * rgb = rb | g | rbtmp
437 g6mask
= lp_build_const_int_vec(gallivm
, type
, 0x07e0);
438 rbhimask
= lp_build_const_int_vec(gallivm
, type
, 0x00f800f8);
439 rgblomask
= lp_build_const_int_vec(gallivm
, type
, 0x00070307);
441 r
= LLVMBuildLShr(builder
, colors
, lp_build_const_int_vec(gallivm
, type
, 8), "");
442 b
= LLVMBuildShl(builder
, colors
, lp_build_const_int_vec(gallivm
, type
, 19), "");
443 g
= LLVMBuildAnd(builder
, colors
, g6mask
, "");
444 g
= LLVMBuildShl(builder
, g
, lp_build_const_int_vec(gallivm
, type
, 5), "");
445 rgba
= LLVMBuildOr(builder
, r
, b
, "");
446 rgba
= LLVMBuildAnd(builder
, rgba
, rbhimask
, "");
447 rgblo
= LLVMBuildLShr(builder
, rgba
, lp_build_const_int_vec(gallivm
, type
, 5), "");
448 glo
= LLVMBuildLShr(builder
, g
, lp_build_const_int_vec(gallivm
, type
, 6), "");
449 rgblo
= LLVMBuildOr(builder
, rgblo
, glo
, "");
450 rgblo
= LLVMBuildAnd(builder
, rgblo
, rgblomask
, "");
451 rgba
= LLVMBuildOr(builder
, rgba
, g
, "");
452 rgba
= LLVMBuildOr(builder
, rgba
, rgblo
, "");
459 * Average two byte vectors. (Will always round up.)
462 lp_build_pavgb(struct lp_build_context
*bld8
,
466 struct gallivm_state
*gallivm
= bld8
->gallivm
;
467 LLVMBuilderRef builder
= gallivm
->builder
;
468 assert(bld8
->type
.width
== 8);
469 assert(bld8
->type
.length
== 16 || bld8
->type
.length
== 32);
470 if (LLVM_VERSION_MAJOR
< 6) {
471 LLVMValueRef intrargs
[2];
472 char *intr_name
= bld8
->type
.length
== 32 ? "llvm.x86.avx2.pavg.b" :
473 "llvm.x86.sse2.pavg.b";
476 return lp_build_intrinsic(builder
, intr_name
,
477 bld8
->vec_type
, intrargs
, 2, 0);
480 * Must match llvm's autoupgrade of pavg.b intrinsic to be useful.
481 * You better hope the backend code manages to detect the pattern, and
482 * the pattern doesn't change there...
484 struct lp_type type_ext
= bld8
->type
;
485 LLVMTypeRef vec_type_ext
;
487 LLVMValueRef ext_one
;
489 vec_type_ext
= lp_build_vec_type(gallivm
, type_ext
);
490 ext_one
= lp_build_const_vec(gallivm
, type_ext
, 1);
492 v0
= LLVMBuildZExt(builder
, v0
, vec_type_ext
, "");
493 v1
= LLVMBuildZExt(builder
, v1
, vec_type_ext
, "");
494 res
= LLVMBuildAdd(builder
, v0
, v1
, "");
495 res
= LLVMBuildAdd(builder
, res
, ext_one
, "");
496 res
= LLVMBuildLShr(builder
, res
, ext_one
, "");
497 res
= LLVMBuildTrunc(builder
, res
, bld8
->vec_type
, "");
503 * Calculate 1/3(v1-v0) + v0
504 * and 2*1/3(v1-v0) + v0
507 lp_build_lerp23(struct lp_build_context
*bld
,
513 struct gallivm_state
*gallivm
= bld
->gallivm
;
514 LLVMValueRef x
, x_lo
, x_hi
, delta_lo
, delta_hi
;
515 LLVMValueRef mul_lo
, mul_hi
, v0_lo
, v0_hi
, v1_lo
, v1_hi
, tmp
;
516 const struct lp_type type
= bld
->type
;
517 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
518 struct lp_type i16_type
= lp_wider_type(type
);
519 struct lp_build_context bld2
;
521 assert(lp_check_value(type
, v0
));
522 assert(lp_check_value(type
, v1
));
523 assert(!type
.floating
&& !type
.fixed
&& !type
.norm
&& type
.width
== 8);
525 lp_build_context_init(&bld2
, gallivm
, i16_type
);
526 bld2
.type
.sign
= TRUE
;
527 x
= lp_build_const_int_vec(gallivm
, bld
->type
, 255*1/3);
529 /* FIXME: use native avx256 unpack/pack */
530 lp_build_unpack2(gallivm
, type
, i16_type
, x
, &x_lo
, &x_hi
);
531 lp_build_unpack2(gallivm
, type
, i16_type
, v0
, &v0_lo
, &v0_hi
);
532 lp_build_unpack2(gallivm
, type
, i16_type
, v1
, &v1_lo
, &v1_hi
);
533 delta_lo
= lp_build_sub(&bld2
, v1_lo
, v0_lo
);
534 delta_hi
= lp_build_sub(&bld2
, v1_hi
, v0_hi
);
536 mul_lo
= LLVMBuildMul(builder
, x_lo
, delta_lo
, "");
537 mul_hi
= LLVMBuildMul(builder
, x_hi
, delta_hi
, "");
539 x_lo
= LLVMBuildLShr(builder
, mul_lo
, lp_build_const_int_vec(gallivm
, i16_type
, 8), "");
540 x_hi
= LLVMBuildLShr(builder
, mul_hi
, lp_build_const_int_vec(gallivm
, i16_type
, 8), "");
541 /* lerp optimization: pack now, do add afterwards */
542 tmp
= lp_build_pack2(gallivm
, i16_type
, type
, x_lo
, x_hi
);
543 *res0
= lp_build_add(bld
, tmp
, v0
);
545 x_lo
= LLVMBuildLShr(builder
, mul_lo
, lp_build_const_int_vec(gallivm
, i16_type
, 7), "");
546 x_hi
= LLVMBuildLShr(builder
, mul_hi
, lp_build_const_int_vec(gallivm
, i16_type
, 7), "");
547 /* unlike above still need mask (but add still afterwards). */
548 x_lo
= LLVMBuildAnd(builder
, x_lo
, lp_build_const_int_vec(gallivm
, i16_type
, 0xff), "");
549 x_hi
= LLVMBuildAnd(builder
, x_hi
, lp_build_const_int_vec(gallivm
, i16_type
, 0xff), "");
550 tmp
= lp_build_pack2(gallivm
, i16_type
, type
, x_lo
, x_hi
);
551 *res1
= lp_build_add(bld
, tmp
, v0
);
555 * Convert from <n x i64> s3tc dxt1 to <4n x i8> RGBA AoS
556 * @param colors is a <n x i32> vector with n x 2x16bit colors
557 * @param codewords is a <n x i32> vector containing the codewords
558 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
559 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
562 s3tc_dxt1_full_to_rgba_aos(struct gallivm_state
*gallivm
,
564 enum pipe_format format
,
566 LLVMValueRef codewords
,
570 LLVMBuilderRef builder
= gallivm
->builder
;
571 LLVMValueRef color0
, color1
, color2
, color3
, color2_2
, color3_2
;
572 LLVMValueRef rgba
, a
, colors0
, colors1
, col0
, col1
, const2
;
573 LLVMValueRef bit_pos
, sel_mask
, sel_lo
, sel_hi
, indices
;
574 struct lp_type type
, type8
;
575 struct lp_build_context bld8
, bld32
;
576 boolean is_dxt1_variant
= format_dxt1_variant(format
);
578 memset(&type
, 0, sizeof type
);
582 memset(&type8
, 0, sizeof type8
);
586 assert(lp_check_value(type
, i
));
587 assert(lp_check_value(type
, j
));
589 a
= lp_build_const_int_vec(gallivm
, type
, 0xff000000);
591 lp_build_context_init(&bld32
, gallivm
, type
);
592 lp_build_context_init(&bld8
, gallivm
, type8
);
596 * - expand color0/color1 to rgba8888
597 * - calculate color2/3 (interpolation) according to color0 < color1 rules
598 * - calculate color2/3 according to color0 >= color1 rules
599 * - do selection of color2/3 according to comparison of color0/1
600 * - extract indices (vector shift).
601 * - use compare/select to select the correct color. Since we have 2bit
602 * indices (and 4 colors), needs at least three compare/selects.
605 * expand the two colors
607 col0
= LLVMBuildAnd(builder
, colors
, lp_build_const_int_vec(gallivm
, type
, 0x0000ffff), "");
608 col1
= LLVMBuildLShr(builder
, colors
, lp_build_const_int_vec(gallivm
, type
, 16), "");
610 color_expand2_565_to_8888(gallivm
, n
, colors
, &color0
, &color1
);
613 color0
= color_expand_565_to_8888(gallivm
, n
, col0
);
614 color1
= color_expand_565_to_8888(gallivm
, n
, col1
);
619 * color2_1 is 2/3 color0 + 1/3 color1
620 * color3_1 is 1/3 color0 + 2/3 color1
621 * color2_2 is 1/2 color0 + 1/2 color1
625 colors0
= LLVMBuildBitCast(builder
, color0
, bld8
.vec_type
, "");
626 colors1
= LLVMBuildBitCast(builder
, color1
, bld8
.vec_type
, "");
627 /* can combine 2 lerps into one mostly - still looks expensive enough. */
628 lp_build_lerp23(&bld8
, colors0
, colors1
, &color2
, &color3
);
629 color2
= LLVMBuildBitCast(builder
, color2
, bld32
.vec_type
, "");
630 color3
= LLVMBuildBitCast(builder
, color3
, bld32
.vec_type
, "");
632 /* dxt3/5 always use 4-color encoding */
633 if (is_dxt1_variant
) {
635 if (format
== PIPE_FORMAT_DXT1_RGBA
||
636 format
== PIPE_FORMAT_DXT1_SRGBA
) {
637 color0
= LLVMBuildOr(builder
, color0
, a
, "");
638 color1
= LLVMBuildOr(builder
, color1
, a
, "");
639 color3
= LLVMBuildOr(builder
, color3
, a
, "");
642 * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1.
643 * Much cheaper (but we don't care that much if n == 1).
645 if ((util_cpu_caps
.has_sse2
&& n
== 4) ||
646 (util_cpu_caps
.has_avx2
&& n
== 8)) {
647 color2_2
= lp_build_pavgb(&bld8
, colors0
, colors1
);
648 color2_2
= LLVMBuildBitCast(builder
, color2_2
, bld32
.vec_type
, "");
651 struct lp_type i16_type
= lp_wider_type(type8
);
652 struct lp_build_context bld2
;
653 LLVMValueRef v0_lo
, v0_hi
, v1_lo
, v1_hi
, addlo
, addhi
;
655 lp_build_context_init(&bld2
, gallivm
, i16_type
);
656 bld2
.type
.sign
= TRUE
;
659 * This isn't as expensive as it looks (the unpack is the same as
660 * for lerp23), with correct rounding.
661 * (Note that while rounding is correct, this will always round down,
662 * whereas pavgb will always round up.)
664 /* FIXME: use native avx256 unpack/pack */
665 lp_build_unpack2(gallivm
, type8
, i16_type
, colors0
, &v0_lo
, &v0_hi
);
666 lp_build_unpack2(gallivm
, type8
, i16_type
, colors1
, &v1_lo
, &v1_hi
);
668 addlo
= lp_build_add(&bld2
, v0_lo
, v1_lo
);
669 addhi
= lp_build_add(&bld2
, v0_hi
, v1_hi
);
670 addlo
= LLVMBuildLShr(builder
, addlo
,
671 lp_build_const_int_vec(gallivm
, i16_type
, 1), "");
672 addhi
= LLVMBuildLShr(builder
, addhi
,
673 lp_build_const_int_vec(gallivm
, i16_type
, 1), "");
674 color2_2
= lp_build_pack2(gallivm
, i16_type
, type8
, addlo
, addhi
);
675 color2_2
= LLVMBuildBitCast(builder
, color2_2
, bld32
.vec_type
, "");
677 color3_2
= lp_build_const_int_vec(gallivm
, type
, 0);
679 /* select between colors2/3 */
680 /* signed compare is faster, saves some xors */
682 sel_mask
= lp_build_compare(gallivm
, type
, PIPE_FUNC_GREATER
, col0
, col1
);
683 color2
= lp_build_select(&bld32
, sel_mask
, color2
, color2_2
);
684 color3
= lp_build_select(&bld32
, sel_mask
, color3
, color3_2
);
687 if (format
== PIPE_FORMAT_DXT1_RGBA
||
688 format
== PIPE_FORMAT_DXT1_SRGBA
) {
689 color2
= LLVMBuildOr(builder
, color2
, a
, "");
693 const2
= lp_build_const_int_vec(gallivm
, type
, 2);
694 /* extract 2-bit index values */
695 bit_pos
= LLVMBuildShl(builder
, j
, const2
, "");
696 bit_pos
= LLVMBuildAdd(builder
, bit_pos
, i
, "");
697 bit_pos
= LLVMBuildAdd(builder
, bit_pos
, bit_pos
, "");
699 * NOTE: This innocent looking shift is very expensive with x86/ssex.
700 * Shifts with per-element shift count get roughly translated to
701 * extract (count), extract (value), shift, move (back to xmm), unpack
703 * So about 20 instructions here for 4xi32.
704 * Newer llvm versions (3.7+) will not do extract/insert but use a
705 * couple constant count vector shifts plus shuffles. About same
706 * amount of instructions unfortunately...
707 * Would get much worse with 8xi16 even...
708 * We could actually do better here:
709 * - subtract bit_pos from 128+30, shl 23, convert float to int...
710 * - now do mul with codewords followed by shr 30...
711 * But requires 32bit->32bit mul, sse41 only (well that's emulatable
712 * with 2 32bit->64bit muls...) and not exactly cheap
713 * AVX2, of course, fixes this nonsense.
715 indices
= LLVMBuildLShr(builder
, codewords
, bit_pos
, "");
717 /* finally select the colors */
718 sel_lo
= LLVMBuildAnd(builder
, indices
, bld32
.one
, "");
719 sel_lo
= lp_build_compare(gallivm
, type
, PIPE_FUNC_EQUAL
, sel_lo
, bld32
.one
);
720 color0
= lp_build_select(&bld32
, sel_lo
, color1
, color0
);
721 color2
= lp_build_select(&bld32
, sel_lo
, color3
, color2
);
722 sel_hi
= LLVMBuildAnd(builder
, indices
, const2
, "");
723 sel_hi
= lp_build_compare(gallivm
, type
, PIPE_FUNC_EQUAL
, sel_hi
, const2
);
724 rgba
= lp_build_select(&bld32
, sel_hi
, color2
, color0
);
727 if (format
== PIPE_FORMAT_DXT1_RGB
||
728 format
== PIPE_FORMAT_DXT1_SRGB
) {
729 rgba
= LLVMBuildOr(builder
, rgba
, a
, "");
731 return LLVMBuildBitCast(builder
, rgba
, bld8
.vec_type
, "");
736 s3tc_dxt1_to_rgba_aos(struct gallivm_state
*gallivm
,
738 enum pipe_format format
,
740 LLVMValueRef codewords
,
744 return s3tc_dxt1_full_to_rgba_aos(gallivm
, n
, format
,
745 colors
, codewords
, i
, j
);
750 * Convert from <n x i128> s3tc dxt3 to <4n x i8> RGBA AoS
751 * @param colors is a <n x i32> vector with n x 2x16bit colors
752 * @param codewords is a <n x i32> vector containing the codewords
753 * @param alpha_low, alpha_hi are <n x i32> vectors containing the lower/upper 32 bits of the alpha values
754 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
755 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
758 s3tc_dxt3_to_rgba_aos(struct gallivm_state
*gallivm
,
760 enum pipe_format format
,
762 LLVMValueRef codewords
,
763 LLVMValueRef alpha_low
,
764 LLVMValueRef alpha_hi
,
768 LLVMBuilderRef builder
= gallivm
->builder
;
769 LLVMValueRef rgba
, tmp
, tmp2
;
770 LLVMValueRef bit_pos
, sel_mask
;
771 struct lp_type type
, type8
;
772 struct lp_build_context bld
;
774 memset(&type
, 0, sizeof type
);
778 memset(&type8
, 0, sizeof type8
);
782 assert(lp_check_value(type
, i
));
783 assert(lp_check_value(type
, j
));
785 lp_build_context_init(&bld
, gallivm
, type
);
787 rgba
= s3tc_dxt1_to_rgba_aos(gallivm
, n
, format
,
788 colors
, codewords
, i
, j
);
790 rgba
= LLVMBuildBitCast(builder
, rgba
, bld
.vec_type
, "");
793 * Extract alpha values. Since we now need to select from
794 * which 32bit vector values are fetched, construct selection
795 * mask from highest bit of bit_pos, and use select, then shift
796 * according to the bit_pos (without the highest bit).
797 * Note this is pointless for n == 1 case. Could just
798 * directly use 64bit arithmetic if we'd extract 64bit
799 * alpha value instead of 2x32...
802 bit_pos
= LLVMBuildShl(builder
, j
, lp_build_const_int_vec(gallivm
, type
, 2), "");
803 bit_pos
= LLVMBuildAdd(builder
, bit_pos
, i
, "");
804 bit_pos
= LLVMBuildShl(builder
, bit_pos
,
805 lp_build_const_int_vec(gallivm
, type
, 2), "");
806 sel_mask
= LLVMBuildLShr(builder
, bit_pos
,
807 lp_build_const_int_vec(gallivm
, type
, 5), "");
808 sel_mask
= LLVMBuildSub(builder
, sel_mask
, bld
.one
, "");
809 tmp
= lp_build_select(&bld
, sel_mask
, alpha_low
, alpha_hi
);
810 bit_pos
= LLVMBuildAnd(builder
, bit_pos
,
811 lp_build_const_int_vec(gallivm
, type
, 0xffffffdf), "");
812 /* Warning: slow shift with per element count (without avx2) */
814 * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
815 * to select the right byte with pshufb. Then for the remaining one bit
816 * just do shift/select.
818 tmp
= LLVMBuildLShr(builder
, tmp
, bit_pos
, "");
820 /* combined expand from a4 to a8 and shift into position */
821 tmp
= LLVMBuildShl(builder
, tmp
, lp_build_const_int_vec(gallivm
, type
, 28), "");
822 tmp2
= LLVMBuildLShr(builder
, tmp
, lp_build_const_int_vec(gallivm
, type
, 4), "");
823 tmp
= LLVMBuildOr(builder
, tmp
, tmp2
, "");
825 rgba
= LLVMBuildOr(builder
, tmp
, rgba
, "");
827 return LLVMBuildBitCast(builder
, rgba
, lp_build_vec_type(gallivm
, type8
), "");
831 lp_build_lerpdxta(struct gallivm_state
*gallivm
,
835 LLVMValueRef sel_mask
,
839 * note we're doing lerp in 16bit since 32bit pmulld is only available in sse41
840 * (plus pmullw is actually faster...)
841 * we just pretend our 32bit values (which are really only 8bit) are 16bits.
842 * Note that this is obviously a disaster for the scalar case.
844 LLVMBuilderRef builder
= gallivm
->builder
;
845 LLVMValueRef delta
, ainterp
;
846 LLVMValueRef weight5
, weight7
, weight
;
847 struct lp_type type32
, type16
, type8
;
848 struct lp_build_context bld16
;
850 memset(&type32
, 0, sizeof type32
);
853 memset(&type16
, 0, sizeof type16
);
857 memset(&type8
, 0, sizeof type8
);
861 lp_build_context_init(&bld16
, gallivm
, type16
);
862 /* 255/7 is a bit off - increase accuracy at the expense of shift later */
863 sel_mask
= LLVMBuildBitCast(builder
, sel_mask
, bld16
.vec_type
, "");
864 weight5
= lp_build_const_int_vec(gallivm
, type16
, 255*64/5);
865 weight7
= lp_build_const_int_vec(gallivm
, type16
, 255*64/7);
866 weight
= lp_build_select(&bld16
, sel_mask
, weight7
, weight5
);
868 alpha0
= LLVMBuildBitCast(builder
, alpha0
, bld16
.vec_type
, "");
869 alpha1
= LLVMBuildBitCast(builder
, alpha1
, bld16
.vec_type
, "");
870 code
= LLVMBuildBitCast(builder
, code
, bld16
.vec_type
, "");
871 /* we'll get garbage in the elements which had code 0 (or larger than 5 or 7)
873 code
= LLVMBuildSub(builder
, code
, bld16
.one
, "");
875 weight
= LLVMBuildMul(builder
, weight
, code
, "");
876 weight
= LLVMBuildLShr(builder
, weight
,
877 lp_build_const_int_vec(gallivm
, type16
, 6), "");
879 delta
= LLVMBuildSub(builder
, alpha1
, alpha0
, "");
881 ainterp
= LLVMBuildMul(builder
, delta
, weight
, "");
882 ainterp
= LLVMBuildLShr(builder
, ainterp
,
883 lp_build_const_int_vec(gallivm
, type16
, 8), "");
885 ainterp
= LLVMBuildBitCast(builder
, ainterp
, lp_build_vec_type(gallivm
, type8
), "");
886 alpha0
= LLVMBuildBitCast(builder
, alpha0
, lp_build_vec_type(gallivm
, type8
), "");
887 ainterp
= LLVMBuildAdd(builder
, alpha0
, ainterp
, "");
888 ainterp
= LLVMBuildBitCast(builder
, ainterp
, lp_build_vec_type(gallivm
, type32
), "");
894 s3tc_dxt5_alpha_channel(struct gallivm_state
*gallivm
,
897 LLVMValueRef alpha_hi
, LLVMValueRef alpha_lo
,
898 LLVMValueRef i
, LLVMValueRef j
)
900 LLVMBuilderRef builder
= gallivm
->builder
;
901 struct lp_type type
, type8
;
902 LLVMValueRef tmp
, alpha0
, alpha1
, alphac
, alphac0
, bit_pos
, shift
;
903 LLVMValueRef sel_mask
, tmp_mask
, alpha
, alpha64
, code_s
;
904 LLVMValueRef mask6
, mask7
, ainterp
;
905 LLVMTypeRef i64t
= LLVMInt64TypeInContext(gallivm
->context
);
906 LLVMTypeRef i32t
= LLVMInt32TypeInContext(gallivm
->context
);
907 struct lp_build_context bld32
;
909 memset(&type
, 0, sizeof type
);
913 memset(&type8
, 0, sizeof type8
);
916 type8
.sign
= is_signed
;
918 lp_build_context_init(&bld32
, gallivm
, type
);
919 /* this looks pretty complex for vectorization:
920 * extract a0/a1 values
922 * select weights for interpolation depending on a0 > a1
923 * mul weights by code - 1
925 * use selects for getting either a0, a1, interp a, interp a/0.0, interp a/1.0
928 alpha0
= LLVMBuildAnd(builder
, alpha_lo
,
929 lp_build_const_int_vec(gallivm
, type
, 0xff), "");
931 alpha0
= LLVMBuildTrunc(builder
, alpha0
, lp_build_vec_type(gallivm
, type8
), "");
932 alpha0
= LLVMBuildSExt(builder
, alpha0
, lp_build_vec_type(gallivm
, type
), "");
935 alpha1
= LLVMBuildLShr(builder
, alpha_lo
,
936 lp_build_const_int_vec(gallivm
, type
, 8), "");
937 alpha1
= LLVMBuildAnd(builder
, alpha1
,
938 lp_build_const_int_vec(gallivm
, type
, 0xff), "");
940 alpha1
= LLVMBuildTrunc(builder
, alpha1
, lp_build_vec_type(gallivm
, type8
), "");
941 alpha1
= LLVMBuildSExt(builder
, alpha1
, lp_build_vec_type(gallivm
, type
), "");
945 bit_pos
= LLVMBuildShl(builder
, j
, lp_build_const_int_vec(gallivm
, type
, 2), "");
946 bit_pos
= LLVMBuildAdd(builder
, bit_pos
, i
, "");
947 tmp
= LLVMBuildAdd(builder
, bit_pos
, bit_pos
, "");
948 bit_pos
= LLVMBuildAdd(builder
, bit_pos
, tmp
, "");
949 /* get rid of first 2 bytes - saves shifts of alpha_lo/hi */
950 bit_pos
= LLVMBuildAdd(builder
, bit_pos
,
951 lp_build_const_int_vec(gallivm
, type
, 16), "");
954 struct lp_type type64
;
955 memset(&type64
, 0, sizeof type64
);
958 /* This is pretty pointless could avoid by just directly extracting
959 64bit in the first place but makes it more complicated elsewhere */
960 alpha_lo
= LLVMBuildZExt(builder
, alpha_lo
, i64t
, "");
961 alpha_hi
= LLVMBuildZExt(builder
, alpha_hi
, i64t
, "");
962 alphac0
= LLVMBuildShl(builder
, alpha_hi
,
963 lp_build_const_int_vec(gallivm
, type64
, 32), "");
964 alphac0
= LLVMBuildOr(builder
, alpha_lo
, alphac0
, "");
966 shift
= LLVMBuildZExt(builder
, bit_pos
, i64t
, "");
967 alphac0
= LLVMBuildLShr(builder
, alphac0
, shift
, "");
968 alphac0
= LLVMBuildTrunc(builder
, alphac0
, i32t
, "");
969 alphac
= LLVMBuildAnd(builder
, alphac0
,
970 lp_build_const_int_vec(gallivm
, type
, 0x7), "");
974 * Using non-native vector length here (actually, with avx2 and
975 * n == 4 llvm will indeed expand to ymm regs...)
976 * At least newer llvm versions handle that ok.
977 * llvm 3.7+ will even handle the emulated 64bit shift with variable
978 * shift count without extraction (and it's actually easier to
979 * emulate than the 32bit one).
981 alpha64
= LLVMBuildShuffleVector(builder
, alpha_lo
, alpha_hi
,
982 lp_build_const_unpackx2_shuffle(gallivm
, n
), "");
984 alpha64
= LLVMBuildBitCast(builder
, alpha64
, LLVMVectorType(i64t
, n
), "");
985 shift
= LLVMBuildZExt(builder
, bit_pos
, LLVMVectorType(i64t
, n
), "");
986 alphac
= LLVMBuildLShr(builder
, alpha64
, shift
, "");
987 alphac
= LLVMBuildTrunc(builder
, alphac
, bld32
.vec_type
, "");
989 alphac
= LLVMBuildAnd(builder
, alphac
,
990 lp_build_const_int_vec(gallivm
, type
, 0x7), "");
993 /* signed compare is faster saves some xors */
995 /* alpha0 > alpha1 selection */
996 sel_mask
= lp_build_compare(gallivm
, type
, PIPE_FUNC_GREATER
,
998 ainterp
= lp_build_lerpdxta(gallivm
, alpha0
, alpha1
, alphac
, sel_mask
, n
);
1001 * if a0 > a1 then we select a0 for case 0, a1 for case 1, interp otherwise.
1002 * else we select a0 for case 0, a1 for case 1,
1003 * interp for case 2-5, 00 for 6 and 0xff(ffffff) for 7
1004 * a = (c == 0) ? a0 : a1
1005 * a = (c > 1) ? ainterp : a
1006 * Finally handle case 6/7 for !(a0 > a1)
1007 * a = (!(a0 > a1) && c == 6) ? 0 : a (andnot with mask)
1008 * a = (!(a0 > a1) && c == 7) ? 0xffffffff : a (or with mask)
1010 tmp_mask
= lp_build_compare(gallivm
, type
, PIPE_FUNC_EQUAL
,
1011 alphac
, bld32
.zero
);
1012 alpha
= lp_build_select(&bld32
, tmp_mask
, alpha0
, alpha1
);
1013 tmp_mask
= lp_build_compare(gallivm
, type
, PIPE_FUNC_GREATER
,
1015 alpha
= lp_build_select(&bld32
, tmp_mask
, ainterp
, alpha
);
1017 code_s
= LLVMBuildAnd(builder
, alphac
,
1018 LLVMBuildNot(builder
, sel_mask
, ""), "");
1019 mask6
= lp_build_compare(gallivm
, type
, PIPE_FUNC_EQUAL
,
1020 code_s
, lp_build_const_int_vec(gallivm
, type
, 6));
1021 mask7
= lp_build_compare(gallivm
, type
, PIPE_FUNC_EQUAL
,
1022 code_s
, lp_build_const_int_vec(gallivm
, type
, 7));
1024 alpha
= lp_build_select(&bld32
, mask6
, lp_build_const_int_vec(gallivm
, type
, -127), alpha
);
1025 alpha
= lp_build_select(&bld32
, mask7
, lp_build_const_int_vec(gallivm
, type
, 127), alpha
);
1027 alpha
= LLVMBuildAnd(builder
, alpha
, LLVMBuildNot(builder
, mask6
, ""), "");
1028 alpha
= LLVMBuildOr(builder
, alpha
, mask7
, "");
1030 /* There can be garbage in upper bits, mask them off for rgtc formats */
1031 alpha
= LLVMBuildAnd(builder
, alpha
, lp_build_const_int_vec(gallivm
, type
, 0xff), "");
1037 * Convert from <n x i128> s3tc dxt5 to <4n x i8> RGBA AoS
1038 * @param colors is a <n x i32> vector with n x 2x16bit colors
1039 * @param codewords is a <n x i32> vector containing the codewords
1040 * @param alphas is a <n x i64> vector containing the alpha values
1041 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
1042 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
1045 s3tc_dxt5_full_to_rgba_aos(struct gallivm_state
*gallivm
,
1047 enum pipe_format format
,
1048 LLVMValueRef colors
,
1049 LLVMValueRef codewords
,
1050 LLVMValueRef alpha_lo
,
1051 LLVMValueRef alpha_hi
,
1055 LLVMBuilderRef builder
= gallivm
->builder
;
1056 LLVMValueRef rgba
, alpha
;
1057 struct lp_type type
, type8
;
1058 struct lp_build_context bld32
;
1060 memset(&type
, 0, sizeof type
);
1064 memset(&type8
, 0, sizeof type8
);
1068 assert(lp_check_value(type
, i
));
1069 assert(lp_check_value(type
, j
));
1071 lp_build_context_init(&bld32
, gallivm
, type
);
1073 assert(lp_check_value(type
, i
));
1074 assert(lp_check_value(type
, j
));
1076 rgba
= s3tc_dxt1_to_rgba_aos(gallivm
, n
, format
,
1077 colors
, codewords
, i
, j
);
1079 rgba
= LLVMBuildBitCast(builder
, rgba
, bld32
.vec_type
, "");
1081 alpha
= s3tc_dxt5_alpha_channel(gallivm
, false, n
, alpha_hi
, alpha_lo
, i
, j
);
1082 alpha
= LLVMBuildShl(builder
, alpha
, lp_build_const_int_vec(gallivm
, type
, 24), "");
1083 rgba
= LLVMBuildOr(builder
, alpha
, rgba
, "");
1085 return LLVMBuildBitCast(builder
, rgba
, lp_build_vec_type(gallivm
, type8
), "");
1090 lp_build_gather_s3tc_simple_scalar(struct gallivm_state
*gallivm
,
1091 const struct util_format_description
*format_desc
,
1092 LLVMValueRef
*dxt_block
,
1095 LLVMBuilderRef builder
= gallivm
->builder
;
1096 unsigned block_bits
= format_desc
->block
.bits
;
1097 LLVMValueRef elem
, shuf
;
1098 LLVMTypeRef type32
= LLVMIntTypeInContext(gallivm
->context
, 32);
1099 LLVMTypeRef src_type
= LLVMIntTypeInContext(gallivm
->context
, block_bits
);
1100 LLVMTypeRef src_ptr_type
= LLVMPointerType(src_type
, 0);
1101 LLVMTypeRef type32_4
= LLVMVectorType(type32
, 4);
1103 assert(block_bits
== 64 || block_bits
== 128);
1105 ptr
= LLVMBuildBitCast(builder
, ptr
, src_ptr_type
, "");
1106 elem
= LLVMBuildLoad(builder
, ptr
, "");
1108 if (block_bits
== 128) {
1109 /* just return block as is */
1110 *dxt_block
= LLVMBuildBitCast(builder
, elem
, type32_4
, "");
1113 LLVMTypeRef type32_2
= LLVMVectorType(type32
, 2);
1114 shuf
= lp_build_const_extend_shuffle(gallivm
, 2, 4);
1115 elem
= LLVMBuildBitCast(builder
, elem
, type32_2
, "");
1116 *dxt_block
= LLVMBuildShuffleVector(builder
, elem
,
1117 LLVMGetUndef(type32_2
), shuf
, "");
1123 s3tc_store_cached_block(struct gallivm_state
*gallivm
,
1125 LLVMValueRef tag_value
,
1126 LLVMValueRef hash_index
,
1129 LLVMBuilderRef builder
= gallivm
->builder
;
1130 LLVMValueRef ptr
, indices
[3];
1131 LLVMTypeRef type_ptr4x32
;
1134 type_ptr4x32
= LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm
->context
), 4), 0);
1135 indices
[0] = lp_build_const_int32(gallivm
, 0);
1136 indices
[1] = lp_build_const_int32(gallivm
, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS
);
1137 indices
[2] = hash_index
;
1138 ptr
= LLVMBuildGEP(builder
, cache
, indices
, ARRAY_SIZE(indices
), "");
1139 LLVMBuildStore(builder
, tag_value
, ptr
);
1141 indices
[1] = lp_build_const_int32(gallivm
, LP_BUILD_FORMAT_CACHE_MEMBER_DATA
);
1142 hash_index
= LLVMBuildMul(builder
, hash_index
,
1143 lp_build_const_int32(gallivm
, 16), "");
1144 for (count
= 0; count
< 4; count
++) {
1145 indices
[2] = hash_index
;
1146 ptr
= LLVMBuildGEP(builder
, cache
, indices
, ARRAY_SIZE(indices
), "");
1147 ptr
= LLVMBuildBitCast(builder
, ptr
, type_ptr4x32
, "");
1148 LLVMBuildStore(builder
, col
[count
], ptr
);
1149 hash_index
= LLVMBuildAdd(builder
, hash_index
,
1150 lp_build_const_int32(gallivm
, 4), "");
1155 s3tc_lookup_cached_pixel(struct gallivm_state
*gallivm
,
1159 LLVMBuilderRef builder
= gallivm
->builder
;
1160 LLVMValueRef member_ptr
, indices
[3];
1162 indices
[0] = lp_build_const_int32(gallivm
, 0);
1163 indices
[1] = lp_build_const_int32(gallivm
, LP_BUILD_FORMAT_CACHE_MEMBER_DATA
);
1165 member_ptr
= LLVMBuildGEP(builder
, ptr
, indices
, ARRAY_SIZE(indices
), "");
1166 return LLVMBuildLoad(builder
, member_ptr
, "cache_data");
1170 s3tc_lookup_tag_data(struct gallivm_state
*gallivm
,
1174 LLVMBuilderRef builder
= gallivm
->builder
;
1175 LLVMValueRef member_ptr
, indices
[3];
1177 indices
[0] = lp_build_const_int32(gallivm
, 0);
1178 indices
[1] = lp_build_const_int32(gallivm
, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS
);
1180 member_ptr
= LLVMBuildGEP(builder
, ptr
, indices
, ARRAY_SIZE(indices
), "");
1181 return LLVMBuildLoad(builder
, member_ptr
, "tag_data");
1184 #if LP_BUILD_FORMAT_CACHE_DEBUG
1186 s3tc_update_cache_access(struct gallivm_state
*gallivm
,
1191 LLVMBuilderRef builder
= gallivm
->builder
;
1192 LLVMValueRef member_ptr
, cache_access
;
1194 assert(index
== LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL
||
1195 index
== LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS
);
1197 member_ptr
= lp_build_struct_get_ptr(gallivm
, ptr
, index
, "");
1198 cache_access
= LLVMBuildLoad(builder
, member_ptr
, "cache_access");
1199 cache_access
= LLVMBuildAdd(builder
, cache_access
,
1200 LLVMConstInt(LLVMInt64TypeInContext(gallivm
->context
),
1202 LLVMBuildStore(builder
, cache_access
, member_ptr
);
1207 * Calculate 1/3(v1-v0) + v0 and 2*1/3(v1-v0) + v0.
1208 * The lerp is performed between the first 2 32bit colors
1209 * in the source vector, both results are returned packed in result vector.
1212 lp_build_lerp23_single(struct lp_build_context
*bld
,
1215 struct gallivm_state
*gallivm
= bld
->gallivm
;
1216 LLVMValueRef x
, mul
, delta
, res
, v0
, v1
, elems
[8];
1217 const struct lp_type type
= bld
->type
;
1218 LLVMBuilderRef builder
= bld
->gallivm
->builder
;
1219 struct lp_type i16_type
= lp_wider_type(type
);
1220 struct lp_type i32_type
= lp_wider_type(i16_type
);
1221 struct lp_build_context bld2
;
1223 assert(!type
.floating
&& !type
.fixed
&& !type
.norm
&& type
.width
== 8);
1225 lp_build_context_init(&bld2
, gallivm
, i16_type
);
1226 bld2
.type
.sign
= TRUE
;
1228 /* weights 256/3, 256*2/3, with correct rounding */
1229 elems
[0] = elems
[1] = elems
[2] = elems
[3] =
1230 lp_build_const_elem(gallivm
, i16_type
, 255*1/3);
1231 elems
[4] = elems
[5] = elems
[6] = elems
[7] =
1232 lp_build_const_elem(gallivm
, i16_type
, 171);
1233 x
= LLVMConstVector(elems
, 8);
1236 * v01 has col0 in 32bit elem 0, col1 in elem 1.
1237 * Interleave/unpack will give us separate v0/v1 vectors.
1239 v01
= lp_build_interleave2(gallivm
, i32_type
, v01
, v01
, 0);
1240 v01
= LLVMBuildBitCast(builder
, v01
, bld
->vec_type
, "");
1242 lp_build_unpack2(gallivm
, type
, i16_type
, v01
, &v0
, &v1
);
1243 delta
= lp_build_sub(&bld2
, v1
, v0
);
1245 mul
= LLVMBuildMul(builder
, x
, delta
, "");
1247 mul
= LLVMBuildLShr(builder
, mul
, lp_build_const_int_vec(gallivm
, i16_type
, 8), "");
1248 /* lerp optimization: pack now, do add afterwards */
1249 res
= lp_build_pack2(gallivm
, i16_type
, type
, mul
, bld2
.undef
);
1250 /* only lower 2 elems are valid - for these v0 is really v0 */
1251 return lp_build_add(bld
, res
, v01
);
1255 * decode one dxt1 block.
1258 s3tc_decode_block_dxt1(struct gallivm_state
*gallivm
,
1259 enum pipe_format format
,
1260 LLVMValueRef dxt_block
,
1263 LLVMBuilderRef builder
= gallivm
->builder
;
1264 LLVMValueRef color01
, color23
, color01_16
, color0123
;
1265 LLVMValueRef rgba
, tmp
, a
, sel_mask
, indices
, code
, const2
;
1266 struct lp_type type8
, type32
, type16
, type64
;
1267 struct lp_build_context bld8
, bld32
, bld16
, bld64
;
1269 boolean is_dxt1_variant
= format_dxt1_variant(format
);
1271 memset(&type32
, 0, sizeof type32
);
1276 memset(&type8
, 0, sizeof type8
);
1280 memset(&type16
, 0, sizeof type16
);
1284 memset(&type64
, 0, sizeof type64
);
1288 a
= lp_build_const_int_vec(gallivm
, type32
, 0xff000000);
1289 const2
= lp_build_const_int_vec(gallivm
, type32
, 2);
1291 lp_build_context_init(&bld32
, gallivm
, type32
);
1292 lp_build_context_init(&bld16
, gallivm
, type16
);
1293 lp_build_context_init(&bld8
, gallivm
, type8
);
1294 lp_build_context_init(&bld64
, gallivm
, type64
);
1296 if (is_dxt1_variant
) {
1297 color01
= lp_build_shuffle1undef(gallivm
, dxt_block
, 0, 4);
1298 code
= lp_build_shuffle1undef(gallivm
, dxt_block
, 1, 4);
1300 color01
= lp_build_shuffle1undef(gallivm
, dxt_block
, 2, 4);
1301 code
= lp_build_shuffle1undef(gallivm
, dxt_block
, 3, 4);
1303 code
= LLVMBuildBitCast(builder
, code
, bld8
.vec_type
, "");
1304 /* expand bytes to dwords */
1305 code
= lp_build_interleave2(gallivm
, type8
, code
, code
, 0);
1306 code
= lp_build_interleave2(gallivm
, type8
, code
, code
, 0);
1311 * - expand color0/color1 to rgba8888
1312 * - calculate color2/3 (interpolation) according to color0 < color1 rules
1313 * - calculate color2/3 according to color0 >= color1 rules
1314 * - do selection of color2/3 according to comparison of color0/1
1315 * - extract indices.
1316 * - use compare/select to select the correct color. Since we have 2bit
1317 * indices (and 4 colors), needs at least three compare/selects.
1321 * expand the two colors
1323 color01
= LLVMBuildBitCast(builder
, color01
, bld16
.vec_type
, "");
1324 color01
= lp_build_interleave2(gallivm
, type16
, color01
,
1326 color01_16
= LLVMBuildBitCast(builder
, color01
, bld32
.vec_type
, "");
1327 color01
= color_expand_565_to_8888(gallivm
, 4, color01_16
);
1330 * interpolate colors
1331 * color2_1 is 2/3 color0 + 1/3 color1
1332 * color3_1 is 1/3 color0 + 2/3 color1
1333 * color2_2 is 1/2 color0 + 1/2 color1
1337 /* TODO: since this is now always scalar, should
1338 * probably just use control flow here instead of calculating
1339 * both cases and then selection
1341 if (format
== PIPE_FORMAT_DXT1_RGBA
||
1342 format
== PIPE_FORMAT_DXT1_SRGBA
) {
1343 color01
= LLVMBuildOr(builder
, color01
, a
, "");
1345 /* can combine 2 lerps into one mostly */
1346 color23
= lp_build_lerp23_single(&bld8
, color01
);
1347 color23
= LLVMBuildBitCast(builder
, color23
, bld32
.vec_type
, "");
1349 /* dxt3/5 always use 4-color encoding */
1350 if (is_dxt1_variant
) {
1351 LLVMValueRef color23_2
, color2_2
;
1353 if (util_cpu_caps
.has_sse2
) {
1354 LLVMValueRef intrargs
[2];
1355 intrargs
[0] = LLVMBuildBitCast(builder
, color01
, bld8
.vec_type
, "");
1356 /* same interleave as for lerp23 - correct result in 2nd element */
1357 intrargs
[1] = lp_build_interleave2(gallivm
, type32
, color01
, color01
, 0);
1358 intrargs
[1] = LLVMBuildBitCast(builder
, intrargs
[1], bld8
.vec_type
, "");
1359 color2_2
= lp_build_pavgb(&bld8
, intrargs
[0], intrargs
[1]);
1362 LLVMValueRef v01
, v0
, v1
, vhalf
;
1364 * This isn't as expensive as it looks (the unpack is the same as
1365 * for lerp23, which is the reason why we do the pointless
1366 * interleave2 too), with correct rounding (the two lower elements
1367 * will be the same).
1369 v01
= lp_build_interleave2(gallivm
, type32
, color01
, color01
, 0);
1370 v01
= LLVMBuildBitCast(builder
, v01
, bld8
.vec_type
, "");
1371 lp_build_unpack2(gallivm
, type8
, type16
, v01
, &v0
, &v1
);
1372 vhalf
= lp_build_add(&bld16
, v0
, v1
);
1373 vhalf
= LLVMBuildLShr(builder
, vhalf
, bld16
.one
, "");
1374 color2_2
= lp_build_pack2(gallivm
, type16
, type8
, vhalf
, bld16
.undef
);
1376 /* shuffle in color 3 as elem 2 zero, color 2 elem 1 */
1377 color23_2
= LLVMBuildBitCast(builder
, color2_2
, bld64
.vec_type
, "");
1378 color23_2
= LLVMBuildLShr(builder
, color23_2
,
1379 lp_build_const_int_vec(gallivm
, type64
, 32), "");
1380 color23_2
= LLVMBuildBitCast(builder
, color23_2
, bld32
.vec_type
, "");
1382 tmp
= LLVMBuildBitCast(builder
, color01_16
, bld64
.vec_type
, "");
1383 tmp
= LLVMBuildLShr(builder
, tmp
,
1384 lp_build_const_int_vec(gallivm
, type64
, 32), "");
1385 tmp
= LLVMBuildBitCast(builder
, tmp
, bld32
.vec_type
, "");
1386 sel_mask
= lp_build_compare(gallivm
, type32
, PIPE_FUNC_GREATER
,
1388 sel_mask
= lp_build_interleave2(gallivm
, type32
, sel_mask
, sel_mask
, 0);
1389 color23
= lp_build_select(&bld32
, sel_mask
, color23
, color23_2
);
1392 if (util_cpu_caps
.has_ssse3
) {
1394 * Use pshufb as mini-lut. (Only doable with intrinsics as the
1395 * final shuffles are non-constant. pshufb is awesome!)
1397 LLVMValueRef shuf
[16], low2mask
;
1398 LLVMValueRef intrargs
[2], lut_ind
, lut_adj
;
1400 color01
= LLVMBuildBitCast(builder
, color01
, bld64
.vec_type
, "");
1401 color23
= LLVMBuildBitCast(builder
, color23
, bld64
.vec_type
, "");
1402 color0123
= lp_build_interleave2(gallivm
, type64
, color01
, color23
, 0);
1403 color0123
= LLVMBuildBitCast(builder
, color0123
, bld32
.vec_type
, "");
1405 if (format
== PIPE_FORMAT_DXT1_RGB
||
1406 format
== PIPE_FORMAT_DXT1_SRGB
) {
1407 color0123
= LLVMBuildOr(builder
, color0123
, a
, "");
1410 /* shuffle as r0r1r2r3g0g1... */
1411 for (i
= 0; i
< 4; i
++) {
1412 shuf
[4*i
] = lp_build_const_int32(gallivm
, 0 + i
);
1413 shuf
[4*i
+1] = lp_build_const_int32(gallivm
, 4 + i
);
1414 shuf
[4*i
+2] = lp_build_const_int32(gallivm
, 8 + i
);
1415 shuf
[4*i
+3] = lp_build_const_int32(gallivm
, 12 + i
);
1417 color0123
= LLVMBuildBitCast(builder
, color0123
, bld8
.vec_type
, "");
1418 color0123
= LLVMBuildShuffleVector(builder
, color0123
, bld8
.undef
,
1419 LLVMConstVector(shuf
, 16), "");
1421 /* lowest 2 bits of each 8 bit value contain index into "LUT" */
1422 low2mask
= lp_build_const_int_vec(gallivm
, type8
, 3);
1423 /* add 0/4/8/12 for r/g/b/a */
1424 lut_adj
= lp_build_const_int_vec(gallivm
, type32
, 0x0c080400);
1425 lut_adj
= LLVMBuildBitCast(builder
, lut_adj
, bld8
.vec_type
, "");
1426 intrargs
[0] = color0123
;
1427 for (i
= 0; i
< 4; i
++) {
1428 lut_ind
= LLVMBuildAnd(builder
, code
, low2mask
, "");
1429 lut_ind
= LLVMBuildOr(builder
, lut_ind
, lut_adj
, "");
1430 intrargs
[1] = lut_ind
;
1431 col
[i
] = lp_build_intrinsic(builder
, "llvm.x86.ssse3.pshuf.b.128",
1432 bld8
.vec_type
, intrargs
, 2, 0);
1433 col
[i
] = LLVMBuildBitCast(builder
, col
[i
], bld32
.vec_type
, "");
1434 code
= LLVMBuildBitCast(builder
, code
, bld32
.vec_type
, "");
1435 code
= LLVMBuildLShr(builder
, code
, const2
, "");
1436 code
= LLVMBuildBitCast(builder
, code
, bld8
.vec_type
, "");
1440 /* Thanks to vectorization can do 4 texels in parallel */
1441 LLVMValueRef color0
, color1
, color2
, color3
;
1442 if (format
== PIPE_FORMAT_DXT1_RGB
||
1443 format
== PIPE_FORMAT_DXT1_SRGB
) {
1444 color01
= LLVMBuildOr(builder
, color01
, a
, "");
1445 color23
= LLVMBuildOr(builder
, color23
, a
, "");
1447 color0
= LLVMBuildShuffleVector(builder
, color01
, bld32
.undef
,
1448 lp_build_const_shuffle1(gallivm
, 0, 4), "");
1449 color1
= LLVMBuildShuffleVector(builder
, color01
, bld32
.undef
,
1450 lp_build_const_shuffle1(gallivm
, 1, 4), "");
1451 color2
= LLVMBuildShuffleVector(builder
, color23
, bld32
.undef
,
1452 lp_build_const_shuffle1(gallivm
, 0, 4), "");
1453 color3
= LLVMBuildShuffleVector(builder
, color23
, bld32
.undef
,
1454 lp_build_const_shuffle1(gallivm
, 1, 4), "");
1455 code
= LLVMBuildBitCast(builder
, code
, bld32
.vec_type
, "");
1457 for (i
= 0; i
< 4; i
++) {
1458 /* select the colors */
1459 LLVMValueRef selmasklo
, rgba01
, rgba23
, bitlo
;
1461 indices
= LLVMBuildAnd(builder
, code
, bitlo
, "");
1462 selmasklo
= lp_build_compare(gallivm
, type32
, PIPE_FUNC_EQUAL
,
1464 rgba01
= lp_build_select(&bld32
, selmasklo
, color1
, color0
);
1466 LLVMValueRef selmaskhi
;
1467 indices
= LLVMBuildAnd(builder
, code
, const2
, "");
1468 selmaskhi
= lp_build_compare(gallivm
, type32
, PIPE_FUNC_EQUAL
,
1470 rgba23
= lp_build_select(&bld32
, selmasklo
, color3
, color2
);
1471 rgba
= lp_build_select(&bld32
, selmaskhi
, rgba23
, rgba01
);
1474 * Note that this will give "wrong" order.
1475 * col0 will be rgba0, rgba4, rgba8, rgba12, col1 rgba1, rgba5, ...
1476 * This would be easily fixable by using different shuffle, bitlo/hi
1477 * vectors above (and different shift), but seems slightly easier to
1478 * deal with for dxt3/dxt5 alpha too. So instead change lookup.
1481 code
= LLVMBuildLShr(builder
, code
, const2
, "");
1487 * decode one dxt3 block.
1490 s3tc_decode_block_dxt3(struct gallivm_state
*gallivm
,
1491 enum pipe_format format
,
1492 LLVMValueRef dxt_block
,
1495 LLVMBuilderRef builder
= gallivm
->builder
;
1496 LLVMValueRef alpha
, alphas0
, alphas1
, shift4_16
, a
[4], mask8hi
;
1497 struct lp_type type32
, type8
, type16
;
1500 memset(&type32
, 0, sizeof type32
);
1504 memset(&type8
, 0, sizeof type8
);
1508 memset(&type16
, 0, sizeof type16
);
1512 s3tc_decode_block_dxt1(gallivm
, format
, dxt_block
, col
);
1514 shift4_16
= lp_build_const_int_vec(gallivm
, type16
, 4);
1515 mask8hi
= lp_build_const_int_vec(gallivm
, type32
, 0xff000000);
1517 alpha
= LLVMBuildBitCast(builder
, dxt_block
,
1518 lp_build_vec_type(gallivm
, type8
), "");
1519 alpha
= lp_build_interleave2(gallivm
, type8
, alpha
, alpha
, 0);
1520 alpha
= LLVMBuildBitCast(builder
, alpha
,
1521 lp_build_vec_type(gallivm
, type16
), "");
1522 alpha
= LLVMBuildAnd(builder
, alpha
,
1523 lp_build_const_int_vec(gallivm
, type16
, 0xf00f), "");
1524 alphas0
= LLVMBuildLShr(builder
, alpha
, shift4_16
, "");
1525 alphas1
= LLVMBuildShl(builder
, alpha
, shift4_16
, "");
1526 alpha
= LLVMBuildOr(builder
, alphas0
, alpha
, "");
1527 alpha
= LLVMBuildOr(builder
, alphas1
, alpha
, "");
1528 alpha
= LLVMBuildBitCast(builder
, alpha
,
1529 lp_build_vec_type(gallivm
, type32
), "");
1531 * alpha now contains elems 0,1,2,3,... (ubytes)
1532 * we need 0,4,8,12, 1,5,9,13 etc. in dwords to match color (which
1533 * is just as easy as "natural" order - 3 shift/and instead of 6 unpack).
1535 a
[0] = LLVMBuildShl(builder
, alpha
,
1536 lp_build_const_int_vec(gallivm
, type32
, 24), "");
1537 a
[1] = LLVMBuildShl(builder
, alpha
,
1538 lp_build_const_int_vec(gallivm
, type32
, 16), "");
1539 a
[1] = LLVMBuildAnd(builder
, a
[1], mask8hi
, "");
1540 a
[2] = LLVMBuildShl(builder
, alpha
,
1541 lp_build_const_int_vec(gallivm
, type32
, 8), "");
1542 a
[2] = LLVMBuildAnd(builder
, a
[2], mask8hi
, "");
1543 a
[3] = LLVMBuildAnd(builder
, alpha
, mask8hi
, "");
1545 for (i
= 0; i
< 4; i
++) {
1546 col
[i
] = LLVMBuildOr(builder
, col
[i
], a
[i
], "");
1552 lp_build_lerpdxta_block(struct gallivm_state
*gallivm
,
1553 LLVMValueRef alpha0
,
1554 LLVMValueRef alpha1
,
1556 LLVMValueRef sel_mask
)
1558 LLVMBuilderRef builder
= gallivm
->builder
;
1559 LLVMValueRef delta
, ainterp
;
1560 LLVMValueRef weight5
, weight7
, weight
;
1561 struct lp_type type16
;
1562 struct lp_build_context bld
;
1564 memset(&type16
, 0, sizeof type16
);
1569 lp_build_context_init(&bld
, gallivm
, type16
);
1571 * 256/7 is only 36.57 so we'd lose quite some precision. Since it would
1572 * actually be desirable to do this here with even higher accuracy than
1573 * even 8 bit (more or less required for rgtc, albeit that's not handled
1574 * here right now), shift the weights after multiplication by code.
1576 weight5
= lp_build_const_int_vec(gallivm
, type16
, 256*64/5);
1577 weight7
= lp_build_const_int_vec(gallivm
, type16
, 256*64/7);
1578 weight
= lp_build_select(&bld
, sel_mask
, weight7
, weight5
);
1581 * we'll get garbage in the elements which had code 0 (or larger than
1582 * 5 or 7) but we don't care (or rather, need to fix up anyway).
1584 code
= LLVMBuildSub(builder
, code
, bld
.one
, "");
1586 weight
= LLVMBuildMul(builder
, weight
, code
, "");
1587 weight
= LLVMBuildLShr(builder
, weight
,
1588 lp_build_const_int_vec(gallivm
, type16
, 6), "");
1590 delta
= LLVMBuildSub(builder
, alpha1
, alpha0
, "");
1592 ainterp
= LLVMBuildMul(builder
, delta
, weight
, "");
1593 ainterp
= LLVMBuildLShr(builder
, ainterp
,
1594 lp_build_const_int_vec(gallivm
, type16
, 8), "");
1596 /* lerp is done later (with packed values) */
1603 * decode one dxt5 block.
1606 s3tc_decode_block_dxt5(struct gallivm_state
*gallivm
,
1607 enum pipe_format format
,
1608 LLVMValueRef dxt_block
,
1611 LLVMBuilderRef builder
= gallivm
->builder
;
1612 LLVMValueRef alpha
, alpha0
, alpha1
, ares
;
1613 LLVMValueRef ainterp
, ainterp0
, ainterp1
, shuffle1
, sel_mask
, sel_mask2
;
1614 LLVMValueRef a
[4], acode
, tmp0
, tmp1
;
1615 LLVMTypeRef i64t
, i32t
;
1616 struct lp_type type32
, type64
, type8
, type16
;
1617 struct lp_build_context bld16
, bld8
;
1620 memset(&type32
, 0, sizeof type32
);
1624 memset(&type64
, 0, sizeof type64
);
1628 memset(&type8
, 0, sizeof type8
);
1632 memset(&type16
, 0, sizeof type16
);
1636 lp_build_context_init(&bld16
, gallivm
, type16
);
1637 lp_build_context_init(&bld8
, gallivm
, type8
);
1639 i64t
= lp_build_vec_type(gallivm
, type64
);
1640 i32t
= lp_build_vec_type(gallivm
, type32
);
1642 s3tc_decode_block_dxt1(gallivm
, format
, dxt_block
, col
);
1645 * three possible strategies for vectorizing alpha:
1646 * 1) compute all 8 values then use scalar extraction
1647 * (i.e. have all 8 alpha values packed in one 64bit scalar
1648 * and do something like ax = vals >> (codex * 8) followed
1649 * by inserting these values back into color)
1650 * 2) same as 8 but just use pshufb as a mini-LUT for selection.
1651 * (without pshufb would need boatloads of cmp/selects trying to
1652 * keep things vectorized for essentially scalar selection).
1653 * 3) do something similar to the uncached case
1654 * needs more calculations (need to calc 16 values instead of 8 though
1655 * that's only an issue for the lerp which we need to do twice otherwise
1656 * everything still fits into 128bit) but keeps things vectorized mostly.
1657 * Trying 3) here though not sure it's really faster...
1658 * With pshufb, we try 2) (cheaper and more accurate)
1662 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1663 * help since code crosses 8bit boundaries). But variable shifts are
1664 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1665 * shifts!). Instead, emulate by 16bit muls.
1666 * Also, the required byte shuffles are essentially non-emulatable, so
1667 * require ssse3 (albeit other archs might do them fine).
1668 * This is not directly tied to ssse3 - just need sane byte shuffles.
1669 * But ordering is going to be different below so use same condition.
1673 /* vectorize alpha */
1674 alpha
= LLVMBuildBitCast(builder
, dxt_block
, i64t
, "");
1675 alpha0
= LLVMBuildAnd(builder
, alpha
,
1676 lp_build_const_int_vec(gallivm
, type64
, 0xff), "");
1677 alpha0
= LLVMBuildBitCast(builder
, alpha0
, bld16
.vec_type
, "");
1678 alpha
= LLVMBuildBitCast(builder
, alpha
, bld16
.vec_type
, "");
1679 alpha1
= LLVMBuildLShr(builder
, alpha
,
1680 lp_build_const_int_vec(gallivm
, type16
, 8), "");
1681 alpha
= LLVMBuildBitCast(builder
, alpha
, i64t
, "");
1682 shuffle1
= lp_build_const_shuffle1(gallivm
, 0, 8);
1683 alpha0
= LLVMBuildShuffleVector(builder
, alpha0
, alpha0
, shuffle1
, "");
1684 alpha1
= LLVMBuildShuffleVector(builder
, alpha1
, alpha1
, shuffle1
, "");
1687 sel_mask
= lp_build_compare(gallivm
, type16
, PIPE_FUNC_GREATER
,
1689 type16
.sign
= FALSE
;
1690 sel_mask
= LLVMBuildBitCast(builder
, sel_mask
, bld8
.vec_type
, "");
1692 if (!util_cpu_caps
.has_ssse3
) {
1693 LLVMValueRef acodeg
, mask1
, acode0
, acode1
;
1695 /* extraction of the 3 bit values into something more useful is HARD */
1696 /* first steps are actually scalar */
1697 acode
= LLVMBuildLShr(builder
, alpha
,
1698 lp_build_const_int_vec(gallivm
, type64
, 16), "");
1699 tmp0
= LLVMBuildAnd(builder
, acode
,
1700 lp_build_const_int_vec(gallivm
, type64
, 0xffffff), "");
1701 tmp1
= LLVMBuildLShr(builder
, acode
,
1702 lp_build_const_int_vec(gallivm
, type64
, 24), "");
1703 tmp0
= LLVMBuildBitCast(builder
, tmp0
, i32t
, "");
1704 tmp1
= LLVMBuildBitCast(builder
, tmp1
, i32t
, "");
1705 acode
= lp_build_interleave2(gallivm
, type32
, tmp0
, tmp1
, 0);
1706 /* now have 2x24bit in 4x32bit, order 01234567, 89..., undef, undef */
1707 tmp0
= LLVMBuildAnd(builder
, acode
,
1708 lp_build_const_int_vec(gallivm
, type32
, 0xfff), "");
1709 tmp1
= LLVMBuildLShr(builder
, acode
,
1710 lp_build_const_int_vec(gallivm
, type32
, 12), "");
1711 acode
= lp_build_interleave2(gallivm
, type32
, tmp0
, tmp1
, 0);
1712 /* now have 4x12bit in 4x32bit, order 0123, 4567, ,,, */
1713 tmp0
= LLVMBuildAnd(builder
, acode
,
1714 lp_build_const_int_vec(gallivm
, type32
, 0x3f), "");
1715 tmp1
= LLVMBuildLShr(builder
, acode
,
1716 lp_build_const_int_vec(gallivm
, type32
, 6), "");
1717 /* use signed pack doesn't matter and otherwise need sse41 */
1718 type32
.sign
= type16
.sign
= TRUE
;
1719 acode
= lp_build_pack2(gallivm
, type32
, type16
, tmp0
, tmp1
);
1720 type32
.sign
= type16
.sign
= FALSE
;
1721 /* now have 8x6bit in 8x16bit, 01, 45, 89, ..., 23, 67, ... */
1722 acode0
= LLVMBuildAnd(builder
, acode
,
1723 lp_build_const_int_vec(gallivm
, type16
, 0x7), "");
1724 acode1
= LLVMBuildLShr(builder
, acode
,
1725 lp_build_const_int_vec(gallivm
, type16
, 3), "");
1726 acode
= lp_build_pack2(gallivm
, type16
, type8
, acode0
, acode1
);
1727 /* acode0 contains elems 0,4,8,12,2,6,10,14, acode1 1,5,9,... */
1729 acodeg
= LLVMBuildAnd(builder
, acode
,
1730 LLVMBuildNot(builder
, sel_mask
, ""), "");
1731 mask1
= lp_build_compare(gallivm
, type8
, PIPE_FUNC_EQUAL
,
1734 sel_mask
= LLVMBuildBitCast(builder
, sel_mask
, bld16
.vec_type
, "");
1735 ainterp0
= lp_build_lerpdxta_block(gallivm
, alpha0
, alpha1
, acode0
, sel_mask
);
1736 ainterp1
= lp_build_lerpdxta_block(gallivm
, alpha0
, alpha1
, acode1
, sel_mask
);
1737 sel_mask
= LLVMBuildBitCast(builder
, sel_mask
, bld8
.vec_type
, "");
1738 ainterp
= lp_build_pack2(gallivm
, type16
, type8
, ainterp0
, ainterp1
);
1739 alpha0
= lp_build_pack2(gallivm
, type16
, type8
, alpha0
, alpha0
);
1740 alpha1
= lp_build_pack2(gallivm
, type16
, type8
, alpha1
, alpha1
);
1741 ainterp
= LLVMBuildAdd(builder
, ainterp
, alpha0
, "");
1743 sel_mask2
= lp_build_compare(gallivm
, type8
, PIPE_FUNC_EQUAL
,
1745 ainterp
= lp_build_select(&bld8
, sel_mask2
, alpha0
, ainterp
);
1746 ainterp
= lp_build_select(&bld8
, mask1
, alpha1
, ainterp
);
1748 /* fix up val67 if a0 <= a1 */
1749 sel_mask2
= lp_build_compare(gallivm
, type8
, PIPE_FUNC_EQUAL
,
1750 acodeg
, lp_build_const_int_vec(gallivm
, type8
, 6));
1751 ares
= LLVMBuildAnd(builder
, ainterp
, LLVMBuildNot(builder
, sel_mask2
, ""), "");
1752 sel_mask2
= lp_build_compare(gallivm
, type8
, PIPE_FUNC_EQUAL
,
1753 acodeg
, lp_build_const_int_vec(gallivm
, type8
, 7));
1754 ares
= LLVMBuildOr(builder
, ares
, sel_mask2
, "");
1756 /* unpack in right order (0,4,8,12,1,5,..) */
1757 /* this gives us zero, a0, zero, a4, zero, a8, ... for tmp0 */
1758 tmp0
= lp_build_interleave2(gallivm
, type8
, bld8
.zero
, ares
, 0);
1759 tmp1
= lp_build_interleave2(gallivm
, type8
, bld8
.zero
, ares
, 1);
1760 tmp0
= LLVMBuildBitCast(builder
, tmp0
, bld16
.vec_type
, "");
1761 tmp1
= LLVMBuildBitCast(builder
, tmp1
, bld16
.vec_type
, "");
1763 a
[0] = lp_build_interleave2(gallivm
, type16
, bld16
.zero
, tmp0
, 0);
1764 a
[1] = lp_build_interleave2(gallivm
, type16
, bld16
.zero
, tmp1
, 0);
1765 a
[2] = lp_build_interleave2(gallivm
, type16
, bld16
.zero
, tmp0
, 1);
1766 a
[3] = lp_build_interleave2(gallivm
, type16
, bld16
.zero
, tmp1
, 1);
1769 LLVMValueRef elems
[16], intrargs
[2], shufa
, mulclo
, mulchi
, mask8hi
;
1770 LLVMTypeRef type16s
= LLVMInt16TypeInContext(gallivm
->context
);
1771 LLVMTypeRef type8s
= LLVMInt8TypeInContext(gallivm
->context
);
1774 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1775 * help since code crosses 8bit boundaries). But variable shifts are
1776 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1777 * shifts!). Instead, emulate by 16bit muls.
1778 * Also, the required byte shuffles are essentially non-emulatable, so
1779 * require ssse3 (albeit other archs might do them fine, but the
1780 * complete path is ssse3 only for now).
1782 for (i
= 0, j
= 0; i
< 16; i
+= 8, j
+= 3) {
1783 elems
[i
+0] = elems
[i
+1] = elems
[i
+2] = lp_build_const_int32(gallivm
, j
+2);
1784 elems
[i
+3] = elems
[i
+4] = lp_build_const_int32(gallivm
, j
+3);
1785 elems
[i
+5] = elems
[i
+6] = elems
[i
+7] = lp_build_const_int32(gallivm
, j
+4);
1787 shufa
= LLVMConstVector(elems
, 16);
1788 alpha
= LLVMBuildBitCast(builder
, alpha
, bld8
.vec_type
, "");
1789 acode
= LLVMBuildShuffleVector(builder
, alpha
, bld8
.undef
, shufa
, "");
1790 acode
= LLVMBuildBitCast(builder
, acode
, bld16
.vec_type
, "");
1792 * Put 0/2/4/6 into high 3 bits of 16 bits (save AND mask)
1793 * Do the same for 1/3/5/7 (albeit still need mask there - ideally
1794 * we'd place them into bits 4-7 so could save shift but impossible.)
1796 for (i
= 0; i
< 8; i
+= 4) {
1797 elems
[i
+0] = LLVMConstInt(type16s
, 1 << (13-0), 0);
1798 elems
[i
+1] = LLVMConstInt(type16s
, 1 << (13-6), 0);
1799 elems
[i
+2] = LLVMConstInt(type16s
, 1 << (13-4), 0);
1800 elems
[i
+3] = LLVMConstInt(type16s
, 1 << (13-2), 0);
1802 mulclo
= LLVMConstVector(elems
, 8);
1803 for (i
= 0; i
< 8; i
+= 4) {
1804 elems
[i
+0] = LLVMConstInt(type16s
, 1 << (13-3), 0);
1805 elems
[i
+1] = LLVMConstInt(type16s
, 1 << (13-9), 0);
1806 elems
[i
+2] = LLVMConstInt(type16s
, 1 << (13-7), 0);
1807 elems
[i
+3] = LLVMConstInt(type16s
, 1 << (13-5), 0);
1809 mulchi
= LLVMConstVector(elems
, 8);
1811 tmp0
= LLVMBuildMul(builder
, acode
, mulclo
, "");
1812 tmp1
= LLVMBuildMul(builder
, acode
, mulchi
, "");
1813 tmp0
= LLVMBuildLShr(builder
, tmp0
,
1814 lp_build_const_int_vec(gallivm
, type16
, 13), "");
1815 tmp1
= LLVMBuildLShr(builder
, tmp1
,
1816 lp_build_const_int_vec(gallivm
, type16
, 5), "");
1817 tmp1
= LLVMBuildAnd(builder
, tmp1
,
1818 lp_build_const_int_vec(gallivm
, type16
, 0x700), "");
1819 acode
= LLVMBuildOr(builder
, tmp0
, tmp1
, "");
1820 acode
= LLVMBuildBitCast(builder
, acode
, bld8
.vec_type
, "");
1823 * Note that ordering is different here to non-ssse3 path:
1827 LLVMValueRef weight0
, weight1
, weight
, delta
;
1828 LLVMValueRef constff_elem7
, const0_elem6
;
1829 /* weights, correctly rounded (round(256*x/7)) */
1830 elems
[0] = LLVMConstInt(type16s
, 256, 0);
1831 elems
[1] = LLVMConstInt(type16s
, 0, 0);
1832 elems
[2] = LLVMConstInt(type16s
, 219, 0);
1833 elems
[3] = LLVMConstInt(type16s
, 183, 0);
1834 elems
[4] = LLVMConstInt(type16s
, 146, 0);
1835 elems
[5] = LLVMConstInt(type16s
, 110, 0);
1836 elems
[6] = LLVMConstInt(type16s
, 73, 0);
1837 elems
[7] = LLVMConstInt(type16s
, 37, 0);
1838 weight0
= LLVMConstVector(elems
, 8);
1840 elems
[0] = LLVMConstInt(type16s
, 256, 0);
1841 elems
[1] = LLVMConstInt(type16s
, 0, 0);
1842 elems
[2] = LLVMConstInt(type16s
, 205, 0);
1843 elems
[3] = LLVMConstInt(type16s
, 154, 0);
1844 elems
[4] = LLVMConstInt(type16s
, 102, 0);
1845 elems
[5] = LLVMConstInt(type16s
, 51, 0);
1846 elems
[6] = LLVMConstInt(type16s
, 0, 0);
1847 elems
[7] = LLVMConstInt(type16s
, 0, 0);
1848 weight1
= LLVMConstVector(elems
, 8);
1850 weight0
= LLVMBuildBitCast(builder
, weight0
, bld8
.vec_type
, "");
1851 weight1
= LLVMBuildBitCast(builder
, weight1
, bld8
.vec_type
, "");
1852 weight
= lp_build_select(&bld8
, sel_mask
, weight0
, weight1
);
1853 weight
= LLVMBuildBitCast(builder
, weight
, bld16
.vec_type
, "");
1855 for (i
= 0; i
< 16; i
++) {
1856 elems
[i
] = LLVMConstNull(type8s
);
1858 elems
[7] = LLVMConstInt(type8s
, 255, 0);
1859 constff_elem7
= LLVMConstVector(elems
, 16);
1861 for (i
= 0; i
< 16; i
++) {
1862 elems
[i
] = LLVMConstInt(type8s
, 255, 0);
1864 elems
[6] = LLVMConstInt(type8s
, 0, 0);
1865 const0_elem6
= LLVMConstVector(elems
, 16);
1867 /* standard simple lerp - but the version we need isn't available */
1868 delta
= LLVMBuildSub(builder
, alpha0
, alpha1
, "");
1869 ainterp
= LLVMBuildMul(builder
, delta
, weight
, "");
1870 ainterp
= LLVMBuildLShr(builder
, ainterp
,
1871 lp_build_const_int_vec(gallivm
, type16
, 8), "");
1872 ainterp
= LLVMBuildBitCast(builder
, ainterp
, bld8
.vec_type
, "");
1873 alpha1
= LLVMBuildBitCast(builder
, alpha1
, bld8
.vec_type
, "");
1874 ainterp
= LLVMBuildAdd(builder
, ainterp
, alpha1
, "");
1875 ainterp
= LLVMBuildBitCast(builder
, ainterp
, bld16
.vec_type
, "");
1876 ainterp
= lp_build_pack2(gallivm
, type16
, type8
, ainterp
, bld16
.undef
);
1878 /* fixing 0/0xff case is slightly more complex */
1879 constff_elem7
= LLVMBuildAnd(builder
, constff_elem7
,
1880 LLVMBuildNot(builder
, sel_mask
, ""), "");
1881 const0_elem6
= LLVMBuildOr(builder
, const0_elem6
, sel_mask
, "");
1882 ainterp
= LLVMBuildOr(builder
, ainterp
, constff_elem7
, "");
1883 ainterp
= LLVMBuildAnd(builder
, ainterp
, const0_elem6
, "");
1885 /* now pick all 16 elements at once! */
1886 intrargs
[0] = ainterp
;
1887 intrargs
[1] = acode
;
1888 ares
= lp_build_intrinsic(builder
, "llvm.x86.ssse3.pshuf.b.128",
1889 bld8
.vec_type
, intrargs
, 2, 0);
1891 ares
= LLVMBuildBitCast(builder
, ares
, i32t
, "");
1892 mask8hi
= lp_build_const_int_vec(gallivm
, type32
, 0xff000000);
1893 a
[0] = LLVMBuildShl(builder
, ares
,
1894 lp_build_const_int_vec(gallivm
, type32
, 24), "");
1895 a
[1] = LLVMBuildShl(builder
, ares
,
1896 lp_build_const_int_vec(gallivm
, type32
, 16), "");
1897 a
[1] = LLVMBuildAnd(builder
, a
[1], mask8hi
, "");
1898 a
[2] = LLVMBuildShl(builder
, ares
,
1899 lp_build_const_int_vec(gallivm
, type32
, 8), "");
1900 a
[2] = LLVMBuildAnd(builder
, a
[2], mask8hi
, "");
1901 a
[3] = LLVMBuildAnd(builder
, ares
, mask8hi
, "");
1904 for (i
= 0; i
< 4; i
++) {
1905 a
[i
] = LLVMBuildBitCast(builder
, a
[i
], i32t
, "");
1906 col
[i
] = LLVMBuildOr(builder
, col
[i
], a
[i
], "");
1912 generate_update_cache_one_block(struct gallivm_state
*gallivm
,
1913 LLVMValueRef function
,
1914 const struct util_format_description
*format_desc
)
1916 LLVMBasicBlockRef block
;
1917 LLVMBuilderRef old_builder
;
1918 LLVMValueRef ptr_addr
;
1919 LLVMValueRef hash_index
;
1921 LLVMValueRef dxt_block
, tag_value
;
1922 LLVMValueRef col
[LP_MAX_VECTOR_LENGTH
];
1924 ptr_addr
= LLVMGetParam(function
, 0);
1925 hash_index
= LLVMGetParam(function
, 1);
1926 cache
= LLVMGetParam(function
, 2);
1928 lp_build_name(ptr_addr
, "ptr_addr" );
1929 lp_build_name(hash_index
, "hash_index");
1930 lp_build_name(cache
, "cache_addr");
1936 old_builder
= gallivm
->builder
;
1937 block
= LLVMAppendBasicBlockInContext(gallivm
->context
, function
, "entry");
1938 gallivm
->builder
= LLVMCreateBuilderInContext(gallivm
->context
);
1939 LLVMPositionBuilderAtEnd(gallivm
->builder
, block
);
1941 lp_build_gather_s3tc_simple_scalar(gallivm
, format_desc
, &dxt_block
,
1944 switch (format_desc
->format
) {
1945 case PIPE_FORMAT_DXT1_RGB
:
1946 case PIPE_FORMAT_DXT1_RGBA
:
1947 case PIPE_FORMAT_DXT1_SRGB
:
1948 case PIPE_FORMAT_DXT1_SRGBA
:
1949 s3tc_decode_block_dxt1(gallivm
, format_desc
->format
, dxt_block
, col
);
1951 case PIPE_FORMAT_DXT3_RGBA
:
1952 case PIPE_FORMAT_DXT3_SRGBA
:
1953 s3tc_decode_block_dxt3(gallivm
, format_desc
->format
, dxt_block
, col
);
1955 case PIPE_FORMAT_DXT5_RGBA
:
1956 case PIPE_FORMAT_DXT5_SRGBA
:
1957 s3tc_decode_block_dxt5(gallivm
, format_desc
->format
, dxt_block
, col
);
1961 s3tc_decode_block_dxt1(gallivm
, format_desc
->format
, dxt_block
, col
);
1965 tag_value
= LLVMBuildPtrToInt(gallivm
->builder
, ptr_addr
,
1966 LLVMInt64TypeInContext(gallivm
->context
), "");
1967 s3tc_store_cached_block(gallivm
, col
, tag_value
, hash_index
, cache
);
1969 LLVMBuildRetVoid(gallivm
->builder
);
1971 LLVMDisposeBuilder(gallivm
->builder
);
1972 gallivm
->builder
= old_builder
;
1974 gallivm_verify_function(gallivm
, function
);
1979 update_cached_block(struct gallivm_state
*gallivm
,
1980 const struct util_format_description
*format_desc
,
1981 LLVMValueRef ptr_addr
,
1982 LLVMValueRef hash_index
,
1986 LLVMBuilderRef builder
= gallivm
->builder
;
1987 LLVMModuleRef module
= gallivm
->module
;
1989 LLVMTypeRef i8t
= LLVMInt8TypeInContext(gallivm
->context
);
1990 LLVMTypeRef pi8t
= LLVMPointerType(i8t
, 0);
1991 LLVMValueRef function
, inst
;
1992 LLVMBasicBlockRef bb
;
1993 LLVMValueRef args
[3];
1995 snprintf(name
, sizeof name
, "%s_update_cache_one_block",
1996 format_desc
->short_name
);
1997 function
= LLVMGetNamedFunction(module
, name
);
2000 LLVMTypeRef ret_type
;
2001 LLVMTypeRef arg_types
[3];
2002 LLVMTypeRef function_type
;
2006 * Generate the function prototype.
2009 ret_type
= LLVMVoidTypeInContext(gallivm
->context
);
2010 arg_types
[0] = pi8t
;
2011 arg_types
[1] = LLVMInt32TypeInContext(gallivm
->context
);
2012 arg_types
[2] = LLVMTypeOf(cache
); // XXX: put right type here
2013 function_type
= LLVMFunctionType(ret_type
, arg_types
, ARRAY_SIZE(arg_types
), 0);
2014 function
= LLVMAddFunction(module
, name
, function_type
);
2016 for (arg
= 0; arg
< ARRAY_SIZE(arg_types
); ++arg
)
2017 if (LLVMGetTypeKind(arg_types
[arg
]) == LLVMPointerTypeKind
)
2018 lp_add_function_attr(function
, arg
+ 1, LP_FUNC_ATTR_NOALIAS
);
2020 LLVMSetFunctionCallConv(function
, LLVMFastCallConv
);
2021 LLVMSetVisibility(function
, LLVMHiddenVisibility
);
2022 generate_update_cache_one_block(gallivm
, function
, format_desc
);
2026 args
[1] = hash_index
;
2029 LLVMBuildCall(builder
, function
, args
, ARRAY_SIZE(args
), "");
2030 bb
= LLVMGetInsertBlock(builder
);
2031 inst
= LLVMGetLastInstruction(bb
);
2032 LLVMSetInstructionCallConv(inst
, LLVMFastCallConv
);
2039 compressed_fetch_cached(struct gallivm_state
*gallivm
,
2040 const struct util_format_description
*format_desc
,
2042 LLVMValueRef base_ptr
,
2043 LLVMValueRef offset
,
2049 LLVMBuilderRef builder
= gallivm
->builder
;
2050 unsigned count
, low_bit
, log2size
;
2051 LLVMValueRef color
, offset_stored
, addr
, ptr_addrtrunc
, tmp
;
2052 LLVMValueRef ij_index
, hash_index
, hash_mask
, block_index
;
2053 LLVMTypeRef i8t
= LLVMInt8TypeInContext(gallivm
->context
);
2054 LLVMTypeRef i32t
= LLVMInt32TypeInContext(gallivm
->context
);
2055 LLVMTypeRef i64t
= LLVMInt64TypeInContext(gallivm
->context
);
2056 struct lp_type type
;
2057 struct lp_build_context bld32
;
2058 memset(&type
, 0, sizeof type
);
2062 lp_build_context_init(&bld32
, gallivm
, type
);
2065 * compute hash - we use direct mapped cache, the hash function could
2066 * be better but it needs to be simple
2068 * compare offset with offset stored at tag (hash)
2069 * if not equal extract block, store block, update tag
2070 * extract color from cache
2074 low_bit
= util_logbase2(format_desc
->block
.bits
/ 8);
2075 log2size
= util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE
);
2076 addr
= LLVMBuildPtrToInt(builder
, base_ptr
, i64t
, "");
2077 ptr_addrtrunc
= LLVMBuildPtrToInt(builder
, base_ptr
, i32t
, "");
2078 ptr_addrtrunc
= lp_build_broadcast_scalar(&bld32
, ptr_addrtrunc
);
2079 /* For the hash function, first mask off the unused lowest bits. Then just
2080 do some xor with address bits - only use lower 32bits */
2081 ptr_addrtrunc
= LLVMBuildAdd(builder
, offset
, ptr_addrtrunc
, "");
2082 ptr_addrtrunc
= LLVMBuildLShr(builder
, ptr_addrtrunc
,
2083 lp_build_const_int_vec(gallivm
, type
, low_bit
), "");
2084 /* This only really makes sense for size 64,128,256 */
2085 hash_index
= ptr_addrtrunc
;
2086 ptr_addrtrunc
= LLVMBuildLShr(builder
, ptr_addrtrunc
,
2087 lp_build_const_int_vec(gallivm
, type
, 2*log2size
), "");
2088 hash_index
= LLVMBuildXor(builder
, ptr_addrtrunc
, hash_index
, "");
2089 tmp
= LLVMBuildLShr(builder
, hash_index
,
2090 lp_build_const_int_vec(gallivm
, type
, log2size
), "");
2091 hash_index
= LLVMBuildXor(builder
, hash_index
, tmp
, "");
2093 hash_mask
= lp_build_const_int_vec(gallivm
, type
, LP_BUILD_FORMAT_CACHE_SIZE
- 1);
2094 hash_index
= LLVMBuildAnd(builder
, hash_index
, hash_mask
, "");
2095 ij_index
= LLVMBuildShl(builder
, i
, lp_build_const_int_vec(gallivm
, type
, 2), "");
2096 ij_index
= LLVMBuildAdd(builder
, ij_index
, j
, "");
2097 block_index
= LLVMBuildShl(builder
, hash_index
,
2098 lp_build_const_int_vec(gallivm
, type
, 4), "");
2099 block_index
= LLVMBuildAdd(builder
, ij_index
, block_index
, "");
2102 color
= bld32
.undef
;
2103 for (count
= 0; count
< n
; count
++) {
2104 LLVMValueRef index
, cond
, colorx
;
2105 LLVMValueRef block_indexx
, hash_indexx
, addrx
, offsetx
, ptr_addrx
;
2106 struct lp_build_if_state if_ctx
;
2108 index
= lp_build_const_int32(gallivm
, count
);
2109 offsetx
= LLVMBuildExtractElement(builder
, offset
, index
, "");
2110 addrx
= LLVMBuildZExt(builder
, offsetx
, i64t
, "");
2111 addrx
= LLVMBuildAdd(builder
, addrx
, addr
, "");
2112 block_indexx
= LLVMBuildExtractElement(builder
, block_index
, index
, "");
2113 hash_indexx
= LLVMBuildLShr(builder
, block_indexx
,
2114 lp_build_const_int32(gallivm
, 4), "");
2115 offset_stored
= s3tc_lookup_tag_data(gallivm
, cache
, hash_indexx
);
2116 cond
= LLVMBuildICmp(builder
, LLVMIntNE
, offset_stored
, addrx
, "");
2118 lp_build_if(&if_ctx
, gallivm
, cond
);
2120 ptr_addrx
= LLVMBuildIntToPtr(builder
, addrx
,
2121 LLVMPointerType(i8t
, 0), "");
2122 update_cached_block(gallivm
, format_desc
, ptr_addrx
, hash_indexx
, cache
);
2123 #if LP_BUILD_FORMAT_CACHE_DEBUG
2124 s3tc_update_cache_access(gallivm
, cache
, 1,
2125 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS
);
2128 lp_build_endif(&if_ctx
);
2130 colorx
= s3tc_lookup_cached_pixel(gallivm
, cache
, block_indexx
);
2132 color
= LLVMBuildInsertElement(builder
, color
, colorx
,
2133 lp_build_const_int32(gallivm
, count
), "");
2138 struct lp_build_if_state if_ctx
;
2140 tmp
= LLVMBuildZExt(builder
, offset
, i64t
, "");
2141 addr
= LLVMBuildAdd(builder
, tmp
, addr
, "");
2142 offset_stored
= s3tc_lookup_tag_data(gallivm
, cache
, hash_index
);
2143 cond
= LLVMBuildICmp(builder
, LLVMIntNE
, offset_stored
, addr
, "");
2145 lp_build_if(&if_ctx
, gallivm
, cond
);
2147 tmp
= LLVMBuildIntToPtr(builder
, addr
, LLVMPointerType(i8t
, 0), "");
2148 update_cached_block(gallivm
, format_desc
, tmp
, hash_index
, cache
);
2149 #if LP_BUILD_FORMAT_CACHE_DEBUG
2150 s3tc_update_cache_access(gallivm
, cache
, 1,
2151 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS
);
2154 lp_build_endif(&if_ctx
);
2156 color
= s3tc_lookup_cached_pixel(gallivm
, cache
, block_index
);
2158 #if LP_BUILD_FORMAT_CACHE_DEBUG
2159 s3tc_update_cache_access(gallivm
, cache
, n
,
2160 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL
);
2162 return LLVMBuildBitCast(builder
, color
, LLVMVectorType(i8t
, n
* 4), "");
2167 s3tc_dxt5_to_rgba_aos(struct gallivm_state
*gallivm
,
2169 enum pipe_format format
,
2170 LLVMValueRef colors
,
2171 LLVMValueRef codewords
,
2172 LLVMValueRef alpha_lo
,
2173 LLVMValueRef alpha_hi
,
2177 return s3tc_dxt5_full_to_rgba_aos(gallivm
, n
, format
, colors
,
2178 codewords
, alpha_lo
, alpha_hi
, i
, j
);
2183 * @param n number of pixels processed (usually n=4, but it should also work with n=1
2184 * and multiples of 4)
2185 * @param base_ptr base pointer (32bit or 64bit pointer depending on the architecture)
2186 * @param offset <n x i32> vector with the relative offsets of the S3TC blocks
2187 * @param i is a <n x i32> vector with the x subpixel coordinate (0..3)
2188 * @param j is a <n x i32> vector with the y subpixel coordinate (0..3)
2189 * @return a <4*n x i8> vector with the pixel RGBA values in AoS
2192 lp_build_fetch_s3tc_rgba_aos(struct gallivm_state
*gallivm
,
2193 const struct util_format_description
*format_desc
,
2195 LLVMValueRef base_ptr
,
2196 LLVMValueRef offset
,
2202 LLVMTypeRef i8t
= LLVMInt8TypeInContext(gallivm
->context
);
2203 LLVMBuilderRef builder
= gallivm
->builder
;
2205 assert(format_desc
->layout
== UTIL_FORMAT_LAYOUT_S3TC
);
2206 assert(format_desc
->block
.width
== 4);
2207 assert(format_desc
->block
.height
== 4);
2209 assert((n
== 1) || (n
% 4 == 0));
2211 /* debug_printf("format = %d\n", format_desc->format);*/
2213 rgba
= compressed_fetch_cached(gallivm
, format_desc
, n
,
2214 base_ptr
, offset
, i
, j
, cache
);
2219 * Could use n > 8 here with avx2, but doesn't seem faster.
2223 LLVMTypeRef i8_vectype
= LLVMVectorType(i8t
, 4 * n
);
2224 LLVMTypeRef i128_type
= LLVMIntTypeInContext(gallivm
->context
, 128);
2225 LLVMTypeRef i128_vectype
= LLVMVectorType(i128_type
, n
/ 4);
2226 LLVMTypeRef i324_vectype
= LLVMVectorType(LLVMInt32TypeInContext(
2227 gallivm
->context
), 4);
2228 LLVMValueRef offset4
, i4
, j4
, rgba4
[LP_MAX_VECTOR_LENGTH
/16];
2229 struct lp_type lp_324_vectype
= lp_type_uint_vec(32, 128);
2231 assert(n
/ 4 <= ARRAY_SIZE(rgba4
));
2233 rgba
= LLVMGetUndef(i128_vectype
);
2235 for (count
= 0; count
< n
/ 4; count
++) {
2236 LLVMValueRef colors
, codewords
, alpha_lo
= NULL
, alpha_hi
= NULL
;
2238 i4
= lp_build_extract_range(gallivm
, i
, count
* 4, 4);
2239 j4
= lp_build_extract_range(gallivm
, j
, count
* 4, 4);
2240 offset4
= lp_build_extract_range(gallivm
, offset
, count
* 4, 4);
2242 lp_build_gather_s3tc(gallivm
, 4, format_desc
, &colors
, &codewords
,
2243 &alpha_lo
, &alpha_hi
, base_ptr
, offset4
);
2245 switch (format_desc
->format
) {
2246 case PIPE_FORMAT_DXT1_RGB
:
2247 case PIPE_FORMAT_DXT1_RGBA
:
2248 case PIPE_FORMAT_DXT1_SRGB
:
2249 case PIPE_FORMAT_DXT1_SRGBA
:
2250 rgba4
[count
] = s3tc_dxt1_to_rgba_aos(gallivm
, 4, format_desc
->format
,
2251 colors
, codewords
, i4
, j4
);
2253 case PIPE_FORMAT_DXT3_RGBA
:
2254 case PIPE_FORMAT_DXT3_SRGBA
:
2255 rgba4
[count
] = s3tc_dxt3_to_rgba_aos(gallivm
, 4, format_desc
->format
, colors
,
2256 codewords
, alpha_lo
, alpha_hi
, i4
, j4
);
2258 case PIPE_FORMAT_DXT5_RGBA
:
2259 case PIPE_FORMAT_DXT5_SRGBA
:
2260 rgba4
[count
] = s3tc_dxt5_to_rgba_aos(gallivm
, 4, format_desc
->format
, colors
,
2261 codewords
, alpha_lo
, alpha_hi
, i4
, j4
);
2265 rgba4
[count
] = LLVMGetUndef(LLVMVectorType(i8t
, 4));
2268 /* shuffles typically give best results with dword elements...*/
2269 rgba4
[count
] = LLVMBuildBitCast(builder
, rgba4
[count
], i324_vectype
, "");
2271 rgba
= lp_build_concat(gallivm
, rgba4
, lp_324_vectype
, n
/ 4);
2272 rgba
= LLVMBuildBitCast(builder
, rgba
, i8_vectype
, "");
2275 LLVMValueRef colors
, codewords
, alpha_lo
= NULL
, alpha_hi
= NULL
;
2277 lp_build_gather_s3tc(gallivm
, n
, format_desc
, &colors
, &codewords
,
2278 &alpha_lo
, &alpha_hi
, base_ptr
, offset
);
2280 switch (format_desc
->format
) {
2281 case PIPE_FORMAT_DXT1_RGB
:
2282 case PIPE_FORMAT_DXT1_RGBA
:
2283 case PIPE_FORMAT_DXT1_SRGB
:
2284 case PIPE_FORMAT_DXT1_SRGBA
:
2285 rgba
= s3tc_dxt1_to_rgba_aos(gallivm
, n
, format_desc
->format
,
2286 colors
, codewords
, i
, j
);
2288 case PIPE_FORMAT_DXT3_RGBA
:
2289 case PIPE_FORMAT_DXT3_SRGBA
:
2290 rgba
= s3tc_dxt3_to_rgba_aos(gallivm
, n
, format_desc
->format
, colors
,
2291 codewords
, alpha_lo
, alpha_hi
, i
, j
);
2293 case PIPE_FORMAT_DXT5_RGBA
:
2294 case PIPE_FORMAT_DXT5_SRGBA
:
2295 rgba
= s3tc_dxt5_to_rgba_aos(gallivm
, n
, format_desc
->format
, colors
,
2296 codewords
, alpha_lo
, alpha_hi
, i
, j
);
2300 rgba
= LLVMGetUndef(LLVMVectorType(i8t
, 4*n
));
2305 /* always return just decompressed values - srgb conversion is done later */
2311 * Gather elements from scatter positions in memory into vectors.
2312 * This is customised for fetching texels from s3tc textures.
2313 * For SSE, typical value is length=4.
2315 * @param length length of the offsets
2316 * @param colors the stored colors of the blocks will be extracted into this.
2317 * @param codewords the codewords of the blocks will be extracted into this.
2318 * @param alpha_lo used for storing lower 32bit of alpha components for dxt3/5
2319 * @param alpha_hi used for storing higher 32bit of alpha components for dxt3/5
2320 * @param base_ptr base pointer, should be a i8 pointer type.
2321 * @param offsets vector with offsets
2324 lp_build_gather_rgtc(struct gallivm_state
*gallivm
,
2326 const struct util_format_description
*format_desc
,
2327 LLVMValueRef
*red_lo
, LLVMValueRef
*red_hi
,
2328 LLVMValueRef
*green_lo
, LLVMValueRef
*green_hi
,
2329 LLVMValueRef base_ptr
,
2330 LLVMValueRef offsets
)
2332 LLVMBuilderRef builder
= gallivm
->builder
;
2333 unsigned block_bits
= format_desc
->block
.bits
;
2335 LLVMValueRef elems
[8];
2336 LLVMTypeRef type32
= LLVMInt32TypeInContext(gallivm
->context
);
2337 LLVMTypeRef type64
= LLVMInt64TypeInContext(gallivm
->context
);
2338 LLVMTypeRef type32dxt
;
2339 struct lp_type lp_type32dxt
;
2341 memset(&lp_type32dxt
, 0, sizeof lp_type32dxt
);
2342 lp_type32dxt
.width
= 32;
2343 lp_type32dxt
.length
= block_bits
/ 32;
2344 type32dxt
= lp_build_vec_type(gallivm
, lp_type32dxt
);
2346 assert(block_bits
== 64 || block_bits
== 128);
2347 assert(length
== 1 || length
== 4 || length
== 8);
2349 for (i
= 0; i
< length
; ++i
) {
2350 elems
[i
] = lp_build_gather_elem(gallivm
, length
,
2351 block_bits
, block_bits
, TRUE
,
2352 base_ptr
, offsets
, i
, FALSE
);
2353 elems
[i
] = LLVMBuildBitCast(builder
, elems
[i
], type32dxt
, "");
2356 LLVMValueRef elem
= elems
[0];
2358 *red_lo
= LLVMBuildExtractElement(builder
, elem
,
2359 lp_build_const_int32(gallivm
, 0), "");
2360 *red_hi
= LLVMBuildExtractElement(builder
, elem
,
2361 lp_build_const_int32(gallivm
, 1), "");
2363 if (block_bits
== 128) {
2364 *green_lo
= LLVMBuildExtractElement(builder
, elem
,
2365 lp_build_const_int32(gallivm
, 2), "");
2366 *green_hi
= LLVMBuildExtractElement(builder
, elem
,
2367 lp_build_const_int32(gallivm
, 3), "");
2370 LLVMValueRef tmp
[4];
2371 struct lp_type lp_type32
, lp_type64
;
2372 memset(&lp_type32
, 0, sizeof lp_type32
);
2373 lp_type32
.width
= 32;
2374 lp_type32
.length
= length
;
2375 lp_type32
.sign
= lp_type32dxt
.sign
;
2376 memset(&lp_type64
, 0, sizeof lp_type64
);
2377 lp_type64
.width
= 64;
2378 lp_type64
.length
= length
/2;
2379 if (block_bits
== 128) {
2381 for (i
= 0; i
< 4; ++i
) {
2383 tmp
[1] = elems
[i
+4];
2384 elems
[i
] = lp_build_concat(gallivm
, tmp
, lp_type32dxt
, 2);
2387 lp_build_transpose_aos(gallivm
, lp_type32
, elems
, tmp
);
2393 LLVMValueRef red01
, red23
;
2394 LLVMTypeRef type64_vec
= LLVMVectorType(type64
, length
/2);
2395 LLVMTypeRef type32_vec
= LLVMVectorType(type32
, length
);
2397 for (i
= 0; i
< length
; ++i
) {
2399 elems
[i
] = LLVMBuildShuffleVector(builder
, elems
[i
],
2400 LLVMGetUndef(type32dxt
),
2401 lp_build_const_extend_shuffle(gallivm
, 2, 4), "");
2404 struct lp_type lp_type32_4
= {0};
2405 lp_type32_4
.width
= 32;
2406 lp_type32_4
.length
= 4;
2407 for (i
= 0; i
< 4; ++i
) {
2409 tmp
[1] = elems
[i
+4];
2410 elems
[i
] = lp_build_concat(gallivm
, tmp
, lp_type32_4
, 2);
2413 red01
= lp_build_interleave2_half(gallivm
, lp_type32
, elems
[0], elems
[1], 0);
2414 red23
= lp_build_interleave2_half(gallivm
, lp_type32
, elems
[2], elems
[3], 0);
2415 red01
= LLVMBuildBitCast(builder
, red01
, type64_vec
, "");
2416 red23
= LLVMBuildBitCast(builder
, red23
, type64_vec
, "");
2417 *red_lo
= lp_build_interleave2_half(gallivm
, lp_type64
, red01
, red23
, 0);
2418 *red_hi
= lp_build_interleave2_half(gallivm
, lp_type64
, red01
, red23
, 1);
2419 *red_lo
= LLVMBuildBitCast(builder
, *red_lo
, type32_vec
, "");
2420 *red_hi
= LLVMBuildBitCast(builder
, *red_hi
, type32_vec
, "");
2428 rgtc1_to_rgba_aos(struct gallivm_state
*gallivm
,
2430 enum pipe_format format
,
2431 LLVMValueRef red_lo
,
2432 LLVMValueRef red_hi
,
2436 LLVMBuilderRef builder
= gallivm
->builder
;
2437 bool is_signed
= (format
== PIPE_FORMAT_RGTC1_SNORM
);
2438 LLVMValueRef red
= s3tc_dxt5_alpha_channel(gallivm
, is_signed
, n
, red_hi
, red_lo
, i
, j
);
2440 struct lp_type type
, type8
;
2441 memset(&type
, 0, sizeof type
);
2444 memset(&type8
, 0, sizeof type8
);
2447 rgba
= lp_build_const_int_vec(gallivm
, type
, is_signed
? (0x7f << 24) : (0xff << 24));
2448 rgba
= LLVMBuildOr(builder
, rgba
, red
, "");
2449 return LLVMBuildBitCast(builder
, rgba
, lp_build_vec_type(gallivm
, type8
), "");
2453 rgtc2_to_rgba_aos(struct gallivm_state
*gallivm
,
2455 enum pipe_format format
,
2456 LLVMValueRef red_lo
,
2457 LLVMValueRef red_hi
,
2458 LLVMValueRef green_lo
,
2459 LLVMValueRef green_hi
,
2463 LLVMBuilderRef builder
= gallivm
->builder
;
2464 bool is_signed
= (format
== PIPE_FORMAT_RGTC2_SNORM
);
2465 LLVMValueRef red
= s3tc_dxt5_alpha_channel(gallivm
, is_signed
, n
, red_hi
, red_lo
, i
, j
);
2466 LLVMValueRef green
= s3tc_dxt5_alpha_channel(gallivm
, is_signed
, n
, green_hi
, green_lo
, i
, j
);
2468 struct lp_type type
, type8
;
2469 memset(&type
, 0, sizeof type
);
2472 memset(&type8
, 0, sizeof type8
);
2475 rgba
= lp_build_const_int_vec(gallivm
, type
, is_signed
? (0x7f << 24) : (0xff << 24));
2476 rgba
= LLVMBuildOr(builder
, rgba
, red
, "");
2477 green
= LLVMBuildShl(builder
, green
, lp_build_const_int_vec(gallivm
, type
, 8), "");
2478 rgba
= LLVMBuildOr(builder
, rgba
, green
, "");
2479 return LLVMBuildBitCast(builder
, rgba
, lp_build_vec_type(gallivm
, type8
), "");
2483 latc1_to_rgba_aos(struct gallivm_state
*gallivm
,
2485 enum pipe_format format
,
2486 LLVMValueRef red_lo
,
2487 LLVMValueRef red_hi
,
2491 LLVMBuilderRef builder
= gallivm
->builder
;
2492 bool is_signed
= (format
== PIPE_FORMAT_LATC1_SNORM
);
2493 LLVMValueRef red
= s3tc_dxt5_alpha_channel(gallivm
, is_signed
, n
, red_hi
, red_lo
, i
, j
);
2494 LLVMValueRef rgba
, temp
;
2495 struct lp_type type
, type8
;
2496 memset(&type
, 0, sizeof type
);
2499 memset(&type8
, 0, sizeof type8
);
2502 rgba
= lp_build_const_int_vec(gallivm
, type
, is_signed
? (0x7f << 24) : (0xff << 24));
2503 rgba
= LLVMBuildOr(builder
, rgba
, red
, "");
2504 temp
= LLVMBuildShl(builder
, red
, lp_build_const_int_vec(gallivm
, type
, 8), "");
2505 rgba
= LLVMBuildOr(builder
, rgba
, temp
, "");
2506 temp
= LLVMBuildShl(builder
, red
, lp_build_const_int_vec(gallivm
, type
, 16), "");
2507 rgba
= LLVMBuildOr(builder
, rgba
, temp
, "");
2508 return LLVMBuildBitCast(builder
, rgba
, lp_build_vec_type(gallivm
, type8
), "");
2512 latc2_to_rgba_aos(struct gallivm_state
*gallivm
,
2514 enum pipe_format format
,
2515 LLVMValueRef red_lo
,
2516 LLVMValueRef red_hi
,
2517 LLVMValueRef green_lo
,
2518 LLVMValueRef green_hi
,
2522 LLVMBuilderRef builder
= gallivm
->builder
;
2523 bool is_signed
= (format
== PIPE_FORMAT_LATC2_SNORM
);
2524 LLVMValueRef red
= s3tc_dxt5_alpha_channel(gallivm
, is_signed
, n
, red_hi
, red_lo
, i
, j
);
2525 LLVMValueRef green
= s3tc_dxt5_alpha_channel(gallivm
, is_signed
, n
, green_hi
, green_lo
, i
, j
);
2526 LLVMValueRef rgba
, temp
;
2527 struct lp_type type
, type8
;
2528 memset(&type
, 0, sizeof type
);
2531 memset(&type8
, 0, sizeof type8
);
2535 temp
= LLVMBuildShl(builder
, red
, lp_build_const_int_vec(gallivm
, type
, 8), "");
2536 rgba
= LLVMBuildOr(builder
, red
, temp
, "");
2537 temp
= LLVMBuildShl(builder
, red
, lp_build_const_int_vec(gallivm
, type
, 16), "");
2538 rgba
= LLVMBuildOr(builder
, rgba
, temp
, "");
2539 temp
= LLVMBuildShl(builder
, green
, lp_build_const_int_vec(gallivm
, type
, 24), "");
2540 rgba
= LLVMBuildOr(builder
, rgba
, temp
, "");
2541 return LLVMBuildBitCast(builder
, rgba
, lp_build_vec_type(gallivm
, type8
), "");
2545 * @param n number of pixels processed (usually n=4, but it should also work with n=1
2546 * and multiples of 4)
2547 * @param base_ptr base pointer (32bit or 64bit pointer depending on the architecture)
2548 * @param offset <n x i32> vector with the relative offsets of the RGTC/LATC blocks
2549 * @param i is a <n x i32> vector with the x subpixel coordinate (0..3)
2550 * @param j is a <n x i32> vector with the y subpixel coordinate (0..3)
2551 * @return a <4*n x i8> vector with the pixel RGBA values in AoS
2554 lp_build_fetch_rgtc_rgba_aos(struct gallivm_state
*gallivm
,
2555 const struct util_format_description
*format_desc
,
2557 LLVMValueRef base_ptr
,
2558 LLVMValueRef offset
,
2564 LLVMTypeRef i8t
= LLVMInt8TypeInContext(gallivm
->context
);
2565 LLVMBuilderRef builder
= gallivm
->builder
;
2566 LLVMValueRef red_lo
, red_hi
, green_lo
, green_hi
;
2567 assert(format_desc
->layout
== UTIL_FORMAT_LAYOUT_RGTC
);
2568 assert(format_desc
->block
.width
== 4);
2569 assert(format_desc
->block
.height
== 4);
2571 assert((n
== 1) || (n
% 4 == 0));
2575 LLVMTypeRef i128_type
= LLVMIntTypeInContext(gallivm
->context
, 128);
2576 LLVMTypeRef i128_vectype
= LLVMVectorType(i128_type
, n
/ 4);
2577 LLVMTypeRef i8_vectype
= LLVMVectorType(i8t
, 4 * n
);
2578 LLVMTypeRef i324_vectype
= LLVMVectorType(LLVMInt32TypeInContext(
2579 gallivm
->context
), 4);
2580 LLVMValueRef offset4
, i4
, j4
, rgba4
[LP_MAX_VECTOR_LENGTH
/16];
2581 struct lp_type lp_324_vectype
= lp_type_uint_vec(32, 128);
2583 rgba
= LLVMGetUndef(i128_vectype
);
2585 for (count
= 0; count
< n
/ 4; count
++) {
2587 i4
= lp_build_extract_range(gallivm
, i
, count
* 4, 4);
2588 j4
= lp_build_extract_range(gallivm
, j
, count
* 4, 4);
2589 offset4
= lp_build_extract_range(gallivm
, offset
, count
* 4, 4);
2591 lp_build_gather_rgtc(gallivm
, 4, format_desc
, &red_lo
, &red_hi
,
2592 &green_lo
, &green_hi
, base_ptr
, offset4
);
2594 switch (format_desc
->format
) {
2595 case PIPE_FORMAT_RGTC1_UNORM
:
2596 case PIPE_FORMAT_RGTC1_SNORM
:
2597 rgba4
[count
] = rgtc1_to_rgba_aos(gallivm
, 4, format_desc
->format
,
2598 red_lo
, red_hi
, i4
, j4
);
2600 case PIPE_FORMAT_RGTC2_UNORM
:
2601 case PIPE_FORMAT_RGTC2_SNORM
:
2602 rgba4
[count
] = rgtc2_to_rgba_aos(gallivm
, 4, format_desc
->format
,
2603 red_lo
, red_hi
, green_lo
, green_hi
, i4
, j4
);
2605 case PIPE_FORMAT_LATC1_UNORM
:
2606 case PIPE_FORMAT_LATC1_SNORM
:
2607 rgba4
[count
] = latc1_to_rgba_aos(gallivm
, 4, format_desc
->format
,
2608 red_lo
, red_hi
, i4
, j4
);
2610 case PIPE_FORMAT_LATC2_UNORM
:
2611 case PIPE_FORMAT_LATC2_SNORM
:
2612 rgba4
[count
] = latc2_to_rgba_aos(gallivm
, 4, format_desc
->format
,
2613 red_lo
, red_hi
, green_lo
, green_hi
, i4
, j4
);
2617 rgba4
[count
] = LLVMGetUndef(LLVMVectorType(i8t
, 4));
2620 /* shuffles typically give best results with dword elements...*/
2621 rgba4
[count
] = LLVMBuildBitCast(builder
, rgba4
[count
], i324_vectype
, "");
2623 rgba
= lp_build_concat(gallivm
, rgba4
, lp_324_vectype
, n
/ 4);
2624 rgba
= LLVMBuildBitCast(builder
, rgba
, i8_vectype
, "");
2626 LLVMValueRef red_lo
, red_hi
, green_lo
, green_hi
;
2628 lp_build_gather_rgtc(gallivm
, n
, format_desc
, &red_lo
, &red_hi
,
2629 &green_lo
, &green_hi
, base_ptr
, offset
);
2631 switch (format_desc
->format
) {
2632 case PIPE_FORMAT_RGTC1_UNORM
:
2633 case PIPE_FORMAT_RGTC1_SNORM
:
2634 rgba
= rgtc1_to_rgba_aos(gallivm
, n
, format_desc
->format
,
2635 red_lo
, red_hi
, i
, j
);
2637 case PIPE_FORMAT_RGTC2_UNORM
:
2638 case PIPE_FORMAT_RGTC2_SNORM
:
2639 rgba
= rgtc2_to_rgba_aos(gallivm
, n
, format_desc
->format
,
2640 red_lo
, red_hi
, green_lo
, green_hi
, i
, j
);
2642 case PIPE_FORMAT_LATC1_UNORM
:
2643 case PIPE_FORMAT_LATC1_SNORM
:
2644 rgba
= latc1_to_rgba_aos(gallivm
, n
, format_desc
->format
,
2645 red_lo
, red_hi
, i
, j
);
2647 case PIPE_FORMAT_LATC2_UNORM
:
2648 case PIPE_FORMAT_LATC2_SNORM
:
2649 rgba
= latc2_to_rgba_aos(gallivm
, n
, format_desc
->format
,
2650 red_lo
, red_hi
, green_lo
, green_hi
, i
, j
);
2654 rgba
= LLVMGetUndef(LLVMVectorType(i8t
, 4*n
));