util: use standard name for snprintf()
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_format_s3tc.c
1 /**************************************************************************
2 *
3 * Copyright 2010-2018 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 * USE OR OTHER DEALINGS IN THE SOFTWARE.
21 *
22 * The above copyright notice and this permission notice (including the
23 * next paragraph) shall be included in all copies or substantial portions
24 * of the Software.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * s3tc pixel format manipulation.
32 *
33 * @author Roland Scheidegger <sroland@vmware.com>
34 */
35
36
37 #include "util/u_format.h"
38 #include "util/u_math.h"
39 #include "util/u_string.h"
40 #include "util/u_cpu_detect.h"
41 #include "util/u_debug.h"
42
43 #include "lp_bld_arit.h"
44 #include "lp_bld_type.h"
45 #include "lp_bld_const.h"
46 #include "lp_bld_conv.h"
47 #include "lp_bld_gather.h"
48 #include "lp_bld_format.h"
49 #include "lp_bld_logic.h"
50 #include "lp_bld_pack.h"
51 #include "lp_bld_flow.h"
52 #include "lp_bld_printf.h"
53 #include "lp_bld_struct.h"
54 #include "lp_bld_swizzle.h"
55 #include "lp_bld_init.h"
56 #include "lp_bld_debug.h"
57 #include "lp_bld_intr.h"
58
59
60 /**
61 * Reverse an interleave2_half
62 * (ie. pick every second element, independent lower/upper halfs)
63 * sse2 can only do that with 32bit (shufps) or larger elements
64 * natively. (Otherwise, and/pack (even) or shift/pack (odd)
65 * could be used, ideally llvm would do that for us.)
66 * XXX: Unfortunately, this does NOT translate to a shufps if those
67 * are int vectors (and casting will not help, llvm needs to recognize it
68 * as "real" float). Instead, llvm will use a pshufd/pshufd/punpcklqdq
69 * sequence which I'm pretty sure is a lot worse despite domain transition
70 * penalties with shufps (except maybe on Nehalem).
71 */
72 static LLVMValueRef
73 lp_build_uninterleave2_half(struct gallivm_state *gallivm,
74 struct lp_type type,
75 LLVMValueRef a,
76 LLVMValueRef b,
77 unsigned lo_hi)
78 {
79 LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
80 unsigned i;
81
82 assert(type.length <= LP_MAX_VECTOR_LENGTH);
83 assert(lo_hi < 2);
84
85 if (type.length * type.width == 256) {
86 assert(type.length == 8);
87 assert(type.width == 32);
88 static const unsigned shufvals[8] = {0, 2, 8, 10, 4, 6, 12, 14};
89 for (i = 0; i < type.length; ++i) {
90 elems[i] = lp_build_const_int32(gallivm, shufvals[i] + lo_hi);
91 }
92 } else {
93 for (i = 0; i < type.length; ++i) {
94 elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
95 }
96 }
97
98 shuffle = LLVMConstVector(elems, type.length);
99
100 return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
101
102 }
103
104
105 /**
106 * Build shuffle for extending vectors.
107 */
108 static LLVMValueRef
109 lp_build_const_extend_shuffle(struct gallivm_state *gallivm,
110 unsigned n, unsigned length)
111 {
112 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
113 unsigned i;
114
115 assert(n <= length);
116 assert(length <= LP_MAX_VECTOR_LENGTH);
117
118 /* TODO: cache results in a static table */
119
120 for(i = 0; i < n; i++) {
121 elems[i] = lp_build_const_int32(gallivm, i);
122 }
123 for (i = n; i < length; i++) {
124 elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
125 }
126
127 return LLVMConstVector(elems, length);
128 }
129
130 static LLVMValueRef
131 lp_build_const_unpackx2_shuffle(struct gallivm_state *gallivm, unsigned n)
132 {
133 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
134 unsigned i, j;
135
136 assert(n <= LP_MAX_VECTOR_LENGTH);
137
138 /* TODO: cache results in a static table */
139
140 for(i = 0, j = 0; i < n; i += 2, ++j) {
141 elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
142 elems[i + 1] = lp_build_const_int32(gallivm, n + j);
143 elems[n + i + 0] = lp_build_const_int32(gallivm, 0 + n/2 + j);
144 elems[n + i + 1] = lp_build_const_int32(gallivm, n + n/2 + j);
145 }
146
147 return LLVMConstVector(elems, n * 2);
148 }
149
150 /*
151 * broadcast 1 element to all elements
152 */
153 static LLVMValueRef
154 lp_build_const_shuffle1(struct gallivm_state *gallivm,
155 unsigned index, unsigned n)
156 {
157 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
158 unsigned i;
159
160 assert(n <= LP_MAX_VECTOR_LENGTH);
161
162 /* TODO: cache results in a static table */
163
164 for (i = 0; i < n; i++) {
165 elems[i] = lp_build_const_int32(gallivm, index);
166 }
167
168 return LLVMConstVector(elems, n);
169 }
170
171 /*
172 * move 1 element to pos 0, rest undef
173 */
174 static LLVMValueRef
175 lp_build_shuffle1undef(struct gallivm_state *gallivm,
176 LLVMValueRef a, unsigned index, unsigned n)
177 {
178 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH], shuf;
179 unsigned i;
180
181 assert(n <= LP_MAX_VECTOR_LENGTH);
182
183 elems[0] = lp_build_const_int32(gallivm, index);
184
185 for (i = 1; i < n; i++) {
186 elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
187 }
188 shuf = LLVMConstVector(elems, n);
189
190 return LLVMBuildShuffleVector(gallivm->builder, a, a, shuf, "");
191 }
192
193 static boolean
194 format_dxt1_variant(enum pipe_format format)
195 {
196 return format == PIPE_FORMAT_DXT1_RGB ||
197 format == PIPE_FORMAT_DXT1_RGBA ||
198 format == PIPE_FORMAT_DXT1_SRGB ||
199 format == PIPE_FORMAT_DXT1_SRGBA;
200
201 }
202
203 /**
204 * Gather elements from scatter positions in memory into vectors.
205 * This is customised for fetching texels from s3tc textures.
206 * For SSE, typical value is length=4.
207 *
208 * @param length length of the offsets
209 * @param colors the stored colors of the blocks will be extracted into this.
210 * @param codewords the codewords of the blocks will be extracted into this.
211 * @param alpha_lo used for storing lower 32bit of alpha components for dxt3/5
212 * @param alpha_hi used for storing higher 32bit of alpha components for dxt3/5
213 * @param base_ptr base pointer, should be a i8 pointer type.
214 * @param offsets vector with offsets
215 */
216 static void
217 lp_build_gather_s3tc(struct gallivm_state *gallivm,
218 unsigned length,
219 const struct util_format_description *format_desc,
220 LLVMValueRef *colors,
221 LLVMValueRef *codewords,
222 LLVMValueRef *alpha_lo,
223 LLVMValueRef *alpha_hi,
224 LLVMValueRef base_ptr,
225 LLVMValueRef offsets)
226 {
227 LLVMBuilderRef builder = gallivm->builder;
228 unsigned block_bits = format_desc->block.bits;
229 unsigned i;
230 LLVMValueRef elems[8];
231 LLVMTypeRef type32 = LLVMInt32TypeInContext(gallivm->context);
232 LLVMTypeRef type64 = LLVMInt64TypeInContext(gallivm->context);
233 LLVMTypeRef type32dxt;
234 struct lp_type lp_type32dxt;
235
236 memset(&lp_type32dxt, 0, sizeof lp_type32dxt);
237 lp_type32dxt.width = 32;
238 lp_type32dxt.length = block_bits / 32;
239 type32dxt = lp_build_vec_type(gallivm, lp_type32dxt);
240
241 assert(block_bits == 64 || block_bits == 128);
242 assert(length == 1 || length == 4 || length == 8);
243
244 for (i = 0; i < length; ++i) {
245 elems[i] = lp_build_gather_elem(gallivm, length,
246 block_bits, block_bits, TRUE,
247 base_ptr, offsets, i, FALSE);
248 elems[i] = LLVMBuildBitCast(builder, elems[i], type32dxt, "");
249 }
250 if (length == 1) {
251 LLVMValueRef elem = elems[0];
252 if (block_bits == 128) {
253 *alpha_lo = LLVMBuildExtractElement(builder, elem,
254 lp_build_const_int32(gallivm, 0), "");
255 *alpha_hi = LLVMBuildExtractElement(builder, elem,
256 lp_build_const_int32(gallivm, 1), "");
257 *colors = LLVMBuildExtractElement(builder, elem,
258 lp_build_const_int32(gallivm, 2), "");
259 *codewords = LLVMBuildExtractElement(builder, elem,
260 lp_build_const_int32(gallivm, 3), "");
261 }
262 else {
263 *alpha_lo = LLVMGetUndef(type32);
264 *alpha_hi = LLVMGetUndef(type32);
265 *colors = LLVMBuildExtractElement(builder, elem,
266 lp_build_const_int32(gallivm, 0), "");
267 *codewords = LLVMBuildExtractElement(builder, elem,
268 lp_build_const_int32(gallivm, 1), "");
269 }
270 }
271 else {
272 LLVMValueRef tmp[4], cc01, cc23;
273 struct lp_type lp_type32, lp_type64;
274 memset(&lp_type32, 0, sizeof lp_type32);
275 lp_type32.width = 32;
276 lp_type32.length = length;
277 memset(&lp_type64, 0, sizeof lp_type64);
278 lp_type64.width = 64;
279 lp_type64.length = length/2;
280
281 if (block_bits == 128) {
282 if (length == 8) {
283 for (i = 0; i < 4; ++i) {
284 tmp[0] = elems[i];
285 tmp[1] = elems[i+4];
286 elems[i] = lp_build_concat(gallivm, tmp, lp_type32dxt, 2);
287 }
288 }
289 lp_build_transpose_aos(gallivm, lp_type32, elems, tmp);
290 *colors = tmp[2];
291 *codewords = tmp[3];
292 *alpha_lo = tmp[0];
293 *alpha_hi = tmp[1];
294 } else {
295 LLVMTypeRef type64_vec = LLVMVectorType(type64, length/2);
296 LLVMTypeRef type32_vec = LLVMVectorType(type32, length);
297
298 for (i = 0; i < length; ++i) {
299 /* no-op shuffle */
300 elems[i] = LLVMBuildShuffleVector(builder, elems[i],
301 LLVMGetUndef(type32dxt),
302 lp_build_const_extend_shuffle(gallivm, 2, 4), "");
303 }
304 if (length == 8) {
305 struct lp_type lp_type32_4 = {0};
306 lp_type32_4.width = 32;
307 lp_type32_4.length = 4;
308 for (i = 0; i < 4; ++i) {
309 tmp[0] = elems[i];
310 tmp[1] = elems[i+4];
311 elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2);
312 }
313 }
314 cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
315 cc23 = lp_build_interleave2_half(gallivm, lp_type32, elems[2], elems[3], 0);
316 cc01 = LLVMBuildBitCast(builder, cc01, type64_vec, "");
317 cc23 = LLVMBuildBitCast(builder, cc23, type64_vec, "");
318 *colors = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 0);
319 *codewords = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 1);
320 *colors = LLVMBuildBitCast(builder, *colors, type32_vec, "");
321 *codewords = LLVMBuildBitCast(builder, *codewords, type32_vec, "");
322 }
323 }
324 }
325
326 /** Convert from <n x i32> containing 2 x n rgb565 colors
327 * to 2 <n x i32> rgba8888 colors
328 * This is the most optimized version I can think of
329 * should be nearly as fast as decoding only one color
330 * NOTE: alpha channel will be set to 0
331 * @param colors is a <n x i32> vector containing the rgb565 colors
332 */
333 static void
334 color_expand2_565_to_8888(struct gallivm_state *gallivm,
335 unsigned n,
336 LLVMValueRef colors,
337 LLVMValueRef *color0,
338 LLVMValueRef *color1)
339 {
340 LLVMBuilderRef builder = gallivm->builder;
341 LLVMValueRef r, g, b, rblo, glo;
342 LLVMValueRef rgblomask, rb, rgb0, rgb1;
343 struct lp_type type, type16, type8;
344
345 assert(n > 1);
346
347 memset(&type, 0, sizeof type);
348 type.width = 32;
349 type.length = n;
350
351 memset(&type16, 0, sizeof type16);
352 type16.width = 16;
353 type16.length = 2 * n;
354
355 memset(&type8, 0, sizeof type8);
356 type8.width = 8;
357 type8.length = 4 * n;
358
359 rgblomask = lp_build_const_int_vec(gallivm, type16, 0x0707);
360 colors = LLVMBuildBitCast(builder, colors,
361 lp_build_vec_type(gallivm, type16), "");
362 /* move r into low 8 bits, b into high 8 bits, g into another reg (low bits)
363 * make sure low bits of r are zero - could use AND but requires constant */
364 r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
365 r = LLVMBuildShl(builder, r, lp_build_const_int_vec(gallivm, type16, 3), "");
366 b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
367 rb = LLVMBuildOr(builder, r, b, "");
368 rblo = LLVMBuildLShr(builder, rb, lp_build_const_int_vec(gallivm, type16, 5), "");
369 /* don't have byte shift hence need mask */
370 rblo = LLVMBuildAnd(builder, rblo, rgblomask, "");
371 rb = LLVMBuildOr(builder, rb, rblo, "");
372
373 /* make sure low bits of g are zero */
374 g = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type16, 0x07e0), "");
375 g = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 3), "");
376 glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 6), "");
377 g = LLVMBuildOr(builder, g, glo, "");
378
379 rb = LLVMBuildBitCast(builder, rb, lp_build_vec_type(gallivm, type8), "");
380 g = LLVMBuildBitCast(builder, g, lp_build_vec_type(gallivm, type8), "");
381 rgb0 = lp_build_interleave2_half(gallivm, type8, rb, g, 0);
382 rgb1 = lp_build_interleave2_half(gallivm, type8, rb, g, 1);
383
384 rgb0 = LLVMBuildBitCast(builder, rgb0, lp_build_vec_type(gallivm, type), "");
385 rgb1 = LLVMBuildBitCast(builder, rgb1, lp_build_vec_type(gallivm, type), "");
386
387 /* rgb0 is rgb00, rgb01, rgb10, rgb11
388 * instead of rgb00, rgb10, rgb20, rgb30 hence need reshuffle
389 * on x86 this _should_ just generate one shufps...
390 */
391 *color0 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 0);
392 *color1 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 1);
393 }
394
395
396 /** Convert from <n x i32> containing rgb565 colors
397 * (in first 16 bits) to <n x i32> rgba8888 colors
398 * bits 16-31 MBZ
399 * NOTE: alpha channel will be set to 0
400 * @param colors is a <n x i32> vector containing the rgb565 colors
401 */
402 static LLVMValueRef
403 color_expand_565_to_8888(struct gallivm_state *gallivm,
404 unsigned n,
405 LLVMValueRef colors)
406 {
407 LLVMBuilderRef builder = gallivm->builder;
408 LLVMValueRef rgba, r, g, b, rgblo, glo;
409 LLVMValueRef rbhimask, g6mask, rgblomask;
410 struct lp_type type;
411 memset(&type, 0, sizeof type);
412 type.width = 32;
413 type.length = n;
414
415 /* color expansion:
416 * first extract and shift colors into their final locations
417 * (high bits - low bits zero at this point)
418 * then replicate highest bits to the lowest bits
419 * note rb replication can be done in parallel but not g
420 * (different shift)
421 * r5mask = 0xf800, g6mask = 0x07e0, b5mask = 0x001f
422 * rhigh = 8, ghigh = 5, bhigh = 19
423 * rblow = 5, glow = 6
424 * rgblowmask = 0x00070307
425 * r = colors >> rhigh
426 * b = colors << bhigh
427 * g = (colors & g6mask) << ghigh
428 * rb = (r | b) rbhimask
429 * rbtmp = rb >> rblow
430 * gtmp = rb >> glow
431 * rbtmp = rbtmp | gtmp
432 * rbtmp = rbtmp & rgblowmask
433 * rgb = rb | g | rbtmp
434 */
435 g6mask = lp_build_const_int_vec(gallivm, type, 0x07e0);
436 rbhimask = lp_build_const_int_vec(gallivm, type, 0x00f800f8);
437 rgblomask = lp_build_const_int_vec(gallivm, type, 0x00070307);
438
439 r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 8), "");
440 b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type, 19), "");
441 g = LLVMBuildAnd(builder, colors, g6mask, "");
442 g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 5), "");
443 rgba = LLVMBuildOr(builder, r, b, "");
444 rgba = LLVMBuildAnd(builder, rgba, rbhimask, "");
445 rgblo = LLVMBuildLShr(builder, rgba, lp_build_const_int_vec(gallivm, type, 5), "");
446 glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type, 6), "");
447 rgblo = LLVMBuildOr(builder, rgblo, glo, "");
448 rgblo = LLVMBuildAnd(builder, rgblo, rgblomask, "");
449 rgba = LLVMBuildOr(builder, rgba, g, "");
450 rgba = LLVMBuildOr(builder, rgba, rgblo, "");
451
452 return rgba;
453 }
454
455
456 /*
457 * Average two byte vectors. (Will always round up.)
458 */
459 static LLVMValueRef
460 lp_build_pavgb(struct lp_build_context *bld8,
461 LLVMValueRef v0,
462 LLVMValueRef v1)
463 {
464 struct gallivm_state *gallivm = bld8->gallivm;
465 LLVMBuilderRef builder = gallivm->builder;
466 assert(bld8->type.width == 8);
467 assert(bld8->type.length == 16 || bld8->type.length == 32);
468 if (HAVE_LLVM < 0x0600) {
469 LLVMValueRef intrargs[2];
470 char *intr_name = bld8->type.length == 32 ? "llvm.x86.avx2.pavg.b" :
471 "llvm.x86.sse2.pavg.b";
472 intrargs[0] = v0;
473 intrargs[1] = v1;
474 return lp_build_intrinsic(builder, intr_name,
475 bld8->vec_type, intrargs, 2, 0);
476 } else {
477 /*
478 * Must match llvm's autoupgrade of pavg.b intrinsic to be useful.
479 * You better hope the backend code manages to detect the pattern, and
480 * the pattern doesn't change there...
481 */
482 struct lp_type type_ext = bld8->type;
483 LLVMTypeRef vec_type_ext;
484 LLVMValueRef res;
485 LLVMValueRef ext_one;
486 type_ext.width = 16;
487 vec_type_ext = lp_build_vec_type(gallivm, type_ext);
488 ext_one = lp_build_const_vec(gallivm, type_ext, 1);
489
490 v0 = LLVMBuildZExt(builder, v0, vec_type_ext, "");
491 v1 = LLVMBuildZExt(builder, v1, vec_type_ext, "");
492 res = LLVMBuildAdd(builder, v0, v1, "");
493 res = LLVMBuildAdd(builder, res, ext_one, "");
494 res = LLVMBuildLShr(builder, res, ext_one, "");
495 res = LLVMBuildTrunc(builder, res, bld8->vec_type, "");
496 return res;
497 }
498 }
499
500 /**
501 * Calculate 1/3(v1-v0) + v0
502 * and 2*1/3(v1-v0) + v0
503 */
504 static void
505 lp_build_lerp23(struct lp_build_context *bld,
506 LLVMValueRef v0,
507 LLVMValueRef v1,
508 LLVMValueRef *res0,
509 LLVMValueRef *res1)
510 {
511 struct gallivm_state *gallivm = bld->gallivm;
512 LLVMValueRef x, x_lo, x_hi, delta_lo, delta_hi;
513 LLVMValueRef mul_lo, mul_hi, v0_lo, v0_hi, v1_lo, v1_hi, tmp;
514 const struct lp_type type = bld->type;
515 LLVMBuilderRef builder = bld->gallivm->builder;
516 struct lp_type i16_type = lp_wider_type(type);
517 struct lp_build_context bld2;
518
519 assert(lp_check_value(type, v0));
520 assert(lp_check_value(type, v1));
521 assert(!type.floating && !type.fixed && !type.norm && type.width == 8);
522
523 lp_build_context_init(&bld2, gallivm, i16_type);
524 bld2.type.sign = TRUE;
525 x = lp_build_const_int_vec(gallivm, bld->type, 255*1/3);
526
527 /* FIXME: use native avx256 unpack/pack */
528 lp_build_unpack2(gallivm, type, i16_type, x, &x_lo, &x_hi);
529 lp_build_unpack2(gallivm, type, i16_type, v0, &v0_lo, &v0_hi);
530 lp_build_unpack2(gallivm, type, i16_type, v1, &v1_lo, &v1_hi);
531 delta_lo = lp_build_sub(&bld2, v1_lo, v0_lo);
532 delta_hi = lp_build_sub(&bld2, v1_hi, v0_hi);
533
534 mul_lo = LLVMBuildMul(builder, x_lo, delta_lo, "");
535 mul_hi = LLVMBuildMul(builder, x_hi, delta_hi, "");
536
537 x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 8), "");
538 x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 8), "");
539 /* lerp optimization: pack now, do add afterwards */
540 tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
541 *res0 = lp_build_add(bld, tmp, v0);
542
543 x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 7), "");
544 x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 7), "");
545 /* unlike above still need mask (but add still afterwards). */
546 x_lo = LLVMBuildAnd(builder, x_lo, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
547 x_hi = LLVMBuildAnd(builder, x_hi, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
548 tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
549 *res1 = lp_build_add(bld, tmp, v0);
550 }
551
552 /**
553 * Convert from <n x i64> s3tc dxt1 to <4n x i8> RGBA AoS
554 * @param colors is a <n x i32> vector with n x 2x16bit colors
555 * @param codewords is a <n x i32> vector containing the codewords
556 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
557 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
558 */
559 static LLVMValueRef
560 s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm,
561 unsigned n,
562 enum pipe_format format,
563 LLVMValueRef colors,
564 LLVMValueRef codewords,
565 LLVMValueRef i,
566 LLVMValueRef j)
567 {
568 LLVMBuilderRef builder = gallivm->builder;
569 LLVMValueRef color0, color1, color2, color3, color2_2, color3_2;
570 LLVMValueRef rgba, a, colors0, colors1, col0, col1, const2;
571 LLVMValueRef bit_pos, sel_mask, sel_lo, sel_hi, indices;
572 struct lp_type type, type8;
573 struct lp_build_context bld8, bld32;
574 boolean is_dxt1_variant = format_dxt1_variant(format);
575
576 memset(&type, 0, sizeof type);
577 type.width = 32;
578 type.length = n;
579
580 memset(&type8, 0, sizeof type8);
581 type8.width = 8;
582 type8.length = 4*n;
583
584 assert(lp_check_value(type, i));
585 assert(lp_check_value(type, j));
586
587 a = lp_build_const_int_vec(gallivm, type, 0xff000000);
588
589 lp_build_context_init(&bld32, gallivm, type);
590 lp_build_context_init(&bld8, gallivm, type8);
591
592 /*
593 * works as follows:
594 * - expand color0/color1 to rgba8888
595 * - calculate color2/3 (interpolation) according to color0 < color1 rules
596 * - calculate color2/3 according to color0 >= color1 rules
597 * - do selection of color2/3 according to comparison of color0/1
598 * - extract indices (vector shift).
599 * - use compare/select to select the correct color. Since we have 2bit
600 * indices (and 4 colors), needs at least three compare/selects.
601 */
602 /*
603 * expand the two colors
604 */
605 col0 = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type, 0x0000ffff), "");
606 col1 = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 16), "");
607 if (n > 1) {
608 color_expand2_565_to_8888(gallivm, n, colors, &color0, &color1);
609 }
610 else {
611 color0 = color_expand_565_to_8888(gallivm, n, col0);
612 color1 = color_expand_565_to_8888(gallivm, n, col1);
613 }
614
615 /*
616 * interpolate colors
617 * color2_1 is 2/3 color0 + 1/3 color1
618 * color3_1 is 1/3 color0 + 2/3 color1
619 * color2_2 is 1/2 color0 + 1/2 color1
620 * color3_2 is 0
621 */
622
623 colors0 = LLVMBuildBitCast(builder, color0, bld8.vec_type, "");
624 colors1 = LLVMBuildBitCast(builder, color1, bld8.vec_type, "");
625 /* can combine 2 lerps into one mostly - still looks expensive enough. */
626 lp_build_lerp23(&bld8, colors0, colors1, &color2, &color3);
627 color2 = LLVMBuildBitCast(builder, color2, bld32.vec_type, "");
628 color3 = LLVMBuildBitCast(builder, color3, bld32.vec_type, "");
629
630 /* dxt3/5 always use 4-color encoding */
631 if (is_dxt1_variant) {
632 /* fix up alpha */
633 if (format == PIPE_FORMAT_DXT1_RGBA ||
634 format == PIPE_FORMAT_DXT1_SRGBA) {
635 color0 = LLVMBuildOr(builder, color0, a, "");
636 color1 = LLVMBuildOr(builder, color1, a, "");
637 color3 = LLVMBuildOr(builder, color3, a, "");
638 }
639 /*
640 * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1.
641 * Much cheaper (but we don't care that much if n == 1).
642 */
643 if ((util_cpu_caps.has_sse2 && n == 4) ||
644 (util_cpu_caps.has_avx2 && n == 8)) {
645 color2_2 = lp_build_pavgb(&bld8, colors0, colors1);
646 color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
647 }
648 else {
649 struct lp_type i16_type = lp_wider_type(type8);
650 struct lp_build_context bld2;
651 LLVMValueRef v0_lo, v0_hi, v1_lo, v1_hi, addlo, addhi;
652
653 lp_build_context_init(&bld2, gallivm, i16_type);
654 bld2.type.sign = TRUE;
655
656 /*
657 * This isn't as expensive as it looks (the unpack is the same as
658 * for lerp23), with correct rounding.
659 * (Note that while rounding is correct, this will always round down,
660 * whereas pavgb will always round up.)
661 */
662 /* FIXME: use native avx256 unpack/pack */
663 lp_build_unpack2(gallivm, type8, i16_type, colors0, &v0_lo, &v0_hi);
664 lp_build_unpack2(gallivm, type8, i16_type, colors1, &v1_lo, &v1_hi);
665
666 addlo = lp_build_add(&bld2, v0_lo, v1_lo);
667 addhi = lp_build_add(&bld2, v0_hi, v1_hi);
668 addlo = LLVMBuildLShr(builder, addlo,
669 lp_build_const_int_vec(gallivm, i16_type, 1), "");
670 addhi = LLVMBuildLShr(builder, addhi,
671 lp_build_const_int_vec(gallivm, i16_type, 1), "");
672 color2_2 = lp_build_pack2(gallivm, i16_type, type8, addlo, addhi);
673 color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
674 }
675 color3_2 = lp_build_const_int_vec(gallivm, type, 0);
676
677 /* select between colors2/3 */
678 /* signed compare is faster saves some xors */
679 type.sign = TRUE;
680 sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, col0, col1);
681 color2 = lp_build_select(&bld32, sel_mask, color2, color2_2);
682 color3 = lp_build_select(&bld32, sel_mask, color3, color3_2);
683 type.sign = FALSE;
684
685 if (format == PIPE_FORMAT_DXT1_RGBA ||
686 format == PIPE_FORMAT_DXT1_SRGBA) {
687 color2 = LLVMBuildOr(builder, color2, a, "");
688 }
689 }
690
691 const2 = lp_build_const_int_vec(gallivm, type, 2);
692 /* extract 2-bit index values */
693 bit_pos = LLVMBuildShl(builder, j, const2, "");
694 bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
695 bit_pos = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
696 /*
697 * NOTE: This innocent looking shift is very expensive with x86/ssex.
698 * Shifts with per-elemnent shift count get roughly translated to
699 * extract (count), extract (value), shift, move (back to xmm), unpack
700 * per element!
701 * So about 20 instructions here for 4xi32.
702 * Newer llvm versions (3.7+) will not do extract/insert but use a
703 * a couple constant count vector shifts plus shuffles. About same
704 * amount of instructions unfortunately...
705 * Would get much worse with 8xi16 even...
706 * We could actually do better here:
707 * - subtract bit_pos from 128+30, shl 23, convert float to int...
708 * - now do mul with codewords followed by shr 30...
709 * But requires 32bit->32bit mul, sse41 only (well that's emulatable
710 * with 2 32bit->64bit muls...) and not exactly cheap
711 * AVX2, of course, fixes this nonsense.
712 */
713 indices = LLVMBuildLShr(builder, codewords, bit_pos, "");
714
715 /* finally select the colors */
716 sel_lo = LLVMBuildAnd(builder, indices, bld32.one, "");
717 sel_lo = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_lo, bld32.one);
718 color0 = lp_build_select(&bld32, sel_lo, color1, color0);
719 color2 = lp_build_select(&bld32, sel_lo, color3, color2);
720 sel_hi = LLVMBuildAnd(builder, indices, const2, "");
721 sel_hi = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_hi, const2);
722 rgba = lp_build_select(&bld32, sel_hi, color2, color0);
723
724 /* fix up alpha */
725 if (format == PIPE_FORMAT_DXT1_RGB ||
726 format == PIPE_FORMAT_DXT1_SRGB) {
727 rgba = LLVMBuildOr(builder, rgba, a, "");
728 }
729 return LLVMBuildBitCast(builder, rgba, bld8.vec_type, "");
730 }
731
732
733 static LLVMValueRef
734 s3tc_dxt1_to_rgba_aos(struct gallivm_state *gallivm,
735 unsigned n,
736 enum pipe_format format,
737 LLVMValueRef colors,
738 LLVMValueRef codewords,
739 LLVMValueRef i,
740 LLVMValueRef j)
741 {
742 return s3tc_dxt1_full_to_rgba_aos(gallivm, n, format,
743 colors, codewords, i, j);
744 }
745
746
747 /**
748 * Convert from <n x i128> s3tc dxt3 to <4n x i8> RGBA AoS
749 * @param colors is a <n x i32> vector with n x 2x16bit colors
750 * @param codewords is a <n x i32> vector containing the codewords
751 * @param alphas is a <n x i64> vector containing the alpha values
752 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
753 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
754 */
755 static LLVMValueRef
756 s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm,
757 unsigned n,
758 enum pipe_format format,
759 LLVMValueRef colors,
760 LLVMValueRef codewords,
761 LLVMValueRef alpha_low,
762 LLVMValueRef alpha_hi,
763 LLVMValueRef i,
764 LLVMValueRef j)
765 {
766 LLVMBuilderRef builder = gallivm->builder;
767 LLVMValueRef rgba, tmp, tmp2;
768 LLVMValueRef bit_pos, sel_mask;
769 struct lp_type type, type8;
770 struct lp_build_context bld;
771
772 memset(&type, 0, sizeof type);
773 type.width = 32;
774 type.length = n;
775
776 memset(&type8, 0, sizeof type8);
777 type8.width = 8;
778 type8.length = n*4;
779
780 assert(lp_check_value(type, i));
781 assert(lp_check_value(type, j));
782
783 lp_build_context_init(&bld, gallivm, type);
784
785 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
786 colors, codewords, i, j);
787
788 rgba = LLVMBuildBitCast(builder, rgba, bld.vec_type, "");
789
790 /*
791 * Extract alpha values. Since we now need to select from
792 * which 32bit vector values are fetched, construct selection
793 * mask from highest bit of bit_pos, and use select, then shift
794 * according to the bit_pos (without the highest bit).
795 * Note this is pointless for n == 1 case. Could just
796 * directly use 64bit arithmetic if we'd extract 64bit
797 * alpha value instead of 2x32...
798 */
799 /* pos = 4*(4j+i) */
800 bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
801 bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
802 bit_pos = LLVMBuildShl(builder, bit_pos,
803 lp_build_const_int_vec(gallivm, type, 2), "");
804 sel_mask = LLVMBuildLShr(builder, bit_pos,
805 lp_build_const_int_vec(gallivm, type, 5), "");
806 sel_mask = LLVMBuildSub(builder, sel_mask, bld.one, "");
807 tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi);
808 bit_pos = LLVMBuildAnd(builder, bit_pos,
809 lp_build_const_int_vec(gallivm, type, 0xffffffdf), "");
810 /* Warning: slow shift with per element count (without avx2) */
811 /*
812 * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
813 * to select the right byte with pshufb. Then for the remaining one bit
814 * just do shift/select.
815 */
816 tmp = LLVMBuildLShr(builder, tmp, bit_pos, "");
817
818 /* combined expand from a4 to a8 and shift into position */
819 tmp = LLVMBuildShl(builder, tmp, lp_build_const_int_vec(gallivm, type, 28), "");
820 tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(gallivm, type, 4), "");
821 tmp = LLVMBuildOr(builder, tmp, tmp2, "");
822
823 rgba = LLVMBuildOr(builder, tmp, rgba, "");
824
825 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
826 }
827
828 static LLVMValueRef
829 lp_build_lerpdxta(struct gallivm_state *gallivm,
830 LLVMValueRef alpha0,
831 LLVMValueRef alpha1,
832 LLVMValueRef code,
833 LLVMValueRef sel_mask,
834 unsigned n)
835 {
836 /*
837 * note we're doing lerp in 16bit since 32bit pmulld is only available in sse41
838 * (plus pmullw is actually faster...)
839 * we just pretend our 32bit values (which are really only 8bit) are 16bits.
840 * Note that this is obviously a disaster for the scalar case.
841 */
842 LLVMBuilderRef builder = gallivm->builder;
843 LLVMValueRef delta, ainterp;
844 LLVMValueRef weight5, weight7, weight;
845 struct lp_type type32, type16, type8;
846 struct lp_build_context bld16;
847
848 memset(&type32, 0, sizeof type32);
849 type32.width = 32;
850 type32.length = n;
851 memset(&type16, 0, sizeof type16);
852 type16.width = 16;
853 type16.length = 2*n;
854 type16.sign = TRUE;
855 memset(&type8, 0, sizeof type8);
856 type8.width = 8;
857 type8.length = 4*n;
858
859 lp_build_context_init(&bld16, gallivm, type16);
860 /* 255/7 is a bit off - increase accuracy at the expense of shift later */
861 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
862 weight5 = lp_build_const_int_vec(gallivm, type16, 255*64/5);
863 weight7 = lp_build_const_int_vec(gallivm, type16, 255*64/7);
864 weight = lp_build_select(&bld16, sel_mask, weight7, weight5);
865
866 alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
867 alpha1 = LLVMBuildBitCast(builder, alpha1, bld16.vec_type, "");
868 code = LLVMBuildBitCast(builder, code, bld16.vec_type, "");
869 /* we'll get garbage in the elements which had code 0 (or larger than 5 or 7)
870 but we don't care */
871 code = LLVMBuildSub(builder, code, bld16.one, "");
872
873 weight = LLVMBuildMul(builder, weight, code, "");
874 weight = LLVMBuildLShr(builder, weight,
875 lp_build_const_int_vec(gallivm, type16, 6), "");
876
877 delta = LLVMBuildSub(builder, alpha1, alpha0, "");
878
879 ainterp = LLVMBuildMul(builder, delta, weight, "");
880 ainterp = LLVMBuildLShr(builder, ainterp,
881 lp_build_const_int_vec(gallivm, type16, 8), "");
882
883 ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type8), "");
884 alpha0 = LLVMBuildBitCast(builder, alpha0, lp_build_vec_type(gallivm, type8), "");
885 ainterp = LLVMBuildAdd(builder, alpha0, ainterp, "");
886 ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type32), "");
887
888 return ainterp;
889 }
890
891 /**
892 * Convert from <n x i128> s3tc dxt5 to <4n x i8> RGBA AoS
893 * @param colors is a <n x i32> vector with n x 2x16bit colors
894 * @param codewords is a <n x i32> vector containing the codewords
895 * @param alphas is a <n x i64> vector containing the alpha values
896 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
897 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
898 */
899 static LLVMValueRef
900 s3tc_dxt5_full_to_rgba_aos(struct gallivm_state *gallivm,
901 unsigned n,
902 enum pipe_format format,
903 LLVMValueRef colors,
904 LLVMValueRef codewords,
905 LLVMValueRef alpha_lo,
906 LLVMValueRef alpha_hi,
907 LLVMValueRef i,
908 LLVMValueRef j)
909 {
910 LLVMBuilderRef builder = gallivm->builder;
911 LLVMValueRef rgba, tmp, alpha0, alpha1, alphac, alphac0, bit_pos, shift;
912 LLVMValueRef sel_mask, tmp_mask, alpha, alpha64, code_s;
913 LLVMValueRef mask6, mask7, ainterp;
914 LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
915 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
916 struct lp_type type, type8;
917 struct lp_build_context bld32;
918
919 memset(&type, 0, sizeof type);
920 type.width = 32;
921 type.length = n;
922
923 memset(&type8, 0, sizeof type8);
924 type8.width = 8;
925 type8.length = n*4;
926
927 assert(lp_check_value(type, i));
928 assert(lp_check_value(type, j));
929
930 lp_build_context_init(&bld32, gallivm, type);
931
932 assert(lp_check_value(type, i));
933 assert(lp_check_value(type, j));
934
935 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
936 colors, codewords, i, j);
937
938 rgba = LLVMBuildBitCast(builder, rgba, bld32.vec_type, "");
939
940 /* this looks pretty complex for vectorization:
941 * extract a0/a1 values
942 * extract code
943 * select weights for interpolation depending on a0 > a1
944 * mul weights by code - 1
945 * lerp a0/a1/weights
946 * use selects for getting either a0, a1, interp a, interp a/0.0, interp a/1.0
947 */
948
949 alpha0 = LLVMBuildAnd(builder, alpha_lo,
950 lp_build_const_int_vec(gallivm, type, 0xff), "");
951 alpha1 = LLVMBuildLShr(builder, alpha_lo,
952 lp_build_const_int_vec(gallivm, type, 8), "");
953 alpha1 = LLVMBuildAnd(builder, alpha1,
954 lp_build_const_int_vec(gallivm, type, 0xff), "");
955
956 /* pos = 3*(4j+i) */
957 bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
958 bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
959 tmp = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
960 bit_pos = LLVMBuildAdd(builder, bit_pos, tmp, "");
961 /* get rid of first 2 bytes - saves shifts of alpha_lo/hi */
962 bit_pos = LLVMBuildAdd(builder, bit_pos,
963 lp_build_const_int_vec(gallivm, type, 16), "");
964
965 if (n == 1) {
966 struct lp_type type64;
967 memset(&type64, 0, sizeof type64);
968 type64.width = 64;
969 type64.length = 1;
970 /* This is pretty pointless could avoid by just directly extracting
971 64bit in the first place but makes it more complicated elsewhere */
972 alpha_lo = LLVMBuildZExt(builder, alpha_lo, i64t, "");
973 alpha_hi = LLVMBuildZExt(builder, alpha_hi, i64t, "");
974 alphac0 = LLVMBuildShl(builder, alpha_hi,
975 lp_build_const_int_vec(gallivm, type64, 32), "");
976 alphac0 = LLVMBuildOr(builder, alpha_lo, alphac0, "");
977
978 shift = LLVMBuildZExt(builder, bit_pos, i64t, "");
979 alphac0 = LLVMBuildLShr(builder, alphac0, shift, "");
980 alphac0 = LLVMBuildTrunc(builder, alphac0, i32t, "");
981 alphac = LLVMBuildAnd(builder, alphac0,
982 lp_build_const_int_vec(gallivm, type, 0x7), "");
983 }
984 else {
985 /*
986 * Using non-native vector length here (actually, with avx2 and
987 * n == 4 llvm will indeed expand to ymm regs...)
988 * At least newer llvm versions handle that ok.
989 * llvm 3.7+ will even handle the emulated 64bit shift with variable
990 * shift count without extraction (and it's actually easier to
991 * emulate than the 32bit one).
992 */
993 alpha64 = LLVMBuildShuffleVector(builder, alpha_lo, alpha_hi,
994 lp_build_const_unpackx2_shuffle(gallivm, n), "");
995
996 alpha64 = LLVMBuildBitCast(builder, alpha64, LLVMVectorType(i64t, n), "");
997 shift = LLVMBuildZExt(builder, bit_pos, LLVMVectorType(i64t, n), "");
998 alphac = LLVMBuildLShr(builder, alpha64, shift, "");
999 alphac = LLVMBuildTrunc(builder, alphac, bld32.vec_type, "");
1000
1001 alphac = LLVMBuildAnd(builder, alphac,
1002 lp_build_const_int_vec(gallivm, type, 0x7), "");
1003 }
1004
1005 /* signed compare is faster saves some xors */
1006 type.sign = TRUE;
1007 /* alpha0 > alpha1 selection */
1008 sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
1009 alpha0, alpha1);
1010 ainterp = lp_build_lerpdxta(gallivm, alpha0, alpha1, alphac, sel_mask, n);
1011
1012 /*
1013 * if a0 > a1 then we select a0 for case 0, a1 for case 1, interp otherwise.
1014 * else we select a0 for case 0, a1 for case 1,
1015 * interp for case 2-5, 00 for 6 and 0xff(ffffff) for 7
1016 * a = (c == 0) ? a0 : a1
1017 * a = (c > 1) ? ainterp : a
1018 * Finally handle case 6/7 for !(a0 > a1)
1019 * a = (!(a0 > a1) && c == 6) ? 0 : a (andnot with mask)
1020 * a = (!(a0 > a1) && c == 7) ? 0xffffffff : a (or with mask)
1021 */
1022 tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1023 alphac, bld32.zero);
1024 alpha = lp_build_select(&bld32, tmp_mask, alpha0, alpha1);
1025 tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
1026 alphac, bld32.one);
1027 alpha = lp_build_select(&bld32, tmp_mask, ainterp, alpha);
1028
1029 code_s = LLVMBuildAnd(builder, alphac,
1030 LLVMBuildNot(builder, sel_mask, ""), "");
1031 mask6 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1032 code_s, lp_build_const_int_vec(gallivm, type, 6));
1033 mask7 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1034 code_s, lp_build_const_int_vec(gallivm, type, 7));
1035 alpha = LLVMBuildAnd(builder, alpha, LLVMBuildNot(builder, mask6, ""), "");
1036 alpha = LLVMBuildOr(builder, alpha, mask7, "");
1037
1038 alpha = LLVMBuildShl(builder, alpha, lp_build_const_int_vec(gallivm, type, 24), "");
1039 rgba = LLVMBuildOr(builder, alpha, rgba, "");
1040
1041 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
1042 }
1043
1044
1045 static void
1046 lp_build_gather_s3tc_simple_scalar(struct gallivm_state *gallivm,
1047 const struct util_format_description *format_desc,
1048 LLVMValueRef *dxt_block,
1049 LLVMValueRef ptr)
1050 {
1051 LLVMBuilderRef builder = gallivm->builder;
1052 unsigned block_bits = format_desc->block.bits;
1053 LLVMValueRef elem, shuf;
1054 LLVMTypeRef type32 = LLVMIntTypeInContext(gallivm->context, 32);
1055 LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, block_bits);
1056 LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
1057 LLVMTypeRef type32_4 = LLVMVectorType(type32, 4);
1058
1059 assert(block_bits == 64 || block_bits == 128);
1060
1061 ptr = LLVMBuildBitCast(builder, ptr, src_ptr_type, "");
1062 elem = LLVMBuildLoad(builder, ptr, "");
1063
1064 if (block_bits == 128) {
1065 /* just return block as is */
1066 *dxt_block = LLVMBuildBitCast(builder, elem, type32_4, "");
1067 }
1068 else {
1069 LLVMTypeRef type32_2 = LLVMVectorType(type32, 2);
1070 shuf = lp_build_const_extend_shuffle(gallivm, 2, 4);
1071 elem = LLVMBuildBitCast(builder, elem, type32_2, "");
1072 *dxt_block = LLVMBuildShuffleVector(builder, elem,
1073 LLVMGetUndef(type32_2), shuf, "");
1074 }
1075 }
1076
1077
1078 static void
1079 s3tc_store_cached_block(struct gallivm_state *gallivm,
1080 LLVMValueRef *col,
1081 LLVMValueRef tag_value,
1082 LLVMValueRef hash_index,
1083 LLVMValueRef cache)
1084 {
1085 LLVMBuilderRef builder = gallivm->builder;
1086 LLVMValueRef ptr, indices[3];
1087 LLVMTypeRef type_ptr4x32;
1088 unsigned count;
1089
1090 type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
1091 indices[0] = lp_build_const_int32(gallivm, 0);
1092 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1093 indices[2] = hash_index;
1094 ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
1095 LLVMBuildStore(builder, tag_value, ptr);
1096
1097 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
1098 hash_index = LLVMBuildMul(builder, hash_index,
1099 lp_build_const_int32(gallivm, 16), "");
1100 for (count = 0; count < 4; count++) {
1101 indices[2] = hash_index;
1102 ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
1103 ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
1104 LLVMBuildStore(builder, col[count], ptr);
1105 hash_index = LLVMBuildAdd(builder, hash_index,
1106 lp_build_const_int32(gallivm, 4), "");
1107 }
1108 }
1109
1110 static LLVMValueRef
1111 s3tc_lookup_cached_pixel(struct gallivm_state *gallivm,
1112 LLVMValueRef ptr,
1113 LLVMValueRef index)
1114 {
1115 LLVMBuilderRef builder = gallivm->builder;
1116 LLVMValueRef member_ptr, indices[3];
1117
1118 indices[0] = lp_build_const_int32(gallivm, 0);
1119 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
1120 indices[2] = index;
1121 member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
1122 return LLVMBuildLoad(builder, member_ptr, "cache_data");
1123 }
1124
1125 static LLVMValueRef
1126 s3tc_lookup_tag_data(struct gallivm_state *gallivm,
1127 LLVMValueRef ptr,
1128 LLVMValueRef index)
1129 {
1130 LLVMBuilderRef builder = gallivm->builder;
1131 LLVMValueRef member_ptr, indices[3];
1132
1133 indices[0] = lp_build_const_int32(gallivm, 0);
1134 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1135 indices[2] = index;
1136 member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
1137 return LLVMBuildLoad(builder, member_ptr, "tag_data");
1138 }
1139
1140 #if LP_BUILD_FORMAT_CACHE_DEBUG
1141 static void
1142 s3tc_update_cache_access(struct gallivm_state *gallivm,
1143 LLVMValueRef ptr,
1144 unsigned count,
1145 unsigned index)
1146 {
1147 LLVMBuilderRef builder = gallivm->builder;
1148 LLVMValueRef member_ptr, cache_access;
1149
1150 assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
1151 index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
1152
1153 member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, "");
1154 cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access");
1155 cache_access = LLVMBuildAdd(builder, cache_access,
1156 LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
1157 count, 0), "");
1158 LLVMBuildStore(builder, cache_access, member_ptr);
1159 }
1160 #endif
1161
1162 /**
1163 * Calculate 1/3(v1-v0) + v0 and 2*1/3(v1-v0) + v0.
1164 * The lerp is performed between the first 2 32bit colors
1165 * in the source vector, both results are returned packed in result vector.
1166 */
1167 static LLVMValueRef
1168 lp_build_lerp23_single(struct lp_build_context *bld,
1169 LLVMValueRef v01)
1170 {
1171 struct gallivm_state *gallivm = bld->gallivm;
1172 LLVMValueRef x, mul, delta, res, v0, v1, elems[8];
1173 const struct lp_type type = bld->type;
1174 LLVMBuilderRef builder = bld->gallivm->builder;
1175 struct lp_type i16_type = lp_wider_type(type);
1176 struct lp_type i32_type = lp_wider_type(i16_type);
1177 struct lp_build_context bld2;
1178
1179 assert(!type.floating && !type.fixed && !type.norm && type.width == 8);
1180
1181 lp_build_context_init(&bld2, gallivm, i16_type);
1182 bld2.type.sign = TRUE;
1183
1184 /* weights 256/3, 256*2/3, with correct rounding */
1185 elems[0] = elems[1] = elems[2] = elems[3] =
1186 lp_build_const_elem(gallivm, i16_type, 255*1/3);
1187 elems[4] = elems[5] = elems[6] = elems[7] =
1188 lp_build_const_elem(gallivm, i16_type, 171);
1189 x = LLVMConstVector(elems, 8);
1190
1191 /*
1192 * v01 has col0 in 32bit elem 0, col1 in elem 1.
1193 * Interleave/unpack will give us separate v0/v1 vectors.
1194 */
1195 v01 = lp_build_interleave2(gallivm, i32_type, v01, v01, 0);
1196 v01 = LLVMBuildBitCast(builder, v01, bld->vec_type, "");
1197
1198 lp_build_unpack2(gallivm, type, i16_type, v01, &v0, &v1);
1199 delta = lp_build_sub(&bld2, v1, v0);
1200
1201 mul = LLVMBuildMul(builder, x, delta, "");
1202
1203 mul = LLVMBuildLShr(builder, mul, lp_build_const_int_vec(gallivm, i16_type, 8), "");
1204 /* lerp optimization: pack now, do add afterwards */
1205 res = lp_build_pack2(gallivm, i16_type, type, mul, bld2.undef);
1206 /* only lower 2 elems are valid - for these v0 is really v0 */
1207 return lp_build_add(bld, res, v01);
1208 }
1209
1210 /*
1211 * decode one dxt1 block.
1212 */
1213 static void
1214 s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
1215 enum pipe_format format,
1216 LLVMValueRef dxt_block,
1217 LLVMValueRef *col)
1218 {
1219 LLVMBuilderRef builder = gallivm->builder;
1220 LLVMValueRef color01, color23, color01_16, color0123;
1221 LLVMValueRef rgba, tmp, a, sel_mask, indices, code, const2;
1222 struct lp_type type8, type32, type16, type64;
1223 struct lp_build_context bld8, bld32, bld16, bld64;
1224 unsigned i;
1225 boolean is_dxt1_variant = format_dxt1_variant(format);
1226
1227 memset(&type32, 0, sizeof type32);
1228 type32.width = 32;
1229 type32.length = 4;
1230 type32.sign = TRUE;
1231
1232 memset(&type8, 0, sizeof type8);
1233 type8.width = 8;
1234 type8.length = 16;
1235
1236 memset(&type16, 0, sizeof type16);
1237 type16.width = 16;
1238 type16.length = 8;
1239
1240 memset(&type64, 0, sizeof type64);
1241 type64.width = 64;
1242 type64.length = 2;
1243
1244 a = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1245 const2 = lp_build_const_int_vec(gallivm, type32, 2);
1246
1247 lp_build_context_init(&bld32, gallivm, type32);
1248 lp_build_context_init(&bld16, gallivm, type16);
1249 lp_build_context_init(&bld8, gallivm, type8);
1250 lp_build_context_init(&bld64, gallivm, type64);
1251
1252 if (is_dxt1_variant) {
1253 color01 = lp_build_shuffle1undef(gallivm, dxt_block, 0, 4);
1254 code = lp_build_shuffle1undef(gallivm, dxt_block, 1, 4);
1255 } else {
1256 color01 = lp_build_shuffle1undef(gallivm, dxt_block, 2, 4);
1257 code = lp_build_shuffle1undef(gallivm, dxt_block, 3, 4);
1258 }
1259 code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
1260 /* expand bytes to dwords */
1261 code = lp_build_interleave2(gallivm, type8, code, code, 0);
1262 code = lp_build_interleave2(gallivm, type8, code, code, 0);
1263
1264
1265 /*
1266 * works as follows:
1267 * - expand color0/color1 to rgba8888
1268 * - calculate color2/3 (interpolation) according to color0 < color1 rules
1269 * - calculate color2/3 according to color0 >= color1 rules
1270 * - do selection of color2/3 according to comparison of color0/1
1271 * - extract indices.
1272 * - use compare/select to select the correct color. Since we have 2bit
1273 * indices (and 4 colors), needs at least three compare/selects.
1274 */
1275
1276 /*
1277 * expand the two colors
1278 */
1279 color01 = LLVMBuildBitCast(builder, color01, bld16.vec_type, "");
1280 color01 = lp_build_interleave2(gallivm, type16, color01,
1281 bld16.zero, 0);
1282 color01_16 = LLVMBuildBitCast(builder, color01, bld32.vec_type, "");
1283 color01 = color_expand_565_to_8888(gallivm, 4, color01_16);
1284
1285 /*
1286 * interpolate colors
1287 * color2_1 is 2/3 color0 + 1/3 color1
1288 * color3_1 is 1/3 color0 + 2/3 color1
1289 * color2_2 is 1/2 color0 + 1/2 color1
1290 * color3_2 is 0
1291 */
1292
1293 /* TODO: since this is now always scalar, should
1294 * probably just use control flow here instead of calculating
1295 * both cases and then selection
1296 */
1297 if (format == PIPE_FORMAT_DXT1_RGBA ||
1298 format == PIPE_FORMAT_DXT1_SRGBA) {
1299 color01 = LLVMBuildOr(builder, color01, a, "");
1300 }
1301 /* can combine 2 lerps into one mostly */
1302 color23 = lp_build_lerp23_single(&bld8, color01);
1303 color23 = LLVMBuildBitCast(builder, color23, bld32.vec_type, "");
1304
1305 /* dxt3/5 always use 4-color encoding */
1306 if (is_dxt1_variant) {
1307 LLVMValueRef color23_2, color2_2;
1308
1309 if (util_cpu_caps.has_sse2) {
1310 LLVMValueRef intrargs[2];
1311 intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, "");
1312 /* same interleave as for lerp23 - correct result in 2nd element */
1313 intrargs[1] = lp_build_interleave2(gallivm, type32, color01, color01, 0);
1314 intrargs[1] = LLVMBuildBitCast(builder, intrargs[1], bld8.vec_type, "");
1315 color2_2 = lp_build_pavgb(&bld8, intrargs[0], intrargs[1]);
1316 }
1317 else {
1318 LLVMValueRef v01, v0, v1, vhalf;
1319 /*
1320 * This isn't as expensive as it looks (the unpack is the same as
1321 * for lerp23, which is the reason why we do the pointless
1322 * interleave2 too), with correct rounding (the two lower elements
1323 * will be the same).
1324 */
1325 v01 = lp_build_interleave2(gallivm, type32, color01, color01, 0);
1326 v01 = LLVMBuildBitCast(builder, v01, bld8.vec_type, "");
1327 lp_build_unpack2(gallivm, type8, type16, v01, &v0, &v1);
1328 vhalf = lp_build_add(&bld16, v0, v1);
1329 vhalf = LLVMBuildLShr(builder, vhalf, bld16.one, "");
1330 color2_2 = lp_build_pack2(gallivm, type16, type8, vhalf, bld16.undef);
1331 }
1332 /* shuffle in color 3 as elem 2 zero, color 2 elem 1 */
1333 color23_2 = LLVMBuildBitCast(builder, color2_2, bld64.vec_type, "");
1334 color23_2 = LLVMBuildLShr(builder, color23_2,
1335 lp_build_const_int_vec(gallivm, type64, 32), "");
1336 color23_2 = LLVMBuildBitCast(builder, color23_2, bld32.vec_type, "");
1337
1338 tmp = LLVMBuildBitCast(builder, color01_16, bld64.vec_type, "");
1339 tmp = LLVMBuildLShr(builder, tmp,
1340 lp_build_const_int_vec(gallivm, type64, 32), "");
1341 tmp = LLVMBuildBitCast(builder, tmp, bld32.vec_type, "");
1342 sel_mask = lp_build_compare(gallivm, type32, PIPE_FUNC_GREATER,
1343 color01_16, tmp);
1344 sel_mask = lp_build_interleave2(gallivm, type32, sel_mask, sel_mask, 0);
1345 color23 = lp_build_select(&bld32, sel_mask, color23, color23_2);
1346 }
1347
1348 if (util_cpu_caps.has_ssse3) {
1349 /*
1350 * Use pshufb as mini-lut. (Only doable with intrinsics as the
1351 * final shuffles are non-constant. pshufb is awesome!)
1352 */
1353 LLVMValueRef shuf[16], low2mask;
1354 LLVMValueRef intrargs[2], lut_ind, lut_adj;
1355
1356 color01 = LLVMBuildBitCast(builder, color01, bld64.vec_type, "");
1357 color23 = LLVMBuildBitCast(builder, color23, bld64.vec_type, "");
1358 color0123 = lp_build_interleave2(gallivm, type64, color01, color23, 0);
1359 color0123 = LLVMBuildBitCast(builder, color0123, bld32.vec_type, "");
1360
1361 if (format == PIPE_FORMAT_DXT1_RGB ||
1362 format == PIPE_FORMAT_DXT1_SRGB) {
1363 color0123 = LLVMBuildOr(builder, color0123, a, "");
1364 }
1365
1366 /* shuffle as r0r1r2r3g0g1... */
1367 for (i = 0; i < 4; i++) {
1368 shuf[4*i] = lp_build_const_int32(gallivm, 0 + i);
1369 shuf[4*i+1] = lp_build_const_int32(gallivm, 4 + i);
1370 shuf[4*i+2] = lp_build_const_int32(gallivm, 8 + i);
1371 shuf[4*i+3] = lp_build_const_int32(gallivm, 12 + i);
1372 }
1373 color0123 = LLVMBuildBitCast(builder, color0123, bld8.vec_type, "");
1374 color0123 = LLVMBuildShuffleVector(builder, color0123, bld8.undef,
1375 LLVMConstVector(shuf, 16), "");
1376
1377 /* lowest 2 bits of each 8 bit value contain index into "LUT" */
1378 low2mask = lp_build_const_int_vec(gallivm, type8, 3);
1379 /* add 0/4/8/12 for r/g/b/a */
1380 lut_adj = lp_build_const_int_vec(gallivm, type32, 0x0c080400);
1381 lut_adj = LLVMBuildBitCast(builder, lut_adj, bld8.vec_type, "");
1382 intrargs[0] = color0123;
1383 for (i = 0; i < 4; i++) {
1384 lut_ind = LLVMBuildAnd(builder, code, low2mask, "");
1385 lut_ind = LLVMBuildOr(builder, lut_ind, lut_adj, "");
1386 intrargs[1] = lut_ind;
1387 col[i] = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
1388 bld8.vec_type, intrargs, 2, 0);
1389 col[i] = LLVMBuildBitCast(builder, col[i], bld32.vec_type, "");
1390 code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
1391 code = LLVMBuildLShr(builder, code, const2, "");
1392 code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
1393 }
1394 }
1395 else {
1396 /* Thanks to vectorization can do 4 texels in parallel */
1397 LLVMValueRef color0, color1, color2, color3;
1398 if (format == PIPE_FORMAT_DXT1_RGB ||
1399 format == PIPE_FORMAT_DXT1_SRGB) {
1400 color01 = LLVMBuildOr(builder, color01, a, "");
1401 color23 = LLVMBuildOr(builder, color23, a, "");
1402 }
1403 color0 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
1404 lp_build_const_shuffle1(gallivm, 0, 4), "");
1405 color1 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
1406 lp_build_const_shuffle1(gallivm, 1, 4), "");
1407 color2 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
1408 lp_build_const_shuffle1(gallivm, 0, 4), "");
1409 color3 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
1410 lp_build_const_shuffle1(gallivm, 1, 4), "");
1411 code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
1412
1413 for (i = 0; i < 4; i++) {
1414 /* select the colors */
1415 LLVMValueRef selmasklo, rgba01, rgba23, bitlo;
1416 bitlo = bld32.one;
1417 indices = LLVMBuildAnd(builder, code, bitlo, "");
1418 selmasklo = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
1419 indices, bitlo);
1420 rgba01 = lp_build_select(&bld32, selmasklo, color1, color0);
1421
1422 LLVMValueRef selmaskhi;
1423 indices = LLVMBuildAnd(builder, code, const2, "");
1424 selmaskhi = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
1425 indices, const2);
1426 rgba23 = lp_build_select(&bld32, selmasklo, color3, color2);
1427 rgba = lp_build_select(&bld32, selmaskhi, rgba23, rgba01);
1428
1429 /*
1430 * Note that this will give "wrong" order.
1431 * col0 will be rgba0, rgba4, rgba8, rgba12, col1 rgba1, rgba5, ...
1432 * This would be easily fixable by using different shuffle, bitlo/hi
1433 * vectors above (and different shift), but seems slightly easier to
1434 * deal with for dxt3/dxt5 alpha too. So instead change lookup.
1435 */
1436 col[i] = rgba;
1437 code = LLVMBuildLShr(builder, code, const2, "");
1438 }
1439 }
1440 }
1441
1442 /*
1443 * decode one dxt3 block.
1444 */
1445 static void
1446 s3tc_decode_block_dxt3(struct gallivm_state *gallivm,
1447 enum pipe_format format,
1448 LLVMValueRef dxt_block,
1449 LLVMValueRef *col)
1450 {
1451 LLVMBuilderRef builder = gallivm->builder;
1452 LLVMValueRef alpha, alphas0, alphas1, shift4_16, a[4], mask8hi;
1453 struct lp_type type32, type8, type16;
1454 unsigned i;
1455
1456 memset(&type32, 0, sizeof type32);
1457 type32.width = 32;
1458 type32.length = 4;
1459
1460 memset(&type8, 0, sizeof type8);
1461 type8.width = 8;
1462 type8.length = 16;
1463
1464 memset(&type16, 0, sizeof type16);
1465 type16.width = 16;
1466 type16.length = 8;
1467
1468 s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
1469
1470 shift4_16 = lp_build_const_int_vec(gallivm, type16, 4);
1471 mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1472
1473 alpha = LLVMBuildBitCast(builder, dxt_block,
1474 lp_build_vec_type(gallivm, type8), "");
1475 alpha = lp_build_interleave2(gallivm, type8, alpha, alpha, 0);
1476 alpha = LLVMBuildBitCast(builder, alpha,
1477 lp_build_vec_type(gallivm, type16), "");
1478 alpha = LLVMBuildAnd(builder, alpha,
1479 lp_build_const_int_vec(gallivm, type16, 0xf00f), "");
1480 alphas0 = LLVMBuildLShr(builder, alpha, shift4_16, "");
1481 alphas1 = LLVMBuildShl(builder, alpha, shift4_16, "");
1482 alpha = LLVMBuildOr(builder, alphas0, alpha, "");
1483 alpha = LLVMBuildOr(builder, alphas1, alpha, "");
1484 alpha = LLVMBuildBitCast(builder, alpha,
1485 lp_build_vec_type(gallivm, type32), "");
1486 /*
1487 * alpha now contains elems 0,1,2,3,... (ubytes)
1488 * we need 0,4,8,12, 1,5,9,13 etc. in dwords to match color (which
1489 * is just as easy as "natural" order - 3 shift/and instead of 6 unpack).
1490 */
1491 a[0] = LLVMBuildShl(builder, alpha,
1492 lp_build_const_int_vec(gallivm, type32, 24), "");
1493 a[1] = LLVMBuildShl(builder, alpha,
1494 lp_build_const_int_vec(gallivm, type32, 16), "");
1495 a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
1496 a[2] = LLVMBuildShl(builder, alpha,
1497 lp_build_const_int_vec(gallivm, type32, 8), "");
1498 a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
1499 a[3] = LLVMBuildAnd(builder, alpha, mask8hi, "");
1500
1501 for (i = 0; i < 4; i++) {
1502 col[i] = LLVMBuildOr(builder, col[i], a[i], "");
1503 }
1504 }
1505
1506
1507 static LLVMValueRef
1508 lp_build_lerpdxta_block(struct gallivm_state *gallivm,
1509 LLVMValueRef alpha0,
1510 LLVMValueRef alpha1,
1511 LLVMValueRef code,
1512 LLVMValueRef sel_mask)
1513 {
1514 LLVMBuilderRef builder = gallivm->builder;
1515 LLVMValueRef delta, ainterp;
1516 LLVMValueRef weight5, weight7, weight;
1517 struct lp_type type16;
1518 struct lp_build_context bld;
1519
1520 memset(&type16, 0, sizeof type16);
1521 type16.width = 16;
1522 type16.length = 8;
1523 type16.sign = TRUE;
1524
1525 lp_build_context_init(&bld, gallivm, type16);
1526 /*
1527 * 256/7 is only 36.57 so we'd lose quite some precision. Since it would
1528 * actually be desirable to do this here with even higher accuracy than
1529 * even 8 bit (more or less required for rgtc, albeit that's not handled
1530 * here right now), shift the weights after multiplication by code.
1531 */
1532 weight5 = lp_build_const_int_vec(gallivm, type16, 256*64/5);
1533 weight7 = lp_build_const_int_vec(gallivm, type16, 256*64/7);
1534 weight = lp_build_select(&bld, sel_mask, weight7, weight5);
1535
1536 /*
1537 * we'll get garbage in the elements which had code 0 (or larger than
1538 * 5 or 7) but we don't care (or rather, need to fix up anyway).
1539 */
1540 code = LLVMBuildSub(builder, code, bld.one, "");
1541
1542 weight = LLVMBuildMul(builder, weight, code, "");
1543 weight = LLVMBuildLShr(builder, weight,
1544 lp_build_const_int_vec(gallivm, type16, 6), "");
1545
1546 delta = LLVMBuildSub(builder, alpha1, alpha0, "");
1547
1548 ainterp = LLVMBuildMul(builder, delta, weight, "");
1549 ainterp = LLVMBuildLShr(builder, ainterp,
1550 lp_build_const_int_vec(gallivm, type16, 8), "");
1551
1552 /* lerp is done later (with packed values) */
1553
1554 return ainterp;
1555 }
1556
1557
1558 /*
1559 * decode one dxt5 block.
1560 */
1561 static void
1562 s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
1563 enum pipe_format format,
1564 LLVMValueRef dxt_block,
1565 LLVMValueRef *col)
1566 {
1567 LLVMBuilderRef builder = gallivm->builder;
1568 LLVMValueRef alpha, alpha0, alpha1, ares;
1569 LLVMValueRef ainterp, ainterp0, ainterp1, shuffle1, sel_mask, sel_mask2;
1570 LLVMValueRef a[4], acode, tmp0, tmp1;
1571 LLVMTypeRef i64t, i32t;
1572 struct lp_type type32, type64, type8, type16;
1573 struct lp_build_context bld16, bld8;
1574 unsigned i;
1575
1576 memset(&type32, 0, sizeof type32);
1577 type32.width = 32;
1578 type32.length = 4;
1579
1580 memset(&type64, 0, sizeof type64);
1581 type64.width = 64;
1582 type64.length = 2;
1583
1584 memset(&type8, 0, sizeof type8);
1585 type8.width = 8;
1586 type8.length = 16;
1587
1588 memset(&type16, 0, sizeof type16);
1589 type16.width = 16;
1590 type16.length = 8;
1591
1592 lp_build_context_init(&bld16, gallivm, type16);
1593 lp_build_context_init(&bld8, gallivm, type8);
1594
1595 i64t = lp_build_vec_type(gallivm, type64);
1596 i32t = lp_build_vec_type(gallivm, type32);
1597
1598 s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
1599
1600 /*
1601 * three possible strategies for vectorizing alpha:
1602 * 1) compute all 8 values then use scalar extraction
1603 * (i.e. have all 8 alpha values packed in one 64bit scalar
1604 * and do something like ax = vals >> (codex * 8) followed
1605 * by inserting these values back into color)
1606 * 2) same as 8 but just use pshufb as a mini-LUT for selection.
1607 * (without pshufb would need boatloads of cmp/selects trying to
1608 * keep things vectorized for essentially scalar selection).
1609 * 3) do something similar to the uncached case
1610 * needs more calculations (need to calc 16 values instead of 8 though
1611 * that's only an issue for the lerp which we need to do twice otherwise
1612 * everything still fits into 128bit) but keeps things vectorized mostly.
1613 * Trying 3) here though not sure it's really faster...
1614 * With pshufb, we try 2) (cheaper and more accurate)
1615 */
1616
1617 /*
1618 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1619 * help since code crosses 8bit boundaries). But variable shifts are
1620 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1621 * shifts!). Instead, emulate by 16bit muls.
1622 * Also, the required byte shuffles are essentially non-emulatable, so
1623 * require ssse3 (albeit other archs might do them fine).
1624 * This is not directly tied to ssse3 - just need sane byte shuffles.
1625 * But ordering is going to be different below so use same condition.
1626 */
1627
1628
1629 /* vectorize alpha */
1630 alpha = LLVMBuildBitCast(builder, dxt_block, i64t, "");
1631 alpha0 = LLVMBuildAnd(builder, alpha,
1632 lp_build_const_int_vec(gallivm, type64, 0xff), "");
1633 alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
1634 alpha = LLVMBuildBitCast(builder, alpha, bld16.vec_type, "");
1635 alpha1 = LLVMBuildLShr(builder, alpha,
1636 lp_build_const_int_vec(gallivm, type16, 8), "");
1637 alpha = LLVMBuildBitCast(builder, alpha, i64t, "");
1638 shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);
1639 alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");
1640 alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");
1641
1642 type16.sign = TRUE;
1643 sel_mask = lp_build_compare(gallivm, type16, PIPE_FUNC_GREATER,
1644 alpha0, alpha1);
1645 type16.sign = FALSE;
1646 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1647
1648 if (!util_cpu_caps.has_ssse3) {
1649 LLVMValueRef acodeg, mask1, acode0, acode1;
1650
1651 /* extraction of the 3 bit values into something more useful is HARD */
1652 /* first steps are actually scalar */
1653 acode = LLVMBuildLShr(builder, alpha,
1654 lp_build_const_int_vec(gallivm, type64, 16), "");
1655 tmp0 = LLVMBuildAnd(builder, acode,
1656 lp_build_const_int_vec(gallivm, type64, 0xffffff), "");
1657 tmp1 = LLVMBuildLShr(builder, acode,
1658 lp_build_const_int_vec(gallivm, type64, 24), "");
1659 tmp0 = LLVMBuildBitCast(builder, tmp0, i32t, "");
1660 tmp1 = LLVMBuildBitCast(builder, tmp1, i32t, "");
1661 acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1662 /* now have 2x24bit in 4x32bit, order 01234567, 89..., undef, undef */
1663 tmp0 = LLVMBuildAnd(builder, acode,
1664 lp_build_const_int_vec(gallivm, type32, 0xfff), "");
1665 tmp1 = LLVMBuildLShr(builder, acode,
1666 lp_build_const_int_vec(gallivm, type32, 12), "");
1667 acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1668 /* now have 4x12bit in 4x32bit, order 0123, 4567, ,,, */
1669 tmp0 = LLVMBuildAnd(builder, acode,
1670 lp_build_const_int_vec(gallivm, type32, 0x3f), "");
1671 tmp1 = LLVMBuildLShr(builder, acode,
1672 lp_build_const_int_vec(gallivm, type32, 6), "");
1673 /* use signed pack doesn't matter and otherwise need sse41 */
1674 type32.sign = type16.sign = TRUE;
1675 acode = lp_build_pack2(gallivm, type32, type16, tmp0, tmp1);
1676 type32.sign = type16.sign = FALSE;
1677 /* now have 8x6bit in 8x16bit, 01, 45, 89, ..., 23, 67, ... */
1678 acode0 = LLVMBuildAnd(builder, acode,
1679 lp_build_const_int_vec(gallivm, type16, 0x7), "");
1680 acode1 = LLVMBuildLShr(builder, acode,
1681 lp_build_const_int_vec(gallivm, type16, 3), "");
1682 acode = lp_build_pack2(gallivm, type16, type8, acode0, acode1);
1683 /* acode0 contains elems 0,4,8,12,2,6,10,14, acode1 1,5,9,... */
1684
1685 acodeg = LLVMBuildAnd(builder, acode,
1686 LLVMBuildNot(builder, sel_mask, ""), "");
1687 mask1 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1688 acode, bld8.one);
1689
1690 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
1691 ainterp0 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode0, sel_mask);
1692 ainterp1 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode1, sel_mask);
1693 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1694 ainterp = lp_build_pack2(gallivm, type16, type8, ainterp0, ainterp1);
1695 alpha0 = lp_build_pack2(gallivm, type16, type8, alpha0, alpha0);
1696 alpha1 = lp_build_pack2(gallivm, type16, type8, alpha1, alpha1);
1697 ainterp = LLVMBuildAdd(builder, ainterp, alpha0, "");
1698 /* Fix up val01 */
1699 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1700 acode, bld8.zero);
1701 ainterp = lp_build_select(&bld8, sel_mask2, alpha0, ainterp);
1702 ainterp = lp_build_select(&bld8, mask1, alpha1, ainterp);
1703
1704 /* fix up val67 if a0 <= a1 */
1705 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1706 acodeg, lp_build_const_int_vec(gallivm, type8, 6));
1707 ares = LLVMBuildAnd(builder, ainterp, LLVMBuildNot(builder, sel_mask2, ""), "");
1708 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1709 acodeg, lp_build_const_int_vec(gallivm, type8, 7));
1710 ares = LLVMBuildOr(builder, ares, sel_mask2, "");
1711
1712 /* unpack in right order (0,4,8,12,1,5,..) */
1713 /* this gives us zero, a0, zero, a4, zero, a8, ... for tmp0 */
1714 tmp0 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 0);
1715 tmp1 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 1);
1716 tmp0 = LLVMBuildBitCast(builder, tmp0, bld16.vec_type, "");
1717 tmp1 = LLVMBuildBitCast(builder, tmp1, bld16.vec_type, "");
1718
1719 a[0] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 0);
1720 a[1] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 0);
1721 a[2] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 1);
1722 a[3] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 1);
1723 }
1724 else {
1725 LLVMValueRef elems[16], intrargs[2], shufa, mulclo, mulchi, mask8hi;
1726 LLVMTypeRef type16s = LLVMInt16TypeInContext(gallivm->context);
1727 LLVMTypeRef type8s = LLVMInt8TypeInContext(gallivm->context);
1728 unsigned i, j;
1729 /*
1730 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1731 * help since code crosses 8bit boundaries). But variable shifts are
1732 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1733 * shifts!). Instead, emulate by 16bit muls.
1734 * Also, the required byte shuffles are essentially non-emulatable, so
1735 * require ssse3 (albeit other archs might do them fine, but the
1736 * complete path is ssse3 only for now).
1737 */
1738 for (i = 0, j = 0; i < 16; i += 8, j += 3) {
1739 elems[i+0] = elems[i+1] = elems[i+2] = lp_build_const_int32(gallivm, j+2);
1740 elems[i+3] = elems[i+4] = lp_build_const_int32(gallivm, j+3);
1741 elems[i+5] = elems[i+6] = elems[i+7] = lp_build_const_int32(gallivm, j+4);
1742 }
1743 shufa = LLVMConstVector(elems, 16);
1744 alpha = LLVMBuildBitCast(builder, alpha, bld8.vec_type, "");
1745 acode = LLVMBuildShuffleVector(builder, alpha, bld8.undef, shufa, "");
1746 acode = LLVMBuildBitCast(builder, acode, bld16.vec_type, "");
1747 /*
1748 * Put 0/2/4/6 into high 3 bits of 16 bits (save AND mask)
1749 * Do the same for 1/3/5/7 (albeit still need mask there - ideally
1750 * we'd place them into bits 4-7 so could save shift but impossible.)
1751 */
1752 for (i = 0; i < 8; i += 4) {
1753 elems[i+0] = LLVMConstInt(type16s, 1 << (13-0), 0);
1754 elems[i+1] = LLVMConstInt(type16s, 1 << (13-6), 0);
1755 elems[i+2] = LLVMConstInt(type16s, 1 << (13-4), 0);
1756 elems[i+3] = LLVMConstInt(type16s, 1 << (13-2), 0);
1757 }
1758 mulclo = LLVMConstVector(elems, 8);
1759 for (i = 0; i < 8; i += 4) {
1760 elems[i+0] = LLVMConstInt(type16s, 1 << (13-3), 0);
1761 elems[i+1] = LLVMConstInt(type16s, 1 << (13-9), 0);
1762 elems[i+2] = LLVMConstInt(type16s, 1 << (13-7), 0);
1763 elems[i+3] = LLVMConstInt(type16s, 1 << (13-5), 0);
1764 }
1765 mulchi = LLVMConstVector(elems, 8);
1766
1767 tmp0 = LLVMBuildMul(builder, acode, mulclo, "");
1768 tmp1 = LLVMBuildMul(builder, acode, mulchi, "");
1769 tmp0 = LLVMBuildLShr(builder, tmp0,
1770 lp_build_const_int_vec(gallivm, type16, 13), "");
1771 tmp1 = LLVMBuildLShr(builder, tmp1,
1772 lp_build_const_int_vec(gallivm, type16, 5), "");
1773 tmp1 = LLVMBuildAnd(builder, tmp1,
1774 lp_build_const_int_vec(gallivm, type16, 0x700), "");
1775 acode = LLVMBuildOr(builder, tmp0, tmp1, "");
1776 acode = LLVMBuildBitCast(builder, acode, bld8.vec_type, "");
1777
1778 /*
1779 * Note that ordering is different here to non-ssse3 path:
1780 * 0/1/2/3/4/5...
1781 */
1782
1783 LLVMValueRef weight0, weight1, weight, delta;
1784 LLVMValueRef constff_elem7, const0_elem6;
1785 /* weights, correctly rounded (round(256*x/7)) */
1786 elems[0] = LLVMConstInt(type16s, 256, 0);
1787 elems[1] = LLVMConstInt(type16s, 0, 0);
1788 elems[2] = LLVMConstInt(type16s, 219, 0);
1789 elems[3] = LLVMConstInt(type16s, 183, 0);
1790 elems[4] = LLVMConstInt(type16s, 146, 0);
1791 elems[5] = LLVMConstInt(type16s, 110, 0);
1792 elems[6] = LLVMConstInt(type16s, 73, 0);
1793 elems[7] = LLVMConstInt(type16s, 37, 0);
1794 weight0 = LLVMConstVector(elems, 8);
1795
1796 elems[0] = LLVMConstInt(type16s, 256, 0);
1797 elems[1] = LLVMConstInt(type16s, 0, 0);
1798 elems[2] = LLVMConstInt(type16s, 205, 0);
1799 elems[3] = LLVMConstInt(type16s, 154, 0);
1800 elems[4] = LLVMConstInt(type16s, 102, 0);
1801 elems[5] = LLVMConstInt(type16s, 51, 0);
1802 elems[6] = LLVMConstInt(type16s, 0, 0);
1803 elems[7] = LLVMConstInt(type16s, 0, 0);
1804 weight1 = LLVMConstVector(elems, 8);
1805
1806 weight0 = LLVMBuildBitCast(builder, weight0, bld8.vec_type, "");
1807 weight1 = LLVMBuildBitCast(builder, weight1, bld8.vec_type, "");
1808 weight = lp_build_select(&bld8, sel_mask, weight0, weight1);
1809 weight = LLVMBuildBitCast(builder, weight, bld16.vec_type, "");
1810
1811 for (i = 0; i < 16; i++) {
1812 elems[i] = LLVMConstNull(type8s);
1813 }
1814 elems[7] = LLVMConstInt(type8s, 255, 0);
1815 constff_elem7 = LLVMConstVector(elems, 16);
1816
1817 for (i = 0; i < 16; i++) {
1818 elems[i] = LLVMConstInt(type8s, 255, 0);
1819 }
1820 elems[6] = LLVMConstInt(type8s, 0, 0);
1821 const0_elem6 = LLVMConstVector(elems, 16);
1822
1823 /* standard simple lerp - but the version we need isn't available */
1824 delta = LLVMBuildSub(builder, alpha0, alpha1, "");
1825 ainterp = LLVMBuildMul(builder, delta, weight, "");
1826 ainterp = LLVMBuildLShr(builder, ainterp,
1827 lp_build_const_int_vec(gallivm, type16, 8), "");
1828 ainterp = LLVMBuildBitCast(builder, ainterp, bld8.vec_type, "");
1829 alpha1 = LLVMBuildBitCast(builder, alpha1, bld8.vec_type, "");
1830 ainterp = LLVMBuildAdd(builder, ainterp, alpha1, "");
1831 ainterp = LLVMBuildBitCast(builder, ainterp, bld16.vec_type, "");
1832 ainterp = lp_build_pack2(gallivm, type16, type8, ainterp, bld16.undef);
1833
1834 /* fixing 0/0xff case is slightly more complex */
1835 constff_elem7 = LLVMBuildAnd(builder, constff_elem7,
1836 LLVMBuildNot(builder, sel_mask, ""), "");
1837 const0_elem6 = LLVMBuildOr(builder, const0_elem6, sel_mask, "");
1838 ainterp = LLVMBuildOr(builder, ainterp, constff_elem7, "");
1839 ainterp = LLVMBuildAnd(builder, ainterp, const0_elem6, "");
1840
1841 /* now pick all 16 elements at once! */
1842 intrargs[0] = ainterp;
1843 intrargs[1] = acode;
1844 ares = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
1845 bld8.vec_type, intrargs, 2, 0);
1846
1847 ares = LLVMBuildBitCast(builder, ares, i32t, "");
1848 mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1849 a[0] = LLVMBuildShl(builder, ares,
1850 lp_build_const_int_vec(gallivm, type32, 24), "");
1851 a[1] = LLVMBuildShl(builder, ares,
1852 lp_build_const_int_vec(gallivm, type32, 16), "");
1853 a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
1854 a[2] = LLVMBuildShl(builder, ares,
1855 lp_build_const_int_vec(gallivm, type32, 8), "");
1856 a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
1857 a[3] = LLVMBuildAnd(builder, ares, mask8hi, "");
1858 }
1859
1860 for (i = 0; i < 4; i++) {
1861 a[i] = LLVMBuildBitCast(builder, a[i], i32t, "");
1862 col[i] = LLVMBuildOr(builder, col[i], a[i], "");
1863 }
1864 }
1865
1866
1867 static void
1868 generate_update_cache_one_block(struct gallivm_state *gallivm,
1869 LLVMValueRef function,
1870 const struct util_format_description *format_desc)
1871 {
1872 LLVMBasicBlockRef block;
1873 LLVMBuilderRef old_builder;
1874 LLVMValueRef ptr_addr;
1875 LLVMValueRef hash_index;
1876 LLVMValueRef cache;
1877 LLVMValueRef dxt_block, tag_value;
1878 LLVMValueRef col[LP_MAX_VECTOR_LENGTH];
1879
1880 ptr_addr = LLVMGetParam(function, 0);
1881 hash_index = LLVMGetParam(function, 1);
1882 cache = LLVMGetParam(function, 2);
1883
1884 lp_build_name(ptr_addr, "ptr_addr" );
1885 lp_build_name(hash_index, "hash_index");
1886 lp_build_name(cache, "cache_addr");
1887
1888 /*
1889 * Function body
1890 */
1891
1892 old_builder = gallivm->builder;
1893 block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
1894 gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
1895 LLVMPositionBuilderAtEnd(gallivm->builder, block);
1896
1897 lp_build_gather_s3tc_simple_scalar(gallivm, format_desc, &dxt_block,
1898 ptr_addr);
1899
1900 switch (format_desc->format) {
1901 case PIPE_FORMAT_DXT1_RGB:
1902 case PIPE_FORMAT_DXT1_RGBA:
1903 case PIPE_FORMAT_DXT1_SRGB:
1904 case PIPE_FORMAT_DXT1_SRGBA:
1905 s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
1906 break;
1907 case PIPE_FORMAT_DXT3_RGBA:
1908 case PIPE_FORMAT_DXT3_SRGBA:
1909 s3tc_decode_block_dxt3(gallivm, format_desc->format, dxt_block, col);
1910 break;
1911 case PIPE_FORMAT_DXT5_RGBA:
1912 case PIPE_FORMAT_DXT5_SRGBA:
1913 s3tc_decode_block_dxt5(gallivm, format_desc->format, dxt_block, col);
1914 break;
1915 default:
1916 assert(0);
1917 s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
1918 break;
1919 }
1920
1921 tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
1922 LLVMInt64TypeInContext(gallivm->context), "");
1923 s3tc_store_cached_block(gallivm, col, tag_value, hash_index, cache);
1924
1925 LLVMBuildRetVoid(gallivm->builder);
1926
1927 LLVMDisposeBuilder(gallivm->builder);
1928 gallivm->builder = old_builder;
1929
1930 gallivm_verify_function(gallivm, function);
1931 }
1932
1933
1934 static void
1935 update_cached_block(struct gallivm_state *gallivm,
1936 const struct util_format_description *format_desc,
1937 LLVMValueRef ptr_addr,
1938 LLVMValueRef hash_index,
1939 LLVMValueRef cache)
1940
1941 {
1942 LLVMBuilderRef builder = gallivm->builder;
1943 LLVMModuleRef module = gallivm->module;
1944 char name[256];
1945 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
1946 LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
1947 LLVMValueRef function, inst;
1948 LLVMBasicBlockRef bb;
1949 LLVMValueRef args[3];
1950
1951 snprintf(name, sizeof name, "%s_update_cache_one_block",
1952 format_desc->short_name);
1953 function = LLVMGetNamedFunction(module, name);
1954
1955 if (!function) {
1956 LLVMTypeRef ret_type;
1957 LLVMTypeRef arg_types[3];
1958 LLVMTypeRef function_type;
1959 unsigned arg;
1960
1961 /*
1962 * Generate the function prototype.
1963 */
1964
1965 ret_type = LLVMVoidTypeInContext(gallivm->context);
1966 arg_types[0] = pi8t;
1967 arg_types[1] = LLVMInt32TypeInContext(gallivm->context);
1968 arg_types[2] = LLVMTypeOf(cache); // XXX: put right type here
1969 function_type = LLVMFunctionType(ret_type, arg_types, ARRAY_SIZE(arg_types), 0);
1970 function = LLVMAddFunction(module, name, function_type);
1971
1972 for (arg = 0; arg < ARRAY_SIZE(arg_types); ++arg)
1973 if (LLVMGetTypeKind(arg_types[arg]) == LLVMPointerTypeKind)
1974 lp_add_function_attr(function, arg + 1, LP_FUNC_ATTR_NOALIAS);
1975
1976 LLVMSetFunctionCallConv(function, LLVMFastCallConv);
1977 LLVMSetVisibility(function, LLVMHiddenVisibility);
1978 generate_update_cache_one_block(gallivm, function, format_desc);
1979 }
1980
1981 args[0] = ptr_addr;
1982 args[1] = hash_index;
1983 args[2] = cache;
1984
1985 LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
1986 bb = LLVMGetInsertBlock(builder);
1987 inst = LLVMGetLastInstruction(bb);
1988 LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
1989 }
1990
1991 /*
1992 * cached lookup
1993 */
1994 static LLVMValueRef
1995 compressed_fetch_cached(struct gallivm_state *gallivm,
1996 const struct util_format_description *format_desc,
1997 unsigned n,
1998 LLVMValueRef base_ptr,
1999 LLVMValueRef offset,
2000 LLVMValueRef i,
2001 LLVMValueRef j,
2002 LLVMValueRef cache)
2003
2004 {
2005 LLVMBuilderRef builder = gallivm->builder;
2006 unsigned count, low_bit, log2size;
2007 LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
2008 LLVMValueRef ij_index, hash_index, hash_mask, block_index;
2009 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2010 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
2011 LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
2012 struct lp_type type;
2013 struct lp_build_context bld32;
2014 memset(&type, 0, sizeof type);
2015 type.width = 32;
2016 type.length = n;
2017
2018 lp_build_context_init(&bld32, gallivm, type);
2019
2020 /*
2021 * compute hash - we use direct mapped cache, the hash function could
2022 * be better but it needs to be simple
2023 * per-element:
2024 * compare offset with offset stored at tag (hash)
2025 * if not equal extract block, store block, update tag
2026 * extract color from cache
2027 * assemble colors
2028 */
2029
2030 low_bit = util_logbase2(format_desc->block.bits / 8);
2031 log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
2032 addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
2033 ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
2034 ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
2035 /* For the hash function, first mask off the unused lowest bits. Then just
2036 do some xor with address bits - only use lower 32bits */
2037 ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
2038 ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
2039 lp_build_const_int_vec(gallivm, type, low_bit), "");
2040 /* This only really makes sense for size 64,128,256 */
2041 hash_index = ptr_addrtrunc;
2042 ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
2043 lp_build_const_int_vec(gallivm, type, 2*log2size), "");
2044 hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
2045 tmp = LLVMBuildLShr(builder, hash_index,
2046 lp_build_const_int_vec(gallivm, type, log2size), "");
2047 hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
2048
2049 hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
2050 hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
2051 ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
2052 ij_index = LLVMBuildAdd(builder, ij_index, j, "");
2053 block_index = LLVMBuildShl(builder, hash_index,
2054 lp_build_const_int_vec(gallivm, type, 4), "");
2055 block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
2056
2057 if (n > 1) {
2058 color = bld32.undef;
2059 for (count = 0; count < n; count++) {
2060 LLVMValueRef index, cond, colorx;
2061 LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
2062 struct lp_build_if_state if_ctx;
2063
2064 index = lp_build_const_int32(gallivm, count);
2065 offsetx = LLVMBuildExtractElement(builder, offset, index, "");
2066 addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
2067 addrx = LLVMBuildAdd(builder, addrx, addr, "");
2068 block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
2069 hash_indexx = LLVMBuildLShr(builder, block_indexx,
2070 lp_build_const_int32(gallivm, 4), "");
2071 offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_indexx);
2072 cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
2073
2074 lp_build_if(&if_ctx, gallivm, cond);
2075 {
2076 ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
2077 LLVMPointerType(i8t, 0), "");
2078 update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
2079 #if LP_BUILD_FORMAT_CACHE_DEBUG
2080 s3tc_update_cache_access(gallivm, cache, 1,
2081 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
2082 #endif
2083 }
2084 lp_build_endif(&if_ctx);
2085
2086 colorx = s3tc_lookup_cached_pixel(gallivm, cache, block_indexx);
2087
2088 color = LLVMBuildInsertElement(builder, color, colorx,
2089 lp_build_const_int32(gallivm, count), "");
2090 }
2091 }
2092 else {
2093 LLVMValueRef cond;
2094 struct lp_build_if_state if_ctx;
2095
2096 tmp = LLVMBuildZExt(builder, offset, i64t, "");
2097 addr = LLVMBuildAdd(builder, tmp, addr, "");
2098 offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_index);
2099 cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
2100
2101 lp_build_if(&if_ctx, gallivm, cond);
2102 {
2103 tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
2104 update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
2105 #if LP_BUILD_FORMAT_CACHE_DEBUG
2106 s3tc_update_cache_access(gallivm, cache, 1,
2107 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
2108 #endif
2109 }
2110 lp_build_endif(&if_ctx);
2111
2112 color = s3tc_lookup_cached_pixel(gallivm, cache, block_index);
2113 }
2114 #if LP_BUILD_FORMAT_CACHE_DEBUG
2115 s3tc_update_cache_access(gallivm, cache, n,
2116 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
2117 #endif
2118 return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
2119 }
2120
2121
2122 static LLVMValueRef
2123 s3tc_dxt5_to_rgba_aos(struct gallivm_state *gallivm,
2124 unsigned n,
2125 enum pipe_format format,
2126 LLVMValueRef colors,
2127 LLVMValueRef codewords,
2128 LLVMValueRef alpha_lo,
2129 LLVMValueRef alpha_hi,
2130 LLVMValueRef i,
2131 LLVMValueRef j)
2132 {
2133 return s3tc_dxt5_full_to_rgba_aos(gallivm, n, format, colors,
2134 codewords, alpha_lo, alpha_hi, i, j);
2135 }
2136
2137
2138 /**
2139 * @param n number of pixels processed (usually n=4, but it should also work with n=1
2140 * and multiples of 4)
2141 * @param base_ptr base pointer (32bit or 64bit pointer depending on the architecture)
2142 * @param offset <n x i32> vector with the relative offsets of the S3TC blocks
2143 * @param i is a <n x i32> vector with the x subpixel coordinate (0..3)
2144 * @param j is a <n x i32> vector with the y subpixel coordinate (0..3)
2145 * @return a <4*n x i8> vector with the pixel RGBA values in AoS
2146 */
2147 LLVMValueRef
2148 lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,
2149 const struct util_format_description *format_desc,
2150 unsigned n,
2151 LLVMValueRef base_ptr,
2152 LLVMValueRef offset,
2153 LLVMValueRef i,
2154 LLVMValueRef j,
2155 LLVMValueRef cache)
2156 {
2157 LLVMValueRef rgba;
2158 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2159 LLVMBuilderRef builder = gallivm->builder;
2160
2161 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
2162 assert(format_desc->block.width == 4);
2163 assert(format_desc->block.height == 4);
2164
2165 assert((n == 1) || (n % 4 == 0));
2166
2167 /* debug_printf("format = %d\n", format_desc->format);*/
2168 if (cache) {
2169 rgba = compressed_fetch_cached(gallivm, format_desc, n,
2170 base_ptr, offset, i, j, cache);
2171 return rgba;
2172 }
2173
2174 /*
2175 * Could use n > 8 here with avx2, but doesn't seem faster.
2176 */
2177 if (n > 4) {
2178 unsigned count;
2179 LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
2180 LLVMTypeRef i128_type = LLVMIntTypeInContext(gallivm->context, 128);
2181 LLVMTypeRef i128_vectype = LLVMVectorType(i128_type, n / 4);
2182 LLVMTypeRef i324_vectype = LLVMVectorType(LLVMInt32TypeInContext(
2183 gallivm->context), 4);
2184 LLVMValueRef offset4, i4, j4, rgba4[LP_MAX_VECTOR_LENGTH/16];
2185 struct lp_type lp_324_vectype = lp_type_uint_vec(32, 128);
2186
2187 assert(n / 4 <= ARRAY_SIZE(rgba4));
2188
2189 rgba = LLVMGetUndef(i128_vectype);
2190
2191 for (count = 0; count < n / 4; count++) {
2192 LLVMValueRef colors, codewords, alpha_lo = NULL, alpha_hi = NULL;
2193
2194 i4 = lp_build_extract_range(gallivm, i, count * 4, 4);
2195 j4 = lp_build_extract_range(gallivm, j, count * 4, 4);
2196 offset4 = lp_build_extract_range(gallivm, offset, count * 4, 4);
2197
2198 lp_build_gather_s3tc(gallivm, 4, format_desc, &colors, &codewords,
2199 &alpha_lo, &alpha_hi, base_ptr, offset4);
2200
2201 switch (format_desc->format) {
2202 case PIPE_FORMAT_DXT1_RGB:
2203 case PIPE_FORMAT_DXT1_RGBA:
2204 case PIPE_FORMAT_DXT1_SRGB:
2205 case PIPE_FORMAT_DXT1_SRGBA:
2206 rgba4[count] = s3tc_dxt1_to_rgba_aos(gallivm, 4, format_desc->format,
2207 colors, codewords, i4, j4);
2208 break;
2209 case PIPE_FORMAT_DXT3_RGBA:
2210 case PIPE_FORMAT_DXT3_SRGBA:
2211 rgba4[count] = s3tc_dxt3_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2212 codewords, alpha_lo, alpha_hi, i4, j4);
2213 break;
2214 case PIPE_FORMAT_DXT5_RGBA:
2215 case PIPE_FORMAT_DXT5_SRGBA:
2216 rgba4[count] = s3tc_dxt5_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2217 codewords, alpha_lo, alpha_hi, i4, j4);
2218 break;
2219 default:
2220 assert(0);
2221 rgba4[count] = LLVMGetUndef(LLVMVectorType(i8t, 4));
2222 break;
2223 }
2224 /* shuffles typically give best results with dword elements...*/
2225 rgba4[count] = LLVMBuildBitCast(builder, rgba4[count], i324_vectype, "");
2226 }
2227 rgba = lp_build_concat(gallivm, rgba4, lp_324_vectype, n / 4);
2228 rgba = LLVMBuildBitCast(builder, rgba, i8_vectype, "");
2229 }
2230 else {
2231 LLVMValueRef colors, codewords, alpha_lo = NULL, alpha_hi = NULL;
2232
2233 lp_build_gather_s3tc(gallivm, n, format_desc, &colors, &codewords,
2234 &alpha_lo, &alpha_hi, base_ptr, offset);
2235
2236 switch (format_desc->format) {
2237 case PIPE_FORMAT_DXT1_RGB:
2238 case PIPE_FORMAT_DXT1_RGBA:
2239 case PIPE_FORMAT_DXT1_SRGB:
2240 case PIPE_FORMAT_DXT1_SRGBA:
2241 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format_desc->format,
2242 colors, codewords, i, j);
2243 break;
2244 case PIPE_FORMAT_DXT3_RGBA:
2245 case PIPE_FORMAT_DXT3_SRGBA:
2246 rgba = s3tc_dxt3_to_rgba_aos(gallivm, n, format_desc->format, colors,
2247 codewords, alpha_lo, alpha_hi, i, j);
2248 break;
2249 case PIPE_FORMAT_DXT5_RGBA:
2250 case PIPE_FORMAT_DXT5_SRGBA:
2251 rgba = s3tc_dxt5_to_rgba_aos(gallivm, n, format_desc->format, colors,
2252 codewords, alpha_lo, alpha_hi, i, j);
2253 break;
2254 default:
2255 assert(0);
2256 rgba = LLVMGetUndef(LLVMVectorType(i8t, 4*n));
2257 break;
2258 }
2259 }
2260
2261 /* always return just decompressed values - srgb conversion is done later */
2262
2263 return rgba;
2264 }