llvmpipe: init some vars to NULL to silence MinGW compiler warnings
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_format_s3tc.c
1 /**************************************************************************
2 *
3 * Copyright 2010-2018 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 * USE OR OTHER DEALINGS IN THE SOFTWARE.
21 *
22 * The above copyright notice and this permission notice (including the
23 * next paragraph) shall be included in all copies or substantial portions
24 * of the Software.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * s3tc pixel format manipulation.
32 *
33 * @author Roland Scheidegger <sroland@vmware.com>
34 */
35
36
37 #include "util/u_format.h"
38 #include "util/u_math.h"
39 #include "util/u_string.h"
40 #include "util/u_cpu_detect.h"
41 #include "util/u_debug.h"
42
43 #include "lp_bld_arit.h"
44 #include "lp_bld_type.h"
45 #include "lp_bld_const.h"
46 #include "lp_bld_conv.h"
47 #include "lp_bld_gather.h"
48 #include "lp_bld_format.h"
49 #include "lp_bld_logic.h"
50 #include "lp_bld_pack.h"
51 #include "lp_bld_flow.h"
52 #include "lp_bld_printf.h"
53 #include "lp_bld_struct.h"
54 #include "lp_bld_swizzle.h"
55 #include "lp_bld_init.h"
56 #include "lp_bld_debug.h"
57 #include "lp_bld_intr.h"
58
59
60 /**
61 * Reverse an interleave2_half
62 * (ie. pick every second element, independent lower/upper halfs)
63 * sse2 can only do that with 32bit (shufps) or larger elements
64 * natively. (Otherwise, and/pack (even) or shift/pack (odd)
65 * could be used, ideally llvm would do that for us.)
66 * XXX: Unfortunately, this does NOT translate to a shufps if those
67 * are int vectors (and casting will not help, llvm needs to recognize it
68 * as "real" float). Instead, llvm will use a pshufd/pshufd/punpcklqdq
69 * sequence which I'm pretty sure is a lot worse despite domain transition
70 * penalties with shufps (except maybe on Nehalem).
71 */
72 static LLVMValueRef
73 lp_build_uninterleave2_half(struct gallivm_state *gallivm,
74 struct lp_type type,
75 LLVMValueRef a,
76 LLVMValueRef b,
77 unsigned lo_hi)
78 {
79 LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
80 unsigned i, j;
81
82 assert(type.length <= LP_MAX_VECTOR_LENGTH);
83 assert(lo_hi < 2);
84
85 if (type.length * type.width == 256) {
86 assert(type.length >= 4);
87 for (i = 0, j = 0; i < type.length; ++i) {
88 if (i == type.length / 4) {
89 j = type.length;
90 } else if (i == type.length / 2) {
91 j = type.length / 2;
92 } else if (i == 3 * type.length / 4) {
93 j = 3 * type.length / 4;
94 } else {
95 j += 2;
96 }
97 elems[i] = lp_build_const_int32(gallivm, j + lo_hi);
98 }
99 } else {
100 for (i = 0; i < type.length; ++i) {
101 elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
102 }
103 }
104
105 shuffle = LLVMConstVector(elems, type.length);
106
107 return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
108
109 }
110
111
112 /**
113 * Build shuffle for extending vectors.
114 */
115 static LLVMValueRef
116 lp_build_const_extend_shuffle(struct gallivm_state *gallivm,
117 unsigned n, unsigned length)
118 {
119 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
120 unsigned i;
121
122 assert(n <= length);
123 assert(length <= LP_MAX_VECTOR_LENGTH);
124
125 /* TODO: cache results in a static table */
126
127 for(i = 0; i < n; i++) {
128 elems[i] = lp_build_const_int32(gallivm, i);
129 }
130 for (i = n; i < length; i++) {
131 elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
132 }
133
134 return LLVMConstVector(elems, length);
135 }
136
137 static LLVMValueRef
138 lp_build_const_unpackx2_shuffle(struct gallivm_state *gallivm, unsigned n)
139 {
140 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
141 unsigned i, j;
142
143 assert(n <= LP_MAX_VECTOR_LENGTH);
144
145 /* TODO: cache results in a static table */
146
147 for(i = 0, j = 0; i < n; i += 2, ++j) {
148 elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
149 elems[i + 1] = lp_build_const_int32(gallivm, n + j);
150 elems[n + i + 0] = lp_build_const_int32(gallivm, 0 + n/2 + j);
151 elems[n + i + 1] = lp_build_const_int32(gallivm, n + n/2 + j);
152 }
153
154 return LLVMConstVector(elems, n * 2);
155 }
156
157 /*
158 * broadcast 1 element to all elements
159 */
160 static LLVMValueRef
161 lp_build_const_shuffle1(struct gallivm_state *gallivm,
162 unsigned index, unsigned n)
163 {
164 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
165 unsigned i;
166
167 assert(n <= LP_MAX_VECTOR_LENGTH);
168
169 /* TODO: cache results in a static table */
170
171 for (i = 0; i < n; i++) {
172 elems[i] = lp_build_const_int32(gallivm, index);
173 }
174
175 return LLVMConstVector(elems, n);
176 }
177
178 /*
179 * move 1 element to pos 0, rest undef
180 */
181 static LLVMValueRef
182 lp_build_shuffle1undef(struct gallivm_state *gallivm,
183 LLVMValueRef a, unsigned index, unsigned n)
184 {
185 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH], shuf;
186 unsigned i;
187
188 assert(n <= LP_MAX_VECTOR_LENGTH);
189
190 elems[0] = lp_build_const_int32(gallivm, index);
191
192 for (i = 1; i < n; i++) {
193 elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
194 }
195 shuf = LLVMConstVector(elems, n);
196
197 return LLVMBuildShuffleVector(gallivm->builder, a, a, shuf, "");
198 }
199
200 static boolean
201 format_dxt1_variant(enum pipe_format format)
202 {
203 return format == PIPE_FORMAT_DXT1_RGB ||
204 format == PIPE_FORMAT_DXT1_RGBA ||
205 format == PIPE_FORMAT_DXT1_SRGB ||
206 format == PIPE_FORMAT_DXT1_SRGBA;
207
208 }
209
210 /**
211 * Gather elements from scatter positions in memory into vectors.
212 * This is customised for fetching texels from s3tc textures.
213 * For SSE, typical value is length=4.
214 *
215 * @param length length of the offsets
216 * @param colors the stored colors of the blocks will be extracted into this.
217 * @param codewords the codewords of the blocks will be extracted into this.
218 * @param alpha_lo used for storing lower 32bit of alpha components for dxt3/5
219 * @param alpha_hi used for storing higher 32bit of alpha components for dxt3/5
220 * @param base_ptr base pointer, should be a i8 pointer type.
221 * @param offsets vector with offsets
222 */
223 static void
224 lp_build_gather_s3tc(struct gallivm_state *gallivm,
225 unsigned length,
226 const struct util_format_description *format_desc,
227 LLVMValueRef *colors,
228 LLVMValueRef *codewords,
229 LLVMValueRef *alpha_lo,
230 LLVMValueRef *alpha_hi,
231 LLVMValueRef base_ptr,
232 LLVMValueRef offsets)
233 {
234 LLVMBuilderRef builder = gallivm->builder;
235 unsigned block_bits = format_desc->block.bits;
236 unsigned i;
237 LLVMValueRef elems[8];
238 LLVMTypeRef type32 = LLVMInt32TypeInContext(gallivm->context);
239 LLVMTypeRef type64 = LLVMInt64TypeInContext(gallivm->context);
240 LLVMTypeRef type32dxt;
241 struct lp_type lp_type32dxt;
242
243 memset(&lp_type32dxt, 0, sizeof lp_type32dxt);
244 lp_type32dxt.width = 32;
245 lp_type32dxt.length = block_bits / 32;
246 type32dxt = lp_build_vec_type(gallivm, lp_type32dxt);
247
248 assert(block_bits == 64 || block_bits == 128);
249 assert(length == 1 || length == 4 || length == 8);
250
251 for (i = 0; i < length; ++i) {
252 elems[i] = lp_build_gather_elem(gallivm, length,
253 block_bits, block_bits, TRUE,
254 base_ptr, offsets, i, FALSE);
255 elems[i] = LLVMBuildBitCast(builder, elems[i], type32dxt, "");
256 }
257 if (length == 1) {
258 LLVMValueRef elem = elems[0];
259 if (block_bits == 128) {
260 *alpha_lo = LLVMBuildExtractElement(builder, elem,
261 lp_build_const_int32(gallivm, 0), "");
262 *alpha_hi = LLVMBuildExtractElement(builder, elem,
263 lp_build_const_int32(gallivm, 1), "");
264 *colors = LLVMBuildExtractElement(builder, elem,
265 lp_build_const_int32(gallivm, 2), "");
266 *codewords = LLVMBuildExtractElement(builder, elem,
267 lp_build_const_int32(gallivm, 3), "");
268 }
269 else {
270 *alpha_lo = LLVMGetUndef(type32);
271 *alpha_hi = LLVMGetUndef(type32);
272 *colors = LLVMBuildExtractElement(builder, elem,
273 lp_build_const_int32(gallivm, 0), "");
274 *codewords = LLVMBuildExtractElement(builder, elem,
275 lp_build_const_int32(gallivm, 1), "");
276 }
277 }
278 else {
279 LLVMValueRef tmp[4], cc01, cc23;
280 struct lp_type lp_type32, lp_type64, lp_type32dxt;
281 memset(&lp_type32, 0, sizeof lp_type32);
282 lp_type32.width = 32;
283 lp_type32.length = length;
284 memset(&lp_type64, 0, sizeof lp_type64);
285 lp_type64.width = 64;
286 lp_type64.length = length/2;
287
288 if (block_bits == 128) {
289 if (length == 8) {
290 for (i = 0; i < 4; ++i) {
291 tmp[0] = elems[i];
292 tmp[1] = elems[i+4];
293 elems[i] = lp_build_concat(gallivm, tmp, lp_type32dxt, 2);
294 }
295 }
296 lp_build_transpose_aos(gallivm, lp_type32, elems, tmp);
297 *colors = tmp[2];
298 *codewords = tmp[3];
299 *alpha_lo = tmp[0];
300 *alpha_hi = tmp[1];
301 } else {
302 LLVMTypeRef type64_vec = LLVMVectorType(type64, length/2);
303 LLVMTypeRef type32_vec = LLVMVectorType(type32, length);
304
305 for (i = 0; i < length; ++i) {
306 /* no-op shuffle */
307 elems[i] = LLVMBuildShuffleVector(builder, elems[i],
308 LLVMGetUndef(type32dxt),
309 lp_build_const_extend_shuffle(gallivm, 2, 4), "");
310 }
311 if (length == 8) {
312 for (i = 0; i < 4; ++i) {
313 tmp[0] = elems[i];
314 tmp[1] = elems[i+4];
315 elems[i] = lp_build_concat(gallivm, tmp, lp_type32, 2);
316 }
317 }
318 cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
319 cc23 = lp_build_interleave2_half(gallivm, lp_type32, elems[2], elems[3], 0);
320 cc01 = LLVMBuildBitCast(builder, cc01, type64_vec, "");
321 cc23 = LLVMBuildBitCast(builder, cc23, type64_vec, "");
322 *colors = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 0);
323 *codewords = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 1);
324 *colors = LLVMBuildBitCast(builder, *colors, type32_vec, "");
325 *codewords = LLVMBuildBitCast(builder, *codewords, type32_vec, "");
326 }
327 }
328 }
329
330 /** Convert from <n x i32> containing 2 x n rgb565 colors
331 * to 2 <n x i32> rgba8888 colors
332 * This is the most optimized version I can think of
333 * should be nearly as fast as decoding only one color
334 * NOTE: alpha channel will be set to 0
335 * @param colors is a <n x i32> vector containing the rgb565 colors
336 */
static void
color_expand2_565_to_8888(struct gallivm_state *gallivm,
                          unsigned n,
                          LLVMValueRef colors,
                          LLVMValueRef *color0,
                          LLVMValueRef *color1)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef r, g, b, rblo, glo;
   LLVMValueRef rgblomask, rb, rgb0, rgb1;
   struct lp_type type, type16, type8;

   assert(n > 1);

   /* n x i32: each element holds two packed rgb565 colors */
   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   /* same bits viewed as 2n x i16: one rgb565 color per element */
   memset(&type16, 0, sizeof type16);
   type16.width = 16;
   type16.length = 2 * n;

   /* same bits viewed as 4n x i8 for the byte interleaves below */
   memset(&type8, 0, sizeof type8);
   type8.width = 8;
   type8.length = 4 * n;

   rgblomask = lp_build_const_int_vec(gallivm, type16, 0x0707);
   colors = LLVMBuildBitCast(builder, colors,
                             lp_build_vec_type(gallivm, type16), "");
   /* move r into low 8 bits, b into high 8 bits, g into another reg (low bits)
    * make sure low bits of r are zero - could use AND but requires constant */
   r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
   r = LLVMBuildShl(builder, r, lp_build_const_int_vec(gallivm, type16, 3), "");
   b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
   rb = LLVMBuildOr(builder, r, b, "");
   /* replicate the top 3 bits of r/b into their low bits (5->8 bit expand) */
   rblo = LLVMBuildLShr(builder, rb, lp_build_const_int_vec(gallivm, type16, 5), "");
   /* don't have byte shift hence need mask */
   rblo = LLVMBuildAnd(builder, rblo, rgblomask, "");
   rb = LLVMBuildOr(builder, rb, rblo, "");

   /* make sure low bits of g are zero */
   g = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type16, 0x07e0), "");
   g = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 3), "");
   /* replicate top 2 bits of g into its low bits (6->8 bit expand) */
   glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 6), "");
   g = LLVMBuildOr(builder, g, glo, "");

   /* byte-interleave r/b with g to form r,g,b,0 quads */
   rb = LLVMBuildBitCast(builder, rb, lp_build_vec_type(gallivm, type8), "");
   g = LLVMBuildBitCast(builder, g, lp_build_vec_type(gallivm, type8), "");
   rgb0 = lp_build_interleave2_half(gallivm, type8, rb, g, 0);
   rgb1 = lp_build_interleave2_half(gallivm, type8, rb, g, 1);

   rgb0 = LLVMBuildBitCast(builder, rgb0, lp_build_vec_type(gallivm, type), "");
   rgb1 = LLVMBuildBitCast(builder, rgb1, lp_build_vec_type(gallivm, type), "");

   /* rgb0 is rgb00, rgb01, rgb10, rgb11
    * instead of rgb00, rgb10, rgb20, rgb30 hence need reshuffle
    * on x86 this _should_ just generate one shufps...
    */
   *color0 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 0);
   *color1 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 1);
}
398
399
400 /** Convert from <n x i32> containing rgb565 colors
401 * (in first 16 bits) to <n x i32> rgba8888 colors
402 * bits 16-31 MBZ
403 * NOTE: alpha channel will be set to 0
404 * @param colors is a <n x i32> vector containing the rgb565 colors
405 */
406 static LLVMValueRef
407 color_expand_565_to_8888(struct gallivm_state *gallivm,
408 unsigned n,
409 LLVMValueRef colors)
410 {
411 LLVMBuilderRef builder = gallivm->builder;
412 LLVMValueRef rgba, r, g, b, rgblo, glo;
413 LLVMValueRef rbhimask, g6mask, rgblomask;
414 struct lp_type type;
415 memset(&type, 0, sizeof type);
416 type.width = 32;
417 type.length = n;
418
419 /* color expansion:
420 * first extract and shift colors into their final locations
421 * (high bits - low bits zero at this point)
422 * then replicate highest bits to the lowest bits
423 * note rb replication can be done in parallel but not g
424 * (different shift)
425 * r5mask = 0xf800, g6mask = 0x07e0, b5mask = 0x001f
426 * rhigh = 8, ghigh = 5, bhigh = 19
427 * rblow = 5, glow = 6
428 * rgblowmask = 0x00070307
429 * r = colors >> rhigh
430 * b = colors << bhigh
431 * g = (colors & g6mask) << ghigh
432 * rb = (r | b) rbhimask
433 * rbtmp = rb >> rblow
434 * gtmp = rb >> glow
435 * rbtmp = rbtmp | gtmp
436 * rbtmp = rbtmp & rgblowmask
437 * rgb = rb | g | rbtmp
438 */
439 g6mask = lp_build_const_int_vec(gallivm, type, 0x07e0);
440 rbhimask = lp_build_const_int_vec(gallivm, type, 0x00f800f8);
441 rgblomask = lp_build_const_int_vec(gallivm, type, 0x00070307);
442
443 r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 8), "");
444 b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type, 19), "");
445 g = LLVMBuildAnd(builder, colors, g6mask, "");
446 g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 5), "");
447 rgba = LLVMBuildOr(builder, r, b, "");
448 rgba = LLVMBuildAnd(builder, rgba, rbhimask, "");
449 rgblo = LLVMBuildLShr(builder, rgba, lp_build_const_int_vec(gallivm, type, 5), "");
450 glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type, 6), "");
451 rgblo = LLVMBuildOr(builder, rgblo, glo, "");
452 rgblo = LLVMBuildAnd(builder, rgblo, rgblomask, "");
453 rgba = LLVMBuildOr(builder, rgba, g, "");
454 rgba = LLVMBuildOr(builder, rgba, rgblo, "");
455
456 return rgba;
457 }
458
459
/*
 * Average two byte vectors. (Will always round up.)
 * On llvm < 6 this emits the x86 pavgb intrinsic directly; on newer llvm
 * (which auto-upgraded/removed the pavg intrinsics) it emits the exact
 * zext/add/add-one/lshr/trunc pattern the upgrade produces, relying on the
 * backend to pattern-match it back into pavgb - do not restructure it.
 */
static LLVMValueRef
lp_build_pavgb(struct lp_build_context *bld8,
               LLVMValueRef v0,
               LLVMValueRef v1)
{
   struct gallivm_state *gallivm = bld8->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   assert(bld8->type.width == 8);
   assert(bld8->type.length == 16 || bld8->type.length == 32);
   if (HAVE_LLVM < 0x0600) {
      LLVMValueRef intrargs[2];
      /* 32-byte vector -> avx2 variant, else sse2 */
      char *intr_name = bld8->type.length == 32 ? "llvm.x86.avx2.pavg.b" :
                                                  "llvm.x86.sse2.pavg.b";
      intrargs[0] = v0;
      intrargs[1] = v1;
      return lp_build_intrinsic(builder, intr_name,
                                bld8->vec_type, intrargs, 2, 0);
   } else {
      /*
       * Must match llvm's autoupgrade of pavg.b intrinsic to be useful.
       * You better hope the backend code manages to detect the pattern, and
       * the pattern doesn't change there...
       */
      struct lp_type type_ext = bld8->type;
      LLVMTypeRef vec_type_ext;
      LLVMValueRef res;
      LLVMValueRef ext_one;
      /* widen to 16 bit so the +1 and shift cannot overflow */
      type_ext.width = 16;
      vec_type_ext = lp_build_vec_type(gallivm, type_ext);
      ext_one = lp_build_const_vec(gallivm, type_ext, 1);

      v0 = LLVMBuildZExt(builder, v0, vec_type_ext, "");
      v1 = LLVMBuildZExt(builder, v1, vec_type_ext, "");
      /* (v0 + v1 + 1) >> 1, i.e. average rounded up */
      res = LLVMBuildAdd(builder, v0, v1, "");
      res = LLVMBuildAdd(builder, res, ext_one, "");
      res = LLVMBuildLShr(builder, res, ext_one, "");
      res = LLVMBuildTrunc(builder, res, bld8->vec_type, "");
      return res;
   }
}
503
504 /**
505 * Calculate 1/3(v1-v0) + v0
506 * and 2*1/3(v1-v0) + v0
507 */
static void
lp_build_lerp23(struct lp_build_context *bld,
                LLVMValueRef v0,
                LLVMValueRef v1,
                LLVMValueRef *res0,
                LLVMValueRef *res1)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMValueRef x, x_lo, x_hi, delta_lo, delta_hi;
   LLVMValueRef mul_lo, mul_hi, v0_lo, v0_hi, v1_lo, v1_hi, tmp;
   const struct lp_type type = bld->type;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_type i16_type = lp_wider_type(type);
   struct lp_build_context bld2;

   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));
   /* only works on plain 8bit integer vectors */
   assert(!type.floating && !type.fixed && !type.norm && type.width == 8);

   lp_build_context_init(&bld2, gallivm, i16_type);
   /* delta can be negative: need signed 16bit arithmetic */
   bld2.type.sign = TRUE;
   /* fixed-point 1/3 weight: 255/3 = 85, so (85*delta)>>8 ~= delta/3 */
   x = lp_build_const_int_vec(gallivm, bld->type, 255*1/3);

   /* FIXME: use native avx256 unpack/pack */
   lp_build_unpack2(gallivm, type, i16_type, x, &x_lo, &x_hi);
   lp_build_unpack2(gallivm, type, i16_type, v0, &v0_lo, &v0_hi);
   lp_build_unpack2(gallivm, type, i16_type, v1, &v1_lo, &v1_hi);
   delta_lo = lp_build_sub(&bld2, v1_lo, v0_lo);
   delta_hi = lp_build_sub(&bld2, v1_hi, v0_hi);

   /* one multiply serves both 1/3 (>>8) and 2/3 (>>7) results */
   mul_lo = LLVMBuildMul(builder, x_lo, delta_lo, "");
   mul_hi = LLVMBuildMul(builder, x_hi, delta_hi, "");

   x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 8), "");
   x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 8), "");
   /* lerp optimization: pack now, do add afterwards */
   tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
   *res0 = lp_build_add(bld, tmp, v0);

   x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 7), "");
   x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 7), "");
   /* unlike above still need mask (but add still afterwards). */
   x_lo = LLVMBuildAnd(builder, x_lo, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
   x_hi = LLVMBuildAnd(builder, x_hi, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
   tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
   *res1 = lp_build_add(bld, tmp, v0);
}
555
556 /**
557 * Convert from <n x i64> s3tc dxt1 to <4n x i8> RGBA AoS
558 * @param colors is a <n x i32> vector with n x 2x16bit colors
559 * @param codewords is a <n x i32> vector containing the codewords
560 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
561 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
562 */
static LLVMValueRef
s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm,
                           unsigned n,
                           enum pipe_format format,
                           LLVMValueRef colors,
                           LLVMValueRef codewords,
                           LLVMValueRef i,
                           LLVMValueRef j)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef color0, color1, color2, color3, color2_2, color3_2;
   LLVMValueRef rgba, a, colors0, colors1, col0, col1, const2;
   LLVMValueRef bit_pos, sel_mask, sel_lo, sel_hi, indices;
   struct lp_type type, type8;
   struct lp_build_context bld8, bld32;
   boolean is_dxt1_variant = format_dxt1_variant(format);

   /* n x i32 view (one texel per element) */
   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   /* same data as 4n x i8 (one channel per element) */
   memset(&type8, 0, sizeof type8);
   type8.width = 8;
   type8.length = 4*n;

   assert(lp_check_value(type, i));
   assert(lp_check_value(type, j));

   /* opaque alpha byte, OR'd in where the format requires alpha == 255 */
   a = lp_build_const_int_vec(gallivm, type, 0xff000000);

   lp_build_context_init(&bld32, gallivm, type);
   lp_build_context_init(&bld8, gallivm, type8);

   /*
    * works as follows:
    * - expand color0/color1 to rgba8888
    * - calculate color2/3 (interpolation) according to color0 < color1 rules
    * - calculate color2/3 according to color0 >= color1 rules
    * - do selection of color2/3 according to comparison of color0/1
    * - extract indices (vector shift).
    * - use compare/select to select the correct color. Since we have 2bit
    *   indices (and 4 colors), needs at least three compare/selects.
    */
   /*
    * expand the two colors
    */
   col0 = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type, 0x0000ffff), "");
   col1 = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 16), "");
   if (n > 1) {
      /* vector path expands both packed 565 colors at once */
      color_expand2_565_to_8888(gallivm, n, colors, &color0, &color1);
   }
   else {
      color0 = color_expand_565_to_8888(gallivm, n, col0);
      color1 = color_expand_565_to_8888(gallivm, n, col1);
   }

   /*
    * interpolate colors
    * color2_1 is 2/3 color0 + 1/3 color1
    * color3_1 is 1/3 color0 + 2/3 color1
    * color2_2 is 1/2 color0 + 1/2 color1
    * color3_2 is 0
    */

   colors0 = LLVMBuildBitCast(builder, color0, bld8.vec_type, "");
   colors1 = LLVMBuildBitCast(builder, color1, bld8.vec_type, "");
   /* can combine 2 lerps into one mostly - still looks expensive enough. */
   lp_build_lerp23(&bld8, colors0, colors1, &color2, &color3);
   color2 = LLVMBuildBitCast(builder, color2, bld32.vec_type, "");
   color3 = LLVMBuildBitCast(builder, color3, bld32.vec_type, "");

   /* dxt3/5 always use 4-color encoding */
   if (is_dxt1_variant) {
      /* fix up alpha */
      if (format == PIPE_FORMAT_DXT1_RGBA ||
          format == PIPE_FORMAT_DXT1_SRGBA) {
         color0 = LLVMBuildOr(builder, color0, a, "");
         color1 = LLVMBuildOr(builder, color1, a, "");
         color3 = LLVMBuildOr(builder, color3, a, "");
      }
      /*
       * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1.
       * Much cheaper (but we don't care that much if n == 1).
       */
      if ((util_cpu_caps.has_sse2 && n == 4) ||
          (util_cpu_caps.has_avx2 && n == 8)) {
         /* 3-color mode: color2 = (color0 + color1) / 2 via pavgb */
         color2_2 = lp_build_pavgb(&bld8, colors0, colors1);
         color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
      }
      else {
         struct lp_type i16_type = lp_wider_type(type8);
         struct lp_build_context bld2;
         LLVMValueRef v0_lo, v0_hi, v1_lo, v1_hi, addlo, addhi;

         lp_build_context_init(&bld2, gallivm, i16_type);
         bld2.type.sign = TRUE;

         /*
          * This isn't as expensive as it looks (the unpack is the same as
          * for lerp23), with correct rounding.
          * (Note that while rounding is correct, this will always round down,
          * whereas pavgb will always round up.)
          */
         /* FIXME: use native avx256 unpack/pack */
         lp_build_unpack2(gallivm, type8, i16_type, colors0, &v0_lo, &v0_hi);
         lp_build_unpack2(gallivm, type8, i16_type, colors1, &v1_lo, &v1_hi);

         addlo = lp_build_add(&bld2, v0_lo, v1_lo);
         addhi = lp_build_add(&bld2, v0_hi, v1_hi);
         addlo = LLVMBuildLShr(builder, addlo,
                               lp_build_const_int_vec(gallivm, i16_type, 1), "");
         addhi = LLVMBuildLShr(builder, addhi,
                               lp_build_const_int_vec(gallivm, i16_type, 1), "");
         color2_2 = lp_build_pack2(gallivm, i16_type, type8, addlo, addhi);
         color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
      }
      /* 3-color mode: color3 is transparent black */
      color3_2 = lp_build_const_int_vec(gallivm, type, 0);

      /* select between colors2/3 */
      /* signed compare is faster saves some xors */
      type.sign = TRUE;
      sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, col0, col1);
      color2 = lp_build_select(&bld32, sel_mask, color2, color2_2);
      color3 = lp_build_select(&bld32, sel_mask, color3, color3_2);
      type.sign = FALSE;

      if (format == PIPE_FORMAT_DXT1_RGBA ||
          format == PIPE_FORMAT_DXT1_SRGBA) {
         color2 = LLVMBuildOr(builder, color2, a, "");
      }
   }

   const2 = lp_build_const_int_vec(gallivm, type, 2);
   /* extract 2-bit index values: bit_pos = 2 * (4j + i) */
   bit_pos = LLVMBuildShl(builder, j, const2, "");
   bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
   bit_pos = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
   /*
    * NOTE: This innocent looking shift is very expensive with x86/ssex.
    * Shifts with per-element shift count get roughly translated to
    * extract (count), extract (value), shift, move (back to xmm), unpack
    * per element!
    * So about 20 instructions here for 4xi32.
    * Newer llvm versions (3.7+) will not do extract/insert but use a
    * a couple constant count vector shifts plus shuffles. About same
    * amount of instructions unfortunately...
    * Would get much worse with 8xi16 even...
    * We could actually do better here:
    * - subtract bit_pos from 128+30, shl 23, convert float to int...
    * - now do mul with codewords followed by shr 30...
    * But requires 32bit->32bit mul, sse41 only (well that's emulatable
    * with 2 32bit->64bit muls...) and not exactly cheap
    * AVX2, of course, fixes this nonsense.
    */
   indices = LLVMBuildLShr(builder, codewords, bit_pos, "");

   /* finally select the colors: low index bit picks within each pair,
    * high index bit picks between the pairs */
   sel_lo = LLVMBuildAnd(builder, indices, bld32.one, "");
   sel_lo = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_lo, bld32.one);
   color0 = lp_build_select(&bld32, sel_lo, color1, color0);
   color2 = lp_build_select(&bld32, sel_lo, color3, color2);
   sel_hi = LLVMBuildAnd(builder, indices, const2, "");
   sel_hi = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_hi, const2);
   rgba = lp_build_select(&bld32, sel_hi, color2, color0);

   /* fix up alpha */
   if (format == PIPE_FORMAT_DXT1_RGB ||
       format == PIPE_FORMAT_DXT1_SRGB) {
      rgba = LLVMBuildOr(builder, rgba, a, "");
   }
   return LLVMBuildBitCast(builder, rgba, bld8.vec_type, "");
}
735
736
737 static LLVMValueRef
738 s3tc_dxt1_to_rgba_aos(struct gallivm_state *gallivm,
739 unsigned n,
740 enum pipe_format format,
741 LLVMValueRef colors,
742 LLVMValueRef codewords,
743 LLVMValueRef i,
744 LLVMValueRef j)
745 {
746 return s3tc_dxt1_full_to_rgba_aos(gallivm, n, format,
747 colors, codewords, i, j);
748 }
749
750
751 /**
752 * Convert from <n x i128> s3tc dxt3 to <4n x i8> RGBA AoS
753 * @param colors is a <n x i32> vector with n x 2x16bit colors
754 * @param codewords is a <n x i32> vector containing the codewords
755 * @param alphas is a <n x i64> vector containing the alpha values
756 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
757 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
758 */
static LLVMValueRef
s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm,
                      unsigned n,
                      enum pipe_format format,
                      LLVMValueRef colors,
                      LLVMValueRef codewords,
                      LLVMValueRef alpha_low,
                      LLVMValueRef alpha_hi,
                      LLVMValueRef i,
                      LLVMValueRef j)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef rgba, tmp, tmp2;
   LLVMValueRef bit_pos, sel_mask;
   struct lp_type type, type8;
   struct lp_build_context bld;

   /* n x i32 (one texel per element) */
   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   /* 4n x i8 view for the final result */
   memset(&type8, 0, sizeof type8);
   type8.width = 8;
   type8.length = n*4;

   assert(lp_check_value(type, i));
   assert(lp_check_value(type, j));

   lp_build_context_init(&bld, gallivm, type);

   /* the color part decodes exactly like dxt1 (4-color mode forced) */
   rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
                                colors, codewords, i, j);

   rgba = LLVMBuildBitCast(builder, rgba, bld.vec_type, "");

   /*
    * Extract alpha values. Since we now need to select from
    * which 32bit vector values are fetched, construct selection
    * mask from highest bit of bit_pos, and use select, then shift
    * according to the bit_pos (without the highest bit).
    * Note this is pointless for n == 1 case. Could just
    * directly use 64bit arithmetic if we'd extract 64bit
    * alpha value instead of 2x32...
    */
   /* pos = 4*(4j+i) */
   bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
   bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
   bit_pos = LLVMBuildShl(builder, bit_pos,
                          lp_build_const_int_vec(gallivm, type, 2), "");
   /* bit 5 of bit_pos decides low vs high alpha dword:
    * 0 -> mask becomes all-ones (selects alpha_low), 1 -> zero (alpha_hi) */
   sel_mask = LLVMBuildLShr(builder, bit_pos,
                            lp_build_const_int_vec(gallivm, type, 5), "");
   sel_mask = LLVMBuildSub(builder, sel_mask, bld.one, "");
   tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi);
   /* clear bit 5 so the remaining shift count is within 0..31 */
   bit_pos = LLVMBuildAnd(builder, bit_pos,
                          lp_build_const_int_vec(gallivm, type, 0xffffffdf), "");
   /* Warning: slow shift with per element count */
   /*
    * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
    * to select the right byte with pshufb. Then for the remaining one bit
    * just do shift/select.
    */
   tmp = LLVMBuildLShr(builder, tmp, bit_pos, "");

   /* combined expand from a4 to a8 and shift into position */
   tmp = LLVMBuildShl(builder, tmp, lp_build_const_int_vec(gallivm, type, 28), "");
   tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(gallivm, type, 4), "");
   tmp = LLVMBuildOr(builder, tmp, tmp2, "");

   /* merge alpha into the rgb result (alpha byte was 0 there) */
   rgba = LLVMBuildOr(builder, tmp, rgba, "");

   return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
}
831
/*
 * Interpolate dxt5 alpha: ainterp = alpha0 + (alpha1 - alpha0) * (code-1)/w,
 * where w is 7 or 5 per element depending on sel_mask (mask set selects the
 * 7-step mode weight). Elements whose code selects an endpoint or the
 * const 0/255 values produce garbage here and are expected to be overridden
 * by the caller.
 */
static LLVMValueRef
lp_build_lerpdxta(struct gallivm_state *gallivm,
                  LLVMValueRef alpha0,
                  LLVMValueRef alpha1,
                  LLVMValueRef code,
                  LLVMValueRef sel_mask,
                  unsigned n)
{
   /*
    * note we're doing lerp in 16bit since 32bit pmulld is only available in sse41
    * (plus pmullw is actually faster...)
    * we just pretend our 32bit values (which are really only 8bit) are 16bits.
    * Note that this is obviously a disaster for the scalar case.
    */
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef delta, ainterp;
   LLVMValueRef weight5, weight7, weight;
   struct lp_type type32, type16, type8;
   struct lp_build_context bld16;

   /* three views of the same bits: n x i32, 2n x i16 (signed), 4n x i8 */
   memset(&type32, 0, sizeof type32);
   type32.width = 32;
   type32.length = n;
   memset(&type16, 0, sizeof type16);
   type16.width = 16;
   type16.length = 2*n;
   type16.sign = TRUE;
   memset(&type8, 0, sizeof type8);
   type8.width = 8;
   type8.length = 4*n;

   lp_build_context_init(&bld16, gallivm, type16);
   /* 255/7 is a bit off - increase accuracy at the expense of shift later */
   sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
   weight5 = lp_build_const_int_vec(gallivm, type16, 255*64/5);
   weight7 = lp_build_const_int_vec(gallivm, type16, 255*64/7);
   weight = lp_build_select(&bld16, sel_mask, weight7, weight5);

   alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
   alpha1 = LLVMBuildBitCast(builder, alpha1, bld16.vec_type, "");
   code = LLVMBuildBitCast(builder, code, bld16.vec_type, "");
   /* we'll get garbage in the elements which had code 0 (or larger than 5 or 7)
      but we don't care */
   code = LLVMBuildSub(builder, code, bld16.one, "");

   /* weight = (code-1) * 255*64/w, then drop the extra *64 scale */
   weight = LLVMBuildMul(builder, weight, code, "");
   weight = LLVMBuildLShr(builder, weight,
                          lp_build_const_int_vec(gallivm, type16, 6), "");

   delta = LLVMBuildSub(builder, alpha1, alpha0, "");

   /* (delta * weight) >> 8: weight was scaled by 255 so this is /255 */
   ainterp = LLVMBuildMul(builder, delta, weight, "");
   ainterp = LLVMBuildLShr(builder, ainterp,
                           lp_build_const_int_vec(gallivm, type16, 8), "");

   /* add alpha0 back in 8bit (values fit in a byte) and return as i32 view */
   ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type8), "");
   alpha0 = LLVMBuildBitCast(builder, alpha0, lp_build_vec_type(gallivm, type8), "");
   ainterp = LLVMBuildAdd(builder, alpha0, ainterp, "");
   ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type32), "");

   return ainterp;
}
894
/**
 * Convert from <n x i128> s3tc dxt5 to <4n x i8> RGBA AoS
 * @param colors is a <n x i32> vector with n x 2x16bit colors
 * @param codewords is a <n x i32> vector containing the codewords
 * @param alpha_lo is a <n x i32> vector containing the low 32 bits of the
 *        64bit alpha block (the two 8bit alpha endpoints plus the first codes)
 * @param alpha_hi is a <n x i32> vector containing the high 32 bits of the
 *        64bit alpha block (the remaining 3bit alpha codes)
 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
 */
903 static LLVMValueRef
904 s3tc_dxt5_full_to_rgba_aos(struct gallivm_state *gallivm,
905 unsigned n,
906 enum pipe_format format,
907 LLVMValueRef colors,
908 LLVMValueRef codewords,
909 LLVMValueRef alpha_lo,
910 LLVMValueRef alpha_hi,
911 LLVMValueRef i,
912 LLVMValueRef j)
913 {
914 LLVMBuilderRef builder = gallivm->builder;
915 LLVMValueRef rgba, tmp, alpha0, alpha1, alphac, alphac0, bit_pos, shift;
916 LLVMValueRef sel_mask, tmp_mask, alpha, alpha64, code_s;
917 LLVMValueRef mask6, mask7, ainterp;
918 LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
919 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
920 struct lp_type type, type8;
921 struct lp_build_context bld32;
922
923 memset(&type, 0, sizeof type);
924 type.width = 32;
925 type.length = n;
926
927 memset(&type8, 0, sizeof type8);
928 type8.width = 8;
929 type8.length = n*4;
930
931 assert(lp_check_value(type, i));
932 assert(lp_check_value(type, j));
933
934 lp_build_context_init(&bld32, gallivm, type);
935
936 assert(lp_check_value(type, i));
937 assert(lp_check_value(type, j));
938
939 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
940 colors, codewords, i, j);
941
942 rgba = LLVMBuildBitCast(builder, rgba, bld32.vec_type, "");
943
944 /* this looks pretty complex for vectorization:
945 * extract a0/a1 values
946 * extract code
947 * select weights for interpolation depending on a0 > a1
948 * mul weights by code - 1
949 * lerp a0/a1/weights
950 * use selects for getting either a0, a1, interp a, interp a/0.0, interp a/1.0
951 */
952
953 alpha0 = LLVMBuildAnd(builder, alpha_lo,
954 lp_build_const_int_vec(gallivm, type, 0xff), "");
955 alpha1 = LLVMBuildLShr(builder, alpha_lo,
956 lp_build_const_int_vec(gallivm, type, 8), "");
957 alpha1 = LLVMBuildAnd(builder, alpha1,
958 lp_build_const_int_vec(gallivm, type, 0xff), "");
959
960 /* pos = 3*(4j+i) */
961 bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
962 bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
963 tmp = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
964 bit_pos = LLVMBuildAdd(builder, bit_pos, tmp, "");
965 /* get rid of first 2 bytes - saves shifts of alpha_lo/hi */
966 bit_pos = LLVMBuildAdd(builder, bit_pos,
967 lp_build_const_int_vec(gallivm, type, 16), "");
968
969 if (n == 1) {
970 struct lp_type type64;
971 memset(&type64, 0, sizeof type64);
972 type64.width = 64;
973 type64.length = 1;
974 /* This is pretty pointless could avoid by just directly extracting
975 64bit in the first place but makes it more complicated elsewhere */
976 alpha_lo = LLVMBuildZExt(builder, alpha_lo, i64t, "");
977 alpha_hi = LLVMBuildZExt(builder, alpha_hi, i64t, "");
978 alphac0 = LLVMBuildShl(builder, alpha_hi,
979 lp_build_const_int_vec(gallivm, type64, 32), "");
980 alphac0 = LLVMBuildOr(builder, alpha_lo, alphac0, "");
981
982 shift = LLVMBuildZExt(builder, bit_pos, i64t, "");
983 alphac0 = LLVMBuildLShr(builder, alphac0, shift, "");
984 alphac0 = LLVMBuildTrunc(builder, alphac0, i32t, "");
985 alphac = LLVMBuildAnd(builder, alphac0,
986 lp_build_const_int_vec(gallivm, type, 0x7), "");
987 }
988 else {
989 /*
990 * Using non-native vector length here (actually, with avx2 and
991 * n == 4 llvm will indeed expand to ymm regs...)
992 * At least newer llvm versions handle that ok.
993 * llvm 3.7+ will even handle the emulated 64bit shift with variable
994 * shift count without extraction (and it's actually easier to
995 * emulate than the 32bit one).
996 */
997 alpha64 = LLVMBuildShuffleVector(builder, alpha_lo, alpha_hi,
998 lp_build_const_unpackx2_shuffle(gallivm, n), "");
999
1000 alpha64 = LLVMBuildBitCast(builder, alpha64, LLVMVectorType(i64t, n), "");
1001 shift = LLVMBuildZExt(builder, bit_pos, LLVMVectorType(i64t, n), "");
1002 alphac = LLVMBuildLShr(builder, alpha64, shift, "");
1003 alphac = LLVMBuildTrunc(builder, alphac, bld32.vec_type, "");
1004
1005 alphac = LLVMBuildAnd(builder, alphac,
1006 lp_build_const_int_vec(gallivm, type, 0x7), "");
1007 }
1008
1009 /* signed compare is faster saves some xors */
1010 type.sign = TRUE;
1011 /* alpha0 > alpha1 selection */
1012 sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
1013 alpha0, alpha1);
1014 ainterp = lp_build_lerpdxta(gallivm, alpha0, alpha1, alphac, sel_mask, n);
1015
1016 /*
1017 * if a0 > a1 then we select a0 for case 0, a1 for case 1, interp otherwise.
1018 * else we select a0 for case 0, a1 for case 1,
1019 * interp for case 2-5, 00 for 6 and 0xff(ffffff) for 7
1020 * a = (c == 0) ? a0 : a1
1021 * a = (c > 1) ? ainterp : a
1022 * Finally handle case 6/7 for !(a0 > a1)
1023 * a = (!(a0 > a1) && c == 6) ? 0 : a (andnot with mask)
1024 * a = (!(a0 > a1) && c == 7) ? 0xffffffff : a (or with mask)
1025 */
1026 tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1027 alphac, bld32.zero);
1028 alpha = lp_build_select(&bld32, tmp_mask, alpha0, alpha1);
1029 tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
1030 alphac, bld32.one);
1031 alpha = lp_build_select(&bld32, tmp_mask, ainterp, alpha);
1032
1033 code_s = LLVMBuildAnd(builder, alphac,
1034 LLVMBuildNot(builder, sel_mask, ""), "");
1035 mask6 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1036 code_s, lp_build_const_int_vec(gallivm, type, 6));
1037 mask7 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1038 code_s, lp_build_const_int_vec(gallivm, type, 7));
1039 alpha = LLVMBuildAnd(builder, alpha, LLVMBuildNot(builder, mask6, ""), "");
1040 alpha = LLVMBuildOr(builder, alpha, mask7, "");
1041
1042 alpha = LLVMBuildShl(builder, alpha, lp_build_const_int_vec(gallivm, type, 24), "");
1043 rgba = LLVMBuildOr(builder, alpha, rgba, "");
1044
1045 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
1046 }
1047
1048
1049 static void
1050 lp_build_gather_s3tc_simple_scalar(struct gallivm_state *gallivm,
1051 const struct util_format_description *format_desc,
1052 LLVMValueRef *dxt_block,
1053 LLVMValueRef ptr)
1054 {
1055 LLVMBuilderRef builder = gallivm->builder;
1056 unsigned block_bits = format_desc->block.bits;
1057 LLVMValueRef elem, shuf;
1058 LLVMTypeRef type32 = LLVMIntTypeInContext(gallivm->context, 32);
1059 LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, block_bits);
1060 LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
1061 LLVMTypeRef type32_4 = LLVMVectorType(type32, 4);
1062
1063 assert(block_bits == 64 || block_bits == 128);
1064
1065 ptr = LLVMBuildBitCast(builder, ptr, src_ptr_type, "");
1066 elem = LLVMBuildLoad(builder, ptr, "");
1067
1068 if (block_bits == 128) {
1069 /* just return block as is */
1070 *dxt_block = LLVMBuildBitCast(builder, elem, type32_4, "");
1071 }
1072 else {
1073 LLVMTypeRef type32_2 = LLVMVectorType(type32, 2);
1074 shuf = lp_build_const_extend_shuffle(gallivm, 2, 4);
1075 elem = LLVMBuildBitCast(builder, elem, type32_2, "");
1076 *dxt_block = LLVMBuildShuffleVector(builder, elem,
1077 LLVMGetUndef(type32_2), shuf, "");
1078 }
1079 }
1080
1081
1082 static void
1083 s3tc_store_cached_block(struct gallivm_state *gallivm,
1084 LLVMValueRef *col,
1085 LLVMValueRef tag_value,
1086 LLVMValueRef hash_index,
1087 LLVMValueRef cache)
1088 {
1089 LLVMBuilderRef builder = gallivm->builder;
1090 LLVMValueRef ptr, indices[3];
1091 LLVMTypeRef type_ptr4x32;
1092 unsigned count;
1093
1094 type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
1095 indices[0] = lp_build_const_int32(gallivm, 0);
1096 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1097 indices[2] = hash_index;
1098 ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
1099 LLVMBuildStore(builder, tag_value, ptr);
1100
1101 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
1102 hash_index = LLVMBuildMul(builder, hash_index,
1103 lp_build_const_int32(gallivm, 16), "");
1104 for (count = 0; count < 4; count++) {
1105 indices[2] = hash_index;
1106 ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
1107 ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
1108 LLVMBuildStore(builder, col[count], ptr);
1109 hash_index = LLVMBuildAdd(builder, hash_index,
1110 lp_build_const_int32(gallivm, 4), "");
1111 }
1112 }
1113
1114 static LLVMValueRef
1115 s3tc_lookup_cached_pixel(struct gallivm_state *gallivm,
1116 LLVMValueRef ptr,
1117 LLVMValueRef index)
1118 {
1119 LLVMBuilderRef builder = gallivm->builder;
1120 LLVMValueRef member_ptr, indices[3];
1121
1122 indices[0] = lp_build_const_int32(gallivm, 0);
1123 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
1124 indices[2] = index;
1125 member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
1126 return LLVMBuildLoad(builder, member_ptr, "cache_data");
1127 }
1128
1129 static LLVMValueRef
1130 s3tc_lookup_tag_data(struct gallivm_state *gallivm,
1131 LLVMValueRef ptr,
1132 LLVMValueRef index)
1133 {
1134 LLVMBuilderRef builder = gallivm->builder;
1135 LLVMValueRef member_ptr, indices[3];
1136
1137 indices[0] = lp_build_const_int32(gallivm, 0);
1138 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1139 indices[2] = index;
1140 member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
1141 return LLVMBuildLoad(builder, member_ptr, "tag_data");
1142 }
1143
1144 #if LP_BUILD_FORMAT_CACHE_DEBUG
1145 static void
1146 s3tc_update_cache_access(struct gallivm_state *gallivm,
1147 LLVMValueRef ptr,
1148 unsigned count,
1149 unsigned index)
1150 {
1151 LLVMBuilderRef builder = gallivm->builder;
1152 LLVMValueRef member_ptr, cache_access;
1153
1154 assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
1155 index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
1156
1157 member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, "");
1158 cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access");
1159 cache_access = LLVMBuildAdd(builder, cache_access,
1160 LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
1161 count, 0), "");
1162 LLVMBuildStore(builder, cache_access, member_ptr);
1163 }
1164 #endif
1165
/**
 * Calculate 1/3(v1-v0) + v0 and 2*1/3(v1-v0) + v0.
 * The lerp is performed between the first 2 32bit colors
 * in the source vector, both results are returned packed in result vector.
 *
 * @param bld  8bit int vector (<16 x i8>) build context
 * @param v01  vector with col0 in 32bit element 0, col1 in element 1
 * @return     vector with the 1/3 lerp in 32bit element 0 and the 2/3 lerp
 *             in element 1 (upper elements are not valid)
 */
static LLVMValueRef
lp_build_lerp23_single(struct lp_build_context *bld,
                       LLVMValueRef v01)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMValueRef x, mul, delta, res, v0, v1, elems[8];
   const struct lp_type type = bld->type;
   LLVMBuilderRef builder = bld->gallivm->builder;
   /* wider types for the 16bit mul and the 32bit element interleave */
   struct lp_type i16_type = lp_wider_type(type);
   struct lp_type i32_type = lp_wider_type(i16_type);
   struct lp_build_context bld2;

   assert(!type.floating && !type.fixed && !type.norm && type.width == 8);

   lp_build_context_init(&bld2, gallivm, i16_type);
   /* delta (v1 - v0) can be negative, so use signed 16bit math */
   bld2.type.sign = TRUE;

   /* weights 256/3, 256*2/3, with correct rounding */
   elems[0] = elems[1] = elems[2] = elems[3] =
      lp_build_const_elem(gallivm, i16_type, 255*1/3);
   elems[4] = elems[5] = elems[6] = elems[7] =
      lp_build_const_elem(gallivm, i16_type, 171);
   x = LLVMConstVector(elems, 8);

   /*
    * v01 has col0 in 32bit elem 0, col1 in elem 1.
    * Interleave/unpack will give us separate v0/v1 vectors.
    */
   v01 = lp_build_interleave2(gallivm, i32_type, v01, v01, 0);
   v01 = LLVMBuildBitCast(builder, v01, bld->vec_type, "");

   lp_build_unpack2(gallivm, type, i16_type, v01, &v0, &v1);
   delta = lp_build_sub(&bld2, v1, v0);

   /* both lerps at once: lower i16 half weighted 1/3, upper half 2/3 */
   mul = LLVMBuildMul(builder, x, delta, "");

   mul = LLVMBuildLShr(builder, mul, lp_build_const_int_vec(gallivm, i16_type, 8), "");
   /* lerp optimization: pack now, do add afterwards */
   res = lp_build_pack2(gallivm, i16_type, type, mul, bld2.undef);
   /* only lower 2 elems are valid - for these v0 is really v0 */
   return lp_build_add(bld, res, v01);
}
1213
/*
 * decode one dxt1 block.
 *
 * Expands the two 565 endpoint colors to rgba8888, computes the two derived
 * colors (2/3-1/3 interpolation, or average/zero for dxt1's 3-color mode)
 * and selects per-texel according to the 2bit indices.  The 16 texels are
 * returned in col[0..3] in interleaved order: col[0] holds texels 0,4,8,12,
 * col[1] texels 1,5,9,13, etc. (see the note near the bottom).
 */
static void
s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
                       enum pipe_format format,
                       LLVMValueRef dxt_block,
                       LLVMValueRef *col)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef color01, color23, color01_16, color0123;
   LLVMValueRef rgba, tmp, a, sel_mask, indices, code, const2;
   struct lp_type type8, type32, type16, type64;
   struct lp_build_context bld8, bld32, bld16, bld64;
   unsigned i;
   boolean is_dxt1_variant = format_dxt1_variant(format);

   memset(&type32, 0, sizeof type32);
   type32.width = 32;
   type32.length = 4;
   type32.sign = TRUE;

   memset(&type8, 0, sizeof type8);
   type8.width = 8;
   type8.length = 16;

   memset(&type16, 0, sizeof type16);
   type16.width = 16;
   type16.length = 8;

   memset(&type64, 0, sizeof type64);
   type64.width = 64;
   type64.length = 2;

   /* alpha = opaque mask, const2 = shift/test constant for the 2bit codes */
   a = lp_build_const_int_vec(gallivm, type32, 0xff000000);
   const2 = lp_build_const_int_vec(gallivm, type32, 2);

   lp_build_context_init(&bld32, gallivm, type32);
   lp_build_context_init(&bld16, gallivm, type16);
   lp_build_context_init(&bld8, gallivm, type8);
   lp_build_context_init(&bld64, gallivm, type64);

   /* dxt3/5 carry their color data in the second 64bit half of the block */
   if (is_dxt1_variant) {
      color01 = lp_build_shuffle1undef(gallivm, dxt_block, 0, 4);
      code = lp_build_shuffle1undef(gallivm, dxt_block, 1, 4);
   } else {
      color01 = lp_build_shuffle1undef(gallivm, dxt_block, 2, 4);
      code = lp_build_shuffle1undef(gallivm, dxt_block, 3, 4);
   }
   code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
   /* expand bytes to dwords */
   code = lp_build_interleave2(gallivm, type8, code, code, 0);
   code = lp_build_interleave2(gallivm, type8, code, code, 0);


   /*
    * works as follows:
    * - expand color0/color1 to rgba8888
    * - calculate color2/3 (interpolation) according to color0 < color1 rules
    * - calculate color2/3 according to color0 >= color1 rules
    * - do selection of color2/3 according to comparison of color0/1
    * - extract indices.
    * - use compare/select to select the correct color. Since we have 2bit
    *   indices (and 4 colors), needs at least three compare/selects.
    */

   /*
    * expand the two colors
    */
   color01 = LLVMBuildBitCast(builder, color01, bld16.vec_type, "");
   color01 = lp_build_interleave2(gallivm, type16, color01,
                                  bld16.zero, 0);
   color01_16 = LLVMBuildBitCast(builder, color01, bld32.vec_type, "");
   color01 = color_expand_565_to_8888(gallivm, 4, color01_16);

   /*
    * interpolate colors
    * color2_1 is 2/3 color0 + 1/3 color1
    * color3_1 is 1/3 color0 + 2/3 color1
    * color2_2 is 1/2 color0 + 1/2 color1
    * color3_2 is 0
    */

   /* TODO: since this is now always scalar, should
    * probably just use control flow here instead of calculating
    * both cases and then selection
    */
   if (format == PIPE_FORMAT_DXT1_RGBA ||
       format == PIPE_FORMAT_DXT1_SRGBA) {
      color01 = LLVMBuildOr(builder, color01, a, "");
   }
   /* can combine 2 lerps into one mostly */
   color23 = lp_build_lerp23_single(&bld8, color01);
   color23 = LLVMBuildBitCast(builder, color23, bld32.vec_type, "");

   /* dxt3/5 always use 4-color encoding */
   if (is_dxt1_variant) {
      LLVMValueRef color23_2, color2_2;

      if (util_cpu_caps.has_sse2) {
         LLVMValueRef intrargs[2];
         intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, "");
         /* same interleave as for lerp23 - correct result in 2nd element */
         intrargs[1] = lp_build_interleave2(gallivm, type32, color01, color01, 0);
         intrargs[1] = LLVMBuildBitCast(builder, intrargs[1], bld8.vec_type, "");
         /* 3-color mode: color2 = rounded average of color0/color1 */
         color2_2 = lp_build_pavgb(&bld8, intrargs[0], intrargs[1]);
      }
      else {
         LLVMValueRef v01, v0, v1, vhalf;
         /*
          * This isn't as expensive as it looks (the unpack is the same as
          * for lerp23, which is the reason why we do the pointless
          * interleave2 too), with correct rounding (the two lower elements
          * will be the same).
          */
         v01 = lp_build_interleave2(gallivm, type32, color01, color01, 0);
         v01 = LLVMBuildBitCast(builder, v01, bld8.vec_type, "");
         lp_build_unpack2(gallivm, type8, type16, v01, &v0, &v1);
         vhalf = lp_build_add(&bld16, v0, v1);
         vhalf = LLVMBuildLShr(builder, vhalf, bld16.one, "");
         color2_2 = lp_build_pack2(gallivm, type16, type8, vhalf, bld16.undef);
      }
      /* shuffle in color 3 as elem 2 zero, color 2 elem 1 */
      color23_2 = LLVMBuildBitCast(builder, color2_2, bld64.vec_type, "");
      color23_2 = LLVMBuildLShr(builder, color23_2,
                                lp_build_const_int_vec(gallivm, type64, 32), "");
      color23_2 = LLVMBuildBitCast(builder, color23_2, bld32.vec_type, "");

      /* sel_mask: all-ones where color0 > color1 (raw 16bit comparison) */
      tmp = LLVMBuildBitCast(builder, color01_16, bld64.vec_type, "");
      tmp = LLVMBuildLShr(builder, tmp,
                          lp_build_const_int_vec(gallivm, type64, 32), "");
      tmp = LLVMBuildBitCast(builder, tmp, bld32.vec_type, "");
      sel_mask = lp_build_compare(gallivm, type32, PIPE_FUNC_GREATER,
                                  color01_16, tmp);
      sel_mask = lp_build_interleave2(gallivm, type32, sel_mask, sel_mask, 0);
      color23 = lp_build_select(&bld32, sel_mask, color23, color23_2);
   }

   if (util_cpu_caps.has_ssse3) {
      /*
       * Use pshufb as mini-lut. (Only doable with intrinsics as the
       * final shuffles are non-constant. pshufb is awesome!)
       */
      LLVMValueRef shuf[16], low2mask;
      LLVMValueRef intrargs[2], lut_ind, lut_adj;

      color01 = LLVMBuildBitCast(builder, color01, bld64.vec_type, "");
      color23 = LLVMBuildBitCast(builder, color23, bld64.vec_type, "");
      color0123 = lp_build_interleave2(gallivm, type64, color01, color23, 0);
      color0123 = LLVMBuildBitCast(builder, color0123, bld32.vec_type, "");

      if (format == PIPE_FORMAT_DXT1_RGB ||
          format == PIPE_FORMAT_DXT1_SRGB) {
         color0123 = LLVMBuildOr(builder, color0123, a, "");
      }

      /* shuffle as r0r1r2r3g0g1... */
      for (i = 0; i < 4; i++) {
         shuf[4*i] = lp_build_const_int32(gallivm, 0 + i);
         shuf[4*i+1] = lp_build_const_int32(gallivm, 4 + i);
         shuf[4*i+2] = lp_build_const_int32(gallivm, 8 + i);
         shuf[4*i+3] = lp_build_const_int32(gallivm, 12 + i);
      }
      color0123 = LLVMBuildBitCast(builder, color0123, bld8.vec_type, "");
      color0123 = LLVMBuildShuffleVector(builder, color0123, bld8.undef,
                                         LLVMConstVector(shuf, 16), "");

      /* lowest 2 bits of each 8 bit value contain index into "LUT" */
      low2mask = lp_build_const_int_vec(gallivm, type8, 3);
      /* add 0/4/8/12 for r/g/b/a */
      lut_adj = lp_build_const_int_vec(gallivm, type32, 0x0c080400);
      lut_adj = LLVMBuildBitCast(builder, lut_adj, bld8.vec_type, "");
      intrargs[0] = color0123;
      for (i = 0; i < 4; i++) {
         lut_ind = LLVMBuildAnd(builder, code, low2mask, "");
         lut_ind = LLVMBuildOr(builder, lut_ind, lut_adj, "");
         intrargs[1] = lut_ind;
         col[i] = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
                                     bld8.vec_type, intrargs, 2, 0);
         col[i] = LLVMBuildBitCast(builder, col[i], bld32.vec_type, "");
         /* shift the next 2bit index of each code byte into position */
         code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
         code = LLVMBuildLShr(builder, code, const2, "");
         code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
      }
   }
   else {
      /* Thanks to vectorization can do 4 texels in parallel */
      LLVMValueRef color0, color1, color2, color3;
      if (format == PIPE_FORMAT_DXT1_RGB ||
          format == PIPE_FORMAT_DXT1_SRGB) {
         color01 = LLVMBuildOr(builder, color01, a, "");
         color23 = LLVMBuildOr(builder, color23, a, "");
      }
      /* splat each of the four candidate colors across a full vector */
      color0 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
                                      lp_build_const_shuffle1(gallivm, 0, 4), "");
      color1 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
                                      lp_build_const_shuffle1(gallivm, 1, 4), "");
      color2 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
                                      lp_build_const_shuffle1(gallivm, 0, 4), "");
      color3 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
                                      lp_build_const_shuffle1(gallivm, 1, 4), "");
      code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");

      for (i = 0; i < 4; i++) {
         /* select the colors */
         LLVMValueRef selmasklo, rgba01, rgba23, bitlo;
         bitlo = bld32.one;
         indices = LLVMBuildAnd(builder, code, bitlo, "");
         selmasklo = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
                                      indices, bitlo);
         rgba01 = lp_build_select(&bld32, selmasklo, color1, color0);

         LLVMValueRef selmaskhi;
         indices = LLVMBuildAnd(builder, code, const2, "");
         selmaskhi = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
                                      indices, const2);
         rgba23 = lp_build_select(&bld32, selmasklo, color3, color2);
         rgba = lp_build_select(&bld32, selmaskhi, rgba23, rgba01);

         /*
          * Note that this will give "wrong" order.
          * col0 will be rgba0, rgba4, rgba8, rgba12, col1 rgba1, rgba5, ...
          * This would be easily fixable by using different shuffle, bitlo/hi
          * vectors above (and different shift), but seems slightly easier to
          * deal with for dxt3/dxt5 alpha too. So instead change lookup.
          */
         col[i] = rgba;
         code = LLVMBuildLShr(builder, code, const2, "");
      }
   }
}
1445
/*
 * decode one dxt3 block.
 *
 * Decodes the rgb part like dxt1 (4-color mode), then expands the 16
 * explicit 4bit alpha values to 8 bits and merges them into the top byte
 * of each texel, matching the interleaved texel order of col[0..3].
 */
static void
s3tc_decode_block_dxt3(struct gallivm_state *gallivm,
                       enum pipe_format format,
                       LLVMValueRef dxt_block,
                       LLVMValueRef *col)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef alpha, alphas0, alphas1, shift4_16, a[4], mask8hi;
   struct lp_type type32, type8, type16;
   unsigned i;

   memset(&type32, 0, sizeof type32);
   type32.width = 32;
   type32.length = 4;

   memset(&type8, 0, sizeof type8);
   type8.width = 8;
   type8.length = 16;

   memset(&type16, 0, sizeof type16);
   type16.width = 16;
   type16.length = 8;

   s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);

   shift4_16 = lp_build_const_int_vec(gallivm, type16, 4);
   mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);

   /*
    * Expand each 4bit alpha to 8 bits by replicating the nibble
    * (a -> a | a << 4): interleaving duplicates each byte, so every 16bit
    * lane holds the same byte twice; the 0xf00f mask keeps the low nibble
    * of the low copy and the high nibble of the high copy, and the two
    * shifts/ors fill in the replicated nibbles.
    */
   alpha = LLVMBuildBitCast(builder, dxt_block,
                            lp_build_vec_type(gallivm, type8), "");
   alpha = lp_build_interleave2(gallivm, type8, alpha, alpha, 0);
   alpha = LLVMBuildBitCast(builder, alpha,
                            lp_build_vec_type(gallivm, type16), "");
   alpha = LLVMBuildAnd(builder, alpha,
                        lp_build_const_int_vec(gallivm, type16, 0xf00f), "");
   alphas0 = LLVMBuildLShr(builder, alpha, shift4_16, "");
   alphas1 = LLVMBuildShl(builder, alpha, shift4_16, "");
   alpha = LLVMBuildOr(builder, alphas0, alpha, "");
   alpha = LLVMBuildOr(builder, alphas1, alpha, "");
   alpha = LLVMBuildBitCast(builder, alpha,
                            lp_build_vec_type(gallivm, type32), "");
   /*
    * alpha now contains elems 0,1,2,3,... (ubytes)
    * we need 0,4,8,12, 1,5,9,13 etc. in dwords to match color (which
    * is just as easy as "natural" order - 3 shift/and instead of 6 unpack).
    */
   a[0] = LLVMBuildShl(builder, alpha,
                       lp_build_const_int_vec(gallivm, type32, 24), "");
   a[1] = LLVMBuildShl(builder, alpha,
                       lp_build_const_int_vec(gallivm, type32, 16), "");
   a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
   a[2] = LLVMBuildShl(builder, alpha,
                       lp_build_const_int_vec(gallivm, type32, 8), "");
   a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
   a[3] = LLVMBuildAnd(builder, alpha, mask8hi, "");

   /* merge the alphas into the top byte of each rgb texel */
   for (i = 0; i < 4; i++) {
      col[i] = LLVMBuildOr(builder, col[i], a[i], "");
   }
}
1509
1510
/**
 * Block-at-a-time variant of lp_build_lerpdxta: lerp 8 dxt5 alpha values
 * at once in 16bit lanes according to their 3-bit codes.
 *
 * Unlike lp_build_lerpdxta() this returns only the weighted
 * (alpha1 - alpha0) fraction; the caller adds alpha0 back in later on
 * packed 8bit values.
 *
 * @param alpha0   <8 x i16> splatted first alpha endpoint
 * @param alpha1   <8 x i16> splatted second alpha endpoint
 * @param code     <8 x i16> per-pixel 3-bit interpolation codes
 * @param sel_mask <8 x i16> all-ones where alpha0 > alpha1 (1/7 weights)
 */
static LLVMValueRef
lp_build_lerpdxta_block(struct gallivm_state *gallivm,
                        LLVMValueRef alpha0,
                        LLVMValueRef alpha1,
                        LLVMValueRef code,
                        LLVMValueRef sel_mask)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef delta, ainterp;
   LLVMValueRef weight5, weight7, weight;
   struct lp_type type16;
   struct lp_build_context bld;

   memset(&type16, 0, sizeof type16);
   type16.width = 16;
   type16.length = 8;
   type16.sign = TRUE;

   lp_build_context_init(&bld, gallivm, type16);
   /*
    * 256/7 is only 36.57 so we'd lose quite some precision. Since it would
    * actually be desirable to do this here with even higher accuracy than
    * even 8 bit (more or less required for rgtc, albeit that's not handled
    * here right now), shift the weights after multiplication by code.
    */
   weight5 = lp_build_const_int_vec(gallivm, type16, 256*64/5);
   weight7 = lp_build_const_int_vec(gallivm, type16, 256*64/7);
   weight = lp_build_select(&bld, sel_mask, weight7, weight5);

   /*
    * we'll get garbage in the elements which had code 0 (or larger than
    * 5 or 7) but we don't care (or rather, need to fix up anyway).
    */
   code = LLVMBuildSub(builder, code, bld.one, "");

   weight = LLVMBuildMul(builder, weight, code, "");
   /* weights were pre-scaled by 64 - undo (leaves 8 fractional bits) */
   weight = LLVMBuildLShr(builder, weight,
                          lp_build_const_int_vec(gallivm, type16, 6), "");

   delta = LLVMBuildSub(builder, alpha1, alpha0, "");

   ainterp = LLVMBuildMul(builder, delta, weight, "");
   ainterp = LLVMBuildLShr(builder, ainterp,
                           lp_build_const_int_vec(gallivm, type16, 8), "");

   /* lerp is done later (with packed values) */

   return ainterp;
}
1560
1561
1562 /*
1563 * decode one dxt5 block.
1564 */
1565 static void
1566 s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
1567 enum pipe_format format,
1568 LLVMValueRef dxt_block,
1569 LLVMValueRef *col)
1570 {
1571 LLVMBuilderRef builder = gallivm->builder;
1572 LLVMValueRef alpha, alpha0, alpha1, ares;
1573 LLVMValueRef ainterp, ainterp0, ainterp1, shuffle1, sel_mask, sel_mask2;
1574 LLVMValueRef a[4], acode, tmp0, tmp1;
1575 LLVMTypeRef i64t, i32t;
1576 struct lp_type type32, type64, type8, type16;
1577 struct lp_build_context bld16, bld8;
1578 unsigned i;
1579
1580 memset(&type32, 0, sizeof type32);
1581 type32.width = 32;
1582 type32.length = 4;
1583
1584 memset(&type64, 0, sizeof type64);
1585 type64.width = 64;
1586 type64.length = 2;
1587
1588 memset(&type8, 0, sizeof type8);
1589 type8.width = 8;
1590 type8.length = 16;
1591
1592 memset(&type16, 0, sizeof type16);
1593 type16.width = 16;
1594 type16.length = 8;
1595
1596 lp_build_context_init(&bld16, gallivm, type16);
1597 lp_build_context_init(&bld8, gallivm, type8);
1598
1599 i64t = lp_build_vec_type(gallivm, type64);
1600 i32t = lp_build_vec_type(gallivm, type32);
1601
1602 s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
1603
1604 /*
1605 * three possible strategies for vectorizing alpha:
1606 * 1) compute all 8 values then use scalar extraction
1607 * (i.e. have all 8 alpha values packed in one 64bit scalar
1608 * and do something like ax = vals >> (codex * 8) followed
1609 * by inserting these values back into color)
1610 * 2) same as 8 but just use pshufb as a mini-LUT for selection.
1611 * (without pshufb would need boatloads of cmp/selects trying to
1612 * keep things vectorized for essentially scalar selection).
1613 * 3) do something similar to the uncached case
1614 * needs more calculations (need to calc 16 values instead of 8 though
1615 * that's only an issue for the lerp which we need to do twice otherwise
1616 * everything still fits into 128bit) but keeps things vectorized mostly.
1617 * Trying 3) here though not sure it's really faster...
1618 * With pshufb, we try 2) (cheaper and more accurate)
1619 */
1620
1621 /*
1622 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1623 * help since code crosses 8bit boundaries). But variable shifts are
1624 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1625 * shifts!). Instead, emulate by 16bit muls.
1626 * Also, the required byte shuffles are essentially non-emulatable, so
1627 * require ssse3 (albeit other archs might do them fine).
1628 * This is not directly tied to ssse3 - just need sane byte shuffles.
1629 * But ordering is going to be different below so use same condition.
1630 */
1631
1632
1633 /* vectorize alpha */
1634 alpha = LLVMBuildBitCast(builder, dxt_block, i64t, "");
1635 alpha0 = LLVMBuildAnd(builder, alpha,
1636 lp_build_const_int_vec(gallivm, type64, 0xff), "");
1637 alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
1638 alpha = LLVMBuildBitCast(builder, alpha, bld16.vec_type, "");
1639 alpha1 = LLVMBuildLShr(builder, alpha,
1640 lp_build_const_int_vec(gallivm, type16, 8), "");
1641 alpha = LLVMBuildBitCast(builder, alpha, i64t, "");
1642 shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);
1643 /* XXX this shuffle broken with LLVM 2.8 */
1644 alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");
1645 alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");
1646
1647 type16.sign = TRUE;
1648 sel_mask = lp_build_compare(gallivm, type16, PIPE_FUNC_GREATER,
1649 alpha0, alpha1);
1650 type16.sign = FALSE;
1651 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1652
1653 if (!util_cpu_caps.has_ssse3) {
1654 LLVMValueRef acodeg, mask1, acode0, acode1;
1655
1656 /* extraction of the 3 bit values into something more useful is HARD */
1657 /* first steps are actually scalar */
1658 acode = LLVMBuildLShr(builder, alpha,
1659 lp_build_const_int_vec(gallivm, type64, 16), "");
1660 tmp0 = LLVMBuildAnd(builder, acode,
1661 lp_build_const_int_vec(gallivm, type64, 0xffffff), "");
1662 tmp1 = LLVMBuildLShr(builder, acode,
1663 lp_build_const_int_vec(gallivm, type64, 24), "");
1664 tmp0 = LLVMBuildBitCast(builder, tmp0, i32t, "");
1665 tmp1 = LLVMBuildBitCast(builder, tmp1, i32t, "");
1666 acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1667 /* now have 2x24bit in 4x32bit, order 01234567, 89..., undef, undef */
1668 tmp0 = LLVMBuildAnd(builder, acode,
1669 lp_build_const_int_vec(gallivm, type32, 0xfff), "");
1670 tmp1 = LLVMBuildLShr(builder, acode,
1671 lp_build_const_int_vec(gallivm, type32, 12), "");
1672 acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1673 /* now have 4x12bit in 4x32bit, order 0123, 4567, ,,, */
1674 tmp0 = LLVMBuildAnd(builder, acode,
1675 lp_build_const_int_vec(gallivm, type32, 0x3f), "");
1676 tmp1 = LLVMBuildLShr(builder, acode,
1677 lp_build_const_int_vec(gallivm, type32, 6), "");
1678 /* use signed pack doesn't matter and otherwise need sse41 */
1679 type32.sign = type16.sign = TRUE;
1680 acode = lp_build_pack2(gallivm, type32, type16, tmp0, tmp1);
1681 type32.sign = type16.sign = FALSE;
1682 /* now have 8x6bit in 8x16bit, 01, 45, 89, ..., 23, 67, ... */
1683 acode0 = LLVMBuildAnd(builder, acode,
1684 lp_build_const_int_vec(gallivm, type16, 0x7), "");
1685 acode1 = LLVMBuildLShr(builder, acode,
1686 lp_build_const_int_vec(gallivm, type16, 3), "");
1687 acode = lp_build_pack2(gallivm, type16, type8, acode0, acode1);
1688 /* acode0 contains elems 0,4,8,12,2,6,10,14, acode1 1,5,9,... */
1689
1690 acodeg = LLVMBuildAnd(builder, acode,
1691 LLVMBuildNot(builder, sel_mask, ""), "");
1692 mask1 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1693 acode, bld8.one);
1694
1695 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
1696 ainterp0 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode0, sel_mask);
1697 ainterp1 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode1, sel_mask);
1698 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1699 ainterp = lp_build_pack2(gallivm, type16, type8, ainterp0, ainterp1);
1700 alpha0 = lp_build_pack2(gallivm, type16, type8, alpha0, alpha0);
1701 alpha1 = lp_build_pack2(gallivm, type16, type8, alpha1, alpha1);
1702 ainterp = LLVMBuildAdd(builder, ainterp, alpha0, "");
1703 /* Fix up val01 */
1704 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1705 acode, bld8.zero);
1706 ainterp = lp_build_select(&bld8, sel_mask2, alpha0, ainterp);
1707 ainterp = lp_build_select(&bld8, mask1, alpha1, ainterp);
1708
1709 /* fix up val67 if a0 <= a1 */
1710 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1711 acodeg, lp_build_const_int_vec(gallivm, type8, 6));
1712 ares = LLVMBuildAnd(builder, ainterp, LLVMBuildNot(builder, sel_mask2, ""), "");
1713 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1714 acodeg, lp_build_const_int_vec(gallivm, type8, 7));
1715 ares = LLVMBuildOr(builder, ares, sel_mask2, "");
1716
1717 /* unpack in right order (0,4,8,12,1,5,..) */
1718 /* this gives us zero, a0, zero, a4, zero, a8, ... for tmp0 */
1719 tmp0 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 0);
1720 tmp1 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 1);
1721 tmp0 = LLVMBuildBitCast(builder, tmp0, bld16.vec_type, "");
1722 tmp1 = LLVMBuildBitCast(builder, tmp1, bld16.vec_type, "");
1723
1724 a[0] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 0);
1725 a[1] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 0);
1726 a[2] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 1);
1727 a[3] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 1);
1728 }
1729 else {
1730 LLVMValueRef elems[16], intrargs[2], shufa, mulclo, mulchi, mask8hi;
1731 LLVMTypeRef type16s = LLVMInt16TypeInContext(gallivm->context);
1732 LLVMTypeRef type8s = LLVMInt8TypeInContext(gallivm->context);
1733 unsigned i, j;
1734 /*
1735 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1736 * help since code crosses 8bit boundaries). But variable shifts are
1737 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1738 * shifts!). Instead, emulate by 16bit muls.
1739 * Also, the required byte shuffles are essentially non-emulatable, so
1740 * require ssse3 (albeit other archs might do them fine, but the
1741 * complete path is ssse3 only for now).
1742 */
1743 for (i = 0, j = 0; i < 16; i += 8, j += 3) {
1744 elems[i+0] = elems[i+1] = elems[i+2] = lp_build_const_int32(gallivm, j+2);
1745 elems[i+3] = elems[i+4] = lp_build_const_int32(gallivm, j+3);
1746 elems[i+5] = elems[i+6] = elems[i+7] = lp_build_const_int32(gallivm, j+4);
1747 }
1748 shufa = LLVMConstVector(elems, 16);
1749 alpha = LLVMBuildBitCast(builder, alpha, bld8.vec_type, "");
1750 acode = LLVMBuildShuffleVector(builder, alpha, bld8.undef, shufa, "");
1751 acode = LLVMBuildBitCast(builder, acode, bld16.vec_type, "");
1752 /*
1753 * Put 0/2/4/6 into high 3 bits of 16 bits (save AND mask)
1754 * Do the same for 1/3/5/7 (albeit still need mask there - ideally
1755 * we'd place them into bits 4-7 so could save shift but impossible.)
1756 */
1757 for (i = 0; i < 8; i += 4) {
1758 elems[i+0] = LLVMConstInt(type16s, 1 << (13-0), 0);
1759 elems[i+1] = LLVMConstInt(type16s, 1 << (13-6), 0);
1760 elems[i+2] = LLVMConstInt(type16s, 1 << (13-4), 0);
1761 elems[i+3] = LLVMConstInt(type16s, 1 << (13-2), 0);
1762 }
1763 mulclo = LLVMConstVector(elems, 8);
1764 for (i = 0; i < 8; i += 4) {
1765 elems[i+0] = LLVMConstInt(type16s, 1 << (13-3), 0);
1766 elems[i+1] = LLVMConstInt(type16s, 1 << (13-9), 0);
1767 elems[i+2] = LLVMConstInt(type16s, 1 << (13-7), 0);
1768 elems[i+3] = LLVMConstInt(type16s, 1 << (13-5), 0);
1769 }
1770 mulchi = LLVMConstVector(elems, 8);
1771
1772 tmp0 = LLVMBuildMul(builder, acode, mulclo, "");
1773 tmp1 = LLVMBuildMul(builder, acode, mulchi, "");
1774 tmp0 = LLVMBuildLShr(builder, tmp0,
1775 lp_build_const_int_vec(gallivm, type16, 13), "");
1776 tmp1 = LLVMBuildLShr(builder, tmp1,
1777 lp_build_const_int_vec(gallivm, type16, 5), "");
1778 tmp1 = LLVMBuildAnd(builder, tmp1,
1779 lp_build_const_int_vec(gallivm, type16, 0x700), "");
1780 acode = LLVMBuildOr(builder, tmp0, tmp1, "");
1781 acode = LLVMBuildBitCast(builder, acode, bld8.vec_type, "");
1782
1783 /*
1784 * Note that ordering is different here to non-ssse3 path:
1785 * 0/1/2/3/4/5...
1786 */
1787
1788 LLVMValueRef weight0, weight1, weight, delta;
1789 LLVMValueRef constff_elem7, const0_elem6;
1790 /* weights, correctly rounded (round(256*x/7)) */
1791 elems[0] = LLVMConstInt(type16s, 256, 0);
1792 elems[1] = LLVMConstInt(type16s, 0, 0);
1793 elems[2] = LLVMConstInt(type16s, 219, 0);
1794 elems[3] = LLVMConstInt(type16s, 183, 0);
1795 elems[4] = LLVMConstInt(type16s, 146, 0);
1796 elems[5] = LLVMConstInt(type16s, 110, 0);
1797 elems[6] = LLVMConstInt(type16s, 73, 0);
1798 elems[7] = LLVMConstInt(type16s, 37, 0);
1799 weight0 = LLVMConstVector(elems, 8);
1800
1801 elems[0] = LLVMConstInt(type16s, 256, 0);
1802 elems[1] = LLVMConstInt(type16s, 0, 0);
1803 elems[2] = LLVMConstInt(type16s, 205, 0);
1804 elems[3] = LLVMConstInt(type16s, 154, 0);
1805 elems[4] = LLVMConstInt(type16s, 102, 0);
1806 elems[5] = LLVMConstInt(type16s, 51, 0);
1807 elems[6] = LLVMConstInt(type16s, 0, 0);
1808 elems[7] = LLVMConstInt(type16s, 0, 0);
1809 weight1 = LLVMConstVector(elems, 8);
1810
1811 weight0 = LLVMBuildBitCast(builder, weight0, bld8.vec_type, "");
1812 weight1 = LLVMBuildBitCast(builder, weight1, bld8.vec_type, "");
1813 weight = lp_build_select(&bld8, sel_mask, weight0, weight1);
1814 weight = LLVMBuildBitCast(builder, weight, bld16.vec_type, "");
1815
1816 for (i = 0; i < 16; i++) {
1817 elems[i] = LLVMConstNull(type8s);
1818 }
1819 elems[7] = LLVMConstInt(type8s, 255, 0);
1820 constff_elem7 = LLVMConstVector(elems, 16);
1821
1822 for (i = 0; i < 16; i++) {
1823 elems[i] = LLVMConstInt(type8s, 255, 0);
1824 }
1825 elems[6] = LLVMConstInt(type8s, 0, 0);
1826 const0_elem6 = LLVMConstVector(elems, 16);
1827
1828 /* standard simple lerp - but the version we need isn't available */
1829 delta = LLVMBuildSub(builder, alpha0, alpha1, "");
1830 ainterp = LLVMBuildMul(builder, delta, weight, "");
1831 ainterp = LLVMBuildLShr(builder, ainterp,
1832 lp_build_const_int_vec(gallivm, type16, 8), "");
1833 ainterp = LLVMBuildBitCast(builder, ainterp, bld8.vec_type, "");
1834 alpha1 = LLVMBuildBitCast(builder, alpha1, bld8.vec_type, "");
1835 ainterp = LLVMBuildAdd(builder, ainterp, alpha1, "");
1836 ainterp = LLVMBuildBitCast(builder, ainterp, bld16.vec_type, "");
1837 ainterp = lp_build_pack2(gallivm, type16, type8, ainterp, bld16.undef);
1838
1839 /* fixing 0/0xff case is slightly more complex */
1840 constff_elem7 = LLVMBuildAnd(builder, constff_elem7,
1841 LLVMBuildNot(builder, sel_mask, ""), "");
1842 const0_elem6 = LLVMBuildOr(builder, const0_elem6, sel_mask, "");
1843 ainterp = LLVMBuildOr(builder, ainterp, constff_elem7, "");
1844 ainterp = LLVMBuildAnd(builder, ainterp, const0_elem6, "");
1845
1846 /* now pick all 16 elements at once! */
1847 intrargs[0] = ainterp;
1848 intrargs[1] = acode;
1849 ares = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
1850 bld8.vec_type, intrargs, 2, 0);
1851
1852 ares = LLVMBuildBitCast(builder, ares, i32t, "");
1853 mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1854 a[0] = LLVMBuildShl(builder, ares,
1855 lp_build_const_int_vec(gallivm, type32, 24), "");
1856 a[1] = LLVMBuildShl(builder, ares,
1857 lp_build_const_int_vec(gallivm, type32, 16), "");
1858 a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
1859 a[2] = LLVMBuildShl(builder, ares,
1860 lp_build_const_int_vec(gallivm, type32, 8), "");
1861 a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
1862 a[3] = LLVMBuildAnd(builder, ares, mask8hi, "");
1863 }
1864
1865 for (i = 0; i < 4; i++) {
1866 a[i] = LLVMBuildBitCast(builder, a[i], i32t, "");
1867 col[i] = LLVMBuildOr(builder, col[i], a[i], "");
1868 }
1869 }
1870
1871
1872 static void
1873 generate_update_cache_one_block(struct gallivm_state *gallivm,
1874 LLVMValueRef function,
1875 const struct util_format_description *format_desc)
1876 {
1877 LLVMBasicBlockRef block;
1878 LLVMBuilderRef old_builder;
1879 LLVMValueRef ptr_addr;
1880 LLVMValueRef hash_index;
1881 LLVMValueRef cache;
1882 LLVMValueRef dxt_block, tag_value;
1883 LLVMValueRef col[LP_MAX_VECTOR_LENGTH];
1884
1885 ptr_addr = LLVMGetParam(function, 0);
1886 hash_index = LLVMGetParam(function, 1);
1887 cache = LLVMGetParam(function, 2);
1888
1889 lp_build_name(ptr_addr, "ptr_addr" );
1890 lp_build_name(hash_index, "hash_index");
1891 lp_build_name(cache, "cache_addr");
1892
1893 /*
1894 * Function body
1895 */
1896
1897 old_builder = gallivm->builder;
1898 block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
1899 gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
1900 LLVMPositionBuilderAtEnd(gallivm->builder, block);
1901
1902 lp_build_gather_s3tc_simple_scalar(gallivm, format_desc, &dxt_block,
1903 ptr_addr);
1904
1905 switch (format_desc->format) {
1906 case PIPE_FORMAT_DXT1_RGB:
1907 case PIPE_FORMAT_DXT1_RGBA:
1908 case PIPE_FORMAT_DXT1_SRGB:
1909 case PIPE_FORMAT_DXT1_SRGBA:
1910 s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
1911 break;
1912 case PIPE_FORMAT_DXT3_RGBA:
1913 case PIPE_FORMAT_DXT3_SRGBA:
1914 s3tc_decode_block_dxt3(gallivm, format_desc->format, dxt_block, col);
1915 break;
1916 case PIPE_FORMAT_DXT5_RGBA:
1917 case PIPE_FORMAT_DXT5_SRGBA:
1918 s3tc_decode_block_dxt5(gallivm, format_desc->format, dxt_block, col);
1919 break;
1920 default:
1921 assert(0);
1922 s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
1923 break;
1924 }
1925
1926 tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
1927 LLVMInt64TypeInContext(gallivm->context), "");
1928 s3tc_store_cached_block(gallivm, col, tag_value, hash_index, cache);
1929
1930 LLVMBuildRetVoid(gallivm->builder);
1931
1932 LLVMDisposeBuilder(gallivm->builder);
1933 gallivm->builder = old_builder;
1934
1935 gallivm_verify_function(gallivm, function);
1936 }
1937
1938
1939 static void
1940 update_cached_block(struct gallivm_state *gallivm,
1941 const struct util_format_description *format_desc,
1942 LLVMValueRef ptr_addr,
1943 LLVMValueRef hash_index,
1944 LLVMValueRef cache)
1945
1946 {
1947 LLVMBuilderRef builder = gallivm->builder;
1948 LLVMModuleRef module = gallivm->module;
1949 char name[256];
1950 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
1951 LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
1952 LLVMValueRef function, inst;
1953 LLVMBasicBlockRef bb;
1954 LLVMValueRef args[3];
1955
1956 util_snprintf(name, sizeof name, "%s_update_cache_one_block",
1957 format_desc->short_name);
1958 function = LLVMGetNamedFunction(module, name);
1959
1960 if (!function) {
1961 LLVMTypeRef ret_type;
1962 LLVMTypeRef arg_types[3];
1963 LLVMTypeRef function_type;
1964 unsigned arg;
1965
1966 /*
1967 * Generate the function prototype.
1968 */
1969
1970 ret_type = LLVMVoidTypeInContext(gallivm->context);
1971 arg_types[0] = pi8t;
1972 arg_types[1] = LLVMInt32TypeInContext(gallivm->context);
1973 arg_types[2] = LLVMTypeOf(cache); // XXX: put right type here
1974 function_type = LLVMFunctionType(ret_type, arg_types, ARRAY_SIZE(arg_types), 0);
1975 function = LLVMAddFunction(module, name, function_type);
1976
1977 for (arg = 0; arg < ARRAY_SIZE(arg_types); ++arg)
1978 if (LLVMGetTypeKind(arg_types[arg]) == LLVMPointerTypeKind)
1979 lp_add_function_attr(function, arg + 1, LP_FUNC_ATTR_NOALIAS);
1980
1981 LLVMSetFunctionCallConv(function, LLVMFastCallConv);
1982 LLVMSetVisibility(function, LLVMHiddenVisibility);
1983 generate_update_cache_one_block(gallivm, function, format_desc);
1984 }
1985
1986 args[0] = ptr_addr;
1987 args[1] = hash_index;
1988 args[2] = cache;
1989
1990 LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
1991 bb = LLVMGetInsertBlock(builder);
1992 inst = LLVMGetLastInstruction(bb);
1993 LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
1994 }
1995
/*
 * Cached lookup: fetch n texel colors through the decoded-block cache,
 * decoding (and storing) a whole S3TC block on a cache miss.
 */
static LLVMValueRef
compressed_fetch_cached(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        unsigned n,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j,
                        LLVMValueRef cache)

{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned count, low_bit, log2size;
   LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
   LLVMValueRef ij_index, hash_index, hash_mask, block_index;
   LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
   LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
   struct lp_type type;
   struct lp_build_context bld32;
   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   lp_build_context_init(&bld32, gallivm, type);

   /*
    * compute hash - we use direct mapped cache, the hash function could
    * be better but it needs to be simple
    * per-element:
    * compare offset with offset stored at tag (hash)
    * if not equal extract block, store block, update tag
    * extract color from cache
    * assemble colors
    */

   /* low_bit: unused low address bits (block size in bytes is a power of two) */
   low_bit = util_logbase2(format_desc->block.bits / 8);
   log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
   /* full 64bit block address (tag), and a broadcast 32bit copy for hashing */
   addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
   ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
   ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
   /* For the hash function, first mask off the unused lowest bits. Then just
      do some xor with address bits - only use lower 32bits */
   ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
   ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
                                 lp_build_const_int_vec(gallivm, type, low_bit), "");
   /* This only really makes sense for size 64,128,256 */
   hash_index = ptr_addrtrunc;
   ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
                                 lp_build_const_int_vec(gallivm, type, 2*log2size), "");
   hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
   tmp = LLVMBuildLShr(builder, hash_index,
                       lp_build_const_int_vec(gallivm, type, log2size), "");
   hash_index = LLVMBuildXor(builder, hash_index, tmp, "");

   hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
   hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
   /* ij_index = j*4 + i: texel position within the 4x4 block */
   ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
   ij_index = LLVMBuildAdd(builder, ij_index, j, "");
   /* block_index = hash_index*16 + ij_index: texel slot in the cache */
   block_index = LLVMBuildShl(builder, hash_index,
                              lp_build_const_int_vec(gallivm, type, 4), "");
   block_index = LLVMBuildAdd(builder, ij_index, block_index, "");

   if (n > 1) {
      /* Vector case: process lanes one by one since each may hit a
       * different cache slot and possibly trigger its own miss path. */
      color = bld32.undef;
      for (count = 0; count < n; count++) {
         LLVMValueRef index, cond, colorx;
         LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
         struct lp_build_if_state if_ctx;

         index = lp_build_const_int32(gallivm, count);
         offsetx = LLVMBuildExtractElement(builder, offset, index, "");
         addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
         addrx = LLVMBuildAdd(builder, addrx, addr, "");
         block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
         hash_indexx = LLVMBuildLShr(builder, block_indexx,
                                     lp_build_const_int32(gallivm, 4), "");
         offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_indexx);
         /* miss if the stored tag differs from this block's address */
         cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");

         lp_build_if(&if_ctx, gallivm, cond);
         {
            /* miss: decode the block and refill the cache slot */
            ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
                                          LLVMPointerType(i8t, 0), "");
            update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
#if LP_BUILD_FORMAT_CACHE_DEBUG
            s3tc_update_cache_access(gallivm, cache, 1,
                                     LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
#endif
         }
         lp_build_endif(&if_ctx);

         colorx = s3tc_lookup_cached_pixel(gallivm, cache, block_indexx);

         color = LLVMBuildInsertElement(builder, color, colorx,
                                        lp_build_const_int32(gallivm, count), "");
      }
   }
   else {
      /* Scalar case: everything is already scalar, no extracts needed. */
      LLVMValueRef cond;
      struct lp_build_if_state if_ctx;

      tmp = LLVMBuildZExt(builder, offset, i64t, "");
      addr = LLVMBuildAdd(builder, tmp, addr, "");
      offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_index);
      cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");

      lp_build_if(&if_ctx, gallivm, cond);
      {
         tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
         update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
#if LP_BUILD_FORMAT_CACHE_DEBUG
         s3tc_update_cache_access(gallivm, cache, 1,
                                  LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
#endif
      }
      lp_build_endif(&if_ctx);

      color = s3tc_lookup_cached_pixel(gallivm, cache, block_index);
   }
#if LP_BUILD_FORMAT_CACHE_DEBUG
   s3tc_update_cache_access(gallivm, cache, n,
                            LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
#endif
   /* reinterpret the n 32bit colors as 4*n RGBA bytes */
   return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
}
2125
2126
2127 static LLVMValueRef
2128 s3tc_dxt5_to_rgba_aos(struct gallivm_state *gallivm,
2129 unsigned n,
2130 enum pipe_format format,
2131 LLVMValueRef colors,
2132 LLVMValueRef codewords,
2133 LLVMValueRef alpha_lo,
2134 LLVMValueRef alpha_hi,
2135 LLVMValueRef i,
2136 LLVMValueRef j)
2137 {
2138 return s3tc_dxt5_full_to_rgba_aos(gallivm, n, format, colors,
2139 codewords, alpha_lo, alpha_hi, i, j);
2140 }
2141
2142
2143 /**
2144 * @param n number of pixels processed (usually n=4, but it should also work with n=1
2145 * and multiples of 4)
2146 * @param base_ptr base pointer (32bit or 64bit pointer depending on the architecture)
2147 * @param offset <n x i32> vector with the relative offsets of the S3TC blocks
2148 * @param i is a <n x i32> vector with the x subpixel coordinate (0..3)
2149 * @param j is a <n x i32> vector with the y subpixel coordinate (0..3)
2150 * @return a <4*n x i8> vector with the pixel RGBA values in AoS
2151 */
2152 LLVMValueRef
2153 lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,
2154 const struct util_format_description *format_desc,
2155 unsigned n,
2156 LLVMValueRef base_ptr,
2157 LLVMValueRef offset,
2158 LLVMValueRef i,
2159 LLVMValueRef j,
2160 LLVMValueRef cache)
2161 {
2162 LLVMValueRef rgba;
2163 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2164 LLVMBuilderRef builder = gallivm->builder;
2165
2166 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
2167 assert(format_desc->block.width == 4);
2168 assert(format_desc->block.height == 4);
2169
2170 assert((n == 1) || (n % 4 == 0));
2171
2172 /* debug_printf("format = %d\n", format_desc->format);*/
2173 if (cache) {
2174 rgba = compressed_fetch_cached(gallivm, format_desc, n,
2175 base_ptr, offset, i, j, cache);
2176 return rgba;
2177 }
2178
2179 if (n > 4) {
2180 unsigned count;
2181 LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
2182 LLVMTypeRef i128_type = LLVMIntTypeInContext(gallivm->context, 128);
2183 LLVMTypeRef i128_vectype = LLVMVectorType(i128_type, n / 4);
2184 LLVMTypeRef i324_vectype = LLVMVectorType(LLVMInt32TypeInContext(
2185 gallivm->context), 4);
2186 LLVMValueRef offset4, i4, j4, rgba4[LP_MAX_VECTOR_LENGTH/16];
2187 struct lp_type lp_324_vectype = lp_type_uint_vec(32, 128);
2188
2189 assert(n / 4 <= ARRAY_SIZE(rgba4));
2190
2191 rgba = LLVMGetUndef(i128_vectype);
2192
2193 for (count = 0; count < n / 4; count++) {
2194 LLVMValueRef colors, codewords, alpha_lo = NULL, alpha_hi = NULL;
2195
2196 i4 = lp_build_extract_range(gallivm, i, count * 4, 4);
2197 j4 = lp_build_extract_range(gallivm, j, count * 4, 4);
2198 offset4 = lp_build_extract_range(gallivm, offset, count * 4, 4);
2199
2200 lp_build_gather_s3tc(gallivm, 4, format_desc, &colors, &codewords,
2201 &alpha_lo, &alpha_hi, base_ptr, offset4);
2202
2203 switch (format_desc->format) {
2204 case PIPE_FORMAT_DXT1_RGB:
2205 case PIPE_FORMAT_DXT1_RGBA:
2206 case PIPE_FORMAT_DXT1_SRGB:
2207 case PIPE_FORMAT_DXT1_SRGBA:
2208 rgba4[count] = s3tc_dxt1_to_rgba_aos(gallivm, 4, format_desc->format,
2209 colors, codewords, i4, j4);
2210 break;
2211 case PIPE_FORMAT_DXT3_RGBA:
2212 case PIPE_FORMAT_DXT3_SRGBA:
2213 rgba4[count] = s3tc_dxt3_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2214 codewords, alpha_lo, alpha_hi, i4, j4);
2215 break;
2216 case PIPE_FORMAT_DXT5_RGBA:
2217 case PIPE_FORMAT_DXT5_SRGBA:
2218 rgba4[count] = s3tc_dxt5_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2219 codewords, alpha_lo, alpha_hi, i4, j4);
2220 break;
2221 default:
2222 assert(0);
2223 rgba4[count] = LLVMGetUndef(LLVMVectorType(i8t, 4));
2224 break;
2225 }
2226 /* shuffles typically give best results with dword elements...*/
2227 rgba4[count] = LLVMBuildBitCast(builder, rgba4[count], i324_vectype, "");
2228 }
2229 rgba = lp_build_concat(gallivm, rgba4, lp_324_vectype, n / 4);
2230 rgba = LLVMBuildBitCast(builder, rgba, i8_vectype, "");
2231 }
2232 else {
2233 LLVMValueRef colors, codewords, alpha_lo = NULL, alpha_hi = NULL;
2234
2235 lp_build_gather_s3tc(gallivm, n, format_desc, &colors, &codewords,
2236 &alpha_lo, &alpha_hi, base_ptr, offset);
2237
2238 switch (format_desc->format) {
2239 case PIPE_FORMAT_DXT1_RGB:
2240 case PIPE_FORMAT_DXT1_RGBA:
2241 case PIPE_FORMAT_DXT1_SRGB:
2242 case PIPE_FORMAT_DXT1_SRGBA:
2243 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format_desc->format,
2244 colors, codewords, i, j);
2245 break;
2246 case PIPE_FORMAT_DXT3_RGBA:
2247 case PIPE_FORMAT_DXT3_SRGBA:
2248 rgba = s3tc_dxt3_to_rgba_aos(gallivm, n, format_desc->format, colors,
2249 codewords, alpha_lo, alpha_hi, i, j);
2250 break;
2251 case PIPE_FORMAT_DXT5_RGBA:
2252 case PIPE_FORMAT_DXT5_SRGBA:
2253 rgba = s3tc_dxt5_to_rgba_aos(gallivm, n, format_desc->format, colors,
2254 codewords, alpha_lo, alpha_hi, i, j);
2255 break;
2256 default:
2257 assert(0);
2258 rgba = LLVMGetUndef(LLVMVectorType(i8t, 4*n));
2259 break;
2260 }
2261 }
2262
2263 /* always return just decompressed values - srgb conversion is done later */
2264
2265 return rgba;
2266 }