2949494c1cca24ca10b8f0b13ebd84d42b63c96e
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_format_s3tc.c
1 /**************************************************************************
2 *
3 * Copyright 2010-2018 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 * USE OR OTHER DEALINGS IN THE SOFTWARE.
21 *
22 * The above copyright notice and this permission notice (including the
23 * next paragraph) shall be included in all copies or substantial portions
24 * of the Software.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * s3tc pixel format manipulation.
32 *
33 * @author Roland Scheidegger <sroland@vmware.com>
34 */
35
36
37 #include <llvm/Config/llvm-config.h>
38
39 #include "util/u_format.h"
40 #include "util/u_math.h"
41 #include "util/u_string.h"
42 #include "util/u_cpu_detect.h"
43 #include "util/u_debug.h"
44
45 #include "lp_bld_arit.h"
46 #include "lp_bld_type.h"
47 #include "lp_bld_const.h"
48 #include "lp_bld_conv.h"
49 #include "lp_bld_gather.h"
50 #include "lp_bld_format.h"
51 #include "lp_bld_logic.h"
52 #include "lp_bld_pack.h"
53 #include "lp_bld_flow.h"
54 #include "lp_bld_printf.h"
55 #include "lp_bld_struct.h"
56 #include "lp_bld_swizzle.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_debug.h"
59 #include "lp_bld_intr.h"
60
61
62 /**
63 * Reverse an interleave2_half
64 * (ie. pick every second element, independent lower/upper halfs)
65 * sse2 can only do that with 32bit (shufps) or larger elements
66 * natively. (Otherwise, and/pack (even) or shift/pack (odd)
67 * could be used, ideally llvm would do that for us.)
68 * XXX: Unfortunately, this does NOT translate to a shufps if those
69 * are int vectors (and casting will not help, llvm needs to recognize it
70 * as "real" float). Instead, llvm will use a pshufd/pshufd/punpcklqdq
71 * sequence which I'm pretty sure is a lot worse despite domain transition
72 * penalties with shufps (except maybe on Nehalem).
73 */
74 static LLVMValueRef
75 lp_build_uninterleave2_half(struct gallivm_state *gallivm,
76 struct lp_type type,
77 LLVMValueRef a,
78 LLVMValueRef b,
79 unsigned lo_hi)
80 {
81 LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
82 unsigned i;
83
84 assert(type.length <= LP_MAX_VECTOR_LENGTH);
85 assert(lo_hi < 2);
86
87 if (type.length * type.width == 256) {
88 assert(type.length == 8);
89 assert(type.width == 32);
90 static const unsigned shufvals[8] = {0, 2, 8, 10, 4, 6, 12, 14};
91 for (i = 0; i < type.length; ++i) {
92 elems[i] = lp_build_const_int32(gallivm, shufvals[i] + lo_hi);
93 }
94 } else {
95 for (i = 0; i < type.length; ++i) {
96 elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
97 }
98 }
99
100 shuffle = LLVMConstVector(elems, type.length);
101
102 return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
103
104 }
105
106
107 /**
108 * Build shuffle for extending vectors.
109 */
110 static LLVMValueRef
111 lp_build_const_extend_shuffle(struct gallivm_state *gallivm,
112 unsigned n, unsigned length)
113 {
114 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
115 unsigned i;
116
117 assert(n <= length);
118 assert(length <= LP_MAX_VECTOR_LENGTH);
119
120 /* TODO: cache results in a static table */
121
122 for(i = 0; i < n; i++) {
123 elems[i] = lp_build_const_int32(gallivm, i);
124 }
125 for (i = n; i < length; i++) {
126 elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
127 }
128
129 return LLVMConstVector(elems, length);
130 }
131
132 static LLVMValueRef
133 lp_build_const_unpackx2_shuffle(struct gallivm_state *gallivm, unsigned n)
134 {
135 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
136 unsigned i, j;
137
138 assert(n <= LP_MAX_VECTOR_LENGTH);
139
140 /* TODO: cache results in a static table */
141
142 for(i = 0, j = 0; i < n; i += 2, ++j) {
143 elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
144 elems[i + 1] = lp_build_const_int32(gallivm, n + j);
145 elems[n + i + 0] = lp_build_const_int32(gallivm, 0 + n/2 + j);
146 elems[n + i + 1] = lp_build_const_int32(gallivm, n + n/2 + j);
147 }
148
149 return LLVMConstVector(elems, n * 2);
150 }
151
152 /*
153 * broadcast 1 element to all elements
154 */
155 static LLVMValueRef
156 lp_build_const_shuffle1(struct gallivm_state *gallivm,
157 unsigned index, unsigned n)
158 {
159 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
160 unsigned i;
161
162 assert(n <= LP_MAX_VECTOR_LENGTH);
163
164 /* TODO: cache results in a static table */
165
166 for (i = 0; i < n; i++) {
167 elems[i] = lp_build_const_int32(gallivm, index);
168 }
169
170 return LLVMConstVector(elems, n);
171 }
172
173 /*
174 * move 1 element to pos 0, rest undef
175 */
176 static LLVMValueRef
177 lp_build_shuffle1undef(struct gallivm_state *gallivm,
178 LLVMValueRef a, unsigned index, unsigned n)
179 {
180 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH], shuf;
181 unsigned i;
182
183 assert(n <= LP_MAX_VECTOR_LENGTH);
184
185 elems[0] = lp_build_const_int32(gallivm, index);
186
187 for (i = 1; i < n; i++) {
188 elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
189 }
190 shuf = LLVMConstVector(elems, n);
191
192 return LLVMBuildShuffleVector(gallivm->builder, a, a, shuf, "");
193 }
194
195 static boolean
196 format_dxt1_variant(enum pipe_format format)
197 {
198 return format == PIPE_FORMAT_DXT1_RGB ||
199 format == PIPE_FORMAT_DXT1_RGBA ||
200 format == PIPE_FORMAT_DXT1_SRGB ||
201 format == PIPE_FORMAT_DXT1_SRGBA;
202
203 }
204
/**
 * Gather elements from scatter positions in memory into vectors.
 * This is customised for fetching texels from s3tc textures.
 * For SSE, typical value is length=4.
 *
 * Each fetched block (64 bits for dxt1, 128 bits for dxt3/5) is split into
 * its 32-bit words and transposed so that each output vector holds one word
 * per block.
 *
 * @param length length of the offsets (number of blocks fetched)
 * @param colors the stored colors of the blocks will be extracted into this.
 * @param codewords the codewords of the blocks will be extracted into this.
 * @param alpha_lo used for storing lower 32bit of alpha components for dxt3/5
 * @param alpha_hi used for storing higher 32bit of alpha components for dxt3/5
 * @param base_ptr base pointer, should be a i8 pointer type.
 * @param offsets vector with offsets
 */
static void
lp_build_gather_s3tc(struct gallivm_state *gallivm,
                     unsigned length,
                     const struct util_format_description *format_desc,
                     LLVMValueRef *colors,
                     LLVMValueRef *codewords,
                     LLVMValueRef *alpha_lo,
                     LLVMValueRef *alpha_hi,
                     LLVMValueRef base_ptr,
                     LLVMValueRef offsets)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned block_bits = format_desc->block.bits;
   unsigned i;
   LLVMValueRef elems[8];
   LLVMTypeRef type32 = LLVMInt32TypeInContext(gallivm->context);
   LLVMTypeRef type64 = LLVMInt64TypeInContext(gallivm->context);
   LLVMTypeRef type32dxt;
   struct lp_type lp_type32dxt;

   /* a block viewed as a vector of 32-bit words (2 for dxt1, 4 for dxt3/5) */
   memset(&lp_type32dxt, 0, sizeof lp_type32dxt);
   lp_type32dxt.width = 32;
   lp_type32dxt.length = block_bits / 32;
   type32dxt = lp_build_vec_type(gallivm, lp_type32dxt);

   assert(block_bits == 64 || block_bits == 128);
   assert(length == 1 || length == 4 || length == 8);

   /* fetch each block as one wide integer, then view it as 32-bit words */
   for (i = 0; i < length; ++i) {
      elems[i] = lp_build_gather_elem(gallivm, length,
                                      block_bits, block_bits, TRUE,
                                      base_ptr, offsets, i, FALSE);
      elems[i] = LLVMBuildBitCast(builder, elems[i], type32dxt, "");
   }
   if (length == 1) {
      /* scalar path: just pull the words straight out of the one block */
      LLVMValueRef elem = elems[0];
      if (block_bits == 128) {
         *alpha_lo = LLVMBuildExtractElement(builder, elem,
                                             lp_build_const_int32(gallivm, 0), "");
         *alpha_hi = LLVMBuildExtractElement(builder, elem,
                                             lp_build_const_int32(gallivm, 1), "");
         *colors = LLVMBuildExtractElement(builder, elem,
                                           lp_build_const_int32(gallivm, 2), "");
         *codewords = LLVMBuildExtractElement(builder, elem,
                                              lp_build_const_int32(gallivm, 3), "");
      }
      else {
         /* dxt1 has no alpha words */
         *alpha_lo = LLVMGetUndef(type32);
         *alpha_hi = LLVMGetUndef(type32);
         *colors = LLVMBuildExtractElement(builder, elem,
                                           lp_build_const_int32(gallivm, 0), "");
         *codewords = LLVMBuildExtractElement(builder, elem,
                                              lp_build_const_int32(gallivm, 1), "");
      }
   }
   else {
      LLVMValueRef tmp[4], cc01, cc23;
      struct lp_type lp_type32, lp_type64;
      memset(&lp_type32, 0, sizeof lp_type32);
      lp_type32.width = 32;
      lp_type32.length = length;
      memset(&lp_type64, 0, sizeof lp_type64);
      lp_type64.width = 64;
      lp_type64.length = length/2;

      if (block_bits == 128) {
         if (length == 8) {
            /* pair up blocks i and i+4 into 8-wide vectors first */
            for (i = 0; i < 4; ++i) {
               tmp[0] = elems[i];
               tmp[1] = elems[i+4];
               elems[i] = lp_build_concat(gallivm, tmp, lp_type32dxt, 2);
            }
         }
         /* transpose: word k of every block ends up in output vector k */
         lp_build_transpose_aos(gallivm, lp_type32, elems, tmp);
         *colors = tmp[2];
         *codewords = tmp[3];
         *alpha_lo = tmp[0];
         *alpha_hi = tmp[1];
      } else {
         LLVMTypeRef type64_vec = LLVMVectorType(type64, length/2);
         LLVMTypeRef type32_vec = LLVMVectorType(type32, length);

         for (i = 0; i < length; ++i) {
            /* no-op shuffle */
            elems[i] = LLVMBuildShuffleVector(builder, elems[i],
                                              LLVMGetUndef(type32dxt),
                                              lp_build_const_extend_shuffle(gallivm, 2, 4), "");
         }
         if (length == 8) {
            struct lp_type lp_type32_4 = {0};
            lp_type32_4.width = 32;
            lp_type32_4.length = 4;
            for (i = 0; i < 4; ++i) {
               tmp[0] = elems[i];
               tmp[1] = elems[i+4];
               elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2);
            }
         }
         /* 2x2 transpose via interleaves on 64-bit (color+codeword) units */
         cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
         cc23 = lp_build_interleave2_half(gallivm, lp_type32, elems[2], elems[3], 0);
         cc01 = LLVMBuildBitCast(builder, cc01, type64_vec, "");
         cc23 = LLVMBuildBitCast(builder, cc23, type64_vec, "");
         *colors = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 0);
         *codewords = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 1);
         *colors = LLVMBuildBitCast(builder, *colors, type32_vec, "");
         *codewords = LLVMBuildBitCast(builder, *codewords, type32_vec, "");
      }
   }
}
327
/** Convert from <n x i32> containing 2 x n rgb565 colors
 * to 2 <n x i32> rgba8888 colors
 * This is the most optimized version I can think of
 * should be nearly as fast as decoding only one color
 * NOTE: alpha channel will be set to 0
 * @param colors is a <n x i32> vector containing the rgb565 colors
 * @param color0 receives the expanded first color of each pair
 * @param color1 receives the expanded second color of each pair
 */
static void
color_expand2_565_to_8888(struct gallivm_state *gallivm,
                          unsigned n,
                          LLVMValueRef colors,
                          LLVMValueRef *color0,
                          LLVMValueRef *color1)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef r, g, b, rblo, glo;
   LLVMValueRef rgblomask, rb, rgb0, rgb1;
   struct lp_type type, type16, type8;

   assert(n > 1);

   /* view the same data as n x i32, 2n x i16, and 4n x i8 */
   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   memset(&type16, 0, sizeof type16);
   type16.width = 16;
   type16.length = 2 * n;

   memset(&type8, 0, sizeof type8);
   type8.width = 8;
   type8.length = 4 * n;

   rgblomask = lp_build_const_int_vec(gallivm, type16, 0x0707);
   /* operate on each 16-bit 565 color individually */
   colors = LLVMBuildBitCast(builder, colors,
                             lp_build_vec_type(gallivm, type16), "");
   /* move r into low 8 bits, b into high 8 bits, g into another reg (low bits)
    * make sure low bits of r are zero - could use AND but requires constant */
   r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
   r = LLVMBuildShl(builder, r, lp_build_const_int_vec(gallivm, type16, 3), "");
   b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
   rb = LLVMBuildOr(builder, r, b, "");
   /* replicate the 3 top bits of r and b into their low bits */
   rblo = LLVMBuildLShr(builder, rb, lp_build_const_int_vec(gallivm, type16, 5), "");
   /* don't have byte shift hence need mask */
   rblo = LLVMBuildAnd(builder, rblo, rgblomask, "");
   rb = LLVMBuildOr(builder, rb, rblo, "");

   /* make sure low bits of g are zero */
   g = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type16, 0x07e0), "");
   g = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 3), "");
   /* replicate the 2 top bits of g into its low bits */
   glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 6), "");
   g = LLVMBuildOr(builder, g, glo, "");

   /* interleave r/b bytes with g/0 bytes -> r g b 0 per pixel */
   rb = LLVMBuildBitCast(builder, rb, lp_build_vec_type(gallivm, type8), "");
   g = LLVMBuildBitCast(builder, g, lp_build_vec_type(gallivm, type8), "");
   rgb0 = lp_build_interleave2_half(gallivm, type8, rb, g, 0);
   rgb1 = lp_build_interleave2_half(gallivm, type8, rb, g, 1);

   rgb0 = LLVMBuildBitCast(builder, rgb0, lp_build_vec_type(gallivm, type), "");
   rgb1 = LLVMBuildBitCast(builder, rgb1, lp_build_vec_type(gallivm, type), "");

   /* rgb0 is rgb00, rgb01, rgb10, rgb11
    * instead of rgb00, rgb10, rgb20, rgb30 hence need reshuffle
    * on x86 this _should_ just generate one shufps...
    */
   *color0 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 0);
   *color1 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 1);
}
396
397
/** Convert from <n x i32> containing rgb565 colors
 * (in first 16 bits) to <n x i32> rgba8888 colors
 * bits 16-31 MBZ
 * NOTE: alpha channel will be set to 0
 * @param colors is a <n x i32> vector containing the rgb565 colors
 * @return a <n x i32> vector with the expanded rgba8888 colors (alpha 0)
 */
static LLVMValueRef
color_expand_565_to_8888(struct gallivm_state *gallivm,
                         unsigned n,
                         LLVMValueRef colors)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef rgba, r, g, b, rgblo, glo;
   LLVMValueRef rbhimask, g6mask, rgblomask;
   struct lp_type type;
   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   /* color expansion:
    * first extract and shift colors into their final locations
    * (high bits - low bits zero at this point)
    * then replicate highest bits to the lowest bits
    * note rb replication can be done in parallel but not g
    * (different shift)
    * r5mask = 0xf800, g6mask = 0x07e0, b5mask = 0x001f
    * rhigh = 8, ghigh = 5, bhigh = 19
    * rblow = 5, glow = 6
    * rgblowmask = 0x00070307
    * r = colors >> rhigh
    * b = colors << bhigh
    * g = (colors & g6mask) << ghigh
    * rb = (r | b) rbhimask
    * rbtmp = rb >> rblow
    * gtmp = rb >> glow
    * rbtmp = rbtmp | gtmp
    * rbtmp = rbtmp & rgblowmask
    * rgb = rb | g | rbtmp
    */
   g6mask = lp_build_const_int_vec(gallivm, type, 0x07e0);
   rbhimask = lp_build_const_int_vec(gallivm, type, 0x00f800f8);
   rgblomask = lp_build_const_int_vec(gallivm, type, 0x00070307);

   r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 8), "");
   b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type, 19), "");
   g = LLVMBuildAnd(builder, colors, g6mask, "");
   g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 5), "");
   rgba = LLVMBuildOr(builder, r, b, "");
   rgba = LLVMBuildAnd(builder, rgba, rbhimask, "");
   /* replicate top bits into the low bits of each channel */
   rgblo = LLVMBuildLShr(builder, rgba, lp_build_const_int_vec(gallivm, type, 5), "");
   glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type, 6), "");
   rgblo = LLVMBuildOr(builder, rgblo, glo, "");
   rgblo = LLVMBuildAnd(builder, rgblo, rgblomask, "");
   rgba = LLVMBuildOr(builder, rgba, g, "");
   rgba = LLVMBuildOr(builder, rgba, rgblo, "");

   return rgba;
}
456
457
/*
 * Average two byte vectors. (Will always round up.)
 * Matches the semantics of the x86 pavgb instruction:
 * res = (v0 + v1 + 1) >> 1, per unsigned byte.
 */
static LLVMValueRef
lp_build_pavgb(struct lp_build_context *bld8,
               LLVMValueRef v0,
               LLVMValueRef v1)
{
   struct gallivm_state *gallivm = bld8->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   assert(bld8->type.width == 8);
   assert(bld8->type.length == 16 || bld8->type.length == 32);
   if (LLVM_VERSION_MAJOR < 6) {
      /* old llvm: call the sse2/avx2 intrinsic directly */
      LLVMValueRef intrargs[2];
      char *intr_name = bld8->type.length == 32 ? "llvm.x86.avx2.pavg.b" :
                                                  "llvm.x86.sse2.pavg.b";
      intrargs[0] = v0;
      intrargs[1] = v1;
      return lp_build_intrinsic(builder, intr_name,
                                bld8->vec_type, intrargs, 2, 0);
   } else {
      /*
       * Must match llvm's autoupgrade of pavg.b intrinsic to be useful.
       * You better hope the backend code manages to detect the pattern, and
       * the pattern doesn't change there...
       */
      struct lp_type type_ext = bld8->type;
      LLVMTypeRef vec_type_ext;
      LLVMValueRef res;
      LLVMValueRef ext_one;
      type_ext.width = 16;
      vec_type_ext = lp_build_vec_type(gallivm, type_ext);
      ext_one = lp_build_const_vec(gallivm, type_ext, 1);

      /* widen to 16 bits so v0 + v1 + 1 cannot overflow */
      v0 = LLVMBuildZExt(builder, v0, vec_type_ext, "");
      v1 = LLVMBuildZExt(builder, v1, vec_type_ext, "");
      res = LLVMBuildAdd(builder, v0, v1, "");
      res = LLVMBuildAdd(builder, res, ext_one, "");
      res = LLVMBuildLShr(builder, res, ext_one, "");
      res = LLVMBuildTrunc(builder, res, bld8->vec_type, "");
      return res;
   }
}
501
/**
 * Calculate 1/3(v1-v0) + v0
 * and 2*1/3(v1-v0) + v0
 * i.e. the two interpolated s3tc colors, on vectors of unsigned 8-bit
 * values (fixed-point: weight 255/3 applied in widened 16-bit math).
 *
 * @param v0    first endpoint (<4n x i8>)
 * @param v1    second endpoint (<4n x i8>)
 * @param res0  receives v0 + (v1-v0)/3
 * @param res1  receives v0 + 2*(v1-v0)/3
 */
static void
lp_build_lerp23(struct lp_build_context *bld,
                LLVMValueRef v0,
                LLVMValueRef v1,
                LLVMValueRef *res0,
                LLVMValueRef *res1)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMValueRef x, x_lo, x_hi, delta_lo, delta_hi;
   LLVMValueRef mul_lo, mul_hi, v0_lo, v0_hi, v1_lo, v1_hi, tmp;
   const struct lp_type type = bld->type;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_type i16_type = lp_wider_type(type);
   struct lp_build_context bld2;

   assert(lp_check_value(type, v0));
   assert(lp_check_value(type, v1));
   assert(!type.floating && !type.fixed && !type.norm && type.width == 8);

   /* signed 16-bit math: delta may be negative */
   lp_build_context_init(&bld2, gallivm, i16_type);
   bld2.type.sign = TRUE;
   x = lp_build_const_int_vec(gallivm, bld->type, 255*1/3);

   /* FIXME: use native avx256 unpack/pack */
   lp_build_unpack2(gallivm, type, i16_type, x, &x_lo, &x_hi);
   lp_build_unpack2(gallivm, type, i16_type, v0, &v0_lo, &v0_hi);
   lp_build_unpack2(gallivm, type, i16_type, v1, &v1_lo, &v1_hi);
   delta_lo = lp_build_sub(&bld2, v1_lo, v0_lo);
   delta_hi = lp_build_sub(&bld2, v1_hi, v0_hi);

   /* (255/3) * delta, then >> 8 ~= delta/3 */
   mul_lo = LLVMBuildMul(builder, x_lo, delta_lo, "");
   mul_hi = LLVMBuildMul(builder, x_hi, delta_hi, "");

   x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 8), "");
   x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 8), "");
   /* lerp optimization: pack now, do add afterwards */
   tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
   *res0 = lp_build_add(bld, tmp, v0);

   /* >> 7 instead of >> 8 doubles the weight: ~= 2*delta/3 */
   x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 7), "");
   x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 7), "");
   /* unlike above still need mask (but add still afterwards). */
   x_lo = LLVMBuildAnd(builder, x_lo, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
   x_hi = LLVMBuildAnd(builder, x_hi, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
   tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
   *res1 = lp_build_add(bld, tmp, v0);
}
553
/**
 * Convert from <n x i64> s3tc dxt1 to <4n x i8> RGBA AoS
 * @param colors is a <n x i32> vector with n x 2x16bit colors
 * @param codewords is a <n x i32> vector containing the codewords
 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
 * @return <4n x i8> RGBA texels
 */
static LLVMValueRef
s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm,
                           unsigned n,
                           enum pipe_format format,
                           LLVMValueRef colors,
                           LLVMValueRef codewords,
                           LLVMValueRef i,
                           LLVMValueRef j)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef color0, color1, color2, color3, color2_2, color3_2;
   LLVMValueRef rgba, a, colors0, colors1, col0, col1, const2;
   LLVMValueRef bit_pos, sel_mask, sel_lo, sel_hi, indices;
   struct lp_type type, type8;
   struct lp_build_context bld8, bld32;
   boolean is_dxt1_variant = format_dxt1_variant(format);

   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   memset(&type8, 0, sizeof type8);
   type8.width = 8;
   type8.length = 4*n;

   assert(lp_check_value(type, i));
   assert(lp_check_value(type, j));

   /* alpha = 0xff in rgba8888 (little endian byte order) */
   a = lp_build_const_int_vec(gallivm, type, 0xff000000);

   lp_build_context_init(&bld32, gallivm, type);
   lp_build_context_init(&bld8, gallivm, type8);

   /*
    * works as follows:
    * - expand color0/color1 to rgba8888
    * - calculate color2/3 (interpolation) according to color0 < color1 rules
    * - calculate color2/3 according to color0 >= color1 rules
    * - do selection of color2/3 according to comparison of color0/1
    * - extract indices (vector shift).
    * - use compare/select to select the correct color. Since we have 2bit
    *   indices (and 4 colors), needs at least three compare/selects.
    */
   /*
    * expand the two colors
    */
   col0 = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type, 0x0000ffff), "");
   col1 = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 16), "");
   if (n > 1) {
      /* expand both packed colors at once */
      color_expand2_565_to_8888(gallivm, n, colors, &color0, &color1);
   }
   else {
      color0 = color_expand_565_to_8888(gallivm, n, col0);
      color1 = color_expand_565_to_8888(gallivm, n, col1);
   }

   /*
    * interpolate colors
    * color2_1 is 2/3 color0 + 1/3 color1
    * color3_1 is 1/3 color0 + 2/3 color1
    * color2_2 is 1/2 color0 + 1/2 color1
    * color3_2 is 0
    */

   colors0 = LLVMBuildBitCast(builder, color0, bld8.vec_type, "");
   colors1 = LLVMBuildBitCast(builder, color1, bld8.vec_type, "");
   /* can combine 2 lerps into one mostly - still looks expensive enough. */
   lp_build_lerp23(&bld8, colors0, colors1, &color2, &color3);
   color2 = LLVMBuildBitCast(builder, color2, bld32.vec_type, "");
   color3 = LLVMBuildBitCast(builder, color3, bld32.vec_type, "");

   /* dxt3/5 always use 4-color encoding */
   if (is_dxt1_variant) {
      /* fix up alpha */
      if (format == PIPE_FORMAT_DXT1_RGBA ||
          format == PIPE_FORMAT_DXT1_SRGBA) {
         color0 = LLVMBuildOr(builder, color0, a, "");
         color1 = LLVMBuildOr(builder, color1, a, "");
         color3 = LLVMBuildOr(builder, color3, a, "");
      }
      /*
       * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1.
       * Much cheaper (but we don't care that much if n == 1).
       */
      if ((util_cpu_caps.has_sse2 && n == 4) ||
          (util_cpu_caps.has_avx2 && n == 8)) {
         color2_2 = lp_build_pavgb(&bld8, colors0, colors1);
         color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
      }
      else {
         /* generic path: average in widened 16-bit math */
         struct lp_type i16_type = lp_wider_type(type8);
         struct lp_build_context bld2;
         LLVMValueRef v0_lo, v0_hi, v1_lo, v1_hi, addlo, addhi;

         lp_build_context_init(&bld2, gallivm, i16_type);
         bld2.type.sign = TRUE;

         /*
          * This isn't as expensive as it looks (the unpack is the same as
          * for lerp23), with correct rounding.
          * (Note that while rounding is correct, this will always round down,
          * whereas pavgb will always round up.)
          */
         /* FIXME: use native avx256 unpack/pack */
         lp_build_unpack2(gallivm, type8, i16_type, colors0, &v0_lo, &v0_hi);
         lp_build_unpack2(gallivm, type8, i16_type, colors1, &v1_lo, &v1_hi);

         addlo = lp_build_add(&bld2, v0_lo, v1_lo);
         addhi = lp_build_add(&bld2, v0_hi, v1_hi);
         addlo = LLVMBuildLShr(builder, addlo,
                               lp_build_const_int_vec(gallivm, i16_type, 1), "");
         addhi = LLVMBuildLShr(builder, addhi,
                               lp_build_const_int_vec(gallivm, i16_type, 1), "");
         color2_2 = lp_build_pack2(gallivm, i16_type, type8, addlo, addhi);
         color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
      }
      color3_2 = lp_build_const_int_vec(gallivm, type, 0);

      /* select between colors2/3 */
      /* signed compare is faster saves some xors */
      type.sign = TRUE;
      sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, col0, col1);
      color2 = lp_build_select(&bld32, sel_mask, color2, color2_2);
      color3 = lp_build_select(&bld32, sel_mask, color3, color3_2);
      type.sign = FALSE;

      if (format == PIPE_FORMAT_DXT1_RGBA ||
          format == PIPE_FORMAT_DXT1_SRGBA) {
         color2 = LLVMBuildOr(builder, color2, a, "");
      }
   }

   const2 = lp_build_const_int_vec(gallivm, type, 2);
   /* extract 2-bit index values: bit_pos = 2 * (4*j + i) */
   bit_pos = LLVMBuildShl(builder, j, const2, "");
   bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
   bit_pos = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
   /*
    * NOTE: This innocent looking shift is very expensive with x86/ssex.
    * Shifts with per-elemnent shift count get roughly translated to
    * extract (count), extract (value), shift, move (back to xmm), unpack
    * per element!
    * So about 20 instructions here for 4xi32.
    * Newer llvm versions (3.7+) will not do extract/insert but use a
    * a couple constant count vector shifts plus shuffles. About same
    * amount of instructions unfortunately...
    * Would get much worse with 8xi16 even...
    * We could actually do better here:
    * - subtract bit_pos from 128+30, shl 23, convert float to int...
    * - now do mul with codewords followed by shr 30...
    * But requires 32bit->32bit mul, sse41 only (well that's emulatable
    * with 2 32bit->64bit muls...) and not exactly cheap
    * AVX2, of course, fixes this nonsense.
    */
   indices = LLVMBuildLShr(builder, codewords, bit_pos, "");

   /* finally select the colors via the low/high bit of the 2-bit index */
   sel_lo = LLVMBuildAnd(builder, indices, bld32.one, "");
   sel_lo = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_lo, bld32.one);
   color0 = lp_build_select(&bld32, sel_lo, color1, color0);
   color2 = lp_build_select(&bld32, sel_lo, color3, color2);
   sel_hi = LLVMBuildAnd(builder, indices, const2, "");
   sel_hi = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_hi, const2);
   rgba = lp_build_select(&bld32, sel_hi, color2, color0);

   /* fix up alpha */
   if (format == PIPE_FORMAT_DXT1_RGB ||
       format == PIPE_FORMAT_DXT1_SRGB) {
      rgba = LLVMBuildOr(builder, rgba, a, "");
   }
   return LLVMBuildBitCast(builder, rgba, bld8.vec_type, "");
}
733
734
735 static LLVMValueRef
736 s3tc_dxt1_to_rgba_aos(struct gallivm_state *gallivm,
737 unsigned n,
738 enum pipe_format format,
739 LLVMValueRef colors,
740 LLVMValueRef codewords,
741 LLVMValueRef i,
742 LLVMValueRef j)
743 {
744 return s3tc_dxt1_full_to_rgba_aos(gallivm, n, format,
745 colors, codewords, i, j);
746 }
747
748
/**
 * Convert from <n x i128> s3tc dxt3 to <4n x i8> RGBA AoS
 * @param colors is a <n x i32> vector with n x 2x16bit colors
 * @param codewords is a <n x i32> vector containing the codewords
 * @param alpha_low is a <n x i32> vector with the lower 32 alpha bits
 * @param alpha_hi is a <n x i32> vector with the upper 32 alpha bits
 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
 */
static LLVMValueRef
s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm,
                      unsigned n,
                      enum pipe_format format,
                      LLVMValueRef colors,
                      LLVMValueRef codewords,
                      LLVMValueRef alpha_low,
                      LLVMValueRef alpha_hi,
                      LLVMValueRef i,
                      LLVMValueRef j)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef rgba, tmp, tmp2;
   LLVMValueRef bit_pos, sel_mask;
   struct lp_type type, type8;
   struct lp_build_context bld;

   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   memset(&type8, 0, sizeof type8);
   type8.width = 8;
   type8.length = n*4;

   assert(lp_check_value(type, i));
   assert(lp_check_value(type, j));

   lp_build_context_init(&bld, gallivm, type);

   /* the color part of dxt3 is decoded exactly like dxt1 */
   rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
                                colors, codewords, i, j);

   rgba = LLVMBuildBitCast(builder, rgba, bld.vec_type, "");

   /*
    * Extract alpha values. Since we now need to select from
    * which 32bit vector values are fetched, construct selection
    * mask from highest bit of bit_pos, and use select, then shift
    * according to the bit_pos (without the highest bit).
    * Note this is pointless for n == 1 case. Could just
    * directly use 64bit arithmetic if we'd extract 64bit
    * alpha value instead of 2x32...
    */
   /* pos = 4*(4j+i) */
   bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
   bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
   bit_pos = LLVMBuildShl(builder, bit_pos,
                          lp_build_const_int_vec(gallivm, type, 2), "");
   /* sel_mask is all-ones when bit_pos < 32 (bit 5 clear) -> pick alpha_low */
   sel_mask = LLVMBuildLShr(builder, bit_pos,
                            lp_build_const_int_vec(gallivm, type, 5), "");
   sel_mask = LLVMBuildSub(builder, sel_mask, bld.one, "");
   tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi);
   /* clear bit 5: shift count within the selected 32-bit word */
   bit_pos = LLVMBuildAnd(builder, bit_pos,
                          lp_build_const_int_vec(gallivm, type, 0xffffffdf), "");
   /* Warning: slow shift with per element count (without avx2) */
   /*
    * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
    * to select the right byte with pshufb. Then for the remaining one bit
    * just do shift/select.
    */
   tmp = LLVMBuildLShr(builder, tmp, bit_pos, "");

   /* combined expand from a4 to a8 and shift into position */
   tmp = LLVMBuildShl(builder, tmp, lp_build_const_int_vec(gallivm, type, 28), "");
   tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(gallivm, type, 4), "");
   tmp = LLVMBuildOr(builder, tmp, tmp2, "");

   rgba = LLVMBuildOr(builder, tmp, rgba, "");

   return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
}
829
/*
 * Interpolate between alpha0 and alpha1 using dxt5-style weights:
 * sel_mask picks between the 7-step (1/7 increments) and 5-step (1/5)
 * alpha ramp. Result is only meaningful for codes in the interpolated
 * range (see note below). Presumably the caller handles the endpoint
 * and constant codes separately -- not visible from here.
 */
static LLVMValueRef
lp_build_lerpdxta(struct gallivm_state *gallivm,
                  LLVMValueRef alpha0,
                  LLVMValueRef alpha1,
                  LLVMValueRef code,
                  LLVMValueRef sel_mask,
                  unsigned n)
{
   /*
    * note we're doing lerp in 16bit since 32bit pmulld is only available in sse41
    * (plus pmullw is actually faster...)
    * we just pretend our 32bit values (which are really only 8bit) are 16bits.
    * Note that this is obviously a disaster for the scalar case.
    */
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef delta, ainterp;
   LLVMValueRef weight5, weight7, weight;
   struct lp_type type32, type16, type8;
   struct lp_build_context bld16;

   memset(&type32, 0, sizeof type32);
   type32.width = 32;
   type32.length = n;
   memset(&type16, 0, sizeof type16);
   type16.width = 16;
   type16.length = 2*n;
   type16.sign = TRUE;
   memset(&type8, 0, sizeof type8);
   type8.width = 8;
   type8.length = 4*n;

   lp_build_context_init(&bld16, gallivm, type16);
   /* 255/7 is a bit off - increase accuracy at the expense of shift later */
   sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
   weight5 = lp_build_const_int_vec(gallivm, type16, 255*64/5);
   weight7 = lp_build_const_int_vec(gallivm, type16, 255*64/7);
   weight = lp_build_select(&bld16, sel_mask, weight7, weight5);

   alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
   alpha1 = LLVMBuildBitCast(builder, alpha1, bld16.vec_type, "");
   code = LLVMBuildBitCast(builder, code, bld16.vec_type, "");
   /* we'll get garbage in the elements which had code 0 (or larger than 5 or 7)
      but we don't care */
   code = LLVMBuildSub(builder, code, bld16.one, "");

   /* weight = (code-1) * (255*64 / steps), then drop the extra 64 factor */
   weight = LLVMBuildMul(builder, weight, code, "");
   weight = LLVMBuildLShr(builder, weight,
                          lp_build_const_int_vec(gallivm, type16, 6), "");

   delta = LLVMBuildSub(builder, alpha1, alpha0, "");

   /* ainterp = delta * weight / 256 */
   ainterp = LLVMBuildMul(builder, delta, weight, "");
   ainterp = LLVMBuildLShr(builder, ainterp,
                           lp_build_const_int_vec(gallivm, type16, 8), "");

   /* add in 8-bit lanes, then return as <n x i32> view */
   ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type8), "");
   alpha0 = LLVMBuildBitCast(builder, alpha0, lp_build_vec_type(gallivm, type8), "");
   ainterp = LLVMBuildAdd(builder, alpha0, ainterp, "");
   ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type32), "");

   return ainterp;
}
892
893 /**
894 * Convert from <n x i128> s3tc dxt5 to <4n x i8> RGBA AoS
895 * @param colors is a <n x i32> vector with n x 2x16bit colors
896 * @param codewords is a <n x i32> vector containing the codewords
897 * @param alphas is a <n x i64> vector containing the alpha values
898 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
899 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
900 */
901 static LLVMValueRef
902 s3tc_dxt5_full_to_rgba_aos(struct gallivm_state *gallivm,
903 unsigned n,
904 enum pipe_format format,
905 LLVMValueRef colors,
906 LLVMValueRef codewords,
907 LLVMValueRef alpha_lo,
908 LLVMValueRef alpha_hi,
909 LLVMValueRef i,
910 LLVMValueRef j)
911 {
912 LLVMBuilderRef builder = gallivm->builder;
913 LLVMValueRef rgba, tmp, alpha0, alpha1, alphac, alphac0, bit_pos, shift;
914 LLVMValueRef sel_mask, tmp_mask, alpha, alpha64, code_s;
915 LLVMValueRef mask6, mask7, ainterp;
916 LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
917 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
918 struct lp_type type, type8;
919 struct lp_build_context bld32;
920
921 memset(&type, 0, sizeof type);
922 type.width = 32;
923 type.length = n;
924
925 memset(&type8, 0, sizeof type8);
926 type8.width = 8;
927 type8.length = n*4;
928
929 assert(lp_check_value(type, i));
930 assert(lp_check_value(type, j));
931
932 lp_build_context_init(&bld32, gallivm, type);
933
934 assert(lp_check_value(type, i));
935 assert(lp_check_value(type, j));
936
937 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
938 colors, codewords, i, j);
939
940 rgba = LLVMBuildBitCast(builder, rgba, bld32.vec_type, "");
941
942 /* this looks pretty complex for vectorization:
943 * extract a0/a1 values
944 * extract code
945 * select weights for interpolation depending on a0 > a1
946 * mul weights by code - 1
947 * lerp a0/a1/weights
948 * use selects for getting either a0, a1, interp a, interp a/0.0, interp a/1.0
949 */
950
951 alpha0 = LLVMBuildAnd(builder, alpha_lo,
952 lp_build_const_int_vec(gallivm, type, 0xff), "");
953 alpha1 = LLVMBuildLShr(builder, alpha_lo,
954 lp_build_const_int_vec(gallivm, type, 8), "");
955 alpha1 = LLVMBuildAnd(builder, alpha1,
956 lp_build_const_int_vec(gallivm, type, 0xff), "");
957
958 /* pos = 3*(4j+i) */
959 bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
960 bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
961 tmp = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
962 bit_pos = LLVMBuildAdd(builder, bit_pos, tmp, "");
963 /* get rid of first 2 bytes - saves shifts of alpha_lo/hi */
964 bit_pos = LLVMBuildAdd(builder, bit_pos,
965 lp_build_const_int_vec(gallivm, type, 16), "");
966
967 if (n == 1) {
968 struct lp_type type64;
969 memset(&type64, 0, sizeof type64);
970 type64.width = 64;
971 type64.length = 1;
972 /* This is pretty pointless could avoid by just directly extracting
973 64bit in the first place but makes it more complicated elsewhere */
974 alpha_lo = LLVMBuildZExt(builder, alpha_lo, i64t, "");
975 alpha_hi = LLVMBuildZExt(builder, alpha_hi, i64t, "");
976 alphac0 = LLVMBuildShl(builder, alpha_hi,
977 lp_build_const_int_vec(gallivm, type64, 32), "");
978 alphac0 = LLVMBuildOr(builder, alpha_lo, alphac0, "");
979
980 shift = LLVMBuildZExt(builder, bit_pos, i64t, "");
981 alphac0 = LLVMBuildLShr(builder, alphac0, shift, "");
982 alphac0 = LLVMBuildTrunc(builder, alphac0, i32t, "");
983 alphac = LLVMBuildAnd(builder, alphac0,
984 lp_build_const_int_vec(gallivm, type, 0x7), "");
985 }
986 else {
987 /*
988 * Using non-native vector length here (actually, with avx2 and
989 * n == 4 llvm will indeed expand to ymm regs...)
990 * At least newer llvm versions handle that ok.
991 * llvm 3.7+ will even handle the emulated 64bit shift with variable
992 * shift count without extraction (and it's actually easier to
993 * emulate than the 32bit one).
994 */
995 alpha64 = LLVMBuildShuffleVector(builder, alpha_lo, alpha_hi,
996 lp_build_const_unpackx2_shuffle(gallivm, n), "");
997
998 alpha64 = LLVMBuildBitCast(builder, alpha64, LLVMVectorType(i64t, n), "");
999 shift = LLVMBuildZExt(builder, bit_pos, LLVMVectorType(i64t, n), "");
1000 alphac = LLVMBuildLShr(builder, alpha64, shift, "");
1001 alphac = LLVMBuildTrunc(builder, alphac, bld32.vec_type, "");
1002
1003 alphac = LLVMBuildAnd(builder, alphac,
1004 lp_build_const_int_vec(gallivm, type, 0x7), "");
1005 }
1006
1007 /* signed compare is faster saves some xors */
1008 type.sign = TRUE;
1009 /* alpha0 > alpha1 selection */
1010 sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
1011 alpha0, alpha1);
1012 ainterp = lp_build_lerpdxta(gallivm, alpha0, alpha1, alphac, sel_mask, n);
1013
1014 /*
1015 * if a0 > a1 then we select a0 for case 0, a1 for case 1, interp otherwise.
1016 * else we select a0 for case 0, a1 for case 1,
1017 * interp for case 2-5, 00 for 6 and 0xff(ffffff) for 7
1018 * a = (c == 0) ? a0 : a1
1019 * a = (c > 1) ? ainterp : a
1020 * Finally handle case 6/7 for !(a0 > a1)
1021 * a = (!(a0 > a1) && c == 6) ? 0 : a (andnot with mask)
1022 * a = (!(a0 > a1) && c == 7) ? 0xffffffff : a (or with mask)
1023 */
1024 tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1025 alphac, bld32.zero);
1026 alpha = lp_build_select(&bld32, tmp_mask, alpha0, alpha1);
1027 tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
1028 alphac, bld32.one);
1029 alpha = lp_build_select(&bld32, tmp_mask, ainterp, alpha);
1030
1031 code_s = LLVMBuildAnd(builder, alphac,
1032 LLVMBuildNot(builder, sel_mask, ""), "");
1033 mask6 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1034 code_s, lp_build_const_int_vec(gallivm, type, 6));
1035 mask7 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1036 code_s, lp_build_const_int_vec(gallivm, type, 7));
1037 alpha = LLVMBuildAnd(builder, alpha, LLVMBuildNot(builder, mask6, ""), "");
1038 alpha = LLVMBuildOr(builder, alpha, mask7, "");
1039
1040 alpha = LLVMBuildShl(builder, alpha, lp_build_const_int_vec(gallivm, type, 24), "");
1041 rgba = LLVMBuildOr(builder, alpha, rgba, "");
1042
1043 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
1044 }
1045
1046
1047 static void
1048 lp_build_gather_s3tc_simple_scalar(struct gallivm_state *gallivm,
1049 const struct util_format_description *format_desc,
1050 LLVMValueRef *dxt_block,
1051 LLVMValueRef ptr)
1052 {
1053 LLVMBuilderRef builder = gallivm->builder;
1054 unsigned block_bits = format_desc->block.bits;
1055 LLVMValueRef elem, shuf;
1056 LLVMTypeRef type32 = LLVMIntTypeInContext(gallivm->context, 32);
1057 LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, block_bits);
1058 LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
1059 LLVMTypeRef type32_4 = LLVMVectorType(type32, 4);
1060
1061 assert(block_bits == 64 || block_bits == 128);
1062
1063 ptr = LLVMBuildBitCast(builder, ptr, src_ptr_type, "");
1064 elem = LLVMBuildLoad(builder, ptr, "");
1065
1066 if (block_bits == 128) {
1067 /* just return block as is */
1068 *dxt_block = LLVMBuildBitCast(builder, elem, type32_4, "");
1069 }
1070 else {
1071 LLVMTypeRef type32_2 = LLVMVectorType(type32, 2);
1072 shuf = lp_build_const_extend_shuffle(gallivm, 2, 4);
1073 elem = LLVMBuildBitCast(builder, elem, type32_2, "");
1074 *dxt_block = LLVMBuildShuffleVector(builder, elem,
1075 LLVMGetUndef(type32_2), shuf, "");
1076 }
1077 }
1078
1079
1080 static void
1081 s3tc_store_cached_block(struct gallivm_state *gallivm,
1082 LLVMValueRef *col,
1083 LLVMValueRef tag_value,
1084 LLVMValueRef hash_index,
1085 LLVMValueRef cache)
1086 {
1087 LLVMBuilderRef builder = gallivm->builder;
1088 LLVMValueRef ptr, indices[3];
1089 LLVMTypeRef type_ptr4x32;
1090 unsigned count;
1091
1092 type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
1093 indices[0] = lp_build_const_int32(gallivm, 0);
1094 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1095 indices[2] = hash_index;
1096 ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
1097 LLVMBuildStore(builder, tag_value, ptr);
1098
1099 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
1100 hash_index = LLVMBuildMul(builder, hash_index,
1101 lp_build_const_int32(gallivm, 16), "");
1102 for (count = 0; count < 4; count++) {
1103 indices[2] = hash_index;
1104 ptr = LLVMBuildGEP(builder, cache, indices, ARRAY_SIZE(indices), "");
1105 ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
1106 LLVMBuildStore(builder, col[count], ptr);
1107 hash_index = LLVMBuildAdd(builder, hash_index,
1108 lp_build_const_int32(gallivm, 4), "");
1109 }
1110 }
1111
1112 static LLVMValueRef
1113 s3tc_lookup_cached_pixel(struct gallivm_state *gallivm,
1114 LLVMValueRef ptr,
1115 LLVMValueRef index)
1116 {
1117 LLVMBuilderRef builder = gallivm->builder;
1118 LLVMValueRef member_ptr, indices[3];
1119
1120 indices[0] = lp_build_const_int32(gallivm, 0);
1121 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
1122 indices[2] = index;
1123 member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
1124 return LLVMBuildLoad(builder, member_ptr, "cache_data");
1125 }
1126
1127 static LLVMValueRef
1128 s3tc_lookup_tag_data(struct gallivm_state *gallivm,
1129 LLVMValueRef ptr,
1130 LLVMValueRef index)
1131 {
1132 LLVMBuilderRef builder = gallivm->builder;
1133 LLVMValueRef member_ptr, indices[3];
1134
1135 indices[0] = lp_build_const_int32(gallivm, 0);
1136 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1137 indices[2] = index;
1138 member_ptr = LLVMBuildGEP(builder, ptr, indices, ARRAY_SIZE(indices), "");
1139 return LLVMBuildLoad(builder, member_ptr, "tag_data");
1140 }
1141
#if LP_BUILD_FORMAT_CACHE_DEBUG
/*
 * Add count to one of the cache statistics counters (total accesses or
 * misses). Only compiled in when cache debugging is enabled.
 */
static void
s3tc_update_cache_access(struct gallivm_state *gallivm,
                         LLVMValueRef ptr,
                         unsigned count,
                         unsigned index)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef counter_ptr, counter;

   assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
          index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);

   /* read-modify-write the 64bit counter member */
   counter_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, "");
   counter = LLVMBuildLoad(builder, counter_ptr, "cache_access");
   counter = LLVMBuildAdd(builder, counter,
                          LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
                                       count, 0), "");
   LLVMBuildStore(builder, counter, counter_ptr);
}
#endif
1163
/**
 * Calculate 1/3(v1-v0) + v0 and 2*1/3(v1-v0) + v0.
 * The lerp is performed between the first 2 32bit colors
 * in the source vector, both results are returned packed in result vector.
 *
 * bld must be an 8bit integer context (asserted below); the two lerps are
 * computed in widened (16bit) precision and packed back down to 8bit.
 */
static LLVMValueRef
lp_build_lerp23_single(struct lp_build_context *bld,
                       LLVMValueRef v01)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMValueRef x, mul, delta, res, v0, v1, elems[8];
   const struct lp_type type = bld->type;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_type i16_type = lp_wider_type(type);
   struct lp_type i32_type = lp_wider_type(i16_type);
   struct lp_build_context bld2;

   assert(!type.floating && !type.fixed && !type.norm && type.width == 8);

   /* signed 16bit context for the sub/mul/shift arithmetic below */
   lp_build_context_init(&bld2, gallivm, i16_type);
   bld2.type.sign = TRUE;

   /* weights 256/3, 256*2/3, with correct rounding */
   /* low half of the weight vector computes the 1/3 lerp, high half the 2/3 */
   elems[0] = elems[1] = elems[2] = elems[3] =
      lp_build_const_elem(gallivm, i16_type, 255*1/3);
   elems[4] = elems[5] = elems[6] = elems[7] =
      lp_build_const_elem(gallivm, i16_type, 171);
   x = LLVMConstVector(elems, 8);

   /*
    * v01 has col0 in 32bit elem 0, col1 in elem 1.
    * Interleave/unpack will give us separate v0/v1 vectors.
    */
   v01 = lp_build_interleave2(gallivm, i32_type, v01, v01, 0);
   v01 = LLVMBuildBitCast(builder, v01, bld->vec_type, "");

   /* v0/v1 are the 8bit channels widened to 16bit */
   lp_build_unpack2(gallivm, type, i16_type, v01, &v0, &v1);
   delta = lp_build_sub(&bld2, v1, v0);

   /* weight * (v1 - v0), then drop the 8 fractional bits */
   mul = LLVMBuildMul(builder, x, delta, "");

   mul = LLVMBuildLShr(builder, mul, lp_build_const_int_vec(gallivm, i16_type, 8), "");
   /* lerp optimization: pack now, do add afterwards */
   res = lp_build_pack2(gallivm, i16_type, type, mul, bld2.undef);
   /* only lower 2 elems are valid - for these v0 is really v0 */
   return lp_build_add(bld, res, v01);
}
1211
/*
 * decode one dxt1 block.
 *
 * Decodes the two base colors plus the 2bit per-texel indices of a single
 * dxt1-style color block (also used for the color part of dxt3/dxt5 blocks)
 * and returns the 16 texels as four <4 x i32> rgba vectors in col[0..3].
 * For the non-ssse3 path, col[i] holds texels in interleaved order (see
 * comment near the bottom); callers must account for this in their lookup.
 */
static void
s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
                       enum pipe_format format,
                       LLVMValueRef dxt_block,
                       LLVMValueRef *col)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef color01, color23, color01_16, color0123;
   LLVMValueRef rgba, tmp, a, sel_mask, indices, code, const2;
   struct lp_type type8, type32, type16, type64;
   struct lp_build_context bld8, bld32, bld16, bld64;
   unsigned i;
   boolean is_dxt1_variant = format_dxt1_variant(format);

   memset(&type32, 0, sizeof type32);
   type32.width = 32;
   type32.length = 4;
   type32.sign = TRUE;

   memset(&type8, 0, sizeof type8);
   type8.width = 8;
   type8.length = 16;

   memset(&type16, 0, sizeof type16);
   type16.width = 16;
   type16.length = 8;

   memset(&type64, 0, sizeof type64);
   type64.width = 64;
   type64.length = 2;

   /* alpha = 0xff in the top byte; const2 used for index extraction */
   a = lp_build_const_int_vec(gallivm, type32, 0xff000000);
   const2 = lp_build_const_int_vec(gallivm, type32, 2);

   lp_build_context_init(&bld32, gallivm, type32);
   lp_build_context_init(&bld16, gallivm, type16);
   lp_build_context_init(&bld8, gallivm, type8);
   lp_build_context_init(&bld64, gallivm, type64);

   /* for dxt3/5 the color part sits in the second half of the block */
   if (is_dxt1_variant) {
      color01 = lp_build_shuffle1undef(gallivm, dxt_block, 0, 4);
      code = lp_build_shuffle1undef(gallivm, dxt_block, 1, 4);
   } else {
      color01 = lp_build_shuffle1undef(gallivm, dxt_block, 2, 4);
      code = lp_build_shuffle1undef(gallivm, dxt_block, 3, 4);
   }
   code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
   /* expand bytes to dwords */
   code = lp_build_interleave2(gallivm, type8, code, code, 0);
   code = lp_build_interleave2(gallivm, type8, code, code, 0);


   /*
    * works as follows:
    * - expand color0/color1 to rgba8888
    * - calculate color2/3 (interpolation) according to color0 < color1 rules
    * - calculate color2/3 according to color0 >= color1 rules
    * - do selection of color2/3 according to comparison of color0/1
    * - extract indices.
    * - use compare/select to select the correct color. Since we have 2bit
    *   indices (and 4 colors), needs at least three compare/selects.
    */

   /*
    * expand the two colors
    */
   color01 = LLVMBuildBitCast(builder, color01, bld16.vec_type, "");
   color01 = lp_build_interleave2(gallivm, type16, color01,
                                  bld16.zero, 0);
   color01_16 = LLVMBuildBitCast(builder, color01, bld32.vec_type, "");
   color01 = color_expand_565_to_8888(gallivm, 4, color01_16);

   /*
    * interpolate colors
    * color2_1 is 2/3 color0 + 1/3 color1
    * color3_1 is 1/3 color0 + 2/3 color1
    * color2_2 is 1/2 color0 + 1/2 color1
    * color3_2 is 0
    */

   /* TODO: since this is now always scalar, should
    * probably just use control flow here instead of calculating
    * both cases and then selection
    */
   if (format == PIPE_FORMAT_DXT1_RGBA ||
       format == PIPE_FORMAT_DXT1_SRGBA) {
      color01 = LLVMBuildOr(builder, color01, a, "");
   }
   /* can combine 2 lerps into one mostly */
   color23 = lp_build_lerp23_single(&bld8, color01);
   color23 = LLVMBuildBitCast(builder, color23, bld32.vec_type, "");

   /* dxt3/5 always use 4-color encoding */
   if (is_dxt1_variant) {
      LLVMValueRef color23_2, color2_2;

      if (util_cpu_caps.has_sse2) {
         LLVMValueRef intrargs[2];
         intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, "");
         /* same interleave as for lerp23 - correct result in 2nd element */
         intrargs[1] = lp_build_interleave2(gallivm, type32, color01, color01, 0);
         intrargs[1] = LLVMBuildBitCast(builder, intrargs[1], bld8.vec_type, "");
         /* pavgb gives the rounded (color0+color1)/2 */
         color2_2 = lp_build_pavgb(&bld8, intrargs[0], intrargs[1]);
      }
      else {
         LLVMValueRef v01, v0, v1, vhalf;
         /*
          * This isn't as expensive as it looks (the unpack is the same as
          * for lerp23, which is the reason why we do the pointless
          * interleave2 too), with correct rounding (the two lower elements
          * will be the same).
          */
         v01 = lp_build_interleave2(gallivm, type32, color01, color01, 0);
         v01 = LLVMBuildBitCast(builder, v01, bld8.vec_type, "");
         lp_build_unpack2(gallivm, type8, type16, v01, &v0, &v1);
         vhalf = lp_build_add(&bld16, v0, v1);
         vhalf = LLVMBuildLShr(builder, vhalf, bld16.one, "");
         color2_2 = lp_build_pack2(gallivm, type16, type8, vhalf, bld16.undef);
      }
      /* shuffle in color 3 as elem 2 zero, color 2 elem 1 */
      color23_2 = LLVMBuildBitCast(builder, color2_2, bld64.vec_type, "");
      color23_2 = LLVMBuildLShr(builder, color23_2,
                                lp_build_const_int_vec(gallivm, type64, 32), "");
      color23_2 = LLVMBuildBitCast(builder, color23_2, bld32.vec_type, "");

      /* sel_mask = color0 > color1 (compared as raw 565 values) */
      tmp = LLVMBuildBitCast(builder, color01_16, bld64.vec_type, "");
      tmp = LLVMBuildLShr(builder, tmp,
                          lp_build_const_int_vec(gallivm, type64, 32), "");
      tmp = LLVMBuildBitCast(builder, tmp, bld32.vec_type, "");
      sel_mask = lp_build_compare(gallivm, type32, PIPE_FUNC_GREATER,
                                  color01_16, tmp);
      sel_mask = lp_build_interleave2(gallivm, type32, sel_mask, sel_mask, 0);
      color23 = lp_build_select(&bld32, sel_mask, color23, color23_2);
   }

   if (util_cpu_caps.has_ssse3) {
      /*
       * Use pshufb as mini-lut. (Only doable with intrinsics as the
       * final shuffles are non-constant. pshufb is awesome!)
       */
      LLVMValueRef shuf[16], low2mask;
      LLVMValueRef intrargs[2], lut_ind, lut_adj;

      color01 = LLVMBuildBitCast(builder, color01, bld64.vec_type, "");
      color23 = LLVMBuildBitCast(builder, color23, bld64.vec_type, "");
      color0123 = lp_build_interleave2(gallivm, type64, color01, color23, 0);
      color0123 = LLVMBuildBitCast(builder, color0123, bld32.vec_type, "");

      if (format == PIPE_FORMAT_DXT1_RGB ||
          format == PIPE_FORMAT_DXT1_SRGB) {
         color0123 = LLVMBuildOr(builder, color0123, a, "");
      }

      /* shuffle as r0r1r2r3g0g1... */
      for (i = 0; i < 4; i++) {
         shuf[4*i] = lp_build_const_int32(gallivm, 0 + i);
         shuf[4*i+1] = lp_build_const_int32(gallivm, 4 + i);
         shuf[4*i+2] = lp_build_const_int32(gallivm, 8 + i);
         shuf[4*i+3] = lp_build_const_int32(gallivm, 12 + i);
      }
      color0123 = LLVMBuildBitCast(builder, color0123, bld8.vec_type, "");
      color0123 = LLVMBuildShuffleVector(builder, color0123, bld8.undef,
                                         LLVMConstVector(shuf, 16), "");

      /* lowest 2 bits of each 8 bit value contain index into "LUT" */
      low2mask = lp_build_const_int_vec(gallivm, type8, 3);
      /* add 0/4/8/12 for r/g/b/a */
      lut_adj = lp_build_const_int_vec(gallivm, type32, 0x0c080400);
      lut_adj = LLVMBuildBitCast(builder, lut_adj, bld8.vec_type, "");
      intrargs[0] = color0123;
      /* each pshufb looks up 4 texels worth of rgba at once; shift the
         code down by 2 for each subsequent group of 4 texels */
      for (i = 0; i < 4; i++) {
         lut_ind = LLVMBuildAnd(builder, code, low2mask, "");
         lut_ind = LLVMBuildOr(builder, lut_ind, lut_adj, "");
         intrargs[1] = lut_ind;
         col[i] = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
                                     bld8.vec_type, intrargs, 2, 0);
         col[i] = LLVMBuildBitCast(builder, col[i], bld32.vec_type, "");
         code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
         code = LLVMBuildLShr(builder, code, const2, "");
         code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
      }
   }
   else {
      /* Thanks to vectorization can do 4 texels in parallel */
      LLVMValueRef color0, color1, color2, color3;
      if (format == PIPE_FORMAT_DXT1_RGB ||
          format == PIPE_FORMAT_DXT1_SRGB) {
         color01 = LLVMBuildOr(builder, color01, a, "");
         color23 = LLVMBuildOr(builder, color23, a, "");
      }
      /* splat each of the 4 palette entries across a full vector */
      color0 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
                                      lp_build_const_shuffle1(gallivm, 0, 4), "");
      color1 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
                                      lp_build_const_shuffle1(gallivm, 1, 4), "");
      color2 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
                                      lp_build_const_shuffle1(gallivm, 0, 4), "");
      color3 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
                                      lp_build_const_shuffle1(gallivm, 1, 4), "");
      code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");

      for (i = 0; i < 4; i++) {
         /* select the colors */
         LLVMValueRef selmasklo, rgba01, rgba23, bitlo;
         bitlo = bld32.one;
         indices = LLVMBuildAnd(builder, code, bitlo, "");
         selmasklo = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
                                      indices, bitlo);
         rgba01 = lp_build_select(&bld32, selmasklo, color1, color0);

         LLVMValueRef selmaskhi;
         indices = LLVMBuildAnd(builder, code, const2, "");
         selmaskhi = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
                                      indices, const2);
         rgba23 = lp_build_select(&bld32, selmasklo, color3, color2);
         rgba = lp_build_select(&bld32, selmaskhi, rgba23, rgba01);

         /*
          * Note that this will give "wrong" order.
          * col0 will be rgba0, rgba4, rgba8, rgba12, col1 rgba1, rgba5, ...
          * This would be easily fixable by using different shuffle, bitlo/hi
          * vectors above (and different shift), but seems slightly easier to
          * deal with for dxt3/dxt5 alpha too. So instead change lookup.
          */
         col[i] = rgba;
         code = LLVMBuildLShr(builder, code, const2, "");
      }
   }
}
1443
/*
 * decode one dxt3 block.
 *
 * Decodes the color part via the dxt1 block decoder, then expands the
 * explicit 4bit alpha values to 8bit and or's them into the top byte of
 * each 32bit texel in col[0..3]. Alpha ordering is adjusted to match the
 * interleaved texel order produced by s3tc_decode_block_dxt1.
 */
static void
s3tc_decode_block_dxt3(struct gallivm_state *gallivm,
                       enum pipe_format format,
                       LLVMValueRef dxt_block,
                       LLVMValueRef *col)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef alpha, alphas0, alphas1, shift4_16, a[4], mask8hi;
   struct lp_type type32, type8, type16;
   unsigned i;

   memset(&type32, 0, sizeof type32);
   type32.width = 32;
   type32.length = 4;

   memset(&type8, 0, sizeof type8);
   type8.width = 8;
   type8.length = 16;

   memset(&type16, 0, sizeof type16);
   type16.width = 16;
   type16.length = 8;

   /* rgb part first (alpha or'ed in below) */
   s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);

   shift4_16 = lp_build_const_int_vec(gallivm, type16, 4);
   mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);

   /* duplicate each alpha byte, then mask so each 16bit lane holds one
      4bit alpha in its low nibble and the next one in its high nibble */
   alpha = LLVMBuildBitCast(builder, dxt_block,
                            lp_build_vec_type(gallivm, type8), "");
   alpha = lp_build_interleave2(gallivm, type8, alpha, alpha, 0);
   alpha = LLVMBuildBitCast(builder, alpha,
                            lp_build_vec_type(gallivm, type16), "");
   alpha = LLVMBuildAnd(builder, alpha,
                        lp_build_const_int_vec(gallivm, type16, 0xf00f), "");
   /* replicate each nibble into both halves of its byte (4bit -> 8bit) */
   alphas0 = LLVMBuildLShr(builder, alpha, shift4_16, "");
   alphas1 = LLVMBuildShl(builder, alpha, shift4_16, "");
   alpha = LLVMBuildOr(builder, alphas0, alpha, "");
   alpha = LLVMBuildOr(builder, alphas1, alpha, "");
   alpha = LLVMBuildBitCast(builder, alpha,
                            lp_build_vec_type(gallivm, type32), "");
   /*
    * alpha now contains elems 0,1,2,3,... (ubytes)
    * we need 0,4,8,12, 1,5,9,13 etc. in dwords to match color (which
    * is just as easy as "natural" order - 3 shift/and instead of 6 unpack).
    */
   a[0] = LLVMBuildShl(builder, alpha,
                       lp_build_const_int_vec(gallivm, type32, 24), "");
   a[1] = LLVMBuildShl(builder, alpha,
                       lp_build_const_int_vec(gallivm, type32, 16), "");
   a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
   a[2] = LLVMBuildShl(builder, alpha,
                       lp_build_const_int_vec(gallivm, type32, 8), "");
   a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
   a[3] = LLVMBuildAnd(builder, alpha, mask8hi, "");

   /* merge alpha into the top byte of each color vector */
   for (i = 0; i < 4; i++) {
      col[i] = LLVMBuildOr(builder, col[i], a[i], "");
   }
}
1507
1508
1509 static LLVMValueRef
1510 lp_build_lerpdxta_block(struct gallivm_state *gallivm,
1511 LLVMValueRef alpha0,
1512 LLVMValueRef alpha1,
1513 LLVMValueRef code,
1514 LLVMValueRef sel_mask)
1515 {
1516 LLVMBuilderRef builder = gallivm->builder;
1517 LLVMValueRef delta, ainterp;
1518 LLVMValueRef weight5, weight7, weight;
1519 struct lp_type type16;
1520 struct lp_build_context bld;
1521
1522 memset(&type16, 0, sizeof type16);
1523 type16.width = 16;
1524 type16.length = 8;
1525 type16.sign = TRUE;
1526
1527 lp_build_context_init(&bld, gallivm, type16);
1528 /*
1529 * 256/7 is only 36.57 so we'd lose quite some precision. Since it would
1530 * actually be desirable to do this here with even higher accuracy than
1531 * even 8 bit (more or less required for rgtc, albeit that's not handled
1532 * here right now), shift the weights after multiplication by code.
1533 */
1534 weight5 = lp_build_const_int_vec(gallivm, type16, 256*64/5);
1535 weight7 = lp_build_const_int_vec(gallivm, type16, 256*64/7);
1536 weight = lp_build_select(&bld, sel_mask, weight7, weight5);
1537
1538 /*
1539 * we'll get garbage in the elements which had code 0 (or larger than
1540 * 5 or 7) but we don't care (or rather, need to fix up anyway).
1541 */
1542 code = LLVMBuildSub(builder, code, bld.one, "");
1543
1544 weight = LLVMBuildMul(builder, weight, code, "");
1545 weight = LLVMBuildLShr(builder, weight,
1546 lp_build_const_int_vec(gallivm, type16, 6), "");
1547
1548 delta = LLVMBuildSub(builder, alpha1, alpha0, "");
1549
1550 ainterp = LLVMBuildMul(builder, delta, weight, "");
1551 ainterp = LLVMBuildLShr(builder, ainterp,
1552 lp_build_const_int_vec(gallivm, type16, 8), "");
1553
1554 /* lerp is done later (with packed values) */
1555
1556 return ainterp;
1557 }
1558
1559
1560 /*
1561 * decode one dxt5 block.
1562 */
1563 static void
1564 s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
1565 enum pipe_format format,
1566 LLVMValueRef dxt_block,
1567 LLVMValueRef *col)
1568 {
1569 LLVMBuilderRef builder = gallivm->builder;
1570 LLVMValueRef alpha, alpha0, alpha1, ares;
1571 LLVMValueRef ainterp, ainterp0, ainterp1, shuffle1, sel_mask, sel_mask2;
1572 LLVMValueRef a[4], acode, tmp0, tmp1;
1573 LLVMTypeRef i64t, i32t;
1574 struct lp_type type32, type64, type8, type16;
1575 struct lp_build_context bld16, bld8;
1576 unsigned i;
1577
1578 memset(&type32, 0, sizeof type32);
1579 type32.width = 32;
1580 type32.length = 4;
1581
1582 memset(&type64, 0, sizeof type64);
1583 type64.width = 64;
1584 type64.length = 2;
1585
1586 memset(&type8, 0, sizeof type8);
1587 type8.width = 8;
1588 type8.length = 16;
1589
1590 memset(&type16, 0, sizeof type16);
1591 type16.width = 16;
1592 type16.length = 8;
1593
1594 lp_build_context_init(&bld16, gallivm, type16);
1595 lp_build_context_init(&bld8, gallivm, type8);
1596
1597 i64t = lp_build_vec_type(gallivm, type64);
1598 i32t = lp_build_vec_type(gallivm, type32);
1599
1600 s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
1601
1602 /*
1603 * three possible strategies for vectorizing alpha:
1604 * 1) compute all 8 values then use scalar extraction
1605 * (i.e. have all 8 alpha values packed in one 64bit scalar
1606 * and do something like ax = vals >> (codex * 8) followed
1607 * by inserting these values back into color)
1608 * 2) same as 8 but just use pshufb as a mini-LUT for selection.
1609 * (without pshufb would need boatloads of cmp/selects trying to
1610 * keep things vectorized for essentially scalar selection).
1611 * 3) do something similar to the uncached case
1612 * needs more calculations (need to calc 16 values instead of 8 though
1613 * that's only an issue for the lerp which we need to do twice otherwise
1614 * everything still fits into 128bit) but keeps things vectorized mostly.
1615 * Trying 3) here though not sure it's really faster...
1616 * With pshufb, we try 2) (cheaper and more accurate)
1617 */
1618
1619 /*
1620 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1621 * help since code crosses 8bit boundaries). But variable shifts are
1622 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1623 * shifts!). Instead, emulate by 16bit muls.
1624 * Also, the required byte shuffles are essentially non-emulatable, so
1625 * require ssse3 (albeit other archs might do them fine).
1626 * This is not directly tied to ssse3 - just need sane byte shuffles.
1627 * But ordering is going to be different below so use same condition.
1628 */
1629
1630
1631 /* vectorize alpha */
1632 alpha = LLVMBuildBitCast(builder, dxt_block, i64t, "");
1633 alpha0 = LLVMBuildAnd(builder, alpha,
1634 lp_build_const_int_vec(gallivm, type64, 0xff), "");
1635 alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
1636 alpha = LLVMBuildBitCast(builder, alpha, bld16.vec_type, "");
1637 alpha1 = LLVMBuildLShr(builder, alpha,
1638 lp_build_const_int_vec(gallivm, type16, 8), "");
1639 alpha = LLVMBuildBitCast(builder, alpha, i64t, "");
1640 shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);
1641 alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");
1642 alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");
1643
1644 type16.sign = TRUE;
1645 sel_mask = lp_build_compare(gallivm, type16, PIPE_FUNC_GREATER,
1646 alpha0, alpha1);
1647 type16.sign = FALSE;
1648 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1649
1650 if (!util_cpu_caps.has_ssse3) {
1651 LLVMValueRef acodeg, mask1, acode0, acode1;
1652
1653 /* extraction of the 3 bit values into something more useful is HARD */
1654 /* first steps are actually scalar */
1655 acode = LLVMBuildLShr(builder, alpha,
1656 lp_build_const_int_vec(gallivm, type64, 16), "");
1657 tmp0 = LLVMBuildAnd(builder, acode,
1658 lp_build_const_int_vec(gallivm, type64, 0xffffff), "");
1659 tmp1 = LLVMBuildLShr(builder, acode,
1660 lp_build_const_int_vec(gallivm, type64, 24), "");
1661 tmp0 = LLVMBuildBitCast(builder, tmp0, i32t, "");
1662 tmp1 = LLVMBuildBitCast(builder, tmp1, i32t, "");
1663 acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1664 /* now have 2x24bit in 4x32bit, order 01234567, 89..., undef, undef */
1665 tmp0 = LLVMBuildAnd(builder, acode,
1666 lp_build_const_int_vec(gallivm, type32, 0xfff), "");
1667 tmp1 = LLVMBuildLShr(builder, acode,
1668 lp_build_const_int_vec(gallivm, type32, 12), "");
1669 acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1670 /* now have 4x12bit in 4x32bit, order 0123, 4567, ,,, */
1671 tmp0 = LLVMBuildAnd(builder, acode,
1672 lp_build_const_int_vec(gallivm, type32, 0x3f), "");
1673 tmp1 = LLVMBuildLShr(builder, acode,
1674 lp_build_const_int_vec(gallivm, type32, 6), "");
1675 /* use signed pack doesn't matter and otherwise need sse41 */
1676 type32.sign = type16.sign = TRUE;
1677 acode = lp_build_pack2(gallivm, type32, type16, tmp0, tmp1);
1678 type32.sign = type16.sign = FALSE;
1679 /* now have 8x6bit in 8x16bit, 01, 45, 89, ..., 23, 67, ... */
1680 acode0 = LLVMBuildAnd(builder, acode,
1681 lp_build_const_int_vec(gallivm, type16, 0x7), "");
1682 acode1 = LLVMBuildLShr(builder, acode,
1683 lp_build_const_int_vec(gallivm, type16, 3), "");
1684 acode = lp_build_pack2(gallivm, type16, type8, acode0, acode1);
1685 /* acode0 contains elems 0,4,8,12,2,6,10,14, acode1 1,5,9,... */
1686
1687 acodeg = LLVMBuildAnd(builder, acode,
1688 LLVMBuildNot(builder, sel_mask, ""), "");
1689 mask1 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1690 acode, bld8.one);
1691
1692 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
1693 ainterp0 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode0, sel_mask);
1694 ainterp1 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode1, sel_mask);
1695 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1696 ainterp = lp_build_pack2(gallivm, type16, type8, ainterp0, ainterp1);
1697 alpha0 = lp_build_pack2(gallivm, type16, type8, alpha0, alpha0);
1698 alpha1 = lp_build_pack2(gallivm, type16, type8, alpha1, alpha1);
1699 ainterp = LLVMBuildAdd(builder, ainterp, alpha0, "");
1700 /* Fix up val01 */
1701 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1702 acode, bld8.zero);
1703 ainterp = lp_build_select(&bld8, sel_mask2, alpha0, ainterp);
1704 ainterp = lp_build_select(&bld8, mask1, alpha1, ainterp);
1705
1706 /* fix up val67 if a0 <= a1 */
1707 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1708 acodeg, lp_build_const_int_vec(gallivm, type8, 6));
1709 ares = LLVMBuildAnd(builder, ainterp, LLVMBuildNot(builder, sel_mask2, ""), "");
1710 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1711 acodeg, lp_build_const_int_vec(gallivm, type8, 7));
1712 ares = LLVMBuildOr(builder, ares, sel_mask2, "");
1713
1714 /* unpack in right order (0,4,8,12,1,5,..) */
1715 /* this gives us zero, a0, zero, a4, zero, a8, ... for tmp0 */
1716 tmp0 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 0);
1717 tmp1 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 1);
1718 tmp0 = LLVMBuildBitCast(builder, tmp0, bld16.vec_type, "");
1719 tmp1 = LLVMBuildBitCast(builder, tmp1, bld16.vec_type, "");
1720
1721 a[0] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 0);
1722 a[1] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 0);
1723 a[2] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 1);
1724 a[3] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 1);
1725 }
1726 else {
1727 LLVMValueRef elems[16], intrargs[2], shufa, mulclo, mulchi, mask8hi;
1728 LLVMTypeRef type16s = LLVMInt16TypeInContext(gallivm->context);
1729 LLVMTypeRef type8s = LLVMInt8TypeInContext(gallivm->context);
1730 unsigned i, j;
1731 /*
1732 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1733 * help since code crosses 8bit boundaries). But variable shifts are
1734 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1735 * shifts!). Instead, emulate by 16bit muls.
1736 * Also, the required byte shuffles are essentially non-emulatable, so
1737 * require ssse3 (albeit other archs might do them fine, but the
1738 * complete path is ssse3 only for now).
1739 */
1740 for (i = 0, j = 0; i < 16; i += 8, j += 3) {
1741 elems[i+0] = elems[i+1] = elems[i+2] = lp_build_const_int32(gallivm, j+2);
1742 elems[i+3] = elems[i+4] = lp_build_const_int32(gallivm, j+3);
1743 elems[i+5] = elems[i+6] = elems[i+7] = lp_build_const_int32(gallivm, j+4);
1744 }
1745 shufa = LLVMConstVector(elems, 16);
1746 alpha = LLVMBuildBitCast(builder, alpha, bld8.vec_type, "");
1747 acode = LLVMBuildShuffleVector(builder, alpha, bld8.undef, shufa, "");
1748 acode = LLVMBuildBitCast(builder, acode, bld16.vec_type, "");
1749 /*
1750 * Put 0/2/4/6 into high 3 bits of 16 bits (save AND mask)
1751 * Do the same for 1/3/5/7 (albeit still need mask there - ideally
1752 * we'd place them into bits 4-7 so could save shift but impossible.)
1753 */
1754 for (i = 0; i < 8; i += 4) {
1755 elems[i+0] = LLVMConstInt(type16s, 1 << (13-0), 0);
1756 elems[i+1] = LLVMConstInt(type16s, 1 << (13-6), 0);
1757 elems[i+2] = LLVMConstInt(type16s, 1 << (13-4), 0);
1758 elems[i+3] = LLVMConstInt(type16s, 1 << (13-2), 0);
1759 }
1760 mulclo = LLVMConstVector(elems, 8);
1761 for (i = 0; i < 8; i += 4) {
1762 elems[i+0] = LLVMConstInt(type16s, 1 << (13-3), 0);
1763 elems[i+1] = LLVMConstInt(type16s, 1 << (13-9), 0);
1764 elems[i+2] = LLVMConstInt(type16s, 1 << (13-7), 0);
1765 elems[i+3] = LLVMConstInt(type16s, 1 << (13-5), 0);
1766 }
1767 mulchi = LLVMConstVector(elems, 8);
1768
1769 tmp0 = LLVMBuildMul(builder, acode, mulclo, "");
1770 tmp1 = LLVMBuildMul(builder, acode, mulchi, "");
1771 tmp0 = LLVMBuildLShr(builder, tmp0,
1772 lp_build_const_int_vec(gallivm, type16, 13), "");
1773 tmp1 = LLVMBuildLShr(builder, tmp1,
1774 lp_build_const_int_vec(gallivm, type16, 5), "");
1775 tmp1 = LLVMBuildAnd(builder, tmp1,
1776 lp_build_const_int_vec(gallivm, type16, 0x700), "");
1777 acode = LLVMBuildOr(builder, tmp0, tmp1, "");
1778 acode = LLVMBuildBitCast(builder, acode, bld8.vec_type, "");
1779
1780 /*
1781 * Note that ordering is different here to non-ssse3 path:
1782 * 0/1/2/3/4/5...
1783 */
1784
1785 LLVMValueRef weight0, weight1, weight, delta;
1786 LLVMValueRef constff_elem7, const0_elem6;
1787 /* weights, correctly rounded (round(256*x/7)) */
1788 elems[0] = LLVMConstInt(type16s, 256, 0);
1789 elems[1] = LLVMConstInt(type16s, 0, 0);
1790 elems[2] = LLVMConstInt(type16s, 219, 0);
1791 elems[3] = LLVMConstInt(type16s, 183, 0);
1792 elems[4] = LLVMConstInt(type16s, 146, 0);
1793 elems[5] = LLVMConstInt(type16s, 110, 0);
1794 elems[6] = LLVMConstInt(type16s, 73, 0);
1795 elems[7] = LLVMConstInt(type16s, 37, 0);
1796 weight0 = LLVMConstVector(elems, 8);
1797
1798 elems[0] = LLVMConstInt(type16s, 256, 0);
1799 elems[1] = LLVMConstInt(type16s, 0, 0);
1800 elems[2] = LLVMConstInt(type16s, 205, 0);
1801 elems[3] = LLVMConstInt(type16s, 154, 0);
1802 elems[4] = LLVMConstInt(type16s, 102, 0);
1803 elems[5] = LLVMConstInt(type16s, 51, 0);
1804 elems[6] = LLVMConstInt(type16s, 0, 0);
1805 elems[7] = LLVMConstInt(type16s, 0, 0);
1806 weight1 = LLVMConstVector(elems, 8);
1807
1808 weight0 = LLVMBuildBitCast(builder, weight0, bld8.vec_type, "");
1809 weight1 = LLVMBuildBitCast(builder, weight1, bld8.vec_type, "");
1810 weight = lp_build_select(&bld8, sel_mask, weight0, weight1);
1811 weight = LLVMBuildBitCast(builder, weight, bld16.vec_type, "");
1812
1813 for (i = 0; i < 16; i++) {
1814 elems[i] = LLVMConstNull(type8s);
1815 }
1816 elems[7] = LLVMConstInt(type8s, 255, 0);
1817 constff_elem7 = LLVMConstVector(elems, 16);
1818
1819 for (i = 0; i < 16; i++) {
1820 elems[i] = LLVMConstInt(type8s, 255, 0);
1821 }
1822 elems[6] = LLVMConstInt(type8s, 0, 0);
1823 const0_elem6 = LLVMConstVector(elems, 16);
1824
1825 /* standard simple lerp - but the version we need isn't available */
1826 delta = LLVMBuildSub(builder, alpha0, alpha1, "");
1827 ainterp = LLVMBuildMul(builder, delta, weight, "");
1828 ainterp = LLVMBuildLShr(builder, ainterp,
1829 lp_build_const_int_vec(gallivm, type16, 8), "");
1830 ainterp = LLVMBuildBitCast(builder, ainterp, bld8.vec_type, "");
1831 alpha1 = LLVMBuildBitCast(builder, alpha1, bld8.vec_type, "");
1832 ainterp = LLVMBuildAdd(builder, ainterp, alpha1, "");
1833 ainterp = LLVMBuildBitCast(builder, ainterp, bld16.vec_type, "");
1834 ainterp = lp_build_pack2(gallivm, type16, type8, ainterp, bld16.undef);
1835
1836 /* fixing 0/0xff case is slightly more complex */
1837 constff_elem7 = LLVMBuildAnd(builder, constff_elem7,
1838 LLVMBuildNot(builder, sel_mask, ""), "");
1839 const0_elem6 = LLVMBuildOr(builder, const0_elem6, sel_mask, "");
1840 ainterp = LLVMBuildOr(builder, ainterp, constff_elem7, "");
1841 ainterp = LLVMBuildAnd(builder, ainterp, const0_elem6, "");
1842
1843 /* now pick all 16 elements at once! */
1844 intrargs[0] = ainterp;
1845 intrargs[1] = acode;
1846 ares = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
1847 bld8.vec_type, intrargs, 2, 0);
1848
1849 ares = LLVMBuildBitCast(builder, ares, i32t, "");
1850 mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1851 a[0] = LLVMBuildShl(builder, ares,
1852 lp_build_const_int_vec(gallivm, type32, 24), "");
1853 a[1] = LLVMBuildShl(builder, ares,
1854 lp_build_const_int_vec(gallivm, type32, 16), "");
1855 a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
1856 a[2] = LLVMBuildShl(builder, ares,
1857 lp_build_const_int_vec(gallivm, type32, 8), "");
1858 a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
1859 a[3] = LLVMBuildAnd(builder, ares, mask8hi, "");
1860 }
1861
1862 for (i = 0; i < 4; i++) {
1863 a[i] = LLVMBuildBitCast(builder, a[i], i32t, "");
1864 col[i] = LLVMBuildOr(builder, col[i], a[i], "");
1865 }
1866 }
1867
1868
/**
 * Emit the body of the per-format "update cache one block" helper function.
 *
 * The generated function takes (ptr_addr, hash_index, cache): it gathers the
 * compressed block at ptr_addr, decodes it with the dxt1/3/5 decoder matching
 * format_desc->format, and stores the decoded texels plus the block address
 * (as 64bit tag) into the cache slot at hash_index.
 *
 * Emits into a temporary builder so the caller's insert position is left
 * untouched; restores gallivm->builder before returning.
 */
static void
generate_update_cache_one_block(struct gallivm_state *gallivm,
                                LLVMValueRef function,
                                const struct util_format_description *format_desc)
{
   LLVMBasicBlockRef block;
   LLVMBuilderRef old_builder;
   LLVMValueRef ptr_addr;
   LLVMValueRef hash_index;
   LLVMValueRef cache;
   LLVMValueRef dxt_block, tag_value;
   LLVMValueRef col[LP_MAX_VECTOR_LENGTH];

   ptr_addr = LLVMGetParam(function, 0);
   hash_index = LLVMGetParam(function, 1);
   cache = LLVMGetParam(function, 2);

   lp_build_name(ptr_addr, "ptr_addr" );
   lp_build_name(hash_index, "hash_index");
   lp_build_name(cache, "cache_addr");

   /*
    * Function body
    */

   /* switch to a private builder; restored at the end */
   old_builder = gallivm->builder;
   block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
   gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
   LLVMPositionBuilderAtEnd(gallivm->builder, block);

   /* fetch the raw compressed block as scalar data */
   lp_build_gather_s3tc_simple_scalar(gallivm, format_desc, &dxt_block,
                                      ptr_addr);

   /* decode into col[] with the decoder matching the block format */
   switch (format_desc->format) {
   case PIPE_FORMAT_DXT1_RGB:
   case PIPE_FORMAT_DXT1_RGBA:
   case PIPE_FORMAT_DXT1_SRGB:
   case PIPE_FORMAT_DXT1_SRGBA:
      s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
      break;
   case PIPE_FORMAT_DXT3_RGBA:
   case PIPE_FORMAT_DXT3_SRGBA:
      s3tc_decode_block_dxt3(gallivm, format_desc->format, dxt_block, col);
      break;
   case PIPE_FORMAT_DXT5_RGBA:
   case PIPE_FORMAT_DXT5_SRGBA:
      s3tc_decode_block_dxt5(gallivm, format_desc->format, dxt_block, col);
      break;
   default:
      /* unreachable: callers only pass s3tc formats; decode as dxt1 anyway */
      assert(0);
      s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
      break;
   }

   /* tag the slot with the block's source address so hits can be detected */
   tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
                                 LLVMInt64TypeInContext(gallivm->context), "");
   s3tc_store_cached_block(gallivm, col, tag_value, hash_index, cache);

   LLVMBuildRetVoid(gallivm->builder);

   LLVMDisposeBuilder(gallivm->builder);
   gallivm->builder = old_builder;

   gallivm_verify_function(gallivm, function);
}
1934
1935
1936 static void
1937 update_cached_block(struct gallivm_state *gallivm,
1938 const struct util_format_description *format_desc,
1939 LLVMValueRef ptr_addr,
1940 LLVMValueRef hash_index,
1941 LLVMValueRef cache)
1942
1943 {
1944 LLVMBuilderRef builder = gallivm->builder;
1945 LLVMModuleRef module = gallivm->module;
1946 char name[256];
1947 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
1948 LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
1949 LLVMValueRef function, inst;
1950 LLVMBasicBlockRef bb;
1951 LLVMValueRef args[3];
1952
1953 snprintf(name, sizeof name, "%s_update_cache_one_block",
1954 format_desc->short_name);
1955 function = LLVMGetNamedFunction(module, name);
1956
1957 if (!function) {
1958 LLVMTypeRef ret_type;
1959 LLVMTypeRef arg_types[3];
1960 LLVMTypeRef function_type;
1961 unsigned arg;
1962
1963 /*
1964 * Generate the function prototype.
1965 */
1966
1967 ret_type = LLVMVoidTypeInContext(gallivm->context);
1968 arg_types[0] = pi8t;
1969 arg_types[1] = LLVMInt32TypeInContext(gallivm->context);
1970 arg_types[2] = LLVMTypeOf(cache); // XXX: put right type here
1971 function_type = LLVMFunctionType(ret_type, arg_types, ARRAY_SIZE(arg_types), 0);
1972 function = LLVMAddFunction(module, name, function_type);
1973
1974 for (arg = 0; arg < ARRAY_SIZE(arg_types); ++arg)
1975 if (LLVMGetTypeKind(arg_types[arg]) == LLVMPointerTypeKind)
1976 lp_add_function_attr(function, arg + 1, LP_FUNC_ATTR_NOALIAS);
1977
1978 LLVMSetFunctionCallConv(function, LLVMFastCallConv);
1979 LLVMSetVisibility(function, LLVMHiddenVisibility);
1980 generate_update_cache_one_block(gallivm, function, format_desc);
1981 }
1982
1983 args[0] = ptr_addr;
1984 args[1] = hash_index;
1985 args[2] = cache;
1986
1987 LLVMBuildCall(builder, function, args, ARRAY_SIZE(args), "");
1988 bb = LLVMGetInsertBlock(builder);
1989 inst = LLVMGetLastInstruction(bb);
1990 LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
1991 }
1992
1993 /*
1994 * cached lookup
1995 */
1996 static LLVMValueRef
1997 compressed_fetch_cached(struct gallivm_state *gallivm,
1998 const struct util_format_description *format_desc,
1999 unsigned n,
2000 LLVMValueRef base_ptr,
2001 LLVMValueRef offset,
2002 LLVMValueRef i,
2003 LLVMValueRef j,
2004 LLVMValueRef cache)
2005
2006 {
2007 LLVMBuilderRef builder = gallivm->builder;
2008 unsigned count, low_bit, log2size;
2009 LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
2010 LLVMValueRef ij_index, hash_index, hash_mask, block_index;
2011 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2012 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
2013 LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
2014 struct lp_type type;
2015 struct lp_build_context bld32;
2016 memset(&type, 0, sizeof type);
2017 type.width = 32;
2018 type.length = n;
2019
2020 lp_build_context_init(&bld32, gallivm, type);
2021
2022 /*
2023 * compute hash - we use direct mapped cache, the hash function could
2024 * be better but it needs to be simple
2025 * per-element:
2026 * compare offset with offset stored at tag (hash)
2027 * if not equal extract block, store block, update tag
2028 * extract color from cache
2029 * assemble colors
2030 */
2031
2032 low_bit = util_logbase2(format_desc->block.bits / 8);
2033 log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
2034 addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
2035 ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
2036 ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
2037 /* For the hash function, first mask off the unused lowest bits. Then just
2038 do some xor with address bits - only use lower 32bits */
2039 ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
2040 ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
2041 lp_build_const_int_vec(gallivm, type, low_bit), "");
2042 /* This only really makes sense for size 64,128,256 */
2043 hash_index = ptr_addrtrunc;
2044 ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
2045 lp_build_const_int_vec(gallivm, type, 2*log2size), "");
2046 hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
2047 tmp = LLVMBuildLShr(builder, hash_index,
2048 lp_build_const_int_vec(gallivm, type, log2size), "");
2049 hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
2050
2051 hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
2052 hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
2053 ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
2054 ij_index = LLVMBuildAdd(builder, ij_index, j, "");
2055 block_index = LLVMBuildShl(builder, hash_index,
2056 lp_build_const_int_vec(gallivm, type, 4), "");
2057 block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
2058
2059 if (n > 1) {
2060 color = bld32.undef;
2061 for (count = 0; count < n; count++) {
2062 LLVMValueRef index, cond, colorx;
2063 LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
2064 struct lp_build_if_state if_ctx;
2065
2066 index = lp_build_const_int32(gallivm, count);
2067 offsetx = LLVMBuildExtractElement(builder, offset, index, "");
2068 addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
2069 addrx = LLVMBuildAdd(builder, addrx, addr, "");
2070 block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
2071 hash_indexx = LLVMBuildLShr(builder, block_indexx,
2072 lp_build_const_int32(gallivm, 4), "");
2073 offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_indexx);
2074 cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
2075
2076 lp_build_if(&if_ctx, gallivm, cond);
2077 {
2078 ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
2079 LLVMPointerType(i8t, 0), "");
2080 update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
2081 #if LP_BUILD_FORMAT_CACHE_DEBUG
2082 s3tc_update_cache_access(gallivm, cache, 1,
2083 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
2084 #endif
2085 }
2086 lp_build_endif(&if_ctx);
2087
2088 colorx = s3tc_lookup_cached_pixel(gallivm, cache, block_indexx);
2089
2090 color = LLVMBuildInsertElement(builder, color, colorx,
2091 lp_build_const_int32(gallivm, count), "");
2092 }
2093 }
2094 else {
2095 LLVMValueRef cond;
2096 struct lp_build_if_state if_ctx;
2097
2098 tmp = LLVMBuildZExt(builder, offset, i64t, "");
2099 addr = LLVMBuildAdd(builder, tmp, addr, "");
2100 offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_index);
2101 cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
2102
2103 lp_build_if(&if_ctx, gallivm, cond);
2104 {
2105 tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
2106 update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
2107 #if LP_BUILD_FORMAT_CACHE_DEBUG
2108 s3tc_update_cache_access(gallivm, cache, 1,
2109 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
2110 #endif
2111 }
2112 lp_build_endif(&if_ctx);
2113
2114 color = s3tc_lookup_cached_pixel(gallivm, cache, block_index);
2115 }
2116 #if LP_BUILD_FORMAT_CACHE_DEBUG
2117 s3tc_update_cache_access(gallivm, cache, n,
2118 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
2119 #endif
2120 return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
2121 }
2122
2123
2124 static LLVMValueRef
2125 s3tc_dxt5_to_rgba_aos(struct gallivm_state *gallivm,
2126 unsigned n,
2127 enum pipe_format format,
2128 LLVMValueRef colors,
2129 LLVMValueRef codewords,
2130 LLVMValueRef alpha_lo,
2131 LLVMValueRef alpha_hi,
2132 LLVMValueRef i,
2133 LLVMValueRef j)
2134 {
2135 return s3tc_dxt5_full_to_rgba_aos(gallivm, n, format, colors,
2136 codewords, alpha_lo, alpha_hi, i, j);
2137 }
2138
2139
2140 /**
2141 * @param n number of pixels processed (usually n=4, but it should also work with n=1
2142 * and multiples of 4)
2143 * @param base_ptr base pointer (32bit or 64bit pointer depending on the architecture)
2144 * @param offset <n x i32> vector with the relative offsets of the S3TC blocks
2145 * @param i is a <n x i32> vector with the x subpixel coordinate (0..3)
2146 * @param j is a <n x i32> vector with the y subpixel coordinate (0..3)
2147 * @return a <4*n x i8> vector with the pixel RGBA values in AoS
2148 */
2149 LLVMValueRef
2150 lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,
2151 const struct util_format_description *format_desc,
2152 unsigned n,
2153 LLVMValueRef base_ptr,
2154 LLVMValueRef offset,
2155 LLVMValueRef i,
2156 LLVMValueRef j,
2157 LLVMValueRef cache)
2158 {
2159 LLVMValueRef rgba;
2160 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2161 LLVMBuilderRef builder = gallivm->builder;
2162
2163 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
2164 assert(format_desc->block.width == 4);
2165 assert(format_desc->block.height == 4);
2166
2167 assert((n == 1) || (n % 4 == 0));
2168
2169 /* debug_printf("format = %d\n", format_desc->format);*/
2170 if (cache) {
2171 rgba = compressed_fetch_cached(gallivm, format_desc, n,
2172 base_ptr, offset, i, j, cache);
2173 return rgba;
2174 }
2175
2176 /*
2177 * Could use n > 8 here with avx2, but doesn't seem faster.
2178 */
2179 if (n > 4) {
2180 unsigned count;
2181 LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
2182 LLVMTypeRef i128_type = LLVMIntTypeInContext(gallivm->context, 128);
2183 LLVMTypeRef i128_vectype = LLVMVectorType(i128_type, n / 4);
2184 LLVMTypeRef i324_vectype = LLVMVectorType(LLVMInt32TypeInContext(
2185 gallivm->context), 4);
2186 LLVMValueRef offset4, i4, j4, rgba4[LP_MAX_VECTOR_LENGTH/16];
2187 struct lp_type lp_324_vectype = lp_type_uint_vec(32, 128);
2188
2189 assert(n / 4 <= ARRAY_SIZE(rgba4));
2190
2191 rgba = LLVMGetUndef(i128_vectype);
2192
2193 for (count = 0; count < n / 4; count++) {
2194 LLVMValueRef colors, codewords, alpha_lo = NULL, alpha_hi = NULL;
2195
2196 i4 = lp_build_extract_range(gallivm, i, count * 4, 4);
2197 j4 = lp_build_extract_range(gallivm, j, count * 4, 4);
2198 offset4 = lp_build_extract_range(gallivm, offset, count * 4, 4);
2199
2200 lp_build_gather_s3tc(gallivm, 4, format_desc, &colors, &codewords,
2201 &alpha_lo, &alpha_hi, base_ptr, offset4);
2202
2203 switch (format_desc->format) {
2204 case PIPE_FORMAT_DXT1_RGB:
2205 case PIPE_FORMAT_DXT1_RGBA:
2206 case PIPE_FORMAT_DXT1_SRGB:
2207 case PIPE_FORMAT_DXT1_SRGBA:
2208 rgba4[count] = s3tc_dxt1_to_rgba_aos(gallivm, 4, format_desc->format,
2209 colors, codewords, i4, j4);
2210 break;
2211 case PIPE_FORMAT_DXT3_RGBA:
2212 case PIPE_FORMAT_DXT3_SRGBA:
2213 rgba4[count] = s3tc_dxt3_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2214 codewords, alpha_lo, alpha_hi, i4, j4);
2215 break;
2216 case PIPE_FORMAT_DXT5_RGBA:
2217 case PIPE_FORMAT_DXT5_SRGBA:
2218 rgba4[count] = s3tc_dxt5_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2219 codewords, alpha_lo, alpha_hi, i4, j4);
2220 break;
2221 default:
2222 assert(0);
2223 rgba4[count] = LLVMGetUndef(LLVMVectorType(i8t, 4));
2224 break;
2225 }
2226 /* shuffles typically give best results with dword elements...*/
2227 rgba4[count] = LLVMBuildBitCast(builder, rgba4[count], i324_vectype, "");
2228 }
2229 rgba = lp_build_concat(gallivm, rgba4, lp_324_vectype, n / 4);
2230 rgba = LLVMBuildBitCast(builder, rgba, i8_vectype, "");
2231 }
2232 else {
2233 LLVMValueRef colors, codewords, alpha_lo = NULL, alpha_hi = NULL;
2234
2235 lp_build_gather_s3tc(gallivm, n, format_desc, &colors, &codewords,
2236 &alpha_lo, &alpha_hi, base_ptr, offset);
2237
2238 switch (format_desc->format) {
2239 case PIPE_FORMAT_DXT1_RGB:
2240 case PIPE_FORMAT_DXT1_RGBA:
2241 case PIPE_FORMAT_DXT1_SRGB:
2242 case PIPE_FORMAT_DXT1_SRGBA:
2243 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format_desc->format,
2244 colors, codewords, i, j);
2245 break;
2246 case PIPE_FORMAT_DXT3_RGBA:
2247 case PIPE_FORMAT_DXT3_SRGBA:
2248 rgba = s3tc_dxt3_to_rgba_aos(gallivm, n, format_desc->format, colors,
2249 codewords, alpha_lo, alpha_hi, i, j);
2250 break;
2251 case PIPE_FORMAT_DXT5_RGBA:
2252 case PIPE_FORMAT_DXT5_SRGBA:
2253 rgba = s3tc_dxt5_to_rgba_aos(gallivm, n, format_desc->format, colors,
2254 codewords, alpha_lo, alpha_hi, i, j);
2255 break;
2256 default:
2257 assert(0);
2258 rgba = LLVMGetUndef(LLVMVectorType(i8t, 4*n));
2259 break;
2260 }
2261 }
2262
2263 /* always return just decompressed values - srgb conversion is done later */
2264
2265 return rgba;
2266 }