gallivm: support array textures
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_sample_aos.c
1 /**************************************************************************
2 *
3 * Copyright 2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 /**
29 * @file
30 * Texture sampling -- AoS.
31 *
32 * @author Jose Fonseca <jfonseca@vmware.com>
33 * @author Brian Paul <brianp@vmware.com>
34 */
35
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "util/u_debug.h"
39 #include "util/u_dump.h"
40 #include "util/u_memory.h"
41 #include "util/u_math.h"
42 #include "util/u_format.h"
43 #include "util/u_cpu_detect.h"
44 #include "lp_bld_debug.h"
45 #include "lp_bld_type.h"
46 #include "lp_bld_const.h"
47 #include "lp_bld_conv.h"
48 #include "lp_bld_arit.h"
49 #include "lp_bld_bitarit.h"
50 #include "lp_bld_logic.h"
51 #include "lp_bld_swizzle.h"
52 #include "lp_bld_pack.h"
53 #include "lp_bld_flow.h"
54 #include "lp_bld_gather.h"
55 #include "lp_bld_format.h"
56 #include "lp_bld_init.h"
57 #include "lp_bld_sample.h"
58 #include "lp_bld_sample_aos.h"
59 #include "lp_bld_quad.h"
60
61
62 /**
63 * Build LLVM code for texture coord wrapping, for nearest filtering,
64 * for scaled integer texcoords.
65 * \param block_length is the length of the pixel block along the
66 * coordinate axis
67 * \param coord the incoming texcoord (s,t,r or q) scaled to the texture size
68 * \param length the texture size along one dimension
69 * \param stride pixel stride along the coordinate axis (in bytes)
70 * \param is_pot if TRUE, length is a power of two
71 * \param wrap_mode one of PIPE_TEX_WRAP_x
72 * \param out_offset byte offset for the wrapped coordinate
73 * \param out_i resulting sub-block pixel coordinate for coord0
74 */
75 static void
76 lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
77 unsigned block_length,
78 LLVMValueRef coord,
79 LLVMValueRef coord_f,
80 LLVMValueRef length,
81 LLVMValueRef stride,
82 boolean is_pot,
83 unsigned wrap_mode,
84 LLVMValueRef *out_offset,
85 LLVMValueRef *out_i)
86 {
87 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
88 LLVMBuilderRef builder = bld->gallivm->builder;
89 LLVMValueRef length_minus_one;
90
91 length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
92
93 switch(wrap_mode) {
94 case PIPE_TEX_WRAP_REPEAT:
95 if(is_pot)
96 coord = LLVMBuildAnd(builder, coord, length_minus_one, "");
97 else {
98 struct lp_build_context *coord_bld = &bld->coord_bld;
99 LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
100 coord = lp_build_fract_safe(coord_bld, coord_f);
101 coord = lp_build_mul(coord_bld, coord, length_f);
102 coord = lp_build_itrunc(coord_bld, coord);
103 }
104 break;
105
106 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
107 coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
108 coord = lp_build_min(int_coord_bld, coord, length_minus_one);
109 break;
110
111 case PIPE_TEX_WRAP_CLAMP:
112 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
113 case PIPE_TEX_WRAP_MIRROR_REPEAT:
114 case PIPE_TEX_WRAP_MIRROR_CLAMP:
115 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
116 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
117 default:
118 assert(0);
119 }
120
121 lp_build_sample_partial_offset(int_coord_bld, block_length, coord, stride,
122 out_offset, out_i);
123 }
124
125
126 /**
127 * Build LLVM code for texture coord wrapping, for nearest filtering,
128 * for float texcoords.
129 * \param coord the incoming texcoord (s,t,r or q)
130 * \param length the texture size along one dimension
131 * \param is_pot if TRUE, length is a power of two
132 * \param wrap_mode one of PIPE_TEX_WRAP_x
133 * \param icoord the texcoord after wrapping, as int
134 */
135 static void
136 lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld,
137 LLVMValueRef coord,
138 LLVMValueRef length,
139 boolean is_pot,
140 unsigned wrap_mode,
141 LLVMValueRef *icoord)
142 {
143 struct lp_build_context *coord_bld = &bld->coord_bld;
144 LLVMValueRef length_minus_one;
145
146 switch(wrap_mode) {
147 case PIPE_TEX_WRAP_REPEAT:
148 /* take fraction, unnormalize */
149 coord = lp_build_fract_safe(coord_bld, coord);
150 coord = lp_build_mul(coord_bld, coord, length);
151 *icoord = lp_build_itrunc(coord_bld, coord);
152 break;
153 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
154 length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
155 if (bld->static_state->normalized_coords) {
156 /* scale coord to length */
157 coord = lp_build_mul(coord_bld, coord, length);
158 }
159 coord = lp_build_clamp(coord_bld, coord, coord_bld->zero,
160 length_minus_one);
161 *icoord = lp_build_itrunc(coord_bld, coord);
162 break;
163
164 case PIPE_TEX_WRAP_CLAMP:
165 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
166 case PIPE_TEX_WRAP_MIRROR_REPEAT:
167 case PIPE_TEX_WRAP_MIRROR_CLAMP:
168 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
169 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
170 default:
171 assert(0);
172 }
173 }
174
175
/**
 * Build LLVM code for texture coord wrapping, for linear filtering,
 * for scaled integer texcoords.
 * \param block_length is the length of the pixel block along the
 *                     coordinate axis
 * \param coord0 the incoming texcoord (s,t,r or q) scaled to the texture size
 * \param weight_i resulting int lerp weight (0-255) for the NPOT repeat path
 * \param coord_f the incoming texcoord as float, used by the NPOT repeat path
 * \param length the texture size along one dimension
 * \param stride pixel stride along the coordinate axis (in bytes)
 * \param is_pot if TRUE, length is a power of two
 * \param wrap_mode one of PIPE_TEX_WRAP_x
 * \param offset0 resulting relative offset for coord0
 * \param offset1 resulting relative offset for coord0 + 1
 * \param i0 resulting sub-block pixel coordinate for coord0
 * \param i1 resulting sub-block pixel coordinate for coord0 + 1
 */
static void
lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
                                unsigned block_length,
                                LLVMValueRef coord0,
                                LLVMValueRef *weight_i,
                                LLVMValueRef coord_f,
                                LLVMValueRef length,
                                LLVMValueRef stride,
                                boolean is_pot,
                                unsigned wrap_mode,
                                LLVMValueRef *offset0,
                                LLVMValueRef *offset1,
                                LLVMValueRef *i0,
                                LLVMValueRef *i1)
{
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef length_minus_one;
   LLVMValueRef lmask, umask, mask;

   /*
    * If the pixel block covers more than one pixel then there is no easy
    * way to calculate offset1 relative to offset0. Instead, compute them
    * independently. Otherwise, try to compute offset0 and offset1 with
    * a single stride multiplication.
    */

   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);

   if (block_length != 1) {
      /* multi-pixel block: wrap both coords, then compute each offset
       * separately via lp_build_sample_partial_offset() */
      LLVMValueRef coord1;
      switch(wrap_mode) {
      case PIPE_TEX_WRAP_REPEAT:
         if (is_pot) {
            /* POT repeat: coord & (length - 1) for both texels */
            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
            coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
            coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
         }
         else {
            LLVMValueRef mask;
            LLVMValueRef weight;
            LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
            lp_build_coord_repeat_npot_linear(bld, coord_f,
                                              length, length_f,
                                              &coord0, &weight);
            /* mask is all-ones where coord0 != length-1; AND-ing with it
             * wraps coord0+1 back to 0 at the right edge */
            mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
                                    PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
            coord1 = LLVMBuildAnd(builder,
                                  lp_build_add(int_coord_bld, coord0,
                                               int_coord_bld->one),
                                  mask, "");
            /* convert weight to 8-bit fixed point (0-255) */
            weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
            *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
         }
         break;

      case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
         /* clamp both texel coords to [0, length - 1] */
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
         coord0 = lp_build_clamp(int_coord_bld, coord0, int_coord_bld->zero,
                                 length_minus_one);
         coord1 = lp_build_clamp(int_coord_bld, coord1, int_coord_bld->zero,
                                 length_minus_one);
         break;

      case PIPE_TEX_WRAP_CLAMP:
      case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
      case PIPE_TEX_WRAP_MIRROR_REPEAT:
      case PIPE_TEX_WRAP_MIRROR_CLAMP:
      case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      default:
         /* remaining wrap modes are not handled by the AoS path */
         assert(0);
         coord0 = int_coord_bld->zero;
         coord1 = int_coord_bld->zero;
         break;
      }
      lp_build_sample_partial_offset(int_coord_bld, block_length, coord0, stride,
                                     offset0, i0);
      lp_build_sample_partial_offset(int_coord_bld, block_length, coord1, stride,
                                     offset1, i1);
      return;
   }

   /* single-pixel block: sub-block coords are always zero and offset1 can
    * be derived from offset0 with one stride add */
   *i0 = int_coord_bld->zero;
   *i1 = int_coord_bld->zero;

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
      }
      else {
         LLVMValueRef weight;
         LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
         lp_build_coord_repeat_npot_linear(bld, coord_f,
                                           length, length_f,
                                           &coord0, &weight);
         /* convert weight to 8-bit fixed point (0-255) */
         weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
         *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
      }

      /* mask is all-ones unless coord0 is at the last texel; AND-ing the
       * incremented offset with it wraps offset1 back to 0 there */
      mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
                              PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);

      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
      *offset1 = LLVMBuildAnd(builder,
                              lp_build_add(int_coord_bld, *offset0, stride),
                              mask, "");
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      /* XXX this might be slower than the separate path
       * on some newer cpus. With sse41 this is 8 instructions vs. 7
       * - at least on SNB this is almost certainly slower since
       * min/max are cheaper than selects, and the muls aren't bad.
       */
      lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                               PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
      umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                               PIPE_FUNC_LESS, coord0, length_minus_one);

      /* clamp coord0 to [0, length - 1] via selects */
      coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
      coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);

      /* mask is all-ones only when coord0 was strictly inside [0, length-1),
       * i.e. when the second texel is not clamped onto the first */
      mask = LLVMBuildAnd(builder, lmask, umask, "");

      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
      *offset1 = lp_build_add(int_coord_bld,
                              *offset0,
                              LLVMBuildAnd(builder, stride, mask, ""));
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      /* remaining wrap modes are not handled by the AoS path */
      assert(0);
      *offset0 = int_coord_bld->zero;
      *offset1 = int_coord_bld->zero;
      break;
   }
}
336
337
/**
 * Build LLVM code for texture coord wrapping, for linear filtering,
 * for float texcoords.
 * \param block_length is the length of the pixel block along the
 *                     coordinate axis
 * \param coord the incoming texcoord (s,t,r or q)
 * \param length the texture size along one dimension
 * \param is_pot if TRUE, length is a power of two
 * \param wrap_mode one of PIPE_TEX_WRAP_x
 * \param coord0 the first texcoord after wrapping, as int
 * \param coord1 the second texcoord after wrapping, as int
 * \param weight the filter weight as int (0-255, 8-bit fixed point)
 * \param force_nearest if this coord actually uses nearest filtering
 */
static void
lp_build_sample_wrap_linear_float(struct lp_build_sample_context *bld,
                                  unsigned block_length,
                                  LLVMValueRef coord,
                                  LLVMValueRef length,
                                  boolean is_pot,
                                  unsigned wrap_mode,
                                  LLVMValueRef *coord0,
                                  LLVMValueRef *coord1,
                                  LLVMValueRef *weight,
                                  unsigned force_nearest)
{
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         /* mul by size and subtract 0.5 */
         coord = lp_build_mul(coord_bld, coord, length);
         if (!force_nearest)
            coord = lp_build_sub(coord_bld, coord, half);
         *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
         /* convert to int, compute lerp weight */
         lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
         *coord1 = lp_build_ifloor(coord_bld, *coord1);
         /* repeat wrap */
         length_minus_one = lp_build_itrunc(coord_bld, length_minus_one);
         *coord0 = LLVMBuildAnd(builder, *coord0, length_minus_one, "");
         *coord1 = LLVMBuildAnd(builder, *coord1, length_minus_one, "");
      }
      else {
         LLVMValueRef mask;
         /* wrap with normalized floats is just fract */
         coord = lp_build_fract(coord_bld, coord);
         /* unnormalize */
         coord = lp_build_mul(coord_bld, coord, length);
         /*
          * we avoided the 0.5/length division, have to fix up wrong
          * edge cases with selects
          */
         *coord1 = lp_build_add(coord_bld, coord, half);
         coord = lp_build_sub(coord_bld, coord, half);
         *weight = lp_build_fract(coord_bld, coord);
         /* coord went negative after the 0.5 shift: wrap to last texel */
         mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
                                 PIPE_FUNC_LESS, coord, coord_bld->zero);
         *coord0 = lp_build_select(coord_bld, mask, length_minus_one, coord);
         *coord0 = lp_build_itrunc(coord_bld, *coord0);
         /* coord1 past the last texel: wrap to texel 0 */
         mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
                                 PIPE_FUNC_LESS, *coord1, length);
         *coord1 = lp_build_select(coord_bld, mask, *coord1, coord_bld->zero);
         *coord1 = lp_build_itrunc(coord_bld, *coord1);
      }
      break;
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      if (bld->static_state->normalized_coords) {
         /* mul by tex size */
         coord = lp_build_mul(coord_bld, coord, length);
      }
      /* subtract 0.5 */
      if (!force_nearest) {
         coord = lp_build_sub(coord_bld, coord, half);
      }
      /* clamp to [0, length - 1] */
      coord = lp_build_min(coord_bld, coord, length_minus_one);
      coord = lp_build_max(coord_bld, coord, coord_bld->zero);
      *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
      /* convert to int, compute lerp weight */
      lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
      /* coord1 = min(coord1, length-1) */
      *coord1 = lp_build_min(coord_bld, *coord1, length_minus_one);
      *coord1 = lp_build_itrunc(coord_bld, *coord1);
      break;
   default:
      /* remaining wrap modes are not handled by the AoS path */
      assert(0);
      *coord0 = int_coord_bld->zero;
      *coord1 = int_coord_bld->zero;
      *weight = coord_bld->zero;
      break;
   }
   /* convert weight to 8-bit fixed point (0-255) */
   *weight = lp_build_mul_imm(coord_bld, *weight, 256);
   *weight = lp_build_itrunc(coord_bld, *weight);
   return;
}
439
440
441 /**
442 * Fetch texels for image with nearest sampling.
443 * Return filtered color as two vectors of 16-bit fixed point values.
444 */
445 static void
446 lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
447 LLVMValueRef data_ptr,
448 LLVMValueRef offset,
449 LLVMValueRef x_subcoord,
450 LLVMValueRef y_subcoord,
451 LLVMValueRef *colors_lo,
452 LLVMValueRef *colors_hi)
453 {
454 /*
455 * Fetch the pixels as 4 x 32bit (rgba order might differ):
456 *
457 * rgba0 rgba1 rgba2 rgba3
458 *
459 * bit cast them into 16 x u8
460 *
461 * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
462 *
463 * unpack them into two 8 x i16:
464 *
465 * r0 g0 b0 a0 r1 g1 b1 a1
466 * r2 g2 b2 a2 r3 g3 b3 a3
467 *
468 * The higher 8 bits of the resulting elements will be zero.
469 */
470 LLVMBuilderRef builder = bld->gallivm->builder;
471 LLVMValueRef rgba8;
472 struct lp_build_context h16, u8n;
473 LLVMTypeRef u8n_vec_type;
474
475 lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
476 lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
477 u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
478
479 if (util_format_is_rgba8_variant(bld->format_desc)) {
480 /*
481 * Given the format is a rgba8, just read the pixels as is,
482 * without any swizzling. Swizzling will be done later.
483 */
484 rgba8 = lp_build_gather(bld->gallivm,
485 bld->texel_type.length,
486 bld->format_desc->block.bits,
487 bld->texel_type.width,
488 data_ptr, offset);
489
490 rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
491 }
492 else {
493 rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
494 bld->format_desc,
495 u8n.type,
496 data_ptr, offset,
497 x_subcoord,
498 y_subcoord);
499 }
500
501 /* Expand one 4*rgba8 to two 2*rgba16 */
502 lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
503 rgba8,
504 colors_lo, colors_hi);
505 }
506
507
/**
 * Sample a single texture image with nearest sampling.
 * If sampling a cube texture, r = cube face in [0,5].
 * Return filtered color as two vectors of 16-bit fixed point values.
 *
 * Coordinates are handled in 24.8 fixed point: coords are scaled by 256,
 * converted to int and shifted right by 8 to get the integer texel index.
 */
static void
lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                              LLVMValueRef int_size,
                              LLVMValueRef row_stride_vec,
                              LLVMValueRef img_stride_vec,
                              LLVMValueRef data_ptr,
                              LLVMValueRef mipoffsets,
                              LLVMValueRef s,
                              LLVMValueRef t,
                              LLVMValueRef r,
                              LLVMValueRef *colors_lo,
                              LLVMValueRef *colors_hi)
{
   const unsigned dims = bld->dims;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context i32;
   LLVMTypeRef i32_vec_type;
   LLVMValueRef i32_c8;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL;
   LLVMValueRef s_float, t_float = NULL, r_float = NULL;
   LLVMValueRef x_stride;
   LLVMValueRef x_offset, offset;
   LLVMValueRef x_subcoord, y_subcoord, z_subcoord;

   lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));

   i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);

   lp_build_extract_image_sizes(bld,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                int_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   /* keep the original float coords for the NPOT repeat wrap path */
   s_float = s; t_float = t; r_float = r;

   if (bld->static_state->normalized_coords) {
      LLVMValueRef scaled_size;
      LLVMValueRef flt_size;

      /* scale size by 256 (8 fractional bits) */
      scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8);

      flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size);

      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
   }
   else {
      /* scale coords by 256 (8 fractional bits) */
      s = lp_build_mul_imm(&bld->coord_bld, s, 256);
      if (dims >= 2)
         t = lp_build_mul_imm(&bld->coord_bld, t, 256);
      if (dims >= 3)
         r = lp_build_mul_imm(&bld->coord_bld, r, 256);
   }

   /* convert float to int */
   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
   if (dims >= 2)
      t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
   if (dims >= 3)
      r = LLVMBuildFPToSI(builder, r, i32_vec_type, "");

   /* compute floor (shift right 8) */
   i32_c8 = lp_build_const_int_vec(bld->gallivm, i32.type, 8);
   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
   if (dims >= 2)
      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
   if (dims >= 3)
      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");

   /* get pixel, row, image strides */
   x_stride = lp_build_const_vec(bld->gallivm,
                                 bld->int_coord_bld.type,
                                 bld->format_desc->block.bits/8);

   /* Do texcoord wrapping, compute texel offset */
   lp_build_sample_wrap_nearest_int(bld,
                                    bld->format_desc->block.width,
                                    s_ipart, s_float,
                                    width_vec, x_stride,
                                    bld->static_state->pot_width,
                                    bld->static_state->wrap_s,
                                    &x_offset, &x_subcoord);
   offset = x_offset;
   if (dims >= 2) {
      LLVMValueRef y_offset;
      lp_build_sample_wrap_nearest_int(bld,
                                       bld->format_desc->block.height,
                                       t_ipart, t_float,
                                       height_vec, row_stride_vec,
                                       bld->static_state->pot_height,
                                       bld->static_state->wrap_t,
                                       &y_offset, &y_subcoord);
      offset = lp_build_add(&bld->int_coord_bld, offset, y_offset);
      if (dims >= 3) {
         LLVMValueRef z_offset;
         lp_build_sample_wrap_nearest_int(bld,
                                          1, /* block length (depth) */
                                          r_ipart, r_float,
                                          depth_vec, img_stride_vec,
                                          bld->static_state->pot_depth,
                                          bld->static_state->wrap_r,
                                          &z_offset, &z_subcoord);
         offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
      }
   }
   if (bld->static_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_state->target == PIPE_TEXTURE_2D_ARRAY) {
      LLVMValueRef z_offset;
      /* The r coord is the cube face in [0,5] or array layer */
      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
      offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
   }
   if (mipoffsets) {
      /* add per-quad mipmap level offsets */
      offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
   }

   lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
                                       x_subcoord, y_subcoord,
                                       colors_lo, colors_hi);
}
639
640
/**
 * Sample a single texture image with nearest sampling.
 * If sampling a cube texture, r = cube face in [0,5].
 * Return filtered color as two vectors of 16-bit fixed point values.
 * Does address calcs (except offsets) with floats.
 * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
 */
static void
lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld,
                                     LLVMValueRef int_size,
                                     LLVMValueRef row_stride_vec,
                                     LLVMValueRef img_stride_vec,
                                     LLVMValueRef data_ptr,
                                     LLVMValueRef mipoffsets,
                                     LLVMValueRef s,
                                     LLVMValueRef t,
                                     LLVMValueRef r,
                                     LLVMValueRef *colors_lo,
                                     LLVMValueRef *colors_hi)
{
   const unsigned dims = bld->dims;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef offset;
   LLVMValueRef x_subcoord, y_subcoord;
   LLVMValueRef x_icoord = NULL, y_icoord = NULL, z_icoord = NULL;
   LLVMValueRef flt_size;

   /* image sizes as floats for the float wrap code below */
   flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   /* Do texcoord wrapping */
   lp_build_sample_wrap_nearest_float(bld,
                                      s, width_vec,
                                      bld->static_state->pot_width,
                                      bld->static_state->wrap_s,
                                      &x_icoord);

   if (dims >= 2) {
      lp_build_sample_wrap_nearest_float(bld,
                                         t, height_vec,
                                         bld->static_state->pot_height,
                                         bld->static_state->wrap_t,
                                         &y_icoord);

      if (dims >= 3) {
         lp_build_sample_wrap_nearest_float(bld,
                                            r, depth_vec,
                                            bld->static_state->pot_depth,
                                            bld->static_state->wrap_r,
                                            &z_icoord);
      }
   }
   if (bld->static_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_state->target == PIPE_TEXTURE_2D_ARRAY) {
      /* r is the cube face / array layer index, used unwrapped */
      z_icoord = r;
   }

   /*
    * From here on we deal with ints, and we should split up the 256bit
    * vectors manually for better generated code.
    */

   /*
    * compute texel offsets -
    * cannot do offset calc with floats, difficult for block-based formats,
    * and not enough precision anyway.
    */
   lp_build_sample_offset(&bld->int_coord_bld,
                          bld->format_desc,
                          x_icoord, y_icoord,
                          z_icoord,
                          row_stride_vec, img_stride_vec,
                          &offset,
                          &x_subcoord, &y_subcoord);
   if (mipoffsets) {
      /* add per-quad mipmap level offsets */
      offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
   }

   lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
                                       x_subcoord, y_subcoord,
                                       colors_lo, colors_hi);
}
731
732
733 /**
734 * Fetch texels for image with linear sampling.
735 * Return filtered color as two vectors of 16-bit fixed point values.
736 */
737 static void
738 lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
739 LLVMValueRef data_ptr,
740 LLVMValueRef offset[2][2][2],
741 LLVMValueRef x_subcoord[2],
742 LLVMValueRef y_subcoord[2],
743 LLVMValueRef s_fpart,
744 LLVMValueRef t_fpart,
745 LLVMValueRef r_fpart,
746 LLVMValueRef *colors_lo,
747 LLVMValueRef *colors_hi)
748 {
749 const unsigned dims = bld->dims;
750 LLVMBuilderRef builder = bld->gallivm->builder;
751 struct lp_build_context h16, u8n;
752 LLVMTypeRef h16_vec_type, u8n_vec_type;
753 LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
754 LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
755 LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
756 LLVMValueRef shuffle_lo, shuffle_hi;
757 LLVMValueRef s_fpart_lo, s_fpart_hi;
758 LLVMValueRef t_fpart_lo = NULL, t_fpart_hi = NULL;
759 LLVMValueRef r_fpart_lo = NULL, r_fpart_hi = NULL;
760 LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
761 LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
762 LLVMValueRef packed_lo, packed_hi;
763 unsigned i, j, k;
764 unsigned numj, numk;
765
766 lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
767 lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
768 h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type);
769 u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
770
771 /*
772 * Transform 4 x i32 in
773 *
774 * s_fpart = {s0, s1, s2, s3}
775 *
776 * into 8 x i16
777 *
778 * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
779 *
780 * into two 8 x i16
781 *
782 * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
783 * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
784 *
785 * and likewise for t_fpart. There is no risk of loosing precision here
786 * since the fractional parts only use the lower 8bits.
787 */
788 s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
789 if (dims >= 2)
790 t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
791 if (dims >= 3)
792 r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");
793
794 for (j = 0; j < h16.type.length; j += 4) {
795 #ifdef PIPE_ARCH_LITTLE_ENDIAN
796 unsigned subindex = 0;
797 #else
798 unsigned subindex = 1;
799 #endif
800 LLVMValueRef index;
801
802 index = LLVMConstInt(elem_type, j/2 + subindex, 0);
803 for (i = 0; i < 4; ++i)
804 shuffles_lo[j + i] = index;
805
806 index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
807 for (i = 0; i < 4; ++i)
808 shuffles_hi[j + i] = index;
809 }
810
811 shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
812 shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
813
814 s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
815 shuffle_lo, "");
816 s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
817 shuffle_hi, "");
818 if (dims >= 2) {
819 t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
820 shuffle_lo, "");
821 t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
822 shuffle_hi, "");
823 }
824 if (dims >= 3) {
825 r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
826 shuffle_lo, "");
827 r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
828 shuffle_hi, "");
829 }
830
831 /*
832 * Fetch the pixels as 4 x 32bit (rgba order might differ):
833 *
834 * rgba0 rgba1 rgba2 rgba3
835 *
836 * bit cast them into 16 x u8
837 *
838 * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
839 *
840 * unpack them into two 8 x i16:
841 *
842 * r0 g0 b0 a0 r1 g1 b1 a1
843 * r2 g2 b2 a2 r3 g3 b3 a3
844 *
845 * The higher 8 bits of the resulting elements will be zero.
846 */
847 numj = 1 + (dims >= 2);
848 numk = 1 + (dims >= 3);
849
850 for (k = 0; k < numk; k++) {
851 for (j = 0; j < numj; j++) {
852 for (i = 0; i < 2; i++) {
853 LLVMValueRef rgba8;
854
855 if (util_format_is_rgba8_variant(bld->format_desc)) {
856 /*
857 * Given the format is a rgba8, just read the pixels as is,
858 * without any swizzling. Swizzling will be done later.
859 */
860 rgba8 = lp_build_gather(bld->gallivm,
861 bld->texel_type.length,
862 bld->format_desc->block.bits,
863 bld->texel_type.width,
864 data_ptr, offset[k][j][i]);
865
866 rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
867 }
868 else {
869 rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
870 bld->format_desc,
871 u8n.type,
872 data_ptr, offset[k][j][i],
873 x_subcoord[i],
874 y_subcoord[j]);
875 }
876
877 /* Expand one 4*rgba8 to two 2*rgba16 */
878 lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
879 rgba8,
880 &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
881 }
882 }
883 }
884
885 /*
886 * Linear interpolation with 8.8 fixed point.
887 */
888 if (bld->static_state->force_nearest_s) {
889 /* special case 1-D lerp */
890 packed_lo = lp_build_lerp(&h16,
891 t_fpart_lo,
892 neighbors_lo[0][0][0],
893 neighbors_lo[0][0][1]);
894
895 packed_hi = lp_build_lerp(&h16,
896 t_fpart_hi,
897 neighbors_hi[0][1][0],
898 neighbors_hi[0][1][0]);
899 }
900 else if (bld->static_state->force_nearest_t) {
901 /* special case 1-D lerp */
902 packed_lo = lp_build_lerp(&h16,
903 s_fpart_lo,
904 neighbors_lo[0][0][0],
905 neighbors_lo[0][0][1]);
906
907 packed_hi = lp_build_lerp(&h16,
908 s_fpart_hi,
909 neighbors_hi[0][0][0],
910 neighbors_hi[0][0][1]);
911 }
912 else {
913 /* general 1/2/3-D lerping */
914 if (dims == 1) {
915 packed_lo = lp_build_lerp(&h16,
916 s_fpart_lo,
917 neighbors_lo[0][0][0],
918 neighbors_lo[0][0][1]);
919
920 packed_hi = lp_build_lerp(&h16,
921 s_fpart_hi,
922 neighbors_hi[0][0][0],
923 neighbors_hi[0][0][1]);
924 }
925 else {
926 /* 2-D lerp */
927 packed_lo = lp_build_lerp_2d(&h16,
928 s_fpart_lo, t_fpart_lo,
929 neighbors_lo[0][0][0],
930 neighbors_lo[0][0][1],
931 neighbors_lo[0][1][0],
932 neighbors_lo[0][1][1]);
933
934 packed_hi = lp_build_lerp_2d(&h16,
935 s_fpart_hi, t_fpart_hi,
936 neighbors_hi[0][0][0],
937 neighbors_hi[0][0][1],
938 neighbors_hi[0][1][0],
939 neighbors_hi[0][1][1]);
940
941 if (dims >= 3) {
942 LLVMValueRef packed_lo2, packed_hi2;
943
944 /* lerp in the second z slice */
945 packed_lo2 = lp_build_lerp_2d(&h16,
946 s_fpart_lo, t_fpart_lo,
947 neighbors_lo[1][0][0],
948 neighbors_lo[1][0][1],
949 neighbors_lo[1][1][0],
950 neighbors_lo[1][1][1]);
951
952 packed_hi2 = lp_build_lerp_2d(&h16,
953 s_fpart_hi, t_fpart_hi,
954 neighbors_hi[1][0][0],
955 neighbors_hi[1][0][1],
956 neighbors_hi[1][1][0],
957 neighbors_hi[1][1][1]);
958 /* interp between two z slices */
959 packed_lo = lp_build_lerp(&h16, r_fpart_lo,
960 packed_lo, packed_lo2);
961 packed_hi = lp_build_lerp(&h16, r_fpart_hi,
962 packed_hi, packed_hi2);
963 }
964 }
965 }
966
967 *colors_lo = packed_lo;
968 *colors_hi = packed_hi;
969 }
970
/**
 * Sample a single texture image with (bi-)(tri-)linear sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 *
 * Coordinates are converted to 8.8 fixed point (scaled by 256); the
 * arithmetic-shift-right by 8 yields the integer texel coordinate
 * (floor) and the masked low 8 bits become the lerp weight passed on
 * to the fetch/filter helper.
 */
static void
lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                             LLVMValueRef int_size,
                             LLVMValueRef row_stride_vec,
                             LLVMValueRef img_stride_vec,
                             LLVMValueRef data_ptr,
                             LLVMValueRef mipoffsets,
                             LLVMValueRef s,
                             LLVMValueRef t,
                             LLVMValueRef r,
                             LLVMValueRef *colors_lo,
                             LLVMValueRef *colors_hi)
{
   const unsigned dims = bld->dims;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context i32;
   LLVMTypeRef i32_vec_type;
   LLVMValueRef i32_c8, i32_c128, i32_c255;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef s_ipart, s_fpart, s_float;
   LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_float = NULL;
   LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_float = NULL;
   LLVMValueRef x_stride, y_stride, z_stride;
   LLVMValueRef x_offset0, x_offset1;
   LLVMValueRef y_offset0, y_offset1;
   LLVMValueRef z_offset0, z_offset1;
   LLVMValueRef offset[2][2][2]; /* [z][y][x] */
   LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
   unsigned x, y, z;

   lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));

   i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);

   lp_build_extract_image_sizes(bld,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                int_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   /* keep the original float coords around; the wrap code needs them */
   s_float = s; t_float = t; r_float = r;

   if (bld->static_state->normalized_coords) {
      LLVMValueRef scaled_size;
      LLVMValueRef flt_size;

      /* scale size by 256 (8 fractional bits) */
      scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8);

      flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size);

      /* s,t,r become unnormalized coords pre-scaled by 256 */
      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
   }
   else {
      /* scale coords by 256 (8 fractional bits) */
      s = lp_build_mul_imm(&bld->coord_bld, s, 256);
      if (dims >= 2)
         t = lp_build_mul_imm(&bld->coord_bld, t, 256);
      if (dims >= 3)
         r = lp_build_mul_imm(&bld->coord_bld, r, 256);
   }

   /* convert float to int */
   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
   if (dims >= 2)
      t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
   if (dims >= 3)
      r = LLVMBuildFPToSI(builder, r, i32_vec_type, "");

   /* subtract 0.5 (add -128) to center the filter footprint,
    * skipped per-axis when nearest filtering is forced for that axis */
   i32_c128 = lp_build_const_int_vec(bld->gallivm, i32.type, -128);
   if (!bld->static_state->force_nearest_s) {
      s = LLVMBuildAdd(builder, s, i32_c128, "");
   }
   if (dims >= 2 && !bld->static_state->force_nearest_t) {
      t = LLVMBuildAdd(builder, t, i32_c128, "");
   }
   if (dims >= 3) {
      r = LLVMBuildAdd(builder, r, i32_c128, "");
   }

   /* compute floor (shift right 8) */
   i32_c8 = lp_build_const_int_vec(bld->gallivm, i32.type, 8);
   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
   if (dims >= 2)
      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
   if (dims >= 3)
      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");

   /* compute fractional part (AND with 0xff) */
   i32_c255 = lp_build_const_int_vec(bld->gallivm, i32.type, 255);
   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
   if (dims >= 2)
      t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
   if (dims >= 3)
      r_fpart = LLVMBuildAnd(builder, r, i32_c255, "");

   /* get pixel, row and image strides */
   x_stride = lp_build_const_vec(bld->gallivm, bld->int_coord_bld.type,
                                 bld->format_desc->block.bits/8);
   y_stride = row_stride_vec;
   z_stride = img_stride_vec;

   /* do texcoord wrapping and compute texel offsets */
   lp_build_sample_wrap_linear_int(bld,
                                   bld->format_desc->block.width,
                                   s_ipart, &s_fpart, s_float,
                                   width_vec, x_stride,
                                   bld->static_state->pot_width,
                                   bld->static_state->wrap_s,
                                   &x_offset0, &x_offset1,
                                   &x_subcoord[0], &x_subcoord[1]);

   /* add potential cube/array/mip offsets now as they are constant per pixel */
   if (bld->static_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_state->target == PIPE_TEXTURE_2D_ARRAY) {
      LLVMValueRef z_offset;
      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
      /* The r coord is the cube face in [0,5] or array layer */
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, z_offset);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, z_offset);
   }
   if (mipoffsets) {
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, mipoffsets);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, mipoffsets);
   }

   /* seed all eight neighbor offsets with the two x offsets;
    * y/z contributions are accumulated below */
   for (z = 0; z < 2; z++) {
      for (y = 0; y < 2; y++) {
         offset[z][y][0] = x_offset0;
         offset[z][y][1] = x_offset1;
      }
   }

   if (dims >= 2) {
      lp_build_sample_wrap_linear_int(bld,
                                      bld->format_desc->block.height,
                                      t_ipart, &t_fpart, t_float,
                                      height_vec, y_stride,
                                      bld->static_state->pot_height,
                                      bld->static_state->wrap_t,
                                      &y_offset0, &y_offset1,
                                      &y_subcoord[0], &y_subcoord[1]);

      for (z = 0; z < 2; z++) {
         for (x = 0; x < 2; x++) {
            offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][0][x], y_offset0);
            offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][1][x], y_offset1);
         }
      }
   }

   if (dims >= 3) {
      /* NOTE(review): passes block.height for the depth axis; block depth
       * is presumably 1 for all supported formats here — confirm this is
       * intentional rather than a copy of the dims>=2 call. */
      lp_build_sample_wrap_linear_int(bld,
                                      bld->format_desc->block.height,
                                      r_ipart, &r_fpart, r_float,
                                      depth_vec, z_stride,
                                      bld->static_state->pot_depth,
                                      bld->static_state->wrap_r,
                                      &z_offset0, &z_offset1,
                                      &z_subcoord[0], &z_subcoord[1]);
      for (y = 0; y < 2; y++) {
         for (x = 0; x < 2; x++) {
            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[0][y][x], z_offset0);
            offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[1][y][x], z_offset1);
         }
      }
   }

   /* fetch the 2/4/8 neighboring texels and do the actual filtering */
   lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
                                      x_subcoord, y_subcoord,
                                      s_fpart, t_fpart, r_fpart,
                                      colors_lo, colors_hi);
}
1156
1157
/**
 * Sample a single texture image with (bi-)(tri-)linear sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 * Does address calcs (except offsets) with floats.
 * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
 *
 * Unlike lp_build_sample_image_linear(), the wrap helper here both wraps
 * and produces the two integer texel coordinates plus the fractional
 * lerp weight directly from float math; only the final byte-offset
 * computation is done with integer vectors.
 */
static void
lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld,
                                    LLVMValueRef int_size,
                                    LLVMValueRef row_stride_vec,
                                    LLVMValueRef img_stride_vec,
                                    LLVMValueRef data_ptr,
                                    LLVMValueRef mipoffsets,
                                    LLVMValueRef s,
                                    LLVMValueRef t,
                                    LLVMValueRef r,
                                    LLVMValueRef *colors_lo,
                                    LLVMValueRef *colors_hi)
{
   const unsigned dims = bld->dims;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef s_fpart;
   LLVMValueRef t_fpart = NULL;
   LLVMValueRef r_fpart = NULL;
   LLVMValueRef x_stride, y_stride, z_stride;
   LLVMValueRef x_offset0, x_offset1;
   LLVMValueRef y_offset0, y_offset1;
   LLVMValueRef z_offset0, z_offset1;
   LLVMValueRef offset[2][2][2]; /* [z][y][x] */
   LLVMValueRef x_subcoord[2], y_subcoord[2];
   LLVMValueRef flt_size;
   LLVMValueRef x_icoord0, x_icoord1;
   LLVMValueRef y_icoord0, y_icoord1;
   LLVMValueRef z_icoord0, z_icoord1;
   unsigned x, y, z;

   /* image sizes are needed as floats for the float wrap math */
   flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   /* do texcoord wrapping and compute texel offsets */
   lp_build_sample_wrap_linear_float(bld,
                                     bld->format_desc->block.width,
                                     s, width_vec,
                                     bld->static_state->pot_width,
                                     bld->static_state->wrap_s,
                                     &x_icoord0, &x_icoord1,
                                     &s_fpart,
                                     bld->static_state->force_nearest_s);

   if (dims >= 2) {
      lp_build_sample_wrap_linear_float(bld,
                                        bld->format_desc->block.height,
                                        t, height_vec,
                                        bld->static_state->pot_height,
                                        bld->static_state->wrap_t,
                                        &y_icoord0, &y_icoord1,
                                        &t_fpart,
                                        bld->static_state->force_nearest_t);

      if (dims >= 3) {
         /* NOTE(review): passes block.height for the depth axis (block
          * depth is presumably 1) — confirm this is intentional.
          * force_nearest is never applied to r, hence the trailing 0. */
         lp_build_sample_wrap_linear_float(bld,
                                           bld->format_desc->block.height,
                                           r, depth_vec,
                                           bld->static_state->pot_depth,
                                           bld->static_state->wrap_r,
                                           &z_icoord0, &z_icoord1,
                                           &r_fpart, 0);
      }
   }

   /*
    * From here on we deal with ints, and we should split up the 256bit
    * vectors manually for better generated code.
    */

   /* get pixel, row and image strides */
   x_stride = lp_build_const_vec(bld->gallivm,
                                 bld->int_coord_bld.type,
                                 bld->format_desc->block.bits/8);
   y_stride = row_stride_vec;
   z_stride = img_stride_vec;

   /*
    * compute texel offset -
    * cannot do offset calc with floats, difficult for block-based formats,
    * and not enough precision anyway.
    */
   lp_build_sample_partial_offset(&bld->int_coord_bld,
                                  bld->format_desc->block.width,
                                  x_icoord0, x_stride,
                                  &x_offset0, &x_subcoord[0]);
   lp_build_sample_partial_offset(&bld->int_coord_bld,
                                  bld->format_desc->block.width,
                                  x_icoord1, x_stride,
                                  &x_offset1, &x_subcoord[1]);

   /* add potential cube/array/mip offsets now as they are constant per pixel */
   if (bld->static_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_state->target == PIPE_TEXTURE_2D_ARRAY) {
      LLVMValueRef z_offset;
      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
      /* The r coord is the cube face in [0,5] or array layer */
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, z_offset);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, z_offset);
   }
   if (mipoffsets) {
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, mipoffsets);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, mipoffsets);
   }

   /* seed all eight neighbor offsets with the two x offsets;
    * y/z contributions are accumulated below */
   for (z = 0; z < 2; z++) {
      for (y = 0; y < 2; y++) {
         offset[z][y][0] = x_offset0;
         offset[z][y][1] = x_offset1;
      }
   }

   if (dims >= 2) {
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     bld->format_desc->block.height,
                                     y_icoord0, y_stride,
                                     &y_offset0, &y_subcoord[0]);
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     bld->format_desc->block.height,
                                     y_icoord1, y_stride,
                                     &y_offset1, &y_subcoord[1]);
      for (z = 0; z < 2; z++) {
         for (x = 0; x < 2; x++) {
            offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][0][x], y_offset0);
            offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][1][x], y_offset1);
         }
      }
   }

   if (dims >= 3) {
      /* block depth of 1: z_subcoord is always zero and only needed
       * to satisfy the helper's out-parameter */
      LLVMValueRef z_subcoord[2];
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     1,
                                     z_icoord0, z_stride,
                                     &z_offset0, &z_subcoord[0]);
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     1,
                                     z_icoord1, z_stride,
                                     &z_offset1, &z_subcoord[1]);
      for (y = 0; y < 2; y++) {
         for (x = 0; x < 2; x++) {
            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[0][y][x], z_offset0);
            offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[1][y][x], z_offset1);
         }
      }
   }

   /* fetch the 2/4/8 neighboring texels and do the actual filtering */
   lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
                                      x_subcoord, y_subcoord,
                                      s_fpart, t_fpart, r_fpart,
                                      colors_lo, colors_hi);
}
1327
1328
1329 /**
1330 * Sample the texture/mipmap using given image filter and mip filter.
1331 * data0_ptr and data1_ptr point to the two mipmap levels to sample
1332 * from. width0/1_vec, height0/1_vec, depth0/1_vec indicate their sizes.
1333 * If we're using nearest miplevel sampling the '1' values will be null/unused.
1334 */
1335 static void
1336 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
1337 unsigned img_filter,
1338 unsigned mip_filter,
1339 LLVMValueRef s,
1340 LLVMValueRef t,
1341 LLVMValueRef r,
1342 LLVMValueRef ilevel0,
1343 LLVMValueRef ilevel1,
1344 LLVMValueRef lod_fpart,
1345 LLVMValueRef colors_lo_var,
1346 LLVMValueRef colors_hi_var)
1347 {
1348 LLVMBuilderRef builder = bld->gallivm->builder;
1349 LLVMValueRef size0;
1350 LLVMValueRef size1;
1351 LLVMValueRef row_stride0_vec = NULL;
1352 LLVMValueRef row_stride1_vec = NULL;
1353 LLVMValueRef img_stride0_vec = NULL;
1354 LLVMValueRef img_stride1_vec = NULL;
1355 LLVMValueRef data_ptr0;
1356 LLVMValueRef data_ptr1;
1357 LLVMValueRef mipoff0 = NULL;
1358 LLVMValueRef mipoff1 = NULL;
1359 LLVMValueRef colors0_lo, colors0_hi;
1360 LLVMValueRef colors1_lo, colors1_hi;
1361
1362 /* sample the first mipmap level */
1363 lp_build_mipmap_level_sizes(bld, ilevel0,
1364 &size0,
1365 &row_stride0_vec, &img_stride0_vec);
1366 if (bld->num_lods == 1) {
1367 data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1368 }
1369 else {
1370 /* This path should work for num_lods 1 too but slightly less efficient */
1371 data_ptr0 = bld->base_ptr;
1372 mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1373 }
1374
1375 if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
1376 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1377 lp_build_sample_image_nearest_afloat(bld,
1378 size0,
1379 row_stride0_vec, img_stride0_vec,
1380 data_ptr0, mipoff0, s, t, r,
1381 &colors0_lo, &colors0_hi);
1382 }
1383 else {
1384 assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1385 lp_build_sample_image_linear_afloat(bld,
1386 size0,
1387 row_stride0_vec, img_stride0_vec,
1388 data_ptr0, mipoff0, s, t, r,
1389 &colors0_lo, &colors0_hi);
1390 }
1391 }
1392 else {
1393 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1394 lp_build_sample_image_nearest(bld,
1395 size0,
1396 row_stride0_vec, img_stride0_vec,
1397 data_ptr0, mipoff0, s, t, r,
1398 &colors0_lo, &colors0_hi);
1399 }
1400 else {
1401 assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1402 lp_build_sample_image_linear(bld,
1403 size0,
1404 row_stride0_vec, img_stride0_vec,
1405 data_ptr0, mipoff0, s, t, r,
1406 &colors0_lo, &colors0_hi);
1407 }
1408 }
1409
1410 /* Store the first level's colors in the output variables */
1411 LLVMBuildStore(builder, colors0_lo, colors_lo_var);
1412 LLVMBuildStore(builder, colors0_hi, colors_hi_var);
1413
1414 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1415 LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm,
1416 bld->perquadf_bld.type, 256.0);
1417 LLVMTypeRef i32vec_type = lp_build_vec_type(bld->gallivm, bld->perquadi_bld.type);
1418 struct lp_build_if_state if_ctx;
1419 LLVMValueRef need_lerp;
1420 unsigned num_quads = bld->coord_bld.type.length / 4;
1421 unsigned i;
1422
1423 lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16vec_scale, "");
1424 lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16");
1425
1426 /* need_lerp = lod_fpart > 0 */
1427 if (num_quads == 1) {
1428 need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
1429 lod_fpart, bld->perquadi_bld.zero,
1430 "need_lerp");
1431 }
1432 else {
1433 /*
1434 * We'll do mip filtering if any of the quads need it.
1435 * It might be better to split the vectors here and only fetch/filter
1436 * quads which need it.
1437 */
1438 /*
1439 * We need to clamp lod_fpart here since we can get negative
1440 * values which would screw up filtering if not all
1441 * lod_fpart values have same sign.
1442 * We can however then skip the greater than comparison.
1443 */
1444 lod_fpart = lp_build_max(&bld->perquadi_bld, lod_fpart,
1445 bld->perquadi_bld.zero);
1446 need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, lod_fpart);
1447 }
1448
1449 lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1450 {
1451 struct lp_build_context h16_bld;
1452
1453 lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
1454
1455 /* sample the second mipmap level */
1456 lp_build_mipmap_level_sizes(bld, ilevel1,
1457 &size1,
1458 &row_stride1_vec, &img_stride1_vec);
1459 lp_build_mipmap_level_sizes(bld, ilevel1,
1460 &size1,
1461 &row_stride1_vec, &img_stride1_vec);
1462 if (bld->num_lods == 1) {
1463 data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1464 }
1465 else {
1466 data_ptr1 = bld->base_ptr;
1467 mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1468 }
1469
1470 if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
1471 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1472 lp_build_sample_image_nearest_afloat(bld,
1473 size1,
1474 row_stride1_vec, img_stride1_vec,
1475 data_ptr1, mipoff1, s, t, r,
1476 &colors1_lo, &colors1_hi);
1477 }
1478 else {
1479 lp_build_sample_image_linear_afloat(bld,
1480 size1,
1481 row_stride1_vec, img_stride1_vec,
1482 data_ptr1, mipoff1, s, t, r,
1483 &colors1_lo, &colors1_hi);
1484 }
1485 }
1486 else {
1487 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1488 lp_build_sample_image_nearest(bld,
1489 size1,
1490 row_stride1_vec, img_stride1_vec,
1491 data_ptr1, mipoff1, s, t, r,
1492 &colors1_lo, &colors1_hi);
1493 }
1494 else {
1495 lp_build_sample_image_linear(bld,
1496 size1,
1497 row_stride1_vec, img_stride1_vec,
1498 data_ptr1, mipoff1, s, t, r,
1499 &colors1_lo, &colors1_hi);
1500 }
1501 }
1502
1503 /* interpolate samples from the two mipmap levels */
1504
1505 if (num_quads == 1) {
1506 lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, "");
1507 lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart);
1508
1509 #if HAVE_LLVM == 0x208
1510 /* This is a work-around for a bug in LLVM 2.8.
1511 * Evidently, something goes wrong in the construction of the
1512 * lod_fpart short[8] vector. Adding this no-effect shuffle seems
1513 * to force the vector to be properly constructed.
1514 * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f).
1515 */
1516 {
1517 LLVMValueRef shuffles[8], shuffle;
1518 assert(h16_bld.type.length <= Elements(shuffles));
1519 for (i = 0; i < h16_bld.type.length; i++)
1520 shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1));
1521 shuffle = LLVMConstVector(shuffles, h16_bld.type.length);
1522 lod_fpart = LLVMBuildShuffleVector(builder,
1523 lod_fpart, lod_fpart,
1524 shuffle, "");
1525 }
1526 #endif
1527
1528 colors0_lo = lp_build_lerp(&h16_bld, lod_fpart,
1529 colors0_lo, colors1_lo);
1530 colors0_hi = lp_build_lerp(&h16_bld, lod_fpart,
1531 colors0_hi, colors1_hi);
1532 }
1533 else {
1534 LLVMValueRef lod_parts[LP_MAX_VECTOR_LENGTH/16];
1535 struct lp_type perquadi16_type = bld->perquadi_bld.type;
1536 perquadi16_type.width /= 2;
1537 perquadi16_type.length *= 2;
1538 lod_fpart = LLVMBuildBitCast(builder, lod_fpart,
1539 lp_build_vec_type(bld->gallivm,
1540 perquadi16_type), "");
1541 /* XXX this only works for exactly 2 quads. More quads need shuffle */
1542 assert(num_quads == 2);
1543 for (i = 0; i < num_quads; i++) {
1544 LLVMValueRef indexi2 = lp_build_const_int32(bld->gallivm, i*2);
1545 lod_parts[i] = lp_build_extract_broadcast(bld->gallivm,
1546 perquadi16_type,
1547 h16_bld.type,
1548 lod_fpart,
1549 indexi2);
1550 }
1551 colors0_lo = lp_build_lerp(&h16_bld, lod_parts[0],
1552 colors0_lo, colors1_lo);
1553 colors0_hi = lp_build_lerp(&h16_bld, lod_parts[1],
1554 colors0_hi, colors1_hi);
1555 }
1556
1557 LLVMBuildStore(builder, colors0_lo, colors_lo_var);
1558 LLVMBuildStore(builder, colors0_hi, colors_hi_var);
1559 }
1560 lp_build_endif(&if_ctx);
1561 }
1562 }
1563
1564
1565
/**
 * Texture sampling in AoS format. Used when sampling common 32-bit/texel
 * formats. 1D/2D/3D/cube texture supported. All mipmap sampling modes
 * but only limited texture coord wrap modes.
 *
 * Results are produced as two 8.8 fixed-point vector halves which are
 * packed to unorm8, converted to SoA floats and swizzled into texel_out[4].
 */
void
lp_build_sample_aos(struct lp_build_sample_context *bld,
                    unsigned unit,
                    LLVMValueRef s,
                    LLVMValueRef t,
                    LLVMValueRef r,
                    LLVMValueRef lod_ipart,
                    LLVMValueRef lod_fpart,
                    LLVMValueRef ilevel0,
                    LLVMValueRef ilevel1,
                    LLVMValueRef texel_out[4])
{
   struct lp_build_context *int_bld = &bld->int_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   const unsigned mip_filter = bld->static_state->min_mip_filter;
   const unsigned min_filter = bld->static_state->min_img_filter;
   const unsigned mag_filter = bld->static_state->mag_img_filter;
   const unsigned dims = bld->dims;
   LLVMValueRef packed, packed_lo, packed_hi;
   LLVMValueRef unswizzled[4];
   struct lp_build_context h16_bld;

   /* we only support the common/simple wrap modes at this time */
   assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s));
   if (dims >= 2)
      assert(lp_is_simple_wrap_mode(bld->static_state->wrap_t));
   if (dims >= 3)
      assert(lp_is_simple_wrap_mode(bld->static_state->wrap_r));


   /* make 16-bit fixed-pt builder context */
   lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));

   /*
    * Get/interpolate texture colors.
    */

   /* alloca'd variables so lp_build_sample_mipmap() can store its results
    * from inside generated control flow */
   packed_lo = lp_build_alloca(bld->gallivm, h16_bld.vec_type, "packed_lo");
   packed_hi = lp_build_alloca(bld->gallivm, h16_bld.vec_type, "packed_hi");

   if (min_filter == mag_filter) {
      /* no need to distinguish between minification and magnification */
      lp_build_sample_mipmap(bld,
                             min_filter, mip_filter,
                             s, t, r,
                             ilevel0, ilevel1, lod_fpart,
                             packed_lo, packed_hi);
   }
   else {
      /* Emit conditional to choose min image filter or mag image filter
       * depending on the lod being > 0 or <= 0, respectively.
       */
      struct lp_build_if_state if_ctx;
      LLVMValueRef minify;

      /*
       * XXX this should take all lods into account; if some are min and
       * some max we probably could hack up the coords/weights in the linear
       * path with selects to work for nearest.
       * If that's just two quads sitting next to each other it seems
       * quite ok to do the same filtering method on both though, at
       * least unless we have explicit lod (and who uses different
       * min/mag filter with that?)
       */
      if (bld->num_lods > 1)
         lod_ipart = LLVMBuildExtractElement(builder, lod_ipart,
                                             lp_build_const_int32(bld->gallivm, 0), "");

      /* minify = lod >= 0.0 (integer lod compared against 0) */
      minify = LLVMBuildICmp(builder, LLVMIntSGE,
                             lod_ipart, int_bld->zero, "");

      lp_build_if(&if_ctx, bld->gallivm, minify);
      {
         /* Use the minification filter */
         lp_build_sample_mipmap(bld,
                                min_filter, mip_filter,
                                s, t, r,
                                ilevel0, ilevel1, lod_fpart,
                                packed_lo, packed_hi);
      }
      lp_build_else(&if_ctx);
      {
         /* Use the magnification filter (no mip filtering when magnifying) */
         lp_build_sample_mipmap(bld,
                                mag_filter, PIPE_TEX_MIPFILTER_NONE,
                                s, t, r,
                                ilevel0, NULL, NULL,
                                packed_lo, packed_hi);
      }
      lp_build_endif(&if_ctx);
   }

   /*
    * combine the values stored in 'packed_lo' and 'packed_hi' variables
    * into 'packed'
    */
   packed = lp_build_pack2(bld->gallivm,
                           h16_bld.type, lp_type_unorm(8, bld->vector_width),
                           LLVMBuildLoad(builder, packed_lo, ""),
                           LLVMBuildLoad(builder, packed_hi, ""));

   /*
    * Convert to SoA and swizzle.
    */
   lp_build_rgba8_to_f32_soa(bld->gallivm,
                             bld->texel_type,
                             packed, unswizzled);

   if (util_format_is_rgba8_variant(bld->format_desc)) {
      /* rgba8-like formats: apply the format's channel swizzle */
      lp_build_format_swizzle_soa(bld->format_desc,
                                  &bld->texel_bld,
                                  unswizzled, texel_out);
   }
   else {
      texel_out[0] = unswizzled[0];
      texel_out[1] = unswizzled[1];
      texel_out[2] = unswizzled[2];
      texel_out[3] = unswizzled[3];
   }
}