/**************************************************************************
 *
 * Copyright 2010 VMware, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
/**
 * Texture sampling -- AoS.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 * @author Brian Paul <brianp@vmware.com>
 */
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_debug.h"
#include "util/u_dump.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/format/u_format.h"
#include "util/u_cpu_detect.h"
#include "lp_bld_debug.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_conv.h"
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_logic.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_pack.h"
#include "lp_bld_flow.h"
#include "lp_bld_gather.h"
#include "lp_bld_format.h"
#include "lp_bld_init.h"
#include "lp_bld_sample.h"
#include "lp_bld_sample_aos.h"
#include "lp_bld_quad.h"
/**
 * Build LLVM code for texture coord wrapping, for nearest filtering,
 * for scaled integer texcoords.
 * \param block_length  is the length of the pixel block along the
 *                      coordinate axis
 * \param coord  the incoming texcoord (s,t or r) scaled to the texture size
 * \param coord_f  the incoming texcoord (s,t or r) as float vec
 * \param length  the texture size along one dimension
 * \param stride  pixel stride along the coordinate axis (in bytes)
 * \param offset  the texel offset along the coord axis
 * \param is_pot  if TRUE, length is a power of two
 * \param wrap_mode  one of PIPE_TEX_WRAP_x
 * \param out_offset  byte offset for the wrapped coordinate
 * \param out_i  resulting sub-block pixel coordinate for coord0
 */
static void
lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
                                 unsigned block_length,
                                 LLVMValueRef coord,
                                 LLVMValueRef coord_f,
                                 LLVMValueRef length,
                                 LLVMValueRef stride,
                                 LLVMValueRef offset,
                                 boolean is_pot,
                                 unsigned wrap_mode,
                                 LLVMValueRef *out_offset,
                                 LLVMValueRef *out_i)
{
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef length_minus_one;

   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);

   switch (wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot)
         coord = LLVMBuildAnd(builder, coord, length_minus_one, "");
      else {
         struct lp_build_context *coord_bld = &bld->coord_bld;
         LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            offset = lp_build_div(coord_bld, offset, length_f);
            coord_f = lp_build_add(coord_bld, coord_f, offset);
         }
         coord = lp_build_fract_safe(coord_bld, coord_f);
         coord = lp_build_mul(coord_bld, coord, length_f);
         coord = lp_build_itrunc(coord_bld, coord);
      }
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      assert(0);
      break;
   }

   lp_build_sample_partial_offset(int_coord_bld, block_length, coord, stride,
                                  out_offset, out_i);
}
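/*
 * Note on the REPEAT paths above: for a power-of-two size the wrap is a
 * single AND with (length - 1), e.g. a scaled coord of 9 on an 8-texel axis
 * becomes 9 & 7 = 1.  The non-power-of-two path works on the normalized
 * float coord instead, computing trunc(fract(coord_f) * length); e.g.
 * s = 1.125 on a 10-texel axis gives trunc(0.125 * 10) = 1.
 */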
/**
 * Helper to compute the first coord and the weight for
 * linear wrap repeat npot textures
 */
static void
lp_build_coord_repeat_npot_linear_int(struct lp_build_sample_context *bld,
                                      LLVMValueRef coord_f,
                                      LLVMValueRef length_i,
                                      LLVMValueRef length_f,
                                      LLVMValueRef *coord0_i,
                                      LLVMValueRef *weight_i)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   struct lp_build_context abs_coord_bld;
   struct lp_type abs_type;
   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
                                                int_coord_bld->one);
   LLVMValueRef mask, i32_c8, i32_c128, i32_c255;

   /* wrap with normalized floats is just fract */
   coord_f = lp_build_fract(coord_bld, coord_f);
   coord_f = lp_build_mul(coord_bld, coord_f, length_f);
   /* convert to int, compute lerp weight */
   coord_f = lp_build_mul_imm(&bld->coord_bld, coord_f, 256);

   /* At this point we don't have any negative numbers so use non-signed
    * build context which might help on some archs.
    */
   abs_type = coord_bld->type;
   abs_type.sign = 0;
   lp_build_context_init(&abs_coord_bld, bld->gallivm, abs_type);
   *coord0_i = lp_build_iround(&abs_coord_bld, coord_f);

   /* subtract 0.5 (add -128) */
   i32_c128 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, -128);
   *coord0_i = LLVMBuildAdd(bld->gallivm->builder, *coord0_i, i32_c128, "");

   /* compute fractional part (AND with 0xff) */
   i32_c255 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 255);
   *weight_i = LLVMBuildAnd(bld->gallivm->builder, *coord0_i, i32_c255, "");

   /* compute floor (shift right 8) */
   i32_c8 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 8);
   *coord0_i = LLVMBuildAShr(bld->gallivm->builder, *coord0_i, i32_c8, "");

   /*
    * we avoided the 0.5/length division before the repeat wrap,
    * now need to fix up edge cases with selects
    */
   mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                           PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero);
   *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);

   /*
    * We should never get values too large - except if coord was nan or inf,
    * in which case things go terribly wrong...
    * Alternatively, could use fract_safe above...
    */
   *coord0_i = lp_build_min(int_coord_bld, *coord0_i, length_minus_one);
}
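/*
 * Worked example of the 8.8 fixed-point math above: for a normalized coord
 * of 0.3 on a 10-texel axis, fract(0.3) * 10 * 256 rounds to 768; adding
 * -128 (the -0.5 texel shift) gives 640, so the lerp weight is
 * 640 & 0xff = 128 (i.e. 0.5) and the first texel is 640 >> 8 = 2, which
 * matches floor(0.3 * 10 - 0.5) = 2.
 */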
/**
 * Build LLVM code for texture coord wrapping, for linear filtering,
 * for scaled integer texcoords.
 * \param block_length  is the length of the pixel block along the
 *                      coordinate axis
 * \param coord0  the incoming texcoord (s,t or r) scaled to the texture size
 * \param coord_f  the incoming texcoord (s,t or r) as float vec
 * \param length  the texture size along one dimension
 * \param stride  pixel stride along the coordinate axis (in bytes)
 * \param offset  the texel offset along the coord axis
 * \param is_pot  if TRUE, length is a power of two
 * \param wrap_mode  one of PIPE_TEX_WRAP_x
 * \param offset0  resulting relative offset for coord0
 * \param offset1  resulting relative offset for coord0 + 1
 * \param i0  resulting sub-block pixel coordinate for coord0
 * \param i1  resulting sub-block pixel coordinate for coord0 + 1
 */
static void
lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
                                unsigned block_length,
                                LLVMValueRef coord0,
                                LLVMValueRef *weight_i,
                                LLVMValueRef coord_f,
                                LLVMValueRef length,
                                LLVMValueRef stride,
                                LLVMValueRef offset,
                                boolean is_pot,
                                unsigned wrap_mode,
                                LLVMValueRef *offset0,
                                LLVMValueRef *offset1,
                                LLVMValueRef *i0,
                                LLVMValueRef *i1)
{
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef length_minus_one;
   LLVMValueRef lmask, umask, mask;

   /*
    * If the pixel block covers more than one pixel then there is no easy
    * way to calculate offset1 relative to offset0. Instead, compute them
    * independently. Otherwise, try to compute offset0 and offset1 with
    * a single stride multiplication.
    */

   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);

   if (block_length != 1) {
      LLVMValueRef coord1;

      switch (wrap_mode) {
      case PIPE_TEX_WRAP_REPEAT:
         if (is_pot) {
            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
            coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
            coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
         }
         else {
            LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
            if (offset) {
               offset = lp_build_int_to_float(&bld->coord_bld, offset);
               offset = lp_build_div(&bld->coord_bld, offset, length_f);
               coord_f = lp_build_add(&bld->coord_bld, coord_f, offset);
            }
            lp_build_coord_repeat_npot_linear_int(bld, coord_f,
                                                  length, length_f,
                                                  &coord0, weight_i);
            mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
                                    PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
            coord1 = LLVMBuildAnd(builder,
                                  lp_build_add(int_coord_bld, coord0,
                                               int_coord_bld->one),
                                  mask, "");
         }
         break;

      case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
         coord0 = lp_build_clamp(int_coord_bld, coord0, int_coord_bld->zero,
                                 length_minus_one);
         coord1 = lp_build_clamp(int_coord_bld, coord1, int_coord_bld->zero,
                                 length_minus_one);
         break;

      case PIPE_TEX_WRAP_CLAMP:
      case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
      case PIPE_TEX_WRAP_MIRROR_REPEAT:
      case PIPE_TEX_WRAP_MIRROR_CLAMP:
      case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      default:
         assert(0);
         coord0 = int_coord_bld->zero;
         coord1 = int_coord_bld->zero;
         break;
      }
      lp_build_sample_partial_offset(int_coord_bld, block_length, coord0, stride,
                                     offset0, i0);
      lp_build_sample_partial_offset(int_coord_bld, block_length, coord1, stride,
                                     offset1, i1);
      return;
   }

   *i0 = int_coord_bld->zero;
   *i1 = int_coord_bld->zero;

   switch (wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
      }
      else {
         LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
         if (offset) {
            offset = lp_build_int_to_float(&bld->coord_bld, offset);
            offset = lp_build_div(&bld->coord_bld, offset, length_f);
            coord_f = lp_build_add(&bld->coord_bld, coord_f, offset);
         }
         lp_build_coord_repeat_npot_linear_int(bld, coord_f,
                                               length, length_f,
                                               &coord0, weight_i);
      }

      mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
                              PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);

      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
      *offset1 = LLVMBuildAnd(builder,
                              lp_build_add(int_coord_bld, *offset0, stride),
                              mask, "");
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      /* XXX this might be slower than the separate path
       * on some newer cpus. With sse41 this is 8 instructions vs. 7
       * - at least on SNB this is almost certainly slower since
       * min/max are cheaper than selects, and the muls aren't bad.
       */
      lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                               PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
      umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                               PIPE_FUNC_LESS, coord0, length_minus_one);

      coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
      coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);

      mask = LLVMBuildAnd(builder, lmask, umask, "");

      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
      *offset1 = lp_build_add(int_coord_bld,
                              *offset0,
                              LLVMBuildAnd(builder, stride, mask, ""));
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      assert(0);
      *offset0 = int_coord_bld->zero;
      *offset1 = int_coord_bld->zero;
      break;
   }
}
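/*
 * Note on the single-stride path above (block_length == 1): offset0 is a
 * plain byte offset, coord0 * stride, and offset1 is derived from it with a
 * masked add so that the last texel wraps for REPEAT.  E.g. with stride 4
 * and length 8, coord0 = 7 gives offset0 = 28; the NOTEQUAL mask is zero
 * there, so offset1 = (28 + 4) & 0 = 0, i.e. texel 0 as REPEAT requires.
 */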
/**
 * Fetch texels for image with nearest sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
                                    LLVMValueRef data_ptr,
                                    LLVMValueRef offset,
                                    LLVMValueRef x_subcoord,
                                    LLVMValueRef y_subcoord,
                                    LLVMValueRef *colors)
{
   /*
    * Fetch the pixels as 4 x 32bit (rgba order might differ):
    *
    *   rgba0 rgba1 rgba2 rgba3
    *
    * bit cast them into 16 x u8
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
    *
    * unpack them into two 8 x i16:
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1
    *   r2 g2 b2 a2 r3 g3 b3 a3
    *
    * The higher 8 bits of the resulting elements will be zero.
    */
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef rgba8;
   struct lp_build_context u8n;
   LLVMTypeRef u8n_vec_type;
   struct lp_type fetch_type;

   lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
   u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);

   fetch_type = lp_type_uint(bld->texel_type.width);
   if (util_format_is_rgba8_variant(bld->format_desc)) {
      /*
       * Given the format is a rgba8, just read the pixels as is,
       * without any swizzling. Swizzling will be done later.
       */
      rgba8 = lp_build_gather(bld->gallivm,
                              bld->texel_type.length,
                              bld->format_desc->block.bits,
                              fetch_type, TRUE,
                              data_ptr, offset, TRUE);

      rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
   }
   else {
      rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
                                      bld->format_desc,
                                      u8n.type, TRUE,
                                      data_ptr, offset,
                                      x_subcoord, y_subcoord,
                                      bld->cache);
   }

   *colors = rgba8;
}
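/*
 * util_format_is_rgba8_variant() above is what gates the fast path: for the
 * plain four-times-8-bit formats (RGBA/BGRA-style orderings) the texels can
 * be gathered as raw 32-bit words and swizzled later, while every other
 * format goes through the generic lp_build_fetch_rgba_aos() conversion.
 */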
/**
 * Sample a single texture image with nearest sampling.
 * If sampling a cube texture, r = cube face in [0,5].
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                              LLVMValueRef int_size,
                              LLVMValueRef row_stride_vec,
                              LLVMValueRef img_stride_vec,
                              LLVMValueRef data_ptr,
                              LLVMValueRef mipoffsets,
                              LLVMValueRef s,
                              LLVMValueRef t,
                              LLVMValueRef r,
                              const LLVMValueRef *offsets,
                              LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   struct lp_build_context i32;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL;
   LLVMValueRef s_float, t_float = NULL, r_float = NULL;
   LLVMValueRef x_stride;
   LLVMValueRef x_offset, offset;
   LLVMValueRef x_subcoord, y_subcoord, z_subcoord;

   lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));

   lp_build_extract_image_sizes(bld,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                int_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   s_float = s; t_float = t; r_float = r;

   if (bld->static_sampler_state->normalized_coords) {
      LLVMValueRef flt_size;

      flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);

      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
   }

   /* convert float to int */
   /* For correct rounding, need floor, not truncation here.
    * Note that in some cases (clamp to edge, no texel offsets) we
    * could use a non-signed build context which would help archs
    * greatly which don't have arch rounding.
    */
   s_ipart = lp_build_ifloor(&bld->coord_bld, s);
   if (dims >= 2)
      t_ipart = lp_build_ifloor(&bld->coord_bld, t);
   if (dims >= 3)
      r_ipart = lp_build_ifloor(&bld->coord_bld, r);

   /* add texel offsets */
   if (offsets[0]) {
      s_ipart = lp_build_add(&i32, s_ipart, offsets[0]);
      if (dims >= 2 && offsets[1]) {
         t_ipart = lp_build_add(&i32, t_ipart, offsets[1]);
         if (dims >= 3 && offsets[2]) {
            r_ipart = lp_build_add(&i32, r_ipart, offsets[2]);
         }
      }
   }

   /* get pixel, row, image strides */
   x_stride = lp_build_const_vec(bld->gallivm,
                                 bld->int_coord_bld.type,
                                 bld->format_desc->block.bits/8);

   /* Do texcoord wrapping, compute texel offset */
   lp_build_sample_wrap_nearest_int(bld,
                                    bld->format_desc->block.width,
                                    s_ipart, s_float,
                                    width_vec, x_stride, offsets[0],
                                    bld->static_texture_state->pot_width,
                                    bld->static_sampler_state->wrap_s,
                                    &x_offset, &x_subcoord);
   offset = x_offset;
   if (dims >= 2) {
      LLVMValueRef y_offset;
      lp_build_sample_wrap_nearest_int(bld,
                                       bld->format_desc->block.height,
                                       t_ipart, t_float,
                                       height_vec, row_stride_vec, offsets[1],
                                       bld->static_texture_state->pot_height,
                                       bld->static_sampler_state->wrap_t,
                                       &y_offset, &y_subcoord);
      offset = lp_build_add(&bld->int_coord_bld, offset, y_offset);
      if (dims >= 3) {
         LLVMValueRef z_offset;
         lp_build_sample_wrap_nearest_int(bld,
                                          1, /* block length (depth) */
                                          r_ipart, r_float,
                                          depth_vec, img_stride_vec, offsets[2],
                                          bld->static_texture_state->pot_depth,
                                          bld->static_sampler_state->wrap_r,
                                          &z_offset, &z_subcoord);
         offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
      }
   }
   if (has_layer_coord(bld->static_texture_state->target)) {
      LLVMValueRef z_offset;
      /* The r coord is the cube face in [0,5] or array layer */
      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
      offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
   }
   if (mipoffsets) {
      offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
   }

   lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
                                       x_subcoord, y_subcoord,
                                       colors);
}
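/*
 * The nearest path above boils down to one byte offset per pixel: the
 * x offset (texel index times bytes per texel) plus the y offset (row times
 * row stride), plus slice, cube face/array layer and mip offsets where
 * present.  For example, texel (3, 5) of an RGBA8 level with a 256-byte row
 * stride ends up at 3 * 4 + 5 * 256 = 1292 bytes from the level start.
 */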
/**
 * Fetch texels for image with linear sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
                                   LLVMValueRef data_ptr,
                                   LLVMValueRef offset[2][2][2],
                                   LLVMValueRef x_subcoord[2],
                                   LLVMValueRef y_subcoord[2],
                                   LLVMValueRef s_fpart,
                                   LLVMValueRef t_fpart,
                                   LLVMValueRef r_fpart,
                                   LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context u8n;
   LLVMTypeRef u8n_vec_type;
   LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
   LLVMValueRef shuffle;
   LLVMValueRef neighbors[2][2][2]; /* [z][y][x] */
   LLVMValueRef packed;
   unsigned i, j, k;
   unsigned numj, numk;

   lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
   u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);

   /*
    * Transform 4 x i32 in
    *
    *   s_fpart = {s0, s1, s2, s3}
    *
    * where each value is between 0 and 0xff,
    *
    * into a vector of 16 x u8
    *
    *   s_fpart = {s0, s0, s0, s0, s1, s1, s1, s1, s2, s2, s2, s2, s3, s3, s3, s3}
    *
    * and likewise for t_fpart. There is no risk of losing precision here
    * since the fractional parts only use the lower 8 bits.
    */
   s_fpart = LLVMBuildBitCast(builder, s_fpart, u8n_vec_type, "");
   if (dims >= 2)
      t_fpart = LLVMBuildBitCast(builder, t_fpart, u8n_vec_type, "");
   if (dims >= 3)
      r_fpart = LLVMBuildBitCast(builder, r_fpart, u8n_vec_type, "");

   for (j = 0; j < u8n.type.length; j += 4) {
#if UTIL_ARCH_LITTLE_ENDIAN
      unsigned subindex = 0;
#else
      unsigned subindex = 3;
#endif
      LLVMValueRef index;

      index = LLVMConstInt(elem_type, j + subindex, 0);
      for (i = 0; i < 4; ++i)
         shuffles[j + i] = index;
   }

   shuffle = LLVMConstVector(shuffles, u8n.type.length);

   s_fpart = LLVMBuildShuffleVector(builder, s_fpart, u8n.undef,
                                    shuffle, "");
   if (dims >= 2) {
      t_fpart = LLVMBuildShuffleVector(builder, t_fpart, u8n.undef,
                                       shuffle, "");
   }
   if (dims >= 3) {
      r_fpart = LLVMBuildShuffleVector(builder, r_fpart, u8n.undef,
                                       shuffle, "");
   }

   /*
    * Fetch the pixels as 4 x 32bit (rgba order might differ):
    *
    *   rgba0 rgba1 rgba2 rgba3
    *
    * bit cast them into 16 x u8
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
    *
    * unpack them into two 8 x i16:
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1
    *   r2 g2 b2 a2 r3 g3 b3 a3
    *
    * The higher 8 bits of the resulting elements will be zero.
    */
   numj = 1 + (dims >= 2);
   numk = 1 + (dims >= 3);

   for (k = 0; k < numk; k++) {
      for (j = 0; j < numj; j++) {
         for (i = 0; i < 2; i++) {
            LLVMValueRef rgba8;

            if (util_format_is_rgba8_variant(bld->format_desc)) {
               struct lp_type fetch_type;
               /*
                * Given the format is a rgba8, just read the pixels as is,
                * without any swizzling. Swizzling will be done later.
                */
               fetch_type = lp_type_uint(bld->texel_type.width);
               rgba8 = lp_build_gather(bld->gallivm,
                                       bld->texel_type.length,
                                       bld->format_desc->block.bits,
                                       fetch_type, TRUE,
                                       data_ptr, offset[k][j][i], TRUE);

               rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
            }
            else {
               rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
                                               bld->format_desc,
                                               u8n.type, TRUE,
                                               data_ptr, offset[k][j][i],
                                               x_subcoord[i],
                                               y_subcoord[j],
                                               bld->cache);
            }

            neighbors[k][j][i] = rgba8;
         }
      }
   }

   /*
    * Linear interpolation with 8.8 fixed point.
    */
   if (bld->static_sampler_state->force_nearest_s) {
      /* special case 1-D lerp */
      packed = lp_build_lerp(&u8n,
                             t_fpart,
                             neighbors[0][0][0],
                             neighbors[0][1][0],
                             LP_BLD_LERP_PRESCALED_WEIGHTS);
   }
   else if (bld->static_sampler_state->force_nearest_t) {
      /* special case 1-D lerp */
      packed = lp_build_lerp(&u8n,
                             s_fpart,
                             neighbors[0][0][0],
                             neighbors[0][0][1],
                             LP_BLD_LERP_PRESCALED_WEIGHTS);
   }
   else {
      /* general 1/2/3-D lerping */
      if (dims == 1) {
         packed = lp_build_lerp(&u8n,
                                s_fpart,
                                neighbors[0][0][0],
                                neighbors[0][0][1],
                                LP_BLD_LERP_PRESCALED_WEIGHTS);
      } else if (dims == 2) {
         /* 2-D lerp */
         packed = lp_build_lerp_2d(&u8n,
                                   s_fpart, t_fpart,
                                   neighbors[0][0][0],
                                   neighbors[0][0][1],
                                   neighbors[0][1][0],
                                   neighbors[0][1][1],
                                   LP_BLD_LERP_PRESCALED_WEIGHTS);
      } else {
         /* 3-D lerp */
         assert(dims == 3);
         packed = lp_build_lerp_3d(&u8n,
                                   s_fpart, t_fpart, r_fpart,
                                   neighbors[0][0][0],
                                   neighbors[0][0][1],
                                   neighbors[0][1][0],
                                   neighbors[0][1][1],
                                   neighbors[1][0][0],
                                   neighbors[1][0][1],
                                   neighbors[1][1][0],
                                   neighbors[1][1][1],
                                   LP_BLD_LERP_PRESCALED_WEIGHTS);
      }
   }

   *colors = packed;
}
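/*
 * The shuffles above replicate each 32-bit lerp weight across the four
 * bytes of its pixel, picking byte lane j + 0 on little-endian and j + 3 on
 * big-endian hosts (where the low 8 bits of the i32 weight live), so that
 * lp_build_lerp() can blend all four color channels of a pixel with the
 * same 8-bit fixed-point weight.
 */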
/**
 * Sample a single texture image with (bi-)(tri-)linear sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                             LLVMValueRef int_size,
                             LLVMValueRef row_stride_vec,
                             LLVMValueRef img_stride_vec,
                             LLVMValueRef data_ptr,
                             LLVMValueRef mipoffsets,
                             LLVMValueRef s,
                             LLVMValueRef t,
                             LLVMValueRef r,
                             const LLVMValueRef *offsets,
                             LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context i32;
   LLVMValueRef i32_c8, i32_c128, i32_c255;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef s_ipart, s_fpart, s_float;
   LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_float = NULL;
   LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_float = NULL;
   LLVMValueRef x_stride, y_stride, z_stride;
   LLVMValueRef x_offset0, x_offset1;
   LLVMValueRef y_offset0, y_offset1;
   LLVMValueRef z_offset0, z_offset1;
   LLVMValueRef offset[2][2][2]; /* [z][y][x] */
   LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
   unsigned x, y, z;

   lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));

   lp_build_extract_image_sizes(bld,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                int_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   s_float = s; t_float = t; r_float = r;

   if (bld->static_sampler_state->normalized_coords) {
      LLVMValueRef scaled_size;
      LLVMValueRef flt_size;

      /* scale size by 256 (8 fractional bits) */
      scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8);

      flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size);

      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
   }
   else {
      /* scale coords by 256 (8 fractional bits) */
      s = lp_build_mul_imm(&bld->coord_bld, s, 256);
      if (dims >= 2)
         t = lp_build_mul_imm(&bld->coord_bld, t, 256);
      if (dims >= 3)
         r = lp_build_mul_imm(&bld->coord_bld, r, 256);
   }

   /* convert float to int */
   /* For correct rounding, need round to nearest, not truncation here.
    * Note that in some cases (clamp to edge, no texel offsets) we
    * could use a non-signed build context which would help archs which
    * don't have fptosi intrinsic with nearest rounding implemented.
    */
   s = lp_build_iround(&bld->coord_bld, s);
   if (dims >= 2)
      t = lp_build_iround(&bld->coord_bld, t);
   if (dims >= 3)
      r = lp_build_iround(&bld->coord_bld, r);

   /* subtract 0.5 (add -128) */
   i32_c128 = lp_build_const_int_vec(bld->gallivm, i32.type, -128);
   if (!bld->static_sampler_state->force_nearest_s) {
      s = LLVMBuildAdd(builder, s, i32_c128, "");
   }
   if (dims >= 2 && !bld->static_sampler_state->force_nearest_t) {
      t = LLVMBuildAdd(builder, t, i32_c128, "");
   }
   if (dims >= 3) {
      r = LLVMBuildAdd(builder, r, i32_c128, "");
   }

   /* compute floor (shift right 8) */
   i32_c8 = lp_build_const_int_vec(bld->gallivm, i32.type, 8);
   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
   if (dims >= 2)
      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
   if (dims >= 3)
      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");

   /* add texel offsets */
   if (offsets[0]) {
      s_ipart = lp_build_add(&i32, s_ipart, offsets[0]);
      if (dims >= 2 && offsets[1]) {
         t_ipart = lp_build_add(&i32, t_ipart, offsets[1]);
         if (dims >= 3 && offsets[2]) {
            r_ipart = lp_build_add(&i32, r_ipart, offsets[2]);
         }
      }
   }

   /* compute fractional part (AND with 0xff) */
   i32_c255 = lp_build_const_int_vec(bld->gallivm, i32.type, 255);
   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
   if (dims >= 2)
      t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
   if (dims >= 3)
      r_fpart = LLVMBuildAnd(builder, r, i32_c255, "");

   /* get pixel, row and image strides */
   x_stride = lp_build_const_vec(bld->gallivm, bld->int_coord_bld.type,
                                 bld->format_desc->block.bits/8);
   y_stride = row_stride_vec;
   z_stride = img_stride_vec;

   /* do texcoord wrapping and compute texel offsets */
   lp_build_sample_wrap_linear_int(bld,
                                   bld->format_desc->block.width,
                                   s_ipart, &s_fpart, s_float,
                                   width_vec, x_stride, offsets[0],
                                   bld->static_texture_state->pot_width,
                                   bld->static_sampler_state->wrap_s,
                                   &x_offset0, &x_offset1,
                                   &x_subcoord[0], &x_subcoord[1]);

   /* add potential cube/array/mip offsets now as they are constant per pixel */
   if (has_layer_coord(bld->static_texture_state->target)) {
      LLVMValueRef z_offset;
      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
      /* The r coord is the cube face in [0,5] or array layer */
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, z_offset);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, z_offset);
   }
   if (mipoffsets) {
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, mipoffsets);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, mipoffsets);
   }

   for (z = 0; z < 2; z++) {
      for (y = 0; y < 2; y++) {
         offset[z][y][0] = x_offset0;
         offset[z][y][1] = x_offset1;
      }
   }

   if (dims >= 2) {
      lp_build_sample_wrap_linear_int(bld,
                                      bld->format_desc->block.height,
                                      t_ipart, &t_fpart, t_float,
                                      height_vec, y_stride, offsets[1],
                                      bld->static_texture_state->pot_height,
                                      bld->static_sampler_state->wrap_t,
                                      &y_offset0, &y_offset1,
                                      &y_subcoord[0], &y_subcoord[1]);

      for (z = 0; z < 2; z++) {
         for (x = 0; x < 2; x++) {
            offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][0][x], y_offset0);
            offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][1][x], y_offset1);
         }
      }

      if (dims >= 3) {
         lp_build_sample_wrap_linear_int(bld,
                                         1, /* block length (depth) */
                                         r_ipart, &r_fpart, r_float,
                                         depth_vec, z_stride, offsets[2],
                                         bld->static_texture_state->pot_depth,
                                         bld->static_sampler_state->wrap_r,
                                         &z_offset0, &z_offset1,
                                         &z_subcoord[0], &z_subcoord[1]);
         for (y = 0; y < 2; y++) {
            for (x = 0; x < 2; x++) {
               offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
                                              offset[0][y][x], z_offset0);
               offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
                                              offset[1][y][x], z_offset1);
            }
         }
      }
   }

   lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
                                      x_subcoord, y_subcoord,
                                      s_fpart, t_fpart, r_fpart,
                                      colors);
}
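/*
 * The offset[z][y][x] table above is filled incrementally: the two x
 * offsets are written into every slot first, then y_offset0/y_offset1 are
 * added to the y = 0 / y = 1 rows, then z_offset0/z_offset1 to the z = 0 /
 * z = 1 slices.  Each of the (up to) eight entries thus ends up as the sum
 * of an x, y and z offset, i.e. the byte address of one corner texel of the
 * bilinear/trilinear footprint.
 */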
/**
 * Sample the texture/mipmap using given image filter and mip filter.
 * data0_ptr and data1_ptr point to the two mipmap levels to sample
 * from. width0/1_vec, height0/1_vec, depth0/1_vec indicate their sizes.
 * If we're using nearest miplevel sampling the '1' values will be null/unused.
 */
static void
lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                       unsigned img_filter,
                       unsigned mip_filter,
                       LLVMValueRef s,
                       LLVMValueRef t,
                       LLVMValueRef r,
                       const LLVMValueRef *offsets,
                       LLVMValueRef ilevel0,
                       LLVMValueRef ilevel1,
                       LLVMValueRef lod_fpart,
                       LLVMValueRef colors_var)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef size0;
   LLVMValueRef size1;
   LLVMValueRef row_stride0_vec = NULL;
   LLVMValueRef row_stride1_vec = NULL;
   LLVMValueRef img_stride0_vec = NULL;
   LLVMValueRef img_stride1_vec = NULL;
   LLVMValueRef data_ptr0;
   LLVMValueRef data_ptr1;
   LLVMValueRef mipoff0 = NULL;
   LLVMValueRef mipoff1 = NULL;
   LLVMValueRef colors0;
   LLVMValueRef colors1;

   /* sample the first mipmap level */
   lp_build_mipmap_level_sizes(bld, ilevel0,
                               &size0,
                               &row_stride0_vec, &img_stride0_vec);
   if (bld->num_mips == 1) {
      data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
   }
   else {
      /* This path should work for num_lods 1 too but slightly less efficient */
      data_ptr0 = bld->base_ptr;
      mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
   }

   if (img_filter == PIPE_TEX_FILTER_NEAREST) {
      lp_build_sample_image_nearest(bld,
                                    size0,
                                    row_stride0_vec, img_stride0_vec,
                                    data_ptr0, mipoff0, s, t, r, offsets,
                                    &colors0);
   }
   else {
      assert(img_filter == PIPE_TEX_FILTER_LINEAR);
      lp_build_sample_image_linear(bld,
                                   size0,
                                   row_stride0_vec, img_stride0_vec,
                                   data_ptr0, mipoff0, s, t, r, offsets,
                                   &colors0);
   }

   /* Store the first level's colors in the output variables */
   LLVMBuildStore(builder, colors0, colors_var);

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm,
                                                     bld->lodf_bld.type, 256.0);
      LLVMTypeRef i32vec_type = bld->lodi_bld.vec_type;
      struct lp_build_if_state if_ctx;
      LLVMValueRef need_lerp;
      unsigned num_quads = bld->coord_bld.type.length / 4;
      unsigned i;

      lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16vec_scale, "");
      lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16");

      /* need_lerp = lod_fpart > 0 */
      if (bld->num_lods == 1) {
         need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
                                   lod_fpart, bld->lodi_bld.zero,
                                   "need_lerp");
      }
      else {
         /*
          * We'll do mip filtering if any of the quads need it.
          * It might be better to split the vectors here and only fetch/filter
          * quads which need it.
          *
          * We need to clamp lod_fpart here since we can get negative
          * values which would screw up filtering if not all
          * lod_fpart values have same sign.
          * We can however then skip the greater than comparison.
          */
         lod_fpart = lp_build_max(&bld->lodi_bld, lod_fpart,
                                  bld->lodi_bld.zero);
         need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_fpart);
      }

      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
      {
         struct lp_build_context u8n_bld;

         lp_build_context_init(&u8n_bld, bld->gallivm, lp_type_unorm(8, bld->vector_width));

         /* sample the second mipmap level */
         lp_build_mipmap_level_sizes(bld, ilevel1,
                                     &size1,
                                     &row_stride1_vec, &img_stride1_vec);
         if (bld->num_mips == 1) {
            data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
         }
         else {
            data_ptr1 = bld->base_ptr;
            mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
         }

         if (img_filter == PIPE_TEX_FILTER_NEAREST) {
            lp_build_sample_image_nearest(bld,
                                          size1,
                                          row_stride1_vec, img_stride1_vec,
                                          data_ptr1, mipoff1, s, t, r, offsets,
                                          &colors1);
         }
         else {
            lp_build_sample_image_linear(bld,
                                         size1,
                                         row_stride1_vec, img_stride1_vec,
                                         data_ptr1, mipoff1, s, t, r, offsets,
                                         &colors1);
         }

         /* interpolate samples from the two mipmap levels */

         if (num_quads == 1 && bld->num_lods == 1) {
            lod_fpart = LLVMBuildTrunc(builder, lod_fpart, u8n_bld.elem_type, "");
            lod_fpart = lp_build_broadcast_scalar(&u8n_bld, lod_fpart);
         }
         else {
            unsigned num_chans_per_lod = 4 * bld->coord_type.length / bld->num_lods;
            LLVMTypeRef tmp_vec_type = LLVMVectorType(u8n_bld.elem_type, bld->lodi_bld.type.length);
            LLVMValueRef shuffle[LP_MAX_VECTOR_LENGTH];

            /* Take the LSB of lod_fpart */
            lod_fpart = LLVMBuildTrunc(builder, lod_fpart, tmp_vec_type, "");

            /* Broadcast each lod weight into their respective channels */
            for (i = 0; i < u8n_bld.type.length; ++i) {
               shuffle[i] = lp_build_const_int32(bld->gallivm, i / num_chans_per_lod);
            }
            lod_fpart = LLVMBuildShuffleVector(builder, lod_fpart, LLVMGetUndef(tmp_vec_type),
                                               LLVMConstVector(shuffle, u8n_bld.type.length), "");
         }

         colors0 = lp_build_lerp(&u8n_bld, lod_fpart,
                                 colors0, colors1,
                                 LP_BLD_LERP_PRESCALED_WEIGHTS);

         LLVMBuildStore(builder, colors0, colors_var);
      }
      lp_build_endif(&if_ctx);
   }
}
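/*
 * lod_fpart handling above: the fractional mip weight is converted to the
 * same 8-bit fixed-point scale as the texel weights (multiplied by 256.0
 * and truncated to the low byte), so e.g. a fraction of 0.25 becomes the
 * weight 64 that lp_build_lerp() uses to blend the two mip levels.
 */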
/**
 * Texture sampling in AoS format. Used when sampling common 32-bit/texel
 * formats. 1D/2D/3D/cube texture supported. All mipmap sampling modes
 * but only limited texture coord wrap modes.
 */
void
lp_build_sample_aos(struct lp_build_sample_context *bld,
                    unsigned sampler_unit,
                    LLVMValueRef s,
                    LLVMValueRef t,
                    LLVMValueRef r,
                    const LLVMValueRef *offsets,
                    LLVMValueRef lod_positive,
                    LLVMValueRef lod_fpart,
                    LLVMValueRef ilevel0,
                    LLVMValueRef ilevel1,
                    LLVMValueRef texel_out[4])
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
   const unsigned min_filter = bld->static_sampler_state->min_img_filter;
   const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
   const unsigned dims = bld->dims;
   LLVMValueRef packed_var, packed;
   LLVMValueRef unswizzled[4];
   struct lp_build_context u8n_bld;

   /* we only support the common/simple wrap modes at this time */
   assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_s));
   if (dims >= 2)
      assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_t));
   if (dims >= 3)
      assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_r));

   /* make 8-bit unorm builder context */
   lp_build_context_init(&u8n_bld, bld->gallivm, lp_type_unorm(8, bld->vector_width));

   /*
    * Get/interpolate texture colors.
    */

   packed_var = lp_build_alloca(bld->gallivm, u8n_bld.vec_type, "packed_var");

   if (min_filter == mag_filter) {
      /* no need to distinguish between minification and magnification */
      lp_build_sample_mipmap(bld,
                             min_filter, mip_filter,
                             s, t, r, offsets,
                             ilevel0, ilevel1, lod_fpart,
                             packed_var);
   }
   else {
      /* Emit conditional to choose min image filter or mag image filter
       * depending on the lod being > 0 or <= 0, respectively.
       */
      struct lp_build_if_state if_ctx;

      /*
       * FIXME this should take all lods into account, if some are min
       * some max probably could hack up the weights in the linear
       * path with selects to work for nearest.
       */
      if (bld->num_lods > 1)
         lod_positive = LLVMBuildExtractElement(builder, lod_positive,
                                                lp_build_const_int32(bld->gallivm, 0), "");

      lod_positive = LLVMBuildTrunc(builder, lod_positive,
                                    LLVMInt1TypeInContext(bld->gallivm->context), "");

      lp_build_if(&if_ctx, bld->gallivm, lod_positive);
      {
         /* Use the minification filter */
         lp_build_sample_mipmap(bld,
                                min_filter, mip_filter,
                                s, t, r, offsets,
                                ilevel0, ilevel1, lod_fpart,
                                packed_var);
      }
      lp_build_else(&if_ctx);
      {
         /* Use the magnification filter */
         lp_build_sample_mipmap(bld,
                                mag_filter, PIPE_TEX_MIPFILTER_NONE,
                                s, t, r, offsets,
                                ilevel0, NULL, NULL,
                                packed_var);
      }
      lp_build_endif(&if_ctx);
   }

   packed = LLVMBuildLoad(builder, packed_var, "");

   /*
    * Convert to SoA and swizzle.
    */
   lp_build_rgba8_to_fi32_soa(bld->gallivm,
                              bld->texel_type,
                              packed, unswizzled);

   if (util_format_is_rgba8_variant(bld->format_desc)) {
      lp_build_format_swizzle_soa(bld->format_desc,
                                  &bld->texel_bld,
                                  unswizzled, texel_out);
   }
   else {
      texel_out[0] = unswizzled[0];
      texel_out[1] = unswizzled[1];
      texel_out[2] = unswizzled[2];
      texel_out[3] = unswizzled[3];
   }
}