gallivm,draw,llvmpipe: Support wider native registers.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_sample_aos.c
1 /**************************************************************************
2 *
3 * Copyright 2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 /**
29 * @file
30 * Texture sampling -- AoS.
31 *
32 * @author Jose Fonseca <jfonseca@vmware.com>
33 * @author Brian Paul <brianp@vmware.com>
34 */
35
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "util/u_debug.h"
39 #include "util/u_dump.h"
40 #include "util/u_memory.h"
41 #include "util/u_math.h"
42 #include "util/u_format.h"
43 #include "util/u_cpu_detect.h"
44 #include "lp_bld_debug.h"
45 #include "lp_bld_type.h"
46 #include "lp_bld_const.h"
47 #include "lp_bld_conv.h"
48 #include "lp_bld_arit.h"
49 #include "lp_bld_bitarit.h"
50 #include "lp_bld_logic.h"
51 #include "lp_bld_swizzle.h"
52 #include "lp_bld_pack.h"
53 #include "lp_bld_flow.h"
54 #include "lp_bld_gather.h"
55 #include "lp_bld_format.h"
56 #include "lp_bld_init.h"
57 #include "lp_bld_sample.h"
58 #include "lp_bld_sample_aos.h"
59 #include "lp_bld_quad.h"
60
61
62 /**
63 * Build LLVM code for texture coord wrapping, for nearest filtering,
64 * for scaled integer texcoords.
65 * \param block_length is the length of the pixel block along the
66 * coordinate axis
67 * \param coord the incoming texcoord (s,t,r or q) scaled to the texture size
68 * \param length the texture size along one dimension
69 * \param stride pixel stride along the coordinate axis (in bytes)
70 * \param is_pot if TRUE, length is a power of two
71 * \param wrap_mode one of PIPE_TEX_WRAP_x
72 * \param out_offset byte offset for the wrapped coordinate
73 * \param out_i resulting sub-block pixel coordinate for coord0
74 */
75 static void
76 lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
77 unsigned block_length,
78 LLVMValueRef coord,
79 LLVMValueRef coord_f,
80 LLVMValueRef length,
81 LLVMValueRef stride,
82 boolean is_pot,
83 unsigned wrap_mode,
84 LLVMValueRef *out_offset,
85 LLVMValueRef *out_i)
86 {
87 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
88 LLVMBuilderRef builder = bld->gallivm->builder;
89 LLVMValueRef length_minus_one;
90
91 length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
92
93 switch(wrap_mode) {
94 case PIPE_TEX_WRAP_REPEAT:
95 if(is_pot)
96 coord = LLVMBuildAnd(builder, coord, length_minus_one, "");
97 else {
98 struct lp_build_context *coord_bld = &bld->coord_bld;
99 LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
100 coord = lp_build_fract_safe(coord_bld, coord_f);
101 coord = lp_build_mul(coord_bld, coord, length_f);
102 coord = lp_build_itrunc(coord_bld, coord);
103 }
104 break;
105
106 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
107 coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
108 coord = lp_build_min(int_coord_bld, coord, length_minus_one);
109 break;
110
111 case PIPE_TEX_WRAP_CLAMP:
112 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
113 case PIPE_TEX_WRAP_MIRROR_REPEAT:
114 case PIPE_TEX_WRAP_MIRROR_CLAMP:
115 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
116 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
117 default:
118 assert(0);
119 }
120
121 lp_build_sample_partial_offset(int_coord_bld, block_length, coord, stride,
122 out_offset, out_i);
123 }
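/*
 * For reference, a rough scalar equivalent of the wrapping built above
 * (illustrative only, coord in texel units):
 *
 *    PIPE_TEX_WRAP_REPEAT (pot):   coord &= length - 1;
 *    PIPE_TEX_WRAP_REPEAT (npot):  coord = ifloor(fract(coord_f) * length);
 *    PIPE_TEX_WRAP_CLAMP_TO_EDGE:  coord = CLAMP(coord, 0, length - 1);
 */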
124
125
126 /**
127 * Build LLVM code for texture coord wrapping, for nearest filtering,
128 * for float texcoords.
129 * \param coord the incoming texcoord (s,t,r or q)
130 * \param length the texture size along one dimension
131 * \param is_pot if TRUE, length is a power of two
132 * \param wrap_mode one of PIPE_TEX_WRAP_x
133 * \param icoord the texcoord after wrapping, as int
134 */
135 static void
136 lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld,
137 LLVMValueRef coord,
138 LLVMValueRef length,
139 boolean is_pot,
140 unsigned wrap_mode,
141 LLVMValueRef *icoord)
142 {
143 struct lp_build_context *coord_bld = &bld->coord_bld;
144 LLVMValueRef length_minus_one;
145
146 switch(wrap_mode) {
147 case PIPE_TEX_WRAP_REPEAT:
148 /* take fraction, unnormalize */
149 coord = lp_build_fract_safe(coord_bld, coord);
150 coord = lp_build_mul(coord_bld, coord, length);
151 *icoord = lp_build_itrunc(coord_bld, coord);
152 break;
153 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
154 length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
155 if (bld->static_state->normalized_coords) {
156 /* scale coord to length */
157 coord = lp_build_mul(coord_bld, coord, length);
158 }
159 coord = lp_build_clamp(coord_bld, coord, coord_bld->zero,
160 length_minus_one);
161 *icoord = lp_build_itrunc(coord_bld, coord);
162 break;
163
164 case PIPE_TEX_WRAP_CLAMP:
165 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
166 case PIPE_TEX_WRAP_MIRROR_REPEAT:
167 case PIPE_TEX_WRAP_MIRROR_CLAMP:
168 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
169 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
170 default:
171 assert(0);
172 }
173 }
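/*
 * E.g. for PIPE_TEX_WRAP_REPEAT with a normalized coord of 1.25 and
 * length 8, the code above yields fract(1.25) * 8 = 2.0, i.e. icoord 2.
 * (fract_safe guarantees the fraction stays below 1.0, so the resulting
 * texel index stays below length.)
 */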
174
175
176 /**
177 * Build LLVM code for texture coord wrapping, for linear filtering,
178 * for scaled integer texcoords.
179 * \param block_length is the length of the pixel block along the
180 * coordinate axis
181 * \param coord0 the incoming texcoord (s,t,r or q) scaled to the texture size
182 * \param length the texture size along one dimension
183 * \param stride pixel stride along the coordinate axis (in bytes)
184 * \param is_pot if TRUE, length is a power of two
185 * \param wrap_mode one of PIPE_TEX_WRAP_x
186 * \param offset0 resulting relative offset for coord0
187 * \param offset1 resulting relative offset for coord0 + 1
188 * \param i0 resulting sub-block pixel coordinate for coord0
189 * \param i1 resulting sub-block pixel coordinate for coord0 + 1
190 */
191 static void
192 lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
193 unsigned block_length,
194 LLVMValueRef coord0,
195 LLVMValueRef *weight_i,
196 LLVMValueRef coord_f,
197 LLVMValueRef length,
198 LLVMValueRef stride,
199 boolean is_pot,
200 unsigned wrap_mode,
201 LLVMValueRef *offset0,
202 LLVMValueRef *offset1,
203 LLVMValueRef *i0,
204 LLVMValueRef *i1)
205 {
206 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
207 LLVMBuilderRef builder = bld->gallivm->builder;
208 LLVMValueRef length_minus_one;
209 LLVMValueRef lmask, umask, mask;
210
211 /*
212 * If the pixel block covers more than one pixel then there is no easy
213 * way to calculate offset1 relative to offset0. Instead, compute them
214 * independently. Otherwise, try to compute offset0 and offset1 with
215 * a single stride multiplication.
216 */
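/*
 * E.g. for PIPE_TEX_WRAP_REPEAT on an 8 texel pot row in the
 * single-multiplication path below: with coord0 == 7 the NOTEQUAL mask
 * is all zeros, so offset1 = (offset0 + stride) & 0 = 0 and the right
 * neighbour wraps around to texel 0 (illustrative walk-through).
 */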
217
218 length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
219
220 if (block_length != 1) {
221 LLVMValueRef coord1;
222 switch(wrap_mode) {
223 case PIPE_TEX_WRAP_REPEAT:
224 if (is_pot) {
225 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
226 coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
227 coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
228 }
229 else {
230 LLVMValueRef mask;
231 LLVMValueRef weight;
232 LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
233 lp_build_coord_repeat_npot_linear(bld, coord_f,
234 length, length_f,
235 &coord0, &weight);
236 mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
237 PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
238 coord1 = LLVMBuildAnd(builder,
239 lp_build_add(int_coord_bld, coord0,
240 int_coord_bld->one),
241 mask, "");
242 weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
243 *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
244 }
245 break;
246
247 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
248 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
249 coord0 = lp_build_clamp(int_coord_bld, coord0, int_coord_bld->zero,
250 length_minus_one);
251 coord1 = lp_build_clamp(int_coord_bld, coord1, int_coord_bld->zero,
252 length_minus_one);
253 break;
254
255 case PIPE_TEX_WRAP_CLAMP:
256 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
257 case PIPE_TEX_WRAP_MIRROR_REPEAT:
258 case PIPE_TEX_WRAP_MIRROR_CLAMP:
259 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
260 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
261 default:
262 assert(0);
263 coord0 = int_coord_bld->zero;
264 coord1 = int_coord_bld->zero;
265 break;
266 }
267 lp_build_sample_partial_offset(int_coord_bld, block_length, coord0, stride,
268 offset0, i0);
269 lp_build_sample_partial_offset(int_coord_bld, block_length, coord1, stride,
270 offset1, i1);
271 return;
272 }
273
274 *i0 = int_coord_bld->zero;
275 *i1 = int_coord_bld->zero;
276
277 switch(wrap_mode) {
278 case PIPE_TEX_WRAP_REPEAT:
279 if (is_pot) {
280 coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
281 }
282 else {
283 LLVMValueRef weight;
284 LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
285 lp_build_coord_repeat_npot_linear(bld, coord_f,
286 length, length_f,
287 &coord0, &weight);
288 weight = lp_build_mul_imm(&bld->coord_bld, weight, 256);
289 *weight_i = lp_build_itrunc(&bld->coord_bld, weight);
290 }
291
292 mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
293 PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
294
295 *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
296 *offset1 = LLVMBuildAnd(builder,
297 lp_build_add(int_coord_bld, *offset0, stride),
298 mask, "");
299 break;
300
301 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
302 /* XXX this might be slower than the separate path
303 * on some newer cpus. With sse41 this is 8 instructions vs. 7
304 * - at least on SNB this is almost certainly slower since
305 * min/max are cheaper than selects, and the muls aren't bad.
306 */
307 lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
308 PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
309 umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
310 PIPE_FUNC_LESS, coord0, length_minus_one);
311
312 coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
313 coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);
314
315 mask = LLVMBuildAnd(builder, lmask, umask, "");
316
317 *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
318 *offset1 = lp_build_add(int_coord_bld,
319 *offset0,
320 LLVMBuildAnd(builder, stride, mask, ""));
321 break;
322
323 case PIPE_TEX_WRAP_CLAMP:
324 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
325 case PIPE_TEX_WRAP_MIRROR_REPEAT:
326 case PIPE_TEX_WRAP_MIRROR_CLAMP:
327 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
328 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
329 default:
330 assert(0);
331 *offset0 = int_coord_bld->zero;
332 *offset1 = int_coord_bld->zero;
333 break;
334 }
335 }
336
337
338 /**
339 * Build LLVM code for texture coord wrapping, for linear filtering,
340 * for float texcoords.
341 * \param block_length is the length of the pixel block along the
342 * coordinate axis
343 * \param coord the incoming texcoord (s,t,r or q)
344 * \param length the texture size along one dimension
345 * \param is_pot if TRUE, length is a power of two
346 * \param wrap_mode one of PIPE_TEX_WRAP_x
347 * \param coord0 the first texcoord after wrapping, as int
348 * \param coord1 the second texcoord after wrapping, as int
349 * \param weight the filter weight as int (0-255)
350 * \param force_nearest if this coord actually uses nearest filtering
351 */
352 static void
353 lp_build_sample_wrap_linear_float(struct lp_build_sample_context *bld,
354 unsigned block_length,
355 LLVMValueRef coord,
356 LLVMValueRef length,
357 boolean is_pot,
358 unsigned wrap_mode,
359 LLVMValueRef *coord0,
360 LLVMValueRef *coord1,
361 LLVMValueRef *weight,
362 unsigned force_nearest)
363 {
364 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
365 struct lp_build_context *coord_bld = &bld->coord_bld;
366 LLVMBuilderRef builder = bld->gallivm->builder;
367 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
368 LLVMValueRef length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
369
370 switch(wrap_mode) {
371 case PIPE_TEX_WRAP_REPEAT:
372 if (is_pot) {
373 /* mul by size and subtract 0.5 */
374 coord = lp_build_mul(coord_bld, coord, length);
375 if (!force_nearest)
376 coord = lp_build_sub(coord_bld, coord, half);
377 *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
378 /* convert to int, compute lerp weight */
379 lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
380 *coord1 = lp_build_ifloor(coord_bld, *coord1);
381 /* repeat wrap */
382 length_minus_one = lp_build_itrunc(coord_bld, length_minus_one);
383 *coord0 = LLVMBuildAnd(builder, *coord0, length_minus_one, "");
384 *coord1 = LLVMBuildAnd(builder, *coord1, length_minus_one, "");
385 }
386 else {
387 LLVMValueRef mask;
388 /* wrap with normalized floats is just fract */
389 coord = lp_build_fract(coord_bld, coord);
390 /* unnormalize */
391 coord = lp_build_mul(coord_bld, coord, length);
392 /*
393 * we avoided the 0.5/length division, have to fix up wrong
394 * edge cases with selects
395 */
396 *coord1 = lp_build_add(coord_bld, coord, half);
397 coord = lp_build_sub(coord_bld, coord, half);
398 *weight = lp_build_fract(coord_bld, coord);
399 mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
400 PIPE_FUNC_LESS, coord, coord_bld->zero);
401 *coord0 = lp_build_select(coord_bld, mask, length_minus_one, coord);
402 *coord0 = lp_build_itrunc(coord_bld, *coord0);
403 mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
404 PIPE_FUNC_LESS, *coord1, length);
405 *coord1 = lp_build_select(coord_bld, mask, *coord1, coord_bld->zero);
406 *coord1 = lp_build_itrunc(coord_bld, *coord1);
407 }
408 break;
409 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
410 if (bld->static_state->normalized_coords) {
411 /* mul by tex size */
412 coord = lp_build_mul(coord_bld, coord, length);
413 }
414 /* subtract 0.5 */
415 if (!force_nearest) {
416 coord = lp_build_sub(coord_bld, coord, half);
417 }
418 /* clamp to [0, length - 1] */
419 coord = lp_build_min(coord_bld, coord, length_minus_one);
420 coord = lp_build_max(coord_bld, coord, coord_bld->zero);
421 *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
422 /* convert to int, compute lerp weight */
423 lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
424 /* coord1 = min(coord1, length-1) */
425 *coord1 = lp_build_min(coord_bld, *coord1, length_minus_one);
426 *coord1 = lp_build_itrunc(coord_bld, *coord1);
427 break;
428 default:
429 assert(0);
430 *coord0 = int_coord_bld->zero;
431 *coord1 = int_coord_bld->zero;
432 *weight = coord_bld->zero;
433 break;
434 }
435 *weight = lp_build_mul_imm(coord_bld, *weight, 256);
436 *weight = lp_build_itrunc(coord_bld, *weight);
437 return;
438 }
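/*
 * Rough scalar equivalent of the CLAMP_TO_EDGE case above (illustrative
 * only, normalized coords, linear filtering):
 *
 *    u = coord * length - 0.5f;
 *    u = CLAMP(u, 0.0f, length - 1.0f);
 *    coord0 = ifloor(u);
 *    weight = (int)((u - coord0) * 256.0f);     (8 bit lerp weight)
 *    coord1 = MIN2(coord0 + 1, (int)length - 1);
 */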
439
440
441 /**
442 * Fetch texels for image with nearest sampling.
443 * Return filtered color as two vectors of 16-bit fixed point values.
444 */
445 static void
446 lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
447 LLVMValueRef data_ptr,
448 LLVMValueRef offset,
449 LLVMValueRef x_subcoord,
450 LLVMValueRef y_subcoord,
451 LLVMValueRef *colors_lo,
452 LLVMValueRef *colors_hi)
453 {
454 /*
455 * Fetch the pixels as 4 x 32bit (rgba order might differ):
456 *
457 * rgba0 rgba1 rgba2 rgba3
458 *
459 * bit cast them into 16 x u8
460 *
461 * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
462 *
463 * unpack them into two 8 x i16:
464 *
465 * r0 g0 b0 a0 r1 g1 b1 a1
466 * r2 g2 b2 a2 r3 g3 b3 a3
467 *
468 * The higher 8 bits of the resulting elements will be zero.
469 */
470 LLVMBuilderRef builder = bld->gallivm->builder;
471 LLVMValueRef rgba8;
472 struct lp_build_context h16, u8n;
473 LLVMTypeRef u8n_vec_type;
474
475 lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
476 lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
477 u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
478
479 if (util_format_is_rgba8_variant(bld->format_desc)) {
480 /*
481 * Given the format is a rgba8, just read the pixels as is,
482 * without any swizzling. Swizzling will be done later.
483 */
484 rgba8 = lp_build_gather(bld->gallivm,
485 bld->texel_type.length,
486 bld->format_desc->block.bits,
487 bld->texel_type.width,
488 data_ptr, offset);
489
490 rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
491 }
492 else {
493 rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
494 bld->format_desc,
495 u8n.type,
496 data_ptr, offset,
497 x_subcoord,
498 y_subcoord);
499 }
500
501 /* Expand one 4*rgba8 to two 2*rgba16 */
502 lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
503 rgba8,
504 colors_lo, colors_hi);
505 }
506
507
508 /**
509 * Sample a single texture image with nearest sampling.
510 * If sampling a cube texture, r = cube face in [0,5].
511 * Return filtered color as two vectors of 16-bit fixed point values.
512 */
513 static void
514 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
515 LLVMValueRef int_size,
516 LLVMValueRef row_stride_vec,
517 LLVMValueRef img_stride_vec,
518 LLVMValueRef data_ptr,
519 LLVMValueRef s,
520 LLVMValueRef t,
521 LLVMValueRef r,
522 LLVMValueRef *colors_lo,
523 LLVMValueRef *colors_hi)
524 {
525 const unsigned dims = bld->dims;
526 LLVMBuilderRef builder = bld->gallivm->builder;
527 struct lp_build_context i32;
528 LLVMTypeRef i32_vec_type;
529 LLVMValueRef i32_c8;
530 LLVMValueRef width_vec, height_vec, depth_vec;
531 LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL;
532 LLVMValueRef s_float, t_float = NULL, r_float = NULL;
533 LLVMValueRef x_stride;
534 LLVMValueRef x_offset, offset;
535 LLVMValueRef x_subcoord, y_subcoord, z_subcoord;
536
537 lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));
538
539 i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);
540
541 lp_build_extract_image_sizes(bld,
542 bld->int_size_type,
543 bld->int_coord_type,
544 int_size,
545 &width_vec,
546 &height_vec,
547 &depth_vec);
548
549 s_float = s; t_float = t; r_float = r;
550
551 if (bld->static_state->normalized_coords) {
552 LLVMValueRef scaled_size;
553 LLVMValueRef flt_size;
554
555 /* scale size by 256 (8 fractional bits) */
556 scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8);
557
558 flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size);
559
560 lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
561 }
562 else {
563 /* scale coords by 256 (8 fractional bits) */
564 s = lp_build_mul_imm(&bld->coord_bld, s, 256);
565 if (dims >= 2)
566 t = lp_build_mul_imm(&bld->coord_bld, t, 256);
567 if (dims >= 3)
568 r = lp_build_mul_imm(&bld->coord_bld, r, 256);
569 }
570
571 /* convert float to int */
572 s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
573 if (dims >= 2)
574 t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
575 if (dims >= 3)
576 r = LLVMBuildFPToSI(builder, r, i32_vec_type, "");
577
578 /* compute floor (shift right 8) */
579 i32_c8 = lp_build_const_int_vec(bld->gallivm, i32.type, 8);
580 s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
581 if (dims >= 2)
582 t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
583 if (dims >= 3)
584 r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");
585
586 /* get pixel, row, image strides */
587 x_stride = lp_build_const_vec(bld->gallivm,
588 bld->int_coord_bld.type,
589 bld->format_desc->block.bits/8);
590
591 /* Do texcoord wrapping, compute texel offset */
592 lp_build_sample_wrap_nearest_int(bld,
593 bld->format_desc->block.width,
594 s_ipart, s_float,
595 width_vec, x_stride,
596 bld->static_state->pot_width,
597 bld->static_state->wrap_s,
598 &x_offset, &x_subcoord);
599 offset = x_offset;
600 if (dims >= 2) {
601 LLVMValueRef y_offset;
602 lp_build_sample_wrap_nearest_int(bld,
603 bld->format_desc->block.height,
604 t_ipart, t_float,
605 height_vec, row_stride_vec,
606 bld->static_state->pot_height,
607 bld->static_state->wrap_t,
608 &y_offset, &y_subcoord);
609 offset = lp_build_add(&bld->int_coord_bld, offset, y_offset);
610 if (dims >= 3) {
611 LLVMValueRef z_offset;
612 lp_build_sample_wrap_nearest_int(bld,
613 1, /* block length (depth) */
614 r_ipart, r_float,
615 depth_vec, img_stride_vec,
616 bld->static_state->pot_depth,
617 bld->static_state->wrap_r,
618 &z_offset, &z_subcoord);
619 offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
620 }
621 else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
622 LLVMValueRef z_offset;
623 /* The r coord is the cube face in [0,5] */
624 z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
625 offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
626 }
627 }
628
629 lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
630 x_subcoord, y_subcoord,
631 colors_lo, colors_hi);
632 }
633
634
635 /**
636 * Sample a single texture image with nearest sampling.
637 * If sampling a cube texture, r = cube face in [0,5].
638 * Return filtered color as two vectors of 16-bit fixed point values.
639 * Does address calcs (except offsets) with floats.
640 * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
641 */
642 static void
643 lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld,
644 LLVMValueRef int_size,
645 LLVMValueRef row_stride_vec,
646 LLVMValueRef img_stride_vec,
647 LLVMValueRef data_ptr,
648 LLVMValueRef s,
649 LLVMValueRef t,
650 LLVMValueRef r,
651 LLVMValueRef *colors_lo,
652 LLVMValueRef *colors_hi)
653 {
654 const unsigned dims = bld->dims;
655 LLVMValueRef width_vec, height_vec, depth_vec;
656 LLVMValueRef offset;
657 LLVMValueRef x_subcoord, y_subcoord;
658 LLVMValueRef x_icoord, y_icoord, z_icoord;
659 LLVMValueRef flt_size;
660
661 flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
662
663 lp_build_extract_image_sizes(bld,
664 bld->float_size_type,
665 bld->coord_type,
666 flt_size,
667 &width_vec,
668 &height_vec,
669 &depth_vec);
670
671 /* Do texcoord wrapping */
672 lp_build_sample_wrap_nearest_float(bld,
673 s, width_vec,
674 bld->static_state->pot_width,
675 bld->static_state->wrap_s,
676 &x_icoord);
677
678 if (dims >= 2) {
679 lp_build_sample_wrap_nearest_float(bld,
680 t, height_vec,
681 bld->static_state->pot_height,
682 bld->static_state->wrap_t,
683 &y_icoord);
684
685 if (dims >= 3) {
686 lp_build_sample_wrap_nearest_float(bld,
687 r, depth_vec,
688 bld->static_state->pot_depth,
689 bld->static_state->wrap_r,
690 &z_icoord);
691 }
692 else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
693 z_icoord = r;
694 }
695 }
696
697 /*
698 * From here on we deal with ints, and we should split up the 256bit
699 * vectors manually for better generated code.
700 */
701
702 /*
703 * compute texel offsets -
704 * cannot do offset calc with floats, difficult for block-based formats,
705 * and not enough precision anyway.
706 */
707 lp_build_sample_offset(&bld->int_coord_bld,
708 bld->format_desc,
709 x_icoord, y_icoord,
710 z_icoord,
711 row_stride_vec, img_stride_vec,
712 &offset,
713 &x_subcoord, &y_subcoord);
714
715 lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
716 x_subcoord, y_subcoord,
717 colors_lo, colors_hi);
718 }
719
720
721 /**
722 * Fetch texels for image with linear sampling.
723 * Return filtered color as two vectors of 16-bit fixed point values.
724 */
725 static void
726 lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
727 LLVMValueRef data_ptr,
728 LLVMValueRef offset[2][2][2],
729 LLVMValueRef x_subcoord[2],
730 LLVMValueRef y_subcoord[2],
731 LLVMValueRef s_fpart,
732 LLVMValueRef t_fpart,
733 LLVMValueRef r_fpart,
734 LLVMValueRef *colors_lo,
735 LLVMValueRef *colors_hi)
736 {
737 const unsigned dims = bld->dims;
738 LLVMBuilderRef builder = bld->gallivm->builder;
739 struct lp_build_context h16, u8n;
740 LLVMTypeRef h16_vec_type, u8n_vec_type;
741 LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
742 LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
743 LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
744 LLVMValueRef shuffle_lo, shuffle_hi;
745 LLVMValueRef s_fpart_lo, s_fpart_hi;
746 LLVMValueRef t_fpart_lo = NULL, t_fpart_hi = NULL;
747 LLVMValueRef r_fpart_lo = NULL, r_fpart_hi = NULL;
748 LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
749 LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
750 LLVMValueRef packed_lo, packed_hi;
751 unsigned i, j, k;
752 unsigned numj, numk;
753
754 lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
755 lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
756 h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type);
757 u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
758
759 /*
760 * Transform 4 x i32 in
761 *
762 * s_fpart = {s0, s1, s2, s3}
763 *
764 * into 8 x i16
765 *
766 * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
767 *
768 * into two 8 x i16
769 *
770 * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
771 * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
772 *
773 * and likewise for t_fpart. There is no risk of losing precision here
774 * since the fractional parts only use the lower 8bits.
775 */
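/*
 * Concretely, in the 128-bit case (4 texels, h16.type.length == 8,
 * assuming PIPE_ARCH_LITTLE_ENDIAN) the loop below builds
 *
 *    shuffle_lo = {0, 0, 0, 0, 2, 2, 2, 2}
 *    shuffle_hi = {4, 4, 4, 4, 6, 6, 6, 6}
 *
 * i.e. it broadcasts the low i16 half of each i32 lane four times.
 */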
776 s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
777 if (dims >= 2)
778 t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
779 if (dims >= 3)
780 r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");
781
782 for (j = 0; j < h16.type.length; j += 4) {
783 #ifdef PIPE_ARCH_LITTLE_ENDIAN
784 unsigned subindex = 0;
785 #else
786 unsigned subindex = 1;
787 #endif
788 LLVMValueRef index;
789
790 index = LLVMConstInt(elem_type, j/2 + subindex, 0);
791 for (i = 0; i < 4; ++i)
792 shuffles_lo[j + i] = index;
793
794 index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
795 for (i = 0; i < 4; ++i)
796 shuffles_hi[j + i] = index;
797 }
798
799 shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
800 shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
801
802 s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
803 shuffle_lo, "");
804 s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
805 shuffle_hi, "");
806 if (dims >= 2) {
807 t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
808 shuffle_lo, "");
809 t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
810 shuffle_hi, "");
811 }
812 if (dims >= 3) {
813 r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
814 shuffle_lo, "");
815 r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
816 shuffle_hi, "");
817 }
818
819 /*
820 * Fetch the pixels as 4 x 32bit (rgba order might differ):
821 *
822 * rgba0 rgba1 rgba2 rgba3
823 *
824 * bit cast them into 16 x u8
825 *
826 * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
827 *
828 * unpack them into two 8 x i16:
829 *
830 * r0 g0 b0 a0 r1 g1 b1 a1
831 * r2 g2 b2 a2 r3 g3 b3 a3
832 *
833 * The higher 8 bits of the resulting elements will be zero.
834 */
835 numj = 1 + (dims >= 2);
836 numk = 1 + (dims >= 3);
837
838 for (k = 0; k < numk; k++) {
839 for (j = 0; j < numj; j++) {
840 for (i = 0; i < 2; i++) {
841 LLVMValueRef rgba8;
842
843 if (util_format_is_rgba8_variant(bld->format_desc)) {
844 /*
845 * Given the format is a rgba8, just read the pixels as is,
846 * without any swizzling. Swizzling will be done later.
847 */
848 rgba8 = lp_build_gather(bld->gallivm,
849 bld->texel_type.length,
850 bld->format_desc->block.bits,
851 bld->texel_type.width,
852 data_ptr, offset[k][j][i]);
853
854 rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
855 }
856 else {
857 rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
858 bld->format_desc,
859 u8n.type,
860 data_ptr, offset[k][j][i],
861 x_subcoord[i],
862 y_subcoord[j]);
863 }
864
865 /* Expand one 4*rgba8 to two 2*rgba16 */
866 lp_build_unpack2(bld->gallivm, u8n.type, h16.type,
867 rgba8,
868 &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
869 }
870 }
871 }
872
873 /*
874 * Linear interpolation with 8.8 fixed point.
875 */
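/*
 * Each lerp below roughly computes, per 16-bit lane,
 *
 *    result = v0 + ((weight * (v1 - v0)) >> 8)
 *
 * with the weights being the 8-bit fractional coords computed earlier.
 */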
876 if (bld->static_state->force_nearest_s) {
877 /* special case 1-D lerp */
878 packed_lo = lp_build_lerp(&h16,
879 t_fpart_lo,
880 neighbors_lo[0][0][0],
881 neighbors_lo[0][1][0]);
882
883 packed_hi = lp_build_lerp(&h16,
884 t_fpart_hi,
885 neighbors_hi[0][0][0],
886 neighbors_hi[0][1][0]);
887 }
888 else if (bld->static_state->force_nearest_t) {
889 /* special case 1-D lerp */
890 packed_lo = lp_build_lerp(&h16,
891 s_fpart_lo,
892 neighbors_lo[0][0][0],
893 neighbors_lo[0][0][1]);
894
895 packed_hi = lp_build_lerp(&h16,
896 s_fpart_hi,
897 neighbors_hi[0][0][0],
898 neighbors_hi[0][0][1]);
899 }
900 else {
901 /* general 1/2/3-D lerping */
902 if (dims == 1) {
903 packed_lo = lp_build_lerp(&h16,
904 s_fpart_lo,
905 neighbors_lo[0][0][0],
906 neighbors_lo[0][0][1]);
907
908 packed_hi = lp_build_lerp(&h16,
909 s_fpart_hi,
910 neighbors_hi[0][0][0],
911 neighbors_hi[0][0][1]);
912 }
913 else {
914 /* 2-D lerp */
915 packed_lo = lp_build_lerp_2d(&h16,
916 s_fpart_lo, t_fpart_lo,
917 neighbors_lo[0][0][0],
918 neighbors_lo[0][0][1],
919 neighbors_lo[0][1][0],
920 neighbors_lo[0][1][1]);
921
922 packed_hi = lp_build_lerp_2d(&h16,
923 s_fpart_hi, t_fpart_hi,
924 neighbors_hi[0][0][0],
925 neighbors_hi[0][0][1],
926 neighbors_hi[0][1][0],
927 neighbors_hi[0][1][1]);
928
929 if (dims >= 3) {
930 LLVMValueRef packed_lo2, packed_hi2;
931
932 /* lerp in the second z slice */
933 packed_lo2 = lp_build_lerp_2d(&h16,
934 s_fpart_lo, t_fpart_lo,
935 neighbors_lo[1][0][0],
936 neighbors_lo[1][0][1],
937 neighbors_lo[1][1][0],
938 neighbors_lo[1][1][1]);
939
940 packed_hi2 = lp_build_lerp_2d(&h16,
941 s_fpart_hi, t_fpart_hi,
942 neighbors_hi[1][0][0],
943 neighbors_hi[1][0][1],
944 neighbors_hi[1][1][0],
945 neighbors_hi[1][1][1]);
946 /* interp between two z slices */
947 packed_lo = lp_build_lerp(&h16, r_fpart_lo,
948 packed_lo, packed_lo2);
949 packed_hi = lp_build_lerp(&h16, r_fpart_hi,
950 packed_hi, packed_hi2);
951 }
952 }
953 }
954
955 *colors_lo = packed_lo;
956 *colors_hi = packed_hi;
957 }
958
959 /**
960 * Sample a single texture image with (bi-)(tri-)linear sampling.
961 * Return filtered color as two vectors of 16-bit fixed point values.
962 */
963 static void
964 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
965 LLVMValueRef int_size,
966 LLVMValueRef row_stride_vec,
967 LLVMValueRef img_stride_vec,
968 LLVMValueRef data_ptr,
969 LLVMValueRef s,
970 LLVMValueRef t,
971 LLVMValueRef r,
972 LLVMValueRef *colors_lo,
973 LLVMValueRef *colors_hi)
974 {
975 const unsigned dims = bld->dims;
976 LLVMBuilderRef builder = bld->gallivm->builder;
977 struct lp_build_context i32;
978 LLVMTypeRef i32_vec_type;
979 LLVMValueRef i32_c8, i32_c128, i32_c255;
980 LLVMValueRef width_vec, height_vec, depth_vec;
981 LLVMValueRef s_ipart, s_fpart, s_float;
982 LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_float = NULL;
983 LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_float = NULL;
984 LLVMValueRef x_stride, y_stride, z_stride;
985 LLVMValueRef x_offset0, x_offset1;
986 LLVMValueRef y_offset0, y_offset1;
987 LLVMValueRef z_offset0, z_offset1;
988 LLVMValueRef offset[2][2][2]; /* [z][y][x] */
989 LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
990 unsigned x, y, z;
991
992 lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));
993
994 i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);
995
996 lp_build_extract_image_sizes(bld,
997 bld->int_size_type,
998 bld->int_coord_type,
999 int_size,
1000 &width_vec,
1001 &height_vec,
1002 &depth_vec);
1003
1004 s_float = s; t_float = t; r_float = r;
1005
1006 if (bld->static_state->normalized_coords) {
1007 LLVMValueRef scaled_size;
1008 LLVMValueRef flt_size;
1009
1010 /* scale size by 256 (8 fractional bits) */
1011 scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8);
1012
1013 flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size);
1014
1015 lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
1016 }
1017 else {
1018 /* scale coords by 256 (8 fractional bits) */
1019 s = lp_build_mul_imm(&bld->coord_bld, s, 256);
1020 if (dims >= 2)
1021 t = lp_build_mul_imm(&bld->coord_bld, t, 256);
1022 if (dims >= 3)
1023 r = lp_build_mul_imm(&bld->coord_bld, r, 256);
1024 }
1025
1026 /* convert float to int */
1027 s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
1028 if (dims >= 2)
1029 t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
1030 if (dims >= 3)
1031 r = LLVMBuildFPToSI(builder, r, i32_vec_type, "");
1032
1033 /* subtract 0.5 (add -128) */
1034 i32_c128 = lp_build_const_int_vec(bld->gallivm, i32.type, -128);
1035 if (!bld->static_state->force_nearest_s) {
1036 s = LLVMBuildAdd(builder, s, i32_c128, "");
1037 }
1038 if (dims >= 2 && !bld->static_state->force_nearest_t) {
1039 t = LLVMBuildAdd(builder, t, i32_c128, "");
1040 }
1041 if (dims >= 3) {
1042 r = LLVMBuildAdd(builder, r, i32_c128, "");
1043 }
1044
1045 /* compute floor (shift right 8) */
1046 i32_c8 = lp_build_const_int_vec(bld->gallivm, i32.type, 8);
1047 s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
1048 if (dims >= 2)
1049 t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
1050 if (dims >= 3)
1051 r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");
1052
1053 /* compute fractional part (AND with 0xff) */
1054 i32_c255 = lp_build_const_int_vec(bld->gallivm, i32.type, 255);
1055 s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
1056 if (dims >= 2)
1057 t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
1058 if (dims >= 3)
1059 r_fpart = LLVMBuildAnd(builder, r, i32_c255, "");
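/* e.g. a coord of 1.5 in 8.8 fixed point is 384:
 * ipart = 384 >> 8 = 1, fpart = 384 & 0xff = 128 (i.e. 0.5)
 */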
1060
1061 /* get pixel, row and image strides */
1062 x_stride = lp_build_const_vec(bld->gallivm, bld->int_coord_bld.type,
1063 bld->format_desc->block.bits/8);
1064 y_stride = row_stride_vec;
1065 z_stride = img_stride_vec;
1066
1067 /* do texcoord wrapping and compute texel offsets */
1068 lp_build_sample_wrap_linear_int(bld,
1069 bld->format_desc->block.width,
1070 s_ipart, &s_fpart, s_float,
1071 width_vec, x_stride,
1072 bld->static_state->pot_width,
1073 bld->static_state->wrap_s,
1074 &x_offset0, &x_offset1,
1075 &x_subcoord[0], &x_subcoord[1]);
1076 for (z = 0; z < 2; z++) {
1077 for (y = 0; y < 2; y++) {
1078 offset[z][y][0] = x_offset0;
1079 offset[z][y][1] = x_offset1;
1080 }
1081 }
1082
1083 if (dims >= 2) {
1084 lp_build_sample_wrap_linear_int(bld,
1085 bld->format_desc->block.height,
1086 t_ipart, &t_fpart, t_float,
1087 height_vec, y_stride,
1088 bld->static_state->pot_height,
1089 bld->static_state->wrap_t,
1090 &y_offset0, &y_offset1,
1091 &y_subcoord[0], &y_subcoord[1]);
1092
1093 for (z = 0; z < 2; z++) {
1094 for (x = 0; x < 2; x++) {
1095 offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
1096 offset[z][0][x], y_offset0);
1097 offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
1098 offset[z][1][x], y_offset1);
1099 }
1100 }
1101 }
1102
1103 if (dims >= 3) {
1104 lp_build_sample_wrap_linear_int(bld,
1105 bld->format_desc->block.height,
1106 r_ipart, &r_fpart, r_float,
1107 depth_vec, z_stride,
1108 bld->static_state->pot_depth,
1109 bld->static_state->wrap_r,
1110 &z_offset0, &z_offset1,
1111 &z_subcoord[0], &z_subcoord[1]);
1112 for (y = 0; y < 2; y++) {
1113 for (x = 0; x < 2; x++) {
1114 offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
1115 offset[0][y][x], z_offset0);
1116 offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
1117 offset[1][y][x], z_offset1);
1118 }
1119 }
1120 }
1121 else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
1122 LLVMValueRef z_offset;
1123 z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
1124 for (y = 0; y < 2; y++) {
1125 for (x = 0; x < 2; x++) {
1126 /* The r coord is the cube face in [0,5] */
1127 offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
1128 offset[0][y][x], z_offset);
1129 }
1130 }
1131 }
1132
1133 lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
1134 x_subcoord, y_subcoord,
1135 s_fpart, t_fpart, r_fpart,
1136 colors_lo, colors_hi);
1137 }
1138
1139
1140 /**
1141 * Sample a single texture image with (bi-)(tri-)linear sampling.
1142 * Return filtered color as two vectors of 16-bit fixed point values.
1143 * Does address calcs (except offsets) with floats.
1144 * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
1145 */
1146 static void
1147 lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld,
1148 LLVMValueRef int_size,
1149 LLVMValueRef row_stride_vec,
1150 LLVMValueRef img_stride_vec,
1151 LLVMValueRef data_ptr,
1152 LLVMValueRef s,
1153 LLVMValueRef t,
1154 LLVMValueRef r,
1155 LLVMValueRef *colors_lo,
1156 LLVMValueRef *colors_hi)
1157 {
1158 const unsigned dims = bld->dims;
1159 LLVMValueRef width_vec, height_vec, depth_vec;
1160 LLVMValueRef s_fpart;
1161 LLVMValueRef t_fpart = NULL;
1162 LLVMValueRef r_fpart = NULL;
1163 LLVMValueRef x_stride, y_stride, z_stride;
1164 LLVMValueRef x_offset0, x_offset1;
1165 LLVMValueRef y_offset0, y_offset1;
1166 LLVMValueRef z_offset0, z_offset1;
1167 LLVMValueRef offset[2][2][2]; /* [z][y][x] */
1168 LLVMValueRef x_subcoord[2], y_subcoord[2];
1169 LLVMValueRef flt_size;
1170 LLVMValueRef x_icoord0, x_icoord1;
1171 LLVMValueRef y_icoord0, y_icoord1;
1172 LLVMValueRef z_icoord0, z_icoord1;
1173 unsigned x, y, z;
1174
1175 flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
1176
1177 lp_build_extract_image_sizes(bld,
1178 bld->float_size_type,
1179 bld->coord_type,
1180 flt_size,
1181 &width_vec,
1182 &height_vec,
1183 &depth_vec);
1184
1185 /* do texcoord wrapping and compute texel offsets */
1186 lp_build_sample_wrap_linear_float(bld,
1187 bld->format_desc->block.width,
1188 s, width_vec,
1189 bld->static_state->pot_width,
1190 bld->static_state->wrap_s,
1191 &x_icoord0, &x_icoord1,
1192 &s_fpart,
1193 bld->static_state->force_nearest_s);
1194
1195 if (dims >= 2) {
1196 lp_build_sample_wrap_linear_float(bld,
1197 bld->format_desc->block.height,
1198 t, height_vec,
1199 bld->static_state->pot_height,
1200 bld->static_state->wrap_t,
1201 &y_icoord0, &y_icoord1,
1202 &t_fpart,
1203 bld->static_state->force_nearest_t);
1204
1205 if (dims >= 3) {
1206 lp_build_sample_wrap_linear_float(bld,
1207 bld->format_desc->block.height,
1208 r, depth_vec,
1209 bld->static_state->pot_depth,
1210 bld->static_state->wrap_r,
1211 &z_icoord0, &z_icoord1,
1212 &r_fpart, 0);
1213 }
1214 }
1215
1216 /*
1217 * From here on we deal with ints, and we should split up the 256bit
1218 * vectors manually for better generated code.
1219 */
1220
1221 /* get pixel, row and image strides */
1222 x_stride = lp_build_const_vec(bld->gallivm,
1223 bld->int_coord_bld.type,
1224 bld->format_desc->block.bits/8);
1225 y_stride = row_stride_vec;
1226 z_stride = img_stride_vec;
1227
1228 /*
1229 * compute texel offset -
1230 * cannot do offset calc with floats, difficult for block-based formats,
1231 * and not enough precision anyway.
1232 */
1233 lp_build_sample_partial_offset(&bld->int_coord_bld,
1234 bld->format_desc->block.width,
1235 x_icoord0, x_stride,
1236 &x_offset0, &x_subcoord[0]);
1237 lp_build_sample_partial_offset(&bld->int_coord_bld,
1238 bld->format_desc->block.width,
1239 x_icoord1, x_stride,
1240 &x_offset1, &x_subcoord[1]);
1241 for (z = 0; z < 2; z++) {
1242 for (y = 0; y < 2; y++) {
1243 offset[z][y][0] = x_offset0;
1244 offset[z][y][1] = x_offset1;
1245 }
1246 }
1247
1248 if (dims >= 2) {
1249 lp_build_sample_partial_offset(&bld->int_coord_bld,
1250 bld->format_desc->block.height,
1251 y_icoord0, y_stride,
1252 &y_offset0, &y_subcoord[0]);
1253 lp_build_sample_partial_offset(&bld->int_coord_bld,
1254 bld->format_desc->block.height,
1255 y_icoord1, y_stride,
1256 &y_offset1, &y_subcoord[1]);
1257 for (z = 0; z < 2; z++) {
1258 for (x = 0; x < 2; x++) {
1259 offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
1260 offset[z][0][x], y_offset0);
1261 offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
1262 offset[z][1][x], y_offset1);
1263 }
1264 }
1265 }
1266
1267 if (dims >= 3) {
1268 LLVMValueRef z_subcoord[2];
1269 lp_build_sample_partial_offset(&bld->int_coord_bld,
1270 1,
1271 z_icoord0, z_stride,
1272 &z_offset0, &z_subcoord[0]);
1273 lp_build_sample_partial_offset(&bld->int_coord_bld,
1274 1,
1275 z_icoord1, z_stride,
1276 &z_offset1, &z_subcoord[1]);
1277 for (y = 0; y < 2; y++) {
1278 for (x = 0; x < 2; x++) {
1279 offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
1280 offset[0][y][x], z_offset0);
1281 offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
1282 offset[1][y][x], z_offset1);
1283 }
1284 }
1285 }
1286 else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
1287 LLVMValueRef z_offset;
1288 z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
1289 for (y = 0; y < 2; y++) {
1290 for (x = 0; x < 2; x++) {
1291 /* The r coord is the cube face in [0,5] */
1292 offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
1293 offset[0][y][x], z_offset);
1294 }
1295 }
1296 }
1297
1298 lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
1299 x_subcoord, y_subcoord,
1300 s_fpart, t_fpart, r_fpart,
1301 colors_lo, colors_hi);
1302 }
1303
1304
1305 /**
1306 * Sample the texture/mipmap using given image filter and mip filter.
1307 * ilevel0 and ilevel1 select the two mipmap levels to sample
1308 * from; the level sizes and strides are computed from them below.
1309 * If we're using nearest miplevel sampling the '1' values will be null/unused.
1310 */
1311 static void
1312 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
1313 unsigned img_filter,
1314 unsigned mip_filter,
1315 LLVMValueRef s,
1316 LLVMValueRef t,
1317 LLVMValueRef r,
1318 LLVMValueRef ilevel0,
1319 LLVMValueRef ilevel1,
1320 LLVMValueRef lod_fpart,
1321 LLVMValueRef colors_lo_var,
1322 LLVMValueRef colors_hi_var)
1323 {
1324 LLVMBuilderRef builder = bld->gallivm->builder;
1325 LLVMValueRef size0;
1326 LLVMValueRef size1;
1327 LLVMValueRef row_stride0_vec = NULL;
1328 LLVMValueRef row_stride1_vec = NULL;
1329 LLVMValueRef img_stride0_vec = NULL;
1330 LLVMValueRef img_stride1_vec = NULL;
1331 LLVMValueRef data_ptr0;
1332 LLVMValueRef data_ptr1;
1333 LLVMValueRef colors0_lo, colors0_hi;
1334 LLVMValueRef colors1_lo, colors1_hi;
1335
1336 /* sample the first mipmap level */
1337 lp_build_mipmap_level_sizes(bld, ilevel0,
1338 &size0,
1339 &row_stride0_vec, &img_stride0_vec);
1340 data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1341 if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
1342 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1343 lp_build_sample_image_nearest_afloat(bld,
1344 size0,
1345 row_stride0_vec, img_stride0_vec,
1346 data_ptr0, s, t, r,
1347 &colors0_lo, &colors0_hi);
1348 }
1349 else {
1350 assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1351 lp_build_sample_image_linear_afloat(bld,
1352 size0,
1353 row_stride0_vec, img_stride0_vec,
1354 data_ptr0, s, t, r,
1355 &colors0_lo, &colors0_hi);
1356 }
1357 }
1358 else {
1359 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1360 lp_build_sample_image_nearest(bld,
1361 size0,
1362 row_stride0_vec, img_stride0_vec,
1363 data_ptr0, s, t, r,
1364 &colors0_lo, &colors0_hi);
1365 }
1366 else {
1367 assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1368 lp_build_sample_image_linear(bld,
1369 size0,
1370 row_stride0_vec, img_stride0_vec,
1371 data_ptr0, s, t, r,
1372 &colors0_lo, &colors0_hi);
1373 }
1374 }
1375
1376 /* Store the first level's colors in the output variables */
1377 LLVMBuildStore(builder, colors0_lo, colors_lo_var);
1378 LLVMBuildStore(builder, colors0_hi, colors_hi_var);
1379
1380 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1381 LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm,
1382 bld->perquadf_bld.type, 256.0);
1383 LLVMTypeRef i32vec_type = lp_build_vec_type(bld->gallivm, bld->perquadi_bld.type);
1384 struct lp_build_if_state if_ctx;
1385 LLVMValueRef need_lerp;
1386 unsigned num_quads = bld->coord_bld.type.length / 4;
1387 unsigned i;
1388
1389 lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16vec_scale, "");
1390 lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16");
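/* e.g. a lod fraction of 0.5 becomes 128 in this 8.8 fixed point form */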
1391
1392 /* need_lerp = lod_fpart > 0 */
1393 if (num_quads == 1) {
1394 need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
1395 lod_fpart, bld->perquadi_bld.zero,
1396 "need_lerp");
1397 }
1398 else {
1399 /*
1400 * We'll do mip filtering if any of the quads need it.
1401 * It might be better to split the vectors here and only fetch/filter
1402 * quads which need it.
1403 */
1404 /*
1405 * We need to clamp lod_fpart here since we can get negative
1406 * values which would screw up filtering if not all
1407 * lod_fpart values have the same sign.
1408 * We can however then skip the greater than comparison.
1409 */
1410 lod_fpart = lp_build_max(&bld->perquadi_bld, lod_fpart,
1411 bld->perquadi_bld.zero);
1412 need_lerp = lp_build_any_true_range(&bld->perquadi_bld, num_quads, lod_fpart);
1413 }
1414
1415 lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1416 {
1417 struct lp_build_context h16_bld;
1418
1419 lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
1420
1421 /* sample the second mipmap level */
1422 lp_build_mipmap_level_sizes(bld, ilevel1,
1423 &size1,
1424 &row_stride1_vec, &img_stride1_vec);
1425 data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1426
1427 if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
1428 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1429 lp_build_sample_image_nearest_afloat(bld,
1430 size1,
1431 row_stride1_vec, img_stride1_vec,
1432 data_ptr1, s, t, r,
1433 &colors1_lo, &colors1_hi);
1434 }
1435 else {
1436 lp_build_sample_image_linear_afloat(bld,
1437 size1,
1438 row_stride1_vec, img_stride1_vec,
1439 data_ptr1, s, t, r,
1440 &colors1_lo, &colors1_hi);
1441 }
1442 }
1443 else {
1444 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1445 lp_build_sample_image_nearest(bld,
1446 size1,
1447 row_stride1_vec, img_stride1_vec,
1448 data_ptr1, s, t, r,
1449 &colors1_lo, &colors1_hi);
1450 }
1451 else {
1452 lp_build_sample_image_linear(bld,
1453 size1,
1454 row_stride1_vec, img_stride1_vec,
1455 data_ptr1, s, t, r,
1456 &colors1_lo, &colors1_hi);
1457 }
1458 }
1459
1460 /* interpolate samples from the two mipmap levels */
1461
1462 if (num_quads == 1) {
1463 lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, "");
1464 lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart);
1465
1466 #if HAVE_LLVM == 0x208
1467 /* This is a work-around for a bug in LLVM 2.8.
1468 * Evidently, something goes wrong in the construction of the
1469 * lod_fpart short[8] vector. Adding this no-effect shuffle seems
1470 * to force the vector to be properly constructed.
1471 * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f).
1472 */
1473 {
1474 LLVMValueRef shuffles[8], shuffle;
1475 assert(h16_bld.type.length <= Elements(shuffles));
1476 for (i = 0; i < h16_bld.type.length; i++)
1477 shuffles[i] = lp_build_const_int32(bld->gallivm, 2 * (i & 1));
1478 shuffle = LLVMConstVector(shuffles, h16_bld.type.length);
1479 lod_fpart = LLVMBuildShuffleVector(builder,
1480 lod_fpart, lod_fpart,
1481 shuffle, "");
1482 }
1483 #endif
1484
1485 colors0_lo = lp_build_lerp(&h16_bld, lod_fpart,
1486 colors0_lo, colors1_lo);
1487 colors0_hi = lp_build_lerp(&h16_bld, lod_fpart,
1488 colors0_hi, colors1_hi);
1489 }
1490 else {
1491 LLVMValueRef lod_parts[LP_MAX_VECTOR_LENGTH/16];
1492 struct lp_type perquadi16_type = bld->perquadi_bld.type;
1493 perquadi16_type.width /= 2;
1494 perquadi16_type.length *= 2;
1495 lod_fpart = LLVMBuildBitCast(builder, lod_fpart,
1496 lp_build_vec_type(bld->gallivm,
1497 perquadi16_type), "");
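/*
 * E.g. with two quads, lod_fpart is 2 x i32 holding 8.8 weights; seen
 * as 4 x i16 (little endian) it reads {lo0, hi0, lo1, hi1}, so the
 * indices 0 and 2 used below pick each quad's low half.
 */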
1498 /* XXX this only works for exactly 2 quads. More quads need shuffle */
1499 assert(num_quads == 2);
1500 for (i = 0; i < num_quads; i++) {
1501 LLVMValueRef indexi2 = lp_build_const_int32(bld->gallivm, i*2);
1502 lod_parts[i] = lp_build_extract_broadcast(bld->gallivm,
1503 perquadi16_type,
1504 h16_bld.type,
1505 lod_fpart,
1506 indexi2);
1507 }
1508 colors0_lo = lp_build_lerp(&h16_bld, lod_parts[0],
1509 colors0_lo, colors1_lo);
1510 colors0_hi = lp_build_lerp(&h16_bld, lod_parts[1],
1511 colors0_hi, colors1_hi);
1512 }
1513
1514 LLVMBuildStore(builder, colors0_lo, colors_lo_var);
1515 LLVMBuildStore(builder, colors0_hi, colors_hi_var);
1516 }
1517 lp_build_endif(&if_ctx);
1518 }
1519 }
1520
1521
1522
1523 /**
1524 * Texture sampling in AoS format. Used when sampling common 32-bit/texel
1525 * formats. 1D/2D/3D/cube textures are supported, with all mipmap sampling
1526 * modes but only a limited set of texture coord wrap modes.
1527 */
1528 void
1529 lp_build_sample_aos(struct lp_build_sample_context *bld,
1530 unsigned unit,
1531 LLVMValueRef s,
1532 LLVMValueRef t,
1533 LLVMValueRef r,
1534 LLVMValueRef lod_ipart,
1535 LLVMValueRef lod_fpart,
1536 LLVMValueRef ilevel0,
1537 LLVMValueRef ilevel1,
1538 LLVMValueRef texel_out[4])
1539 {
1540 struct lp_build_context *int_bld = &bld->int_bld;
1541 LLVMBuilderRef builder = bld->gallivm->builder;
1542 const unsigned mip_filter = bld->static_state->min_mip_filter;
1543 const unsigned min_filter = bld->static_state->min_img_filter;
1544 const unsigned mag_filter = bld->static_state->mag_img_filter;
1545 const unsigned dims = bld->dims;
1546 LLVMValueRef packed, packed_lo, packed_hi;
1547 LLVMValueRef unswizzled[4];
1548 struct lp_build_context h16_bld;
1549
1550 /* we only support the common/simple wrap modes at this time */
1551 assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s));
1552 if (dims >= 2)
1553 assert(lp_is_simple_wrap_mode(bld->static_state->wrap_t));
1554 if (dims >= 3)
1555 assert(lp_is_simple_wrap_mode(bld->static_state->wrap_r));
1556
1557
1558 /* make 16-bit fixed-pt builder context */
1559 lp_build_context_init(&h16_bld, bld->gallivm, lp_type_ufixed(16, bld->vector_width));
1560
1561 /*
1562 * Get/interpolate texture colors.
1563 */
1564
1565 packed_lo = lp_build_alloca(bld->gallivm, h16_bld.vec_type, "packed_lo");
1566 packed_hi = lp_build_alloca(bld->gallivm, h16_bld.vec_type, "packed_hi");
1567
1568 if (min_filter == mag_filter) {
1569 /* no need to distinguish between minification and magnification */
1570 lp_build_sample_mipmap(bld,
1571 min_filter, mip_filter,
1572 s, t, r,
1573 ilevel0, ilevel1, lod_fpart,
1574 packed_lo, packed_hi);
1575 }
1576 else {
1577 /* Emit conditional to choose min image filter or mag image filter
1578 * depending on the lod being >= 0 or < 0, respectively.
1579 */
1580 struct lp_build_if_state if_ctx;
1581 LLVMValueRef minify;
1582
1583 /* minify = lod >= 0.0 */
1584 minify = LLVMBuildICmp(builder, LLVMIntSGE,
1585 lod_ipart, int_bld->zero, "");
1586
1587 lp_build_if(&if_ctx, bld->gallivm, minify);
1588 {
1589 /* Use the minification filter */
1590 lp_build_sample_mipmap(bld,
1591 min_filter, mip_filter,
1592 s, t, r,
1593 ilevel0, ilevel1, lod_fpart,
1594 packed_lo, packed_hi);
1595 }
1596 lp_build_else(&if_ctx);
1597 {
1598 /* Use the magnification filter */
1599 lp_build_sample_mipmap(bld,
1600 mag_filter, PIPE_TEX_MIPFILTER_NONE,
1601 s, t, r,
1602 ilevel0, NULL, NULL,
1603 packed_lo, packed_hi);
1604 }
1605 lp_build_endif(&if_ctx);
1606 }
1607
1608 /*
1609 * combine the values stored in 'packed_lo' and 'packed_hi' variables
1610 * into 'packed'
1611 */
1612 packed = lp_build_pack2(bld->gallivm,
1613 h16_bld.type, lp_type_unorm(8, bld->vector_width),
1614 LLVMBuildLoad(builder, packed_lo, ""),
1615 LLVMBuildLoad(builder, packed_hi, ""));
1616
1617 /*
1618 * Convert to SoA and swizzle.
1619 */
1620 lp_build_rgba8_to_f32_soa(bld->gallivm,
1621 bld->texel_type,
1622 packed, unswizzled);
1623
1624 if (util_format_is_rgba8_variant(bld->format_desc)) {
1625 lp_build_format_swizzle_soa(bld->format_desc,
1626 &bld->texel_bld,
1627 unswizzled, texel_out);
1628 }
1629 else {
1630 texel_out[0] = unswizzled[0];
1631 texel_out[1] = unswizzled[1];
1632 texel_out[2] = unswizzled[2];
1633 texel_out[3] = unswizzled[3];
1634 }
1635 }