[mesa.git] src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 /**
29 * @file
30 * Texture sampling -- SoA.
31 *
32 * @author Jose Fonseca <jfonseca@vmware.com>
33 * @author Brian Paul <brianp@vmware.com>
34 */
35
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "pipe/p_shader_tokens.h"
39 #include "util/u_debug.h"
40 #include "util/u_dump.h"
41 #include "util/u_memory.h"
42 #include "util/u_math.h"
43 #include "util/format/u_format.h"
44 #include "util/u_cpu_detect.h"
45 #include "util/format_rgb9e5.h"
46 #include "lp_bld_debug.h"
47 #include "lp_bld_type.h"
48 #include "lp_bld_const.h"
49 #include "lp_bld_conv.h"
50 #include "lp_bld_arit.h"
51 #include "lp_bld_bitarit.h"
52 #include "lp_bld_logic.h"
53 #include "lp_bld_printf.h"
54 #include "lp_bld_swizzle.h"
55 #include "lp_bld_flow.h"
56 #include "lp_bld_gather.h"
57 #include "lp_bld_format.h"
58 #include "lp_bld_sample.h"
59 #include "lp_bld_sample_aos.h"
60 #include "lp_bld_struct.h"
61 #include "lp_bld_quad.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_intr.h"
64 #include "lp_bld_misc.h"
65
66
67 /**
68 * Generate code to fetch a texel from a texture at int coords (x, y, z).
69 * The computation depends on whether the texture is 1D, 2D or 3D.
70 * The result, texel, will be float vectors:
71 * texel[0] = red values
72 * texel[1] = green values
73 * texel[2] = blue values
74 * texel[3] = alpha values
75 */
76 static void
77 lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
78 LLVMValueRef width,
79 LLVMValueRef height,
80 LLVMValueRef depth,
81 LLVMValueRef x,
82 LLVMValueRef y,
83 LLVMValueRef z,
84 LLVMValueRef y_stride,
85 LLVMValueRef z_stride,
86 LLVMValueRef data_ptr,
87 LLVMValueRef mipoffsets,
88 LLVMValueRef texel_out[4])
89 {
90 const struct lp_static_sampler_state *static_state = bld->static_sampler_state;
91 const unsigned dims = bld->dims;
92 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
93 LLVMBuilderRef builder = bld->gallivm->builder;
94 LLVMValueRef offset;
95 LLVMValueRef i, j;
96 LLVMValueRef use_border = NULL;
97
98 /* use_border = x < 0 || x >= width || y < 0 || y >= height */
99 if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s,
100 static_state->min_img_filter,
101 static_state->mag_img_filter)) {
102 LLVMValueRef b1, b2;
103 b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
104 b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
105 use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
106 }
107
108 if (dims >= 2 &&
109 lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t,
110 static_state->min_img_filter,
111 static_state->mag_img_filter)) {
112 LLVMValueRef b1, b2;
113 b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
114 b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
115 if (use_border) {
116 use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
117 use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
118 }
119 else {
120 use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
121 }
122 }
123
124 if (dims == 3 &&
125 lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r,
126 static_state->min_img_filter,
127 static_state->mag_img_filter)) {
128 LLVMValueRef b1, b2;
129 b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
130 b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
131 if (use_border) {
132 use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
133 use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
134 }
135 else {
136 use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
137 }
138 }
139
140 /* convert x,y,z coords to linear offset from start of texture, in bytes */
141 lp_build_sample_offset(&bld->int_coord_bld,
142 bld->format_desc,
143 x, y, z, y_stride, z_stride,
144 &offset, &i, &j);
145 if (mipoffsets) {
146 offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
147 }
148
149 if (use_border) {
150 /* If we can sample the border color, it means that texcoords may
151 * lie outside the bounds of the texture image. We need to do
152 * something to prevent reading out of bounds and causing a segfault.
153 *
154 * Simply AND the computed texel offset with !use_border. This causes
155 * offsets for out-of-bounds coords to become zero. Offset zero is
156 * guaranteed to be inside the texture image.
157 */
158 offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border);
159 }
160
161 lp_build_fetch_rgba_soa(bld->gallivm,
162 bld->format_desc,
163 bld->texel_type, TRUE,
164 data_ptr, offset,
165 i, j,
166 bld->cache,
167 texel_out);
168
169 /*
170 * Note: if we find an app which frequently samples the texture border
171 * we might want to implement a true conditional here to avoid sampling
172 * the texture whenever possible (since that's quite a bit of code).
173 * Ex:
174 * if (use_border) {
175 * texel = border_color;
176 * }
177 * else {
178 * texel = sample_texture(coord);
179 * }
180 * As it is now, we always sample the texture, then selectively replace
181 * the texel color results with the border color.
182 */
183
184 if (use_border) {
185 /* select texel color or border color depending on use_border. */
186 const struct util_format_description *format_desc = bld->format_desc;
187 int chan;
188 struct lp_type border_type = bld->texel_type;
189 border_type.length = 4;
190 /*
191 * Only replace channels which are actually present. The others would
192 * get optimized away by the sampler_view swizzle eventually anyway, and
193 * skipping them here is easier too.
194 */
195 for (chan = 0; chan < 4; chan++) {
196 unsigned chan_s;
197 /* reverse-map channel... */
198 for (chan_s = 0; chan_s < 4; chan_s++) {
199 if (chan_s == format_desc->swizzle[chan]) {
200 break;
201 }
202 }
203 if (chan_s <= 3) {
204 /* use the already clamped color */
205 LLVMValueRef idx = lp_build_const_int32(bld->gallivm, chan);
206 LLVMValueRef border_chan;
207
208 border_chan = lp_build_extract_broadcast(bld->gallivm,
209 border_type,
210 bld->texel_type,
211 bld->border_color_clamped,
212 idx);
213 texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
214 border_chan, texel_out[chan]);
215 }
216 }
217 }
218 }
219
220
221 /**
222 * Helper to compute the mirror function for the PIPE_WRAP_MIRROR_REPEAT mode.
223 * (Note that with pot sizes we could do this much more easily post-scale
224 * with some bit arithmetic; see the sketch after this function.)
225 */
226 static LLVMValueRef
227 lp_build_coord_mirror(struct lp_build_sample_context *bld,
228 LLVMValueRef coord, boolean posOnly)
229 {
230 struct lp_build_context *coord_bld = &bld->coord_bld;
231 LLVMValueRef fract;
232 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
233
234 /*
235 * We can just use 2*(x - round(0.5*x)) to do all the mirroring;
236 * it all works out. (The result is in range [-1.0, 1.0], negative if
237 * the coord is in the "odd" section, otherwise positive.)
238 */
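/*
 * Worked example (for illustration): coord = 1.3 gives
 * 0.5*1.3 = 0.65, round(0.65) = 1, 2*(0.65 - 1) = -0.7 (odd section,
 * mirrored), while coord = 2.3 gives 0.5*2.3 = 1.15, round(1.15) = 1,
 * 2*(1.15 - 1) = 0.3 (even section, kept as-is).
 */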
239
240 coord = lp_build_mul(coord_bld, coord, half);
241 fract = lp_build_round(coord_bld, coord);
242 fract = lp_build_sub(coord_bld, coord, fract);
243 coord = lp_build_add(coord_bld, fract, fract);
244
245 if (posOnly) {
246 /*
247 * Theoretically it's not quite 100% accurate because the spec says
248 * that ultimately a scaled coord of -x.0 should map to int coord
249 * -x + 1 with mirroring, not -x (this does not matter for bilinear
250 * filtering).
251 */
252 coord = lp_build_abs(coord_bld, coord);
253 /* kill off NaNs */
254 /* XXX: not safe without arch rounding, fract can be anything. */
255 coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero,
256 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
257 }
258
259 return coord;
260 }
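/*
 * Untested sketch (not part of the original code) of the bit-arithmetic
 * variant mentioned in the comment above, operating post-scale on an
 * integer texel coord i with power-of-two size length:
 *
 *    mirrored = i & (length - 1);
 *    if (i & length)
 *       mirrored = (length - 1) - mirrored;   (odd section: reflect)
 *
 * E.g. length = 4, i = 5: 5 & 3 = 1, 5 & 4 != 0, so 3 - 1 = 2;
 * and i = -1: -1 & 3 = 3, -1 & 4 != 0, so 3 - 3 = 0.
 */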
261
262
263 /**
264 * Helper to compute the first coord and the weight for
265 * linear wrap repeat npot textures
266 */
267 void
268 lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
269 LLVMValueRef coord_f,
270 LLVMValueRef length_i,
271 LLVMValueRef length_f,
272 LLVMValueRef *coord0_i,
273 LLVMValueRef *weight_f)
274 {
275 struct lp_build_context *coord_bld = &bld->coord_bld;
276 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
277 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
278 LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
279 int_coord_bld->one);
280 LLVMValueRef mask;
281 /* wrap with normalized floats is just fract */
282 coord_f = lp_build_fract(coord_bld, coord_f);
283 /* mul by size and subtract 0.5 */
284 coord_f = lp_build_mul(coord_bld, coord_f, length_f);
285 coord_f = lp_build_sub(coord_bld, coord_f, half);
286 /*
287 * we avoided the 0.5/length division before the repeat wrap,
288 * so now we need to fix up edge cases with selects
289 */
290 /*
291 * Note we do a float (unordered) compare so we can eliminate NaNs.
292 * (Otherwise would need fract_safe above).
293 */
294 mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
295 PIPE_FUNC_LESS, coord_f, coord_bld->zero);
296
297 /* convert to int, compute lerp weight */
298 lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
299 *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
300 }
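/*
 * Worked example (for illustration): length = 3, coord_f = 0.1:
 * fract = 0.1, * 3 = 0.3, - 0.5 = -0.2, so ifloor/fract give
 * coord0 = -1 with weight 0.8; the < 0 mask then selects
 * coord0 = length - 1 = 2, i.e. the wrapped-around texel 2 is blended
 * with texel 0 at weight 0.8.
 */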
301
302
303 /**
304 * Build LLVM code for texture wrap mode for linear filtering.
305 * \param x0_out returns first integer texcoord
306 * \param x1_out returns second integer texcoord
307 * \param weight_out returns linear interpolation weight
308 */
309 static void
310 lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
311 boolean is_gather,
312 LLVMValueRef coord,
313 LLVMValueRef length,
314 LLVMValueRef length_f,
315 LLVMValueRef offset,
316 boolean is_pot,
317 unsigned wrap_mode,
318 LLVMValueRef *x0_out,
319 LLVMValueRef *x1_out,
320 LLVMValueRef *weight_out)
321 {
322 struct lp_build_context *coord_bld = &bld->coord_bld;
323 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
324 LLVMBuilderRef builder = bld->gallivm->builder;
325 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
326 LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
327 LLVMValueRef coord0, coord1, weight;
328
329 switch(wrap_mode) {
330 case PIPE_TEX_WRAP_REPEAT:
331 if (is_pot) {
332 /* mul by size and subtract 0.5 */
333 coord = lp_build_mul(coord_bld, coord, length_f);
334 coord = lp_build_sub(coord_bld, coord, half);
335 if (offset) {
336 offset = lp_build_int_to_float(coord_bld, offset);
337 coord = lp_build_add(coord_bld, coord, offset);
338 }
339 /* convert to int, compute lerp weight */
340 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
341 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
342 /* repeat wrap */
343 coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
344 coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
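/*
 * E.g. (for illustration) length = 8, coord = 0.02: scaled and shifted
 * to 0.16 - 0.5 = -0.34, so coord0 = -1, weight = 0.66, coord1 = 0;
 * the AND with 7 then wraps coord0 around to 7 (the last texel).
 */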
345 }
346 else {
347 LLVMValueRef mask;
348 if (offset) {
349 offset = lp_build_int_to_float(coord_bld, offset);
350 offset = lp_build_div(coord_bld, offset, length_f);
351 coord = lp_build_add(coord_bld, coord, offset);
352 }
353 lp_build_coord_repeat_npot_linear(bld, coord,
354 length, length_f,
355 &coord0, &weight);
356 mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
357 PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
358 coord1 = LLVMBuildAnd(builder,
359 lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
360 mask, "");
361 }
362 break;
363
364 case PIPE_TEX_WRAP_CLAMP:
365 if (bld->static_sampler_state->normalized_coords) {
366 /* scale coord to length */
367 coord = lp_build_mul(coord_bld, coord, length_f);
368 }
369 if (offset) {
370 offset = lp_build_int_to_float(coord_bld, offset);
371 coord = lp_build_add(coord_bld, coord, offset);
372 }
373
374 /*
375 * clamp to [0, length]
376 *
377 * Unlike some other wrap modes, this should be correct for gather
378 * too. GL_CLAMP explicitly does this clamp on the coord prior to
379 * actual wrapping (which is per sample).
380 */
381 coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
382
383 coord = lp_build_sub(coord_bld, coord, half);
384
385 /* convert to int, compute lerp weight */
386 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
387 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
388 break;
389
390 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
391 {
392 struct lp_build_context abs_coord_bld = bld->coord_bld;
393 abs_coord_bld.type.sign = FALSE;
394
395 if (bld->static_sampler_state->normalized_coords) {
396 /* mul by tex size */
397 coord = lp_build_mul(coord_bld, coord, length_f);
398 }
399 if (offset) {
400 offset = lp_build_int_to_float(coord_bld, offset);
401 coord = lp_build_add(coord_bld, coord, offset);
402 }
403
404 /* clamp to length max */
405 coord = lp_build_min_ext(coord_bld, coord, length_f,
406 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
407 if (!is_gather) {
408 /* subtract 0.5 */
409 coord = lp_build_sub(coord_bld, coord, half);
410 /* clamp to [0, length - 0.5] */
411 coord = lp_build_max(coord_bld, coord, coord_bld->zero);
412 /* convert to int, compute lerp weight */
413 lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
414 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
415 } else {
416 /*
417 * The non-gather path will end up with coords 0, 1 if coord was
418 * smaller than 0.5 (with corresponding weight 0.0 so it doesn't
419 * really matter what the second coord is). But for gather, we
420 * really need to end up with coords 0, 0.
421 */
422 coord = lp_build_max(coord_bld, coord, coord_bld->zero);
423 coord0 = lp_build_sub(coord_bld, coord, half);
424 coord1 = lp_build_add(coord_bld, coord, half);
425 /* Value ranges: [-0.5, length_f - 0.5] and [0.5, length_f + 0.5] */
426 coord0 = lp_build_itrunc(coord_bld, coord0);
427 coord1 = lp_build_itrunc(coord_bld, coord1);
428 weight = coord_bld->undef;
429 }
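/*
 * E.g. (for illustration) a scaled coord of 0.3 in the gather path:
 * coord0 = itrunc(-0.2) = 0 and coord1 = itrunc(0.8) = 0, i.e. both
 * coords end up as 0, as required (itrunc truncates towards zero,
 * unlike ifloor).
 */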
430 /* coord1 = min(coord1, length-1) */
431 coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
432 break;
433 }
434
435 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
436 if (bld->static_sampler_state->normalized_coords) {
437 /* scale coord to length */
438 coord = lp_build_mul(coord_bld, coord, length_f);
439 }
440 if (offset) {
441 offset = lp_build_int_to_float(coord_bld, offset);
442 coord = lp_build_add(coord_bld, coord, offset);
443 }
444 /*
445 * We don't need any clamp. Technically, for very large (pos or neg)
446 * (or infinite) values, clamp against [-length, length] would be
447 * correct, but we don't need to guarantee any specific
448 * result for such coords (the ifloor will be undefined, but for modes
449 * requiring border all resulting coords are safe).
450 */
451 coord = lp_build_sub(coord_bld, coord, half);
452 /* convert to int, compute lerp weight */
453 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
454 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
455 break;
456
457 case PIPE_TEX_WRAP_MIRROR_REPEAT:
458 if (offset) {
459 offset = lp_build_int_to_float(coord_bld, offset);
460 offset = lp_build_div(coord_bld, offset, length_f);
461 coord = lp_build_add(coord_bld, coord, offset);
462 }
463 if (!is_gather) {
464 /* compute mirror function */
465 coord = lp_build_coord_mirror(bld, coord, TRUE);
466
467 /* scale coord to length */
468 coord = lp_build_mul(coord_bld, coord, length_f);
469 coord = lp_build_sub(coord_bld, coord, half);
470
471 /* convert to int, compute lerp weight */
472 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
473 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
474
475 /* coord0 = max(coord0, 0) */
476 coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
477 /* coord1 = min(coord1, length-1) */
478 coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
479 } else {
480 /*
481 * This is pretty reasonable in the end; all the tests care
482 * about is nasty edge cases (scaled coords x.5, so the individual
483 * coords are actually integers, which is REALLY tricky to get right
484 * since it works differently both for negative numbers and for
485 * even/odd cases). But with enough magic it's not too complex
486 * after all.
487 * Maybe should try a bit arithmetic one though for POT textures...
488 */
489 LLVMValueRef isNeg;
490 /*
491 * Wrapping just once still works, even though it means we can
492 * get "wrong" sign due to performing mirror in the middle of the
493 * two coords (because this can only happen very near the odd/even
494 * edges, so both coords will actually end up as 0 or length - 1
495 * in the end).
496 * For GL4 gather with per-sample offsets we'd need to do the mirroring
497 * per coord too.
498 */
499 coord = lp_build_coord_mirror(bld, coord, FALSE);
500 coord = lp_build_mul(coord_bld, coord, length_f);
501
502 /*
503 * NaNs should be safe here, we'll do away with them with
504 * the ones' complement plus min.
505 */
506 coord0 = lp_build_sub(coord_bld, coord, half);
507 coord0 = lp_build_ifloor(coord_bld, coord0);
508 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
509 /* ones' complement for neg numbers (mirror(negX) = X - 1) */
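/*
 * E.g. (for illustration) coord0 = -1 becomes ~(-1) = 0 and
 * coord0 = -2 becomes ~(-2) = 1, which is exactly the mirrored
 * coord for the negative side.
 */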
510 isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
511 coord0, int_coord_bld->zero);
512 coord0 = lp_build_xor(int_coord_bld, coord0, isNeg);
513 isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
514 coord1, int_coord_bld->zero);
515 coord1 = lp_build_xor(int_coord_bld, coord1, isNeg);
516 coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
517 coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
518
519 weight = coord_bld->undef;
520 }
521 break;
522
523 case PIPE_TEX_WRAP_MIRROR_CLAMP:
524 if (bld->static_sampler_state->normalized_coords) {
525 /* scale coord to length */
526 coord = lp_build_mul(coord_bld, coord, length_f);
527 }
528 if (offset) {
529 offset = lp_build_int_to_float(coord_bld, offset);
530 coord = lp_build_add(coord_bld, coord, offset);
531 }
532 /*
533 * XXX: probably not correct for gather, albeit I'm not
534 * entirely sure as it's poorly specified. The wrapping looks
535 * correct according to the spec, which is written against gl 1.2.1;
536 * however, negative values will be swapped - gl re-specified
537 * wrapping in newer versions (no more pre-clamp except with
538 * GL_CLAMP).
539 */
540 coord = lp_build_abs(coord_bld, coord);
541
542 /* clamp to [0, length] */
543 coord = lp_build_min_ext(coord_bld, coord, length_f,
544 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
545
546 coord = lp_build_sub(coord_bld, coord, half);
547
548 /* convert to int, compute lerp weight */
549 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
550 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
551 break;
552
553 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
554 {
555 struct lp_build_context abs_coord_bld = bld->coord_bld;
556 abs_coord_bld.type.sign = FALSE;
557
558 if (bld->static_sampler_state->normalized_coords) {
559 /* scale coord to length */
560 coord = lp_build_mul(coord_bld, coord, length_f);
561 }
562 if (offset) {
563 offset = lp_build_int_to_float(coord_bld, offset);
564 coord = lp_build_add(coord_bld, coord, offset);
565 }
566 if (!is_gather) {
567 coord = lp_build_abs(coord_bld, coord);
568
569 /* clamp to length max */
570 coord = lp_build_min_ext(coord_bld, coord, length_f,
571 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
572 /* subtract 0.5 */
573 coord = lp_build_sub(coord_bld, coord, half);
574 /* clamp to [0, length - 0.5] */
575 coord = lp_build_max(coord_bld, coord, coord_bld->zero);
576
577 /* convert to int, compute lerp weight */
578 lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
579 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
580 /* coord1 = min(coord1, length-1) */
581 coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
582 } else {
583 /*
584 * The non-gather path will swap coord0/1 if coord was negative,
585 * which is ok for filtering since the filter weight matches
586 * accordingly. Also, if coord is close to zero, coord0/1 will
587 * be 0 and 1, instead of 0 and 0 (again ok due to filter
588 * weight being 0.0). Both issues need to be fixed for gather.
589 */
590 LLVMValueRef isNeg;
591
592 /*
593 * Actually wanted to cheat here and use:
594 * coord1 = lp_build_iround(coord_bld, coord);
595 * but it's not good enough for some tests (even piglit
596 * textureGather is set up in a way so the coords are always
597 * .5, that is right at the crossover points).
598 * So do ordinary sub/floor, then do ones' complement
599 * for negative numbers.
600 * (Note we can't just do sub|add/abs/itrunc per coord either -
601 * because the spec demands that mirror(3.0) = 3 but
602 * mirror(-3.0) = 2.)
603 */
604 coord = lp_build_sub(coord_bld, coord, half);
605 coord0 = lp_build_ifloor(coord_bld, coord);
606 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
607 isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord0,
608 int_coord_bld->zero);
609 coord0 = lp_build_xor(int_coord_bld, isNeg, coord0);
610 coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
611
612 isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord1,
613 int_coord_bld->zero);
614 coord1 = lp_build_xor(int_coord_bld, isNeg, coord1);
615 coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
616
617 weight = coord_bld->undef;
618 }
619 }
620 break;
621
622 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
623 {
624 if (bld->static_sampler_state->normalized_coords) {
625 /* scale coord to length */
626 coord = lp_build_mul(coord_bld, coord, length_f);
627 }
628 if (offset) {
629 offset = lp_build_int_to_float(coord_bld, offset);
630 coord = lp_build_add(coord_bld, coord, offset);
631 }
632 /*
633 * XXX: probably not correct for gather due to swapped
634 * order if coord is negative (same rationale as for
635 * MIRROR_CLAMP).
636 */
637 coord = lp_build_abs(coord_bld, coord);
638
639 /*
640 * We don't need any clamp. Technically, for very large
641 * (or infinite) values, clamp against length would be
642 * correct, but we don't need to guarantee any specific
643 * result for such coords (the ifloor will be undefined, but
644 * for modes requiring border all resulting coords are safe).
645 */
646 coord = lp_build_sub(coord_bld, coord, half);
647
648 /* convert to int, compute lerp weight */
649 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
650 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
651 }
652 break;
653
654 default:
655 assert(0);
656 coord0 = NULL;
657 coord1 = NULL;
658 weight = NULL;
659 }
660
661 *x0_out = coord0;
662 *x1_out = coord1;
663 *weight_out = weight;
664 }
665
666
667 /**
668 * Build LLVM code for texture wrap mode for nearest filtering.
669 * \param coord the incoming texcoord (nominally in [0,1])
670 * \param length the texture size along one dimension, as int vector
671 * \param length_f the texture size along one dimension, as float vector
672 * \param offset texel offset along one dimension (as int vector)
673 * \param is_pot if TRUE, length is a power of two
674 * \param wrap_mode one of PIPE_TEX_WRAP_x
675 */
676 static LLVMValueRef
677 lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
678 LLVMValueRef coord,
679 LLVMValueRef length,
680 LLVMValueRef length_f,
681 LLVMValueRef offset,
682 boolean is_pot,
683 unsigned wrap_mode)
684 {
685 struct lp_build_context *coord_bld = &bld->coord_bld;
686 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
687 LLVMBuilderRef builder = bld->gallivm->builder;
688 LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
689 LLVMValueRef icoord;
690
691 switch(wrap_mode) {
692 case PIPE_TEX_WRAP_REPEAT:
693 if (is_pot) {
694 coord = lp_build_mul(coord_bld, coord, length_f);
695 icoord = lp_build_ifloor(coord_bld, coord);
696 if (offset) {
697 icoord = lp_build_add(int_coord_bld, icoord, offset);
698 }
699 icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
700 }
701 else {
702 if (offset) {
703 offset = lp_build_int_to_float(coord_bld, offset);
704 offset = lp_build_div(coord_bld, offset, length_f);
705 coord = lp_build_add(coord_bld, coord, offset);
706 }
707 /* take fraction, unnormalize */
708 coord = lp_build_fract_safe(coord_bld, coord);
709 coord = lp_build_mul(coord_bld, coord, length_f);
710 icoord = lp_build_itrunc(coord_bld, coord);
711 }
712 break;
713
714 case PIPE_TEX_WRAP_CLAMP:
715 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
716 if (bld->static_sampler_state->normalized_coords) {
717 /* scale coord to length */
718 coord = lp_build_mul(coord_bld, coord, length_f);
719 }
720
721 if (offset) {
722 offset = lp_build_int_to_float(coord_bld, offset);
723 coord = lp_build_add(coord_bld, coord, offset);
724 }
725 /* floor */
726 /* use itrunc instead since we clamp to 0 anyway */
727 icoord = lp_build_itrunc(coord_bld, coord);
728
729 /* clamp to [0, length - 1]. */
730 icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
731 length_minus_one);
732 break;
733
734 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
735 if (bld->static_sampler_state->normalized_coords) {
736 /* scale coord to length */
737 coord = lp_build_mul(coord_bld, coord, length_f);
738 }
739 /* no clamp necessary, border masking will handle this */
740 icoord = lp_build_ifloor(coord_bld, coord);
741 if (offset) {
742 icoord = lp_build_add(int_coord_bld, icoord, offset);
743 }
744 break;
745
746 case PIPE_TEX_WRAP_MIRROR_REPEAT:
747 if (offset) {
748 offset = lp_build_int_to_float(coord_bld, offset);
749 offset = lp_build_div(coord_bld, offset, length_f);
750 coord = lp_build_add(coord_bld, coord, offset);
751 }
752 /* compute mirror function */
753 coord = lp_build_coord_mirror(bld, coord, TRUE);
754
755 /* scale coord to length */
756 assert(bld->static_sampler_state->normalized_coords);
757 coord = lp_build_mul(coord_bld, coord, length_f);
758
759 /* itrunc == ifloor here */
760 icoord = lp_build_itrunc(coord_bld, coord);
761
762 /* clamp to [0, length - 1] */
763 icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
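/*
 * E.g. (for illustration) length = 4, coord = 1.3: the mirror yields
 * |2*(0.65 - round(0.65))| = 0.7, scaled to 2.8, so icoord = 2.
 */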
764 break;
765
766 case PIPE_TEX_WRAP_MIRROR_CLAMP:
767 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
768 if (bld->static_sampler_state->normalized_coords) {
769 /* scale coord to length */
770 coord = lp_build_mul(coord_bld, coord, length_f);
771 }
772 if (offset) {
773 offset = lp_build_int_to_float(coord_bld, offset);
774 coord = lp_build_add(coord_bld, coord, offset);
775 }
776 coord = lp_build_abs(coord_bld, coord);
777
778 /* itrunc == ifloor here */
779 icoord = lp_build_itrunc(coord_bld, coord);
780 /*
781 * Use unsigned min due to possible undef values (NaNs, overflow)
782 */
783 {
784 struct lp_build_context abs_coord_bld = *int_coord_bld;
785 abs_coord_bld.type.sign = FALSE;
786 /* clamp to [0, length - 1] */
787 icoord = lp_build_min(&abs_coord_bld, icoord, length_minus_one);
788 }
789 break;
790
791 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
792 if (bld->static_sampler_state->normalized_coords) {
793 /* scale coord to length */
794 coord = lp_build_mul(coord_bld, coord, length_f);
795 }
796 if (offset) {
797 offset = lp_build_int_to_float(coord_bld, offset);
798 coord = lp_build_add(coord_bld, coord, offset);
799 }
800 coord = lp_build_abs(coord_bld, coord);
801
802 /* itrunc == ifloor here */
803 icoord = lp_build_itrunc(coord_bld, coord);
804 break;
805
806 default:
807 assert(0);
808 icoord = NULL;
809 }
810
811 return icoord;
812 }
813
814
815 /**
816 * Do shadow test/comparison.
817 * \param p shadow ref value
818 * \param texel the texel to compare against
819 */
820 static LLVMValueRef
821 lp_build_sample_comparefunc(struct lp_build_sample_context *bld,
822 LLVMValueRef p,
823 LLVMValueRef texel)
824 {
825 struct lp_build_context *texel_bld = &bld->texel_bld;
826 LLVMValueRef res;
827
828 if (0) {
829 //lp_build_print_value(bld->gallivm, "shadow cmp coord", p);
830 lp_build_print_value(bld->gallivm, "shadow cmp texel", texel);
831 }
832
833 /* result = (p FUNC texel) ? 1 : 0 */
834 /*
835 * honor d3d10 floating point rules here, which state that comparisons
836 * are ordered except NOT_EQUAL which is unordered.
837 */
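/*
 * I.e. if either operand is NaN, the comparison yields false, except
 * for NOT_EQUAL which yields true.
 */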
838 if (bld->static_sampler_state->compare_func != PIPE_FUNC_NOTEQUAL) {
839 res = lp_build_cmp_ordered(texel_bld, bld->static_sampler_state->compare_func,
840 p, texel);
841 }
842 else {
843 res = lp_build_cmp(texel_bld, bld->static_sampler_state->compare_func,
844 p, texel);
845 }
846 return res;
847 }
848
849
850 /**
851 * Generate code to sample a mipmap level with nearest filtering.
852 * If sampling a cube texture, r = cube face in [0,5].
853 */
854 static void
855 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
856 LLVMValueRef size,
857 LLVMValueRef row_stride_vec,
858 LLVMValueRef img_stride_vec,
859 LLVMValueRef data_ptr,
860 LLVMValueRef mipoffsets,
861 const LLVMValueRef *coords,
862 const LLVMValueRef *offsets,
863 LLVMValueRef colors_out[4])
864 {
865 const unsigned dims = bld->dims;
866 LLVMValueRef width_vec;
867 LLVMValueRef height_vec;
868 LLVMValueRef depth_vec;
869 LLVMValueRef flt_size;
870 LLVMValueRef flt_width_vec;
871 LLVMValueRef flt_height_vec;
872 LLVMValueRef flt_depth_vec;
873 LLVMValueRef x, y = NULL, z = NULL;
874
875 lp_build_extract_image_sizes(bld,
876 &bld->int_size_bld,
877 bld->int_coord_type,
878 size,
879 &width_vec, &height_vec, &depth_vec);
880
881 flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
882
883 lp_build_extract_image_sizes(bld,
884 &bld->float_size_bld,
885 bld->coord_type,
886 flt_size,
887 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
888
889 /*
890 * Compute integer texcoords.
891 */
892 x = lp_build_sample_wrap_nearest(bld, coords[0], width_vec,
893 flt_width_vec, offsets[0],
894 bld->static_texture_state->pot_width,
895 bld->static_sampler_state->wrap_s);
896 lp_build_name(x, "tex.x.wrapped");
897
898 if (dims >= 2) {
899 y = lp_build_sample_wrap_nearest(bld, coords[1], height_vec,
900 flt_height_vec, offsets[1],
901 bld->static_texture_state->pot_height,
902 bld->static_sampler_state->wrap_t);
903 lp_build_name(y, "tex.y.wrapped");
904
905 if (dims == 3) {
906 z = lp_build_sample_wrap_nearest(bld, coords[2], depth_vec,
907 flt_depth_vec, offsets[2],
908 bld->static_texture_state->pot_depth,
909 bld->static_sampler_state->wrap_r);
910 lp_build_name(z, "tex.z.wrapped");
911 }
912 }
913 if (has_layer_coord(bld->static_texture_state->target)) {
914 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
915 /* add cube layer to face */
916 z = lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
917 }
918 else {
919 z = coords[2];
920 }
921 lp_build_name(z, "tex.z.layer");
922 }
923
924 /*
925 * Get texture colors.
926 */
927 lp_build_sample_texel_soa(bld,
928 width_vec, height_vec, depth_vec,
929 x, y, z,
930 row_stride_vec, img_stride_vec,
931 data_ptr, mipoffsets, colors_out);
932
933 if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
934 LLVMValueRef cmpval;
935 cmpval = lp_build_sample_comparefunc(bld, coords[4], colors_out[0]);
936 /* this is really just an AND of 1.0 with cmpval, but llvm is clever enough */
937 colors_out[0] = lp_build_select(&bld->texel_bld, cmpval,
938 bld->texel_bld.one, bld->texel_bld.zero);
939 colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
940 }
941
942 }
943
944
945 /**
946 * Like a lerp, but inputs are 0/~0 masks, so can simplify slightly.
947 */
948 static LLVMValueRef
949 lp_build_masklerp(struct lp_build_context *bld,
950 LLVMValueRef weight,
951 LLVMValueRef mask0,
952 LLVMValueRef mask1)
953 {
954 struct gallivm_state *gallivm = bld->gallivm;
955 LLVMBuilderRef builder = gallivm->builder;
956 LLVMValueRef weight2;
957
958 weight2 = lp_build_sub(bld, bld->one, weight);
959 weight = LLVMBuildBitCast(builder, weight,
960 lp_build_int_vec_type(gallivm, bld->type), "");
961 weight2 = LLVMBuildBitCast(builder, weight2,
962 lp_build_int_vec_type(gallivm, bld->type), "");
963 weight = LLVMBuildAnd(builder, weight, mask1, "");
964 weight2 = LLVMBuildAnd(builder, weight2, mask0, "");
965 weight = LLVMBuildBitCast(builder, weight, bld->vec_type, "");
966 weight2 = LLVMBuildBitCast(builder, weight2, bld->vec_type, "");
967 return lp_build_add(bld, weight, weight2);
968 }
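/*
 * In effect (for illustration) this computes
 * (mask1 ? weight : 0) + (mask0 ? 1 - weight : 0) - the bitwise AND of
 * a float with an all-ones mask just passes the float through, AND with
 * zero gives 0.0.
 */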
969
970 /**
971 * Like a 2d lerp, but inputs are 0/~0 masks, so can simplify slightly.
972 */
973 static LLVMValueRef
974 lp_build_masklerp2d(struct lp_build_context *bld,
975 LLVMValueRef weight0,
976 LLVMValueRef weight1,
977 LLVMValueRef mask00,
978 LLVMValueRef mask01,
979 LLVMValueRef mask10,
980 LLVMValueRef mask11)
981 {
982 LLVMValueRef val0 = lp_build_masklerp(bld, weight0, mask00, mask01);
983 LLVMValueRef val1 = lp_build_masklerp(bld, weight0, mask10, mask11);
984 return lp_build_lerp(bld, weight1, val0, val1, 0);
985 }
986
987 /*
988 * this is somewhat excessive code for something OpenGL just recommends
989 * but does not require.
990 */
991 #define ACCURATE_CUBE_CORNERS 1
992
993 /**
994 * Generate code to sample a mipmap level with linear filtering.
995 * If sampling a cube texture, r = cube face in [0,5].
996 * If linear_mask is present, only pixels having their mask set
997 * will receive linear filtering; the rest will use nearest.
998 */
999 static void
1000 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
1001 boolean is_gather,
1002 LLVMValueRef size,
1003 LLVMValueRef linear_mask,
1004 LLVMValueRef row_stride_vec,
1005 LLVMValueRef img_stride_vec,
1006 LLVMValueRef data_ptr,
1007 LLVMValueRef mipoffsets,
1008 const LLVMValueRef *coords,
1009 const LLVMValueRef *offsets,
1010 LLVMValueRef colors_out[4])
1011 {
1012 LLVMBuilderRef builder = bld->gallivm->builder;
1013 struct lp_build_context *ivec_bld = &bld->int_coord_bld;
1014 struct lp_build_context *coord_bld = &bld->coord_bld;
1015 struct lp_build_context *texel_bld = &bld->texel_bld;
1016 const unsigned dims = bld->dims;
1017 LLVMValueRef width_vec;
1018 LLVMValueRef height_vec;
1019 LLVMValueRef depth_vec;
1020 LLVMValueRef flt_size;
1021 LLVMValueRef flt_width_vec;
1022 LLVMValueRef flt_height_vec;
1023 LLVMValueRef flt_depth_vec;
1024 LLVMValueRef fall_off[4], have_corners;
1025 LLVMValueRef z1 = NULL;
1026 LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
1027 LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
1028 LLVMValueRef y00 = NULL, y01 = NULL, y10 = NULL, y11 = NULL;
1029 LLVMValueRef s_fpart, t_fpart = NULL, r_fpart = NULL;
1030 LLVMValueRef xs[4], ys[4], zs[4];
1031 LLVMValueRef neighbors[2][2][4];
1032 int chan, texel_index;
1033 boolean seamless_cube_filter, accurate_cube_corners;
1034 unsigned chan_swiz = bld->static_texture_state->swizzle_r;
1035
1036 if (is_gather) {
1037 switch (bld->gather_comp) {
1038 case 0: chan_swiz = bld->static_texture_state->swizzle_r; break;
1039 case 1: chan_swiz = bld->static_texture_state->swizzle_g; break;
1040 case 2: chan_swiz = bld->static_texture_state->swizzle_b; break;
1041 case 3: chan_swiz = bld->static_texture_state->swizzle_a; break;
1042 default:
1043 break;
1044 }
1045 }
1046
1047 seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
1048 bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
1049 bld->static_sampler_state->seamless_cube_map;
1050
1051 /*
1052 * Disable accurate cube corners for integer textures, which should only
1053 * get here in the gather path.
1054 */
1055 accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter &&
1056 !util_format_is_pure_integer(bld->static_texture_state->format);
1057
1058 lp_build_extract_image_sizes(bld,
1059 &bld->int_size_bld,
1060 bld->int_coord_type,
1061 size,
1062 &width_vec, &height_vec, &depth_vec);
1063
1064 flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
1065
1066 lp_build_extract_image_sizes(bld,
1067 &bld->float_size_bld,
1068 bld->coord_type,
1069 flt_size,
1070 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
1071
1072 /*
1073 * Compute integer texcoords.
1074 */
1075
1076 if (!seamless_cube_filter) {
1077 lp_build_sample_wrap_linear(bld, is_gather, coords[0], width_vec,
1078 flt_width_vec, offsets[0],
1079 bld->static_texture_state->pot_width,
1080 bld->static_sampler_state->wrap_s,
1081 &x00, &x01, &s_fpart);
1082 lp_build_name(x00, "tex.x0.wrapped");
1083 lp_build_name(x01, "tex.x1.wrapped");
1084 x10 = x00;
1085 x11 = x01;
1086
1087 if (dims >= 2) {
1088 lp_build_sample_wrap_linear(bld, is_gather, coords[1], height_vec,
1089 flt_height_vec, offsets[1],
1090 bld->static_texture_state->pot_height,
1091 bld->static_sampler_state->wrap_t,
1092 &y00, &y10, &t_fpart);
1093 lp_build_name(y00, "tex.y0.wrapped");
1094 lp_build_name(y10, "tex.y1.wrapped");
1095 y01 = y00;
1096 y11 = y10;
1097
1098 if (dims == 3) {
1099 lp_build_sample_wrap_linear(bld, is_gather, coords[2], depth_vec,
1100 flt_depth_vec, offsets[2],
1101 bld->static_texture_state->pot_depth,
1102 bld->static_sampler_state->wrap_r,
1103 &z00, &z1, &r_fpart);
1104 z01 = z10 = z11 = z00;
1105 lp_build_name(z00, "tex.z0.wrapped");
1106 lp_build_name(z1, "tex.z1.wrapped");
1107 }
1108 }
1109 if (has_layer_coord(bld->static_texture_state->target)) {
1110 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1111 /* add cube layer to face */
1112 z00 = z01 = z10 = z11 = z1 =
1113 lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
1114 }
1115 else {
1116 z00 = z01 = z10 = z11 = z1 = coords[2]; /* cube face or layer */
1117 }
1118 lp_build_name(z00, "tex.z0.layer");
1119 lp_build_name(z1, "tex.z1.layer");
1120 }
1121 }
1122 else {
1123 struct lp_build_if_state edge_if;
1124 LLVMTypeRef int1t;
1125 LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
1126 LLVMValueRef coord0, coord1, have_edge, have_corner;
1127 LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
1128 LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
1129 LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
1130 LLVMValueRef face = coords[2];
1131 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5f);
1132 LLVMValueRef length_minus_one = lp_build_sub(ivec_bld, width_vec, ivec_bld->one);
1133 /* XXX drop height calcs. Could (should) do this without seamless filtering too */
1134 height_vec = width_vec;
1135 flt_height_vec = flt_width_vec;
1136
1137 /* XXX the overflow logic is actually sort of duplicated with trilinear,
1138 * since an overflow in one mip should also have a corresponding overflow
1139 * in another.
1140 */
1141 /* should always have normalized coords, and offsets are undefined */
1142 assert(bld->static_sampler_state->normalized_coords);
1143 /*
1144 * The coords should all be in [0,1]; however, we can have NaNs,
1145 * which will wreak havoc. In particular the y1_clamped value below
1146 * can be -INT_MAX (on x86) and be propagated right through (probably
1147 * other values might be bogus in the end too).
1148 * So kill off the NaNs here.
1149 */
1150 coord0 = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
1151 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1152 coord0 = lp_build_mul(coord_bld, coord0, flt_width_vec);
1153 /* instead of clamp, build mask if overflowed */
1154 coord0 = lp_build_sub(coord_bld, coord0, half);
1155 /* convert to int, compute lerp weight */
1156 /* not ideal with AVX (and no AVX2) */
1157 lp_build_ifloor_fract(coord_bld, coord0, &x0, &s_fpart);
1158 x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
1159 coord1 = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
1160 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1161 coord1 = lp_build_mul(coord_bld, coord1, flt_height_vec);
1162 coord1 = lp_build_sub(coord_bld, coord1, half);
1163 lp_build_ifloor_fract(coord_bld, coord1, &y0, &t_fpart);
1164 y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
1165
1166 fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
1167 fall_off[1] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, x1, length_minus_one);
1168 fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
1169 fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, length_minus_one);
1170
1171 fall_off_x = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
1172 fall_off_y = lp_build_or(ivec_bld, fall_off[2], fall_off[3]);
1173 have_edge = lp_build_or(ivec_bld, fall_off_x, fall_off_y);
1174 have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_edge);
1175
1176 /* needed for accurate corner filtering branch later, rely on 0 init */
1177 int1t = LLVMInt1TypeInContext(bld->gallivm->context);
1178 have_corners = lp_build_alloca(bld->gallivm, int1t, "have_corner");
1179
1180 for (texel_index = 0; texel_index < 4; texel_index++) {
1181 xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "xs");
1182 ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "ys");
1183 zs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "zs");
1184 }
1185
1186 lp_build_if(&edge_if, bld->gallivm, have_edge);
1187
1188 have_corner = lp_build_and(ivec_bld, fall_off_x, fall_off_y);
1189 have_corner = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_corner);
1190 LLVMBuildStore(builder, have_corner, have_corners);
1191
1192 /*
1193 * Need to feed clamped values here for cheap corner handling,
1194 * but only for the y coord (when falling off both edges we only
1195 * follow the x fall-off path) - this should be sufficient.
1196 */
1197 y0_clamped = lp_build_max(ivec_bld, y0, ivec_bld->zero);
1198 y1_clamped = lp_build_min(ivec_bld, y1, length_minus_one);
1199
1200 /*
1201 * Get all possible new coords.
1202 */
1203 lp_build_cube_new_coords(ivec_bld, face,
1204 x0, x1, y0_clamped, y1_clamped,
1205 length_minus_one,
1206 new_faces, new_xcoords, new_ycoords);
1207
1208 /* handle fall off x-, x+ direction */
1209 /* determine new coords, face (the two fall_off vars cannot both be true at the same time) */
1210 x00 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][0], x0);
1211 y00 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][0], y0_clamped);
1212 x10 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][1], x0);
1213 y10 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][1], y1_clamped);
1214 x01 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][0], x1);
1215 y01 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][0], y0_clamped);
1216 x11 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][1], x1);
1217 y11 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][1], y1_clamped);
1218
1219 z00 = z10 = lp_build_select(ivec_bld, fall_off[0], new_faces[0], face);
1220 z01 = z11 = lp_build_select(ivec_bld, fall_off[1], new_faces[1], face);
1221
1222 /* handle fall off y-, y+ direction */
1223 /*
1224 * Cheap corner logic: just hack up things so a texel doesn't fall
1225 * off both sides (which means filter weights will be wrong but we'll only
1226 * use valid texels in the filter).
1227 * This means however (y) coords must additionally be clamped (see above).
1228 * This corner handling should be fully OpenGL (but not d3d10) compliant.
1229 */
1230 fall_off_ym_notxm = lp_build_andnot(ivec_bld, fall_off[2], fall_off[0]);
1231 fall_off_ym_notxp = lp_build_andnot(ivec_bld, fall_off[2], fall_off[1]);
1232 fall_off_yp_notxm = lp_build_andnot(ivec_bld, fall_off[3], fall_off[0]);
1233 fall_off_yp_notxp = lp_build_andnot(ivec_bld, fall_off[3], fall_off[1]);
1234
1235 x00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_xcoords[2][0], x00);
1236 y00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_ycoords[2][0], y00);
1237 x01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_xcoords[2][1], x01);
1238 y01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_ycoords[2][1], y01);
1239 x10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_xcoords[3][0], x10);
1240 y10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_ycoords[3][0], y10);
1241 x11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_xcoords[3][1], x11);
1242 y11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_ycoords[3][1], y11);
1243
1244 z00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_faces[2], z00);
1245 z01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_faces[2], z01);
1246 z10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_faces[3], z10);
1247 z11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_faces[3], z11);
1248
1249 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1250 /* now can add cube layer to face (per sample) */
1251 z00 = lp_build_add(ivec_bld, z00, coords[3]);
1252 z01 = lp_build_add(ivec_bld, z01, coords[3]);
1253 z10 = lp_build_add(ivec_bld, z10, coords[3]);
1254 z11 = lp_build_add(ivec_bld, z11, coords[3]);
1255 }
1256
1257 LLVMBuildStore(builder, x00, xs[0]);
1258 LLVMBuildStore(builder, x01, xs[1]);
1259 LLVMBuildStore(builder, x10, xs[2]);
1260 LLVMBuildStore(builder, x11, xs[3]);
1261 LLVMBuildStore(builder, y00, ys[0]);
1262 LLVMBuildStore(builder, y01, ys[1]);
1263 LLVMBuildStore(builder, y10, ys[2]);
1264 LLVMBuildStore(builder, y11, ys[3]);
1265 LLVMBuildStore(builder, z00, zs[0]);
1266 LLVMBuildStore(builder, z01, zs[1]);
1267 LLVMBuildStore(builder, z10, zs[2]);
1268 LLVMBuildStore(builder, z11, zs[3]);
1269
1270 lp_build_else(&edge_if);
1271
1272 LLVMBuildStore(builder, x0, xs[0]);
1273 LLVMBuildStore(builder, x1, xs[1]);
1274 LLVMBuildStore(builder, x0, xs[2]);
1275 LLVMBuildStore(builder, x1, xs[3]);
1276 LLVMBuildStore(builder, y0, ys[0]);
1277 LLVMBuildStore(builder, y0, ys[1]);
1278 LLVMBuildStore(builder, y1, ys[2]);
1279 LLVMBuildStore(builder, y1, ys[3]);
1280 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1281 LLVMValueRef cube_layer = lp_build_add(ivec_bld, face, coords[3]);
1282 LLVMBuildStore(builder, cube_layer, zs[0]);
1283 LLVMBuildStore(builder, cube_layer, zs[1]);
1284 LLVMBuildStore(builder, cube_layer, zs[2]);
1285 LLVMBuildStore(builder, cube_layer, zs[3]);
1286 }
1287 else {
1288 LLVMBuildStore(builder, face, zs[0]);
1289 LLVMBuildStore(builder, face, zs[1]);
1290 LLVMBuildStore(builder, face, zs[2]);
1291 LLVMBuildStore(builder, face, zs[3]);
1292 }
1293
1294 lp_build_endif(&edge_if);
1295
1296 x00 = LLVMBuildLoad(builder, xs[0], "");
1297 x01 = LLVMBuildLoad(builder, xs[1], "");
1298 x10 = LLVMBuildLoad(builder, xs[2], "");
1299 x11 = LLVMBuildLoad(builder, xs[3], "");
1300 y00 = LLVMBuildLoad(builder, ys[0], "");
1301 y01 = LLVMBuildLoad(builder, ys[1], "");
1302 y10 = LLVMBuildLoad(builder, ys[2], "");
1303 y11 = LLVMBuildLoad(builder, ys[3], "");
1304 z00 = LLVMBuildLoad(builder, zs[0], "");
1305 z01 = LLVMBuildLoad(builder, zs[1], "");
1306 z10 = LLVMBuildLoad(builder, zs[2], "");
1307 z11 = LLVMBuildLoad(builder, zs[3], "");
1308 }
1309
1310 if (linear_mask) {
1311 /*
1312 * Whack filter weights into place. Whatever texel had more weight is
1313 * the one which should have been selected by nearest filtering, hence
1314 * just use 100% weight for it.
1315 */
1316 struct lp_build_context *c_bld = &bld->coord_bld;
1317 LLVMValueRef w1_mask, w1_weight;
1318 LLVMValueRef half = lp_build_const_vec(bld->gallivm, c_bld->type, 0.5f);
1319
1320 w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, s_fpart, half);
1321 /* this select is really just an "and" */
1322 w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1323 s_fpart = lp_build_select(c_bld, linear_mask, s_fpart, w1_weight);
1324 if (dims >= 2) {
1325 w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, t_fpart, half);
1326 w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1327 t_fpart = lp_build_select(c_bld, linear_mask, t_fpart, w1_weight);
1328 if (dims == 3) {
1329 w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, r_fpart, half);
1330 w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1331 r_fpart = lp_build_select(c_bld, linear_mask, r_fpart, w1_weight);
1332 }
1333 }
1334 }
1335
1336 /*
1337 * Get texture colors.
1338 */
1339 /* get x0/x1 texels */
1340 lp_build_sample_texel_soa(bld,
1341 width_vec, height_vec, depth_vec,
1342 x00, y00, z00,
1343 row_stride_vec, img_stride_vec,
1344 data_ptr, mipoffsets, neighbors[0][0]);
1345 lp_build_sample_texel_soa(bld,
1346 width_vec, height_vec, depth_vec,
1347 x01, y01, z01,
1348 row_stride_vec, img_stride_vec,
1349 data_ptr, mipoffsets, neighbors[0][1]);
1350
1351 if (dims == 1) {
1352 assert(!is_gather);
1353 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1354 /* Interpolate two samples from 1D image to produce one color */
1355 for (chan = 0; chan < 4; chan++) {
1356 colors_out[chan] = lp_build_lerp(texel_bld, s_fpart,
1357 neighbors[0][0][chan],
1358 neighbors[0][1][chan],
1359 0);
1360 }
1361 }
1362 else {
1363 LLVMValueRef cmpval0, cmpval1;
1364 cmpval0 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1365 cmpval1 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1366 /* simplified lerp, AND mask with weight and add */
1367 colors_out[0] = lp_build_masklerp(texel_bld, s_fpart,
1368 cmpval0, cmpval1);
1369 colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1370 }
1371 }
1372 else {
1373 /* 2D/3D texture */
1374 struct lp_build_if_state corner_if;
1375 LLVMValueRef colors0[4], colorss[4];
1376
1377 /* get x0/x1 texels at y1 */
1378 lp_build_sample_texel_soa(bld,
1379 width_vec, height_vec, depth_vec,
1380 x10, y10, z10,
1381 row_stride_vec, img_stride_vec,
1382 data_ptr, mipoffsets, neighbors[1][0]);
1383 lp_build_sample_texel_soa(bld,
1384 width_vec, height_vec, depth_vec,
1385 x11, y11, z11,
1386 row_stride_vec, img_stride_vec,
1387 data_ptr, mipoffsets, neighbors[1][1]);
1388
1389 /*
1390 * To avoid having to duplicate the linear_mask / fetch code, use
1391 * another branch here (conditioned on corners, though the edge
1392 * condition would work as well).
1393 */
1394 if (accurate_cube_corners) {
1395 LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f;
1396 LLVMValueRef have_corner, one_third;
1397
1398 colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs0");
1399 colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs1");
1400 colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs2");
1401 colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs3");
1402
1403 have_corner = LLVMBuildLoad(builder, have_corners, "");
1404
1405 lp_build_if(&corner_if, bld->gallivm, have_corner);
1406
1407 one_third = lp_build_const_vec(bld->gallivm, coord_bld->type,
1408 1.0f/3.0f);
1409
1410 /* find corner */
1411 c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
1412 c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
1413 c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
1414 c01f = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
1415 c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
1416 c10f = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
1417 c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
1418 c11f = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
1419
1420 if (!is_gather) {
1421 /*
1422 * we can't use standard 2d lerp as we need per-element weight
1423 * in case of corners, so just calculate bilinear result as
1424 * w00*s00 + w01*s01 + w10*s10 + w11*s11.
1425 * (This is actually less work than using 2d lerp, 7 vs. 9
1426 * instructions; however, calculating the weights needs another 6,
1427 * so it's probably only not slower than 2d lerp when doing all
1428 * 4 channels, as the weights only need to be calculated once - of
1429 * course fixing up the weights has additional cost.)
1430 */
1431 LLVMValueRef w00, w01, w10, w11, wx0, wy0, c_weight, tmp;
1432 wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
1433 wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
1434 w00 = lp_build_mul(coord_bld, wx0, wy0);
1435 w01 = lp_build_mul(coord_bld, s_fpart, wy0);
1436 w10 = lp_build_mul(coord_bld, wx0, t_fpart);
1437 w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
1438
1439 /* find corner weight */
1440 c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
1441 c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
1442 c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
1443 c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
1444
1445 /*
1446 * add 1/3 of the corner weight to the weight of the 3 other
1447 * samples and null out corner weight.
1448 */
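/*
 * E.g. if c00 is the corner: w01 += w00/3, w10 += w00/3, w11 += w00/3,
 * then w00 = 0, keeping the total weight w00 + w01 + w10 + w11
 * unchanged at 1.0.
 */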
1449 c_weight = lp_build_mul(coord_bld, c_weight, one_third);
1450 w00 = lp_build_add(coord_bld, w00, c_weight);
1451 w00 = lp_build_andnot(coord_bld, w00, c00f);
1452 w01 = lp_build_add(coord_bld, w01, c_weight);
1453 w01 = lp_build_andnot(coord_bld, w01, c01f);
1454 w10 = lp_build_add(coord_bld, w10, c_weight);
1455 w10 = lp_build_andnot(coord_bld, w10, c10f);
1456 w11 = lp_build_add(coord_bld, w11, c_weight);
1457 w11 = lp_build_andnot(coord_bld, w11, c11f);
1458
1459 if (bld->static_sampler_state->compare_mode ==
1460 PIPE_TEX_COMPARE_NONE) {
1461 for (chan = 0; chan < 4; chan++) {
1462 colors0[chan] = lp_build_mul(coord_bld, w00,
1463 neighbors[0][0][chan]);
1464 tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
1465 colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1466 tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
1467 colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1468 tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
1469 colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1470 }
1471 }
1472 else {
1473 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1474 cmpval00 = lp_build_sample_comparefunc(bld, coords[4],
1475 neighbors[0][0][0]);
1476 cmpval01 = lp_build_sample_comparefunc(bld, coords[4],
1477 neighbors[0][1][0]);
1478 cmpval10 = lp_build_sample_comparefunc(bld, coords[4],
1479 neighbors[1][0][0]);
1480 cmpval11 = lp_build_sample_comparefunc(bld, coords[4],
1481 neighbors[1][1][0]);
1482 /*
1483 * inputs to interpolation are just masks so just add
1484 * masked weights together
1485 */
1486 cmpval00 = LLVMBuildBitCast(builder, cmpval00,
1487 coord_bld->vec_type, "");
1488 cmpval01 = LLVMBuildBitCast(builder, cmpval01,
1489 coord_bld->vec_type, "");
1490 cmpval10 = LLVMBuildBitCast(builder, cmpval10,
1491 coord_bld->vec_type, "");
1492 cmpval11 = LLVMBuildBitCast(builder, cmpval11,
1493 coord_bld->vec_type, "");
1494 colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
1495 tmp = lp_build_and(coord_bld, w01, cmpval01);
1496 colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1497 tmp = lp_build_and(coord_bld, w10, cmpval10);
1498 colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1499 tmp = lp_build_and(coord_bld, w11, cmpval11);
1500 colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1501 colors0[1] = colors0[2] = colors0[3] = colors0[0];
1502 }
1503 }
1504 else {
1505 /*
1506 * We don't have any weights to adjust, so instead calculate
1507 * the fourth texel as simply the average of the other 3.
1508 * (This would work for non-gather too, however we'd have
1509 * a boatload more of the select stuff due to there being
1510 * 4 times as many colors as weights.)
1511 */
1512 LLVMValueRef col00, col01, col10, col11;
1513 LLVMValueRef colc, colc0, colc1;
1514 col10 = lp_build_swizzle_soa_channel(texel_bld,
1515 neighbors[1][0], chan_swiz);
1516 col11 = lp_build_swizzle_soa_channel(texel_bld,
1517 neighbors[1][1], chan_swiz);
1518 col01 = lp_build_swizzle_soa_channel(texel_bld,
1519 neighbors[0][1], chan_swiz);
1520 col00 = lp_build_swizzle_soa_channel(texel_bld,
1521 neighbors[0][0], chan_swiz);
1522
1523 /*
1524 * The spec says that for comparison filtering the comparison
1525 * must happen before synthesizing the new value.
1526 * This means all gathered values are always 0 or 1, except
1527 * for the non-existing texel, which can be 0, 1/3, 2/3 or 1.
1528 * It seems we'd be allowed to just return 0 or 1 there too, so
1529 * we could simplify by passing the compare mask values down to
1530 * the end (using int arithmetic/compares on the mask values to
1531 * construct the fourth texel) and only converting to floats
1532 * there, but it's probably not worth it (it might be easier on
1533 * the cpu but not on the code)...
1534 */
1535 if (bld->static_sampler_state->compare_mode !=
1536 PIPE_TEX_COMPARE_NONE) {
1537 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1538 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], col00);
1539 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], col01);
1540 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], col10);
1541 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], col11);
1542 col00 = lp_build_select(texel_bld, cmpval00,
1543 texel_bld->one, texel_bld->zero);
1544 col01 = lp_build_select(texel_bld, cmpval01,
1545 texel_bld->one, texel_bld->zero);
1546 col10 = lp_build_select(texel_bld, cmpval10,
1547 texel_bld->one, texel_bld->zero);
1548 col11 = lp_build_select(texel_bld, cmpval11,
1549 texel_bld->one, texel_bld->zero);
1550 }
1551
1552 /*
1553 * Null out corner color.
1554 */
1555 col00 = lp_build_andnot(coord_bld, col00, c00f);
1556 col01 = lp_build_andnot(coord_bld, col01, c01f);
1557 col10 = lp_build_andnot(coord_bld, col10, c10f);
1558 col11 = lp_build_andnot(coord_bld, col11, c11f);
1559
1560 /*
1561 * The new corner texel color is the sum of all colors divided by 3.
1562 */
1563 colc0 = lp_build_add(coord_bld, col00, col01);
1564 colc1 = lp_build_add(coord_bld, col10, col11);
1565 colc = lp_build_add(coord_bld, colc0, colc1);
1566 colc = lp_build_mul(coord_bld, one_third, colc);
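/*
 * Note this relies on the corner color having been nulled above:
 * summing all four colors and multiplying by 1/3 then yields exactly
 * the average of the three valid texels.
 */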
1567
1568 /*
1569 * Replace the corner texel color with the new value.
1570 */
1571 col00 = lp_build_select(coord_bld, c00, colc, col00);
1572 col01 = lp_build_select(coord_bld, c01, colc, col01);
1573 col10 = lp_build_select(coord_bld, c10, colc, col10);
1574 col11 = lp_build_select(coord_bld, c11, colc, col11);
1575
1576 colors0[0] = col10;
1577 colors0[1] = col11;
1578 colors0[2] = col01;
1579 colors0[3] = col00;
1580 }
1581
1582 LLVMBuildStore(builder, colors0[0], colorss[0]);
1583 LLVMBuildStore(builder, colors0[1], colorss[1]);
1584 LLVMBuildStore(builder, colors0[2], colorss[2]);
1585 LLVMBuildStore(builder, colors0[3], colorss[3]);
1586
1587 lp_build_else(&corner_if);
1588 }
1589
1590 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1591 if (is_gather) {
1592 /*
1593 * Just assign the red channel (no component selection yet).
1594 * This is a bit hackish; we usually do the swizzle at the
1595 * end of sampling (far fewer values to swizzle), but that
1596 * obviously cannot work when using gather.
1597 */
1598 colors0[0] = lp_build_swizzle_soa_channel(texel_bld,
1599 neighbors[1][0],
1600 chan_swiz);
1601 colors0[1] = lp_build_swizzle_soa_channel(texel_bld,
1602 neighbors[1][1],
1603 chan_swiz);
1604 colors0[2] = lp_build_swizzle_soa_channel(texel_bld,
1605 neighbors[0][1],
1606 chan_swiz);
1607 colors0[3] = lp_build_swizzle_soa_channel(texel_bld,
1608 neighbors[0][0],
1609 chan_swiz);
1610 }
1611 else {
1612 /* Bilinearly interpolate the four samples from the 2D image / 3D slice */
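/*
 * Per channel, lp_build_lerp_2d computes roughly (scalar sketch,
 * illustrative only):
 *
 *    top    = c00 + s_fpart * (c01 - c00);
 *    bot    = c10 + s_fpart * (c11 - c10);
 *    result = top + t_fpart * (bot - top);
 */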
1613 for (chan = 0; chan < 4; chan++) {
1614 colors0[chan] = lp_build_lerp_2d(texel_bld,
1615 s_fpart, t_fpart,
1616 neighbors[0][0][chan],
1617 neighbors[0][1][chan],
1618 neighbors[1][0][chan],
1619 neighbors[1][1][chan],
1620 0);
1621 }
1622 }
1623 }
1624 else {
1625 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1626 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1627 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1628 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1629 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1630
1631 if (is_gather) {
1632 /* more hacks for swizzling, should be X, ONE or ZERO... */
1633 colors0[0] = lp_build_select(texel_bld, cmpval10,
1634 texel_bld->one, texel_bld->zero);
1635 colors0[1] = lp_build_select(texel_bld, cmpval11,
1636 texel_bld->one, texel_bld->zero);
1637 colors0[2] = lp_build_select(texel_bld, cmpval01,
1638 texel_bld->one, texel_bld->zero);
1639 colors0[3] = lp_build_select(texel_bld, cmpval00,
1640 texel_bld->one, texel_bld->zero);
1641 }
1642 else {
1643 colors0[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1644 cmpval00, cmpval01, cmpval10, cmpval11);
1645 colors0[1] = colors0[2] = colors0[3] = colors0[0];
1646 }
1647 }
1648
1649 if (accurate_cube_corners) {
1650 LLVMBuildStore(builder, colors0[0], colorss[0]);
1651 LLVMBuildStore(builder, colors0[1], colorss[1]);
1652 LLVMBuildStore(builder, colors0[2], colorss[2]);
1653 LLVMBuildStore(builder, colors0[3], colorss[3]);
1654
1655 lp_build_endif(&corner_if);
1656
1657 colors0[0] = LLVMBuildLoad(builder, colorss[0], "");
1658 colors0[1] = LLVMBuildLoad(builder, colorss[1], "");
1659 colors0[2] = LLVMBuildLoad(builder, colorss[2], "");
1660 colors0[3] = LLVMBuildLoad(builder, colorss[3], "");
1661 }
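/*
 * Note the store/load pairs around the corner_if above emulate phi
 * nodes: lp_build_if()/lp_build_else() emit plain branches, so values
 * computed in either arm are passed through stack allocas (which
 * llvm's mem2reg pass can promote back to SSA values).
 */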
1662
1663 if (dims == 3) {
1664 LLVMValueRef neighbors1[2][2][4];
1665 LLVMValueRef colors1[4];
1666
1667 assert(!is_gather);
1668
1669 /* get x0/x1/y0/y1 texels at z1 */
1670 lp_build_sample_texel_soa(bld,
1671 width_vec, height_vec, depth_vec,
1672 x00, y00, z1,
1673 row_stride_vec, img_stride_vec,
1674 data_ptr, mipoffsets, neighbors1[0][0]);
1675 lp_build_sample_texel_soa(bld,
1676 width_vec, height_vec, depth_vec,
1677 x01, y01, z1,
1678 row_stride_vec, img_stride_vec,
1679 data_ptr, mipoffsets, neighbors1[0][1]);
1680 lp_build_sample_texel_soa(bld,
1681 width_vec, height_vec, depth_vec,
1682 x10, y10, z1,
1683 row_stride_vec, img_stride_vec,
1684 data_ptr, mipoffsets, neighbors1[1][0]);
1685 lp_build_sample_texel_soa(bld,
1686 width_vec, height_vec, depth_vec,
1687 x11, y11, z1,
1688 row_stride_vec, img_stride_vec,
1689 data_ptr, mipoffsets, neighbors1[1][1]);
1690
1691 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1692 /* Bilinearly interpolate the four samples from the second Z slice */
1693 for (chan = 0; chan < 4; chan++) {
1694 colors1[chan] = lp_build_lerp_2d(texel_bld,
1695 s_fpart, t_fpart,
1696 neighbors1[0][0][chan],
1697 neighbors1[0][1][chan],
1698 neighbors1[1][0][chan],
1699 neighbors1[1][1][chan],
1700 0);
1701 }
1702 /* Linearly interpolate the two samples from the two 3D slices */
1703 for (chan = 0; chan < 4; chan++) {
1704 colors_out[chan] = lp_build_lerp(texel_bld,
1705 r_fpart,
1706 colors0[chan], colors1[chan],
1707 0);
1708 }
1709 }
1710 else {
1711 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1712 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[0][0][0]);
1713 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[0][1][0]);
1714 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[1][0][0]);
1715 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[1][1][0]);
1716 colors1[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1717 cmpval00, cmpval01, cmpval10, cmpval11);
1718 /* Linearly interpolate the two samples from the two 3D slices */
1719 colors_out[0] = lp_build_lerp(texel_bld,
1720 r_fpart,
1721 colors0[0], colors1[0],
1722 0);
1723 colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1724 }
1725 }
1726 else {
1727 /* 2D tex */
1728 for (chan = 0; chan < 4; chan++) {
1729 colors_out[chan] = colors0[chan];
1730 }
1731 }
1732 }
1733 if (is_gather) {
1734 /*
1735 * For gather, we can't do our usual channel swizzling later,
1736 * so do it here. It only really matters for 0/1 swizzles with
1737 * comparison filtering, since there the results would be wrong;
1738 * without comparison it should all work out anyway, but it can't
1739 * hurt to do it here since it instantly drops all the calculations
1740 * above (even though doing a gather on a channel which will always
1741 * return 0 or 1 is a rather questionable idea in any case)...
1742 */
1743 if (chan_swiz == PIPE_SWIZZLE_1) {
1744 for (chan = 0; chan < 4; chan++) {
1745 colors_out[chan] = texel_bld->one;
1746 }
1747 } else if (chan_swiz == PIPE_SWIZZLE_0) {
1748 for (chan = 0; chan < 4; chan++) {
1749 colors_out[chan] = texel_bld->zero;
1750 }
1751 }
1752 }
1753 }
1754
1755
1756 /**
1757 * Sample the texture/mipmap using given image filter and mip filter.
1758 * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1759 * from (vectors or scalars).
1760 * If we're using nearest miplevel sampling the '1' values will be null/unused.
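 *
 * Roughly, per channel (scalar sketch, illustrative only):
 *
 *    c = sample_image(img_filter, ilevel0, coords);
 *    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR && lod_fpart > 0.0f)
 *       c = lerp(lod_fpart, c, sample_image(img_filter, ilevel1, coords));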
1761 */
1762 static void
1763 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
1764 unsigned img_filter,
1765 unsigned mip_filter,
1766 boolean is_gather,
1767 const LLVMValueRef *coords,
1768 const LLVMValueRef *offsets,
1769 LLVMValueRef ilevel0,
1770 LLVMValueRef ilevel1,
1771 LLVMValueRef lod_fpart,
1772 LLVMValueRef *colors_out)
1773 {
1774 LLVMBuilderRef builder = bld->gallivm->builder;
1775 LLVMValueRef size0 = NULL;
1776 LLVMValueRef size1 = NULL;
1777 LLVMValueRef row_stride0_vec = NULL;
1778 LLVMValueRef row_stride1_vec = NULL;
1779 LLVMValueRef img_stride0_vec = NULL;
1780 LLVMValueRef img_stride1_vec = NULL;
1781 LLVMValueRef data_ptr0 = NULL;
1782 LLVMValueRef data_ptr1 = NULL;
1783 LLVMValueRef mipoff0 = NULL;
1784 LLVMValueRef mipoff1 = NULL;
1785 LLVMValueRef colors0[4], colors1[4];
1786 unsigned chan;
1787
1788 /* sample the first mipmap level */
1789 lp_build_mipmap_level_sizes(bld, ilevel0,
1790 &size0,
1791 &row_stride0_vec, &img_stride0_vec);
1792 if (bld->num_mips == 1) {
1793 data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1794 }
1795 else {
1796 /* This path should work for num_lods 1 too, but is slightly less efficient */
1797 data_ptr0 = bld->base_ptr;
1798 mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1799 }
1800 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1801 lp_build_sample_image_nearest(bld, size0,
1802 row_stride0_vec, img_stride0_vec,
1803 data_ptr0, mipoff0, coords, offsets,
1804 colors0);
1805 }
1806 else {
1807 assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1808 lp_build_sample_image_linear(bld, is_gather, size0, NULL,
1809 row_stride0_vec, img_stride0_vec,
1810 data_ptr0, mipoff0, coords, offsets,
1811 colors0);
1812 }
1813
1814 /* Store the first level's colors in the output variables */
1815 for (chan = 0; chan < 4; chan++) {
1816 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1817 }
1818
1819 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1820 struct lp_build_if_state if_ctx;
1821 LLVMValueRef need_lerp;
1822
1823 /* need_lerp = lod_fpart > 0 */
1824 if (bld->num_lods == 1) {
1825 need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
1826 lod_fpart, bld->lodf_bld.zero,
1827 "need_lerp");
1828 }
1829 else {
1830 /*
1831 * We'll do mip filtering if any of the quads (or individual
1832 * pixels in case of per-pixel lod) need it.
1833 * It might be better to split the vectors here and only fetch/filter
1834 * quads which need it (if there's one lod per quad).
1835 */
1836 need_lerp = lp_build_compare(bld->gallivm, bld->lodf_bld.type,
1837 PIPE_FUNC_GREATER,
1838 lod_fpart, bld->lodf_bld.zero);
1839 need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, need_lerp);
1840 lp_build_name(need_lerp, "need_lerp");
1841 }
1842
1843 lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1844 {
1845 /*
1846 * We unfortunately need to clamp lod_fpart here since we can get
1847 * negative values which would screw up filtering if not all
1848 * lod_fpart values have the same sign.
1849 */
1850 lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1851 bld->lodf_bld.zero);
1852 /* sample the second mipmap level */
1853 lp_build_mipmap_level_sizes(bld, ilevel1,
1854 &size1,
1855 &row_stride1_vec, &img_stride1_vec);
1856 if (bld->num_mips == 1) {
1857 data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1858 }
1859 else {
1860 data_ptr1 = bld->base_ptr;
1861 mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1862 }
1863 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1864 lp_build_sample_image_nearest(bld, size1,
1865 row_stride1_vec, img_stride1_vec,
1866 data_ptr1, mipoff1, coords, offsets,
1867 colors1);
1868 }
1869 else {
1870 lp_build_sample_image_linear(bld, FALSE, size1, NULL,
1871 row_stride1_vec, img_stride1_vec,
1872 data_ptr1, mipoff1, coords, offsets,
1873 colors1);
1874 }
1875
1876 /* interpolate samples from the two mipmap levels */
1877
1878 if (bld->num_lods != bld->coord_type.length)
1879 lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1880 bld->lodf_bld.type,
1881 bld->texel_bld.type,
1882 lod_fpart);
1883
1884 for (chan = 0; chan < 4; chan++) {
1885 colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
1886 colors0[chan], colors1[chan],
1887 0);
1888 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1889 }
1890 }
1891 lp_build_endif(&if_ctx);
1892 }
1893 }
1894
1895
1896 /**
1897 * Sample the texture/mipmap using given mip filter, and using
1898 * both nearest and linear filtering at the same time depending
1899 * on linear_mask.
1900 * lod can be per quad but linear_mask is always per pixel.
1901 * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1902 * from (vectors or scalars).
1903 * If we're using nearest miplevel sampling the '1' values will be null/unused.
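 *
 * Roughly (illustrative): the linear sampling path is used for all
 * pixels, with linear_mask passed down to lp_build_sample_image_linear
 * so it can adjust the filter weights for lanes where the mask is
 * false, making those lanes effectively get nearest filtering.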
1904 */
1905 static void
1906 lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
1907 LLVMValueRef linear_mask,
1908 unsigned mip_filter,
1909 const LLVMValueRef *coords,
1910 const LLVMValueRef *offsets,
1911 LLVMValueRef ilevel0,
1912 LLVMValueRef ilevel1,
1913 LLVMValueRef lod_fpart,
1914 LLVMValueRef lod_positive,
1915 LLVMValueRef *colors_out)
1916 {
1917 LLVMBuilderRef builder = bld->gallivm->builder;
1918 LLVMValueRef size0 = NULL;
1919 LLVMValueRef size1 = NULL;
1920 LLVMValueRef row_stride0_vec = NULL;
1921 LLVMValueRef row_stride1_vec = NULL;
1922 LLVMValueRef img_stride0_vec = NULL;
1923 LLVMValueRef img_stride1_vec = NULL;
1924 LLVMValueRef data_ptr0 = NULL;
1925 LLVMValueRef data_ptr1 = NULL;
1926 LLVMValueRef mipoff0 = NULL;
1927 LLVMValueRef mipoff1 = NULL;
1928 LLVMValueRef colors0[4], colors1[4];
1929 unsigned chan;
1930
1931 /* sample the first mipmap level */
1932 lp_build_mipmap_level_sizes(bld, ilevel0,
1933 &size0,
1934 &row_stride0_vec, &img_stride0_vec);
1935 if (bld->num_mips == 1) {
1936 data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1937 }
1938 else {
1939 /* This path should work for num_lods 1 too, but is slightly less efficient */
1940 data_ptr0 = bld->base_ptr;
1941 mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1942 }
1943
1944 lp_build_sample_image_linear(bld, FALSE, size0, linear_mask,
1945 row_stride0_vec, img_stride0_vec,
1946 data_ptr0, mipoff0, coords, offsets,
1947 colors0);
1948
1949 /* Store the first level's colors in the output variables */
1950 for (chan = 0; chan < 4; chan++) {
1951 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1952 }
1953
1954 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1955 struct lp_build_if_state if_ctx;
1956 LLVMValueRef need_lerp;
1957
1958 /*
1959 * We'll do mip filtering if any of the quads (or individual
1960 * pixels in case of per-pixel lod) need it.
1961 * Note we use lod_positive here, not lod_fpart, since it may be the
1962 * same condition as the one used in the outer "if" in the caller,
1963 * hence llvm should be able to merge the branches in this case.
1964 */
1965 need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_positive);
1966 lp_build_name(need_lerp, "need_lerp");
1967
1968 lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1969 {
1970 /*
1971 * We unfortunately need to clamp lod_fpart here since we can get
1972 * negative values which would screw up filtering if not all
1973 * lod_fpart values have the same sign.
1974 */
1975 lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1976 bld->lodf_bld.zero);
1977 /* sample the second mipmap level */
1978 lp_build_mipmap_level_sizes(bld, ilevel1,
1979 &size1,
1980 &row_stride1_vec, &img_stride1_vec);
1981 if (bld->num_mips == 1) {
1982 data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1983 }
1984 else {
1985 data_ptr1 = bld->base_ptr;
1986 mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1987 }
1988
1989 lp_build_sample_image_linear(bld, FALSE, size1, linear_mask,
1990 row_stride1_vec, img_stride1_vec,
1991 data_ptr1, mipoff1, coords, offsets,
1992 colors1);
1993
1994 /* interpolate samples from the two mipmap levels */
1995
1996 if (bld->num_lods != bld->coord_type.length)
1997 lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1998 bld->lodf_bld.type,
1999 bld->texel_bld.type,
2000 lod_fpart);
2001
2002 for (chan = 0; chan < 4; chan++) {
2003 colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
2004 colors0[chan], colors1[chan],
2005 0);
2006 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
2007 }
2008 }
2009 lp_build_endif(&if_ctx);
2010 }
2011 }
2012
2013
2014 /**
2015 * Build (per-coord) layer value.
2016 * Either clamp the layer to valid values, or fill in the optional
2017 * out_of_bounds mask and return the value unclamped.
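 *
 * Scalar sketch (illustrative):
 *
 *    if (out_of_bounds)      // just flag, return layer unclamped
 *       *out_of_bounds = layer < 0 || layer >= num_layers;
 *    else                    // clamp to the last accessible layer
 *       layer = CLAMP(layer, 0, num_layers - (is_cube_array ? 6 : 1));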
2018 */
2019 static LLVMValueRef
2020 lp_build_layer_coord(struct lp_build_sample_context *bld,
2021 unsigned texture_unit,
2022 boolean is_cube_array,
2023 LLVMValueRef layer,
2024 LLVMValueRef *out_of_bounds)
2025 {
2026 LLVMValueRef num_layers;
2027 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2028
2029 num_layers = bld->dynamic_state->depth(bld->dynamic_state, bld->gallivm,
2030 bld->context_ptr, texture_unit);
2031
2032 if (out_of_bounds) {
2033 LLVMValueRef out1, out;
2034 assert(!is_cube_array);
2035 num_layers = lp_build_broadcast_scalar(int_coord_bld, num_layers);
2036 out = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, layer, int_coord_bld->zero);
2037 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, layer, num_layers);
2038 *out_of_bounds = lp_build_or(int_coord_bld, out, out1);
2039 return layer;
2040 }
2041 else {
2042 LLVMValueRef maxlayer;
2043 LLVMValueRef s = is_cube_array ? lp_build_const_int32(bld->gallivm, 6) :
2044 bld->int_bld.one;
2045 maxlayer = lp_build_sub(&bld->int_bld, num_layers, s);
2046 maxlayer = lp_build_broadcast_scalar(int_coord_bld, maxlayer);
2047 return lp_build_clamp(int_coord_bld, layer, int_coord_bld->zero, maxlayer);
2048 }
2049 }
2050
2051
2052 /**
2053 * Calculate cube face, lod, mip levels.
2054 */
2055 static void
2056 lp_build_sample_common(struct lp_build_sample_context *bld,
2057 boolean is_lodq,
2058 unsigned texture_index,
2059 unsigned sampler_index,
2060 LLVMValueRef *coords,
2061 const struct lp_derivatives *derivs, /* optional */
2062 LLVMValueRef lod_bias, /* optional */
2063 LLVMValueRef explicit_lod, /* optional */
2064 LLVMValueRef *lod_pos_or_zero,
2065 LLVMValueRef *lod,
2066 LLVMValueRef *lod_fpart,
2067 LLVMValueRef *ilevel0,
2068 LLVMValueRef *ilevel1)
2069 {
2070 const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
2071 const unsigned min_filter = bld->static_sampler_state->min_img_filter;
2072 const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
2073 const unsigned target = bld->static_texture_state->target;
2074 LLVMValueRef first_level, cube_rho = NULL;
2075 LLVMValueRef lod_ipart = NULL;
2076 struct lp_derivatives cube_derivs;
2077
2078 /*
2079 printf("%s mip %d min %d mag %d\n", __FUNCTION__,
2080 mip_filter, min_filter, mag_filter);
2081 */
2082
2083 /*
2084 * Choose cube face, recompute texcoords for the chosen face and
2085 * compute rho here too (as it requires transform of derivatives).
2086 */
2087 if (target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY) {
2088 boolean need_derivs;
2089 need_derivs = ((min_filter != mag_filter ||
2090 mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
2091 !bld->static_sampler_state->min_max_lod_equal &&
2092 !explicit_lod);
2093 lp_build_cube_lookup(bld, coords, derivs, &cube_rho, &cube_derivs, need_derivs);
2094 derivs = &cube_derivs;
2095 if (target == PIPE_TEXTURE_CUBE_ARRAY) {
2096 /* calculate cube layer coord now */
2097 LLVMValueRef layer = lp_build_iround(&bld->coord_bld, coords[3]);
2098 LLVMValueRef six = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 6);
2099 layer = lp_build_mul(&bld->int_coord_bld, layer, six);
2100 coords[3] = lp_build_layer_coord(bld, texture_index, TRUE, layer, NULL);
2101 /* because of seamless filtering we can't add it to the face (coords[2]) here. */
2102 }
2103 }
2104 else if (target == PIPE_TEXTURE_1D_ARRAY ||
2105 target == PIPE_TEXTURE_2D_ARRAY) {
2106 coords[2] = lp_build_iround(&bld->coord_bld, coords[2]);
2107 coords[2] = lp_build_layer_coord(bld, texture_index, FALSE, coords[2], NULL);
2108 }
2109
2110 if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
2111 /*
2112 * Clamp p coords to [0,1] for fixed-function depth texture formats here.
2113 * Technically this is not entirely correct for unorm depth, as the ref value
2114 * should be converted to the depth format (quantization!) and the comparison
2115 * then done in texture format. That would actually help performance (it only
2116 * needs to be done once, saving the per-sample conversion of texels to
2117 * floats), but it would need messier code (at least some bits would have to
2118 * be pushed down to the actual fetch so the conversion could be skipped,
2119 * and it would interact badly with border color, which would need to be
2120 * converted to that format too, or other tricks to make it work).
2121 */
2122 const struct util_format_description *format_desc = bld->format_desc;
2123 unsigned chan_type;
2124 /* not entirely sure we couldn't end up with an invalid swizzle here */
2125 chan_type = format_desc->swizzle[0] <= PIPE_SWIZZLE_W ?
2126 format_desc->channel[format_desc->swizzle[0]].type :
2127 UTIL_FORMAT_TYPE_FLOAT;
2128 if (chan_type != UTIL_FORMAT_TYPE_FLOAT) {
2129 coords[4] = lp_build_clamp(&bld->coord_bld, coords[4],
2130 bld->coord_bld.zero, bld->coord_bld.one);
2131 }
2132 }
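   /*
    * For example (illustrative): with Z16_UNORM the texels decode to
    * [0,1], so a shadow ref of e.g. 1.5 must be clamped to 1.0, otherwise
    * the result would differ from comparing in the texture format.
    */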
2133
2134 /*
2135 * Compute the level of detail (float).
2136 */
2137 if (min_filter != mag_filter ||
2138 mip_filter != PIPE_TEX_MIPFILTER_NONE || is_lodq) {
2139 /* Need to compute lod either to choose mipmap levels or to
2140 * distinguish between minification/magnification with one mipmap level.
2141 */
2142 lp_build_lod_selector(bld, is_lodq, texture_index, sampler_index,
2143 coords[0], coords[1], coords[2], cube_rho,
2144 derivs, lod_bias, explicit_lod,
2145 mip_filter, lod,
2146 &lod_ipart, lod_fpart, lod_pos_or_zero);
2147 if (is_lodq) {
2148 LLVMValueRef last_level;
2149 last_level = bld->dynamic_state->last_level(bld->dynamic_state,
2150 bld->gallivm,
2151 bld->context_ptr,
2152 texture_index);
2153 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2154 bld->gallivm,
2155 bld->context_ptr,
2156 texture_index);
2157 last_level = lp_build_sub(&bld->int_bld, last_level, first_level);
2158 last_level = lp_build_int_to_float(&bld->float_bld, last_level);
2159 last_level = lp_build_broadcast_scalar(&bld->lodf_bld, last_level);
2160
2161 switch (mip_filter) {
2162 case PIPE_TEX_MIPFILTER_NONE:
2163 *lod_fpart = bld->lodf_bld.zero;
2164 break;
2165 case PIPE_TEX_MIPFILTER_NEAREST:
2166 *lod_fpart = lp_build_round(&bld->lodf_bld, *lod_fpart);
2167 /* fallthrough */
2168 case PIPE_TEX_MIPFILTER_LINEAR:
2169 *lod_fpart = lp_build_clamp(&bld->lodf_bld, *lod_fpart,
2170 bld->lodf_bld.zero, last_level);
2171 break;
2172 }
2173 return;
2174 }
2175
2176 } else {
2177 lod_ipart = bld->lodi_bld.zero;
2178 *lod_pos_or_zero = bld->lodi_bld.zero;
2179 }
2180
2181 if (bld->num_lods != bld->num_mips) {
2182 /* only makes sense if there's just a single mip level */
2183 assert(bld->num_mips == 1);
2184 lod_ipart = lp_build_extract_range(bld->gallivm, lod_ipart, 0, 1);
2185 }
2186
2187 /*
2188 * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
2189 */
2190 switch (mip_filter) {
2191 default:
2192 assert(0 && "bad mip_filter value in lp_build_sample_soa()");
2193 /* fall-through */
2194 case PIPE_TEX_MIPFILTER_NONE:
2195 /* always use mip level 0 */
2196 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2197 bld->gallivm, bld->context_ptr,
2198 texture_index);
2199 first_level = lp_build_broadcast_scalar(&bld->leveli_bld, first_level);
2200 *ilevel0 = first_level;
2201 break;
2202 case PIPE_TEX_MIPFILTER_NEAREST:
2203 assert(lod_ipart);
2204 lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
2205 break;
2206 case PIPE_TEX_MIPFILTER_LINEAR:
2207 assert(lod_ipart);
2208 assert(*lod_fpart);
2209 lp_build_linear_mip_levels(bld, texture_index,
2210 lod_ipart, lod_fpart,
2211 ilevel0, ilevel1);
2212 break;
2213 }
2214 }
2215
2216 static void
2217 lp_build_clamp_border_color(struct lp_build_sample_context *bld,
2218 unsigned sampler_unit)
2219 {
2220 struct gallivm_state *gallivm = bld->gallivm;
2221 LLVMBuilderRef builder = gallivm->builder;
2222 LLVMValueRef border_color_ptr =
2223 bld->dynamic_state->border_color(bld->dynamic_state, gallivm,
2224 bld->context_ptr, sampler_unit);
2225 LLVMValueRef border_color;
2226 const struct util_format_description *format_desc = bld->format_desc;
2227 struct lp_type vec4_type = bld->texel_type;
2228 struct lp_build_context vec4_bld;
2229 LLVMValueRef min_clamp = NULL;
2230 LLVMValueRef max_clamp = NULL;
2231
2232 /*
2233 * For normalized formats we need to clamp the border color (technically
2234 * we probably should also quantize the data). It really sucks doing this
2235 * here but it can't be avoided, at least for now, since this is part of
2236 * sampler state while texture format is part of sampler_view state.
2237 * GL also expects clamping for uint/sint formats, so do that as well
2238 * (d3d10 can't end up here with uint/sint since it only supports them
2239 * with ld).
2240 */
2241 vec4_type.length = 4;
2242 lp_build_context_init(&vec4_bld, gallivm, vec4_type);
2243
2244 /*
2245 * Vectorized clamping of border color. Loading is a bit of a hack since
2246 * we just cast the pointer to float array to pointer to vec4
2247 * (int or float).
2248 */
2249 border_color_ptr = lp_build_array_get_ptr(gallivm, border_color_ptr,
2250 lp_build_const_int32(gallivm, 0));
2251 border_color_ptr = LLVMBuildBitCast(builder, border_color_ptr,
2252 LLVMPointerType(vec4_bld.vec_type, 0), "");
2253 border_color = LLVMBuildLoad(builder, border_color_ptr, "");
2254 /* we don't have aligned type in the dynamic state unfortunately */
2255 LLVMSetAlignment(border_color, 4);
2256
2257 /*
2258 * Instead of having some incredibly complex logic which will try to figure out
2259 * clamping necessary for each channel, simply use the first channel, and treat
2260 * mixed signed/unsigned normalized formats specially.
2261 * (Mixed non-normalized, which wouldn't work at all here, do not exist for a
2262 * good reason.)
2263 */
2264 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
2265 int chan;
2266 /* d/s needs special handling because both present means just sampling depth */
2267 if (util_format_is_depth_and_stencil(format_desc->format)) {
2268 chan = format_desc->swizzle[0];
2269 }
2270 else {
2271 chan = util_format_get_first_non_void_channel(format_desc->format);
2272 }
2273 if (chan >= 0 && chan <= PIPE_SWIZZLE_W) {
2274 unsigned chan_type = format_desc->channel[chan].type;
2275 unsigned chan_norm = format_desc->channel[chan].normalized;
2276 unsigned chan_pure = format_desc->channel[chan].pure_integer;
2277 if (chan_type == UTIL_FORMAT_TYPE_SIGNED) {
2278 if (chan_norm) {
2279 min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2280 max_clamp = vec4_bld.one;
2281 }
2282 else if (chan_pure) {
2283 /*
2284 * Border color was stored as int, hence we need min/max clamps
2285 * only if the channel has less than 32 bits.
2286 */
2287 unsigned chan_size = format_desc->channel[chan].size;
2288 if (chan_size < 32) {
2289 min_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2290 0 - (1 << (chan_size - 1)));
2291 max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2292 (1 << (chan_size - 1)) - 1);
2293 }
2294 }
2295 /* TODO: no idea about non-pure, non-normalized! */
2296 }
2297 else if (chan_type == UTIL_FORMAT_TYPE_UNSIGNED) {
2298 if (chan_norm) {
2299 min_clamp = vec4_bld.zero;
2300 max_clamp = vec4_bld.one;
2301 }
2302 /*
2303 * Need an ugly hack here: because we don't have Z32_FLOAT_X8X24,
2304 * we use Z32_FLOAT_S8X24 to imply sampling the depth component
2305 * and ignoring stencil, which will blow up here if we try to
2306 * do a uint clamp in a float texel build...
2307 * And even if we had that format, mesa st also thinks using z24s8
2308 * means depth sampling ignoring stencil.
2309 */
2310 else if (chan_pure) {
2311 /*
2312 * Border color was stored as uint, hence we never need a min
2313 * clamp, and only need a max clamp if the channel has less than 32 bits.
2314 */
2315 unsigned chan_size = format_desc->channel[chan].size;
2316 if (chan_size < 32) {
2317 max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2318 (1 << chan_size) - 1);
2319 }
2320 /* TODO: no idea about non-pure, non-normalized! */
2321 }
2322 }
2323 else if (chan_type == UTIL_FORMAT_TYPE_FIXED) {
2324 /* TODO: I have no idea what clamp this would need if any! */
2325 }
2326 }
2327 /* mixed plain formats (or different pure size) */
2328 switch (format_desc->format) {
2329 case PIPE_FORMAT_B10G10R10A2_UINT:
2330 case PIPE_FORMAT_R10G10B10A2_UINT:
2331 {
2332 unsigned max10 = (1 << 10) - 1;
2333 max_clamp = lp_build_const_aos(gallivm, vec4_type, max10, max10,
2334 max10, (1 << 2) - 1, NULL);
2335 }
2336 break;
2337 case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
2338 min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2339 -1.0F, 0.0F, NULL);
2340 max_clamp = vec4_bld.one;
2341 break;
2342 case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
2343 case PIPE_FORMAT_R5SG5SB6U_NORM:
2344 min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2345 0.0F, 0.0F, NULL);
2346 max_clamp = vec4_bld.one;
2347 break;
2348 default:
2349 break;
2350 }
2351 }
2352 else {
2353 /* cannot figure this out from format description */
2354 if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
2355 /* s3tc formats are always unorm */
2356 min_clamp = vec4_bld.zero;
2357 max_clamp = vec4_bld.one;
2358 }
2359 else if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
2360 format_desc->layout == UTIL_FORMAT_LAYOUT_ETC) {
2361 switch (format_desc->format) {
2362 case PIPE_FORMAT_RGTC1_UNORM:
2363 case PIPE_FORMAT_RGTC2_UNORM:
2364 case PIPE_FORMAT_LATC1_UNORM:
2365 case PIPE_FORMAT_LATC2_UNORM:
2366 case PIPE_FORMAT_ETC1_RGB8:
2367 min_clamp = vec4_bld.zero;
2368 max_clamp = vec4_bld.one;
2369 break;
2370 case PIPE_FORMAT_RGTC1_SNORM:
2371 case PIPE_FORMAT_RGTC2_SNORM:
2372 case PIPE_FORMAT_LATC1_SNORM:
2373 case PIPE_FORMAT_LATC2_SNORM:
2374 min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2375 max_clamp = vec4_bld.one;
2376 break;
2377 default:
2378 assert(0);
2379 break;
2380 }
2381 }
2382 /*
2383 * all others from subsampled/other group, though we don't care
2384 * about yuv (and should not have any from zs here)
2385 */
2386 else if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_YUV){
2387 switch (format_desc->format) {
2388 case PIPE_FORMAT_R8G8_B8G8_UNORM:
2389 case PIPE_FORMAT_G8R8_G8B8_UNORM:
2390 case PIPE_FORMAT_G8R8_B8R8_UNORM:
2391 case PIPE_FORMAT_R8G8_R8B8_UNORM:
2392 case PIPE_FORMAT_R1_UNORM: /* doesn't make sense but ah well */
2393 min_clamp = vec4_bld.zero;
2394 max_clamp = vec4_bld.one;
2395 break;
2396 case PIPE_FORMAT_R8G8Bx_SNORM:
2397 min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2398 max_clamp = vec4_bld.one;
2399 break;
2400 /*
2401 * Note smallfloat formats usually don't need clamping
2402 * (they still have infinite range) however this is not
2403 * true for r11g11b10 and r9g9b9e5, which can't represent
2404 * negative numbers (and additionally r9g9b9e5 can't represent
2405 * very large numbers). d3d10 seems happy without clamping in
2406 * this case, but gl spec is pretty clear: "for floating
2407 * point and integer formats, border values are clamped to
2408 * the representable range of the format" so do that here.
2409 */
2410 case PIPE_FORMAT_R11G11B10_FLOAT:
2411 min_clamp = vec4_bld.zero;
2412 break;
2413 case PIPE_FORMAT_R9G9B9E5_FLOAT:
2414 min_clamp = vec4_bld.zero;
2415 max_clamp = lp_build_const_vec(gallivm, vec4_type, MAX_RGB9E5);
2416 break;
2417 default:
2418 assert(0);
2419 break;
2420 }
2421 }
2422 }
2423
2424 if (min_clamp) {
2425 border_color = lp_build_max(&vec4_bld, border_color, min_clamp);
2426 }
2427 if (max_clamp) {
2428 border_color = lp_build_min(&vec4_bld, border_color, max_clamp);
2429 }
2430
2431 bld->border_color_clamped = border_color;
2432 }
2433
2434
2435 /**
2436 * General texture sampling codegen.
2437 * This function handles texture sampling for all texture targets (1D,
2438 * 2D, 3D, cube) and all filtering modes.
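 *
 * Filter selection sketch (illustrative only):
 *
 *    if (min_filter == mag_filter)
 *       sample_mipmap(min_filter, mip_filter);
 *    else if (num_lods == 1)
 *       lod_positive ? sample_mipmap(min_filter, mip_filter)
 *                    : sample_mipmap(mag_filter, MIPFILTER_NONE);
 *    else if (any lane needs linear)
 *       sample_mipmap_both(linear_mask, mip_filter);
 *    else
 *       sample_mipmap(NEAREST, mip_filter_for_nearest);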
2439 */
2440 static void
2441 lp_build_sample_general(struct lp_build_sample_context *bld,
2442 unsigned sampler_unit,
2443 boolean is_gather,
2444 const LLVMValueRef *coords,
2445 const LLVMValueRef *offsets,
2446 LLVMValueRef lod_positive,
2447 LLVMValueRef lod_fpart,
2448 LLVMValueRef ilevel0,
2449 LLVMValueRef ilevel1,
2450 LLVMValueRef *colors_out)
2451 {
2452 LLVMBuilderRef builder = bld->gallivm->builder;
2453 const struct lp_static_sampler_state *sampler_state = bld->static_sampler_state;
2454 const unsigned mip_filter = sampler_state->min_mip_filter;
2455 const unsigned min_filter = sampler_state->min_img_filter;
2456 const unsigned mag_filter = sampler_state->mag_img_filter;
2457 LLVMValueRef texels[4];
2458 unsigned chan;
2459
2460 /* if we need border color, (potentially) clamp it now */
2461 if (lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_s,
2462 min_filter,
2463 mag_filter) ||
2464 (bld->dims > 1 &&
2465 lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_t,
2466 min_filter,
2467 mag_filter)) ||
2468 (bld->dims > 2 &&
2469 lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_r,
2470 min_filter,
2471 mag_filter))) {
2472 lp_build_clamp_border_color(bld, sampler_unit);
2473 }
2474
2475
2476 /*
2477 * Get/interpolate texture colors.
2478 */
2479
2480 for (chan = 0; chan < 4; ++chan) {
2481 texels[chan] = lp_build_alloca(bld->gallivm, bld->texel_bld.vec_type, "");
2482 lp_build_name(texels[chan], "sampler%u_texel_%c_var", sampler_unit, "xyzw"[chan]);
2483 }
2484
2485 if (min_filter == mag_filter) {
2486 /* no need to distinguish between minification and magnification */
2487 lp_build_sample_mipmap(bld, min_filter, mip_filter,
2488 is_gather,
2489 coords, offsets,
2490 ilevel0, ilevel1, lod_fpart,
2491 texels);
2492 }
2493 else {
2494 /*
2495 * Could also get rid of the if-logic and always use mipmap_both, for
2496 * both the single-lod and multi-lod case, if nothing really uses this.
2497 */
2498 if (bld->num_lods == 1) {
2499 /* Emit conditional to choose min image filter or mag image filter
2500 * depending on the lod being > 0 or <= 0, respectively.
2501 */
2502 struct lp_build_if_state if_ctx;
2503
2504 lod_positive = LLVMBuildTrunc(builder, lod_positive,
2505 LLVMInt1TypeInContext(bld->gallivm->context),
2506 "lod_pos");
2507
2508 lp_build_if(&if_ctx, bld->gallivm, lod_positive);
2509 {
2510 /* Use the minification filter */
2511 lp_build_sample_mipmap(bld, min_filter, mip_filter, FALSE,
2512 coords, offsets,
2513 ilevel0, ilevel1, lod_fpart,
2514 texels);
2515 }
2516 lp_build_else(&if_ctx);
2517 {
2518 /* Use the magnification filter */
2519 lp_build_sample_mipmap(bld, mag_filter, PIPE_TEX_MIPFILTER_NONE,
2520 FALSE,
2521 coords, offsets,
2522 ilevel0, NULL, NULL,
2523 texels);
2524 }
2525 lp_build_endif(&if_ctx);
2526 }
2527 else {
2528 LLVMValueRef need_linear, linear_mask;
2529 unsigned mip_filter_for_nearest;
2530 struct lp_build_if_state if_ctx;
2531
2532 if (min_filter == PIPE_TEX_FILTER_LINEAR) {
2533 linear_mask = lod_positive;
2534 mip_filter_for_nearest = PIPE_TEX_MIPFILTER_NONE;
2535 }
2536 else {
2537 linear_mask = lp_build_not(&bld->lodi_bld, lod_positive);
2538 mip_filter_for_nearest = mip_filter;
2539 }
2540 need_linear = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods,
2541 linear_mask);
2542 lp_build_name(need_linear, "need_linear");
2543
2544 if (bld->num_lods != bld->coord_type.length) {
2545 linear_mask = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
2546 bld->lodi_type,
2547 bld->int_coord_type,
2548 linear_mask);
2549 }
2550
2551 lp_build_if(&if_ctx, bld->gallivm, need_linear);
2552 {
2553 /*
2554 * Do sampling with both filters simultaneously. This means using
2555 * a linear filter and doing some tricks (with weights) for the pixels
2556 * which need nearest filter.
2557 * Note that it's probably rare that some pixels need nearest and some
2558 * linear filtering, but the fixups required for the nearest pixels
2559 * aren't all that complicated, so just always run the combined path
2560 * if at least some pixels require linear.
2561 */
2562 lp_build_sample_mipmap_both(bld, linear_mask, mip_filter,
2563 coords, offsets,
2564 ilevel0, ilevel1,
2565 lod_fpart, lod_positive,
2566 texels);
2567 }
2568 lp_build_else(&if_ctx);
2569 {
2570 /*
2571 * All pixels require just nearest filtering, which is way
2572 * cheaper than linear, hence do a separate path for that.
2573 */
2574 lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
2575 mip_filter_for_nearest, FALSE,
2576 coords, offsets,
2577 ilevel0, ilevel1, lod_fpart,
2578 texels);
2579 }
2580 lp_build_endif(&if_ctx);
2581 }
2582 }
2583
2584 for (chan = 0; chan < 4; ++chan) {
2585 colors_out[chan] = LLVMBuildLoad(builder, texels[chan], "");
2586 lp_build_name(colors_out[chan], "sampler%u_texel_%c", sampler_unit, "xyzw"[chan]);
2587 }
2588 }
2589
2590
2591 /**
2592 * Texel fetch function.
2593 * In contrast to general sampling there is no filtering or coord minification;
2594 * lod (if any) is always an explicit uint, and coords are uints (in texel
2595 * units) applied directly to the selected mip level (after adding texel offsets).
2596 * This function handles texel fetch for all targets where texel fetch is supported
2597 * (no cube maps, but 1d, 2d, 3d are supported, arrays and buffers should be too).
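 *
 * Out-of-bounds handling sketch (illustrative): the fetch itself is
 * unconditional; offsets of out-of-bounds lanes are zeroed first (a
 * dummy read of texel 0) and the result is zeroed afterwards:
 *
 *    offset &= ~oob_mask;
 *    texel = fetch(base_ptr, offset);
 *    texel = oob_mask ? 0 : texel;   // d3d10/robustness behavior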
2598 */
2599 static void
2600 lp_build_fetch_texel(struct lp_build_sample_context *bld,
2601 unsigned texture_unit,
2602 const LLVMValueRef *coords,
2603 LLVMValueRef explicit_lod,
2604 const LLVMValueRef *offsets,
2605 LLVMValueRef *colors_out)
2606 {
2607 struct lp_build_context *perquadi_bld = &bld->lodi_bld;
2608 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2609 unsigned dims = bld->dims, chan;
2610 unsigned target = bld->static_texture_state->target;
2611 boolean out_of_bound_ret_zero = TRUE;
2612 LLVMValueRef size, ilevel;
2613 LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
2614 LLVMValueRef x = coords[0], y = coords[1], z = coords[2];
2615 LLVMValueRef width, height, depth, i, j;
2616 LLVMValueRef offset, out_of_bounds, out1;
2617
2618 out_of_bounds = int_coord_bld->zero;
2619
2620 if (explicit_lod && bld->static_texture_state->target != PIPE_BUFFER) {
2621 if (bld->num_mips != int_coord_bld->type.length) {
2622 ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
2623 perquadi_bld->type, explicit_lod, 0);
2624 }
2625 else {
2626 ilevel = explicit_lod;
2627 }
2628 lp_build_nearest_mip_level(bld, texture_unit, ilevel, &ilevel,
2629 out_of_bound_ret_zero ? &out_of_bounds : NULL);
2630 }
2631 else {
2632 assert(bld->num_mips == 1);
2633 if (bld->static_texture_state->target != PIPE_BUFFER) {
2634 ilevel = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
2635 bld->context_ptr, texture_unit);
2636 }
2637 else {
2638 ilevel = lp_build_const_int32(bld->gallivm, 0);
2639 }
2640 }
2641 lp_build_mipmap_level_sizes(bld, ilevel,
2642 &size,
2643 &row_stride_vec, &img_stride_vec);
2644 lp_build_extract_image_sizes(bld, &bld->int_size_bld, int_coord_bld->type,
2645 size, &width, &height, &depth);
2646
2647 if (target == PIPE_TEXTURE_1D_ARRAY ||
2648 target == PIPE_TEXTURE_2D_ARRAY) {
2649 if (out_of_bound_ret_zero) {
2650 z = lp_build_layer_coord(bld, texture_unit, FALSE, z, &out1);
2651 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2652 }
2653 else {
2654 z = lp_build_layer_coord(bld, texture_unit, FALSE, z, NULL);
2655 }
2656 }
2657
2658 /* This is a lot like border sampling */
2659 if (offsets[0]) {
2660 /*
2661 * coords are really unsigned, offsets are signed, but I don't think
2662 * exceeding 31 bits is possible
2663 */
2664 x = lp_build_add(int_coord_bld, x, offsets[0]);
2665 }
2666 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
2667 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2668 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
2669 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2670
2671 if (dims >= 2) {
2672 if (offsets[1]) {
2673 y = lp_build_add(int_coord_bld, y, offsets[1]);
2674 }
2675 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
2676 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2677 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
2678 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2679
2680 if (dims >= 3) {
2681 if (offsets[2]) {
2682 z = lp_build_add(int_coord_bld, z, offsets[2]);
2683 }
2684 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
2685 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2686 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
2687 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2688 }
2689 }
2690
2691 lp_build_sample_offset(int_coord_bld,
2692 bld->format_desc,
2693 x, y, z, row_stride_vec, img_stride_vec,
2694 &offset, &i, &j);
2695
2696 if (bld->static_texture_state->target != PIPE_BUFFER) {
2697 offset = lp_build_add(int_coord_bld, offset,
2698 lp_build_get_mip_offsets(bld, ilevel));
2699 }
2700
2701 offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);
2702
2703 lp_build_fetch_rgba_soa(bld->gallivm,
2704 bld->format_desc,
2705 bld->texel_type, TRUE,
2706 bld->base_ptr, offset,
2707 i, j,
2708 bld->cache,
2709 colors_out);
2710
2711 if (out_of_bound_ret_zero) {
2712 /*
2713 * Only needed for ARB_robust_buffer_access_behavior and d3d10.
2714 * Could use min/max above instead of out-of-bounds comparisons
2715 * if we don't care about the result returned for out-of-bounds.
2716 */
2717 for (chan = 0; chan < 4; chan++) {
2718 colors_out[chan] = lp_build_select(&bld->texel_bld, out_of_bounds,
2719 bld->texel_bld.zero, colors_out[chan]);
2720 }
2721 }
2722 }
2723
2724
2725 /**
2726 * Just set texels to white instead of actually sampling the texture.
2727 * For debugging.
2728 */
2729 void
2730 lp_build_sample_nop(struct gallivm_state *gallivm,
2731 struct lp_type type,
2732 const LLVMValueRef *coords,
2733 LLVMValueRef texel_out[4])
2734 {
2735 LLVMValueRef one = lp_build_one(gallivm, type);
2736 unsigned chan;
2737
2738 for (chan = 0; chan < 4; chan++) {
2739 texel_out[chan] = one;
2740 }
2741 }
2742
2743
2744 /**
2745 * Build the actual texture sampling code.
2746 * 'texel' will return a vector of four LLVMValueRefs corresponding to
2747 * R, G, B, A.
2748 * \param type vector float type to use for coords, etc.
2749 * \param sample_key
2750 * \param derivs partial derivatives of (s,t,r,q) with respect to x and y
2751 */
2752 static void
2753 lp_build_sample_soa_code(struct gallivm_state *gallivm,
2754 const struct lp_static_texture_state *static_texture_state,
2755 const struct lp_static_sampler_state *static_sampler_state,
2756 struct lp_sampler_dynamic_state *dynamic_state,
2757 struct lp_type type,
2758 unsigned sample_key,
2759 unsigned texture_index,
2760 unsigned sampler_index,
2761 LLVMValueRef context_ptr,
2762 LLVMValueRef thread_data_ptr,
2763 const LLVMValueRef *coords,
2764 const LLVMValueRef *offsets,
2765 const struct lp_derivatives *derivs, /* optional */
2766 LLVMValueRef lod, /* optional */
2767 LLVMValueRef texel_out[4])
2768 {
2769 unsigned target = static_texture_state->target;
2770 unsigned dims = texture_dims(target);
2771 unsigned num_quads = type.length / 4;
2772 unsigned mip_filter, min_img_filter, mag_img_filter, i;
2773 struct lp_build_sample_context bld;
2774 struct lp_static_sampler_state derived_sampler_state = *static_sampler_state;
2775 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
2776 LLVMBuilderRef builder = gallivm->builder;
2777 LLVMValueRef tex_width, newcoords[5];
2778 enum lp_sampler_lod_property lod_property;
2779 enum lp_sampler_lod_control lod_control;
2780 enum lp_sampler_op_type op_type;
2781 LLVMValueRef lod_bias = NULL;
2782 LLVMValueRef explicit_lod = NULL;
2783 boolean op_is_tex, op_is_lodq, op_is_gather;
2784
2785 if (0) {
2786 enum pipe_format fmt = static_texture_state->format;
2787 debug_printf("Sample from %s\n", util_format_name(fmt));
2788 }
2789
2790 lod_property = (sample_key & LP_SAMPLER_LOD_PROPERTY_MASK) >>
2791 LP_SAMPLER_LOD_PROPERTY_SHIFT;
2792 lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
2793 LP_SAMPLER_LOD_CONTROL_SHIFT;
2794 op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
2795 LP_SAMPLER_OP_TYPE_SHIFT;
2796
2797 op_is_tex = op_type == LP_SAMPLER_OP_TEXTURE;
2798 op_is_lodq = op_type == LP_SAMPLER_OP_LODQ;
2799 op_is_gather = op_type == LP_SAMPLER_OP_GATHER;
2800
2801 if (lod_control == LP_SAMPLER_LOD_BIAS) {
2802 lod_bias = lod;
2803 assert(lod);
2804 assert(derivs == NULL);
2805 }
2806 else if (lod_control == LP_SAMPLER_LOD_EXPLICIT) {
2807 explicit_lod = lod;
2808 assert(lod);
2809 assert(derivs == NULL);
2810 }
2811 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
2812 assert(derivs);
2813 assert(lod == NULL);
2814 }
2815 else {
2816 assert(derivs == NULL);
2817 assert(lod == NULL);
2818 }
2819
2820 if (static_texture_state->format == PIPE_FORMAT_NONE) {
2821 /*
2822 * If there's nothing bound, format is NONE, and we must return
2823 * all zero as mandated by d3d10 in this case.
2824 */
2825 unsigned chan;
2826 LLVMValueRef zero = lp_build_zero(gallivm, type);
2827 for (chan = 0; chan < 4; chan++) {
2828 texel_out[chan] = zero;
2829 }
2830 return;
2831 }
2832
2833 assert(type.floating);
2834
2835 /* Setup our build context */
2836 memset(&bld, 0, sizeof bld);
2837 bld.gallivm = gallivm;
2838 bld.context_ptr = context_ptr;
2839 bld.static_sampler_state = &derived_sampler_state;
2840 bld.static_texture_state = static_texture_state;
2841 bld.dynamic_state = dynamic_state;
2842 bld.format_desc = util_format_description(static_texture_state->format);
2843 bld.dims = dims;
2844
2845 if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD || op_is_lodq) {
2846 bld.no_quad_lod = TRUE;
2847 }
2848 if (gallivm_perf & GALLIVM_PERF_NO_RHO_APPROX || op_is_lodq) {
2849 bld.no_rho_approx = TRUE;
2850 }
2851 if (gallivm_perf & GALLIVM_PERF_NO_BRILINEAR || op_is_lodq) {
2852 bld.no_brilinear = TRUE;
2853 }
2854
2855 bld.vector_width = lp_type_width(type);
2856
2857 bld.float_type = lp_type_float(32);
2858 bld.int_type = lp_type_int(32);
2859 bld.coord_type = type;
2860 bld.int_coord_type = lp_int_type(type);
2861 bld.float_size_in_type = lp_type_float(32);
2862 bld.float_size_in_type.length = dims > 1 ? 4 : 1;
2863 bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
2864 bld.texel_type = type;
2865
2866 /* Always using the first channel should hopefully be safe;
2867 * if not, things WILL break in other places anyway.
2868 */
2869 if (bld.format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
2870 bld.format_desc->channel[0].pure_integer) {
2871 if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
2872 bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
2873 }
2874 else if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
2875 bld.texel_type = lp_type_uint_vec(type.width, type.width * type.length);
2876 }
2877 }
2878 else if (util_format_has_stencil(bld.format_desc) &&
2879 !util_format_has_depth(bld.format_desc)) {
2880 /* for stencil only formats, sample stencil (uint) */
2881 bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
2882 }
2883
2884 if (!static_texture_state->level_zero_only ||
2885 !static_sampler_state->max_lod_pos || op_is_lodq) {
2886 derived_sampler_state.min_mip_filter = static_sampler_state->min_mip_filter;
2887 } else {
2888 derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
2889 }
2890 if (op_is_gather) {
2891 /*
2892 * gather4 is exactly like GL_LINEAR filtering but in the end skipping
2893 * the actual filtering. Using mostly the same paths, so cube face
2894 * selection, coord wrapping etc. all naturally uses the same code.
2895 */
2896 derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
2897 derived_sampler_state.min_img_filter = PIPE_TEX_FILTER_LINEAR;
2898 derived_sampler_state.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
2899 }
2900 mip_filter = derived_sampler_state.min_mip_filter;
2901
2902 if (0) {
2903 debug_printf(" .min_mip_filter = %u\n", derived_sampler_state.min_mip_filter);
2904 }
2905
2906 if (static_texture_state->target == PIPE_TEXTURE_CUBE ||
2907 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)
2908 {
2909 /*
2910 * Seamless filtering ignores wrap modes.
2911 * Setting to CLAMP_TO_EDGE is correct for nearest filtering; for
2912 * bilinear it's not correct, but way better than using e.g. repeat.
2913 * Note we even set this for non-seamless. Technically GL allows any wrap
2914 * mode, which made sense when true borders were supported (a seamless
2915 * effect can be had with border and CLAMP_TO_BORDER), but gallium doesn't
2916 * support borders, d3d9 requires wrap modes to be ignored, and it's a
2917 * pain to fix up the sampler state (as it makes it texture dependent).
2918 */
2919 derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2920 derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2921 }
2922 /*
2923 * We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
2924 * so AoS path could be used. Not sure it's worth the trouble...
2925 */
2926
2927 min_img_filter = derived_sampler_state.min_img_filter;
2928 mag_img_filter = derived_sampler_state.mag_img_filter;
2929
2930
2931 /*
2932 * This is all a bit complicated; different paths are chosen for
2933 * performance reasons.
2934 * Essentially, there can be 1 lod per element, 1 lod per quad, or 1 lod
2935 * for everything (the last two options are equivalent in the 4-wide case).
2936 * If there's per-quad lod but we split to 4-wide so we can use AoS, the
2937 * per-quad lod is calculated and the lod value extracted afterwards, making
2938 * this case, as far as lod handling in the further sample/filter code is
2939 * concerned, basically the same as the 1-lod-for-everything case.
2940 * Different lod handling mostly shows up when building mipmap sizes
2941 * (lp_build_mipmap_level_sizes() and friends) and also in filtering
2942 * (getting the fractional part of the lod to the right texels).
2943 */
2944
2945 /*
2946 * There are other situations where at least the multiple int lods could be
2947 * avoided, like min and max lod being equal.
2948 */
2949 bld.num_mips = bld.num_lods = 1;
2950
2951 if (bld.no_quad_lod && bld.no_rho_approx &&
2952 ((mip_filter != PIPE_TEX_MIPFILTER_NONE && op_is_tex &&
2953 (static_texture_state->target == PIPE_TEXTURE_CUBE ||
2954 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)) ||
2955 op_is_lodq)) {
2956 /*
2957 * special case for using per-pixel lod even for implicit lod,
2958 * which is generally never required (ok by APIs) except to please
2959 * some (somewhat broken imho) tests (because per-pixel face selection
2960 * can cause derivatives to be different for pixels outside the primitive
2961 * due to the major axis division even if pre-project derivatives are
2962 * looking normal).
2963 * For lodq, we do it simply to avoid scalar pack / unpack (albeit for
2964 * cube maps we do indeed get per-pixel lod values).
2965 */
2966 bld.num_mips = type.length;
2967 bld.num_lods = type.length;
2968 }
2969 else if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT ||
2970 (explicit_lod || lod_bias || derivs)) {
2971 if ((!op_is_tex && target != PIPE_BUFFER) ||
2972 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
2973 bld.num_mips = type.length;
2974 bld.num_lods = type.length;
2975 }
2976 else if (op_is_tex && min_img_filter != mag_img_filter) {
2977 bld.num_mips = 1;
2978 bld.num_lods = type.length;
2979 }
2980 }
2981 /* TODO: for true scalar_lod should only use 1 lod value */
2982 else if ((!op_is_tex && explicit_lod && target != PIPE_BUFFER) ||
2983 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
2984 bld.num_mips = num_quads;
2985 bld.num_lods = num_quads;
2986 }
2987 else if (op_is_tex && min_img_filter != mag_img_filter) {
2988 bld.num_mips = 1;
2989 bld.num_lods = num_quads;
2990 }
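   /*
    * Summary of the choices above (illustrative):
    *    per-element lod needed:     num_mips = num_lods = type.length
    *    per-quad lod needed:        num_mips = num_lods = num_quads
    *    only min/mag filter differ: num_mips = 1, num_lods per element/quad
    *    otherwise:                  num_mips = num_lods = 1
    */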
2991
2992 if (op_is_gather)
2993 bld.gather_comp = (sample_key & LP_SAMPLER_GATHER_COMP_MASK) >> LP_SAMPLER_GATHER_COMP_SHIFT;
2994 bld.lodf_type = type;
2995 /* we want native vector size to be able to use our intrinsics */
2996 if (bld.num_lods != type.length) {
2997 /* TODO: this currently always has to be per-quad or per-element */
2998 bld.lodf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
2999 }
3000 bld.lodi_type = lp_int_type(bld.lodf_type);
3001 bld.levelf_type = bld.lodf_type;
3002 if (bld.num_mips == 1) {
3003 bld.levelf_type.length = 1;
3004 }
3005 bld.leveli_type = lp_int_type(bld.levelf_type);
3006 bld.float_size_type = bld.float_size_in_type;
3007 /* Note: size vectors may not be native. They contain minified w/h/d/_ values,
3008 * with per-element lod that is w0/h0/d0/_/w1/h1/d1/_/... so up to 8x4f32 */
3009 if (bld.num_mips > 1) {
3010 bld.float_size_type.length = bld.num_mips == type.length ?
3011 bld.num_mips * bld.float_size_in_type.length :
3012 type.length;
3013 }
3014 bld.int_size_type = lp_int_type(bld.float_size_type);
3015
3016 lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
3017 lp_build_context_init(&bld.float_vec_bld, gallivm, type);
3018 lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
3019 lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
3020 lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
3021 lp_build_context_init(&bld.int_size_in_bld, gallivm, bld.int_size_in_type);
3022 lp_build_context_init(&bld.float_size_in_bld, gallivm, bld.float_size_in_type);
3023 lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
3024 lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
3025 lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
3026 lp_build_context_init(&bld.levelf_bld, gallivm, bld.levelf_type);
3027 lp_build_context_init(&bld.leveli_bld, gallivm, bld.leveli_type);
3028 lp_build_context_init(&bld.lodf_bld, gallivm, bld.lodf_type);
3029 lp_build_context_init(&bld.lodi_bld, gallivm, bld.lodi_type);
3030
3031 /* Get the dynamic state */
3032 tex_width = dynamic_state->width(dynamic_state, gallivm,
3033 context_ptr, texture_index);
3034 bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm,
3035 context_ptr, texture_index);
3036 bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm,
3037 context_ptr, texture_index);
3038 bld.base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
3039 context_ptr, texture_index);
3040 bld.mip_offsets = dynamic_state->mip_offsets(dynamic_state, gallivm,
3041 context_ptr, texture_index);
3042 /* Note that mip_offsets is an array[level] of offsets to texture images */
3043
3044 if (dynamic_state->cache_ptr && thread_data_ptr) {
3045 bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
3046 thread_data_ptr, texture_index);
3047 }
3048
3049 /* width, height, depth as single int vector */
3050 if (dims <= 1) {
3051 bld.int_size = tex_width;
3052 }
3053 else {
3054 bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_in_bld.undef,
3055 tex_width,
3056 LLVMConstInt(i32t, 0, 0), "");
3057 if (dims >= 2) {
3058 LLVMValueRef tex_height =
3059 dynamic_state->height(dynamic_state, gallivm,
3060 context_ptr, texture_index);
3061 bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3062 tex_height,
3063 LLVMConstInt(i32t, 1, 0), "");
3064 if (dims >= 3) {
3065 LLVMValueRef tex_depth =
3066 dynamic_state->depth(dynamic_state, gallivm, context_ptr,
3067 texture_index);
3068 bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3069 tex_depth,
3070 LLVMConstInt(i32t, 2, 0), "");
3071 }
3072 }
3073 }
3074
3075 for (i = 0; i < 5; i++) {
3076 newcoords[i] = coords[i];
3077 }
3078
3079 if (util_format_is_pure_integer(static_texture_state->format) &&
3080 !util_format_has_depth(bld.format_desc) && op_is_tex &&
3081 (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
3082 static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3083 static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3084 /*
3085 * Bail if impossible filtering is specified (the awkward additional
3086 * depth check is there because it is legal in gallium to have formats
3087 * like S8Z24 here, which claim to be pure int even though such formats
3088 * should sample the depth component).
3089 * In GL such filters make the texture incomplete; this makes us robust
3090 * against state trackers which set this up regardless (we'd crash in the
3091 * lerp later otherwise).
3092 * At least in some APIs it may be legal to use such filters with lod
3093 * queries and/or gather (at least for gather, d3d10 says only the wrap
3094 * bits are really used, hence the filter bits are likely simply ignored).
3095 * For fetch, we don't get valid samplers either way here.
3096 */
3097 unsigned chan;
3098 LLVMValueRef zero = lp_build_zero(gallivm, type);
3099 for (chan = 0; chan < 4; chan++) {
3100 texel_out[chan] = zero;
3101 }
3102 return;
3103 }
3104
3105 if (0) {
3106 /* For debug: no-op texture sampling */
3107 lp_build_sample_nop(gallivm,
3108 bld.texel_type,
3109 newcoords,
3110 texel_out);
3111 }
3112
3113 else if (op_type == LP_SAMPLER_OP_FETCH) {
3114 lp_build_fetch_texel(&bld, texture_index, newcoords,
3115 lod, offsets,
3116 texel_out);
3117 }
3118
3119 else {
3120 LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
3121 LLVMValueRef ilevel0 = NULL, ilevel1 = NULL, lod = NULL;
3122 boolean use_aos;
3123
3124 use_aos = util_format_fits_8unorm(bld.format_desc) &&
3125 op_is_tex &&
3126 /* not sure this is strictly needed or simply impossible */
3127 derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
3128 lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
3129
3130 use_aos &= bld.num_lods <= num_quads ||
3131 derived_sampler_state.min_img_filter ==
3132 derived_sampler_state.mag_img_filter;
3133
3134 if (gallivm_perf & GALLIVM_PERF_NO_AOS_SAMPLING) {
3135 use_aos = 0;
3136 }
3137
3138 if (dims > 1) {
3139 use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_t);
3140 if (dims > 2) {
3141 use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_r);
3142 }
3143 }
3144 if ((static_texture_state->target == PIPE_TEXTURE_CUBE ||
3145 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3146 derived_sampler_state.seamless_cube_map &&
3147 (derived_sampler_state.min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3148 derived_sampler_state.mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3149 /* theoretically possible with AoS filtering but not implemented (complex!) */
3150 use_aos = 0;
3151 }
3152
3153 if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
3154 !use_aos && util_format_fits_8unorm(bld.format_desc)) {
3155 debug_printf("%s: using floating point linear filtering for %s\n",
3156 __FUNCTION__, bld.format_desc->short_name);
3157 debug_printf(" min_img %d mag_img %d mip %d target %d seamless %d"
3158 " wraps %d wrapt %d wrapr %d\n",
3159 derived_sampler_state.min_img_filter,
3160 derived_sampler_state.mag_img_filter,
3161 derived_sampler_state.min_mip_filter,
3162 static_texture_state->target,
3163 derived_sampler_state.seamless_cube_map,
3164 derived_sampler_state.wrap_s,
3165 derived_sampler_state.wrap_t,
3166 derived_sampler_state.wrap_r);
3167 }
3168
3169 lp_build_sample_common(&bld, op_is_lodq, texture_index, sampler_index,
3170 newcoords,
3171 derivs, lod_bias, explicit_lod,
3172 &lod_positive, &lod, &lod_fpart,
3173 &ilevel0, &ilevel1);
3174
3175 if (op_is_lodq) {
3176 texel_out[0] = lod_fpart;
3177 texel_out[1] = lod;
3178 texel_out[2] = texel_out[3] = bld.coord_bld.zero;
3179 return;
3180 }
3181
3182 if (use_aos && static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
3183 /* The aos path doesn't do seamless filtering so simply add cube layer
3184 * to face now.
3185 */
3186 newcoords[2] = lp_build_add(&bld.int_coord_bld, newcoords[2], newcoords[3]);
3187 }
3188
3189 /*
3190 * We only try 8-wide sampling with soa, or with aos if we have AVX2
3191 * (8-wide aos sampling appears to be a loss with just AVX).
3192 */
3193 if (num_quads == 1 || !use_aos ||
3194 (util_cpu_caps.has_avx2 &&
3195 (bld.num_lods == 1 ||
3196 derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
3197 if (use_aos) {
3198 /* do sampling/filtering with fixed pt arithmetic */
3199 lp_build_sample_aos(&bld, sampler_index,
3200 newcoords[0], newcoords[1],
3201 newcoords[2],
3202 offsets, lod_positive, lod_fpart,
3203 ilevel0, ilevel1,
3204 texel_out);
3205 }
3206
3207 else {
3208 lp_build_sample_general(&bld, sampler_index,
3209 op_type == LP_SAMPLER_OP_GATHER,
3210 newcoords, offsets,
3211 lod_positive, lod_fpart,
3212 ilevel0, ilevel1,
3213 texel_out);
3214 }
3215 }
3216 else {
3217 unsigned j;
3218 struct lp_build_sample_context bld4;
3219 struct lp_type type4 = type;
3220 unsigned i;
3221 LLVMValueRef texelout4[4];
3222 LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
3223
3224 type4.length = 4;
3225
3226 /* Setup our build context */
3227 memset(&bld4, 0, sizeof bld4);
3228 bld4.no_quad_lod = bld.no_quad_lod;
3229 bld4.no_rho_approx = bld.no_rho_approx;
3230 bld4.no_brilinear = bld.no_brilinear;
3231 bld4.gallivm = bld.gallivm;
3232 bld4.context_ptr = bld.context_ptr;
3233 bld4.static_texture_state = bld.static_texture_state;
3234 bld4.static_sampler_state = bld.static_sampler_state;
3235 bld4.dynamic_state = bld.dynamic_state;
3236 bld4.format_desc = bld.format_desc;
3237 bld4.dims = bld.dims;
3238 bld4.row_stride_array = bld.row_stride_array;
3239 bld4.img_stride_array = bld.img_stride_array;
3240 bld4.base_ptr = bld.base_ptr;
3241 bld4.mip_offsets = bld.mip_offsets;
3242 bld4.int_size = bld.int_size;
3243 bld4.cache = bld.cache;
3244
3245 bld4.vector_width = lp_type_width(type4);
3246
3247 bld4.float_type = lp_type_float(32);
3248 bld4.int_type = lp_type_int(32);
3249 bld4.coord_type = type4;
3250 bld4.int_coord_type = lp_int_type(type4);
3251 bld4.float_size_in_type = lp_type_float(32);
3252 bld4.float_size_in_type.length = dims > 1 ? 4 : 1;
3253 bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
3254 bld4.texel_type = bld.texel_type;
3255 bld4.texel_type.length = 4;
3256
3257 bld4.num_mips = bld4.num_lods = 1;
3258 if (bld4.no_quad_lod && bld4.no_rho_approx &&
3259 (static_texture_state->target == PIPE_TEXTURE_CUBE ||
3260 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3261 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3262 bld4.num_mips = type4.length;
3263 bld4.num_lods = type4.length;
3264 }
3265 if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
3266 (explicit_lod || lod_bias || derivs)) {
3267 if ((!op_is_tex && target != PIPE_BUFFER) ||
3268 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3269 bld4.num_mips = type4.length;
3270 bld4.num_lods = type4.length;
3271 }
3272 else if (op_is_tex && min_img_filter != mag_img_filter) {
3273 bld4.num_mips = 1;
3274 bld4.num_lods = type4.length;
3275 }
3276 }
3277
3278 /* we want native vector size to be able to use our intrinsics */
3279 bld4.lodf_type = type4;
3280 if (bld4.num_lods != type4.length) {
3281 bld4.lodf_type.length = 1;
3282 }
3283 bld4.lodi_type = lp_int_type(bld4.lodf_type);
3284 bld4.levelf_type = type4;
3285 if (bld4.num_mips != type4.length) {
3286 bld4.levelf_type.length = 1;
3287 }
3288 bld4.leveli_type = lp_int_type(bld4.levelf_type);
3289 bld4.float_size_type = bld4.float_size_in_type;
3290 if (bld4.num_mips > 1) {
3291 bld4.float_size_type.length = bld4.num_mips == type4.length ?
3292 bld4.num_mips * bld4.float_size_in_type.length :
3293 type4.length;
3294 }
3295 bld4.int_size_type = lp_int_type(bld4.float_size_type);
3296
3297 lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
3298 lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
3299 lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
3300 lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
3301 lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
3302 lp_build_context_init(&bld4.int_size_in_bld, gallivm, bld4.int_size_in_type);
3303 lp_build_context_init(&bld4.float_size_in_bld, gallivm, bld4.float_size_in_type);
3304 lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
3305 lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
3306 lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
3307 lp_build_context_init(&bld4.levelf_bld, gallivm, bld4.levelf_type);
3308 lp_build_context_init(&bld4.leveli_bld, gallivm, bld4.leveli_type);
3309 lp_build_context_init(&bld4.lodf_bld, gallivm, bld4.lodf_type);
3310 lp_build_context_init(&bld4.lodi_bld, gallivm, bld4.lodi_type);
3311
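/*
 * E.g. with an 8-wide shader vector num_quads is 2: the loop below runs
 * twice, extracting lanes 0..3 and 4..7 with lp_build_extract_range(),
 * sampling each quad with the 4-wide context, and stitching the
 * per-channel results back together with lp_build_concat() afterwards.
 */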
3312 for (i = 0; i < num_quads; i++) {
3313 LLVMValueRef s4, t4, r4;
3314 LLVMValueRef lod_positive4, lod_fpart4 = NULL;
3315 LLVMValueRef ilevel04, ilevel14 = NULL;
3316 LLVMValueRef offsets4[4] = { NULL };
3317 unsigned num_lods = bld4.num_lods;
3318
3319 s4 = lp_build_extract_range(gallivm, newcoords[0], 4*i, 4);
3320 t4 = lp_build_extract_range(gallivm, newcoords[1], 4*i, 4);
3321 r4 = lp_build_extract_range(gallivm, newcoords[2], 4*i, 4);
3322
3323 if (offsets[0]) {
3324 offsets4[0] = lp_build_extract_range(gallivm, offsets[0], 4*i, 4);
3325 if (dims > 1) {
3326 offsets4[1] = lp_build_extract_range(gallivm, offsets[1], 4*i, 4);
3327 if (dims > 2) {
3328 offsets4[2] = lp_build_extract_range(gallivm, offsets[2], 4*i, 4);
3329 }
3330 }
3331 }
3332 lod_positive4 = lp_build_extract_range(gallivm, lod_positive, num_lods * i, num_lods);
3333 ilevel04 = bld.num_mips == 1 ? ilevel0 :
3334 lp_build_extract_range(gallivm, ilevel0, num_lods * i, num_lods);
3335 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
3336 ilevel14 = lp_build_extract_range(gallivm, ilevel1, num_lods * i, num_lods);
3337 lod_fpart4 = lp_build_extract_range(gallivm, lod_fpart, num_lods * i, num_lods);
3338 }
3339
3340 if (use_aos) {
3341 /* do sampling/filtering with fixed pt arithmetic */
3342 lp_build_sample_aos(&bld4, sampler_index,
3343 s4, t4, r4, offsets4,
3344 lod_positive4, lod_fpart4,
3345 ilevel04, ilevel14,
3346 texelout4);
3347 }
3348
3349 else {
3350 /* this path is currently unreachable and hence might break easily... */
3351 LLVMValueRef newcoords4[5];
3352 newcoords4[0] = s4;
3353 newcoords4[1] = t4;
3354 newcoords4[2] = r4;
3355 newcoords4[3] = lp_build_extract_range(gallivm, newcoords[3], 4*i, 4);
3356 newcoords4[4] = lp_build_extract_range(gallivm, newcoords[4], 4*i, 4);
3357
3358 lp_build_sample_general(&bld4, sampler_index,
3359 op_type == LP_SAMPLER_OP_GATHER,
3360 newcoords4, offsets4,
3361 lod_positive4, lod_fpart4,
3362 ilevel04, ilevel14,
3363 texelout4);
3364 }
3365 for (j = 0; j < 4; j++) {
3366 texelouttmp[j][i] = texelout4[j];
3367 }
3368 }
3369
3370 for (j = 0; j < 4; j++) {
3371 texel_out[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
3372 }
3373 }
3374 }
3375
3376 if (target != PIPE_BUFFER && op_type != LP_SAMPLER_OP_GATHER) {
3377 apply_sampler_swizzle(&bld, texel_out);
3378 }
3379
3380 /*
3381 * texel type can be a (32bit) int/uint (for pure int formats only),
3382 * however we are expected to always return floats (storage is untyped).
3383 */
3384 if (!bld.texel_type.floating) {
3385 unsigned chan;
3386 for (chan = 0; chan < 4; chan++) {
3387 texel_out[chan] = LLVMBuildBitCast(builder, texel_out[chan],
3388 lp_build_vec_type(gallivm, type), "");
3389 }
3390 }
3391 }
3392
3393
3394 #define USE_TEX_FUNC_CALL 1
3395
3396 #define LP_MAX_TEX_FUNC_ARGS 32
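/*
 * USE_TEX_FUNC_CALL selects whether "non-simple" sampling is emitted as
 * a separate, name-cached function (see lp_build_sample_soa() below);
 * LP_MAX_TEX_FUNC_ARGS bounds the argument arrays of such functions.
 */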
3397
3398 static inline void
3399 get_target_info(enum pipe_texture_target target,
3400 unsigned *num_coords, unsigned *num_derivs,
3401 unsigned *num_offsets, unsigned *layer)
3402 {
3403 unsigned dims = texture_dims(target);
3404 *num_coords = dims;
3405 *num_offsets = dims;
3406 *num_derivs = (target == PIPE_TEXTURE_CUBE ||
3407 target == PIPE_TEXTURE_CUBE_ARRAY) ? 3 : dims;
3408 *layer = has_layer_coord(target) ? 2 : 0;
3409 if (target == PIPE_TEXTURE_CUBE_ARRAY) {
3410 /*
3411 * dims doesn't include r coord for cubes - this is handled
3412 * by layer instead, but we need to fix it up for cube arrays...
3413 */
3414 *layer = 3;
3415 *num_coords = 3;
3416 }
3417 }
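/*
 * For example (assuming the usual texture_dims() results):
 *   PIPE_TEXTURE_2D:         num_coords 2, num_offsets 2, num_derivs 2, layer 0
 *   PIPE_TEXTURE_2D_ARRAY:   num_coords 2, num_offsets 2, num_derivs 2, layer 2
 *   PIPE_TEXTURE_CUBE_ARRAY: num_coords 3, num_offsets 2, num_derivs 3, layer 3
 */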
3418
3419
3420 /**
3421 * Generate the function body for a texture sampling function.
3422 */
3423 static void
3424 lp_build_sample_gen_func(struct gallivm_state *gallivm,
3425 const struct lp_static_texture_state *static_texture_state,
3426 const struct lp_static_sampler_state *static_sampler_state,
3427 struct lp_sampler_dynamic_state *dynamic_state,
3428 struct lp_type type,
3429 unsigned texture_index,
3430 unsigned sampler_index,
3431 LLVMValueRef function,
3432 unsigned num_args,
3433 unsigned sample_key)
3434 {
3435 LLVMBuilderRef old_builder;
3436 LLVMBasicBlockRef block;
3437 LLVMValueRef coords[5];
3438 LLVMValueRef offsets[3] = { NULL };
3439 LLVMValueRef lod = NULL;
3440 LLVMValueRef context_ptr;
3441 LLVMValueRef thread_data_ptr = NULL;
3442 LLVMValueRef texel_out[4];
3443 struct lp_derivatives derivs;
3444 struct lp_derivatives *deriv_ptr = NULL;
3445 unsigned num_param = 0;
3446 unsigned i, num_coords, num_derivs, num_offsets, layer;
3447 enum lp_sampler_lod_control lod_control;
3448 boolean need_cache = FALSE;
3449
3450 lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3451 LP_SAMPLER_LOD_CONTROL_SHIFT;
3452
3453 get_target_info(static_texture_state->target,
3454 &num_coords, &num_derivs, &num_offsets, &layer);
3455
3456 if (dynamic_state->cache_ptr) {
3457 const struct util_format_description *format_desc;
3458 format_desc = util_format_description(static_texture_state->format);
3459 if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3460 need_cache = TRUE;
3461 }
3462 }
3463
3464 /* "unpack" arguments */
3465 context_ptr = LLVMGetParam(function, num_param++);
3466 if (need_cache) {
3467 thread_data_ptr = LLVMGetParam(function, num_param++);
3468 }
3469 for (i = 0; i < num_coords; i++) {
3470 coords[i] = LLVMGetParam(function, num_param++);
3471 }
3472 for (i = num_coords; i < 5; i++) {
3473 /* This is rather unfortunate... */
3474 coords[i] = lp_build_undef(gallivm, type);
3475 }
3476 if (layer) {
3477 coords[layer] = LLVMGetParam(function, num_param++);
3478 }
3479 if (sample_key & LP_SAMPLER_SHADOW) {
3480 coords[4] = LLVMGetParam(function, num_param++);
3481 }
3482 if (sample_key & LP_SAMPLER_OFFSETS) {
3483 for (i = 0; i < num_offsets; i++) {
3484 offsets[i] = LLVMGetParam(function, num_param++);
3485 }
3486 }
3487 if (lod_control == LP_SAMPLER_LOD_BIAS ||
3488 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3489 lod = LLVMGetParam(function, num_param++);
3490 }
3491 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3492 for (i = 0; i < num_derivs; i++) {
3493 derivs.ddx[i] = LLVMGetParam(function, num_param++);
3494 derivs.ddy[i] = LLVMGetParam(function, num_param++);
3495 }
3496 deriv_ptr = &derivs;
3497 }
3498
3499 assert(num_args == num_param);
3500
3501 /*
3502 * Function body
3503 */
3504
3505 old_builder = gallivm->builder;
3506 block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
3507 gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
3508 LLVMPositionBuilderAtEnd(gallivm->builder, block);
3509
3510 lp_build_sample_soa_code(gallivm,
3511 static_texture_state,
3512 static_sampler_state,
3513 dynamic_state,
3514 type,
3515 sample_key,
3516 texture_index,
3517 sampler_index,
3518 context_ptr,
3519 thread_data_ptr,
3520 coords,
3521 offsets,
3522 deriv_ptr,
3523 lod,
3524 texel_out);
3525
3526 LLVMBuildAggregateRet(gallivm->builder, texel_out, 4);
3527
3528 LLVMDisposeBuilder(gallivm->builder);
3529 gallivm->builder = old_builder;
3530
3531 gallivm_verify_function(gallivm, function);
3532 }
3533
3534
3535 /**
3536 * Call the matching function for texture sampling.
3537 * If there's no match, generate a new one.
3538 */
3539 static void
3540 lp_build_sample_soa_func(struct gallivm_state *gallivm,
3541 const struct lp_static_texture_state *static_texture_state,
3542 const struct lp_static_sampler_state *static_sampler_state,
3543 struct lp_sampler_dynamic_state *dynamic_state,
3544 const struct lp_sampler_params *params)
3545 {
3546 LLVMBuilderRef builder = gallivm->builder;
3547 LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(
3548 LLVMGetInsertBlock(builder)));
3549 LLVMValueRef function, inst;
3550 LLVMValueRef args[LP_MAX_TEX_FUNC_ARGS];
3551 LLVMBasicBlockRef bb;
3552 LLVMValueRef tex_ret;
3553 unsigned num_args = 0;
3554 char func_name[64];
3555 unsigned i, num_coords, num_derivs, num_offsets, layer;
3556 unsigned texture_index = params->texture_index;
3557 unsigned sampler_index = params->sampler_index;
3558 unsigned sample_key = params->sample_key;
3559 const LLVMValueRef *coords = params->coords;
3560 const LLVMValueRef *offsets = params->offsets;
3561 const struct lp_derivatives *derivs = params->derivs;
3562 enum lp_sampler_lod_control lod_control;
3563 boolean need_cache = FALSE;
3564
3565 lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3566 LP_SAMPLER_LOD_CONTROL_SHIFT;
3567
3568 get_target_info(static_texture_state->target,
3569 &num_coords, &num_derivs, &num_offsets, &layer);
3570
3571 if (dynamic_state->cache_ptr) {
3572 const struct util_format_description *format_desc;
3573 format_desc = util_format_description(static_texture_state->format);
3574 if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3575 need_cache = TRUE;
3576 }
3577 }
3578 /*
3579 * texture function matches are found by name.
3580 * Thus the name has to include both the texture and sampler unit
3581 * (which covers all static state) plus the actual texture function
3582 * (including things like offsets, shadow coord, lod control).
3583 * Additionally lod_property has to be included too.
3584 */
3585
3586 snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
3587 texture_index, sampler_index, sample_key);
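/* e.g. texture unit 2, sampler unit 1, sample_key 0x53 gives "texfunc_res_2_sam_1_53" */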
3588
3589 function = LLVMGetNamedFunction(module, func_name);
3590
3591 if (!function) {
3592 LLVMTypeRef arg_types[LP_MAX_TEX_FUNC_ARGS];
3593 LLVMTypeRef ret_type;
3594 LLVMTypeRef function_type;
3595 LLVMTypeRef val_type[4];
3596 unsigned num_param = 0;
3597
3598 /*
3599 * Generate the function prototype.
3600 */
3601
3602 arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
3603 if (need_cache) {
3604 arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
3605 }
3606 for (i = 0; i < num_coords; i++) {
3607 arg_types[num_param++] = LLVMTypeOf(coords[0]);
3608 assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
3609 }
3610 if (layer) {
3611 arg_types[num_param++] = LLVMTypeOf(coords[layer]);
3612 assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[layer]));
3613 }
3614 if (sample_key & LP_SAMPLER_SHADOW) {
3615 arg_types[num_param++] = LLVMTypeOf(coords[0]);
3616 }
3617 if (sample_key & LP_SAMPLER_OFFSETS) {
3618 for (i = 0; i < num_offsets; i++) {
3619 arg_types[num_param++] = LLVMTypeOf(offsets[0]);
3620 assert(LLVMTypeOf(offsets[0]) == LLVMTypeOf(offsets[i]));
3621 }
3622 }
3623 if (lod_control == LP_SAMPLER_LOD_BIAS ||
3624 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3625 arg_types[num_param++] = LLVMTypeOf(params->lod);
3626 }
3627 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3628 for (i = 0; i < num_derivs; i++) {
3629 arg_types[num_param++] = LLVMTypeOf(derivs->ddx[i]);
3630 arg_types[num_param++] = LLVMTypeOf(derivs->ddy[i]);
3631 assert(LLVMTypeOf(derivs->ddx[0]) == LLVMTypeOf(derivs->ddx[i]));
3632 assert(LLVMTypeOf(derivs->ddy[0]) == LLVMTypeOf(derivs->ddy[i]));
3633 }
3634 }
3635
3636 val_type[0] = val_type[1] = val_type[2] = val_type[3] =
3637 lp_build_vec_type(gallivm, params->type);
3638 ret_type = LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
3639 function_type = LLVMFunctionType(ret_type, arg_types, num_param, 0);
3640 function = LLVMAddFunction(module, func_name, function_type);
3641
3642 for (i = 0; i < num_param; ++i) {
3643 if (LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
3644
3645 lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
3646 }
3647 }
3648
3649 LLVMSetFunctionCallConv(function, LLVMFastCallConv);
3650 LLVMSetLinkage(function, LLVMInternalLinkage);
3651
3652 lp_build_sample_gen_func(gallivm,
3653 static_texture_state,
3654 static_sampler_state,
3655 dynamic_state,
3656 params->type,
3657 texture_index,
3658 sampler_index,
3659 function,
3660 num_param,
3661 sample_key);
3662 }
3663
3664 num_args = 0;
3665 args[num_args++] = params->context_ptr;
3666 if (need_cache) {
3667 args[num_args++] = params->thread_data_ptr;
3668 }
3669 for (i = 0; i < num_coords; i++) {
3670 args[num_args++] = coords[i];
3671 }
3672 if (layer) {
3673 args[num_args++] = coords[layer];
3674 }
3675 if (sample_key & LP_SAMPLER_SHADOW) {
3676 args[num_args++] = coords[4];
3677 }
3678 if (sample_key & LP_SAMPLER_OFFSETS) {
3679 for (i = 0; i < num_offsets; i++) {
3680 args[num_args++] = offsets[i];
3681 }
3682 }
3683 if (lod_control == LP_SAMPLER_LOD_BIAS ||
3684 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3685 args[num_args++] = params->lod;
3686 }
3687 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3688 for (i = 0; i < num_derivs; i++) {
3689 args[num_args++] = derivs->ddx[i];
3690 args[num_args++] = derivs->ddy[i];
3691 }
3692 }
3693
3694 assert(num_args <= LP_MAX_TEX_FUNC_ARGS);
3695
3696 tex_ret = LLVMBuildCall(builder, function, args, num_args, "");
3697 bb = LLVMGetInsertBlock(builder);
3698 inst = LLVMGetLastInstruction(bb);
3699 LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
3700
3701 for (i = 0; i < 4; i++) {
3702 params->texel[i] = LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
3703 }
3704 }
3705
3706
3707 /**
3708 * Build texture sampling code.
3709 * Either via a function call or inline it directly.
3710 */
3711 void
3712 lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
3713 const struct lp_static_sampler_state *static_sampler_state,
3714 struct lp_sampler_dynamic_state *dynamic_state,
3715 struct gallivm_state *gallivm,
3716 const struct lp_sampler_params *params)
3717 {
3718 boolean use_tex_func = FALSE;
3719
3720 /*
3721 * Do not use a function call if the sampling is "simple enough".
3722 * We define this by
3723 * a) format
3724 * b) no mips (either one level only or no mip filter)
3725 * No mips will definitely make the code smaller, though
3726 * the format requirement is a bit iffy - there are some (SoA) formats
3727 * which definitely generate less code. This does happen to catch
3728 * some important cases, though, which are hurt quite a bit by using
3729 * a call (not really because of the call overhead, but because
3730 * they reuse the same texture unit with some of the same
3731 * parameters).
3732 * Ideally we'd let llvm recognize this stuff by doing IPO passes.
3733 */
3734
3735 if (USE_TEX_FUNC_CALL) {
3736 const struct util_format_description *format_desc;
3737 boolean simple_format;
3738 boolean simple_tex;
3739 enum lp_sampler_op_type op_type;
3740 format_desc = util_format_description(static_texture_state->format);
3741 simple_format = !format_desc ||
3742 (util_format_is_rgba8_variant(format_desc) &&
3743 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
3744
3745 op_type = (params->sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3746 LP_SAMPLER_OP_TYPE_SHIFT;
3747 simple_tex =
3748 op_type != LP_SAMPLER_OP_TEXTURE ||
3749 ((static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE ||
3750 static_texture_state->level_zero_only) &&
3751 static_sampler_state->min_img_filter == static_sampler_state->mag_img_filter);
3752
3753 use_tex_func = format_desc && !(simple_format && simple_tex);
3754 }
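/*
 * E.g. a mip-mapped B8G8R8A8_UNORM sample with a real mip filter fails
 * the simple_tex test and goes through the cached function, while the
 * same format with level_zero_only and matching min/mag filters is
 * inlined directly.
 */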
3755
3756 if (use_tex_func) {
3757 lp_build_sample_soa_func(gallivm,
3758 static_texture_state,
3759 static_sampler_state,
3760 dynamic_state,
3761 params);
3762 }
3763 else {
3764 lp_build_sample_soa_code(gallivm,
3765 static_texture_state,
3766 static_sampler_state,
3767 dynamic_state,
3768 params->type,
3769 params->sample_key,
3770 params->texture_index,
3771 params->sampler_index,
3772 params->context_ptr,
3773 params->thread_data_ptr,
3774 params->coords,
3775 params->offsets,
3776 params->derivs,
3777 params->lod,
3778 params->texel);
3779 }
3780 }
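/*
 * A minimal caller sketch (illustrative only -- assumes the static/dynamic
 * state and the coordinate/texel arrays were set up elsewhere):
 *
 *    struct lp_sampler_params params;
 *    memset(&params, 0, sizeof params);
 *    params.type = type;                 // e.g. 8-wide float soa vectors
 *    params.sample_key = sample_key;     // op type, lod control, etc.
 *    params.texture_index = 0;
 *    params.sampler_index = 0;
 *    params.context_ptr = context_ptr;
 *    params.coords = coords;             // s/t/r (+ layer/shadow) vectors
 *    params.texel = texel_out;           // receives the 4 result channels
 *    lp_build_sample_soa(static_texture_state, static_sampler_state,
 *                        dynamic_state, gallivm, &params);
 */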
3781
3782
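/**
 * Build code for a texture size query (e.g. resinfo/sviewinfo).
 * Writes the per-level minified width/height/depth (and, for arrays, the
 * layer count) to params->sizes_out; for sviewinfo queries with an
 * explicit_lod, component 3 additionally receives the number of mip levels.
 */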
3783 void
3784 lp_build_size_query_soa(struct gallivm_state *gallivm,
3785 const struct lp_static_texture_state *static_state,
3786 struct lp_sampler_dynamic_state *dynamic_state,
3787 const struct lp_sampler_size_query_params *params)
3788 {
3789 LLVMValueRef lod, level = 0, size;
3790 LLVMValueRef first_level = NULL;
3791 int dims, i;
3792 boolean has_array;
3793 unsigned num_lods = 1;
3794 struct lp_build_context bld_int_vec4;
3795 LLVMValueRef context_ptr = params->context_ptr;
3796 unsigned texture_unit = params->texture_unit;
3797 unsigned target = params->target;
3798
3799 if (static_state->format == PIPE_FORMAT_NONE) {
3800 /*
3801 * If there's nothing bound, format is NONE, and we must return
3802 * all zero as mandated by d3d10 in this case.
3803 */
3804 unsigned chan;
3805 LLVMValueRef zero = lp_build_const_vec(gallivm, params->int_type, 0.0F);
3806 for (chan = 0; chan < 4; chan++) {
3807 params->sizes_out[chan] = zero;
3808 }
3809 return;
3810 }
3811
3812 /*
3813 * Do some sanity verification about bound texture and shader dcl target.
3814 * Not entirely sure what's possible but assume array/non-array
3815 * always compatible (probably not ok for OpenGL but d3d10 has no
3816 * distinction of arrays at the resource level).
3817 * Everything else looks bogus (though not entirely sure about rect/2d).
3818 * Currently disabled because it causes assertion failures if there's
3819 * nothing bound (or rather a dummy texture, not that this case would
3820 * return the right values).
3821 */
3822 if (0 && static_state->target != target) {
3823 if (static_state->target == PIPE_TEXTURE_1D)
3824 assert(target == PIPE_TEXTURE_1D_ARRAY);
3825 else if (static_state->target == PIPE_TEXTURE_1D_ARRAY)
3826 assert(target == PIPE_TEXTURE_1D);
3827 else if (static_state->target == PIPE_TEXTURE_2D)
3828 assert(target == PIPE_TEXTURE_2D_ARRAY);
3829 else if (static_state->target == PIPE_TEXTURE_2D_ARRAY)
3830 assert(target == PIPE_TEXTURE_2D);
3831 else if (static_state->target == PIPE_TEXTURE_CUBE)
3832 assert(target == PIPE_TEXTURE_CUBE_ARRAY);
3833 else if (static_state->target == PIPE_TEXTURE_CUBE_ARRAY)
3834 assert(target == PIPE_TEXTURE_CUBE);
3835 else
3836 assert(0);
3837 }
3838
3839 dims = texture_dims(target);
3840
3841 switch (target) {
3842 case PIPE_TEXTURE_1D_ARRAY:
3843 case PIPE_TEXTURE_2D_ARRAY:
3844 case PIPE_TEXTURE_CUBE_ARRAY:
3845 has_array = TRUE;
3846 break;
3847 default:
3848 has_array = FALSE;
3849 break;
3850 }
3851
3852 assert(!params->int_type.floating);
3853
3854 lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));
3855
3856 if (params->explicit_lod) {
3857 /* FIXME: this needs to honor per-element lod */
3858 lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod,
3859 lp_build_const_int32(gallivm, 0), "");
3860 first_level = dynamic_state->first_level(dynamic_state, gallivm,
3861 context_ptr, texture_unit);
3862 level = LLVMBuildAdd(gallivm->builder, lod, first_level, "level");
3863 lod = lp_build_broadcast_scalar(&bld_int_vec4, level);
3864 } else {
3865 lod = bld_int_vec4.zero;
3866 }
3867
3868 size = bld_int_vec4.undef;
3869
3870 size = LLVMBuildInsertElement(gallivm->builder, size,
3871 dynamic_state->width(dynamic_state, gallivm,
3872 context_ptr, texture_unit),
3873 lp_build_const_int32(gallivm, 0), "");
3874
3875 if (dims >= 2) {
3876 size = LLVMBuildInsertElement(gallivm->builder, size,
3877 dynamic_state->height(dynamic_state, gallivm,
3878 context_ptr, texture_unit),
3879 lp_build_const_int32(gallivm, 1), "");
3880 }
3881
3882 if (dims >= 3) {
3883 size = LLVMBuildInsertElement(gallivm->builder, size,
3884 dynamic_state->depth(dynamic_state, gallivm,
3885 context_ptr, texture_unit),
3886 lp_build_const_int32(gallivm, 2), "");
3887 }
3888
3889 size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);
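/* e.g. a 64x16 texture at lod 2 minifies to 16x4 (each dimension clamps to at least 1) */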
3890
3891 if (has_array) {
3892 LLVMValueRef layers = dynamic_state->depth(dynamic_state, gallivm,
3893 context_ptr, texture_unit);
3894 if (target == PIPE_TEXTURE_CUBE_ARRAY) {
3895 /*
3896 * It looks like GL wants the number of cubes; d3d10.1 seems to leave it undefined.
3897 * We could avoid this by passing in the number of cubes instead of the total
3898 * number of layers (might make things easier elsewhere too).
3899 */
3900 LLVMValueRef six = lp_build_const_int32(gallivm, 6);
3901 layers = LLVMBuildSDiv(gallivm->builder, layers, six, "");
3902 }
3903 size = LLVMBuildInsertElement(gallivm->builder, size, layers,
3904 lp_build_const_int32(gallivm, dims), "");
3905 }
3906
3907 /*
3908 * d3d10 requires zero for x/y/z values (but not w, i.e. mip levels)
3909 * if level is out of bounds (note this can't cover unbound texture
3910 * here, which also requires returning zero).
3911 */
3912 if (params->explicit_lod && params->is_sviewinfo) {
3913 LLVMValueRef last_level, out, out1;
3914 struct lp_build_context leveli_bld;
3915
3916 /* everything is scalar for now */
3917 lp_build_context_init(&leveli_bld, gallivm, lp_type_int_vec(32, 32));
3918 last_level = dynamic_state->last_level(dynamic_state, gallivm,
3919 context_ptr, texture_unit);
3920
3921 out = lp_build_cmp(&leveli_bld, PIPE_FUNC_LESS, level, first_level);
3922 out1 = lp_build_cmp(&leveli_bld, PIPE_FUNC_GREATER, level, last_level);
3923 out = lp_build_or(&leveli_bld, out, out1);
3924 if (num_lods == 1) {
3925 out = lp_build_broadcast_scalar(&bld_int_vec4, out);
3926 }
3927 else {
3928 /* TODO */
3929 assert(0);
3930 }
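/* out is ~0 where the level is out of bounds: size &= ~out zeroes those components */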
3931 size = lp_build_andnot(&bld_int_vec4, size, out);
3932 }
3933 for (i = 0; i < dims + (has_array ? 1 : 0); i++) {
3934 params->sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, params->int_type,
3935 size,
3936 lp_build_const_int32(gallivm, i));
3937 }
3938 if (params->is_sviewinfo) {
3939 for (; i < 4; i++) {
3940 params->sizes_out[i] = lp_build_const_vec(gallivm, params->int_type, 0.0);
3941 }
3942 }
3943
3944 /*
3945 * If there's no explicit_lod (buffers, rects), queries requiring the
3946 * number of mips would be illegal.
3947 */
3948 if (params->is_sviewinfo && params->explicit_lod) {
3949 struct lp_build_context bld_int_scalar;
3950 LLVMValueRef num_levels;
3951 lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32));
3952
3953 if (static_state->level_zero_only) {
3954 num_levels = bld_int_scalar.one;
3955 }
3956 else {
3957 LLVMValueRef last_level;
3958
3959 last_level = dynamic_state->last_level(dynamic_state, gallivm,
3960 context_ptr, texture_unit);
3961 num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level);
3962 num_levels = lp_build_add(&bld_int_scalar, num_levels, bld_int_scalar.one);
3963 }
3964 params->sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
3965 num_levels);
3966 }
3967 }
3968
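/**
 * Generate an image atomic op (or cmpxchg) on 32-bit texels.
 * Only R32_UINT/R32_SINT/R32_FLOAT are handled; the vector is processed
 * lane by lane in a loop since LLVM atomics operate on scalars, and lanes
 * that are inactive or out of bounds skip the atomic entirely.
 */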
3969 static void
3970 lp_build_do_atomic_soa(struct gallivm_state *gallivm,
3971 const struct util_format_description *format_desc,
3972 struct lp_type type,
3973 LLVMValueRef exec_mask,
3974 LLVMValueRef base_ptr,
3975 LLVMValueRef offset,
3976 LLVMValueRef out_of_bounds,
3977 unsigned img_op,
3978 LLVMAtomicRMWBinOp op,
3979 const LLVMValueRef rgba_in[4],
3980 const LLVMValueRef rgba2_in[4],
3981 LLVMValueRef atomic_result[4])
3982 {
3983 enum pipe_format format = format_desc->format;
3984
3985 if (format != PIPE_FORMAT_R32_UINT && format != PIPE_FORMAT_R32_SINT && format != PIPE_FORMAT_R32_FLOAT)
3986 return;
3987
3988 LLVMValueRef atom_res = lp_build_alloca(gallivm,
3989 LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), type.length), "");
3990
3991 offset = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");
3992 struct lp_build_loop_state loop_state;
3993 lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
3994 struct lp_build_if_state ifthen;
3995 LLVMValueRef cond;
3996 LLVMValueRef packed = rgba_in[0], packed2 = rgba2_in[0];
3997
3998 LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");
3999 assert(exec_mask);
4000
4001 cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");
4002 cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
4003 lp_build_if(&ifthen, gallivm, cond);
4004
4005 LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed, loop_state.counter, "");
4006 LLVMValueRef cast_base_ptr = LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, "");
4007 cast_base_ptr = LLVMBuildBitCast(gallivm->builder, cast_base_ptr, LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0), "");
4008 data = LLVMBuildBitCast(gallivm->builder, data, LLVMInt32TypeInContext(gallivm->context), "");
4009
4010 if (img_op == LP_IMG_ATOMIC_CAS) {
4011 LLVMValueRef cas_src_ptr = LLVMBuildExtractElement(gallivm->builder, packed2, loop_state.counter, "");
4012 LLVMValueRef cas_src = LLVMBuildBitCast(gallivm->builder, cas_src_ptr, LLVMInt32TypeInContext(gallivm->context), "");
4013 data = LLVMBuildAtomicCmpXchg(gallivm->builder, cast_base_ptr, data,
4014 cas_src,
4015 LLVMAtomicOrderingSequentiallyConsistent,
4016 LLVMAtomicOrderingSequentiallyConsistent,
4017 false);
4018 data = LLVMBuildExtractValue(gallivm->builder, data, 0, "");
4019 } else {
4020 data = LLVMBuildAtomicRMW(gallivm->builder, op,
4021 cast_base_ptr, data,
4022 LLVMAtomicOrderingSequentiallyConsistent,
4023 false);
4024 }
4025
4026 LLVMValueRef temp_res = LLVMBuildLoad(gallivm->builder, atom_res, "");
4027 temp_res = LLVMBuildInsertElement(gallivm->builder, temp_res, data, loop_state.counter, "");
4028 LLVMBuildStore(gallivm->builder, temp_res, atom_res);
4029
4030 lp_build_endif(&ifthen);
4031 lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),
4032 NULL, LLVMIntUGE);
4033 atomic_result[0] = LLVMBuildLoad(gallivm->builder, atom_res, "");
4034 }
4035
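/**
 * Build code for an image load/store/atomic op.
 * Computes the texel offset from the integer coords, accumulates an
 * out-of-bounds mask, and dispatches on params->img_op; ops on an
 * unbound (PIPE_FORMAT_NONE) image return all zero per d3d10.
 */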
4036 void
4037 lp_build_img_op_soa(const struct lp_static_texture_state *static_texture_state,
4038 struct lp_sampler_dynamic_state *dynamic_state,
4039 struct gallivm_state *gallivm,
4040 const struct lp_img_params *params)
4041 {
4042 unsigned target = params->target;
4043 unsigned dims = texture_dims(target);
4044 /** regular scalar int type */
4045 struct lp_type int_type, int_coord_type;
4046 struct lp_build_context int_bld, int_coord_bld;
4047 const struct util_format_description *format_desc = util_format_description(static_texture_state->format);
4048 LLVMValueRef x = params->coords[0], y = params->coords[1], z = params->coords[2];
4049 LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
4050 int_type = lp_type_int(32);
4051 int_coord_type = lp_int_type(params->type);
4052 lp_build_context_init(&int_bld, gallivm, int_type);
4053 lp_build_context_init(&int_coord_bld, gallivm, int_coord_type);
4054
4055 LLVMValueRef offset, i, j;
4056
4057 LLVMValueRef row_stride = dynamic_state->row_stride(dynamic_state, gallivm,
4058 params->context_ptr, params->image_index);
4059 LLVMValueRef img_stride = dynamic_state->img_stride(dynamic_state, gallivm,
4060 params->context_ptr, params->image_index);
4061 LLVMValueRef base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
4062 params->context_ptr, params->image_index);
4063 LLVMValueRef width = dynamic_state->width(dynamic_state, gallivm,
4064 params->context_ptr, params->image_index);
4065 LLVMValueRef height = dynamic_state->height(dynamic_state, gallivm,
4066 params->context_ptr, params->image_index);
4067 LLVMValueRef depth = dynamic_state->depth(dynamic_state, gallivm,
4068 params->context_ptr, params->image_index);
4069 boolean layer_coord = has_layer_coord(target);
4070
4071 width = lp_build_broadcast_scalar(&int_coord_bld, width);
4072 if (dims >= 2) {
4073 height = lp_build_broadcast_scalar(&int_coord_bld, height);
4074 row_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, row_stride);
4075 }
4076 if (dims >= 3 || layer_coord) {
4077 depth = lp_build_broadcast_scalar(&int_coord_bld, depth);
4078 img_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, img_stride);
4079 }
4080
4081 LLVMValueRef out_of_bounds = int_coord_bld.zero;
4082 LLVMValueRef out1;
4083 out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
4084 out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4085
4086 if (dims >= 2) {
4087 out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
4088 out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4089 }
4090 if (dims >= 3) {
4091 out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
4092 out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4093 }
4094 lp_build_sample_offset(&int_coord_bld,
4095 format_desc,
4096 x, y, z, row_stride_vec, img_stride_vec,
4097 &offset, &i, &j);
4098
4099 if (params->img_op == LP_IMG_LOAD) {
4100 struct lp_type texel_type = params->type;
4101 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
4102 format_desc->channel[0].pure_integer) {
4103 if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
4104 texel_type = lp_type_int_vec(params->type.width, params->type.width * params->type.length);
4105 } else if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
4106 texel_type = lp_type_uint_vec(params->type.width, params->type.width * params->type.length);
4107 }
4108 }
4109
4110 if (static_texture_state->format == PIPE_FORMAT_NONE) {
4111 /*
4112 * If there's nothing bound, format is NONE, and we must return
4113 * all zero as mandated by d3d10 in this case.
4114 */
4115 unsigned chan;
4116 LLVMValueRef zero = lp_build_zero(gallivm, params->type);
4117 for (chan = 0; chan < 4; chan++) {
4118 params->outdata[chan] = zero;
4119 }
4120 return;
4121 }
4122
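/* zero the offsets of out-of-bounds lanes so the fetch below stays in
 * bounds; their results are replaced with zero via the selects afterwards.
 */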
4123 offset = lp_build_andnot(&int_coord_bld, offset, out_of_bounds);
4124 struct lp_build_context texel_bld;
4125 lp_build_context_init(&texel_bld, gallivm, texel_type);
4126 lp_build_fetch_rgba_soa(gallivm,
4127 format_desc,
4128 texel_type, TRUE,
4129 base_ptr, offset,
4130 i, j,
4131 NULL,
4132 params->outdata);
4133
4134 for (unsigned chan = 0; chan < 4; chan++) {
4135 params->outdata[chan] = lp_build_select(&texel_bld, out_of_bounds,
4136 texel_bld.zero, params->outdata[chan]);
4137 }
4138 } else if (params->img_op == LP_IMG_STORE) {
4139 if (static_texture_state->format == PIPE_FORMAT_NONE)
4140 return;
4141 lp_build_store_rgba_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds,
4142 params->indata);
4143 } else {
4144 if (static_texture_state->format == PIPE_FORMAT_NONE)
4145 return;
4146 lp_build_do_atomic_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds,
4147 params->img_op, params->op, params->indata, params->indata2, params->outdata);
4148 }
4149 }