gallivm: use fallback code for mul_hi with llvm >= 7.0
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_sample_soa.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 /**
29 * @file
30 * Texture sampling -- SoA.
31 *
32 * @author Jose Fonseca <jfonseca@vmware.com>
33 * @author Brian Paul <brianp@vmware.com>
34 */
35
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "pipe/p_shader_tokens.h"
39 #include "util/u_debug.h"
40 #include "util/u_dump.h"
41 #include "util/u_memory.h"
42 #include "util/u_math.h"
43 #include "util/u_format.h"
44 #include "util/u_cpu_detect.h"
45 #include "util/format_rgb9e5.h"
46 #include "lp_bld_debug.h"
47 #include "lp_bld_type.h"
48 #include "lp_bld_const.h"
49 #include "lp_bld_conv.h"
50 #include "lp_bld_arit.h"
51 #include "lp_bld_bitarit.h"
52 #include "lp_bld_logic.h"
53 #include "lp_bld_printf.h"
54 #include "lp_bld_swizzle.h"
55 #include "lp_bld_flow.h"
56 #include "lp_bld_gather.h"
57 #include "lp_bld_format.h"
58 #include "lp_bld_sample.h"
59 #include "lp_bld_sample_aos.h"
60 #include "lp_bld_struct.h"
61 #include "lp_bld_quad.h"
62 #include "lp_bld_pack.h"
63 #include "lp_bld_intr.h"
64 #include "lp_bld_misc.h"
65
66
67 /**
68 * Generate code to fetch a texel from a texture at int coords (x, y, z).
69 * The computation depends on whether the texture is 1D, 2D or 3D.
70 * The result, texel, will be float vectors:
71 * texel[0] = red values
72 * texel[1] = green values
73 * texel[2] = blue values
74 * texel[3] = alpha values
75 */
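/*
 * (In the SoA layout each of these is one vector holding that channel
 * for all the pixels in the batch, rather than one vector per pixel.)
 */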
76 static void
77 lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
78 LLVMValueRef width,
79 LLVMValueRef height,
80 LLVMValueRef depth,
81 LLVMValueRef x,
82 LLVMValueRef y,
83 LLVMValueRef z,
84 LLVMValueRef y_stride,
85 LLVMValueRef z_stride,
86 LLVMValueRef data_ptr,
87 LLVMValueRef mipoffsets,
88 LLVMValueRef texel_out[4])
89 {
90 const struct lp_static_sampler_state *static_state = bld->static_sampler_state;
91 const unsigned dims = bld->dims;
92 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
93 LLVMBuilderRef builder = bld->gallivm->builder;
94 LLVMValueRef offset;
95 LLVMValueRef i, j;
96 LLVMValueRef use_border = NULL;
97
98 /* use_border = x < 0 || x >= width || y < 0 || y >= height */
99 if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s,
100 static_state->min_img_filter,
101 static_state->mag_img_filter)) {
102 LLVMValueRef b1, b2;
103 b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
104 b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
105 use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
106 }
107
108 if (dims >= 2 &&
109 lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t,
110 static_state->min_img_filter,
111 static_state->mag_img_filter)) {
112 LLVMValueRef b1, b2;
113 b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
114 b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
115 if (use_border) {
116 use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
117 use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
118 }
119 else {
120 use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
121 }
122 }
123
124 if (dims == 3 &&
125 lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r,
126 static_state->min_img_filter,
127 static_state->mag_img_filter)) {
128 LLVMValueRef b1, b2;
129 b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
130 b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
131 if (use_border) {
132 use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
133 use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
134 }
135 else {
136 use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
137 }
138 }
139
140 /* convert x,y,z coords to linear offset from start of texture, in bytes */
141 lp_build_sample_offset(&bld->int_coord_bld,
142 bld->format_desc,
143 x, y, z, y_stride, z_stride,
144 &offset, &i, &j);
145 if (mipoffsets) {
146 offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
147 }
148
149 if (use_border) {
150 /* If we can sample the border color, it means that texcoords may
151 * lie outside the bounds of the texture image. We need to do
152 * something to prevent reading out of bounds and causing a segfault.
153 *
154 * Simply AND the computed texel offset with !use_border. This will
155 * cause out-of-bounds offsets to become zero, and offset zero is
156 * guaranteed to be inside the texture image.
157 */
158 offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border);
159 }
160
161 lp_build_fetch_rgba_soa(bld->gallivm,
162 bld->format_desc,
163 bld->texel_type, TRUE,
164 data_ptr, offset,
165 i, j,
166 bld->cache,
167 texel_out);
168
169 /*
170 * Note: if we find an app which frequently samples the texture border
171 * we might want to implement a true conditional here to avoid sampling
172 * the texture whenever possible (since that's quite a bit of code).
173 * Ex:
174 * if (use_border) {
175 * texel = border_color;
176 * }
177 * else {
178 * texel = sample_texture(coord);
179 * }
180 * As it is now, we always sample the texture, then selectively replace
181 * the texel color results with the border color.
182 */
183
184 if (use_border) {
185 /* select texel color or border color depending on use_border. */
186 const struct util_format_description *format_desc = bld->format_desc;
187 int chan;
188 struct lp_type border_type = bld->texel_type;
189 border_type.length = 4;
190 /*
191 * Only replace channels which are actually present. The others should
192 * get optimized away by the sampler_view swizzle eventually anyway,
193 * and skipping them here is simpler too.
194 */
195 for (chan = 0; chan < 4; chan++) {
196 unsigned chan_s;
197 /* reverse-map channel... */
198 for (chan_s = 0; chan_s < 4; chan_s++) {
199 if (chan_s == format_desc->swizzle[chan]) {
200 break;
201 }
202 }
203 if (chan_s <= 3) {
204 /* use the already clamped color */
205 LLVMValueRef idx = lp_build_const_int32(bld->gallivm, chan);
206 LLVMValueRef border_chan;
207
208 border_chan = lp_build_extract_broadcast(bld->gallivm,
209 border_type,
210 bld->texel_type,
211 bld->border_color_clamped,
212 idx);
213 texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
214 border_chan, texel_out[chan]);
215 }
216 }
217 }
218 }
219
220
221 /**
222 * Helper to compute the mirror function for the PIPE_TEX_WRAP_MIRROR_REPEAT mode.
223 * (Note that with pot sizes we could do this much more easily post-scale
224 * with some bit arithmetic.)
225 */
226 static LLVMValueRef
227 lp_build_coord_mirror(struct lp_build_sample_context *bld,
228 LLVMValueRef coord, boolean posOnly)
229 {
230 struct lp_build_context *coord_bld = &bld->coord_bld;
231 LLVMValueRef fract;
232 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
233
234 /*
235 * We can just use 2*(x - round(0.5*x)) to do all the mirroring,
236 * it all works out. (The result is in range [-1, 1.0], negative if
237 * the coord is in the "odd" section, otherwise positive.)
238 */
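/*
 * For example, coord 1.3: 0.5*1.3 = 0.65, round(0.65) = 1.0, and
 * 2*(0.65 - 1.0) = -0.7, i.e. the mirrored value 0.7 with the negative
 * sign marking the odd section; coord 2.6 gives 2*(1.3 - 1.0) = 0.6,
 * plain repeat behavior in the even section.
 */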
239
240 coord = lp_build_mul(coord_bld, coord, half);
241 fract = lp_build_round(coord_bld, coord);
242 fract = lp_build_sub(coord_bld, coord, fract);
243 coord = lp_build_add(coord_bld, fract, fract);
244
245 if (posOnly) {
246 /*
247 * Theoretically it's not quite 100% accurate because the spec says
248 * that ultimately a scaled coord of -x.0 should map to int coord
249 * -x + 1 with mirroring, not -x (this does not matter for bilinear
250 * filtering).
251 */
252 coord = lp_build_abs(coord_bld, coord);
253 /* kill off NaNs */
254 /* XXX: not safe without arch rounding, fract can be anything. */
255 coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero,
256 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
257 }
258
259 return coord;
260 }
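/*
 * A scalar sketch of the same computation (illustrative only, not
 * part of the generated code; assumes round-to-nearest like
 * lp_build_round), with m in [-1.0, 1.0]:
 *
 *    float x = 0.5f * coord;
 *    float m = 2.0f * (x - nearbyintf(x));
 *    return posOnly ? fabsf(m) : m;
 */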
261
262
263 /**
264 * Helper to compute the first coord and the weight for
265 * linear wrap repeat npot textures
266 */
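/*
 * E.g. length 3, coord 0.5: fract(0.5)*3 - 0.5 = 1.0, so coord0 = 1
 * with weight 0.0. For coord 0.1: 0.1*3 - 0.5 = -0.2, which floors
 * to -1 with fract 0.8; the select below then wraps coord0 to
 * length - 1 = 2 with weight 0.8, the correct REPEAT neighbor.
 */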
267 void
268 lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
269 LLVMValueRef coord_f,
270 LLVMValueRef length_i,
271 LLVMValueRef length_f,
272 LLVMValueRef *coord0_i,
273 LLVMValueRef *weight_f)
274 {
275 struct lp_build_context *coord_bld = &bld->coord_bld;
276 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
277 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
278 LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
279 int_coord_bld->one);
280 LLVMValueRef mask;
281 /* wrap with normalized floats is just fract */
282 coord_f = lp_build_fract(coord_bld, coord_f);
283 /* mul by size and subtract 0.5 */
284 coord_f = lp_build_mul(coord_bld, coord_f, length_f);
285 coord_f = lp_build_sub(coord_bld, coord_f, half);
286 /*
287 * we avoided the 0.5/length division before the repeat wrap,
288 * now need to fix up edge cases with selects
289 */
290 /*
291 * Note we do a float (unordered) compare so we can eliminate NaNs.
292 * (Otherwise would need fract_safe above).
293 */
294 mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
295 PIPE_FUNC_LESS, coord_f, coord_bld->zero);
296
297 /* convert to int, compute lerp weight */
298 lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
299 *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
300 }
301
302
303 /**
304 * Build LLVM code for texture wrap mode for linear filtering.
305 * \param x0_out returns first integer texcoord
306 * \param x1_out returns second integer texcoord
307 * \param weight_out returns linear interpolation weight
308 */
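/*
 * E.g. PIPE_TEX_WRAP_REPEAT on a pot size: coord 0.75 and length 4
 * scale to 3.0, minus 0.5 gives 2.5, hence coord0 = 2, coord1 = 3
 * and weight = 0.5; the AND with length - 1 then wraps coords
 * falling off either end.
 */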
309 static void
310 lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
311 boolean is_gather,
312 LLVMValueRef coord,
313 LLVMValueRef length,
314 LLVMValueRef length_f,
315 LLVMValueRef offset,
316 boolean is_pot,
317 unsigned wrap_mode,
318 LLVMValueRef *x0_out,
319 LLVMValueRef *x1_out,
320 LLVMValueRef *weight_out)
321 {
322 struct lp_build_context *coord_bld = &bld->coord_bld;
323 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
324 LLVMBuilderRef builder = bld->gallivm->builder;
325 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
326 LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
327 LLVMValueRef coord0, coord1, weight;
328
329 switch(wrap_mode) {
330 case PIPE_TEX_WRAP_REPEAT:
331 if (is_pot) {
332 /* mul by size and subtract 0.5 */
333 coord = lp_build_mul(coord_bld, coord, length_f);
334 coord = lp_build_sub(coord_bld, coord, half);
335 if (offset) {
336 offset = lp_build_int_to_float(coord_bld, offset);
337 coord = lp_build_add(coord_bld, coord, offset);
338 }
339 /* convert to int, compute lerp weight */
340 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
341 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
342 /* repeat wrap */
343 coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
344 coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
345 }
346 else {
347 LLVMValueRef mask;
348 if (offset) {
349 offset = lp_build_int_to_float(coord_bld, offset);
350 offset = lp_build_div(coord_bld, offset, length_f);
351 coord = lp_build_add(coord_bld, coord, offset);
352 }
353 lp_build_coord_repeat_npot_linear(bld, coord,
354 length, length_f,
355 &coord0, &weight);
356 mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
357 PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
358 coord1 = LLVMBuildAnd(builder,
359 lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
360 mask, "");
361 }
362 break;
363
364 case PIPE_TEX_WRAP_CLAMP:
365 if (bld->static_sampler_state->normalized_coords) {
366 /* scale coord to length */
367 coord = lp_build_mul(coord_bld, coord, length_f);
368 }
369 if (offset) {
370 offset = lp_build_int_to_float(coord_bld, offset);
371 coord = lp_build_add(coord_bld, coord, offset);
372 }
373
374 /*
375 * clamp to [0, length]
376 *
377 * Unlike some other wrap modes, this should be correct for gather
378 * too. GL_CLAMP explicitly does this clamp on the coord prior to
379 * actual wrapping (which is per sample).
380 */
381 coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
382
383 coord = lp_build_sub(coord_bld, coord, half);
384
385 /* convert to int, compute lerp weight */
386 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
387 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
388 break;
389
390 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
391 {
392 struct lp_build_context abs_coord_bld = bld->coord_bld;
393 abs_coord_bld.type.sign = FALSE;
394
395 if (bld->static_sampler_state->normalized_coords) {
396 /* mul by tex size */
397 coord = lp_build_mul(coord_bld, coord, length_f);
398 }
399 if (offset) {
400 offset = lp_build_int_to_float(coord_bld, offset);
401 coord = lp_build_add(coord_bld, coord, offset);
402 }
403
404 /* clamp to length max */
405 coord = lp_build_min_ext(coord_bld, coord, length_f,
406 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
407 if (!is_gather) {
408 /* subtract 0.5 */
409 coord = lp_build_sub(coord_bld, coord, half);
410 /* clamp to [0, length - 0.5] */
411 coord = lp_build_max(coord_bld, coord, coord_bld->zero);
412 /* convert to int, compute lerp weight */
413 lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
414 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
415 } else {
416 /*
417 * The non-gather path will end up with coords 0, 1 if coord was
418 * smaller than 0.5 (with corresponding weight 0.0 so it doesn't
419 * really matter what the second coord is). But for gather, we
420 * really need to end up with coords 0, 0.
421 */
422 coord = lp_build_max(coord_bld, coord, coord_bld->zero);
423 coord0 = lp_build_sub(coord_bld, coord, half);
424 coord1 = lp_build_add(coord_bld, coord, half);
425 /* Values range: [-0.5, length_f - 0.5] and [0.5, length_f + 0.5] */
426 coord0 = lp_build_itrunc(coord_bld, coord0);
427 coord1 = lp_build_itrunc(coord_bld, coord1);
428 weight = coord_bld->undef;
429 }
430 /* coord1 = min(coord1, length-1) */
431 coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
432 break;
433 }
434
435 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
436 if (bld->static_sampler_state->normalized_coords) {
437 /* scale coord to length */
438 coord = lp_build_mul(coord_bld, coord, length_f);
439 }
440 if (offset) {
441 offset = lp_build_int_to_float(coord_bld, offset);
442 coord = lp_build_add(coord_bld, coord, offset);
443 }
444 /*
445 * We don't need any clamp. Technically, for very large (pos or neg,
446 * or infinite) values, clamp against [-length, length] would be
447 * correct, but we don't need to guarantee any specific
448 * result for such coords (the ifloor will be undefined, but for modes
449 * requiring border all resulting coords are safe).
450 */
451 coord = lp_build_sub(coord_bld, coord, half);
452 /* convert to int, compute lerp weight */
453 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
454 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
455 break;
456
457 case PIPE_TEX_WRAP_MIRROR_REPEAT:
458 if (offset) {
459 offset = lp_build_int_to_float(coord_bld, offset);
460 offset = lp_build_div(coord_bld, offset, length_f);
461 coord = lp_build_add(coord_bld, coord, offset);
462 }
463 if (!is_gather) {
464 /* compute mirror function */
465 coord = lp_build_coord_mirror(bld, coord, TRUE);
466
467 /* scale coord to length */
468 coord = lp_build_mul(coord_bld, coord, length_f);
469 coord = lp_build_sub(coord_bld, coord, half);
470
471 /* convert to int, compute lerp weight */
472 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
473 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
474
475 /* coord0 = max(coord0, 0) */
476 coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
477 /* coord1 = min(coord1, length-1) */
478 coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
479 } else {
480 /*
481 * This is pretty reasonable in the end; all the tests care
482 * about is nasty edge cases (scaled coords x.5, so the individual
483 * coords are actually integers, which is REALLY tricky to get right
484 * since it works differently both for negative numbers and for the
485 * even/odd cases). But with enough magic it's not too complex
486 * after all.
487 * Maybe we should try a bit-arithmetic variant for POT textures though...
488 */
489 LLVMValueRef isNeg;
490 /*
491 * Wrapping just once still works, even though it means we can
492 * get "wrong" sign due to performing mirror in the middle of the
493 * two coords (because this can only happen very near the odd/even
494 * edges, so both coords will actually end up as 0 or length - 1
495 * in the end).
496 * For GL4 gather with per-sample offsets we'd need to do the
497 * mirroring per coord too.
498 */
499 coord = lp_build_coord_mirror(bld, coord, FALSE);
500 coord = lp_build_mul(coord_bld, coord, length_f);
501
502 /*
503 * NaNs should be safe here, we'll do away with them with
504 * the ones' complement plus min.
505 */
506 coord0 = lp_build_sub(coord_bld, coord, half);
507 coord0 = lp_build_ifloor(coord_bld, coord0);
508 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
509 /* ones complement for neg numbers (mirror(negX) = X - 1) */
510 isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
511 coord0, int_coord_bld->zero);
512 coord0 = lp_build_xor(int_coord_bld, coord0, isNeg);
513 isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
514 coord1, int_coord_bld->zero);
515 coord1 = lp_build_xor(int_coord_bld, coord1, isNeg);
516 coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
517 coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
518
519 weight = coord_bld->undef;
520 }
521 break;
522
523 case PIPE_TEX_WRAP_MIRROR_CLAMP:
524 if (bld->static_sampler_state->normalized_coords) {
525 /* scale coord to length */
526 coord = lp_build_mul(coord_bld, coord, length_f);
527 }
528 if (offset) {
529 offset = lp_build_int_to_float(coord_bld, offset);
530 coord = lp_build_add(coord_bld, coord, offset);
531 }
532 /*
533 * XXX: probably not correct for gather, albeit I'm not
534 * entirely sure as it's poorly specified. The wrapping looks
535 * correct according to the spec (which is written against gl 1.2.1);
536 * however negative values will be swapped - gl re-specified
537 * wrapping in newer versions (no more pre-clamp except with
538 * GL_CLAMP).
539 */
540 coord = lp_build_abs(coord_bld, coord);
541
542 /* clamp to [0, length] */
543 coord = lp_build_min_ext(coord_bld, coord, length_f,
544 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
545
546 coord = lp_build_sub(coord_bld, coord, half);
547
548 /* convert to int, compute lerp weight */
549 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
550 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
551 break;
552
553 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
554 {
555 struct lp_build_context abs_coord_bld = bld->coord_bld;
556 abs_coord_bld.type.sign = FALSE;
557
558 if (bld->static_sampler_state->normalized_coords) {
559 /* scale coord to length */
560 coord = lp_build_mul(coord_bld, coord, length_f);
561 }
562 if (offset) {
563 offset = lp_build_int_to_float(coord_bld, offset);
564 coord = lp_build_add(coord_bld, coord, offset);
565 }
566 if (!is_gather) {
567 coord = lp_build_abs(coord_bld, coord);
568
569 /* clamp to length max */
570 coord = lp_build_min_ext(coord_bld, coord, length_f,
571 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
572 /* subtract 0.5 */
573 coord = lp_build_sub(coord_bld, coord, half);
574 /* clamp to [0, length - 0.5] */
575 coord = lp_build_max(coord_bld, coord, coord_bld->zero);
576
577 /* convert to int, compute lerp weight */
578 lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
579 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
580 /* coord1 = min(coord1, length-1) */
581 coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
582 } else {
583 /*
584 * The non-gather path will swap coord0/1 if coord was negative,
585 * which is ok for filtering since the filter weight matches
586 * accordingly. Also, if coord is close to zero, coord0/1 will
587 * be 0 and 1, instead of 0 and 0 (again ok due to filter
588 * weight being 0.0). Both issues need to be fixed for gather.
589 */
590 LLVMValueRef isNeg;
591
592 /*
593 * Actually wanted to cheat here and use:
594 * coord1 = lp_build_iround(coord_bld, coord);
595 * but it's not good enough for some tests (even piglit
596 * textureGather is set up in a way so the coords are always
597 * .5, that is right at the crossover points).
598 * So do ordinary sub/floor, then do ones' complement
599 * for negative numbers.
600 * (Note we can't just do sub|add/abs/itrunc per coord either -
601 * because the spec demands that mirror(3.0) = 3 but
602 * mirror(-3.0) = 2.)
603 */
604 coord = lp_build_sub(coord_bld, coord, half);
605 coord0 = lp_build_ifloor(coord_bld, coord);
606 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
607 isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord0,
608 int_coord_bld->zero);
609 coord0 = lp_build_xor(int_coord_bld, isNeg, coord0);
610 coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
611
612 isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord1,
613 int_coord_bld->zero);
614 coord1 = lp_build_xor(int_coord_bld, isNeg, coord1);
615 coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
616
617 weight = coord_bld->undef;
618 }
619 }
620 break;
621
622 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
623 {
624 if (bld->static_sampler_state->normalized_coords) {
625 /* scale coord to length */
626 coord = lp_build_mul(coord_bld, coord, length_f);
627 }
628 if (offset) {
629 offset = lp_build_int_to_float(coord_bld, offset);
630 coord = lp_build_add(coord_bld, coord, offset);
631 }
632 /*
633 * XXX: probably not correct for gather due to swapped
634 * order if coord is negative (same rationale as for
635 * MIRROR_CLAMP).
636 */
637 coord = lp_build_abs(coord_bld, coord);
638
639 /*
640 * We don't need any clamp. Technically, for very large
641 * (or infinite) values, clamp against length would be
642 * correct, but we don't need to guarantee any specific
643 * result for such coords (the ifloor will be undefined, but
644 * for modes requiring border all resulting coords are safe).
645 */
646 coord = lp_build_sub(coord_bld, coord, half);
647
648 /* convert to int, compute lerp weight */
649 lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
650 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
651 }
652 break;
653
654 default:
655 assert(0);
656 coord0 = NULL;
657 coord1 = NULL;
658 weight = NULL;
659 }
660
661 *x0_out = coord0;
662 *x1_out = coord1;
663 *weight_out = weight;
664 }
665
666
667 /**
668 * Build LLVM code for texture wrap mode for nearest filtering.
669 * \param coord the incoming texcoord (nominally in [0,1])
670 * \param length the texture size along one dimension, as int vector
671 * \param length_f the texture size along one dimension, as float vector
672 * \param offset texel offset along one dimension (as int vector)
673 * \param is_pot if TRUE, length is a power of two
674 * \param wrap_mode one of PIPE_TEX_WRAP_x
675 */
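/*
 * E.g. PIPE_TEX_WRAP_REPEAT on a pot size: coord 1.25 and length 8
 * give ifloor(10.0) = 10 and 10 & 7 = 2, the same texel as
 * coord 0.25.
 */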
676 static LLVMValueRef
677 lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
678 LLVMValueRef coord,
679 LLVMValueRef length,
680 LLVMValueRef length_f,
681 LLVMValueRef offset,
682 boolean is_pot,
683 unsigned wrap_mode)
684 {
685 struct lp_build_context *coord_bld = &bld->coord_bld;
686 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
687 LLVMBuilderRef builder = bld->gallivm->builder;
688 LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
689 LLVMValueRef icoord;
690
691 switch(wrap_mode) {
692 case PIPE_TEX_WRAP_REPEAT:
693 if (is_pot) {
694 coord = lp_build_mul(coord_bld, coord, length_f);
695 icoord = lp_build_ifloor(coord_bld, coord);
696 if (offset) {
697 icoord = lp_build_add(int_coord_bld, icoord, offset);
698 }
699 icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
700 }
701 else {
702 if (offset) {
703 offset = lp_build_int_to_float(coord_bld, offset);
704 offset = lp_build_div(coord_bld, offset, length_f);
705 coord = lp_build_add(coord_bld, coord, offset);
706 }
707 /* take fraction, unnormalize */
708 coord = lp_build_fract_safe(coord_bld, coord);
709 coord = lp_build_mul(coord_bld, coord, length_f);
710 icoord = lp_build_itrunc(coord_bld, coord);
711 }
712 break;
713
714 case PIPE_TEX_WRAP_CLAMP:
715 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
716 if (bld->static_sampler_state->normalized_coords) {
717 /* scale coord to length */
718 coord = lp_build_mul(coord_bld, coord, length_f);
719 }
720
721 if (offset) {
722 offset = lp_build_int_to_float(coord_bld, offset);
723 coord = lp_build_add(coord_bld, coord, offset);
724 }
725 /* floor */
726 /* use itrunc instead since we clamp to 0 anyway */
727 icoord = lp_build_itrunc(coord_bld, coord);
728
729 /* clamp to [0, length - 1]. */
730 icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
731 length_minus_one);
732 break;
733
734 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
735 if (bld->static_sampler_state->normalized_coords) {
736 /* scale coord to length */
737 coord = lp_build_mul(coord_bld, coord, length_f);
738 }
739 /* no clamp necessary, border masking will handle this */
740 icoord = lp_build_ifloor(coord_bld, coord);
741 if (offset) {
742 icoord = lp_build_add(int_coord_bld, icoord, offset);
743 }
744 break;
745
746 case PIPE_TEX_WRAP_MIRROR_REPEAT:
747 if (offset) {
748 offset = lp_build_int_to_float(coord_bld, offset);
749 offset = lp_build_div(coord_bld, offset, length_f);
750 coord = lp_build_add(coord_bld, coord, offset);
751 }
752 /* compute mirror function */
753 coord = lp_build_coord_mirror(bld, coord, TRUE);
754
755 /* scale coord to length */
756 assert(bld->static_sampler_state->normalized_coords);
757 coord = lp_build_mul(coord_bld, coord, length_f);
758
759 /* itrunc == ifloor here */
760 icoord = lp_build_itrunc(coord_bld, coord);
761
762 /* clamp to [0, length - 1] */
763 icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
764 break;
765
766 case PIPE_TEX_WRAP_MIRROR_CLAMP:
767 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
768 if (bld->static_sampler_state->normalized_coords) {
769 /* scale coord to length */
770 coord = lp_build_mul(coord_bld, coord, length_f);
771 }
772 if (offset) {
773 offset = lp_build_int_to_float(coord_bld, offset);
774 coord = lp_build_add(coord_bld, coord, offset);
775 }
776 coord = lp_build_abs(coord_bld, coord);
777
778 /* itrunc == ifloor here */
779 icoord = lp_build_itrunc(coord_bld, coord);
780 /*
781 * Use unsigned min due to possible undef values (NaNs, overflow)
782 */
783 {
784 struct lp_build_context abs_coord_bld = *int_coord_bld;
785 abs_coord_bld.type.sign = FALSE;
786 /* clamp to [0, length - 1] */
787 icoord = lp_build_min(&abs_coord_bld, icoord, length_minus_one);
788 }
789 break;
790
791 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
792 if (bld->static_sampler_state->normalized_coords) {
793 /* scale coord to length */
794 coord = lp_build_mul(coord_bld, coord, length_f);
795 }
796 if (offset) {
797 offset = lp_build_int_to_float(coord_bld, offset);
798 coord = lp_build_add(coord_bld, coord, offset);
799 }
800 coord = lp_build_abs(coord_bld, coord);
801
802 /* itrunc == ifloor here */
803 icoord = lp_build_itrunc(coord_bld, coord);
804 break;
805
806 default:
807 assert(0);
808 icoord = NULL;
809 }
810
811 return icoord;
812 }
813
814
815 /**
816 * Do shadow test/comparison.
817 * \param p shadow ref value
818 * \param texel the texel to compare against
819 */
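/*
 * The result lanes are ~0 where the comparison passes and 0
 * elsewhere; callers turn these masks into 0.0/1.0 colors or feed
 * them to the masked lerp helpers below.
 */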
820 static LLVMValueRef
821 lp_build_sample_comparefunc(struct lp_build_sample_context *bld,
822 LLVMValueRef p,
823 LLVMValueRef texel)
824 {
825 struct lp_build_context *texel_bld = &bld->texel_bld;
826 LLVMValueRef res;
827
828 if (0) {
829 //lp_build_print_value(bld->gallivm, "shadow cmp coord", p);
830 lp_build_print_value(bld->gallivm, "shadow cmp texel", texel);
831 }
832
833 /* result = (p FUNC texel) ? 1 : 0 */
834 /*
835 * honor d3d10 floating point rules here, which state that comparisons
836 * are ordered except NOT_EQUAL which is unordered.
837 */
838 if (bld->static_sampler_state->compare_func != PIPE_FUNC_NOTEQUAL) {
839 res = lp_build_cmp_ordered(texel_bld, bld->static_sampler_state->compare_func,
840 p, texel);
841 }
842 else {
843 res = lp_build_cmp(texel_bld, bld->static_sampler_state->compare_func,
844 p, texel);
845 }
846 return res;
847 }
848
849
850 /**
851 * Generate code to sample a mipmap level with nearest filtering.
852 * If sampling a cube texture, r = cube face in [0,5].
853 */
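/*
 * Each axis is wrapped to a single integer texel coord below; the
 * actual (per SoA lane) fetch then happens in
 * lp_build_sample_texel_soa().
 */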
854 static void
855 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
856 LLVMValueRef size,
857 LLVMValueRef row_stride_vec,
858 LLVMValueRef img_stride_vec,
859 LLVMValueRef data_ptr,
860 LLVMValueRef mipoffsets,
861 const LLVMValueRef *coords,
862 const LLVMValueRef *offsets,
863 LLVMValueRef colors_out[4])
864 {
865 const unsigned dims = bld->dims;
866 LLVMValueRef width_vec;
867 LLVMValueRef height_vec;
868 LLVMValueRef depth_vec;
869 LLVMValueRef flt_size;
870 LLVMValueRef flt_width_vec;
871 LLVMValueRef flt_height_vec;
872 LLVMValueRef flt_depth_vec;
873 LLVMValueRef x, y = NULL, z = NULL;
874
875 lp_build_extract_image_sizes(bld,
876 &bld->int_size_bld,
877 bld->int_coord_type,
878 size,
879 &width_vec, &height_vec, &depth_vec);
880
881 flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
882
883 lp_build_extract_image_sizes(bld,
884 &bld->float_size_bld,
885 bld->coord_type,
886 flt_size,
887 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
888
889 /*
890 * Compute integer texcoords.
891 */
892 x = lp_build_sample_wrap_nearest(bld, coords[0], width_vec,
893 flt_width_vec, offsets[0],
894 bld->static_texture_state->pot_width,
895 bld->static_sampler_state->wrap_s);
896 lp_build_name(x, "tex.x.wrapped");
897
898 if (dims >= 2) {
899 y = lp_build_sample_wrap_nearest(bld, coords[1], height_vec,
900 flt_height_vec, offsets[1],
901 bld->static_texture_state->pot_height,
902 bld->static_sampler_state->wrap_t);
903 lp_build_name(y, "tex.y.wrapped");
904
905 if (dims == 3) {
906 z = lp_build_sample_wrap_nearest(bld, coords[2], depth_vec,
907 flt_depth_vec, offsets[2],
908 bld->static_texture_state->pot_depth,
909 bld->static_sampler_state->wrap_r);
910 lp_build_name(z, "tex.z.wrapped");
911 }
912 }
913 if (has_layer_coord(bld->static_texture_state->target)) {
914 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
915 /* add cube layer to face */
916 z = lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
917 }
918 else {
919 z = coords[2];
920 }
921 lp_build_name(z, "tex.z.layer");
922 }
923
924 /*
925 * Get texture colors.
926 */
927 lp_build_sample_texel_soa(bld,
928 width_vec, height_vec, depth_vec,
929 x, y, z,
930 row_stride_vec, img_stride_vec,
931 data_ptr, mipoffsets, colors_out);
932
933 if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
934 LLVMValueRef cmpval;
935 cmpval = lp_build_sample_comparefunc(bld, coords[4], colors_out[0]);
936 /* this is really just an AND of 1.0 with cmpval, but llvm is clever enough */
937 colors_out[0] = lp_build_select(&bld->texel_bld, cmpval,
938 bld->texel_bld.one, bld->texel_bld.zero);
939 colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
940 }
941
942 }
943
944
945 /**
946 * Like a lerp, but inputs are 0/~0 masks, so can simplify slightly.
947 */
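/*
 * I.e. this computes lerp(weight, mask0 ? 1.0 : 0.0, mask1 ? 1.0 : 0.0):
 * since the masks are all-zeros/all-ones bit patterns, ANDing them with
 * the float weight bits selects either the weight or 0.0 without any
 * multiplies.
 */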
948 static LLVMValueRef
949 lp_build_masklerp(struct lp_build_context *bld,
950 LLVMValueRef weight,
951 LLVMValueRef mask0,
952 LLVMValueRef mask1)
953 {
954 struct gallivm_state *gallivm = bld->gallivm;
955 LLVMBuilderRef builder = gallivm->builder;
956 LLVMValueRef weight2;
957
958 weight2 = lp_build_sub(bld, bld->one, weight);
959 weight = LLVMBuildBitCast(builder, weight,
960 lp_build_int_vec_type(gallivm, bld->type), "");
961 weight2 = LLVMBuildBitCast(builder, weight2,
962 lp_build_int_vec_type(gallivm, bld->type), "");
963 weight = LLVMBuildAnd(builder, weight, mask1, "");
964 weight2 = LLVMBuildAnd(builder, weight2, mask0, "");
965 weight = LLVMBuildBitCast(builder, weight, bld->vec_type, "");
966 weight2 = LLVMBuildBitCast(builder, weight2, bld->vec_type, "");
967 return lp_build_add(bld, weight, weight2);
968 }
969
970 /**
971 * Like a 2d lerp, but inputs are 0/~0 masks, so can simplify slightly.
972 */
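/*
 * (Only the first axis needs the masked variant; the second axis is a
 * true lerp since val0/val1 are already real weights in [0, 1].)
 */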
973 static LLVMValueRef
974 lp_build_masklerp2d(struct lp_build_context *bld,
975 LLVMValueRef weight0,
976 LLVMValueRef weight1,
977 LLVMValueRef mask00,
978 LLVMValueRef mask01,
979 LLVMValueRef mask10,
980 LLVMValueRef mask11)
981 {
982 LLVMValueRef val0 = lp_build_masklerp(bld, weight0, mask00, mask01);
983 LLVMValueRef val1 = lp_build_masklerp(bld, weight0, mask10, mask11);
984 return lp_build_lerp(bld, weight1, val0, val1, 0);
985 }
986
987 /*
988 * This is a fair amount of code for something OpenGL merely
989 * recommends but does not require.
990 */
991 #define ACCURATE_CUBE_CORNERS 1
992
993 /**
994 * Generate code to sample a mipmap level with linear filtering.
995 * If sampling a cube texture, r = cube face in [0,5].
996 * If linear_mask is present, only pixels having their mask set
997 * will receive linear filtering, the rest will use nearest.
998 */
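/*
 * Linear filtering needs two texel coords plus a lerp weight per axis,
 * i.e. up to 8 fetches (2x2x2) for a 3D mipmap level.
 */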
999 static void
1000 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
1001 boolean is_gather,
1002 LLVMValueRef size,
1003 LLVMValueRef linear_mask,
1004 LLVMValueRef row_stride_vec,
1005 LLVMValueRef img_stride_vec,
1006 LLVMValueRef data_ptr,
1007 LLVMValueRef mipoffsets,
1008 const LLVMValueRef *coords,
1009 const LLVMValueRef *offsets,
1010 LLVMValueRef colors_out[4])
1011 {
1012 LLVMBuilderRef builder = bld->gallivm->builder;
1013 struct lp_build_context *ivec_bld = &bld->int_coord_bld;
1014 struct lp_build_context *coord_bld = &bld->coord_bld;
1015 struct lp_build_context *texel_bld = &bld->texel_bld;
1016 const unsigned dims = bld->dims;
1017 LLVMValueRef width_vec;
1018 LLVMValueRef height_vec;
1019 LLVMValueRef depth_vec;
1020 LLVMValueRef flt_size;
1021 LLVMValueRef flt_width_vec;
1022 LLVMValueRef flt_height_vec;
1023 LLVMValueRef flt_depth_vec;
1024 LLVMValueRef fall_off[4], have_corners;
1025 LLVMValueRef z1 = NULL;
1026 LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
1027 LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
1028 LLVMValueRef y00 = NULL, y01 = NULL, y10 = NULL, y11 = NULL;
1029 LLVMValueRef s_fpart, t_fpart = NULL, r_fpart = NULL;
1030 LLVMValueRef xs[4], ys[4], zs[4];
1031 LLVMValueRef neighbors[2][2][4];
1032 int chan, texel_index;
1033 boolean seamless_cube_filter, accurate_cube_corners;
1034 unsigned chan_swiz = bld->static_texture_state->swizzle_r;
1035
1036 seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
1037 bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
1038 bld->static_sampler_state->seamless_cube_map;
1039
1040 accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter;
1041
1042 lp_build_extract_image_sizes(bld,
1043 &bld->int_size_bld,
1044 bld->int_coord_type,
1045 size,
1046 &width_vec, &height_vec, &depth_vec);
1047
1048 flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
1049
1050 lp_build_extract_image_sizes(bld,
1051 &bld->float_size_bld,
1052 bld->coord_type,
1053 flt_size,
1054 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
1055
1056 /*
1057 * Compute integer texcoords.
1058 */
1059
1060 if (!seamless_cube_filter) {
1061 lp_build_sample_wrap_linear(bld, is_gather, coords[0], width_vec,
1062 flt_width_vec, offsets[0],
1063 bld->static_texture_state->pot_width,
1064 bld->static_sampler_state->wrap_s,
1065 &x00, &x01, &s_fpart);
1066 lp_build_name(x00, "tex.x0.wrapped");
1067 lp_build_name(x01, "tex.x1.wrapped");
1068 x10 = x00;
1069 x11 = x01;
1070
1071 if (dims >= 2) {
1072 lp_build_sample_wrap_linear(bld, is_gather, coords[1], height_vec,
1073 flt_height_vec, offsets[1],
1074 bld->static_texture_state->pot_height,
1075 bld->static_sampler_state->wrap_t,
1076 &y00, &y10, &t_fpart);
1077 lp_build_name(y00, "tex.y0.wrapped");
1078 lp_build_name(y10, "tex.y1.wrapped");
1079 y01 = y00;
1080 y11 = y10;
1081
1082 if (dims == 3) {
1083 lp_build_sample_wrap_linear(bld, is_gather, coords[2], depth_vec,
1084 flt_depth_vec, offsets[2],
1085 bld->static_texture_state->pot_depth,
1086 bld->static_sampler_state->wrap_r,
1087 &z00, &z1, &r_fpart);
1088 z01 = z10 = z11 = z00;
1089 lp_build_name(z00, "tex.z0.wrapped");
1090 lp_build_name(z1, "tex.z1.wrapped");
1091 }
1092 }
1093 if (has_layer_coord(bld->static_texture_state->target)) {
1094 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1095 /* add cube layer to face */
1096 z00 = z01 = z10 = z11 = z1 =
1097 lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
1098 }
1099 else {
1100 z00 = z01 = z10 = z11 = z1 = coords[2]; /* cube face or layer */
1101 }
1102 lp_build_name(z00, "tex.z0.layer");
1103 lp_build_name(z1, "tex.z1.layer");
1104 }
1105 }
1106 else {
1107 struct lp_build_if_state edge_if;
1108 LLVMTypeRef int1t;
1109 LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
1110 LLVMValueRef coord0, coord1, have_edge, have_corner;
1111 LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
1112 LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
1113 LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
1114 LLVMValueRef face = coords[2];
1115 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5f);
1116 LLVMValueRef length_minus_one = lp_build_sub(ivec_bld, width_vec, ivec_bld->one);
1117 /* XXX drop the height calcs (cube faces are square); could (should) do this without seamless filtering too */
1118 height_vec = width_vec;
1119 flt_height_vec = flt_width_vec;
1120
1121 /* XXX the overflow logic is actually sort of duplicated with trilinear,
1122 * since an overflow in one mip should also have a corresponding overflow
1123 * in another.
1124 */
1125 /* should always have normalized coords, and offsets are undefined */
1126 assert(bld->static_sampler_state->normalized_coords);
1127 /*
1128 * The coords should all be in [0, 1]; however, we can have NaNs,
1129 * which will wreak havoc. In particular the y1_clamped value below
1130 * can be -INT_MAX (on x86) and be propagated right through (probably
1131 * other values might be bogus in the end too).
1132 * So kill off the NaNs here.
1133 */
1134 coord0 = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
1135 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1136 coord0 = lp_build_mul(coord_bld, coord0, flt_width_vec);
1137 /* instead of clamp, build mask if overflowed */
1138 coord0 = lp_build_sub(coord_bld, coord0, half);
1139 /* convert to int, compute lerp weight */
1140 /* not ideal with AVX (and no AVX2) */
1141 lp_build_ifloor_fract(coord_bld, coord0, &x0, &s_fpart);
1142 x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
1143 coord1 = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
1144 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1145 coord1 = lp_build_mul(coord_bld, coord1, flt_height_vec);
1146 coord1 = lp_build_sub(coord_bld, coord1, half);
1147 lp_build_ifloor_fract(coord_bld, coord1, &y0, &t_fpart);
1148 y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
1149
1150 fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
1151 fall_off[1] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, x1, length_minus_one);
1152 fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
1153 fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, length_minus_one);
1154
1155 fall_off_x = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
1156 fall_off_y = lp_build_or(ivec_bld, fall_off[2], fall_off[3]);
1157 have_edge = lp_build_or(ivec_bld, fall_off_x, fall_off_y);
1158 have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_edge);
1159
1160 /* needed for accurate corner filtering branch later, rely on 0 init */
1161 int1t = LLVMInt1TypeInContext(bld->gallivm->context);
1162 have_corners = lp_build_alloca(bld->gallivm, int1t, "have_corner");
1163
1164 for (texel_index = 0; texel_index < 4; texel_index++) {
1165 xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "xs");
1166 ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "ys");
1167 zs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "zs");
1168 }
1169
1170 lp_build_if(&edge_if, bld->gallivm, have_edge);
1171
1172 have_corner = lp_build_and(ivec_bld, fall_off_x, fall_off_y);
1173 have_corner = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_corner);
1174 LLVMBuildStore(builder, have_corner, have_corners);
1175
1176 /*
1177 * Need to feed clamped values here for cheap corner handling,
1178 * but only for the y coord (when falling off both edges we only
1179 * handle the fall-off in x) - this should be sufficient.
1180 */
1181 y0_clamped = lp_build_max(ivec_bld, y0, ivec_bld->zero);
1182 y1_clamped = lp_build_min(ivec_bld, y1, length_minus_one);
1183
1184 /*
1185 * Get all possible new coords.
1186 */
1187 lp_build_cube_new_coords(ivec_bld, face,
1188 x0, x1, y0_clamped, y1_clamped,
1189 length_minus_one,
1190 new_faces, new_xcoords, new_ycoords);
1191
1192 /* handle fall off x-, x+ direction */
1193 /* determine new coords, face (the two fall_off vars cannot both be true at the same time) */
1194 x00 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][0], x0);
1195 y00 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][0], y0_clamped);
1196 x10 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][1], x0);
1197 y10 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][1], y1_clamped);
1198 x01 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][0], x1);
1199 y01 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][0], y0_clamped);
1200 x11 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][1], x1);
1201 y11 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][1], y1_clamped);
1202
1203 z00 = z10 = lp_build_select(ivec_bld, fall_off[0], new_faces[0], face);
1204 z01 = z11 = lp_build_select(ivec_bld, fall_off[1], new_faces[1], face);
1205
1206 /* handle fall off y-, y+ direction */
1207 /*
1208 * Cheap corner logic: just hack up things so a texel doesn't fall
1209 * off both sides (which means filter weights will be wrong but we'll only
1210 * use valid texels in the filter).
1211 * This means however (y) coords must additionally be clamped (see above).
1212 * This corner handling should be fully OpenGL (but not d3d10) compliant.
1213 */
1214 fall_off_ym_notxm = lp_build_andnot(ivec_bld, fall_off[2], fall_off[0]);
1215 fall_off_ym_notxp = lp_build_andnot(ivec_bld, fall_off[2], fall_off[1]);
1216 fall_off_yp_notxm = lp_build_andnot(ivec_bld, fall_off[3], fall_off[0]);
1217 fall_off_yp_notxp = lp_build_andnot(ivec_bld, fall_off[3], fall_off[1]);
1218
1219 x00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_xcoords[2][0], x00);
1220 y00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_ycoords[2][0], y00);
1221 x01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_xcoords[2][1], x01);
1222 y01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_ycoords[2][1], y01);
1223 x10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_xcoords[3][0], x10);
1224 y10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_ycoords[3][0], y10);
1225 x11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_xcoords[3][1], x11);
1226 y11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_ycoords[3][1], y11);
1227
1228 z00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_faces[2], z00);
1229 z01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_faces[2], z01);
1230 z10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_faces[3], z10);
1231 z11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_faces[3], z11);
1232
1233 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1234 /* now can add cube layer to face (per sample) */
1235 z00 = lp_build_add(ivec_bld, z00, coords[3]);
1236 z01 = lp_build_add(ivec_bld, z01, coords[3]);
1237 z10 = lp_build_add(ivec_bld, z10, coords[3]);
1238 z11 = lp_build_add(ivec_bld, z11, coords[3]);
1239 }
1240
1241 LLVMBuildStore(builder, x00, xs[0]);
1242 LLVMBuildStore(builder, x01, xs[1]);
1243 LLVMBuildStore(builder, x10, xs[2]);
1244 LLVMBuildStore(builder, x11, xs[3]);
1245 LLVMBuildStore(builder, y00, ys[0]);
1246 LLVMBuildStore(builder, y01, ys[1]);
1247 LLVMBuildStore(builder, y10, ys[2]);
1248 LLVMBuildStore(builder, y11, ys[3]);
1249 LLVMBuildStore(builder, z00, zs[0]);
1250 LLVMBuildStore(builder, z01, zs[1]);
1251 LLVMBuildStore(builder, z10, zs[2]);
1252 LLVMBuildStore(builder, z11, zs[3]);
1253
1254 lp_build_else(&edge_if);
1255
1256 LLVMBuildStore(builder, x0, xs[0]);
1257 LLVMBuildStore(builder, x1, xs[1]);
1258 LLVMBuildStore(builder, x0, xs[2]);
1259 LLVMBuildStore(builder, x1, xs[3]);
1260 LLVMBuildStore(builder, y0, ys[0]);
1261 LLVMBuildStore(builder, y0, ys[1]);
1262 LLVMBuildStore(builder, y1, ys[2]);
1263 LLVMBuildStore(builder, y1, ys[3]);
1264 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1265 LLVMValueRef cube_layer = lp_build_add(ivec_bld, face, coords[3]);
1266 LLVMBuildStore(builder, cube_layer, zs[0]);
1267 LLVMBuildStore(builder, cube_layer, zs[1]);
1268 LLVMBuildStore(builder, cube_layer, zs[2]);
1269 LLVMBuildStore(builder, cube_layer, zs[3]);
1270 }
1271 else {
1272 LLVMBuildStore(builder, face, zs[0]);
1273 LLVMBuildStore(builder, face, zs[1]);
1274 LLVMBuildStore(builder, face, zs[2]);
1275 LLVMBuildStore(builder, face, zs[3]);
1276 }
1277
1278 lp_build_endif(&edge_if);
1279
1280 x00 = LLVMBuildLoad(builder, xs[0], "");
1281 x01 = LLVMBuildLoad(builder, xs[1], "");
1282 x10 = LLVMBuildLoad(builder, xs[2], "");
1283 x11 = LLVMBuildLoad(builder, xs[3], "");
1284 y00 = LLVMBuildLoad(builder, ys[0], "");
1285 y01 = LLVMBuildLoad(builder, ys[1], "");
1286 y10 = LLVMBuildLoad(builder, ys[2], "");
1287 y11 = LLVMBuildLoad(builder, ys[3], "");
1288 z00 = LLVMBuildLoad(builder, zs[0], "");
1289 z01 = LLVMBuildLoad(builder, zs[1], "");
1290 z10 = LLVMBuildLoad(builder, zs[2], "");
1291 z11 = LLVMBuildLoad(builder, zs[3], "");
1292 }
1293
1294 if (linear_mask) {
1295 /*
1296 * Whack filter weights into place. Whatever texel had more weight is
1297 * the one which would have been selected by nearest filtering, so
1298 * just use 100% weight for it.
1299 */
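/*
 * I.e. for pixels without the linear bit set, snap the weight to 0.0
 * or 1.0 so the subsequent lerp degenerates into a nearest select.
 */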
1300 struct lp_build_context *c_bld = &bld->coord_bld;
1301 LLVMValueRef w1_mask, w1_weight;
1302 LLVMValueRef half = lp_build_const_vec(bld->gallivm, c_bld->type, 0.5f);
1303
1304 w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, s_fpart, half);
1305 /* this select is really just an "and" */
1306 w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1307 s_fpart = lp_build_select(c_bld, linear_mask, s_fpart, w1_weight);
1308 if (dims >= 2) {
1309 w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, t_fpart, half);
1310 w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1311 t_fpart = lp_build_select(c_bld, linear_mask, t_fpart, w1_weight);
1312 if (dims == 3) {
1313 w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, r_fpart, half);
1314 w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1315 r_fpart = lp_build_select(c_bld, linear_mask, r_fpart, w1_weight);
1316 }
1317 }
1318 }
1319
1320 /*
1321 * Get texture colors.
1322 */
1323 /* get x0/x1 texels */
1324 lp_build_sample_texel_soa(bld,
1325 width_vec, height_vec, depth_vec,
1326 x00, y00, z00,
1327 row_stride_vec, img_stride_vec,
1328 data_ptr, mipoffsets, neighbors[0][0]);
1329 lp_build_sample_texel_soa(bld,
1330 width_vec, height_vec, depth_vec,
1331 x01, y01, z01,
1332 row_stride_vec, img_stride_vec,
1333 data_ptr, mipoffsets, neighbors[0][1]);
1334
1335 if (dims == 1) {
1336 assert(!is_gather);
1337 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1338 /* Interpolate two samples from 1D image to produce one color */
1339 for (chan = 0; chan < 4; chan++) {
1340 colors_out[chan] = lp_build_lerp(texel_bld, s_fpart,
1341 neighbors[0][0][chan],
1342 neighbors[0][1][chan],
1343 0);
1344 }
1345 }
1346 else {
1347 LLVMValueRef cmpval0, cmpval1;
1348 cmpval0 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1349 cmpval1 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1350 /* simplified lerp, AND mask with weight and add */
1351 colors_out[0] = lp_build_masklerp(texel_bld, s_fpart,
1352 cmpval0, cmpval1);
1353 colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1354 }
1355 }
1356 else {
1357 /* 2D/3D texture */
1358 struct lp_build_if_state corner_if;
1359 LLVMValueRef colors0[4], colorss[4];
1360
1361 /* get x0/x1 texels at y1 */
1362 lp_build_sample_texel_soa(bld,
1363 width_vec, height_vec, depth_vec,
1364 x10, y10, z10,
1365 row_stride_vec, img_stride_vec,
1366 data_ptr, mipoffsets, neighbors[1][0]);
1367 lp_build_sample_texel_soa(bld,
1368 width_vec, height_vec, depth_vec,
1369 x11, y11, z11,
1370 row_stride_vec, img_stride_vec,
1371 data_ptr, mipoffsets, neighbors[1][1]);
1372
1373 /*
1374 * To avoid having to duplicate the linear_mask / fetch code, use
1375 * another branch here (on the corner condition, though edge would
1376 * work as well).
1377 */
1378 if (accurate_cube_corners) {
1379 LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f;
1380 LLVMValueRef have_corner, one_third;
1381
1382 colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs0");
1383 colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs1");
1384 colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs2");
1385 colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs3");
1386
1387 have_corner = LLVMBuildLoad(builder, have_corners, "");
1388
1389 lp_build_if(&corner_if, bld->gallivm, have_corner);
1390
1391 one_third = lp_build_const_vec(bld->gallivm, coord_bld->type,
1392 1.0f/3.0f);
1393
1394 /* find corner */
1395 c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
1396 c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
1397 c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
1398 c01f = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
1399 c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
1400 c10f = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
1401 c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
1402 c11f = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
1403
1404 if (!is_gather) {
1405 /*
1406 * we can't use standard 2d lerp as we need per-element weights
1407 * in case of corners, so just calculate the bilinear result as
1408 * w00*s00 + w01*s01 + w10*s10 + w11*s11.
1409 * (This is actually less work than using 2d lerp, 7 vs. 9
1410 * instructions, however calculating the weights needs another 6,
1411 * so it probably only breaks even with 2d lerp when doing all
1412 * 4 channels, as the weights need to be calculated just once -
1413 * of course fixing up the weights has additional cost.)
1414 */
1415 LLVMValueRef w00, w01, w10, w11, wx0, wy0, c_weight, tmp;
1416 wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
1417 wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
1418 w00 = lp_build_mul(coord_bld, wx0, wy0);
1419 w01 = lp_build_mul(coord_bld, s_fpart, wy0);
1420 w10 = lp_build_mul(coord_bld, wx0, t_fpart);
1421 w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
1422
1423 /* find corner weight */
1424 c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
1425 c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
1426 c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
1427 c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
1428
1429 /*
1430 * Add 1/3 of the corner weight to each of the 3 other samples
1431 * and null out the corner weight, so the weights still sum to 1.
1432 */
1433 c_weight = lp_build_mul(coord_bld, c_weight, one_third);
1434 w00 = lp_build_add(coord_bld, w00, c_weight);
1435 w00 = lp_build_andnot(coord_bld, w00, c00f);
1436 w01 = lp_build_add(coord_bld, w01, c_weight);
1437 w01 = lp_build_andnot(coord_bld, w01, c01f);
1438 w10 = lp_build_add(coord_bld, w10, c_weight);
1439 w10 = lp_build_andnot(coord_bld, w10, c10f);
1440 w11 = lp_build_add(coord_bld, w11, c_weight);
1441 w11 = lp_build_andnot(coord_bld, w11, c11f);
1442
1443 if (bld->static_sampler_state->compare_mode ==
1444 PIPE_TEX_COMPARE_NONE) {
1445 for (chan = 0; chan < 4; chan++) {
1446 colors0[chan] = lp_build_mul(coord_bld, w00,
1447 neighbors[0][0][chan]);
1448 tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
1449 colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1450 tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
1451 colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1452 tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
1453 colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1454 }
1455 }
1456 else {
1457 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1458 cmpval00 = lp_build_sample_comparefunc(bld, coords[4],
1459 neighbors[0][0][0]);
1460 cmpval01 = lp_build_sample_comparefunc(bld, coords[4],
1461 neighbors[0][1][0]);
1462 cmpval10 = lp_build_sample_comparefunc(bld, coords[4],
1463 neighbors[1][0][0]);
1464 cmpval11 = lp_build_sample_comparefunc(bld, coords[4],
1465 neighbors[1][1][0]);
1466 /*
1467 * inputs to interpolation are just masks so just add
1468 * masked weights together
1469 */
1470 cmpval00 = LLVMBuildBitCast(builder, cmpval00,
1471 coord_bld->vec_type, "");
1472 cmpval01 = LLVMBuildBitCast(builder, cmpval01,
1473 coord_bld->vec_type, "");
1474 cmpval10 = LLVMBuildBitCast(builder, cmpval10,
1475 coord_bld->vec_type, "");
1476 cmpval11 = LLVMBuildBitCast(builder, cmpval11,
1477 coord_bld->vec_type, "");
1478 colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
1479 tmp = lp_build_and(coord_bld, w01, cmpval01);
1480 colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1481 tmp = lp_build_and(coord_bld, w10, cmpval10);
1482 colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1483 tmp = lp_build_and(coord_bld, w11, cmpval11);
1484 colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1485 colors0[1] = colors0[2] = colors0[3] = colors0[0];
1486 }
1487 }
1488 else {
1489 /*
1490 * We don't have any weights to adjust, so instead calculate
1491 * the fourth texel as simply the average of the other 3.
1492 * (This would work for non-gather too, however we'd have
1493 * a boatload more of the select stuff due to there being
1494 * 4 times as many colors as weights.)
1495 */
1496 LLVMValueRef col00, col01, col10, col11;
1497 LLVMValueRef colc, colc0, colc1;
1498 col10 = lp_build_swizzle_soa_channel(texel_bld,
1499 neighbors[1][0], chan_swiz);
1500 col11 = lp_build_swizzle_soa_channel(texel_bld,
1501 neighbors[1][1], chan_swiz);
1502 col01 = lp_build_swizzle_soa_channel(texel_bld,
1503 neighbors[0][1], chan_swiz);
1504 col00 = lp_build_swizzle_soa_channel(texel_bld,
1505 neighbors[0][0], chan_swiz);
1506
1507 /*
1508 * The spec says for comparison filtering, the comparison
1509 * must happen before synthesizing the new value.
1510 * This means all gathered values are always 0 or 1,
1511 * except for the non-existing texel, which can be 0,1/3,2/3,1...
1512 * Seems like we'd be allowed to just return 0 or 1 too, so we
1513 * could simplify and pass down the compare mask values to the
1514 * end (using int arithmetic/compare on the mask values to
1515 * construct the fourth texel), converting to floats only there,
1516 * but it's probably not worth it (it might be easier for the cpu
1517 * but not for the code)...
1518 */
1519 if (bld->static_sampler_state->compare_mode !=
1520 PIPE_TEX_COMPARE_NONE) {
1521 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1522 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], col00);
1523 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], col01);
1524 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], col10);
1525 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], col11);
1526 col00 = lp_build_select(texel_bld, cmpval00,
1527 texel_bld->one, texel_bld->zero);
1528 col01 = lp_build_select(texel_bld, cmpval01,
1529 texel_bld->one, texel_bld->zero);
1530 col10 = lp_build_select(texel_bld, cmpval10,
1531 texel_bld->one, texel_bld->zero);
1532 col11 = lp_build_select(texel_bld, cmpval11,
1533 texel_bld->one, texel_bld->zero);
1534 }
1535
1536 /*
1537 * Null out corner color.
1538 */
1539 col00 = lp_build_andnot(coord_bld, col00, c00f);
1540 col01 = lp_build_andnot(coord_bld, col01, c01f);
1541 col10 = lp_build_andnot(coord_bld, col10, c10f);
1542 col11 = lp_build_andnot(coord_bld, col11, c11f);
1543
1544 /*
1545 * New corner texel color is all colors added / 3.
1546 */
1547 colc0 = lp_build_add(coord_bld, col00, col01);
1548 colc1 = lp_build_add(coord_bld, col10, col11);
1549 colc = lp_build_add(coord_bld, colc0, colc1);
1550 colc = lp_build_mul(coord_bld, one_third, colc);
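/* e.g. with col00 nulled above: colc = (0 + col01 + col10 + col11) / 3 */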
1551
1552 /*
1553 * Replace the corner texel color with the new value.
1554 */
1555 col00 = lp_build_select(coord_bld, c00, colc, col00);
1556 col01 = lp_build_select(coord_bld, c01, colc, col01);
1557 col10 = lp_build_select(coord_bld, c10, colc, col10);
1558 col11 = lp_build_select(coord_bld, c11, colc, col11);
1559
1560 colors0[0] = col10;
1561 colors0[1] = col11;
1562 colors0[2] = col01;
1563 colors0[3] = col00;
1564 }
1565
1566 LLVMBuildStore(builder, colors0[0], colorss[0]);
1567 LLVMBuildStore(builder, colors0[1], colorss[1]);
1568 LLVMBuildStore(builder, colors0[2], colorss[2]);
1569 LLVMBuildStore(builder, colors0[3], colorss[3]);
1570
1571 lp_build_else(&corner_if);
1572 }
1573
1574 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1575 if (is_gather) {
1576 /*
1577 * Just assign the red channel (no component selection yet).
1578 * This is a bit hackish; we usually do the swizzle at the
1579 * end of sampling (far fewer values to swizzle), but this
1580 * obviously cannot work when using gather.
1581 */
1582 colors0[0] = lp_build_swizzle_soa_channel(texel_bld,
1583 neighbors[1][0],
1584 chan_swiz);
1585 colors0[1] = lp_build_swizzle_soa_channel(texel_bld,
1586 neighbors[1][1],
1587 chan_swiz);
1588 colors0[2] = lp_build_swizzle_soa_channel(texel_bld,
1589 neighbors[0][1],
1590 chan_swiz);
1591 colors0[3] = lp_build_swizzle_soa_channel(texel_bld,
1592 neighbors[0][0],
1593 chan_swiz);
1594 }
1595 else {
1596 /* Bilinear interpolate the four samples from the 2D image / 3D slice */
1597 for (chan = 0; chan < 4; chan++) {
1598 colors0[chan] = lp_build_lerp_2d(texel_bld,
1599 s_fpart, t_fpart,
1600 neighbors[0][0][chan],
1601 neighbors[0][1][chan],
1602 neighbors[1][0][chan],
1603 neighbors[1][1][chan],
1604 0);
1605 }
1606 }
1607 }
1608 else {
1609 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1610 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1611 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1612 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1613 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1614
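/*
 * Shadow comparison: only the 0/1 compare results get filtered
 * (percentage closer filtering), never the depth values themselves.
 */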
1615 if (is_gather) {
1616 /* more hacks for swizzling, should be X, ONE or ZERO... */
1617 colors0[0] = lp_build_select(texel_bld, cmpval10,
1618 texel_bld->one, texel_bld->zero);
1619 colors0[1] = lp_build_select(texel_bld, cmpval11,
1620 texel_bld->one, texel_bld->zero);
1621 colors0[2] = lp_build_select(texel_bld, cmpval01,
1622 texel_bld->one, texel_bld->zero);
1623 colors0[3] = lp_build_select(texel_bld, cmpval00,
1624 texel_bld->one, texel_bld->zero);
1625 }
1626 else {
1627 colors0[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1628 cmpval00, cmpval01, cmpval10, cmpval11);
1629 colors0[1] = colors0[2] = colors0[3] = colors0[0];
1630 }
1631 }
1632
1633 if (accurate_cube_corners) {
1634 LLVMBuildStore(builder, colors0[0], colorss[0]);
1635 LLVMBuildStore(builder, colors0[1], colorss[1]);
1636 LLVMBuildStore(builder, colors0[2], colorss[2]);
1637 LLVMBuildStore(builder, colors0[3], colorss[3]);
1638
1639 lp_build_endif(&corner_if);
1640
1641 colors0[0] = LLVMBuildLoad(builder, colorss[0], "");
1642 colors0[1] = LLVMBuildLoad(builder, colorss[1], "");
1643 colors0[2] = LLVMBuildLoad(builder, colorss[2], "");
1644 colors0[3] = LLVMBuildLoad(builder, colorss[3], "");
1645 }
1646
1647 if (dims == 3) {
1648 LLVMValueRef neighbors1[2][2][4];
1649 LLVMValueRef colors1[4];
1650
1651 assert(!is_gather);
1652
1653 /* get x0/x1/y0/y1 texels at z1 */
1654 lp_build_sample_texel_soa(bld,
1655 width_vec, height_vec, depth_vec,
1656 x00, y00, z1,
1657 row_stride_vec, img_stride_vec,
1658 data_ptr, mipoffsets, neighbors1[0][0]);
1659 lp_build_sample_texel_soa(bld,
1660 width_vec, height_vec, depth_vec,
1661 x01, y01, z1,
1662 row_stride_vec, img_stride_vec,
1663 data_ptr, mipoffsets, neighbors1[0][1]);
1664 lp_build_sample_texel_soa(bld,
1665 width_vec, height_vec, depth_vec,
1666 x10, y10, z1,
1667 row_stride_vec, img_stride_vec,
1668 data_ptr, mipoffsets, neighbors1[1][0]);
1669 lp_build_sample_texel_soa(bld,
1670 width_vec, height_vec, depth_vec,
1671 x11, y11, z1,
1672 row_stride_vec, img_stride_vec,
1673 data_ptr, mipoffsets, neighbors1[1][1]);
1674
1675 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1676 /* Bilinear interpolate the four samples from the second Z slice */
1677 for (chan = 0; chan < 4; chan++) {
1678 colors1[chan] = lp_build_lerp_2d(texel_bld,
1679 s_fpart, t_fpart,
1680 neighbors1[0][0][chan],
1681 neighbors1[0][1][chan],
1682 neighbors1[1][0][chan],
1683 neighbors1[1][1][chan],
1684 0);
1685 }
1686 /* Linearly interpolate the two samples from the two 3D slices */
1687 for (chan = 0; chan < 4; chan++) {
1688 colors_out[chan] = lp_build_lerp(texel_bld,
1689 r_fpart,
1690 colors0[chan], colors1[chan],
1691 0);
1692 }
1693 }
1694 else {
1695 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1696 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[0][0][0]);
1697 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[0][1][0]);
1698 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[1][0][0]);
1699 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[1][1][0]);
1700 colors1[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1701 cmpval00, cmpval01, cmpval10, cmpval11);
1702 /* Linearly interpolate the two samples from the two 3D slices */
1703 colors_out[0] = lp_build_lerp(texel_bld,
1704 r_fpart,
1705 colors0[0], colors1[0],
1706 0);
1707 colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1708 }
1709 }
1710 else {
1711 /* 2D tex */
1712 for (chan = 0; chan < 4; chan++) {
1713 colors_out[chan] = colors0[chan];
1714 }
1715 }
1716 }
1717 if (is_gather) {
1718 /*
1719 * For gather, we can't do our usual channel swizzling later,
1720 * so do it here. It only really matters for 0/1 swizzles with
1721 * comparison filtering, since there the results would otherwise
1722 * be wrong; without comparison it should all work out alright,
1723 * but it can't hurt to do it here since it lets llvm instantly
1724 * drop all the calculations above (though gathering from a channel
1725 * which will always return 0 or 1 is rather questionable anyway...)
1726 */
1727 if (chan_swiz == PIPE_SWIZZLE_1) {
1728 for (chan = 0; chan < 4; chan++) {
1729 colors_out[chan] = texel_bld->one;
1730 }
1731 } else if (chan_swiz == PIPE_SWIZZLE_0) {
1732 for (chan = 0; chan < 4; chan++) {
1733 colors_out[chan] = texel_bld->zero;
1734 }
1735 }
1736 }
1737 }
1738
1739
1740 /**
1741 * Sample the texture/mipmap using given image filter and mip filter.
1742 * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1743 * from (vectors or scalars).
1744 * If we're using nearest miplevel sampling the '1' values will be null/unused.
1745 */
1746 static void
1747 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
1748 unsigned img_filter,
1749 unsigned mip_filter,
1750 boolean is_gather,
1751 const LLVMValueRef *coords,
1752 const LLVMValueRef *offsets,
1753 LLVMValueRef ilevel0,
1754 LLVMValueRef ilevel1,
1755 LLVMValueRef lod_fpart,
1756 LLVMValueRef *colors_out)
1757 {
1758 LLVMBuilderRef builder = bld->gallivm->builder;
1759 LLVMValueRef size0 = NULL;
1760 LLVMValueRef size1 = NULL;
1761 LLVMValueRef row_stride0_vec = NULL;
1762 LLVMValueRef row_stride1_vec = NULL;
1763 LLVMValueRef img_stride0_vec = NULL;
1764 LLVMValueRef img_stride1_vec = NULL;
1765 LLVMValueRef data_ptr0 = NULL;
1766 LLVMValueRef data_ptr1 = NULL;
1767 LLVMValueRef mipoff0 = NULL;
1768 LLVMValueRef mipoff1 = NULL;
1769 LLVMValueRef colors0[4], colors1[4];
1770 unsigned chan;
1771
1772 /* sample the first mipmap level */
1773 lp_build_mipmap_level_sizes(bld, ilevel0,
1774 &size0,
1775 &row_stride0_vec, &img_stride0_vec);
1776 if (bld->num_mips == 1) {
1777 data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1778 }
1779 else {
1780 /* This path should work for num_lods == 1 too, just slightly less efficiently */
1781 data_ptr0 = bld->base_ptr;
1782 mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1783 }
1784 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1785 lp_build_sample_image_nearest(bld, size0,
1786 row_stride0_vec, img_stride0_vec,
1787 data_ptr0, mipoff0, coords, offsets,
1788 colors0);
1789 }
1790 else {
1791 assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1792 lp_build_sample_image_linear(bld, is_gather, size0, NULL,
1793 row_stride0_vec, img_stride0_vec,
1794 data_ptr0, mipoff0, coords, offsets,
1795 colors0);
1796 }
1797
1798 /* Store the first level's colors in the output variables */
1799 for (chan = 0; chan < 4; chan++) {
1800 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1801 }
1802
1803 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1804 struct lp_build_if_state if_ctx;
1805 LLVMValueRef need_lerp;
1806
1807 /* need_lerp = lod_fpart > 0 */
1808 if (bld->num_lods == 1) {
1809 need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
1810 lod_fpart, bld->lodf_bld.zero,
1811 "need_lerp");
1812 }
1813 else {
1814 /*
1815 * We'll do mip filtering if any of the quads (or individual
1816 * pixels in case of per-pixel lod) need it.
1817 * It might be better to split the vectors here and only fetch/filter
1818 * quads which need it (if there's one lod per quad).
1819 */
1820 need_lerp = lp_build_compare(bld->gallivm, bld->lodf_bld.type,
1821 PIPE_FUNC_GREATER,
1822 lod_fpart, bld->lodf_bld.zero);
1823 need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, need_lerp);
1824 lp_build_name(need_lerp, "need_lerp");
1825 }
1826
1827 lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1828 {
1829 /*
1830 * We unfortunately need to clamp lod_fpart here since we can get
1831 * negative values which would screw up filtering if not all
1832 * lod_fpart values have the same sign.
1833 */
1834 lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1835 bld->lodf_bld.zero);
1836 /* sample the second mipmap level */
1837 lp_build_mipmap_level_sizes(bld, ilevel1,
1838 &size1,
1839 &row_stride1_vec, &img_stride1_vec);
1840 if (bld->num_mips == 1) {
1841 data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1842 }
1843 else {
1844 data_ptr1 = bld->base_ptr;
1845 mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1846 }
1847 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1848 lp_build_sample_image_nearest(bld, size1,
1849 row_stride1_vec, img_stride1_vec,
1850 data_ptr1, mipoff1, coords, offsets,
1851 colors1);
1852 }
1853 else {
1854 lp_build_sample_image_linear(bld, FALSE, size1, NULL,
1855 row_stride1_vec, img_stride1_vec,
1856 data_ptr1, mipoff1, coords, offsets,
1857 colors1);
1858 }
1859
1860 /* interpolate samples from the two mipmap levels */
1861
1862 if (bld->num_lods != bld->coord_type.length)
1863 lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1864 bld->lodf_bld.type,
1865 bld->texel_bld.type,
1866 lod_fpart);
1867
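/* result = colors0 + lod_fpart * (colors1 - colors0); e.g. a lod_fpart
 * of 0.25 blends 75% of level 0 with 25% of level 1 */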
1868 for (chan = 0; chan < 4; chan++) {
1869 colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
1870 colors0[chan], colors1[chan],
1871 0);
1872 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1873 }
1874 }
1875 lp_build_endif(&if_ctx);
1876 }
1877 }
1878
1879
1880 /**
1881 * Sample the texture/mipmap using given mip filter, and using
1882 * both nearest and linear filtering at the same time depending
1883 * on linear_mask.
1884 * lod can be per quad but linear_mask is always per pixel.
1885 * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1886 * from (vectors or scalars).
1887 * If we're using nearest miplevel sampling the '1' values will be null/unused.
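* (E.g. with min_img_filter LINEAR and mag NEAREST the caller passes
* linear_mask = lod_positive, so only minified pixels get linear weights.)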
1888 */
1889 static void
1890 lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
1891 LLVMValueRef linear_mask,
1892 unsigned mip_filter,
1893 const LLVMValueRef *coords,
1894 const LLVMValueRef *offsets,
1895 LLVMValueRef ilevel0,
1896 LLVMValueRef ilevel1,
1897 LLVMValueRef lod_fpart,
1898 LLVMValueRef lod_positive,
1899 LLVMValueRef *colors_out)
1900 {
1901 LLVMBuilderRef builder = bld->gallivm->builder;
1902 LLVMValueRef size0 = NULL;
1903 LLVMValueRef size1 = NULL;
1904 LLVMValueRef row_stride0_vec = NULL;
1905 LLVMValueRef row_stride1_vec = NULL;
1906 LLVMValueRef img_stride0_vec = NULL;
1907 LLVMValueRef img_stride1_vec = NULL;
1908 LLVMValueRef data_ptr0 = NULL;
1909 LLVMValueRef data_ptr1 = NULL;
1910 LLVMValueRef mipoff0 = NULL;
1911 LLVMValueRef mipoff1 = NULL;
1912 LLVMValueRef colors0[4], colors1[4];
1913 unsigned chan;
1914
1915 /* sample the first mipmap level */
1916 lp_build_mipmap_level_sizes(bld, ilevel0,
1917 &size0,
1918 &row_stride0_vec, &img_stride0_vec);
1919 if (bld->num_mips == 1) {
1920 data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1921 }
1922 else {
1923 /* This path should work for num_lods == 1 too, just slightly less efficiently */
1924 data_ptr0 = bld->base_ptr;
1925 mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1926 }
1927
1928 lp_build_sample_image_linear(bld, FALSE, size0, linear_mask,
1929 row_stride0_vec, img_stride0_vec,
1930 data_ptr0, mipoff0, coords, offsets,
1931 colors0);
1932
1933 /* Store the first level's colors in the output variables */
1934 for (chan = 0; chan < 4; chan++) {
1935 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1936 }
1937
1938 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1939 struct lp_build_if_state if_ctx;
1940 LLVMValueRef need_lerp;
1941
1942 /*
1943 * We'll do mip filtering if any of the quads (or individual
1944 * pixels in case of per-pixel lod) need it.
1945 * Note we use lod_positive here, not lod_fpart, since it may be the same
1946 * condition as the one used in the outer "if" in the caller, hence llvm
1947 * should be able to merge the branches in this case.
1948 */
1949 need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_positive);
1950 lp_build_name(need_lerp, "need_lerp");
1951
1952 lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1953 {
1954 /*
1955 * We unfortunately need to clamp lod_fpart here since we can get
1956 * negative values which would screw up filtering if not all
1957 * lod_fpart values have the same sign.
1958 */
1959 lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1960 bld->lodf_bld.zero);
1961 /* sample the second mipmap level */
1962 lp_build_mipmap_level_sizes(bld, ilevel1,
1963 &size1,
1964 &row_stride1_vec, &img_stride1_vec);
1965 if (bld->num_mips == 1) {
1966 data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1967 }
1968 else {
1969 data_ptr1 = bld->base_ptr;
1970 mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1971 }
1972
1973 lp_build_sample_image_linear(bld, FALSE, size1, linear_mask,
1974 row_stride1_vec, img_stride1_vec,
1975 data_ptr1, mipoff1, coords, offsets,
1976 colors1);
1977
1978 /* interpolate samples from the two mipmap levels */
1979
1980 if (bld->num_lods != bld->coord_type.length)
1981 lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1982 bld->lodf_bld.type,
1983 bld->texel_bld.type,
1984 lod_fpart);
1985
1986 for (chan = 0; chan < 4; chan++) {
1987 colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
1988 colors0[chan], colors1[chan],
1989 0);
1990 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1991 }
1992 }
1993 lp_build_endif(&if_ctx);
1994 }
1995 }
1996
1997
1998 /**
1999 * Build (per-coord) layer value.
2000 * Either clamp layer to valid values or fill in optional out_of_bounds
2001 * value and just return value unclamped.
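* E.g. a 2D array with 8 layers clamps the layer to [0,7]; for a cube
* array the clamp is to num_layers - 6 so a whole cube stays in range.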
2002 */
2003 static LLVMValueRef
2004 lp_build_layer_coord(struct lp_build_sample_context *bld,
2005 unsigned texture_unit,
2006 boolean is_cube_array,
2007 LLVMValueRef layer,
2008 LLVMValueRef *out_of_bounds)
2009 {
2010 LLVMValueRef num_layers;
2011 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2012
2013 num_layers = bld->dynamic_state->depth(bld->dynamic_state, bld->gallivm,
2014 bld->context_ptr, texture_unit);
2015
2016 if (out_of_bounds) {
2017 LLVMValueRef out1, out;
2018 assert(!is_cube_array);
2019 num_layers = lp_build_broadcast_scalar(int_coord_bld, num_layers);
2020 out = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, layer, int_coord_bld->zero);
2021 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, layer, num_layers);
2022 *out_of_bounds = lp_build_or(int_coord_bld, out, out1);
2023 return layer;
2024 }
2025 else {
2026 LLVMValueRef maxlayer;
2027 LLVMValueRef s = is_cube_array ? lp_build_const_int32(bld->gallivm, 6) :
2028 bld->int_bld.one;
2029 maxlayer = lp_build_sub(&bld->int_bld, num_layers, s);
2030 maxlayer = lp_build_broadcast_scalar(int_coord_bld, maxlayer);
2031 return lp_build_clamp(int_coord_bld, layer, int_coord_bld->zero, maxlayer);
2032 }
2033 }
2034
2035
2036 /**
2037 * Calculate cube face, lod, mip levels.
2038 */
2039 static void
2040 lp_build_sample_common(struct lp_build_sample_context *bld,
2041 boolean is_lodq,
2042 unsigned texture_index,
2043 unsigned sampler_index,
2044 LLVMValueRef *coords,
2045 const struct lp_derivatives *derivs, /* optional */
2046 LLVMValueRef lod_bias, /* optional */
2047 LLVMValueRef explicit_lod, /* optional */
2048 LLVMValueRef *lod_pos_or_zero,
2049 LLVMValueRef *lod,
2050 LLVMValueRef *lod_fpart,
2051 LLVMValueRef *ilevel0,
2052 LLVMValueRef *ilevel1)
2053 {
2054 const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
2055 const unsigned min_filter = bld->static_sampler_state->min_img_filter;
2056 const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
2057 const unsigned target = bld->static_texture_state->target;
2058 LLVMValueRef first_level, cube_rho = NULL;
2059 LLVMValueRef lod_ipart = NULL;
2060 struct lp_derivatives cube_derivs;
2061
2062 /*
2063 printf("%s mip %d min %d mag %d\n", __FUNCTION__,
2064 mip_filter, min_filter, mag_filter);
2065 */
2066
2067 /*
2068 * Choose cube face, recompute texcoords for the chosen face and
2069 * compute rho here too (as it requires transform of derivatives).
2070 */
2071 if (target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY) {
2072 boolean need_derivs;
2073 need_derivs = ((min_filter != mag_filter ||
2074 mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
2075 !bld->static_sampler_state->min_max_lod_equal &&
2076 !explicit_lod);
2077 lp_build_cube_lookup(bld, coords, derivs, &cube_rho, &cube_derivs, need_derivs);
2078 derivs = &cube_derivs;
2079 if (target == PIPE_TEXTURE_CUBE_ARRAY) {
2080 /* calculate cube layer coord now */
2081 LLVMValueRef layer = lp_build_iround(&bld->coord_bld, coords[3]);
2082 LLVMValueRef six = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 6);
2083 layer = lp_build_mul(&bld->int_coord_bld, layer, six);
2084 coords[3] = lp_build_layer_coord(bld, texture_index, TRUE, layer, NULL);
2085 /* because of seamless filtering we can't add it to the face (coords[2]) here. */
2086 }
2087 }
2088 else if (target == PIPE_TEXTURE_1D_ARRAY ||
2089 target == PIPE_TEXTURE_2D_ARRAY) {
2090 coords[2] = lp_build_iround(&bld->coord_bld, coords[2]);
2091 coords[2] = lp_build_layer_coord(bld, texture_index, FALSE, coords[2], NULL);
2092 }
2093
2094 if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
2095 /*
2096 * Clamp p coords to [0,1] for fixed function depth texture format here.
2097 * Technically this is not entirely correct for unorm depth as the ref value
2098 * should be converted to the depth format (quantization!) and comparison
2099 * then done in the texture format. This would actually help performance (we'd
2100 * only need to do it once and could save the per-sample conversion of texels
2101 * to floats), but it would need messier code (we would need to push
2102 * at least some bits down to actual fetch so conversion could be skipped,
2103 * and would have ugly interaction with border color, would need to convert
2104 * border color to that format too or do some other tricks to make it work).
2105 */
2106 const struct util_format_description *format_desc = bld->format_desc;
2107 unsigned chan_type;
2108 /* not entirely sure we couldn't end up with non-valid swizzle here */
2109 chan_type = format_desc->swizzle[0] <= PIPE_SWIZZLE_W ?
2110 format_desc->channel[format_desc->swizzle[0]].type :
2111 UTIL_FORMAT_TYPE_FLOAT;
2112 if (chan_type != UTIL_FORMAT_TYPE_FLOAT) {
2113 coords[4] = lp_build_clamp(&bld->coord_bld, coords[4],
2114 bld->coord_bld.zero, bld->coord_bld.one);
2115 }
2116 }
2117
2118 /*
2119 * Compute the level of detail (float).
2120 */
2121 if (min_filter != mag_filter ||
2122 mip_filter != PIPE_TEX_MIPFILTER_NONE || is_lodq) {
2123 /* Need to compute lod either to choose mipmap levels or to
2124 * distinguish between minification/magnification with one mipmap level.
2125 */
2126 lp_build_lod_selector(bld, is_lodq, texture_index, sampler_index,
2127 coords[0], coords[1], coords[2], cube_rho,
2128 derivs, lod_bias, explicit_lod,
2129 mip_filter, lod,
2130 &lod_ipart, lod_fpart, lod_pos_or_zero);
2131 if (is_lodq) {
2132 LLVMValueRef last_level;
2133 last_level = bld->dynamic_state->last_level(bld->dynamic_state,
2134 bld->gallivm,
2135 bld->context_ptr,
2136 texture_index);
2137 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2138 bld->gallivm,
2139 bld->context_ptr,
2140 texture_index);
2141 last_level = lp_build_sub(&bld->int_bld, last_level, first_level);
2142 last_level = lp_build_int_to_float(&bld->float_bld, last_level);
2143 last_level = lp_build_broadcast_scalar(&bld->lodf_bld, last_level);
2144
2145 switch (mip_filter) {
2146 case PIPE_TEX_MIPFILTER_NONE:
2147 *lod_fpart = bld->lodf_bld.zero;
2148 break;
2149 case PIPE_TEX_MIPFILTER_NEAREST:
2150 *lod_fpart = lp_build_round(&bld->lodf_bld, *lod_fpart);
2151 /* fallthrough */
2152 case PIPE_TEX_MIPFILTER_LINEAR:
2153 *lod_fpart = lp_build_clamp(&bld->lodf_bld, *lod_fpart,
2154 bld->lodf_bld.zero, last_level);
2155 break;
2156 }
2157 return;
2158 }
2159
2160 } else {
2161 lod_ipart = bld->lodi_bld.zero;
2162 *lod_pos_or_zero = bld->lodi_bld.zero;
2163 }
2164
2165 if (bld->num_lods != bld->num_mips) {
2166 /* only makes sense if there's just a single mip level */
2167 assert(bld->num_mips == 1);
2168 lod_ipart = lp_build_extract_range(bld->gallivm, lod_ipart, 0, 1);
2169 }
2170
2171 /*
2172 * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
2173 */
2174 switch (mip_filter) {
2175 default:
2176 assert(0 && "bad mip_filter value in lp_build_sample_soa()");
2177 /* fall-through */
2178 case PIPE_TEX_MIPFILTER_NONE:
2179 /* always use mip level 0 */
2180 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2181 bld->gallivm, bld->context_ptr,
2182 texture_index);
2183 first_level = lp_build_broadcast_scalar(&bld->leveli_bld, first_level);
2184 *ilevel0 = first_level;
2185 break;
2186 case PIPE_TEX_MIPFILTER_NEAREST:
2187 assert(lod_ipart);
2188 lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
2189 break;
2190 case PIPE_TEX_MIPFILTER_LINEAR:
2191 assert(lod_ipart);
2192 assert(*lod_fpart);
2193 lp_build_linear_mip_levels(bld, texture_index,
2194 lod_ipart, lod_fpart,
2195 ilevel0, ilevel1);
2196 break;
2197 }
2198 }
2199
2200 static void
2201 lp_build_clamp_border_color(struct lp_build_sample_context *bld,
2202 unsigned sampler_unit)
2203 {
2204 struct gallivm_state *gallivm = bld->gallivm;
2205 LLVMBuilderRef builder = gallivm->builder;
2206 LLVMValueRef border_color_ptr =
2207 bld->dynamic_state->border_color(bld->dynamic_state, gallivm,
2208 bld->context_ptr, sampler_unit);
2209 LLVMValueRef border_color;
2210 const struct util_format_description *format_desc = bld->format_desc;
2211 struct lp_type vec4_type = bld->texel_type;
2212 struct lp_build_context vec4_bld;
2213 LLVMValueRef min_clamp = NULL;
2214 LLVMValueRef max_clamp = NULL;
2215
2216 /*
2217 * For normalized formats we need to clamp the border color (technically
2218 * probably should also quantize the data). Really sucks doing this
2219 * here but can't avoid at least for now since this is part of
2220 * sampler state and texture format is part of sampler_view state.
2221 * GL also expects clamping for uint/sint formats, so
2222 * do that as well (d3d10 can't end up here with uint/sint since it
2223 * only supports them with ld).
2224 */
2225 vec4_type.length = 4;
2226 lp_build_context_init(&vec4_bld, gallivm, vec4_type);
2227
2228 /*
2229 * Vectorized clamping of border color. Loading is a bit of a hack since
2230 * we just cast the pointer-to-float-array to a pointer-to-vec4
2231 * (int or float).
2232 */
2233 border_color_ptr = lp_build_array_get_ptr(gallivm, border_color_ptr,
2234 lp_build_const_int32(gallivm, 0));
2235 border_color_ptr = LLVMBuildBitCast(builder, border_color_ptr,
2236 LLVMPointerType(vec4_bld.vec_type, 0), "");
2237 border_color = LLVMBuildLoad(builder, border_color_ptr, "");
2238 /* we don't have aligned type in the dynamic state unfortunately */
2239 LLVMSetAlignment(border_color, 4);
2240
2241 /*
2242 * Instead of having some incredibly complex logic which tries to figure out
2243 * the clamping necessary for each channel, simply use the first channel, and treat
2244 * mixed signed/unsigned normalized formats specially.
2245 * (Mixed non-normalized, which wouldn't work at all here, do not exist for a
2246 * good reason.)
2247 */
2248 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
2249 int chan;
2250 /* d/s needs special handling because both being present means just sampling depth */
2251 if (util_format_is_depth_and_stencil(format_desc->format)) {
2252 chan = format_desc->swizzle[0];
2253 }
2254 else {
2255 chan = util_format_get_first_non_void_channel(format_desc->format);
2256 }
2257 if (chan >= 0 && chan <= PIPE_SWIZZLE_W) {
2258 unsigned chan_type = format_desc->channel[chan].type;
2259 unsigned chan_norm = format_desc->channel[chan].normalized;
2260 unsigned chan_pure = format_desc->channel[chan].pure_integer;
2261 if (chan_type == UTIL_FORMAT_TYPE_SIGNED) {
2262 if (chan_norm) {
2263 min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2264 max_clamp = vec4_bld.one;
2265 }
2266 else if (chan_pure) {
2267 /*
2268 * Border color was stored as int, hence need min/max clamp
2269 * only if chan has less than 32 bits.
2270 */
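/* e.g. chan_size == 16 clamps the border color to [-32768, 32767] */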
2271 unsigned chan_size = format_desc->channel[chan].size;
2272 if (chan_size < 32) {
2273 min_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2274 0 - (1 << (chan_size - 1)));
2275 max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2276 (1 << (chan_size - 1)) - 1);
2277 }
2278 }
2279 /* TODO: no idea about non-pure, non-normalized! */
2280 }
2281 else if (chan_type == UTIL_FORMAT_TYPE_UNSIGNED) {
2282 if (chan_norm) {
2283 min_clamp = vec4_bld.zero;
2284 max_clamp = vec4_bld.one;
2285 }
2286 /*
2287 * Need an ugly hack here, because we don't have Z32_FLOAT_X8X24
2288 * we use Z32_FLOAT_S8X24 to imply sampling depth component
2289 * and ignoring stencil, which will blow up here if we try to
2290 * do a uint clamp in a float texel build...
2291 * And even if we had that format, mesa st also thinks using z24s8
2292 * means depth sampling ignoring stencil.
2293 */
2294 else if (chan_pure) {
2295 /*
2296 * Border color was stored as uint, hence never need min
2297 * clamp, and only need max clamp if chan has less than 32 bits.
2298 */
2299 unsigned chan_size = format_desc->channel[chan].size;
2300 if (chan_size < 32) {
2301 max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2302 (1 << chan_size) - 1);
2303 }
2304 /* TODO: no idea about non-pure, non-normalized! */
2305 }
2306 }
2307 else if (chan_type == UTIL_FORMAT_TYPE_FIXED) {
2308 /* TODO: I have no idea what clamp this would need if any! */
2309 }
2310 }
2311 /* mixed plain formats (or different pure size) */
2312 switch (format_desc->format) {
2313 case PIPE_FORMAT_B10G10R10A2_UINT:
2314 case PIPE_FORMAT_R10G10B10A2_UINT:
2315 {
2316 unsigned max10 = (1 << 10) - 1;
2317 max_clamp = lp_build_const_aos(gallivm, vec4_type, max10, max10,
2318 max10, (1 << 2) - 1, NULL);
2319 }
2320 break;
2321 case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
2322 min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2323 -1.0F, 0.0F, NULL);
2324 max_clamp = vec4_bld.one;
2325 break;
2326 case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
2327 case PIPE_FORMAT_R5SG5SB6U_NORM:
2328 min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2329 0.0F, 0.0F, NULL);
2330 max_clamp = vec4_bld.one;
2331 break;
2332 default:
2333 break;
2334 }
2335 }
2336 else {
2337 /* cannot figure this out from format description */
2338 if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
2339 /* s3tc formats are always unorm */
2340 min_clamp = vec4_bld.zero;
2341 max_clamp = vec4_bld.one;
2342 }
2343 else if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
2344 format_desc->layout == UTIL_FORMAT_LAYOUT_ETC) {
2345 switch (format_desc->format) {
2346 case PIPE_FORMAT_RGTC1_UNORM:
2347 case PIPE_FORMAT_RGTC2_UNORM:
2348 case PIPE_FORMAT_LATC1_UNORM:
2349 case PIPE_FORMAT_LATC2_UNORM:
2350 case PIPE_FORMAT_ETC1_RGB8:
2351 min_clamp = vec4_bld.zero;
2352 max_clamp = vec4_bld.one;
2353 break;
2354 case PIPE_FORMAT_RGTC1_SNORM:
2355 case PIPE_FORMAT_RGTC2_SNORM:
2356 case PIPE_FORMAT_LATC1_SNORM:
2357 case PIPE_FORMAT_LATC2_SNORM:
2358 min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2359 max_clamp = vec4_bld.one;
2360 break;
2361 default:
2362 assert(0);
2363 break;
2364 }
2365 }
2366 /*
2367 * all others from subsampled/other group, though we don't care
2368 * about yuv (and should not have any from zs here)
2369 */
2370 else if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_YUV){
2371 switch (format_desc->format) {
2372 case PIPE_FORMAT_R8G8_B8G8_UNORM:
2373 case PIPE_FORMAT_G8R8_G8B8_UNORM:
2374 case PIPE_FORMAT_G8R8_B8R8_UNORM:
2375 case PIPE_FORMAT_R8G8_R8B8_UNORM:
2376 case PIPE_FORMAT_R1_UNORM: /* doesn't make sense but ah well */
2377 min_clamp = vec4_bld.zero;
2378 max_clamp = vec4_bld.one;
2379 break;
2380 case PIPE_FORMAT_R8G8Bx_SNORM:
2381 min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2382 max_clamp = vec4_bld.one;
2383 break;
2384 /*
2385 * Note smallfloat formats usually don't need clamping
2386 * (they still have infinite range); however, this is not
2387 * true for r11g11b10 and r9g9b9e5, which can't represent
2388 * negative numbers (and additionally r9g9b9e5 can't represent
2389 * very large numbers). d3d10 seems happy without clamping in
2390 * this case, but gl spec is pretty clear: "for floating
2391 * point and integer formats, border values are clamped to
2392 * the representable range of the format" so do that here.
2393 */
2394 case PIPE_FORMAT_R11G11B10_FLOAT:
2395 min_clamp = vec4_bld.zero;
2396 break;
2397 case PIPE_FORMAT_R9G9B9E5_FLOAT:
2398 min_clamp = vec4_bld.zero;
2399 max_clamp = lp_build_const_vec(gallivm, vec4_type, MAX_RGB9E5);
2400 break;
2401 default:
2402 assert(0);
2403 break;
2404 }
2405 }
2406 }
2407
2408 if (min_clamp) {
2409 border_color = lp_build_max(&vec4_bld, border_color, min_clamp);
2410 }
2411 if (max_clamp) {
2412 border_color = lp_build_min(&vec4_bld, border_color, max_clamp);
2413 }
2414
2415 bld->border_color_clamped = border_color;
2416 }
2417
2418
2419 /**
2420 * General texture sampling codegen.
2421 * This function handles texture sampling for all texture targets (1D,
2422 * 2D, 3D, cube) and all filtering modes.
2423 */
2424 static void
2425 lp_build_sample_general(struct lp_build_sample_context *bld,
2426 unsigned sampler_unit,
2427 boolean is_gather,
2428 const LLVMValueRef *coords,
2429 const LLVMValueRef *offsets,
2430 LLVMValueRef lod_positive,
2431 LLVMValueRef lod_fpart,
2432 LLVMValueRef ilevel0,
2433 LLVMValueRef ilevel1,
2434 LLVMValueRef *colors_out)
2435 {
2436 LLVMBuilderRef builder = bld->gallivm->builder;
2437 const struct lp_static_sampler_state *sampler_state = bld->static_sampler_state;
2438 const unsigned mip_filter = sampler_state->min_mip_filter;
2439 const unsigned min_filter = sampler_state->min_img_filter;
2440 const unsigned mag_filter = sampler_state->mag_img_filter;
2441 LLVMValueRef texels[4];
2442 unsigned chan;
2443
2444 /* if we need border color, (potentially) clamp it now */
2445 if (lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_s,
2446 min_filter,
2447 mag_filter) ||
2448 (bld->dims > 1 &&
2449 lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_t,
2450 min_filter,
2451 mag_filter)) ||
2452 (bld->dims > 2 &&
2453 lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_r,
2454 min_filter,
2455 mag_filter))) {
2456 lp_build_clamp_border_color(bld, sampler_unit);
2457 }
2458
2459
2460 /*
2461 * Get/interpolate texture colors.
2462 */
2463
2464 for (chan = 0; chan < 4; ++chan) {
2465 texels[chan] = lp_build_alloca(bld->gallivm, bld->texel_bld.vec_type, "");
2466 lp_build_name(texels[chan], "sampler%u_texel_%c_var", sampler_unit, "xyzw"[chan]);
2467 }
2468
2469 if (min_filter == mag_filter) {
2470 /* no need to distinguish between minification and magnification */
2471 lp_build_sample_mipmap(bld, min_filter, mip_filter,
2472 is_gather,
2473 coords, offsets,
2474 ilevel0, ilevel1, lod_fpart,
2475 texels);
2476 }
2477 else {
2478 /*
2479 * Could also get rid of the if-logic and always use mipmap_both, both
2480 * for the single lod and multi-lod case if nothing really uses this.
2481 */
2482 if (bld->num_lods == 1) {
2483 /* Emit conditional to choose min image filter or mag image filter
2484 * depending on the lod being > 0 or <= 0, respectively.
2485 */
2486 struct lp_build_if_state if_ctx;
2487
2488 lod_positive = LLVMBuildTrunc(builder, lod_positive,
2489 LLVMInt1TypeInContext(bld->gallivm->context),
2490 "lod_pos");
2491
2492 lp_build_if(&if_ctx, bld->gallivm, lod_positive);
2493 {
2494 /* Use the minification filter */
2495 lp_build_sample_mipmap(bld, min_filter, mip_filter, FALSE,
2496 coords, offsets,
2497 ilevel0, ilevel1, lod_fpart,
2498 texels);
2499 }
2500 lp_build_else(&if_ctx);
2501 {
2502 /* Use the magnification filter */
2503 lp_build_sample_mipmap(bld, mag_filter, PIPE_TEX_MIPFILTER_NONE,
2504 FALSE,
2505 coords, offsets,
2506 ilevel0, NULL, NULL,
2507 texels);
2508 }
2509 lp_build_endif(&if_ctx);
2510 }
2511 else {
2512 LLVMValueRef need_linear, linear_mask;
2513 unsigned mip_filter_for_nearest;
2514 struct lp_build_if_state if_ctx;
2515
2516 if (min_filter == PIPE_TEX_FILTER_LINEAR) {
2517 linear_mask = lod_positive;
2518 mip_filter_for_nearest = PIPE_TEX_MIPFILTER_NONE;
2519 }
2520 else {
2521 linear_mask = lp_build_not(&bld->lodi_bld, lod_positive);
2522 mip_filter_for_nearest = mip_filter;
2523 }
2524 need_linear = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods,
2525 linear_mask);
2526 lp_build_name(need_linear, "need_linear");
2527
2528 if (bld->num_lods != bld->coord_type.length) {
2529 linear_mask = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
2530 bld->lodi_type,
2531 bld->int_coord_type,
2532 linear_mask);
2533 }
2534
2535 lp_build_if(&if_ctx, bld->gallivm, need_linear);
2536 {
2537 /*
2538 * Do sampling with both filters simultaneously. This means using
2539 * a linear filter and doing some tricks (with weights) for the pixels
2540 * which need nearest filter.
2541 * Note that it's probably rare some pixels need nearest and some
2542 * linear filter but the fixups required for the nearest pixels
2543 * aren't all that complicated so just always run a combined path
2544 * if at least some pixels require linear.
2545 */
2546 lp_build_sample_mipmap_both(bld, linear_mask, mip_filter,
2547 coords, offsets,
2548 ilevel0, ilevel1,
2549 lod_fpart, lod_positive,
2550 texels);
2551 }
2552 lp_build_else(&if_ctx);
2553 {
2554 /*
2555 * All pixels require just nearest filtering, which is way
2556 * cheaper than linear, hence do a separate path for that.
2557 */
2558 lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
2559 mip_filter_for_nearest, FALSE,
2560 coords, offsets,
2561 ilevel0, ilevel1, lod_fpart,
2562 texels);
2563 }
2564 lp_build_endif(&if_ctx);
2565 }
2566 }
2567
2568 for (chan = 0; chan < 4; ++chan) {
2569 colors_out[chan] = LLVMBuildLoad(builder, texels[chan], "");
2570 lp_build_name(colors_out[chan], "sampler%u_texel_%c", sampler_unit, "xyzw"[chan]);
2571 }
2572 }
2573
2574
2575 /**
2576 * Texel fetch function.
2577 * In contrast to general sampling there is no filtering, no coord minification,
2578 * lod (if any) is always an explicit uint, and coords are uints (in texel
2579 * units) applied directly to the selected mip level (after adding texel offsets).
2580 * This function handles texel fetch for all targets where texel fetch is supported
2581 * (no cube maps, but 1d, 2d, 3d are supported, arrays and buffers should be too).
2582 */
2583 static void
2584 lp_build_fetch_texel(struct lp_build_sample_context *bld,
2585 unsigned texture_unit,
2586 const LLVMValueRef *coords,
2587 LLVMValueRef explicit_lod,
2588 const LLVMValueRef *offsets,
2589 LLVMValueRef *colors_out)
2590 {
2591 struct lp_build_context *perquadi_bld = &bld->lodi_bld;
2592 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2593 unsigned dims = bld->dims, chan;
2594 unsigned target = bld->static_texture_state->target;
2595 boolean out_of_bound_ret_zero = TRUE;
2596 LLVMValueRef size, ilevel;
2597 LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
2598 LLVMValueRef x = coords[0], y = coords[1], z = coords[2];
2599 LLVMValueRef width, height, depth, i, j;
2600 LLVMValueRef offset, out_of_bounds, out1;
2601
2602 out_of_bounds = int_coord_bld->zero;
2603
2604 if (explicit_lod && bld->static_texture_state->target != PIPE_BUFFER) {
2605 if (bld->num_mips != int_coord_bld->type.length) {
2606 ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
2607 perquadi_bld->type, explicit_lod, 0);
2608 }
2609 else {
2610 ilevel = explicit_lod;
2611 }
2612 lp_build_nearest_mip_level(bld, texture_unit, ilevel, &ilevel,
2613 out_of_bound_ret_zero ? &out_of_bounds : NULL);
2614 }
2615 else {
2616 assert(bld->num_mips == 1);
2617 if (bld->static_texture_state->target != PIPE_BUFFER) {
2618 ilevel = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
2619 bld->context_ptr, texture_unit);
2620 }
2621 else {
2622 ilevel = lp_build_const_int32(bld->gallivm, 0);
2623 }
2624 }
2625 lp_build_mipmap_level_sizes(bld, ilevel,
2626 &size,
2627 &row_stride_vec, &img_stride_vec);
2628 lp_build_extract_image_sizes(bld, &bld->int_size_bld, int_coord_bld->type,
2629 size, &width, &height, &depth);
2630
2631 if (target == PIPE_TEXTURE_1D_ARRAY ||
2632 target == PIPE_TEXTURE_2D_ARRAY) {
2633 if (out_of_bound_ret_zero) {
2634 z = lp_build_layer_coord(bld, texture_unit, FALSE, z, &out1);
2635 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2636 }
2637 else {
2638 z = lp_build_layer_coord(bld, texture_unit, FALSE, z, NULL);
2639 }
2640 }
2641
2642 /* This is a lot like border sampling */
2643 if (offsets[0]) {
2644 /*
2645 * coords are really unsigned, offsets are signed, but I don't think
2646 * exceeding 31 bits is possible
2647 */
2648 x = lp_build_add(int_coord_bld, x, offsets[0]);
2649 }
2650 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
2651 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2652 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
2653 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2654
2655 if (dims >= 2) {
2656 if (offsets[1]) {
2657 y = lp_build_add(int_coord_bld, y, offsets[1]);
2658 }
2659 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
2660 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2661 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
2662 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2663
2664 if (dims >= 3) {
2665 if (offsets[2]) {
2666 z = lp_build_add(int_coord_bld, z, offsets[2]);
2667 }
2668 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
2669 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2670 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
2671 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2672 }
2673 }
2674
2675 lp_build_sample_offset(int_coord_bld,
2676 bld->format_desc,
2677 x, y, z, row_stride_vec, img_stride_vec,
2678 &offset, &i, &j);
2679
2680 if (bld->static_texture_state->target != PIPE_BUFFER) {
2681 offset = lp_build_add(int_coord_bld, offset,
2682 lp_build_get_mip_offsets(bld, ilevel));
2683 }
2684
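/*
 * Zero the offset for out-of-bounds lanes so the fetch below reads
 * texel 0 harmlessly; the bogus result is replaced with zero further
 * down (if requested).
 */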
2685 offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);
2686
2687 lp_build_fetch_rgba_soa(bld->gallivm,
2688 bld->format_desc,
2689 bld->texel_type, TRUE,
2690 bld->base_ptr, offset,
2691 i, j,
2692 bld->cache,
2693 colors_out);
2694
2695 if (out_of_bound_ret_zero) {
2696 /*
2697 * Only needed for ARB_robust_buffer_access_behavior and d3d10.
2698 * Could use min/max above instead of out-of-bounds comparisons
2699 * if we don't care about the result returned for out-of-bounds.
2700 */
2701 for (chan = 0; chan < 4; chan++) {
2702 colors_out[chan] = lp_build_select(&bld->texel_bld, out_of_bounds,
2703 bld->texel_bld.zero, colors_out[chan]);
2704 }
2705 }
2706 }
2707
2708
2709 /**
2710 * Just set texels to white instead of actually sampling the texture.
2711 * For debugging.
2712 */
2713 void
2714 lp_build_sample_nop(struct gallivm_state *gallivm,
2715 struct lp_type type,
2716 const LLVMValueRef *coords,
2717 LLVMValueRef texel_out[4])
2718 {
2719 LLVMValueRef one = lp_build_one(gallivm, type);
2720 unsigned chan;
2721
2722 for (chan = 0; chan < 4; chan++) {
2723 texel_out[chan] = one;
2724 }
2725 }
2726
2727
2728 /**
2729 * Build the actual texture sampling code.
2730 * 'texel' will return a vector of four LLVMValueRefs corresponding to
2731 * R, G, B, A.
2732 * \param type vector float type to use for coords, etc.
2733 * \param sample_key
2734 * \param derivs partial derivatives of (s,t,r,q) with respect to x and y
2735 */
2736 static void
2737 lp_build_sample_soa_code(struct gallivm_state *gallivm,
2738 const struct lp_static_texture_state *static_texture_state,
2739 const struct lp_static_sampler_state *static_sampler_state,
2740 struct lp_sampler_dynamic_state *dynamic_state,
2741 struct lp_type type,
2742 unsigned sample_key,
2743 unsigned texture_index,
2744 unsigned sampler_index,
2745 LLVMValueRef context_ptr,
2746 LLVMValueRef thread_data_ptr,
2747 const LLVMValueRef *coords,
2748 const LLVMValueRef *offsets,
2749 const struct lp_derivatives *derivs, /* optional */
2750 LLVMValueRef lod, /* optional */
2751 LLVMValueRef texel_out[4])
2752 {
2753 unsigned target = static_texture_state->target;
2754 unsigned dims = texture_dims(target);
2755 unsigned num_quads = type.length / 4;
2756 unsigned mip_filter, min_img_filter, mag_img_filter, i;
2757 struct lp_build_sample_context bld;
2758 struct lp_static_sampler_state derived_sampler_state = *static_sampler_state;
2759 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
2760 LLVMBuilderRef builder = gallivm->builder;
2761 LLVMValueRef tex_width, newcoords[5];
2762 enum lp_sampler_lod_property lod_property;
2763 enum lp_sampler_lod_control lod_control;
2764 enum lp_sampler_op_type op_type;
2765 LLVMValueRef lod_bias = NULL;
2766 LLVMValueRef explicit_lod = NULL;
2767 boolean op_is_tex, op_is_lodq, op_is_gather;
2768
2769 if (0) {
2770 enum pipe_format fmt = static_texture_state->format;
2771 debug_printf("Sample from %s\n", util_format_name(fmt));
2772 }
2773
2774 lod_property = (sample_key & LP_SAMPLER_LOD_PROPERTY_MASK) >>
2775 LP_SAMPLER_LOD_PROPERTY_SHIFT;
2776 lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
2777 LP_SAMPLER_LOD_CONTROL_SHIFT;
2778 op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
2779 LP_SAMPLER_OP_TYPE_SHIFT;
2780
2781 op_is_tex = op_type == LP_SAMPLER_OP_TEXTURE;
2782 op_is_lodq = op_type == LP_SAMPLER_OP_LODQ;
2783 op_is_gather = op_type == LP_SAMPLER_OP_GATHER;
2784
2785 if (lod_control == LP_SAMPLER_LOD_BIAS) {
2786 lod_bias = lod;
2787 assert(lod);
2788 assert(derivs == NULL);
2789 }
2790 else if (lod_control == LP_SAMPLER_LOD_EXPLICIT) {
2791 explicit_lod = lod;
2792 assert(lod);
2793 assert(derivs == NULL);
2794 }
2795 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
2796 assert(derivs);
2797 assert(lod == NULL);
2798 }
2799 else {
2800 assert(derivs == NULL);
2801 assert(lod == NULL);
2802 }
2803
2804 if (static_texture_state->format == PIPE_FORMAT_NONE) {
2805 /*
2806 * If there's nothing bound, format is NONE, and we must return
2807 * all zero as mandated by d3d10 in this case.
2808 */
2809 unsigned chan;
2810 LLVMValueRef zero = lp_build_zero(gallivm, type);
2811 for (chan = 0; chan < 4; chan++) {
2812 texel_out[chan] = zero;
2813 }
2814 return;
2815 }
2816
2817 assert(type.floating);
2818
2819 /* Setup our build context */
2820 memset(&bld, 0, sizeof bld);
2821 bld.gallivm = gallivm;
2822 bld.context_ptr = context_ptr;
2823 bld.static_sampler_state = &derived_sampler_state;
2824 bld.static_texture_state = static_texture_state;
2825 bld.dynamic_state = dynamic_state;
2826 bld.format_desc = util_format_description(static_texture_state->format);
2827 bld.dims = dims;
2828
2829 if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD || op_is_lodq) {
2830 bld.no_quad_lod = TRUE;
2831 }
2832 if (gallivm_perf & GALLIVM_PERF_NO_RHO_APPROX || op_is_lodq) {
2833 bld.no_rho_approx = TRUE;
2834 }
2835 if (gallivm_perf & GALLIVM_PERF_NO_BRILINEAR || op_is_lodq) {
2836 bld.no_brilinear = TRUE;
2837 }
2838
2839 bld.vector_width = lp_type_width(type);
2840
2841 bld.float_type = lp_type_float(32);
2842 bld.int_type = lp_type_int(32);
2843 bld.coord_type = type;
2844 bld.int_coord_type = lp_int_type(type);
2845 bld.float_size_in_type = lp_type_float(32);
2846 bld.float_size_in_type.length = dims > 1 ? 4 : 1;
2847 bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
2848 bld.texel_type = type;
2849
2850 /* always using the first channel should hopefully be safe,
2851 * if not, things WILL break in other places anyway.
2852 */
2853 if (bld.format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
2854 bld.format_desc->channel[0].pure_integer) {
2855 if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
2856 bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
2857 }
2858 else if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
2859 bld.texel_type = lp_type_uint_vec(type.width, type.width * type.length);
2860 }
2861 }
2862 else if (util_format_has_stencil(bld.format_desc) &&
2863 !util_format_has_depth(bld.format_desc)) {
2864 /* for stencil only formats, sample stencil (uint) */
2865 bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
2866 }
2867
2868 if (!static_texture_state->level_zero_only ||
2869 !static_sampler_state->max_lod_pos || op_is_lodq) {
2870 derived_sampler_state.min_mip_filter = static_sampler_state->min_mip_filter;
2871 } else {
2872 derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
2873 }
2874 if (op_is_gather) {
2875 /*
2876 * gather4 is exactly like GL_LINEAR filtering except it skips the actual
2877 * filtering at the end. It uses mostly the same paths, so cube face
2878 * selection, coord wrapping etc. all naturally use the same code.
2879 */
2880 derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
2881 derived_sampler_state.min_img_filter = PIPE_TEX_FILTER_LINEAR;
2882 derived_sampler_state.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
2883 }
2884 mip_filter = derived_sampler_state.min_mip_filter;
2885
2886 if (0) {
2887 debug_printf(" .min_mip_filter = %u\n", derived_sampler_state.min_mip_filter);
2888 }
2889
2890 if (static_texture_state->target == PIPE_TEXTURE_CUBE ||
2891 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)
2892 {
2893 /*
2894 * Seamless filtering ignores wrap modes.
2895 * Setting to CLAMP_TO_EDGE is correct for nearest filtering; for
2896 * bilinear it's not correct but way better than using, for instance, repeat.
2897 * Note we even set this for non-seamless. Technically GL allows any wrap
2898 * mode, which made sense when supporting true borders (can get seamless
2899 * effect with border and CLAMP_TO_BORDER), but gallium doesn't support
2900 * borders and d3d9 requires wrap modes to be ignored and it's a pain to fix
2901 * up the sampler state (as it makes it texture dependent).
2902 */
2903 derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2904 derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2905 }
2906 /*
2907 * We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
2908 * so the AoS path could be used. Not sure it's worth the trouble...
2909 */
2910
2911 min_img_filter = derived_sampler_state.min_img_filter;
2912 mag_img_filter = derived_sampler_state.mag_img_filter;
2913
2914
2915 /*
2916 * This is all a bit complicated; different paths are chosen for performance
2917 * reasons.
2918 * Essentially, there can be 1 lod per element, 1 lod per quad or 1 lod for
2919 * everything (the last two options are equivalent for the 4-wide case).
2920 * If there's per-quad lod but we split to 4-wide (so we can use AoS), the
2921 * per-quad lod is calculated first and the lod value extracted afterwards,
2922 * making this case, as far as lod handling for the further sample/filter
2923 * code is concerned, basically the same as the 1 lod for everything case.
2924 * Different lod handling mostly shows up when building mipmap sizes
2925 * (lp_build_mipmap_level_sizes() and friends) and also in filtering
2926 * (getting the fractional part of the lod to the right texels).
2927 */
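/*
 * E.g. an 8-wide (non-cube) tex op with implicit lod and a LINEAR mip
 * filter ends up with num_mips == num_lods == 2 (one per quad); the
 * same op with per-element explicit lod uses 8 of each; with
 * MIPFILTER_NONE but different min/mag filters only num_lods is
 * raised while num_mips stays 1.
 */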
2928
2929 /*
2930 * There are other situations where at least the multiple int lods could be
2931 * avoided like min and max lod being equal.
2932 */
2933 bld.num_mips = bld.num_lods = 1;
2934
2935 if (bld.no_quad_lod && bld.no_rho_approx &&
2936 ((mip_filter != PIPE_TEX_MIPFILTER_NONE && op_is_tex &&
2937 (static_texture_state->target == PIPE_TEXTURE_CUBE ||
2938 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)) ||
2939 op_is_lodq)) {
2940 /*
2941 * special case for using per-pixel lod even for implicit lod,
2942 * which is generally never required (ok by APIs) except to please
2943 * some (somewhat broken imho) tests (because per-pixel face selection
2944 * can cause derivatives to be different for pixels outside the primitive
2945 * due to the major axis division even if pre-project derivatives are
2946 * looking normal).
2947 * For lodq, we do it to simply avoid scalar pack / unpack (albeit for
2948 * cube maps we do indeed get per-pixel lod values).
2949 */
2950 bld.num_mips = type.length;
2951 bld.num_lods = type.length;
2952 }
2953 else if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT ||
2954 (explicit_lod || lod_bias || derivs)) {
2955 if ((!op_is_tex && target != PIPE_BUFFER) ||
2956 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
2957 bld.num_mips = type.length;
2958 bld.num_lods = type.length;
2959 }
2960 else if (op_is_tex && min_img_filter != mag_img_filter) {
2961 bld.num_mips = 1;
2962 bld.num_lods = type.length;
2963 }
2964 }
2965 /* TODO: for true scalar_lod should only use 1 lod value */
2966 else if ((!op_is_tex && explicit_lod && target != PIPE_BUFFER) ||
2967 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
2968 bld.num_mips = num_quads;
2969 bld.num_lods = num_quads;
2970 }
2971 else if (op_is_tex && min_img_filter != mag_img_filter) {
2972 bld.num_mips = 1;
2973 bld.num_lods = num_quads;
2974 }
2975
2976
2977 bld.lodf_type = type;
2978 /* we want native vector size to be able to use our intrinsics */
2979 if (bld.num_lods != type.length) {
2980 /* TODO: this currently always has to be per-quad or per-element */
2981 bld.lodf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
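/* e.g. type.length 8 or 16 yields a native 4-wide lod vector,
 * type.length 32 an 8-wide one */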
2982 }
2983 bld.lodi_type = lp_int_type(bld.lodf_type);
2984 bld.levelf_type = bld.lodf_type;
2985 if (bld.num_mips == 1) {
2986 bld.levelf_type.length = 1;
2987 }
2988 bld.leveli_type = lp_int_type(bld.levelf_type);
2989 bld.float_size_type = bld.float_size_in_type;
2990 /* Note: size vectors may not be native. They contain minified w/h/d/_ values,
2991 * with per-element lod that is w0/h0/d0/_/w1/h1/d1/_/... so up to 8x4f32 */
2992 if (bld.num_mips > 1) {
2993 bld.float_size_type.length = bld.num_mips == type.length ?
2994 bld.num_mips * bld.float_size_in_type.length :
2995 type.length;
2996 }
2997 bld.int_size_type = lp_int_type(bld.float_size_type);
2998
2999 lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
3000 lp_build_context_init(&bld.float_vec_bld, gallivm, type);
3001 lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
3002 lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
3003 lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
3004 lp_build_context_init(&bld.int_size_in_bld, gallivm, bld.int_size_in_type);
3005 lp_build_context_init(&bld.float_size_in_bld, gallivm, bld.float_size_in_type);
3006 lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
3007 lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
3008 lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
3009 lp_build_context_init(&bld.levelf_bld, gallivm, bld.levelf_type);
3010 lp_build_context_init(&bld.leveli_bld, gallivm, bld.leveli_type);
3011 lp_build_context_init(&bld.lodf_bld, gallivm, bld.lodf_type);
3012 lp_build_context_init(&bld.lodi_bld, gallivm, bld.lodi_type);
3013
3014 /* Get the dynamic state */
3015 tex_width = dynamic_state->width(dynamic_state, gallivm,
3016 context_ptr, texture_index);
3017 bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm,
3018 context_ptr, texture_index);
3019 bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm,
3020 context_ptr, texture_index);
3021 bld.base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
3022 context_ptr, texture_index);
3023 bld.mip_offsets = dynamic_state->mip_offsets(dynamic_state, gallivm,
3024 context_ptr, texture_index);
3025 /* Note that mip_offsets is an array[level] of offsets to texture images */
3026
3027 if (dynamic_state->cache_ptr && thread_data_ptr) {
3028 bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
3029 thread_data_ptr, texture_index);
3030 }
3031
3032 /* width, height, depth as single int vector */
3033 if (dims <= 1) {
3034 bld.int_size = tex_width;
3035 }
3036 else {
3037 bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_in_bld.undef,
3038 tex_width,
3039 LLVMConstInt(i32t, 0, 0), "");
3040 if (dims >= 2) {
3041 LLVMValueRef tex_height =
3042 dynamic_state->height(dynamic_state, gallivm,
3043 context_ptr, texture_index);
3044 bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3045 tex_height,
3046 LLVMConstInt(i32t, 1, 0), "");
3047 if (dims >= 3) {
3048 LLVMValueRef tex_depth =
3049 dynamic_state->depth(dynamic_state, gallivm, context_ptr,
3050 texture_index);
3051 bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3052 tex_depth,
3053 LLVMConstInt(i32t, 2, 0), "");
3054 }
3055 }
3056 }
3057
3058 for (i = 0; i < 5; i++) {
3059 newcoords[i] = coords[i];
3060 }
3061
3062 if (util_format_is_pure_integer(static_texture_state->format) &&
3063 !util_format_has_depth(bld.format_desc) && op_is_tex &&
3064 (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
3065 static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3066 static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3067 /*
3068 * Bail if impossible filtering is specified (the awkward additional
3069 * depth check is because it is legal in gallium to have things like S8Z24
3070 * here which claim to be pure int even though such formats should sample
3071 * the depth component).
3072 * In GL such filters make the texture incomplete; bailing here makes us
3073 * robust against state trackers which set this up regardless (we'd crash
3074 * in the lerp later otherwise).
3075 * At least in some apis it may be legal to use such filters with lod
3076 * queries and/or gather (at least for gather d3d10 says only the wrap
3077 * bits are really used hence filter bits are likely simply ignored).
3078 * For fetch, we don't get valid samplers either way here.
3079 */
3080 unsigned chan;
3081 LLVMValueRef zero = lp_build_zero(gallivm, type);
3082 for (chan = 0; chan < 4; chan++) {
3083 texel_out[chan] = zero;
3084 }
3085 return;
3086 }
3087
3088 if (0) {
3089 /* For debug: no-op texture sampling */
3090 lp_build_sample_nop(gallivm,
3091 bld.texel_type,
3092 newcoords,
3093 texel_out);
3094 }
3095
3096 else if (op_type == LP_SAMPLER_OP_FETCH) {
3097 lp_build_fetch_texel(&bld, texture_index, newcoords,
3098 lod, offsets,
3099 texel_out);
3100 }
3101
3102 else {
3103 LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
3104 LLVMValueRef ilevel0 = NULL, ilevel1 = NULL, lod = NULL;
3105 boolean use_aos;
3106
3107 use_aos = util_format_fits_8unorm(bld.format_desc) &&
3108 op_is_tex &&
3109 /* not sure this is strictly needed or simply impossible */
3110 derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
3111 lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
3112
3113 use_aos &= bld.num_lods <= num_quads ||
3114 derived_sampler_state.min_img_filter ==
3115 derived_sampler_state.mag_img_filter;
3116
      if (gallivm_perf & GALLIVM_PERF_NO_AOS_SAMPLING) {
3118 use_aos = 0;
3119 }
3120
3121 if (dims > 1) {
3122 use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_t);
3123 if (dims > 2) {
3124 use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_r);
3125 }
3126 }
3127 if ((static_texture_state->target == PIPE_TEXTURE_CUBE ||
3128 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3129 derived_sampler_state.seamless_cube_map &&
3130 (derived_sampler_state.min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3131 derived_sampler_state.mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3132 /* theoretically possible with AoS filtering but not implemented (complex!) */
3133 use_aos = 0;
3134 }
3135
3136 if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
3137 !use_aos && util_format_fits_8unorm(bld.format_desc)) {
3138 debug_printf("%s: using floating point linear filtering for %s\n",
3139 __FUNCTION__, bld.format_desc->short_name);
3140 debug_printf(" min_img %d mag_img %d mip %d target %d seamless %d"
3141 " wraps %d wrapt %d wrapr %d\n",
3142 derived_sampler_state.min_img_filter,
3143 derived_sampler_state.mag_img_filter,
3144 derived_sampler_state.min_mip_filter,
3145 static_texture_state->target,
3146 derived_sampler_state.seamless_cube_map,
3147 derived_sampler_state.wrap_s,
3148 derived_sampler_state.wrap_t,
3149 derived_sampler_state.wrap_r);
3150 }
3151
3152 lp_build_sample_common(&bld, op_is_lodq, texture_index, sampler_index,
3153 newcoords,
3154 derivs, lod_bias, explicit_lod,
3155 &lod_positive, &lod, &lod_fpart,
3156 &ilevel0, &ilevel1);
3157
3158 if (op_is_lodq) {
3159 texel_out[0] = lod_fpart;
3160 texel_out[1] = lod;
3161 texel_out[2] = texel_out[3] = bld.coord_bld.zero;
3162 return;
3163 }
3164
3165 if (use_aos && static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
3166 /* The aos path doesn't do seamless filtering so simply add cube layer
3167 * to face now.
3168 */
3169 newcoords[2] = lp_build_add(&bld.int_coord_bld, newcoords[2], newcoords[3]);
3170 }
3171
      /*
       * We only try 8-wide sampling with SoA, or with AoS if we have AVX2,
       * as 8-wide AoS sampling appears to be a loss with just AVX.
       */
3176 if (num_quads == 1 || !use_aos ||
3177 (util_cpu_caps.has_avx2 &&
3178 (bld.num_lods == 1 ||
3179 derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
3180 if (use_aos) {
3181 /* do sampling/filtering with fixed pt arithmetic */
3182 lp_build_sample_aos(&bld, sampler_index,
3183 newcoords[0], newcoords[1],
3184 newcoords[2],
3185 offsets, lod_positive, lod_fpart,
3186 ilevel0, ilevel1,
3187 texel_out);
3188 }
3189
3190 else {
3191 lp_build_sample_general(&bld, sampler_index,
3192 op_type == LP_SAMPLER_OP_GATHER,
3193 newcoords, offsets,
3194 lod_positive, lod_fpart,
3195 ilevel0, ilevel1,
3196 texel_out);
3197 }
3198 }
3199 else {
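         /*
          * Wide AoS sampling without AVX2: split the vector into 4-wide
          * chunks, sample each chunk with its own 4-wide build context,
          * and concatenate the per-channel results at the end.
          */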
3200 unsigned j;
3201 struct lp_build_sample_context bld4;
3202 struct lp_type type4 = type;
3203 unsigned i;
3204 LLVMValueRef texelout4[4];
3205 LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
3206
3207 type4.length = 4;
3208
3209 /* Setup our build context */
3210 memset(&bld4, 0, sizeof bld4);
3211 bld4.no_quad_lod = bld.no_quad_lod;
3212 bld4.no_rho_approx = bld.no_rho_approx;
3213 bld4.no_brilinear = bld.no_brilinear;
3214 bld4.gallivm = bld.gallivm;
3215 bld4.context_ptr = bld.context_ptr;
3216 bld4.static_texture_state = bld.static_texture_state;
3217 bld4.static_sampler_state = bld.static_sampler_state;
3218 bld4.dynamic_state = bld.dynamic_state;
3219 bld4.format_desc = bld.format_desc;
3220 bld4.dims = bld.dims;
3221 bld4.row_stride_array = bld.row_stride_array;
3222 bld4.img_stride_array = bld.img_stride_array;
3223 bld4.base_ptr = bld.base_ptr;
3224 bld4.mip_offsets = bld.mip_offsets;
3225 bld4.int_size = bld.int_size;
3226 bld4.cache = bld.cache;
3227
3228 bld4.vector_width = lp_type_width(type4);
3229
3230 bld4.float_type = lp_type_float(32);
3231 bld4.int_type = lp_type_int(32);
3232 bld4.coord_type = type4;
3233 bld4.int_coord_type = lp_int_type(type4);
3234 bld4.float_size_in_type = lp_type_float(32);
3235 bld4.float_size_in_type.length = dims > 1 ? 4 : 1;
3236 bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
3237 bld4.texel_type = bld.texel_type;
3238 bld4.texel_type.length = 4;
3239
3240 bld4.num_mips = bld4.num_lods = 1;
3241 if (bld4.no_quad_lod && bld4.no_rho_approx &&
3242 (static_texture_state->target == PIPE_TEXTURE_CUBE ||
3243 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3244 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3245 bld4.num_mips = type4.length;
3246 bld4.num_lods = type4.length;
3247 }
3248 if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
3249 (explicit_lod || lod_bias || derivs)) {
3250 if ((!op_is_tex && target != PIPE_BUFFER) ||
3251 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3252 bld4.num_mips = type4.length;
3253 bld4.num_lods = type4.length;
3254 }
3255 else if (op_is_tex && min_img_filter != mag_img_filter) {
3256 bld4.num_mips = 1;
3257 bld4.num_lods = type4.length;
3258 }
3259 }
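         /*
          * num_mips/num_lods == type4.length means a per-element mip level /
          * lod; a value of 1 means one shared value for the whole 4-wide
          * vector.
          */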
3260
3261 /* we want native vector size to be able to use our intrinsics */
3262 bld4.lodf_type = type4;
3263 if (bld4.num_lods != type4.length) {
3264 bld4.lodf_type.length = 1;
3265 }
3266 bld4.lodi_type = lp_int_type(bld4.lodf_type);
3267 bld4.levelf_type = type4;
3268 if (bld4.num_mips != type4.length) {
3269 bld4.levelf_type.length = 1;
3270 }
3271 bld4.leveli_type = lp_int_type(bld4.levelf_type);
3272 bld4.float_size_type = bld4.float_size_in_type;
3273 if (bld4.num_mips > 1) {
3274 bld4.float_size_type.length = bld4.num_mips == type4.length ?
3275 bld4.num_mips * bld4.float_size_in_type.length :
3276 type4.length;
3277 }
3278 bld4.int_size_type = lp_int_type(bld4.float_size_type);
3279
3280 lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
3281 lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
3282 lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
3283 lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
3284 lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
3285 lp_build_context_init(&bld4.int_size_in_bld, gallivm, bld4.int_size_in_type);
3286 lp_build_context_init(&bld4.float_size_in_bld, gallivm, bld4.float_size_in_type);
3287 lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
3288 lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
3289 lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
3290 lp_build_context_init(&bld4.levelf_bld, gallivm, bld4.levelf_type);
3291 lp_build_context_init(&bld4.leveli_bld, gallivm, bld4.leveli_type);
3292 lp_build_context_init(&bld4.lodf_bld, gallivm, bld4.lodf_type);
3293 lp_build_context_init(&bld4.lodi_bld, gallivm, bld4.lodi_type);
3294
3295 for (i = 0; i < num_quads; i++) {
3296 LLVMValueRef s4, t4, r4;
3297 LLVMValueRef lod_positive4, lod_fpart4 = NULL;
3298 LLVMValueRef ilevel04, ilevel14 = NULL;
3299 LLVMValueRef offsets4[4] = { NULL };
3300 unsigned num_lods = bld4.num_lods;
3301
3302 s4 = lp_build_extract_range(gallivm, newcoords[0], 4*i, 4);
3303 t4 = lp_build_extract_range(gallivm, newcoords[1], 4*i, 4);
3304 r4 = lp_build_extract_range(gallivm, newcoords[2], 4*i, 4);
3305
3306 if (offsets[0]) {
3307 offsets4[0] = lp_build_extract_range(gallivm, offsets[0], 4*i, 4);
3308 if (dims > 1) {
3309 offsets4[1] = lp_build_extract_range(gallivm, offsets[1], 4*i, 4);
3310 if (dims > 2) {
3311 offsets4[2] = lp_build_extract_range(gallivm, offsets[2], 4*i, 4);
3312 }
3313 }
3314 }
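            /* Extract this quad's slice of the lod/mip level values. */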
3315 lod_positive4 = lp_build_extract_range(gallivm, lod_positive, num_lods * i, num_lods);
3316 ilevel04 = bld.num_mips == 1 ? ilevel0 :
3317 lp_build_extract_range(gallivm, ilevel0, num_lods * i, num_lods);
3318 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
3319 ilevel14 = lp_build_extract_range(gallivm, ilevel1, num_lods * i, num_lods);
3320 lod_fpart4 = lp_build_extract_range(gallivm, lod_fpart, num_lods * i, num_lods);
3321 }
3322
3323 if (use_aos) {
3324 /* do sampling/filtering with fixed pt arithmetic */
3325 lp_build_sample_aos(&bld4, sampler_index,
3326 s4, t4, r4, offsets4,
3327 lod_positive4, lod_fpart4,
3328 ilevel04, ilevel14,
3329 texelout4);
3330 }
3331
3332 else {
3333 /* this path is currently unreachable and hence might break easily... */
3334 LLVMValueRef newcoords4[5];
3335 newcoords4[0] = s4;
3336 newcoords4[1] = t4;
3337 newcoords4[2] = r4;
3338 newcoords4[3] = lp_build_extract_range(gallivm, newcoords[3], 4*i, 4);
3339 newcoords4[4] = lp_build_extract_range(gallivm, newcoords[4], 4*i, 4);
3340
3341 lp_build_sample_general(&bld4, sampler_index,
3342 op_type == LP_SAMPLER_OP_GATHER,
3343 newcoords4, offsets4,
3344 lod_positive4, lod_fpart4,
3345 ilevel04, ilevel14,
3346 texelout4);
3347 }
3348 for (j = 0; j < 4; j++) {
3349 texelouttmp[j][i] = texelout4[j];
3350 }
3351 }
3352
3353 for (j = 0; j < 4; j++) {
3354 texel_out[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
3355 }
3356 }
3357 }
3358
3359 if (target != PIPE_BUFFER && op_type != LP_SAMPLER_OP_GATHER) {
3360 apply_sampler_swizzle(&bld, texel_out);
3361 }
3362
   /*
    * The texel type can be a (32bit) int/uint (for pure int formats only);
    * however, we are expected to always return floats (storage is untyped).
    */
3367 if (!bld.texel_type.floating) {
3368 unsigned chan;
3369 for (chan = 0; chan < 4; chan++) {
3370 texel_out[chan] = LLVMBuildBitCast(builder, texel_out[chan],
3371 lp_build_vec_type(gallivm, type), "");
3372 }
3373 }
3374 }
3375
3376
3377 #define USE_TEX_FUNC_CALL 1
3378
3379 #define LP_MAX_TEX_FUNC_ARGS 32
3380
3381 static inline void
3382 get_target_info(enum pipe_texture_target target,
3383 unsigned *num_coords, unsigned *num_derivs,
3384 unsigned *num_offsets, unsigned *layer)
3385 {
3386 unsigned dims = texture_dims(target);
3387 *num_coords = dims;
3388 *num_offsets = dims;
3389 *num_derivs = (target == PIPE_TEXTURE_CUBE ||
3390 target == PIPE_TEXTURE_CUBE_ARRAY) ? 3 : dims;
   *layer = has_layer_coord(target) ? 2 : 0;
3392 if (target == PIPE_TEXTURE_CUBE_ARRAY) {
      /*
       * dims doesn't include the r coord for cubes - that is handled
       * by layer instead, but we need to fix it up for cube arrays...
       */
3397 *layer = 3;
3398 *num_coords = 3;
3399 }
3400 }
3401
3402
3403 /**
3404 * Generate the function body for a texture sampling function.
3405 */
3406 static void
3407 lp_build_sample_gen_func(struct gallivm_state *gallivm,
3408 const struct lp_static_texture_state *static_texture_state,
3409 const struct lp_static_sampler_state *static_sampler_state,
3410 struct lp_sampler_dynamic_state *dynamic_state,
3411 struct lp_type type,
3412 unsigned texture_index,
3413 unsigned sampler_index,
3414 LLVMValueRef function,
3415 unsigned num_args,
3416 unsigned sample_key)
3417 {
3418 LLVMBuilderRef old_builder;
3419 LLVMBasicBlockRef block;
3420 LLVMValueRef coords[5];
3421 LLVMValueRef offsets[3] = { NULL };
3422 LLVMValueRef lod = NULL;
3423 LLVMValueRef context_ptr;
3424 LLVMValueRef thread_data_ptr = NULL;
3425 LLVMValueRef texel_out[4];
3426 struct lp_derivatives derivs;
3427 struct lp_derivatives *deriv_ptr = NULL;
3428 unsigned num_param = 0;
3429 unsigned i, num_coords, num_derivs, num_offsets, layer;
3430 enum lp_sampler_lod_control lod_control;
3431 boolean need_cache = FALSE;
3432
3433 lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3434 LP_SAMPLER_LOD_CONTROL_SHIFT;
3435
3436 get_target_info(static_texture_state->target,
3437 &num_coords, &num_derivs, &num_offsets, &layer);
3438
3439 if (dynamic_state->cache_ptr) {
3440 const struct util_format_description *format_desc;
3441 format_desc = util_format_description(static_texture_state->format);
3442 if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3443 need_cache = TRUE;
3444 }
3445 }
3446
3447 /* "unpack" arguments */
3448 context_ptr = LLVMGetParam(function, num_param++);
3449 if (need_cache) {
3450 thread_data_ptr = LLVMGetParam(function, num_param++);
3451 }
3452 for (i = 0; i < num_coords; i++) {
3453 coords[i] = LLVMGetParam(function, num_param++);
3454 }
3455 for (i = num_coords; i < 5; i++) {
3456 /* This is rather unfortunate... */
3457 coords[i] = lp_build_undef(gallivm, type);
3458 }
3459 if (layer) {
3460 coords[layer] = LLVMGetParam(function, num_param++);
3461 }
3462 if (sample_key & LP_SAMPLER_SHADOW) {
3463 coords[4] = LLVMGetParam(function, num_param++);
3464 }
3465 if (sample_key & LP_SAMPLER_OFFSETS) {
3466 for (i = 0; i < num_offsets; i++) {
3467 offsets[i] = LLVMGetParam(function, num_param++);
3468 }
3469 }
3470 if (lod_control == LP_SAMPLER_LOD_BIAS ||
3471 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3472 lod = LLVMGetParam(function, num_param++);
3473 }
3474 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3475 for (i = 0; i < num_derivs; i++) {
3476 derivs.ddx[i] = LLVMGetParam(function, num_param++);
3477 derivs.ddy[i] = LLVMGetParam(function, num_param++);
3478 }
3479 deriv_ptr = &derivs;
3480 }
3481
3482 assert(num_args == num_param);
3483
3484 /*
3485 * Function body
3486 */
3487
3488 old_builder = gallivm->builder;
3489 block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
3490 gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
3491 LLVMPositionBuilderAtEnd(gallivm->builder, block);
3492
3493 lp_build_sample_soa_code(gallivm,
3494 static_texture_state,
3495 static_sampler_state,
3496 dynamic_state,
3497 type,
3498 sample_key,
3499 texture_index,
3500 sampler_index,
3501 context_ptr,
3502 thread_data_ptr,
3503 coords,
3504 offsets,
3505 deriv_ptr,
3506 lod,
3507 texel_out);
3508
3509 LLVMBuildAggregateRet(gallivm->builder, texel_out, 4);
3510
3511 LLVMDisposeBuilder(gallivm->builder);
3512 gallivm->builder = old_builder;
3513
3514 gallivm_verify_function(gallivm, function);
3515 }
3516
3517
3518 /**
3519 * Call the matching function for texture sampling.
3520 * If there's no match, generate a new one.
3521 */
3522 static void
3523 lp_build_sample_soa_func(struct gallivm_state *gallivm,
3524 const struct lp_static_texture_state *static_texture_state,
3525 const struct lp_static_sampler_state *static_sampler_state,
3526 struct lp_sampler_dynamic_state *dynamic_state,
3527 const struct lp_sampler_params *params)
3528 {
3529 LLVMBuilderRef builder = gallivm->builder;
3530 LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(
3531 LLVMGetInsertBlock(builder)));
3532 LLVMValueRef function, inst;
3533 LLVMValueRef args[LP_MAX_TEX_FUNC_ARGS];
3534 LLVMBasicBlockRef bb;
3535 LLVMValueRef tex_ret;
3536 unsigned num_args = 0;
3537 char func_name[64];
3538 unsigned i, num_coords, num_derivs, num_offsets, layer;
3539 unsigned texture_index = params->texture_index;
3540 unsigned sampler_index = params->sampler_index;
3541 unsigned sample_key = params->sample_key;
3542 const LLVMValueRef *coords = params->coords;
3543 const LLVMValueRef *offsets = params->offsets;
3544 const struct lp_derivatives *derivs = params->derivs;
3545 enum lp_sampler_lod_control lod_control;
3546 boolean need_cache = FALSE;
3547
3548 lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3549 LP_SAMPLER_LOD_CONTROL_SHIFT;
3550
3551 get_target_info(static_texture_state->target,
3552 &num_coords, &num_derivs, &num_offsets, &layer);
3553
3554 if (dynamic_state->cache_ptr) {
3555 const struct util_format_description *format_desc;
3556 format_desc = util_format_description(static_texture_state->format);
3557 if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3558 need_cache = TRUE;
3559 }
3560 }
   /*
    * Texture function matches are found by name.
    * Thus the name has to include both the texture and sampler unit
    * (which covers all static state) plus the actual texture function
    * (including things like offsets, shadow coord, lod control),
    * and lod_property has to be included too.
    */
3568
3569 snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
3570 texture_index, sampler_index, sample_key);
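   /*
    * For example, texture unit 0 with sampler unit 1 and sample_key 0x42
    * (values purely illustrative) yields "texfunc_res_0_sam_1_42".
    */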
3571
3572 function = LLVMGetNamedFunction(module, func_name);
3573
   if (!function) {
3575 LLVMTypeRef arg_types[LP_MAX_TEX_FUNC_ARGS];
3576 LLVMTypeRef ret_type;
3577 LLVMTypeRef function_type;
3578 LLVMTypeRef val_type[4];
3579 unsigned num_param = 0;
3580
3581 /*
3582 * Generate the function prototype.
3583 */
3584
3585 arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
3586 if (need_cache) {
3587 arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
3588 }
3589 for (i = 0; i < num_coords; i++) {
3590 arg_types[num_param++] = LLVMTypeOf(coords[0]);
3591 assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
3592 }
3593 if (layer) {
3594 arg_types[num_param++] = LLVMTypeOf(coords[layer]);
3595 assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[layer]));
3596 }
3597 if (sample_key & LP_SAMPLER_SHADOW) {
3598 arg_types[num_param++] = LLVMTypeOf(coords[0]);
3599 }
3600 if (sample_key & LP_SAMPLER_OFFSETS) {
3601 for (i = 0; i < num_offsets; i++) {
3602 arg_types[num_param++] = LLVMTypeOf(offsets[0]);
3603 assert(LLVMTypeOf(offsets[0]) == LLVMTypeOf(offsets[i]));
3604 }
3605 }
3606 if (lod_control == LP_SAMPLER_LOD_BIAS ||
3607 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3608 arg_types[num_param++] = LLVMTypeOf(params->lod);
3609 }
3610 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3611 for (i = 0; i < num_derivs; i++) {
3612 arg_types[num_param++] = LLVMTypeOf(derivs->ddx[i]);
3613 arg_types[num_param++] = LLVMTypeOf(derivs->ddy[i]);
3614 assert(LLVMTypeOf(derivs->ddx[0]) == LLVMTypeOf(derivs->ddx[i]));
3615 assert(LLVMTypeOf(derivs->ddy[0]) == LLVMTypeOf(derivs->ddy[i]));
3616 }
3617 }
3618
3619 val_type[0] = val_type[1] = val_type[2] = val_type[3] =
3620 lp_build_vec_type(gallivm, params->type);
3621 ret_type = LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
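      /* The function returns all four texel channels as a struct of vectors. */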
3622 function_type = LLVMFunctionType(ret_type, arg_types, num_param, 0);
3623 function = LLVMAddFunction(module, func_name, function_type);
3624
3625 for (i = 0; i < num_param; ++i) {
         if (LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
            lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
         }
3630 }
3631
3632 LLVMSetFunctionCallConv(function, LLVMFastCallConv);
3633 LLVMSetLinkage(function, LLVMInternalLinkage);
3634
3635 lp_build_sample_gen_func(gallivm,
3636 static_texture_state,
3637 static_sampler_state,
3638 dynamic_state,
3639 params->type,
3640 texture_index,
3641 sampler_index,
3642 function,
3643 num_param,
3644 sample_key);
3645 }
3646
3647 num_args = 0;
3648 args[num_args++] = params->context_ptr;
3649 if (need_cache) {
3650 args[num_args++] = params->thread_data_ptr;
3651 }
3652 for (i = 0; i < num_coords; i++) {
3653 args[num_args++] = coords[i];
3654 }
3655 if (layer) {
3656 args[num_args++] = coords[layer];
3657 }
3658 if (sample_key & LP_SAMPLER_SHADOW) {
3659 args[num_args++] = coords[4];
3660 }
3661 if (sample_key & LP_SAMPLER_OFFSETS) {
3662 for (i = 0; i < num_offsets; i++) {
3663 args[num_args++] = offsets[i];
3664 }
3665 }
3666 if (lod_control == LP_SAMPLER_LOD_BIAS ||
3667 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3668 args[num_args++] = params->lod;
3669 }
3670 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3671 for (i = 0; i < num_derivs; i++) {
3672 args[num_args++] = derivs->ddx[i];
3673 args[num_args++] = derivs->ddy[i];
3674 }
3675 }
3676
3677 assert(num_args <= LP_MAX_TEX_FUNC_ARGS);
3678
3679 tex_ret = LLVMBuildCall(builder, function, args, num_args, "");
3680 bb = LLVMGetInsertBlock(builder);
3681 inst = LLVMGetLastInstruction(bb);
3682 LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
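   /*
    * The call site must use the same calling convention as the function
    * definition above; mismatched conventions are undefined in LLVM.
    */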
3683
3684 for (i = 0; i < 4; i++) {
3685 params->texel[i] = LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
3686 }
3687 }
3688
3689
3690 /**
3691 * Build texture sampling code.
3692 * Either via a function call or inline it directly.
3693 */
3694 void
3695 lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
3696 const struct lp_static_sampler_state *static_sampler_state,
3697 struct lp_sampler_dynamic_state *dynamic_state,
3698 struct gallivm_state *gallivm,
3699 const struct lp_sampler_params *params)
3700 {
3701 boolean use_tex_func = FALSE;
3702
   /*
    * Do not use a function call if the sampling is "simple enough".
    * We define this by
    * a) format
    * b) no mips (either one level only or no mip filter)
    * No mips will definitely make the code smaller, though
    * the format requirement is a bit iffy - there are some (SoA) formats
    * which definitely generate less code. It does, however, happen to catch
    * some important cases which are hurt quite a bit by using a call
    * (though not really because of the call overhead but because they are
    * reusing the same texture unit with some of the same parameters).
    * Ideally we'd let llvm recognize this stuff by doing IPO passes.
    */
3717
3718 if (USE_TEX_FUNC_CALL) {
3719 const struct util_format_description *format_desc;
3720 boolean simple_format;
3721 boolean simple_tex;
3722 enum lp_sampler_op_type op_type;
3723 format_desc = util_format_description(static_texture_state->format);
3724 simple_format = !format_desc ||
3725 (util_format_is_rgba8_variant(format_desc) &&
3726 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
3727
3728 op_type = (params->sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3729 LP_SAMPLER_OP_TYPE_SHIFT;
3730 simple_tex =
3731 op_type != LP_SAMPLER_OP_TEXTURE ||
3732 ((static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE ||
3733 static_texture_state->level_zero_only == TRUE) &&
3734 static_sampler_state->min_img_filter == static_sampler_state->mag_img_filter);
3735
3736 use_tex_func = format_desc && !(simple_format && simple_tex);
3737 }
3738
3739 if (use_tex_func) {
3740 lp_build_sample_soa_func(gallivm,
3741 static_texture_state,
3742 static_sampler_state,
3743 dynamic_state,
3744 params);
3745 }
3746 else {
3747 lp_build_sample_soa_code(gallivm,
3748 static_texture_state,
3749 static_sampler_state,
3750 dynamic_state,
3751 params->type,
3752 params->sample_key,
3753 params->texture_index,
3754 params->sampler_index,
3755 params->context_ptr,
3756 params->thread_data_ptr,
3757 params->coords,
3758 params->offsets,
3759 params->derivs,
3760 params->lod,
3761 params->texel);
3762 }
3763 }
3764
3765
3766 void
3767 lp_build_size_query_soa(struct gallivm_state *gallivm,
3768 const struct lp_static_texture_state *static_state,
3769 struct lp_sampler_dynamic_state *dynamic_state,
3770 const struct lp_sampler_size_query_params *params)
3771 {
3772 LLVMValueRef lod, level = 0, size;
3773 LLVMValueRef first_level = NULL;
3774 int dims, i;
3775 boolean has_array;
3776 unsigned num_lods = 1;
3777 struct lp_build_context bld_int_vec4;
3778 LLVMValueRef context_ptr = params->context_ptr;
3779 unsigned texture_unit = params->texture_unit;
3780 unsigned target = params->target;
3781
3782 if (static_state->format == PIPE_FORMAT_NONE) {
3783 /*
3784 * If there's nothing bound, format is NONE, and we must return
3785 * all zero as mandated by d3d10 in this case.
3786 */
3787 unsigned chan;
3788 LLVMValueRef zero = lp_build_const_vec(gallivm, params->int_type, 0.0F);
3789 for (chan = 0; chan < 4; chan++) {
3790 params->sizes_out[chan] = zero;
3791 }
3792 return;
3793 }
3794
   /*
    * Do some sanity verification about the bound texture and shader dcl
    * target. Not entirely sure what's possible, but assume array/non-array
    * is always compatible (probably not ok for OpenGL, but d3d10 has no
    * distinction of arrays at the resource level).
    * Everything else looks bogus (though not entirely sure about rect/2d).
    * Currently disabled because it causes assertion failures if there's
    * nothing bound (or rather a dummy texture, not that this case would
    * return the right values).
    */
3805 if (0 && static_state->target != target) {
3806 if (static_state->target == PIPE_TEXTURE_1D)
3807 assert(target == PIPE_TEXTURE_1D_ARRAY);
3808 else if (static_state->target == PIPE_TEXTURE_1D_ARRAY)
3809 assert(target == PIPE_TEXTURE_1D);
3810 else if (static_state->target == PIPE_TEXTURE_2D)
3811 assert(target == PIPE_TEXTURE_2D_ARRAY);
3812 else if (static_state->target == PIPE_TEXTURE_2D_ARRAY)
3813 assert(target == PIPE_TEXTURE_2D);
3814 else if (static_state->target == PIPE_TEXTURE_CUBE)
3815 assert(target == PIPE_TEXTURE_CUBE_ARRAY);
3816 else if (static_state->target == PIPE_TEXTURE_CUBE_ARRAY)
3817 assert(target == PIPE_TEXTURE_CUBE);
3818 else
3819 assert(0);
3820 }
3821
3822 dims = texture_dims(target);
3823
3824 switch (target) {
3825 case PIPE_TEXTURE_1D_ARRAY:
3826 case PIPE_TEXTURE_2D_ARRAY:
3827 case PIPE_TEXTURE_CUBE_ARRAY:
3828 has_array = TRUE;
3829 break;
3830 default:
3831 has_array = FALSE;
3832 break;
3833 }
3834
3835 assert(!params->int_type.floating);
3836
3837 lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));
3838
3839 if (params->explicit_lod) {
3840 /* FIXME: this needs to honor per-element lod */
3841 lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod,
3842 lp_build_const_int32(gallivm, 0), "");
3843 first_level = dynamic_state->first_level(dynamic_state, gallivm,
3844 context_ptr, texture_unit);
3845 level = LLVMBuildAdd(gallivm->builder, lod, first_level, "level");
3846 lod = lp_build_broadcast_scalar(&bld_int_vec4, level);
3847 } else {
3848 lod = bld_int_vec4.zero;
3849 }
3850
3851 size = bld_int_vec4.undef;
3852
3853 size = LLVMBuildInsertElement(gallivm->builder, size,
3854 dynamic_state->width(dynamic_state, gallivm,
3855 context_ptr, texture_unit),
3856 lp_build_const_int32(gallivm, 0), "");
3857
3858 if (dims >= 2) {
3859 size = LLVMBuildInsertElement(gallivm->builder, size,
3860 dynamic_state->height(dynamic_state, gallivm,
3861 context_ptr, texture_unit),
3862 lp_build_const_int32(gallivm, 1), "");
3863 }
3864
3865 if (dims >= 3) {
3866 size = LLVMBuildInsertElement(gallivm->builder, size,
3867 dynamic_state->depth(dynamic_state, gallivm,
3868 context_ptr, texture_unit),
3869 lp_build_const_int32(gallivm, 2), "");
3870 }
3871
3872 size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);
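   /* lp_build_minify computes max(size >> lod, 1) per element, matching
    * u_minify() semantics.
    */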
3873
3874 if (has_array) {
3875 LLVMValueRef layers = dynamic_state->depth(dynamic_state, gallivm,
3876 context_ptr, texture_unit);
3877 if (target == PIPE_TEXTURE_CUBE_ARRAY) {
         /*
          * It looks like GL wants the number of cubes, while d3d10.1 leaves
          * it undefined? Could avoid this by passing in the number of cubes
          * instead of the total number of layers (might make things easier
          * elsewhere too).
          */
3883 LLVMValueRef six = lp_build_const_int32(gallivm, 6);
3884 layers = LLVMBuildSDiv(gallivm->builder, layers, six, "");
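         /* e.g. a cube map array with 12 layers reports 2 cubes */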
3885 }
3886 size = LLVMBuildInsertElement(gallivm->builder, size, layers,
3887 lp_build_const_int32(gallivm, dims), "");
3888 }
3889
3890 /*
3891 * d3d10 requires zero for x/y/z values (but not w, i.e. mip levels)
3892 * if level is out of bounds (note this can't cover unbound texture
3893 * here, which also requires returning zero).
3894 */
3895 if (params->explicit_lod && params->is_sviewinfo) {
3896 LLVMValueRef last_level, out, out1;
3897 struct lp_build_context leveli_bld;
3898
3899 /* everything is scalar for now */
3900 lp_build_context_init(&leveli_bld, gallivm, lp_type_int_vec(32, 32));
3901 last_level = dynamic_state->last_level(dynamic_state, gallivm,
3902 context_ptr, texture_unit);
3903
3904 out = lp_build_cmp(&leveli_bld, PIPE_FUNC_LESS, level, first_level);
3905 out1 = lp_build_cmp(&leveli_bld, PIPE_FUNC_GREATER, level, last_level);
3906 out = lp_build_or(&leveli_bld, out, out1);
3907 if (num_lods == 1) {
3908 out = lp_build_broadcast_scalar(&bld_int_vec4, out);
3909 }
3910 else {
3911 /* TODO */
3912 assert(0);
3913 }
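      /* Zero the size components wherever the level was out of bounds. */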
3914 size = lp_build_andnot(&bld_int_vec4, size, out);
3915 }
3916 for (i = 0; i < dims + (has_array ? 1 : 0); i++) {
3917 params->sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, params->int_type,
3918 size,
3919 lp_build_const_int32(gallivm, i));
3920 }
3921 if (params->is_sviewinfo) {
3922 for (; i < 4; i++) {
3923 params->sizes_out[i] = lp_build_const_vec(gallivm, params->int_type, 0.0);
3924 }
3925 }
3926
   /*
    * If there's no explicit_lod (buffers, rects), queries requiring the
    * number of mips would be illegal.
    */
3931 if (params->is_sviewinfo && params->explicit_lod) {
3932 struct lp_build_context bld_int_scalar;
3933 LLVMValueRef num_levels;
3934 lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32));
3935
3936 if (static_state->level_zero_only) {
3937 num_levels = bld_int_scalar.one;
3938 }
3939 else {
3940 LLVMValueRef last_level;
3941
3942 last_level = dynamic_state->last_level(dynamic_state, gallivm,
3943 context_ptr, texture_unit);
3944 num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level);
3945 num_levels = lp_build_add(&bld_int_scalar, num_levels, bld_int_scalar.one);
3946 }
3947 params->sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
3948 num_levels);
3949 }
3950 }
3951
3952 static void
3953 lp_build_do_atomic_soa(struct gallivm_state *gallivm,
3954 const struct util_format_description *format_desc,
3955 struct lp_type type,
3956 LLVMValueRef exec_mask,
3957 LLVMValueRef base_ptr,
3958 LLVMValueRef offset,
3959 LLVMValueRef out_of_bounds,
3960 unsigned img_op,
3961 LLVMAtomicRMWBinOp op,
3962 const LLVMValueRef rgba_in[4],
3963 const LLVMValueRef rgba2_in[4],
3964 LLVMValueRef atomic_result[4])
3965 {
3966 enum pipe_format format = format_desc->format;
3967
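   /* Atomics are only handled for single-channel 32-bit formats; bail
    * otherwise.
    */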
3968 if (format != PIPE_FORMAT_R32_UINT && format != PIPE_FORMAT_R32_SINT && format != PIPE_FORMAT_R32_FLOAT)
3969 return;
3970
3971 LLVMValueRef atom_res = lp_build_alloca(gallivm,
3972 LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), type.length), "");
3973
3974 offset = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");
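   /*
    * Atomic ops have no vector form in LLVM, so process the vector one
    * lane at a time, predicated on exec_mask & ~out_of_bounds.
    */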
3975 struct lp_build_loop_state loop_state;
3976 lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
3977 struct lp_build_if_state ifthen;
3978 LLVMValueRef cond;
3979 LLVMValueRef packed = rgba_in[0], packed2 = rgba2_in[0];
3980
3981 LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");
3982 assert(exec_mask);
3983
3984 cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");
3985 cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
3986 lp_build_if(&ifthen, gallivm, cond);
3987
3988 LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed, loop_state.counter, "");
3989 LLVMValueRef cast_base_ptr = LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, "");
3990 cast_base_ptr = LLVMBuildBitCast(gallivm->builder, cast_base_ptr, LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0), "");
3991 data = LLVMBuildBitCast(gallivm->builder, data, LLVMInt32TypeInContext(gallivm->context), "");
3992
3993 if (img_op == LP_IMG_ATOMIC_CAS) {
3994 LLVMValueRef cas_src_ptr = LLVMBuildExtractElement(gallivm->builder, packed2, loop_state.counter, "");
3995 LLVMValueRef cas_src = LLVMBuildBitCast(gallivm->builder, cas_src_ptr, LLVMInt32TypeInContext(gallivm->context), "");
3996 data = LLVMBuildAtomicCmpXchg(gallivm->builder, cast_base_ptr, data,
3997 cas_src,
3998 LLVMAtomicOrderingSequentiallyConsistent,
3999 LLVMAtomicOrderingSequentiallyConsistent,
4000 false);
4001 data = LLVMBuildExtractValue(gallivm->builder, data, 0, "");
4002 } else {
4003 data = LLVMBuildAtomicRMW(gallivm->builder, op,
4004 cast_base_ptr, data,
4005 LLVMAtomicOrderingSequentiallyConsistent,
4006 false);
4007 }
4008
4009 LLVMValueRef temp_res = LLVMBuildLoad(gallivm->builder, atom_res, "");
4010 temp_res = LLVMBuildInsertElement(gallivm->builder, temp_res, data, loop_state.counter, "");
4011 LLVMBuildStore(gallivm->builder, temp_res, atom_res);
4012
4013 lp_build_endif(&ifthen);
4014 lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),
4015 NULL, LLVMIntUGE);
4016 atomic_result[0] = LLVMBuildLoad(gallivm->builder, atom_res, "");
4017 }
4018
4019 void
4020 lp_build_img_op_soa(const struct lp_static_texture_state *static_texture_state,
4021 struct lp_sampler_dynamic_state *dynamic_state,
4022 struct gallivm_state *gallivm,
4023 const struct lp_img_params *params)
4024 {
4025 unsigned target = params->target;
4026 unsigned dims = texture_dims(target);
4027 /** regular scalar int type */
4028 struct lp_type int_type, int_coord_type;
4029 struct lp_build_context int_bld, int_coord_bld;
4030 const struct util_format_description *format_desc = util_format_description(static_texture_state->format);
4031 LLVMValueRef x = params->coords[0], y = params->coords[1], z = params->coords[2];
4032 LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
4033 int_type = lp_type_int(32);
4034 int_coord_type = lp_int_type(params->type);
4035 lp_build_context_init(&int_bld, gallivm, int_type);
4036 lp_build_context_init(&int_coord_bld, gallivm, int_coord_type);
4037
4038 LLVMValueRef offset, i, j;
4039
4040 LLVMValueRef row_stride = dynamic_state->row_stride(dynamic_state, gallivm,
4041 params->context_ptr, params->image_index);
4042 LLVMValueRef img_stride = dynamic_state->img_stride(dynamic_state, gallivm,
4043 params->context_ptr, params->image_index);
4044 LLVMValueRef base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
4045 params->context_ptr, params->image_index);
4046 LLVMValueRef width = dynamic_state->width(dynamic_state, gallivm,
4047 params->context_ptr, params->image_index);
4048 LLVMValueRef height = dynamic_state->height(dynamic_state, gallivm,
4049 params->context_ptr, params->image_index);
4050 LLVMValueRef depth = dynamic_state->depth(dynamic_state, gallivm,
4051 params->context_ptr, params->image_index);
4052 boolean layer_coord = has_layer_coord(target);
4053
4054 width = lp_build_broadcast_scalar(&int_coord_bld, width);
4055 if (dims >= 2) {
4056 height = lp_build_broadcast_scalar(&int_coord_bld, height);
4057 row_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, row_stride);
4058 }
4059 if (dims >= 3 || layer_coord) {
4060 depth = lp_build_broadcast_scalar(&int_coord_bld, depth);
4061 img_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, img_stride);
4062 }
4063
4064 LLVMValueRef out_of_bounds = int_coord_bld.zero;
4065 LLVMValueRef out1;
4066 out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
4067 out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4068
4069 if (dims >= 2) {
4070 out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
4071 out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4072 }
4073 if (dims >= 3) {
4074 out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
4075 out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4076 }
4077 lp_build_sample_offset(&int_coord_bld,
4078 format_desc,
4079 x, y, z, row_stride_vec, img_stride_vec,
4080 &offset, &i, &j);
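   /*
    * offset now holds the per-element byte offset of each texel; i/j give
    * the position within a block for subsampled/compressed formats.
    */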
4081
4082 if (params->img_op == LP_IMG_LOAD) {
4083 struct lp_type texel_type = params->type;
4084 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
4085 format_desc->channel[0].pure_integer) {
4086 if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
4087 texel_type = lp_type_int_vec(params->type.width, params->type.width * params->type.length);
4088 } else if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
4089 texel_type = lp_type_uint_vec(params->type.width, params->type.width * params->type.length);
4090 }
4091 }
4092
4093 if (static_texture_state->format == PIPE_FORMAT_NONE) {
4094 /*
4095 * If there's nothing bound, format is NONE, and we must return
4096 * all zero as mandated by d3d10 in this case.
4097 */
4098 unsigned chan;
4099 LLVMValueRef zero = lp_build_zero(gallivm, params->type);
4100 for (chan = 0; chan < 4; chan++) {
4101 params->outdata[chan] = zero;
4102 }
4103 return;
4104 }
4105
4106 offset = lp_build_andnot(&int_coord_bld, offset, out_of_bounds);
4107 struct lp_build_context texel_bld;
4108 lp_build_context_init(&texel_bld, gallivm, texel_type);
4109 lp_build_fetch_rgba_soa(gallivm,
4110 format_desc,
4111 texel_type, TRUE,
4112 base_ptr, offset,
4113 i, j,
4114 NULL,
4115 params->outdata);
4116
4117 for (unsigned chan = 0; chan < 4; chan++) {
4118 params->outdata[chan] = lp_build_select(&texel_bld, out_of_bounds,
4119 texel_bld.zero, params->outdata[chan]);
4120 }
4121 } else if (params->img_op == LP_IMG_STORE) {
4122 if (static_texture_state->format == PIPE_FORMAT_NONE)
4123 return;
4124 lp_build_store_rgba_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds,
4125 params->indata);
4126 } else {
4127 if (static_texture_state->format == PIPE_FORMAT_NONE)
4128 return;
4129 lp_build_do_atomic_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds,
4130 params->img_op, params->op, params->indata, params->indata2, params->outdata);
4131 }
4132 }