gallivm: minor rho calculation optimization for 1 or 3 coords
src/gallium/auxiliary/gallivm/lp_bld_sample.c
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Texture sampling -- common code.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_format.h"
#include "util/u_math.h"
#include "lp_bld_arit.h"
#include "lp_bld_const.h"
#include "lp_bld_debug.h"
#include "lp_bld_printf.h"
#include "lp_bld_flow.h"
#include "lp_bld_sample.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_type.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_quad.h"


/*
 * Bri-linear factor. Should be greater than one.
 */
#define BRILINEAR_FACTOR 2
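/*
 * (The factor is the slope of the linear ramp in the piecewise-linear
 * log2 approximation below; with a factor of 2 roughly half of each
 * octave rounds to the nearest level and the other half is interpolated.)
 */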

/**
 * Does the given texture wrap mode allow sampling the texture border color?
 * XXX maybe move this into gallium util code.
 */
boolean
lp_sampler_wrap_mode_uses_border_color(unsigned mode,
                                       unsigned min_img_filter,
                                       unsigned mag_img_filter)
{
   switch (mode) {
   case PIPE_TEX_WRAP_REPEAT:
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      return FALSE;
   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
      if (min_img_filter == PIPE_TEX_FILTER_NEAREST &&
          mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
         return FALSE;
      } else {
         return TRUE;
      }
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      return TRUE;
   default:
      assert(0 && "unexpected wrap mode");
      return FALSE;
   }
}


/**
 * Initialize lp_sampler_static_texture_state object with the gallium
 * texture/sampler_view state (this contains the parts which are
 * considered static).
 */
void
lp_sampler_static_texture_state(struct lp_static_texture_state *state,
                                const struct pipe_sampler_view *view)
{
   const struct pipe_resource *texture;

   memset(state, 0, sizeof *state);

   if (!view || !view->texture)
      return;

   texture = view->texture;

   state->format = view->format;
   state->swizzle_r = view->swizzle_r;
   state->swizzle_g = view->swizzle_g;
   state->swizzle_b = view->swizzle_b;
   state->swizzle_a = view->swizzle_a;

   state->target = texture->target;
   state->pot_width = util_is_power_of_two(texture->width0);
   state->pot_height = util_is_power_of_two(texture->height0);
   state->pot_depth = util_is_power_of_two(texture->depth0);
   state->level_zero_only = !view->u.tex.last_level;

   /*
    * the layer / element / level parameters are all either dynamic
    * state or handled transparently wrt execution.
    */
}


/**
 * Initialize lp_sampler_static_sampler_state object with the gallium sampler
 * state (this contains the parts which are considered static).
 */
void
lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
                                const struct pipe_sampler_state *sampler)
{
   memset(state, 0, sizeof *state);

   if (!sampler)
      return;

   /*
    * We don't copy sampler state over unless it is actually enabled, to avoid
    * spurious recompiles, as the sampler static state is part of the shader
    * key.
    *
    * Ideally the state tracker or cso_cache module would make all state
    * canonical, but until that happens it's better to be safe than sorry here.
    *
    * XXX: Actually there's much more that could be done here, especially
    * regarding 1D/2D/3D/CUBE textures, wrap modes, etc.
    */

   state->wrap_s = sampler->wrap_s;
   state->wrap_t = sampler->wrap_t;
   state->wrap_r = sampler->wrap_r;
   state->min_img_filter = sampler->min_img_filter;
   state->mag_img_filter = sampler->mag_img_filter;

   if (sampler->max_lod > 0.0f) {
      state->min_mip_filter = sampler->min_mip_filter;
   } else {
      state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
   }

   if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
      if (sampler->lod_bias != 0.0f) {
         state->lod_bias_non_zero = 1;
      }

      /* If min_lod == max_lod we can greatly simplify mipmap selection.
       * This is a case that occurs during automatic mipmap generation.
       */
      if (sampler->min_lod == sampler->max_lod) {
         state->min_max_lod_equal = 1;
      } else {
         if (sampler->min_lod > 0.0f) {
            state->apply_min_lod = 1;
         }

         /*
          * XXX this won't do anything with the mesa state tracker, which
          * never sets max_lod higher than the actually present mip levels...
          */
         if (sampler->max_lod < (PIPE_MAX_TEXTURE_LEVELS - 1)) {
            state->apply_max_lod = 1;
         }
      }
   }

   state->compare_mode = sampler->compare_mode;
   if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
      state->compare_func = sampler->compare_func;
   }

   state->normalized_coords = sampler->normalized_coords;
}


/**
 * Generate code to compute coordinate gradient (rho).
 * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
 *
 * The resulting rho is scalar per quad.
 */
static LLVMValueRef
lp_build_rho(struct lp_build_sample_context *bld,
             unsigned texture_unit,
             LLVMValueRef s,
             LLVMValueRef t,
             LLVMValueRef r,
             const struct lp_derivatives *derivs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
   struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
   struct lp_build_context *float_bld = &bld->float_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
   const unsigned dims = bld->dims;
   LLVMValueRef ddx_ddy[2];
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
   LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
   LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
   LLVMValueRef rho_vec;
   LLVMValueRef int_size, float_size;
   LLVMValueRef rho;
   LLVMValueRef first_level, first_level_vec;
   unsigned length = coord_bld->type.length;
   unsigned num_quads = length / 4;
   unsigned i;
   LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
   LLVMValueRef rho_xvec, rho_yvec;

   /* Note that all simplified calculations will only work for isotropic filtering */

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, texture_unit);
   first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
   float_size = lp_build_int_to_float(float_size_bld, int_size);

   /* XXX ignoring explicit derivs for cube maps for now */
   if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
      LLVMValueRef ddmax[3];
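      /*
       * rho here is the usual isotropic approximation:
       * rho = max_i(max(|dcoord_i/dx|, |dcoord_i/dy|) * size_i),
       * i.e. the largest texel-space footprint over all coordinate axes.
       */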
      for (i = 0; i < dims; i++) {
         LLVMValueRef ddx, ddy;
         LLVMValueRef floatdim;
         LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
         ddx = lp_build_abs(coord_bld, derivs->ddx[i]);
         ddy = lp_build_abs(coord_bld, derivs->ddy[i]);
         ddmax[i] = lp_build_max(coord_bld, ddx, ddy);
         floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                               coord_bld->type, float_size, indexi);
         ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
      }
      rho_vec = ddmax[0];
      if (dims > 1) {
         rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[1]);
         if (dims > 2) {
            rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[2]);
         }
      }
      /*
       * rho_vec now still contains per-pixel rho, convert to scalar per quad
       * since we can't handle per-pixel rho/lod from now on (TODO).
       */
      rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                      perquadf_bld->type, rho_vec, 0);
   }
   else {
      /*
       * This all looks a bit complex, but it's not that bad
       * (the shuffle code makes it look worse than it is).
       * Still, it might not be ideal for all cases.
       */
      if (dims < 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(coord_bld, s);
      }
      else if (dims >= 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
         if (dims > 2) {
            ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
         }
      }

      ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
      if (dims > 2) {
         ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
      }

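      /*
       * Pick the x and y derivatives back out of the packed layouts
       * (produced by lp_bld_quad.c): one-coord packing is assumed to be
       * {ddx, _, ddy, _} per quad, two-coord packing
       * {ddx_s, ddy_s, ddx_t, ddy_t} per quad, matching the swizzles and
       * shuffles below.
       */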
      if (dims < 2) {
         static const unsigned char swizzle1[] = { /* no-op swizzle */
            0, LP_BLD_SWIZZLE_DONTCARE,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle2[] = {
            2, LP_BLD_SWIZZLE_DONTCARE,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle1);
         rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle2);
      }
      else if (dims == 2) {
         static const unsigned char swizzle1[] = {
            0, 2,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle2[] = {
            1, 3,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle1);
         rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle2);
      }
      else {
         LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
         LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
         assert(dims == 3);
         for (i = 0; i < num_quads; i++) {
            shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
            shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
            shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
            shuffles1[4*i + 3] = i32undef;
            shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
            shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
            shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 2);
            shuffles2[4*i + 3] = i32undef;
         }
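         /*
          * shuffles1 gathers the x-derivatives {ddx_s, ddx_t, ddx_r, _} of
          * each quad from the two packed vectors; shuffles2 gathers the
          * corresponding y-derivatives.
          */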
         rho_xvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                           LLVMConstVector(shuffles1, length), "");
         rho_yvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                           LLVMConstVector(shuffles2, length), "");
      }

      rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);

      if (bld->coord_type.length > 4) {
         /* expand size to each quad */
         if (dims > 1) {
            /* could use some broadcast_vector helper for this? */
            int num_quads = bld->coord_type.length / 4;
            LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
            for (i = 0; i < num_quads; i++) {
               src[i] = float_size;
            }
            float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
         }
         else {
            float_size = lp_build_broadcast_scalar(coord_bld, float_size);
         }
         rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);

         if (dims <= 1) {
            rho = rho_vec;
         }
         else {
            if (dims >= 2) {
               static const unsigned char swizzle1[] = {
                  0, LP_BLD_SWIZZLE_DONTCARE,
                  LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
               };
               static const unsigned char swizzle2[] = {
                  1, LP_BLD_SWIZZLE_DONTCARE,
                  LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
               };
               LLVMValueRef rho_s, rho_t, rho_r;

               rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
               rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);

               rho = lp_build_max(coord_bld, rho_s, rho_t);

               if (dims >= 3) {
                  static const unsigned char swizzle3[] = {
                     2, LP_BLD_SWIZZLE_DONTCARE,
                     LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
                  };
                  rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle3);
                  rho = lp_build_max(coord_bld, rho, rho_r);
               }
            }
         }
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                         perquadf_bld->type, rho, 0);
      }
      else {
         if (dims <= 1) {
            rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
         }
         rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);

         if (dims <= 1) {
            rho = rho_vec;
         }
         else {
            if (dims >= 2) {
               LLVMValueRef rho_s, rho_t, rho_r;

               rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
               rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");

               rho = lp_build_max(float_bld, rho_s, rho_t);

               if (dims >= 3) {
                  rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
                  rho = lp_build_max(float_bld, rho, rho_r);
               }
            }
         }
      }
   }

   return rho;
}


/*
 * Bri-linear lod computation
 *
 * Use a piece-wise linear approximation of log2 such that:
 * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc.
 * - linear approximation for values in the neighborhood of 0.5, 1.5, etc.,
 *   with the steepness specified in 'factor'
 * - exact result for 0.5, 1.5, etc.
 *
 *
 *   1.0 -              /----*
 *                     /
 *                    /
 *                   /
 *   0.5 -          *
 *                 /
 *                /
 *               /
 *   0.0 - *----/
 *
 *         |                 |
 *        2^0               2^1
 *
 * This is a technique also commonly used in hardware:
 * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html
 *
 * TODO: For correctness, this should only be applied when the texture is
 * known to have regular mipmaps, i.e., mipmaps derived from the base level.
 *
 * TODO: This could be done in fixed point, where applicable.
 */
static void
lp_build_brilinear_lod(struct lp_build_context *bld,
                       LLVMValueRef lod,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_fpart;
   double pre_offset = (factor - 0.5)/factor - 0.5;
   double post_offset = 1 - factor;
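   /*
    * e.g. for factor == 2: pre_offset = 0.25, post_offset = -1, so
    * lod_fpart = 2*fract(lod + 0.25) - 1 (a negative result means the
    * mip filtering branch is skipped and the nearer level used as is).
    */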

   if (0) {
      lp_build_printf(bld->gallivm, "lod = %f\n", lod);
   }

   lod = lp_build_add(bld, lod,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_offset));

   lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);

   lod_fpart = lp_build_mul(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor));

   lod_fpart = lp_build_add(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * It's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_fpart = lod_fpart;

   if (0) {
      lp_build_printf(bld->gallivm, "lod_ipart = %i\n", *out_lod_ipart);
      lp_build_printf(bld->gallivm, "lod_fpart = %f\n\n", *out_lod_fpart);
   }
}


/*
 * Combined log2 and brilinear lod computation.
 *
 * It's identical in all respects to calling lp_build_fast_log2() and
 * lp_build_brilinear_lod() above, but by combining the two we can compute
 * the integer and fractional part independently.
 */
static void
lp_build_brilinear_rho(struct lp_build_context *bld,
                       LLVMValueRef rho,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_ipart;
   LLVMValueRef lod_fpart;

   const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
   const double post_offset = 1 - 2*factor;
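   /*
    * e.g. for factor == 2: pre_factor = 3.5/(2*M_SQRT2) ~= 1.237 and
    * post_offset = -3; lod_fpart then ramps over the extracted mantissa
    * range [1, 2).
    */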

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, rho));

   /*
    * The pre factor will make the intersections with the exact powers of two
    * happen precisely where we want them to be, which means that the integer
    * part will not need any post adjustments.
    */
   rho = lp_build_mul(bld, rho,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_factor));

   /* ipart = ifloor(log2(rho)) */
   lod_ipart = lp_build_extract_exponent(bld, rho, 0);

   /* fpart = rho / 2**ipart */
   lod_fpart = lp_build_extract_mantissa(bld, rho);

   lod_fpart = lp_build_mul(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor));

   lod_fpart = lp_build_add(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_ipart = lod_ipart;
   *out_lod_fpart = lod_fpart;
}


/**
 * Generate code to compute texture level of detail (lambda).
 * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
 * \param lod_bias optional float vector with the shader lod bias
 * \param explicit_lod optional float vector with the explicit lod
 *
 * The resulting lod is scalar per quad, so only the first value per quad
 * of lod_bias and explicit_lod is used.
 */
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
                      unsigned texture_unit,
                      unsigned sampler_unit,
                      LLVMValueRef s,
                      LLVMValueRef t,
                      LLVMValueRef r,
                      const struct lp_derivatives *derivs,
                      LLVMValueRef lod_bias, /* optional */
                      LLVMValueRef explicit_lod, /* optional */
                      unsigned mip_filter,
                      LLVMValueRef *out_lod_ipart,
                      LLVMValueRef *out_lod_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
   LLVMValueRef lod;

   *out_lod_ipart = bld->perquadi_bld.zero;
   *out_lod_fpart = perquadf_bld->zero;

   if (bld->static_sampler_state->min_max_lod_equal) {
      /* User is forcing sampling from a particular mipmap level.
       * This is hit during mipmap generation.
       */
      LLVMValueRef min_lod =
         bld->dynamic_state->min_lod(bld->dynamic_state,
                                     bld->gallivm, sampler_unit);

      lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
   }
   else {
      if (explicit_lod) {
         lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                         perquadf_bld->type, explicit_lod, 0);
      }
      else {
         LLVMValueRef rho;

         rho = lp_build_rho(bld, texture_unit, s, t, r, derivs);

         /*
          * Compute lod = log2(rho)
          */

         if (!lod_bias &&
             !bld->static_sampler_state->lod_bias_non_zero &&
             !bld->static_sampler_state->apply_max_lod &&
             !bld->static_sampler_state->apply_min_lod) {
            /*
             * Special case when there are no post-log2 adjustments, which
             * saves instructions by keeping the integer and fractional lod
             * computations separate from the start.
             */

            if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
                mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
               *out_lod_ipart = lp_build_ilog2(perquadf_bld, rho);
               *out_lod_fpart = perquadf_bld->zero;
               return;
            }
            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
               lp_build_brilinear_rho(perquadf_bld, rho, BRILINEAR_FACTOR,
                                      out_lod_ipart, out_lod_fpart);
               return;
            }
         }

         if (0) {
            lod = lp_build_log2(perquadf_bld, rho);
         }
         else {
            lod = lp_build_fast_log2(perquadf_bld, rho);
         }

         /* add shader lod bias */
         if (lod_bias) {
            lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                                 perquadf_bld->type, lod_bias, 0);
            lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
         }
      }

      /* add sampler lod bias */
      if (bld->static_sampler_state->lod_bias_non_zero) {
         LLVMValueRef sampler_lod_bias =
            bld->dynamic_state->lod_bias(bld->dynamic_state,
                                         bld->gallivm, sampler_unit);
         sampler_lod_bias = lp_build_broadcast_scalar(perquadf_bld,
                                                      sampler_lod_bias);
         lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
      }

      /* clamp lod */
      if (bld->static_sampler_state->apply_max_lod) {
         LLVMValueRef max_lod =
            bld->dynamic_state->max_lod(bld->dynamic_state,
                                        bld->gallivm, sampler_unit);
         max_lod = lp_build_broadcast_scalar(perquadf_bld, max_lod);

         lod = lp_build_min(perquadf_bld, lod, max_lod);
      }
      if (bld->static_sampler_state->apply_min_lod) {
         LLVMValueRef min_lod =
            bld->dynamic_state->min_lod(bld->dynamic_state,
                                        bld->gallivm, sampler_unit);
         min_lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);

         lod = lp_build_max(perquadf_bld, lod, min_lod);
      }
   }

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
         lp_build_brilinear_lod(perquadf_bld, lod, BRILINEAR_FACTOR,
                                out_lod_ipart, out_lod_fpart);
      }
      else {
         lp_build_ifloor_fract(perquadf_bld, lod, out_lod_ipart, out_lod_fpart);
      }

      lp_build_name(*out_lod_fpart, "lod_fpart");
   }
   else {
      *out_lod_ipart = lp_build_iround(perquadf_bld, lod);
   }

   lp_build_name(*out_lod_ipart, "lod_ipart");

   return;
}


/**
 * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
 * mipmap level index.
 * Note: this is all scalar per quad code.
 * \param lod_ipart int texture level of detail
 * \param level_out returns the integer mipmap level, clamped to the valid range
 */
void
lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
                           unsigned texture_unit,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *level_out)
{
   struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
   LLVMValueRef first_level, last_level, level;

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, texture_unit);
   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                               bld->gallivm, texture_unit);
   first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
   last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);

   level = lp_build_add(perquadi_bld, lod_ipart, first_level);

   /* clamp level to legal range of levels */
   *level_out = lp_build_clamp(perquadi_bld, level, first_level, last_level);
}


/**
 * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad)
 * (adjacent) mipmap level indexes, and fix up float lod part accordingly.
 * Later, we'll sample from those two mipmap levels and interpolate between them.
 */
void
lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                           unsigned texture_unit,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *lod_fpart_inout,
                           LLVMValueRef *level0_out,
                           LLVMValueRef *level1_out)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
   struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
   LLVMValueRef first_level, last_level;
   LLVMValueRef clamp_min;
   LLVMValueRef clamp_max;

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, texture_unit);
   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                               bld->gallivm, texture_unit);
   first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
   last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);

   *level0_out = lp_build_add(perquadi_bld, lod_ipart, first_level);
   *level1_out = lp_build_add(perquadi_bld, *level0_out, perquadi_bld->one);

   /*
    * Clamp both *level0_out and *level1_out to [first_level, last_level], with
    * the minimum number of comparisons, and zeroing lod_fpart in the extreme
    * ends in the process.
    */
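   /*
    * Zeroing lod_fpart at both extremes means the second mipmap level
    * never contributes to the filtered result there, so *level1_out only
    * needs to be clamped for safe addressing.
    */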

   /*
    * This code (vector select in particular) only works with llvm 3.1
    * (if there's more than one quad, with x86 backend). Might consider
    * converting to our lp_bld_logic helpers.
    */
#if HAVE_LLVM < 0x0301
   assert(perquadi_bld->type.length == 1);
#endif

   /* *level0_out < first_level */
   clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
                             *level0_out, first_level,
                             "clamp_lod_to_first");

   *level0_out = LLVMBuildSelect(builder, clamp_min,
                                 first_level, *level0_out, "");

   *level1_out = LLVMBuildSelect(builder, clamp_min,
                                 first_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
                                      perquadf_bld->zero, *lod_fpart_inout, "");

   /* *level0_out >= last_level */
   clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
                             *level0_out, last_level,
                             "clamp_lod_to_last");

   *level0_out = LLVMBuildSelect(builder, clamp_max,
                                 last_level, *level0_out, "");

   *level1_out = LLVMBuildSelect(builder, clamp_max,
                                 last_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
                                      perquadf_bld->zero, *lod_fpart_inout, "");

   lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit);
   lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit);
   lp_build_name(*lod_fpart_inout, "texture%u_mipweight", texture_unit);
}


/**
 * Return pointer to a single mipmap level.
 * \param level integer mipmap level
 */
LLVMValueRef
lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
                          LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], data_ptr, mip_offset;

   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   indexes[1] = level;
   mip_offset = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
   mip_offset = LLVMBuildLoad(builder, mip_offset, "");
   data_ptr = LLVMBuildGEP(builder, bld->base_ptr, &mip_offset, 1, "");
   return data_ptr;
}

/**
 * Return (per-pixel) offsets to mip levels.
 * \param level integer mipmap level
 */
LLVMValueRef
lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
                         LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], offsets, offset1;

   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   if (bld->num_lods == 1) {
      indexes[1] = level;
      offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
      offset1 = LLVMBuildLoad(builder, offset1, "");
      offsets = lp_build_broadcast_scalar(&bld->int_coord_bld, offset1);
   }
   else if (bld->num_lods == bld->coord_bld.type.length / 4) {
      unsigned i;

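      /*
       * One lod per quad: look up each quad's mip offset from the level it
       * selected, then replicate that offset to all four lanes of the quad.
       */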
      offsets = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_lods; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
         offset1 = LLVMBuildLoad(builder, offset1, "");
         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, "");
      }
      offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4);
   }
   else {
      unsigned i;

      assert(bld->num_lods == bld->coord_bld.type.length);

      offsets = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_lods; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
         offset1 = LLVMBuildLoad(builder, offset1, "");
         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexi, "");
      }
   }
   return offsets;
}


/**
 * Codegen equivalent for u_minify().
 * Return max(1, base_size >> level);
 */
LLVMValueRef
lp_build_minify(struct lp_build_context *bld,
                LLVMValueRef base_size,
                LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   assert(lp_check_value(bld->type, base_size));
   assert(lp_check_value(bld->type, level));

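   /*
    * Note this is a value-identity test, not a general compare: it only
    * catches the case where 'level' is literally the uniqued constant-zero
    * LLVM value, which is enough to skip the shift in the common case.
    */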
   if (level == bld->zero) {
      /* if we're using mipmap level zero, no minification is needed */
      return base_size;
   }
   else {
      LLVMValueRef size =
         LLVMBuildLShr(builder, base_size, level, "minify");
      assert(bld->type.sign);
      size = lp_build_max(bld, size, bld->one);
      return size;
   }
}


/**
 * Dereference stride_array[mipmap_level] array to get a stride.
 * Return stride as a vector.
 */
static LLVMValueRef
lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
                              LLVMValueRef stride_array, LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], stride, stride1;
   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   if (bld->num_lods == 1) {
      indexes[1] = level;
      stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
      stride1 = LLVMBuildLoad(builder, stride1, "");
      stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1);
   }
   else if (bld->num_lods == bld->coord_bld.type.length / 4) {
      LLVMValueRef stride1;
      unsigned i;

      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_lods; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
      }
      stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4);
   }
   else {
      LLVMValueRef stride1;
      unsigned i;

      assert(bld->num_lods == bld->coord_bld.type.length);

      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->coord_bld.type.length; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, "");
      }
   }
   return stride;
}


/**
 * When sampling a mipmap, we need to compute the width, height, depth
 * of the source levels from the level indexes. This helper function
 * does that.
 */
void
lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
                            LLVMValueRef ilevel,
                            LLVMValueRef *out_size,
                            LLVMValueRef *row_stride_vec,
                            LLVMValueRef *img_stride_vec)
{
   const unsigned dims = bld->dims;
   LLVMValueRef ilevel_vec;

   /*
    * Compute width, height, depth at mipmap level 'ilevel'
    */
   if (bld->num_lods == 1) {
      ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
      *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
   }
   else {
      LLVMValueRef int_size_vec;
      LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
      unsigned num_quads = bld->coord_bld.type.length / 4;
      unsigned i;

      if (bld->num_lods == num_quads) {
         /*
          * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
          * intel "forgot" the variable shift count instruction until avx2.
          * A harmless 8x32 shift gets translated into 32 instructions
          * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
          * unable to recognize if there are really just 2 different shift
          * count values. So do the shift 4-wide before expansion.
          */
         struct lp_build_context bld4;
         struct lp_type type4;

         type4 = bld->int_coord_bld.type;
         type4.length = 4;

         lp_build_context_init(&bld4, bld->gallivm, type4);

         if (bld->dims == 1) {
            assert(bld->int_size_in_bld.type.length == 1);
            int_size_vec = lp_build_broadcast_scalar(&bld4,
                                                     bld->int_size);
         }
         else {
            assert(bld->int_size_in_bld.type.length == 4);
            int_size_vec = bld->int_size;
         }

         for (i = 0; i < num_quads; i++) {
            LLVMValueRef ileveli;
            LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);

            ileveli = lp_build_extract_broadcast(bld->gallivm,
                                                 bld->perquadi_bld.type,
                                                 bld4.type,
                                                 ilevel,
                                                 indexi);
            tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli);
         }
         /*
          * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
          * [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
          */
         *out_size = lp_build_concat(bld->gallivm,
                                     tmp,
                                     bld4.type,
                                     num_quads);
      }
      else {
         /* FIXME: this is terrible and results in _huge_ vectors
          * (for the dims > 1 case).
          * Should refactor this (together with extract_image_sizes) and do
          * something more useful. For instance, if we have width and height
          * in a 4-wide vector, we could pack all elements into an 8xi16
          * vector (on which we can still do useful math) instead of using
          * a 16xi32 vector.
          * FIXME: some callers can't handle this yet.
          * For dims == 1 this will create [w0, w1, w2, w3, ...] vector.
          * For dims > 1 this will create [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
          */
         assert(bld->num_lods == bld->coord_bld.type.length);
         if (bld->dims == 1) {
            assert(bld->int_size_bld.type.length == 1);
            int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
                                                     bld->int_size);
            /* vector shift with variable shift count alert... */
            *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel);
         }
         else {
            LLVMValueRef ilevel1;
            for (i = 0; i < bld->num_lods; i++) {
               LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
               ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
                                                    bld->int_size_in_bld.type, ilevel, indexi);
               tmp[i] = bld->int_size;
               tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1);
            }
            *out_size = lp_build_concat(bld->gallivm,
                                        tmp,
                                        bld->int_size_in_bld.type,
                                        bld->num_lods);
         }
      }
   }

   if (dims >= 2) {
      *row_stride_vec = lp_build_get_level_stride_vec(bld,
                                                      bld->row_stride_array,
                                                      ilevel);
   }
   if (dims == 3 ||
       bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
      *img_stride_vec = lp_build_get_level_stride_vec(bld,
                                                      bld->img_stride_array,
                                                      ilevel);
   }
}


/**
 * Extract and broadcast texture size.
 *
 * @param size_bld    build context for the texture size vector (either
 *                    bld->int_size_bld or bld->float_size_bld)
 * @param coord_type  type of the coordinate vector (either
 *                    bld->int_coord_type or bld->coord_type)
 * @param size        vector with the texture size (width, height, depth)
 */
void
lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
                             struct lp_build_context *size_bld,
                             struct lp_type coord_type,
                             LLVMValueRef size,
                             LLVMValueRef *out_width,
                             LLVMValueRef *out_height,
                             LLVMValueRef *out_depth)
{
   const unsigned dims = bld->dims;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   struct lp_type size_type = size_bld->type;

   if (bld->num_lods == 1) {
      *out_width = lp_build_extract_broadcast(bld->gallivm,
                                              size_type,
                                              coord_type,
                                              size,
                                              LLVMConstInt(i32t, 0, 0));
      if (dims >= 2) {
         *out_height = lp_build_extract_broadcast(bld->gallivm,
                                                  size_type,
                                                  coord_type,
                                                  size,
                                                  LLVMConstInt(i32t, 1, 0));
         if (dims == 3) {
            *out_depth = lp_build_extract_broadcast(bld->gallivm,
                                                    size_type,
                                                    coord_type,
                                                    size,
                                                    LLVMConstInt(i32t, 2, 0));
         }
      }
   }
   else {
      unsigned num_quads = bld->coord_bld.type.length / 4;

      if (dims == 1) {
         *out_width = size;
      }
      else if (bld->num_lods == num_quads) {
         *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4);
         if (dims >= 2) {
            *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4);
            if (dims == 3) {
               *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4);
            }
         }
      }
      else {
         assert(bld->num_lods == bld->coord_type.length);
         *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                coord_type, size, 0);
         if (dims >= 2) {
            *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                    coord_type, size, 1);
            if (dims == 3) {
               *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                      coord_type, size, 2);
            }
         }
      }
   }
}


/**
 * Unnormalize coords.
 *
 * @param flt_size  vector with the float texture size (width, height, depth)
 */
void
lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
                             LLVMValueRef flt_size,
                             LLVMValueRef *s,
                             LLVMValueRef *t,
                             LLVMValueRef *r)
{
   const unsigned dims = bld->dims;
   LLVMValueRef width;
   LLVMValueRef height;
   LLVMValueRef depth;

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width,
                                &height,
                                &depth);

   /* s = s * width, t = t * height */
   *s = lp_build_mul(&bld->coord_bld, *s, width);
   if (dims >= 2) {
      *t = lp_build_mul(&bld->coord_bld, *t, height);
      if (dims >= 3) {
         *r = lp_build_mul(&bld->coord_bld, *r, depth);
      }
   }
}


/** Helper used by lp_build_cube_lookup() */
static LLVMValueRef
lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
{
   /* ima = +0.5 / abs(coord); */
   LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
   LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
   return ima;
}

/** Helper used by lp_build_cube_lookup() */
static LLVMValueRef
lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord)
{
   /* ima = -0.5 / abs(coord); */
   LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5);
   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
   LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord);
   return ima;
}

/**
 * Helper used by lp_build_cube_lookup()
 * FIXME: the sign here can also be 0.
 * Arithmetically this could definitely make a difference. Either
 * fix the comment or use another (simpler) sign function, not sure
 * which one it should be.
 * \param sign scalar +1 or -1
 * \param coord float vector
 * \param ima float vector
 */
static LLVMValueRef
lp_build_cube_coord(struct lp_build_context *coord_bld,
                    LLVMValueRef sign, int negate_coord,
                    LLVMValueRef coord, LLVMValueRef ima)
{
   /* return negate(coord) * ima * sign + 0.5; */
   LLVMValueRef half = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef res;

   assert(negate_coord == +1 || negate_coord == -1);

   if (negate_coord == -1) {
      coord = lp_build_negate(coord_bld, coord);
   }

   res = lp_build_mul(coord_bld, coord, ima);
   if (sign) {
      sign = lp_build_broadcast_scalar(coord_bld, sign);
      res = lp_build_mul(coord_bld, res, sign);
   }
   res = lp_build_add(coord_bld, res, half);

   return res;
}


/** Helper used by lp_build_cube_lookup()
 * Return (major_coord >= 0) ? pos_face : neg_face;
 */
static LLVMValueRef
lp_build_cube_face(struct lp_build_sample_context *bld,
                   LLVMValueRef major_coord,
                   unsigned pos_face, unsigned neg_face)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef cmp = LLVMBuildFCmp(builder, LLVMRealUGE,
                                    major_coord,
                                    bld->float_bld.zero, "");
   LLVMValueRef pos = lp_build_const_int32(gallivm, pos_face);
   LLVMValueRef neg = lp_build_const_int32(gallivm, neg_face);
   LLVMValueRef res = LLVMBuildSelect(builder, cmp, pos, neg, "");
   return res;
}


/**
 * Generate code to do cube face selection and compute per-face texcoords.
 */
void
lp_build_cube_lookup(struct lp_build_sample_context *bld,
                     LLVMValueRef s,
                     LLVMValueRef t,
                     LLVMValueRef r,
                     LLVMValueRef *face,
                     LLVMValueRef *face_s,
                     LLVMValueRef *face_t)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMValueRef rx, ry, rz;
   LLVMValueRef tmp[4], rxyz, arxyz;

   /*
    * Use the average of the four pixels' texcoords to choose the face.
    * As a slight simplification, just compute the sum and skip the scaling.
    */
   tmp[0] = s;
   tmp[1] = t;
   tmp[2] = r;
   rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
   arxyz = lp_build_abs(&bld->coord_bld, rxyz);

   if (coord_bld->type.length > 4) {
      struct lp_build_context *cint_bld = &bld->int_coord_bld;
      struct lp_type intctype = cint_bld->type;
      LLVMValueRef signrxs, signrys, signrzs, signrxyz, sign;
      LLVMValueRef arxs, arys, arzs;
      LLVMValueRef arx_ge_ary, maxarxsarys, arz_ge_arx_ary;
      LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
      LLVMValueRef ryneg, rzneg;
      LLVMValueRef ma, ima;
      LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
      LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
                                                     1 << (intctype.width - 1));
      LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
                                                      intctype.width - 1);
      LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
      LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
      LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);

      assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
      assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
      assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);

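      /*
       * Work on the coordinate bits as integers: xor-ing with the sign
       * mask negates a float, so the face selection and sign application
       * below become cheap bitwise ops instead of float math.
       */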
      rx = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
      ry = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
      rz = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
      ryneg = LLVMBuildXor(builder, ry, signmask, "");
      rzneg = LLVMBuildXor(builder, rz, signmask, "");

      /* the sign bit comes from the averaged vector (per quad),
       * as does the decision which face to use */
      signrxyz = LLVMBuildBitCast(builder, rxyz, lp_build_vec_type(gallivm, intctype), "");
      signrxyz = LLVMBuildAnd(builder, signrxyz, signmask, "");

      arxs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 0, 4);
      arys = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 1, 4);
      arzs = lp_build_swizzle_scalar_aos(coord_bld, arxyz, 2, 4);

      /*
       * select x if x >= y else select y
       * select previous result if max(x,y) >= z else select z
       */
      arx_ge_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, arxs, arys);
      maxarxsarys = lp_build_max(coord_bld, arxs, arys);
      arz_ge_arx_ary = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, maxarxsarys, arzs);

      /*
       * compute all possible new s/t coords
       * snewx = signrx * -rz;
       * tnewx = -ry;
       * snewy = rx;
       * tnewy = signry * rz;
       * snewz = signrz * rx;
       * tnewz = -ry;
       */
      signrxs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 0, 4);
      snewx = LLVMBuildXor(builder, signrxs, rzneg, "");
      tnewx = ryneg;

      signrys = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 1, 4);
      snewy = rx;
      tnewy = LLVMBuildXor(builder, signrys, rz, "");

      signrzs = lp_build_swizzle_scalar_aos(cint_bld, signrxyz, 2, 4);
      snewz = LLVMBuildXor(builder, signrzs, rx, "");
      tnewz = ryneg;

      /* XXX on x86 unclear if we should cast the values back to float
       * or not - on some cpus (nehalem) pblendvb has twice the throughput
       * of blendvps though on others there just might be domain
       * transition penalties when using it (this depends on what llvm
       * will choose for the bit ops above so there appears to be no
       * "right way", but given the boatload of selects let's just use
       * the int type).
       *
       * Unfortunately we also need the sign bit of the summed coords.
       */
      *face_s = lp_build_select(cint_bld, arx_ge_ary, snewx, snewy);
      *face_t = lp_build_select(cint_bld, arx_ge_ary, tnewx, tnewy);
      ma = lp_build_select(coord_bld, arx_ge_ary, s, t);
      *face = lp_build_select(cint_bld, arx_ge_ary, facex, facey);
      sign = lp_build_select(cint_bld, arx_ge_ary, signrxs, signrys);

      *face_s = lp_build_select(cint_bld, arz_ge_arx_ary, *face_s, snewz);
      *face_t = lp_build_select(cint_bld, arz_ge_arx_ary, *face_t, tnewz);
      ma = lp_build_select(coord_bld, arz_ge_arx_ary, ma, r);
      *face = lp_build_select(cint_bld, arz_ge_arx_ary, *face, facez);
      sign = lp_build_select(cint_bld, arz_ge_arx_ary, sign, signrzs);

      *face_s = LLVMBuildBitCast(builder, *face_s,
                                 lp_build_vec_type(gallivm, coord_bld->type), "");
      *face_t = LLVMBuildBitCast(builder, *face_t,
                                 lp_build_vec_type(gallivm, coord_bld->type), "");

      /* add +1 for neg face */
      /* XXX with AVX probably want to use another select here -
       * as long as we ensure vblendvps gets used we can actually
       * skip the comparison and just use sign as a "mask" directly.
       */
      sign = LLVMBuildLShr(builder, sign, signshift, "");
      *face = LLVMBuildOr(builder, *face, sign, "face");

      ima = lp_build_cube_imapos(coord_bld, ma);

      *face_s = lp_build_mul(coord_bld, *face_s, ima);
      *face_s = lp_build_add(coord_bld, *face_s, posHalf);
      *face_t = lp_build_mul(coord_bld, *face_t, ima);
      *face_t = lp_build_add(coord_bld, *face_t, posHalf);
   }

   else {
      struct lp_build_if_state if_ctx;
      LLVMValueRef face_s_var;
      LLVMValueRef face_t_var;
      LLVMValueRef face_var;
      LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
      LLVMValueRef shuffles[4];
      LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz;
      LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz;
      struct lp_build_context *float_bld = &bld->float_bld;

      assert(bld->coord_bld.type.length == 4);

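      /*
       * The shuffles below evaluate all of |x|>=|y|, |y|>=|x|, |x|>=|z|,
       * |y|>=|z| with a single 4-wide compare, then AND pairs of lanes
       * to get the "x is major axis" / "y is major axis" conditions.
       */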
      shuffles[0] = lp_build_const_int32(gallivm, 0);
      shuffles[1] = lp_build_const_int32(gallivm, 1);
      shuffles[2] = lp_build_const_int32(gallivm, 0);
      shuffles[3] = lp_build_const_int32(gallivm, 1);
      arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
      shuffles[0] = lp_build_const_int32(gallivm, 1);
      shuffles[1] = lp_build_const_int32(gallivm, 0);
      shuffles[2] = lp_build_const_int32(gallivm, 2);
      shuffles[3] = lp_build_const_int32(gallivm, 2);
      aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
      arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz);

      shuffles[0] = lp_build_const_int32(gallivm, 0);
      shuffles[1] = lp_build_const_int32(gallivm, 1);
      arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
                                            LLVMConstVector(shuffles, 2), "");
      shuffles[0] = lp_build_const_int32(gallivm, 2);
      shuffles[1] = lp_build_const_int32(gallivm, 3);
      arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
                                            LLVMConstVector(shuffles, 2), "");
      arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, "");

      arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
                                               lp_build_const_int32(gallivm, 0), "");
      arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz,
                                     lp_build_const_int32(gallivm, 0), "");
      ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
                                               lp_build_const_int32(gallivm, 1), "");
      ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz,
                                     lp_build_const_int32(gallivm, 0), "");
      face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var");
      face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var");
      face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var");

      lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz);
      {
         /* +/- X face */
         LLVMValueRef sign, ima;
         rx = LLVMBuildExtractElement(builder, rxyz,
                                      lp_build_const_int32(gallivm, 0), "");
         sign = lp_build_sgn(float_bld, rx);
         ima = lp_build_cube_imaneg(coord_bld, s);
         *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
         *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
         *face = lp_build_cube_face(bld, rx,
                                    PIPE_TEX_FACE_POS_X,
                                    PIPE_TEX_FACE_NEG_X);
         LLVMBuildStore(builder, *face_s, face_s_var);
         LLVMBuildStore(builder, *face_t, face_t_var);
         LLVMBuildStore(builder, *face, face_var);
      }
      lp_build_else(&if_ctx);
      {
         struct lp_build_if_state if_ctx2;

         lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz);
         {
            LLVMValueRef sign, ima;
            /* +/- Y face */
            ry = LLVMBuildExtractElement(builder, rxyz,
                                         lp_build_const_int32(gallivm, 1), "");
            sign = lp_build_sgn(float_bld, ry);
            ima = lp_build_cube_imaneg(coord_bld, t);
            *face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
            *face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
            *face = lp_build_cube_face(bld, ry,
                                       PIPE_TEX_FACE_POS_Y,
                                       PIPE_TEX_FACE_NEG_Y);
            LLVMBuildStore(builder, *face_s, face_s_var);
            LLVMBuildStore(builder, *face_t, face_t_var);
            LLVMBuildStore(builder, *face, face_var);
         }
         lp_build_else(&if_ctx2);
         {
            /* +/- Z face */
            LLVMValueRef sign, ima;
            rz = LLVMBuildExtractElement(builder, rxyz,
                                         lp_build_const_int32(gallivm, 2), "");
            sign = lp_build_sgn(float_bld, rz);
            ima = lp_build_cube_imaneg(coord_bld, r);
            *face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
            *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
            *face = lp_build_cube_face(bld, rz,
                                       PIPE_TEX_FACE_POS_Z,
                                       PIPE_TEX_FACE_NEG_Z);
            LLVMBuildStore(builder, *face_s, face_s_var);
            LLVMBuildStore(builder, *face_t, face_t_var);
            LLVMBuildStore(builder, *face, face_var);
         }
         lp_build_endif(&if_ctx2);
      }

      lp_build_endif(&if_ctx);

      *face_s = LLVMBuildLoad(builder, face_s_var, "face_s");
      *face_t = LLVMBuildLoad(builder, face_t_var, "face_t");
      *face = LLVMBuildLoad(builder, face_var, "face");
      *face = lp_build_broadcast_scalar(&bld->int_coord_bld, *face);
   }
}


/**
 * Compute the partial offset of a pixel block along an arbitrary axis.
 *
 * @param coord   coordinate in pixels
 * @param stride  number of bytes between rows of successive pixel blocks
 * @param block_length  number of pixels in a pixel block along the coordinate
 *                      axis
 * @param out_offset    resulting relative offset of the pixel block in bytes
 * @param out_subcoord  resulting sub-block pixel coordinate
 */
void
lp_build_sample_partial_offset(struct lp_build_context *bld,
                               unsigned block_length,
                               LLVMValueRef coord,
                               LLVMValueRef stride,
                               LLVMValueRef *out_offset,
                               LLVMValueRef *out_subcoord)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef offset;
   LLVMValueRef subcoord;

   if (block_length == 1) {
      subcoord = bld->zero;
   }
   else {
      /*
       * Pixel blocks have power of two dimensions. LLVM should convert the
       * rem/div to bit arithmetic.
       * TODO: Verify this.
       * It does indeed, BUT it transforms the computation to scalar (and
       * back) when doing so (using roughly extract, shift/and, mov, unpack)
       * (llvm 2.7).
       * The generated code looks seriously unfunny and is quite expensive.
       */
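      /* e.g. block_length == 4 gives: subcoord = coord & 3; coord >>= 2 */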
#if 0
      LLVMValueRef block_width = lp_build_const_int_vec(bld->gallivm, bld->type, block_length);
      subcoord = LLVMBuildURem(builder, coord, block_width, "");
      coord = LLVMBuildUDiv(builder, coord, block_width, "");
#else
      unsigned logbase2 = util_logbase2(block_length);
      LLVMValueRef block_shift = lp_build_const_int_vec(bld->gallivm, bld->type, logbase2);
      LLVMValueRef block_mask = lp_build_const_int_vec(bld->gallivm, bld->type, block_length - 1);
      subcoord = LLVMBuildAnd(builder, coord, block_mask, "");
      coord = LLVMBuildLShr(builder, coord, block_shift, "");
#endif
   }

   offset = lp_build_mul(bld, coord, stride);

   assert(out_offset);
   assert(out_subcoord);

   *out_offset = offset;
   *out_subcoord = subcoord;
}


/**
 * Compute the offset of a pixel block.
 *
 * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
 *
 * Returns the relative offset and i,j sub-block coordinates
 */
void
lp_build_sample_offset(struct lp_build_context *bld,
                       const struct util_format_description *format_desc,
                       LLVMValueRef x,
                       LLVMValueRef y,
                       LLVMValueRef z,
                       LLVMValueRef y_stride,
                       LLVMValueRef z_stride,
                       LLVMValueRef *out_offset,
                       LLVMValueRef *out_i,
                       LLVMValueRef *out_j)
{
   LLVMValueRef x_stride;
   LLVMValueRef offset;

   x_stride = lp_build_const_vec(bld->gallivm, bld->type,
                                 format_desc->block.bits/8);

   lp_build_sample_partial_offset(bld,
                                  format_desc->block.width,
                                  x, x_stride,
                                  &offset, out_i);

   if (y && y_stride) {
      LLVMValueRef y_offset;
      lp_build_sample_partial_offset(bld,
                                     format_desc->block.height,
                                     y, y_stride,
                                     &y_offset, out_j);
      offset = lp_build_add(bld, offset, y_offset);
   }
   else {
      *out_j = bld->zero;
   }

   if (z && z_stride) {
      LLVMValueRef z_offset;
      LLVMValueRef k;
      lp_build_sample_partial_offset(bld,
                                     1, /* pixel blocks are always 2D */
                                     z, z_stride,
                                     &z_offset, &k);
      offset = lp_build_add(bld, offset, z_offset);
   }

   *out_offset = offset;
}
1628 }