1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 /**
29 * @file
30 * Texture sampling -- common code.
31 *
32 * @author Jose Fonseca <jfonseca@vmware.com>
33 */
34
35 #include "pipe/p_defines.h"
36 #include "pipe/p_state.h"
37 #include "util/u_format.h"
38 #include "util/u_math.h"
39 #include "lp_bld_arit.h"
40 #include "lp_bld_const.h"
41 #include "lp_bld_debug.h"
42 #include "lp_bld_printf.h"
43 #include "lp_bld_flow.h"
44 #include "lp_bld_sample.h"
45 #include "lp_bld_swizzle.h"
46 #include "lp_bld_type.h"
47 #include "lp_bld_logic.h"
48 #include "lp_bld_pack.h"
49 #include "lp_bld_quad.h"
50
51
52 /*
53 * Bri-linear factor. Should be greater than one.
54 */
55 #define BRILINEAR_FACTOR 2
56
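/*
 * Worked example (illustrative): with BRILINEAR_FACTOR == 2,
 * lp_build_brilinear_lod() below uses pre_offset = (2 - 0.5)/2 - 0.5 = 0.25
 * and post_offset = 1 - 2 = -1.  For lod = n + f this gives
 * lod_fpart = 2*f - 0.5 (treated as 0 when negative), so f <= 0.25 snaps
 * to level n, f >= 0.75 snaps to level n + 1, and in between the blend
 * weight ramps linearly with slope 2, passing exactly through 0.5 at
 * f = 0.5.
 */
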
57 /**
58 * Does the given texture wrap mode allow sampling the texture border color?
59 * XXX maybe move this into gallium util code.
60 */
61 boolean
62 lp_sampler_wrap_mode_uses_border_color(unsigned mode,
63 unsigned min_img_filter,
64 unsigned mag_img_filter)
65 {
66 switch (mode) {
67 case PIPE_TEX_WRAP_REPEAT:
68 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
69 case PIPE_TEX_WRAP_MIRROR_REPEAT:
70 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
71 return FALSE;
72 case PIPE_TEX_WRAP_CLAMP:
73 case PIPE_TEX_WRAP_MIRROR_CLAMP:
74 if (min_img_filter == PIPE_TEX_FILTER_NEAREST &&
75 mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
76 return FALSE;
77 } else {
78 return TRUE;
79 }
80 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
81 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
82 return TRUE;
83 default:
84 assert(0 && "unexpected wrap mode");
85 return FALSE;
86 }
87 }
88
89
/**
 * Initialize the lp_static_texture_state object with the gallium
 * texture/sampler_view state (this contains the parts which are
 * considered static).
 */
95 void
96 lp_sampler_static_texture_state(struct lp_static_texture_state *state,
97 const struct pipe_sampler_view *view)
98 {
99 const struct pipe_resource *texture;
100
101 memset(state, 0, sizeof *state);
102
103 if (!view || !view->texture)
104 return;
105
106 texture = view->texture;
107
108 state->format = view->format;
109 state->swizzle_r = view->swizzle_r;
110 state->swizzle_g = view->swizzle_g;
111 state->swizzle_b = view->swizzle_b;
112 state->swizzle_a = view->swizzle_a;
113
114 state->target = texture->target;
115 state->pot_width = util_is_power_of_two(texture->width0);
116 state->pot_height = util_is_power_of_two(texture->height0);
117 state->pot_depth = util_is_power_of_two(texture->depth0);
118 state->level_zero_only = !view->u.tex.last_level;
119
120 /*
121 * the layer / element / level parameters are all either dynamic
122 * state or handled transparently wrt execution.
123 */
124 }
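
/*
 * Typical usage sketch (illustrative; the exact key layout is
 * driver-specific, e.g. llvmpipe's shader variant key):
 *
 *    struct lp_static_texture_state *s = &key_tex_state[i];
 *    lp_sampler_static_texture_state(s, views[i]);
 *
 * Any change in these static bits then forces a new shader variant,
 * which is why only truly static state is copied here.
 */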
125
126
/**
 * Initialize the lp_static_sampler_state object with the gallium sampler
 * state (this contains the parts which are considered static).
 */
131 void
132 lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
133 const struct pipe_sampler_state *sampler)
134 {
135 memset(state, 0, sizeof *state);
136
137 if (!sampler)
138 return;
139
/*
 * We don't copy sampler state over unless it is actually enabled, to avoid
 * spurious recompiles, as the sampler static state is part of the shader
 * key.
 *
 * Ideally the state tracker or cso_cache module would make all state
 * canonical, but until that happens it's better to be safe than sorry here.
 *
 * XXX: Actually there's much more that could be done here, especially
 * regarding 1D/2D/3D/CUBE textures, wrap modes, etc.
 */
151
152 state->wrap_s = sampler->wrap_s;
153 state->wrap_t = sampler->wrap_t;
154 state->wrap_r = sampler->wrap_r;
155 state->min_img_filter = sampler->min_img_filter;
156 state->mag_img_filter = sampler->mag_img_filter;
157
158 if (sampler->max_lod > 0.0f) {
159 state->min_mip_filter = sampler->min_mip_filter;
160 } else {
161 state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
162 }
163
164 if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
165 if (sampler->lod_bias != 0.0f) {
166 state->lod_bias_non_zero = 1;
167 }
168
169 /* If min_lod == max_lod we can greatly simplify mipmap selection.
170 * This is a case that occurs during automatic mipmap generation.
171 */
172 if (sampler->min_lod == sampler->max_lod) {
173 state->min_max_lod_equal = 1;
174 } else {
175 if (sampler->min_lod > 0.0f) {
176 state->apply_min_lod = 1;
177 }
178
/*
 * XXX this won't do anything with the mesa state tracker, which always
 * sets max_lod to no more than the number of mip levels actually
 * present...
 */
183 if (sampler->max_lod < (PIPE_MAX_TEXTURE_LEVELS - 1)) {
184 state->apply_max_lod = 1;
185 }
186 }
187 }
188
189 state->compare_mode = sampler->compare_mode;
190 if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
191 state->compare_func = sampler->compare_func;
192 }
193
194 state->normalized_coords = sampler->normalized_coords;
195 }
196
197
198 /**
199 * Generate code to compute coordinate gradient (rho).
200 * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
201 *
202 * The resulting rho is scalar per quad.
203 */
204 static LLVMValueRef
205 lp_build_rho(struct lp_build_sample_context *bld,
206 unsigned texture_unit,
207 LLVMValueRef s,
208 LLVMValueRef t,
209 LLVMValueRef r,
210 LLVMValueRef cube_rho,
211 const struct lp_derivatives *derivs)
212 {
213 struct gallivm_state *gallivm = bld->gallivm;
214 struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
215 struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
216 struct lp_build_context *float_bld = &bld->float_bld;
217 struct lp_build_context *coord_bld = &bld->coord_bld;
218 struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
219 const unsigned dims = bld->dims;
220 LLVMValueRef ddx_ddy[2];
221 LLVMBuilderRef builder = bld->gallivm->builder;
222 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
223 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
224 LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
225 LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
226 LLVMValueRef rho_vec;
227 LLVMValueRef int_size, float_size;
228 LLVMValueRef rho;
229 LLVMValueRef first_level, first_level_vec;
230 unsigned length = coord_bld->type.length;
231 unsigned num_quads = length / 4;
232 unsigned i;
233 LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
234 LLVMValueRef rho_xvec, rho_yvec;
235
236 /* Note that all simplified calculations will only work for isotropic filtering */
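   /*
    * In scalar terms, what the vector code below computes is (sketch):
    * with the default approximation
    *    rho = max_i(size_i * max(|d coord_i / dx|, |d coord_i / dy|))
    * and with GALLIVM_DEBUG_NO_RHO_APPROX
    *    rho = max(sqrt(sum_i (size_i * d coord_i / dx)^2),
    *              sqrt(sum_i (size_i * d coord_i / dy)^2)),
    * evaluated once per quad.
    */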
237
238 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
239 bld->gallivm, texture_unit);
240 first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
241 int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
242 float_size = lp_build_int_to_float(float_size_bld, int_size);
243
244 if (cube_rho) {
245 LLVMValueRef cubesize;
246 LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
/*
 * The cube map code already did everything except the size mul and the
 * per-quad extraction.
 */
250 rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
251 perquadf_bld->type, cube_rho, 0);
252 if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
253 rho = lp_build_sqrt(perquadf_bld, rho);
254 }
/* Could optimize this for the single-quad case by just skipping the broadcast */
256 cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
257 perquadf_bld->type, float_size, index0);
258 rho = lp_build_mul(perquadf_bld, cubesize, rho);
259 }
260 else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
261 LLVMValueRef ddmax[3], ddx[3], ddy[3];
262 for (i = 0; i < dims; i++) {
263 LLVMValueRef floatdim;
264 LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
265
266 floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
267 coord_bld->type, float_size, indexi);
268
269 if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
270 ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
271 ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
272 ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
273 ddy[i] = lp_build_mul(coord_bld, ddy[i], ddy[i]);
274 }
275 else {
276 LLVMValueRef tmpx, tmpy;
277 tmpx = lp_build_abs(coord_bld, derivs->ddx[i]);
278 tmpy = lp_build_abs(coord_bld, derivs->ddy[i]);
279 ddmax[i] = lp_build_max(coord_bld, tmpx, tmpy);
280 ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
281 }
282 }
283 if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
284 rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
285 rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
286 if (dims > 2) {
287 rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]);
288 rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
289 }
290 rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
291 rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
292 perquadf_bld->type, rho_vec, 0);
/*
 * Note that as long as we don't care about per-pixel lod we could reduce
 * the math further (at some shuffle cost), but for now only do the sqrt
 * after packing.
 */
297 rho = lp_build_sqrt(perquadf_bld, rho);
298 }
299 else {
300 rho_vec = ddmax[0];
301 if (dims > 1) {
302 rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[1]);
303 if (dims > 2) {
304 rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[2]);
305 }
306 }
307 /*
308 * rho_vec now still contains per-pixel rho, convert to scalar per quad
309 * since we can't handle per-pixel rho/lod from now on (TODO).
310 */
311 rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
312 perquadf_bld->type, rho_vec, 0);
313 }
314 }
315 else {
/*
 * This all looks a bit complex, but it's not that bad
 * (the shuffle code makes it look worse than it is).
 * Still, it might not be ideal for all cases.
 */
321 static const unsigned char swizzle0[] = { /* no-op swizzle */
322 0, LP_BLD_SWIZZLE_DONTCARE,
323 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
324 };
325 static const unsigned char swizzle1[] = {
326 1, LP_BLD_SWIZZLE_DONTCARE,
327 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
328 };
329 static const unsigned char swizzle2[] = {
330 2, LP_BLD_SWIZZLE_DONTCARE,
331 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
332 };
333
334 if (dims < 2) {
335 ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(coord_bld, s);
336 }
else {
338 ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
339 if (dims > 2) {
340 ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
341 }
342 }
343
344 if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
345 static const unsigned char swizzle01[] = { /* no-op swizzle */
346 0, 1,
347 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
348 };
349 static const unsigned char swizzle23[] = {
350 2, 3,
351 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
352 };
353 LLVMValueRef ddx_ddys, ddx_ddyt, floatdim, shuffles[LP_MAX_VECTOR_LENGTH / 4];
354
355 for (i = 0; i < num_quads; i++) {
356 shuffles[i*4+0] = shuffles[i*4+1] = index0;
357 shuffles[i*4+2] = shuffles[i*4+3] = index1;
358 }
359 floatdim = LLVMBuildShuffleVector(builder, float_size, float_size,
360 LLVMConstVector(shuffles, length), "");
361 ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], floatdim);
362 ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
363 ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
364 ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
365 rho_vec = lp_build_add(coord_bld, ddx_ddys, ddx_ddyt);
366
367 if (dims > 2) {
368 static const unsigned char swizzle02[] = {
369 0, 2,
370 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
371 };
372 floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
373 coord_bld->type, float_size, index2);
374 ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], floatdim);
375 ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
376 ddx_ddy[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
377 rho_vec = lp_build_add(coord_bld, rho_vec, ddx_ddy[1]);
378 }
379 rho_xvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
380 rho_yvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
381 rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
382
383 rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
384 perquadf_bld->type, rho_vec, 0);
385 rho = lp_build_sqrt(perquadf_bld, rho);
386 }
387 else {
388 ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
389 if (dims > 2) {
390 ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
391 }
392
393 if (dims < 2) {
394 rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle0);
395 rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle1);
396 }
397 else if (dims == 2) {
398 static const unsigned char swizzle02[] = {
399 0, 2,
400 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
401 };
402 static const unsigned char swizzle13[] = {
403 1, 3,
404 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
405 };
406 rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle02);
407 rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle13);
408 }
409 else {
410 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
411 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
412 assert(dims == 3);
413 for (i = 0; i < num_quads; i++) {
414 shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
415 shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
416 shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
417 shuffles1[4*i + 3] = i32undef;
418 shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
419 shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
420 shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 2);
421 shuffles2[4*i + 3] = i32undef;
422 }
423 rho_xvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
424 LLVMConstVector(shuffles1, length), "");
425 rho_yvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
426 LLVMConstVector(shuffles2, length), "");
427 }
428
429 rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
430
431 if (bld->coord_type.length > 4) {
432 /* expand size to each quad */
433 if (dims > 1) {
434 /* could use some broadcast_vector helper for this? */
435 LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
436 for (i = 0; i < num_quads; i++) {
437 src[i] = float_size;
438 }
439 float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
440 }
441 else {
442 float_size = lp_build_broadcast_scalar(coord_bld, float_size);
443 }
444 rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);
445
446 if (dims <= 1) {
447 rho = rho_vec;
448 }
449 else {
450 if (dims >= 2) {
451 LLVMValueRef rho_s, rho_t, rho_r;
452
453 rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
454 rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
455
456 rho = lp_build_max(coord_bld, rho_s, rho_t);
457
458 if (dims >= 3) {
459 rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
460 rho = lp_build_max(coord_bld, rho, rho_r);
461 }
462 }
463 }
464 rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
465 perquadf_bld->type, rho, 0);
466 }
467 else {
468 if (dims <= 1) {
469 rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
470 }
471 rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);
472
473 if (dims <= 1) {
474 rho = rho_vec;
475 }
476 else {
477 if (dims >= 2) {
478 LLVMValueRef rho_s, rho_t, rho_r;
479
480 rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
481 rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");
482
483 rho = lp_build_max(float_bld, rho_s, rho_t);
484
485 if (dims >= 3) {
486 rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
487 rho = lp_build_max(float_bld, rho, rho_r);
488 }
489 }
490 }
491 }
492 }
493 }
494
495 return rho;
496 }
497
498
499 /*
500 * Bri-linear lod computation
501 *
502 * Use a piece-wise linear approximation of log2 such that:
503 * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc.
 * - linear approximation for values in the neighborhood of 0.5, 1.5, etc.,
505 * with the steepness specified in 'factor'
506 * - exact result for 0.5, 1.5, etc.
507 *
508 *
509 * 1.0 - /----*
510 * /
511 * /
512 * /
513 * 0.5 - *
514 * /
515 * /
516 * /
517 * 0.0 - *----/
518 *
519 * | |
520 * 2^0 2^1
521 *
522 * This is a technique also commonly used in hardware:
523 * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html
524 *
525 * TODO: For correctness, this should only be applied when texture is known to
526 * have regular mipmaps, i.e., mipmaps derived from the base level.
527 *
528 * TODO: This could be done in fixed point, where applicable.
529 */
530 static void
531 lp_build_brilinear_lod(struct lp_build_context *bld,
532 LLVMValueRef lod,
533 double factor,
534 LLVMValueRef *out_lod_ipart,
535 LLVMValueRef *out_lod_fpart)
536 {
537 LLVMValueRef lod_fpart;
538 double pre_offset = (factor - 0.5)/factor - 0.5;
539 double post_offset = 1 - factor;
540
541 if (0) {
542 lp_build_printf(bld->gallivm, "lod = %f\n", lod);
543 }
544
545 lod = lp_build_add(bld, lod,
546 lp_build_const_vec(bld->gallivm, bld->type, pre_offset));
547
548 lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);
549
550 lod_fpart = lp_build_mul(bld, lod_fpart,
551 lp_build_const_vec(bld->gallivm, bld->type, factor));
552
553 lod_fpart = lp_build_add(bld, lod_fpart,
554 lp_build_const_vec(bld->gallivm, bld->type, post_offset));
555
556 /*
557 * It's not necessary to clamp lod_fpart since:
558 * - the above expression will never produce numbers greater than one.
559 * - the mip filtering branch is only taken if lod_fpart is positive
560 */
561
562 *out_lod_fpart = lod_fpart;
563
564 if (0) {
565 lp_build_printf(bld->gallivm, "lod_ipart = %i\n", *out_lod_ipart);
566 lp_build_printf(bld->gallivm, "lod_fpart = %f\n\n", *out_lod_fpart);
567 }
568 }
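
#if 0
/*
 * Scalar reference of the transform above, for illustration only (not
 * compiled; mirrors lp_build_brilinear_lod() with plain floats and
 * assumes <math.h> for floorf()):
 */
static void
brilinear_lod_ref(float lod, float factor, int *out_ipart, float *out_fpart)
{
   const float pre_offset = (factor - 0.5f)/factor - 0.5f;
   const float post_offset = 1.0f - factor;
   float fpart;
   lod += pre_offset;
   *out_ipart = (int)floorf(lod);
   fpart = lod - floorf(lod);
   fpart = fpart * factor + post_offset;
   /* a negative fpart means the mip filtering branch is skipped, i.e.
    * the result is just level ipart */
   *out_fpart = fpart;
}
#endif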
569
570
/*
 * Combined log2 and brilinear lod computation.
 *
 * It is essentially identical to calling lp_build_fast_log2() and
 * lp_build_brilinear_lod() above, but by combining them we can compute the
 * integer and fractional parts independently.
 */
578 static void
579 lp_build_brilinear_rho(struct lp_build_context *bld,
580 LLVMValueRef rho,
581 double factor,
582 LLVMValueRef *out_lod_ipart,
583 LLVMValueRef *out_lod_fpart)
584 {
585 LLVMValueRef lod_ipart;
586 LLVMValueRef lod_fpart;
587
588 const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
589 const double post_offset = 1 - 2*factor;
590
591 assert(bld->type.floating);
592
593 assert(lp_check_value(bld->type, rho));
594
/*
 * The pre factor will make the intersections with the exact powers of two
 * happen precisely where we want them to be, which means that the integer
 * part will not need any post adjustments.
 */
600 rho = lp_build_mul(bld, rho,
601 lp_build_const_vec(bld->gallivm, bld->type, pre_factor));
602
603 /* ipart = ifloor(log2(rho)) */
604 lod_ipart = lp_build_extract_exponent(bld, rho, 0);
605
606 /* fpart = rho / 2**ipart */
607 lod_fpart = lp_build_extract_mantissa(bld, rho);
608
609 lod_fpart = lp_build_mul(bld, lod_fpart,
610 lp_build_const_vec(bld->gallivm, bld->type, factor));
611
612 lod_fpart = lp_build_add(bld, lod_fpart,
613 lp_build_const_vec(bld->gallivm, bld->type, post_offset));
614
615 /*
616 * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
617 * - the above expression will never produce numbers greater than one.
618 * - the mip filtering branch is only taken if lod_fpart is positive
619 */
620
621 *out_lod_ipart = lod_ipart;
622 *out_lod_fpart = lod_fpart;
623 }
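
/*
 * Numeric sanity check (illustrative), for factor == 2:
 * pre_factor = 3.5/(2*M_SQRT2) ~= 1.2374 and post_offset = -3.
 * For rho = 2^(n + 0.5) (i.e. lod = n + 0.5) the scaled rho has exponent n
 * and mantissa 1.75, so lod_fpart = 1.75*2 - 3 = 0.5 with lod_ipart = n,
 * exactly matching lp_build_brilinear_lod(log2(rho)).
 * For rho = 2^n the fpart comes out slightly negative and is treated as
 * zero by the mip filtering branch, i.e. a pure level-n lookup.
 */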
624
625
626 /**
627 * Generate code to compute texture level of detail (lambda).
628 * \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
629 * \param lod_bias optional float vector with the shader lod bias
630 * \param explicit_lod optional float vector with the explicit lod
 *
 * The resulting lod is scalar per quad, so only the first value per quad
 * passed in from lod_bias or explicit_lod is used.
637 */
638 void
639 lp_build_lod_selector(struct lp_build_sample_context *bld,
640 unsigned texture_unit,
641 unsigned sampler_unit,
642 LLVMValueRef s,
643 LLVMValueRef t,
644 LLVMValueRef r,
645 LLVMValueRef cube_rho,
646 const struct lp_derivatives *derivs,
647 LLVMValueRef lod_bias, /* optional */
648 LLVMValueRef explicit_lod, /* optional */
649 unsigned mip_filter,
650 LLVMValueRef *out_lod_ipart,
651 LLVMValueRef *out_lod_fpart)
652
653 {
654 LLVMBuilderRef builder = bld->gallivm->builder;
655 struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
656 LLVMValueRef lod;
657
658 *out_lod_ipart = bld->perquadi_bld.zero;
659 *out_lod_fpart = perquadf_bld->zero;
660
661 if (bld->static_sampler_state->min_max_lod_equal) {
662 /* User is forcing sampling from a particular mipmap level.
663 * This is hit during mipmap generation.
664 */
665 LLVMValueRef min_lod =
666 bld->dynamic_state->min_lod(bld->dynamic_state,
667 bld->gallivm, sampler_unit);
668
669 lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
670 }
671 else {
672 if (explicit_lod) {
673 lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
674 perquadf_bld->type, explicit_lod, 0);
675 }
676 else {
677 LLVMValueRef rho;
678
679 rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);
680
681 /*
682 * Compute lod = log2(rho)
683 */
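
         /*
          * Overall (sketch of the code below): lod = log2(rho), plus the
          * shader and/or sampler lod bias if present, then clamped to
          * [min_lod, max_lod]; the special cases right below skip straight
          * to the ipart/fpart results when no such adjustment applies.
          */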
684
685 if (!lod_bias &&
686 !bld->static_sampler_state->lod_bias_non_zero &&
687 !bld->static_sampler_state->apply_max_lod &&
688 !bld->static_sampler_state->apply_min_lod) {
/*
 * Special case when there are no post-log2 adjustments, which
 * saves instructions by keeping the integer and fractional lod
 * computations separate from the start.
 */
694
695 if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
696 mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
697 *out_lod_ipart = lp_build_ilog2(perquadf_bld, rho);
698 *out_lod_fpart = perquadf_bld->zero;
699 return;
700 }
701 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
702 !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
703 lp_build_brilinear_rho(perquadf_bld, rho, BRILINEAR_FACTOR,
704 out_lod_ipart, out_lod_fpart);
705 return;
706 }
707 }
708
709 if (0) {
710 lod = lp_build_log2(perquadf_bld, rho);
711 }
712 else {
713 lod = lp_build_fast_log2(perquadf_bld, rho);
714 }
715
716 /* add shader lod bias */
717 if (lod_bias) {
718 lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
719 perquadf_bld->type, lod_bias, 0);
720 lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
721 }
722 }
723
724 /* add sampler lod bias */
725 if (bld->static_sampler_state->lod_bias_non_zero) {
726 LLVMValueRef sampler_lod_bias =
727 bld->dynamic_state->lod_bias(bld->dynamic_state,
728 bld->gallivm, sampler_unit);
729 sampler_lod_bias = lp_build_broadcast_scalar(perquadf_bld,
730 sampler_lod_bias);
731 lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
732 }
733
734 /* clamp lod */
735 if (bld->static_sampler_state->apply_max_lod) {
736 LLVMValueRef max_lod =
737 bld->dynamic_state->max_lod(bld->dynamic_state,
738 bld->gallivm, sampler_unit);
739 max_lod = lp_build_broadcast_scalar(perquadf_bld, max_lod);
740
741 lod = lp_build_min(perquadf_bld, lod, max_lod);
742 }
743 if (bld->static_sampler_state->apply_min_lod) {
744 LLVMValueRef min_lod =
745 bld->dynamic_state->min_lod(bld->dynamic_state,
746 bld->gallivm, sampler_unit);
747 min_lod = lp_build_broadcast_scalar(perquadf_bld, min_lod);
748
749 lod = lp_build_max(perquadf_bld, lod, min_lod);
750 }
751 }
752
753 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
754 if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
755 lp_build_brilinear_lod(perquadf_bld, lod, BRILINEAR_FACTOR,
756 out_lod_ipart, out_lod_fpart);
757 }
758 else {
759 lp_build_ifloor_fract(perquadf_bld, lod, out_lod_ipart, out_lod_fpart);
760 }
761
762 lp_build_name(*out_lod_fpart, "lod_fpart");
763 }
764 else {
765 *out_lod_ipart = lp_build_iround(perquadf_bld, lod);
766 }
767
768 lp_build_name(*out_lod_ipart, "lod_ipart");
769
770 return;
771 }
772
773
774 /**
775 * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
776 * mipmap level index.
777 * Note: this is all scalar per quad code.
778 * \param lod_ipart int texture level of detail
 * \param level_out returns the (clamped) integer mipmap level
780 */
781 void
782 lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
783 unsigned texture_unit,
784 LLVMValueRef lod_ipart,
785 LLVMValueRef *level_out)
786 {
787 struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
788 LLVMValueRef first_level, last_level, level;
789
790 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
791 bld->gallivm, texture_unit);
792 last_level = bld->dynamic_state->last_level(bld->dynamic_state,
793 bld->gallivm, texture_unit);
794 first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
795 last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
796
797 level = lp_build_add(perquadi_bld, lod_ipart, first_level);
798
799 /* clamp level to legal range of levels */
800 *level_out = lp_build_clamp(perquadi_bld, level, first_level, last_level);
801 }
802
803
804 /**
805 * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad)
806 * (adjacent) mipmap level indexes, and fix up float lod part accordingly.
807 * Later, we'll sample from those two mipmap levels and interpolate between them.
808 */
809 void
810 lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
811 unsigned texture_unit,
812 LLVMValueRef lod_ipart,
813 LLVMValueRef *lod_fpart_inout,
814 LLVMValueRef *level0_out,
815 LLVMValueRef *level1_out)
816 {
817 LLVMBuilderRef builder = bld->gallivm->builder;
818 struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
819 struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
820 LLVMValueRef first_level, last_level;
821 LLVMValueRef clamp_min;
822 LLVMValueRef clamp_max;
823
824 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
825 bld->gallivm, texture_unit);
826 last_level = bld->dynamic_state->last_level(bld->dynamic_state,
827 bld->gallivm, texture_unit);
828 first_level = lp_build_broadcast_scalar(perquadi_bld, first_level);
829 last_level = lp_build_broadcast_scalar(perquadi_bld, last_level);
830
831 *level0_out = lp_build_add(perquadi_bld, lod_ipart, first_level);
832 *level1_out = lp_build_add(perquadi_bld, *level0_out, perquadi_bld->one);
833
834 /*
835 * Clamp both *level0_out and *level1_out to [first_level, last_level], with
836 * the minimum number of comparisons, and zeroing lod_fpart in the extreme
837 * ends in the process.
838 */
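
   /*
    * Concrete example (illustrative): with first_level = 0 and
    * lod_ipart = -1, clamp_min fires, forcing level0 = level1 = 0 and
    * lod_fpart = 0, i.e. a pure level-0 lookup; likewise an lod at or
    * past last_level collapses to a pure last_level lookup via clamp_max.
    */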
839
840 /*
841 * This code (vector select in particular) only works with llvm 3.1
842 * (if there's more than one quad, with x86 backend). Might consider
843 * converting to our lp_bld_logic helpers.
844 */
845 #if HAVE_LLVM < 0x0301
846 assert(perquadi_bld->type.length == 1);
847 #endif
848
849 /* *level0_out < first_level */
850 clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
851 *level0_out, first_level,
852 "clamp_lod_to_first");
853
854 *level0_out = LLVMBuildSelect(builder, clamp_min,
855 first_level, *level0_out, "");
856
857 *level1_out = LLVMBuildSelect(builder, clamp_min,
858 first_level, *level1_out, "");
859
860 *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
861 perquadf_bld->zero, *lod_fpart_inout, "");
862
863 /* *level0_out >= last_level */
864 clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
865 *level0_out, last_level,
866 "clamp_lod_to_last");
867
868 *level0_out = LLVMBuildSelect(builder, clamp_max,
869 last_level, *level0_out, "");
870
871 *level1_out = LLVMBuildSelect(builder, clamp_max,
872 last_level, *level1_out, "");
873
874 *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
875 perquadf_bld->zero, *lod_fpart_inout, "");
876
877 lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit);
878 lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit);
879 lp_build_name(*lod_fpart_inout, "texture%u_mipweight", texture_unit);
880 }
881
882
883 /**
884 * Return pointer to a single mipmap level.
885 * \param level integer mipmap level
886 */
887 LLVMValueRef
888 lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
889 LLVMValueRef level)
890 {
891 LLVMBuilderRef builder = bld->gallivm->builder;
892 LLVMValueRef indexes[2], data_ptr, mip_offset;
893
894 indexes[0] = lp_build_const_int32(bld->gallivm, 0);
895 indexes[1] = level;
896 mip_offset = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
897 mip_offset = LLVMBuildLoad(builder, mip_offset, "");
898 data_ptr = LLVMBuildGEP(builder, bld->base_ptr, &mip_offset, 1, "");
899 return data_ptr;
900 }
901
902 /**
903 * Return (per-pixel) offsets to mip levels.
904 * \param level integer mipmap level
905 */
906 LLVMValueRef
907 lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
908 LLVMValueRef level)
909 {
910 LLVMBuilderRef builder = bld->gallivm->builder;
911 LLVMValueRef indexes[2], offsets, offset1;
912
913 indexes[0] = lp_build_const_int32(bld->gallivm, 0);
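
   /*
    * Three layouts are handled below (summary): a single lod for all
    * pixels (broadcast one offset), one lod per quad (gather one offset
    * per quad, then splat it across the quad's four lanes), or one lod
    * per pixel (gather one offset per lane).
    */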
914 if (bld->num_lods == 1) {
915 indexes[1] = level;
916 offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
917 offset1 = LLVMBuildLoad(builder, offset1, "");
918 offsets = lp_build_broadcast_scalar(&bld->int_coord_bld, offset1);
919 }
920 else if (bld->num_lods == bld->coord_bld.type.length / 4) {
921 unsigned i;
922
923 offsets = bld->int_coord_bld.undef;
924 for (i = 0; i < bld->num_lods; i++) {
925 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
926 LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
927 indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
928 offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
929 offset1 = LLVMBuildLoad(builder, offset1, "");
930 offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, "");
931 }
932 offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4);
933 }
934 else {
935 unsigned i;
936
937 assert (bld->num_lods == bld->coord_bld.type.length);
938
939 offsets = bld->int_coord_bld.undef;
940 for (i = 0; i < bld->num_lods; i++) {
941 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
942 indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
943 offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
944 offset1 = LLVMBuildLoad(builder, offset1, "");
945 offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexi, "");
946 }
947 }
948 return offsets;
949 }
950
951
952 /**
953 * Codegen equivalent for u_minify().
954 * Return max(1, base_size >> level);
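 * E.g. base_size = 16: level 2 -> 4, level 5 -> 1 (clamped).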
955 */
956 LLVMValueRef
957 lp_build_minify(struct lp_build_context *bld,
958 LLVMValueRef base_size,
959 LLVMValueRef level)
960 {
961 LLVMBuilderRef builder = bld->gallivm->builder;
962 assert(lp_check_value(bld->type, base_size));
963 assert(lp_check_value(bld->type, level));
964
965 if (level == bld->zero) {
966 /* if we're using mipmap level zero, no minification is needed */
967 return base_size;
968 }
969 else {
970 LLVMValueRef size =
971 LLVMBuildLShr(builder, base_size, level, "minify");
972 assert(bld->type.sign);
973 size = lp_build_max(bld, size, bld->one);
974 return size;
975 }
976 }
977
978
979 /**
980 * Dereference stride_array[mipmap_level] array to get a stride.
981 * Return stride as a vector.
982 */
983 static LLVMValueRef
984 lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
985 LLVMValueRef stride_array, LLVMValueRef level)
986 {
987 LLVMBuilderRef builder = bld->gallivm->builder;
988 LLVMValueRef indexes[2], stride, stride1;
989 indexes[0] = lp_build_const_int32(bld->gallivm, 0);
990 if (bld->num_lods == 1) {
991 indexes[1] = level;
992 stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
993 stride1 = LLVMBuildLoad(builder, stride1, "");
994 stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1);
995 }
996 else if (bld->num_lods == bld->coord_bld.type.length / 4) {
997 LLVMValueRef stride1;
998 unsigned i;
999
1000 stride = bld->int_coord_bld.undef;
1001 for (i = 0; i < bld->num_lods; i++) {
1002 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
1003 LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
1004 indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
1005 stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
1006 stride1 = LLVMBuildLoad(builder, stride1, "");
1007 stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
1008 }
1009 stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4);
1010 }
1011 else {
1012 LLVMValueRef stride1;
1013 unsigned i;
1014
1015 assert (bld->num_lods == bld->coord_bld.type.length);
1016
1017 stride = bld->int_coord_bld.undef;
1018 for (i = 0; i < bld->coord_bld.type.length; i++) {
1019 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
1020 indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
1021 stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
1022 stride1 = LLVMBuildLoad(builder, stride1, "");
1023 stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, "");
1024 }
1025 }
1026 return stride;
1027 }
1028
1029
1030 /**
1031 * When sampling a mipmap, we need to compute the width, height, depth
1032 * of the source levels from the level indexes. This helper function
1033 * does that.
1034 */
1035 void
1036 lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
1037 LLVMValueRef ilevel,
1038 LLVMValueRef *out_size,
1039 LLVMValueRef *row_stride_vec,
1040 LLVMValueRef *img_stride_vec)
1041 {
1042 const unsigned dims = bld->dims;
1043 LLVMValueRef ilevel_vec;
1044
1045 /*
1046 * Compute width, height, depth at mipmap level 'ilevel'
1047 */
1048 if (bld->num_lods == 1) {
1049 ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
1050 *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
1051 }
1052 else {
1053 LLVMValueRef int_size_vec;
1054 LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
1055 unsigned num_quads = bld->coord_bld.type.length / 4;
1056 unsigned i;
1057
1058 if (bld->num_lods == num_quads) {
1059 /*
1060 * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
1061 * intel "forgot" the variable shift count instruction until avx2.
1062 * A harmless 8x32 shift gets translated into 32 instructions
1063 * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
1064 * unable to recognize if there are really just 2 different shift
1065 * count values. So do the shift 4-wide before expansion.
1066 */
1067 struct lp_build_context bld4;
1068 struct lp_type type4;
1069
1070 type4 = bld->int_coord_bld.type;
1071 type4.length = 4;
1072
1073 lp_build_context_init(&bld4, bld->gallivm, type4);
1074
1075 if (bld->dims == 1) {
1076 assert(bld->int_size_in_bld.type.length == 1);
1077 int_size_vec = lp_build_broadcast_scalar(&bld4,
1078 bld->int_size);
1079 }
1080 else {
1081 assert(bld->int_size_in_bld.type.length == 4);
1082 int_size_vec = bld->int_size;
1083 }
1084
1085 for (i = 0; i < num_quads; i++) {
1086 LLVMValueRef ileveli;
1087 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
1088
1089 ileveli = lp_build_extract_broadcast(bld->gallivm,
1090 bld->perquadi_bld.type,
1091 bld4.type,
1092 ilevel,
1093 indexi);
1094 tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli);
1095 }
1096 /*
1097 * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
1098 * [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
1099 */
1100 *out_size = lp_build_concat(bld->gallivm,
1101 tmp,
1102 bld4.type,
1103 num_quads);
1104 }
1105 else {
      /* FIXME: this is terrible and results in a _huge_ vector
       * (for the dims > 1 case).
       * Should refactor this (together with extract_image_sizes) and do
       * something more useful. Could, for instance, if we have width and
       * height in a 4-wide vector, pack all elements into an 8xi16 vector
       * (on which we can still do useful math) instead of using a 16xi32
       * vector.
       * FIXME: some callers can't handle this yet.
       * For dims == 1 this will create a [w0, w1, w2, w3, ...] vector.
       * For dims > 1 this will create a [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
       */
1117 assert(bld->num_lods == bld->coord_bld.type.length);
1118 if (bld->dims == 1) {
1119 assert(bld->int_size_bld.type.length == 1);
1120 int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
1121 bld->int_size);
1122 /* vector shift with variable shift count alert... */
1123 *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel);
1124 }
1125 else {
1126 LLVMValueRef ilevel1;
1127 for (i = 0; i < bld->num_lods; i++) {
1128 LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
1129 ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
1130 bld->int_size_in_bld.type, ilevel, indexi);
1131 tmp[i] = bld->int_size;
1132 tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1);
1133 }
         *out_size = lp_build_concat(bld->gallivm,
                                     tmp,
                                     bld->int_size_in_bld.type,
                                     bld->num_lods);
1138 }
1139 }
1140 }
1141
1142 if (dims >= 2) {
1143 *row_stride_vec = lp_build_get_level_stride_vec(bld,
1144 bld->row_stride_array,
1145 ilevel);
1146 }
1147 if (dims == 3 ||
1148 bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
1149 bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
1150 bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
1151 *img_stride_vec = lp_build_get_level_stride_vec(bld,
1152 bld->img_stride_array,
1153 ilevel);
1154 }
1155 }
1156
1157
1158 /**
1159 * Extract and broadcast texture size.
1160 *
 * @param size_bld build context of the texture size vector (either
 *        bld->int_size_bld or bld->float_size_bld)
 * @param coord_type type of the coordinate vector (either
 *        bld->int_coord_type or bld->coord_type)
1165 * @param size vector with the texture size (width, height, depth)
1166 */
1167 void
1168 lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
1169 struct lp_build_context *size_bld,
1170 struct lp_type coord_type,
1171 LLVMValueRef size,
1172 LLVMValueRef *out_width,
1173 LLVMValueRef *out_height,
1174 LLVMValueRef *out_depth)
1175 {
1176 const unsigned dims = bld->dims;
1177 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1178 struct lp_type size_type = size_bld->type;
1179
1180 if (bld->num_lods == 1) {
1181 *out_width = lp_build_extract_broadcast(bld->gallivm,
1182 size_type,
1183 coord_type,
1184 size,
1185 LLVMConstInt(i32t, 0, 0));
1186 if (dims >= 2) {
1187 *out_height = lp_build_extract_broadcast(bld->gallivm,
1188 size_type,
1189 coord_type,
1190 size,
1191 LLVMConstInt(i32t, 1, 0));
1192 if (dims == 3) {
1193 *out_depth = lp_build_extract_broadcast(bld->gallivm,
1194 size_type,
1195 coord_type,
1196 size,
1197 LLVMConstInt(i32t, 2, 0));
1198 }
1199 }
1200 }
1201 else {
1202 unsigned num_quads = bld->coord_bld.type.length / 4;
1203
1204 if (dims == 1) {
1205 *out_width = size;
1206 }
1207 else if (bld->num_lods == num_quads) {
1208 *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4);
1209 if (dims >= 2) {
1210 *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4);
1211 if (dims == 3) {
1212 *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4);
1213 }
1214 }
1215 }
1216 else {
1217 assert(bld->num_lods == bld->coord_type.length);
      *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                             coord_type, size, 0);
      if (dims >= 2) {
         *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                 coord_type, size, 1);
         if (dims == 3) {
            *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                   coord_type, size, 2);
1226 }
1227 }
1228 }
1229 }
1230 }
1231
1232
1233 /**
1234 * Unnormalize coords.
1235 *
 * @param flt_size vector with the float texture size (width, height, depth)
1237 */
1238 void
1239 lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
1240 LLVMValueRef flt_size,
1241 LLVMValueRef *s,
1242 LLVMValueRef *t,
1243 LLVMValueRef *r)
1244 {
1245 const unsigned dims = bld->dims;
1246 LLVMValueRef width;
1247 LLVMValueRef height;
1248 LLVMValueRef depth;
1249
1250 lp_build_extract_image_sizes(bld,
1251 &bld->float_size_bld,
1252 bld->coord_type,
1253 flt_size,
1254 &width,
1255 &height,
1256 &depth);
1257
1258 /* s = s * width, t = t * height */
1259 *s = lp_build_mul(&bld->coord_bld, *s, width);
1260 if (dims >= 2) {
1261 *t = lp_build_mul(&bld->coord_bld, *t, height);
1262 if (dims >= 3) {
1263 *r = lp_build_mul(&bld->coord_bld, *r, depth);
1264 }
1265 }
1266 }
1267
1268
1269 /** Helper used by lp_build_cube_lookup() */
1270 static LLVMValueRef
1271 lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
1272 {
1273 /* ima = +0.5 / abs(coord); */
1274 LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
1275 LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
1276 LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
1277 return ima;
1278 }
1279
1280 /** Helper used by lp_build_cube_lookup() */
1281 static LLVMValueRef
1282 lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord)
1283 {
1284 /* ima = -0.5 / abs(coord); */
1285 LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5);
1286 LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
1287 LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord);
1288 return ima;
1289 }
1290
1291 /**
1292 * Helper used by lp_build_cube_lookup()
1293 * FIXME: the sign here can also be 0.
1294 * Arithmetically this could definitely make a difference. Either
 * fix the comment or use another (simpler) sign function, not sure
1296 * which one it should be.
1297 * \param sign scalar +1 or -1
1298 * \param coord float vector
1299 * \param ima float vector
1300 */
1301 static LLVMValueRef
1302 lp_build_cube_coord(struct lp_build_context *coord_bld,
1303 LLVMValueRef sign, int negate_coord,
1304 LLVMValueRef coord, LLVMValueRef ima)
1305 {
1306 /* return negate(coord) * ima * sign + 0.5; */
1307 LLVMValueRef half = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
1308 LLVMValueRef res;
1309
1310 assert(negate_coord == +1 || negate_coord == -1);
1311
1312 if (negate_coord == -1) {
1313 coord = lp_build_negate(coord_bld, coord);
1314 }
1315
1316 res = lp_build_mul(coord_bld, coord, ima);
1317 if (sign) {
1318 sign = lp_build_broadcast_scalar(coord_bld, sign);
1319 res = lp_build_mul(coord_bld, res, sign);
1320 }
1321 res = lp_build_add(coord_bld, res, half);
1322
1323 return res;
1324 }
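
/*
 * E.g. for the +X face (major axis s > 0), the scalar path below computes
 * ima = -0.5/|s| and then
 *    face_s = r * ima * sign(s) + 0.5 = 0.5 - 0.5*r/|s|
 *    face_t = t * ima + 0.5           = 0.5 - 0.5*t/|s|
 * which matches the usual cube map (sc, tc, ma) face table for +X.
 */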
1325
1326
1327 /** Helper used by lp_build_cube_lookup()
1328 * Return (major_coord >= 0) ? pos_face : neg_face;
1329 */
1330 static LLVMValueRef
1331 lp_build_cube_face(struct lp_build_sample_context *bld,
1332 LLVMValueRef major_coord,
1333 unsigned pos_face, unsigned neg_face)
1334 {
1335 struct gallivm_state *gallivm = bld->gallivm;
1336 LLVMBuilderRef builder = gallivm->builder;
1337 LLVMValueRef cmp = LLVMBuildFCmp(builder, LLVMRealUGE,
1338 major_coord,
1339 bld->float_bld.zero, "");
1340 LLVMValueRef pos = lp_build_const_int32(gallivm, pos_face);
1341 LLVMValueRef neg = lp_build_const_int32(gallivm, neg_face);
1342 LLVMValueRef res = LLVMBuildSelect(builder, cmp, pos, neg, "");
1343 return res;
1344 }
1345
1346
1347
1348 /**
1349 * Generate code to do cube face selection and compute per-face texcoords.
1350 */
1351 void
1352 lp_build_cube_lookup(struct lp_build_sample_context *bld,
1353 LLVMValueRef s,
1354 LLVMValueRef t,
1355 LLVMValueRef r,
1356 const struct lp_derivatives *derivs, /* optional */
1357 LLVMValueRef *face,
1358 LLVMValueRef *face_s,
1359 LLVMValueRef *face_t,
1360 LLVMValueRef *rho,
1361 boolean need_derivs)
1362 {
1363 struct lp_build_context *coord_bld = &bld->coord_bld;
1364 LLVMBuilderRef builder = bld->gallivm->builder;
1365 struct gallivm_state *gallivm = bld->gallivm;
1366 LLVMValueRef si, ti, ri;
1367
1368 if (1 || coord_bld->type.length > 4) {
1369 /*
1370 * Do per-pixel face selection. We cannot however (as we used to do)
1371 * simply calculate the derivs afterwards (which is very bogus for
1372 * explicit derivs btw) because the values would be "random" when
1373 * not all pixels lie on the same face. So what we do here is just
1374 * calculate the derivatives after scaling the coords by the absolute
1375 * value of the inverse major axis, and essentially do rho calculation
1376 * steps as if it were a 3d texture. This is perfect if all pixels hit
1377 * the same face, but not so great at edges, I believe the max error
1378 * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially measuring
1379 * the 3d distance between 2 points on the cube instead of measuring up/down
1380 * the edge). Still this is possibly a win over just selecting the same face
1381 * for all pixels. Unfortunately, something like that doesn't work for
1382 * explicit derivatives.
1383 * TODO: handle explicit derivatives by transforming them alongside coords
1384 * somehow.
1385 */
1386 struct lp_build_context *cint_bld = &bld->int_coord_bld;
1387 struct lp_type intctype = cint_bld->type;
1388 LLVMValueRef signs, signt, signr, signma;
1389 LLVMValueRef as, at, ar;
1390 LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
1391 LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
1392 LLVMValueRef tnegi, rnegi;
1393 LLVMValueRef ma, mai, ima;
1394 LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
1395 LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
1396 1 << (intctype.width - 1));
1397 LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
1398 intctype.width -1);
1399 LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
1400 LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
1401 LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
1402
1403 assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
1404 assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
1405 assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
1406
1407 /*
1408 * get absolute value (for x/y/z face selection) and sign bit
1409 * (for mirroring minor coords and pos/neg face selection)
1410 * of the original coords.
1411 */
1412 as = lp_build_abs(&bld->coord_bld, s);
1413 at = lp_build_abs(&bld->coord_bld, t);
1414 ar = lp_build_abs(&bld->coord_bld, r);
1415
1416 /*
1417 * major face determination: select x if x > y else select y
1418 * select z if z >= max(x,y) else select previous result
       * if some axes are the same we choose z over y, y over x - the
       * dx10 spec seems to ask for it while OpenGL doesn't care (if we
       * didn't care we could save a select or two by using different
       * compares and doing at_g_as_ar last, since tnewx and tnewz are the
1423 * same).
1424 */
1425 as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
1426 maxasat = lp_build_max(coord_bld, as, at);
1427 ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
1428
1429 if (need_derivs) {
1430 LLVMValueRef ddx_ddy[2], tmp[3], rho_vec;
1431 static const unsigned char swizzle0[] = { /* no-op swizzle */
1432 0, LP_BLD_SWIZZLE_DONTCARE,
1433 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
1434 };
1435 static const unsigned char swizzle1[] = {
1436 1, LP_BLD_SWIZZLE_DONTCARE,
1437 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
1438 };
1439 static const unsigned char swizzle01[] = { /* no-op swizzle */
1440 0, 1,
1441 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
1442 };
1443 static const unsigned char swizzle23[] = {
1444 2, 3,
1445 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
1446 };
1447 static const unsigned char swizzle02[] = {
1448 0, 2,
1449 LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
1450 };
1451
1452 /*
1453 * scale the s/t/r coords pre-select/mirror so we can calculate
1454 * "reasonable" derivs.
1455 */
1456 ma = lp_build_select(coord_bld, as_ge_at, s, t);
1457 ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
1458 ima = lp_build_cube_imapos(coord_bld, ma);
1459 s = lp_build_mul(coord_bld, s, ima);
1460 t = lp_build_mul(coord_bld, t, ima);
1461 r = lp_build_mul(coord_bld, r, ima);
1462
1463 /*
1464 * This isn't quite the same as the "ordinary" (3d deriv) path since we
1465 * know the texture is square which simplifies things (we can omit the
1466 * size mul which happens very early completely here and do it at the
1467 * very end).
1468 */
1469 ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
1470 ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
1471
1472 if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
1473 ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
1474 ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
1475 }
1476 else {
1477 ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
1478 ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
1479 }
1480
1481 tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
1482 tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
1483 tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
1484
1485 if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
1486 rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
1487 rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
1488 }
1489 else {
1490 rho_vec = lp_build_max(coord_bld, tmp[0], tmp[1]);
1491 rho_vec = lp_build_max(coord_bld, rho_vec, tmp[2]);
1492 }
1493
1494 tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
1495 tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
1496 *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
1497 }
1498
1499 si = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
1500 ti = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
1501 ri = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
1502 signs = LLVMBuildAnd(builder, si, signmask, "");
1503 signt = LLVMBuildAnd(builder, ti, signmask, "");
1504 signr = LLVMBuildAnd(builder, ri, signmask, "");
1505
1506 /*
1507 * compute all possible new s/t coords
1508 * snewx = signs * -r;
1509 * tnewx = -t;
1510 * snewy = s;
1511 * tnewy = signt * r;
1512 * snewz = signr * s;
1513 * tnewz = -t;
1514 */
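      /*
       * All of the negations and sign applications below are done with
       * integer XOR on the IEEE sign bit: x ^ signmask flips the sign
       * (giving -t and -r), and x ^ sign_bit_of_y multiplies x by the
       * sign of y, which is how signs * -r, signt * r and signr * s
       * are formed.
       */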
1515 tnegi = LLVMBuildXor(builder, ti, signmask, "");
1516 rnegi = LLVMBuildXor(builder, ri, signmask, "");
1517
1518 snewx = LLVMBuildXor(builder, signs, rnegi, "");
1519 tnewx = tnegi;
1520
1521 snewy = si;
1522 tnewy = LLVMBuildXor(builder, signt, ri, "");
1523
1524 snewz = LLVMBuildXor(builder, signr, si, "");
1525 tnewz = tnegi;
1526
1527 /* XXX on x86 unclear if we should cast the values back to float
1528 * or not - on some cpus (nehalem) pblendvb has twice the throughput
1529 * of blendvps though on others there just might be domain
1530 * transition penalties when using it (this depends on what llvm
 * will choose for the bit ops above, so there appears to be no "right way",
1532 * but given the boatload of selects let's just use the int type).
1533 */
1534
1535 /* select/mirror */
1536 if (!need_derivs) {
1537 ma = lp_build_select(coord_bld, as_ge_at, s, t);
1538 }
1539 *face_s = lp_build_select(cint_bld, as_ge_at, snewx, snewy);
1540 *face_t = lp_build_select(cint_bld, as_ge_at, tnewx, tnewy);
1541 *face = lp_build_select(cint_bld, as_ge_at, facex, facey);
1542
1543 if (!need_derivs) {
1544 ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
1545 }
1546 *face_s = lp_build_select(cint_bld, ar_ge_as_at, snewz, *face_s);
1547 *face_t = lp_build_select(cint_bld, ar_ge_as_at, tnewz, *face_t);
1548 *face = lp_build_select(cint_bld, ar_ge_as_at, facez, *face);
1549
1550 *face_s = LLVMBuildBitCast(builder, *face_s,
1551 lp_build_vec_type(gallivm, coord_bld->type), "");
1552 *face_t = LLVMBuildBitCast(builder, *face_t,
1553 lp_build_vec_type(gallivm, coord_bld->type), "");
1554
1555 /* add +1 for neg face */
1556 /* XXX with AVX probably want to use another select here -
1557 * as long as we ensure vblendvps gets used we can actually
1558 * skip the comparison and just use sign as a "mask" directly.
1559 */
1560 mai = LLVMBuildBitCast(builder, ma, lp_build_vec_type(gallivm, intctype), "");
1561 signma = LLVMBuildLShr(builder, mai, signshift, "");
1562 *face = LLVMBuildOr(builder, *face, signma, "face");
1563
1564 /* project coords */
1565 if (!need_derivs) {
1566 ima = lp_build_cube_imapos(coord_bld, ma);
1567 *face_s = lp_build_mul(coord_bld, *face_s, ima);
1568 *face_t = lp_build_mul(coord_bld, *face_t, ima);
1569 }
1570
1571 *face_s = lp_build_add(coord_bld, *face_s, posHalf);
1572 *face_t = lp_build_add(coord_bld, *face_t, posHalf);
1573 }
1574
1575 else {
1576 struct lp_build_if_state if_ctx;
1577 LLVMValueRef face_s_var;
1578 LLVMValueRef face_t_var;
1579 LLVMValueRef face_var;
1580 LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
1581 LLVMValueRef shuffles[4];
1582 LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz;
1583 LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz;
1584 LLVMValueRef tmp[4], rxyz, arxyz;
1585 struct lp_build_context *float_bld = &bld->float_bld;
1586
1587 assert(bld->coord_bld.type.length == 4);
1588
1589 tmp[0] = s;
1590 tmp[1] = t;
1591 tmp[2] = r;
1592 rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
1593 arxyz = lp_build_abs(&bld->coord_bld, rxyz);
1594
1595 shuffles[0] = lp_build_const_int32(gallivm, 0);
1596 shuffles[1] = lp_build_const_int32(gallivm, 1);
1597 shuffles[2] = lp_build_const_int32(gallivm, 0);
1598 shuffles[3] = lp_build_const_int32(gallivm, 1);
1599 arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
1600 shuffles[0] = lp_build_const_int32(gallivm, 1);
1601 shuffles[1] = lp_build_const_int32(gallivm, 0);
1602 shuffles[2] = lp_build_const_int32(gallivm, 2);
1603 shuffles[3] = lp_build_const_int32(gallivm, 2);
1604 aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
1605 arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz);
1606
1607 shuffles[0] = lp_build_const_int32(gallivm, 0);
1608 shuffles[1] = lp_build_const_int32(gallivm, 1);
1609 arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
1610 LLVMConstVector(shuffles, 2), "");
1611 shuffles[0] = lp_build_const_int32(gallivm, 2);
1612 shuffles[1] = lp_build_const_int32(gallivm, 3);
1613 arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
1614 LLVMConstVector(shuffles, 2), "");
1615 arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, "");
1616
1617 arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
1618 lp_build_const_int32(gallivm, 0), "");
1619 arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz,
1620 lp_build_const_int32(gallivm, 0), "");
1621 ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
1622 lp_build_const_int32(gallivm, 1), "");
1623 ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz,
1624 lp_build_const_int32(gallivm, 0), "");
1625 face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var");
1626 face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var");
1627 face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var");
1628
1629 lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz);
1630 {
1631 /* +/- X face */
1632 LLVMValueRef sign, ima;
1633 si = LLVMBuildExtractElement(builder, rxyz,
1634 lp_build_const_int32(gallivm, 0), "");
1636 sign = lp_build_sgn(float_bld, si);
1637 ima = lp_build_cube_imaneg(coord_bld, s);
1638 *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
1639 *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
1640 *face = lp_build_cube_face(bld, si,
1641 PIPE_TEX_FACE_POS_X,
1642 PIPE_TEX_FACE_NEG_X);
1643 LLVMBuildStore(builder, *face_s, face_s_var);
1644 LLVMBuildStore(builder, *face_t, face_t_var);
1645 LLVMBuildStore(builder, *face, face_var);
1646 }
1647 lp_build_else(&if_ctx);
1648 {
1649 struct lp_build_if_state if_ctx2;
1650
1651 lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz);
1652 {
1653 LLVMValueRef sign, ima;
1654 /* +/- Y face */
1655 ti = LLVMBuildExtractElement(builder, rxyz,
1656 lp_build_const_int32(gallivm, 1), "");
1657 sign = lp_build_sgn(float_bld, ti);
1658 ima = lp_build_cube_imaneg(coord_bld, t);
1659 *face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
1660 *face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
1661 *face = lp_build_cube_face(bld, ti,
1662 PIPE_TEX_FACE_POS_Y,
1663 PIPE_TEX_FACE_NEG_Y);
1664 LLVMBuildStore(builder, *face_s, face_s_var);
1665 LLVMBuildStore(builder, *face_t, face_t_var);
1666 LLVMBuildStore(builder, *face, face_var);
1667 }
1668 lp_build_else(&if_ctx2);
1669 {
1670 /* +/- Z face */
1671 LLVMValueRef sign, ima;
1672 ri = LLVMBuildExtractElement(builder, rxyz,
1673 lp_build_const_int32(gallivm, 2), "");
1674 sign = lp_build_sgn(float_bld, ri);
1675 ima = lp_build_cube_imaneg(coord_bld, r);
1676 *face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
1677 *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
1678 *face = lp_build_cube_face(bld, ri,
1679 PIPE_TEX_FACE_POS_Z,
1680 PIPE_TEX_FACE_NEG_Z);
1681 LLVMBuildStore(builder, *face_s, face_s_var);
1682 LLVMBuildStore(builder, *face_t, face_t_var);
1683 LLVMBuildStore(builder, *face, face_var);
1684 }
1685 lp_build_endif(&if_ctx2);
1686 }
1687
1688 lp_build_endif(&if_ctx);
1689
1690 *face_s = LLVMBuildLoad(builder, face_s_var, "face_s");
1691 *face_t = LLVMBuildLoad(builder, face_t_var, "face_t");
1692 *face = LLVMBuildLoad(builder, face_var, "face");
1693 *face = lp_build_broadcast_scalar(&bld->int_coord_bld, *face);
1694 }
1695 }
1696
1697
1698 /**
1699 * Compute the partial offset of a pixel block along an arbitrary axis.
1700 *
1701 * @param coord coordinate in pixels
1702 * @param stride number of bytes between rows of successive pixel blocks
 * @param block_length number of pixels in a pixel block along the coordinate
1704 * axis
1705 * @param out_offset resulting relative offset of the pixel block in bytes
1706 * @param out_subcoord resulting sub-block pixel coordinate
1707 */
1708 void
1709 lp_build_sample_partial_offset(struct lp_build_context *bld,
1710 unsigned block_length,
1711 LLVMValueRef coord,
1712 LLVMValueRef stride,
1713 LLVMValueRef *out_offset,
1714 LLVMValueRef *out_subcoord)
1715 {
1716 LLVMBuilderRef builder = bld->gallivm->builder;
1717 LLVMValueRef offset;
1718 LLVMValueRef subcoord;
1719
1720 if (block_length == 1) {
1721 subcoord = bld->zero;
1722 }
1723 else {
1724 /*
1725 * Pixel blocks have power of two dimensions. LLVM should convert the
1726 * rem/div to bit arithmetic.
1727 * TODO: Verify this.
 * It does indeed, BUT it transforms it to scalar (and back) when doing so
1729 * (using roughly extract, shift/and, mov, unpack) (llvm 2.7).
1730 * The generated code looks seriously unfunny and is quite expensive.
1731 */
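      /*
       * E.g. block_length = 4 gives block_shift = 2 and block_mask = 3:
       * coord 13 -> subcoord 1, block coord 3 (then multiplied by stride).
       */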
1732 #if 0
1733 LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length);
1734 subcoord = LLVMBuildURem(builder, coord, block_width, "");
1735 coord = LLVMBuildUDiv(builder, coord, block_width, "");
1736 #else
1737 unsigned logbase2 = util_logbase2(block_length);
1738 LLVMValueRef block_shift = lp_build_const_int_vec(bld->gallivm, bld->type, logbase2);
1739 LLVMValueRef block_mask = lp_build_const_int_vec(bld->gallivm, bld->type, block_length - 1);
1740 subcoord = LLVMBuildAnd(builder, coord, block_mask, "");
1741 coord = LLVMBuildLShr(builder, coord, block_shift, "");
1742 #endif
1743 }
1744
1745 offset = lp_build_mul(bld, coord, stride);
1746
1747 assert(out_offset);
1748 assert(out_subcoord);
1749
1750 *out_offset = offset;
1751 *out_subcoord = subcoord;
1752 }
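
/*
 * E.g. for a plain 2D RGBA8 texture (1x1 pixel blocks, block.bits = 32),
 * lp_build_sample_offset() below reduces to
 *    offset = y * y_stride + x * 4   (with out_i = out_j = 0),
 * plus an extra z * z_stride term for 3D/array/cube resources.
 */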
1753
1754
1755 /**
1756 * Compute the offset of a pixel block.
1757 *
1758 * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
1759 *
1760 * Returns the relative offset and i,j sub-block coordinates
1761 */
1762 void
1763 lp_build_sample_offset(struct lp_build_context *bld,
1764 const struct util_format_description *format_desc,
1765 LLVMValueRef x,
1766 LLVMValueRef y,
1767 LLVMValueRef z,
1768 LLVMValueRef y_stride,
1769 LLVMValueRef z_stride,
1770 LLVMValueRef *out_offset,
1771 LLVMValueRef *out_i,
1772 LLVMValueRef *out_j)
1773 {
1774 LLVMValueRef x_stride;
1775 LLVMValueRef offset;
1776
1777 x_stride = lp_build_const_vec(bld->gallivm, bld->type,
1778 format_desc->block.bits/8);
1779
1780 lp_build_sample_partial_offset(bld,
1781 format_desc->block.width,
1782 x, x_stride,
1783 &offset, out_i);
1784
1785 if (y && y_stride) {
1786 LLVMValueRef y_offset;
1787 lp_build_sample_partial_offset(bld,
1788 format_desc->block.height,
1789 y, y_stride,
1790 &y_offset, out_j);
1791 offset = lp_build_add(bld, offset, y_offset);
1792 }
1793 else {
1794 *out_j = bld->zero;
1795 }
1796
1797 if (z && z_stride) {
1798 LLVMValueRef z_offset;
1799 LLVMValueRef k;
1800 lp_build_sample_partial_offset(bld,
1801 1, /* pixel blocks are always 2D */
1802 z, z_stride,
1803 &z_offset, &k);
1804 offset = lp_build_add(bld, offset, z_offset);
1805 }
1806
1807 *out_offset = offset;
1808 }