gallivm: do per-element lod for lod bias and explicit derivs too
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_sample.c
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Texture sampling -- common code.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_format.h"
#include "util/u_math.h"
#include "lp_bld_arit.h"
#include "lp_bld_const.h"
#include "lp_bld_debug.h"
#include "lp_bld_printf.h"
#include "lp_bld_flow.h"
#include "lp_bld_sample.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_type.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_quad.h"
#include "lp_bld_bitarit.h"


/*
 * Bri-linear factor. Should be greater than one.
 */
#define BRILINEAR_FACTOR 2

/**
 * Does the given texture wrap mode allow sampling the texture border color?
 * XXX maybe move this into gallium util code.
 */
boolean
lp_sampler_wrap_mode_uses_border_color(unsigned mode,
                                       unsigned min_img_filter,
                                       unsigned mag_img_filter)
{
   switch (mode) {
   case PIPE_TEX_WRAP_REPEAT:
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      return FALSE;
   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
      if (min_img_filter == PIPE_TEX_FILTER_NEAREST &&
          mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
         return FALSE;
      } else {
         return TRUE;
      }
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      return TRUE;
   default:
      assert(0 && "unexpected wrap mode");
      return FALSE;
   }
}


/**
 * Initialize lp_sampler_static_texture_state object with the gallium
 * texture/sampler_view state (this contains the parts which are
 * considered static).
 */
void
lp_sampler_static_texture_state(struct lp_static_texture_state *state,
                                const struct pipe_sampler_view *view)
{
   const struct pipe_resource *texture;

   memset(state, 0, sizeof *state);

   if (!view || !view->texture)
      return;

   texture = view->texture;

   state->format = view->format;
   state->swizzle_r = view->swizzle_r;
   state->swizzle_g = view->swizzle_g;
   state->swizzle_b = view->swizzle_b;
   state->swizzle_a = view->swizzle_a;

   state->target = texture->target;
   state->pot_width = util_is_power_of_two(texture->width0);
   state->pot_height = util_is_power_of_two(texture->height0);
   state->pot_depth = util_is_power_of_two(texture->depth0);
   state->level_zero_only = !view->u.tex.last_level;

   /*
    * the layer / element / level parameters are all either dynamic
    * state or handled transparently wrt execution.
    */
}


/**
 * Initialize lp_sampler_static_sampler_state object with the gallium sampler
 * state (this contains the parts which are considered static).
 */
void
lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
                                const struct pipe_sampler_state *sampler)
{
   memset(state, 0, sizeof *state);

   if (!sampler)
      return;

   /*
    * We don't copy sampler state over unless it is actually enabled, to avoid
    * spurious recompiles, as the sampler static state is part of the shader
    * key.
    *
    * Ideally the state tracker or cso_cache module would make all state
    * canonical, but until that happens it's better to be safe than sorry here.
    *
    * XXX: Actually there's much more that can be done here, especially
    * regarding 1D/2D/3D/CUBE textures, wrap modes, etc.
    */

   state->wrap_s = sampler->wrap_s;
   state->wrap_t = sampler->wrap_t;
   state->wrap_r = sampler->wrap_r;
   state->min_img_filter = sampler->min_img_filter;
   state->mag_img_filter = sampler->mag_img_filter;

   if (sampler->max_lod > 0.0f) {
      state->min_mip_filter = sampler->min_mip_filter;
   } else {
      state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
   }

   if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
      if (sampler->lod_bias != 0.0f) {
         state->lod_bias_non_zero = 1;
      }

      /* If min_lod == max_lod we can greatly simplify mipmap selection.
       * This is a case that occurs during automatic mipmap generation.
       */
      if (sampler->min_lod == sampler->max_lod) {
         state->min_max_lod_equal = 1;
      } else {
         if (sampler->min_lod > 0.0f) {
            state->apply_min_lod = 1;
         }

         /*
          * XXX this won't do anything with the mesa state tracker, which
          * always sets max_lod to no more than the number of actually
          * present mip levels...
          */
         if (sampler->max_lod < (PIPE_MAX_TEXTURE_LEVELS - 1)) {
            state->apply_max_lod = 1;
         }
      }
   }

   state->compare_mode = sampler->compare_mode;
   if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
      state->compare_func = sampler->compare_func;
   }

   state->normalized_coords = sampler->normalized_coords;
}


/**
 * Generate code to compute coordinate gradient (rho).
 * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 *
 * The resulting rho has bld->levelf format (per quad or per element).
 */
static LLVMValueRef
lp_build_rho(struct lp_build_sample_context *bld,
             unsigned texture_unit,
             LLVMValueRef s,
             LLVMValueRef t,
             LLVMValueRef r,
             LLVMValueRef cube_rho,
             const struct lp_derivatives *derivs)
{
   struct gallivm_state *gallivm = bld->gallivm;
   struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
   struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
   struct lp_build_context *float_bld = &bld->float_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   const unsigned dims = bld->dims;
   LLVMValueRef ddx_ddy[2];
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
   LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
   LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
   LLVMValueRef rho_vec;
   LLVMValueRef int_size, float_size;
   LLVMValueRef rho;
   LLVMValueRef first_level, first_level_vec;
   unsigned length = coord_bld->type.length;
   unsigned num_quads = length / 4;
   boolean rho_per_quad = levelf_bld->type.length != length;
   unsigned i;
   LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
   LLVMValueRef rho_xvec, rho_yvec;

   /* Note that all simplified calculations will only work for isotropic filtering */

   /*
    * rho calcs are always per quad except for explicit derivs (excluding
    * the messy cube maps for now) when requested.
    */

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, texture_unit);
   first_level_vec = lp_build_broadcast_scalar(int_size_bld, first_level);
   int_size = lp_build_minify(int_size_bld, bld->int_size, first_level_vec);
   float_size = lp_build_int_to_float(float_size_bld, int_size);

   if (cube_rho) {
      LLVMValueRef cubesize;
      LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);

      /*
       * The cube map code already did everything except the size mul and
       * per-quad extraction. Luckily cube maps are always square!
       */
      if (rho_per_quad) {
         rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                         levelf_bld->type, cube_rho, 0);
      }
      else {
         rho = lp_build_swizzle_scalar_aos(coord_bld, cube_rho, 0, 4);
      }
      if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
         rho = lp_build_sqrt(levelf_bld, rho);
      }
      /* Could optimize this for single quad just skip the broadcast */
      cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                            levelf_bld->type, float_size, index0);
      rho = lp_build_mul(levelf_bld, cubesize, rho);
   }
   else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
      LLVMValueRef ddmax[3], ddx[3], ddy[3];
      for (i = 0; i < dims; i++) {
         LLVMValueRef floatdim;
         LLVMValueRef indexi = lp_build_const_int32(gallivm, i);

         floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                               coord_bld->type, float_size, indexi);

         if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
            ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
            ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
            ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
            ddy[i] = lp_build_mul(coord_bld, ddy[i], ddy[i]);
         }
         else {
            LLVMValueRef tmpx, tmpy;
            tmpx = lp_build_abs(coord_bld, derivs->ddx[i]);
            tmpy = lp_build_abs(coord_bld, derivs->ddy[i]);
            ddmax[i] = lp_build_max(coord_bld, tmpx, tmpy);
            ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
         }
      }
      if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
         rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
         rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
         if (dims > 2) {
            rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]);
            rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
         }
         rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         if (rho_per_quad) {
            /*
             * Note that for this case (no per-pixel lod) we could reduce the
             * math more (at some shuffle cost), but for now only do the sqrt
             * after packing, otherwise we'd also need code different from the
             * per-pixel lod case.
             */
            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                            levelf_bld->type, rho, 0);
         }
         rho = lp_build_sqrt(levelf_bld, rho);

      }
      else {
         rho = ddmax[0];
         if (dims > 1) {
            rho = lp_build_max(coord_bld, rho, ddmax[1]);
            if (dims > 2) {
               rho = lp_build_max(coord_bld, rho, ddmax[2]);
            }
         }
         if (rho_per_quad) {
            /*
             * rho_vec contains per-pixel rho, convert to scalar per quad.
             */
            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                            levelf_bld->type, rho, 0);
         }
      }
   }
   else {
      /*
       * This looks all a bit complex, but it's not that bad
       * (the shuffle code makes it look worse than it is).
       * Still, might not be ideal for all cases.
       */
      static const unsigned char swizzle0[] = { /* no-op swizzle */
         0, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };
      static const unsigned char swizzle1[] = {
         1, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };
      static const unsigned char swizzle2[] = {
         2, LP_BLD_SWIZZLE_DONTCARE,
         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
      };

      if (dims < 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(coord_bld, s);
      }
      else if (dims >= 2) {
         ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
         if (dims > 2) {
            ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
         }
      }

      if ((gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) && (dims > 1)) {
         static const unsigned char swizzle01[] = { /* no-op swizzle */
            0, 1,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle23[] = {
            2, 3,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         LLVMValueRef ddx_ddys, ddx_ddyt, floatdim, shuffles[LP_MAX_VECTOR_LENGTH / 4];

         for (i = 0; i < num_quads; i++) {
            shuffles[i*4+0] = shuffles[i*4+1] = index0;
            shuffles[i*4+2] = shuffles[i*4+3] = index1;
         }
         floatdim = LLVMBuildShuffleVector(builder, float_size, float_size,
                                           LLVMConstVector(shuffles, length), "");
         ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], floatdim);
         ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
         ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
         ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
         rho_vec = lp_build_add(coord_bld, ddx_ddys, ddx_ddyt);

         if (dims > 2) {
            static const unsigned char swizzle02[] = {
               0, 2,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
                                                  coord_bld->type, float_size, index2);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], floatdim);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
            ddx_ddy[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
            rho_vec = lp_build_add(coord_bld, rho_vec, ddx_ddy[1]);
         }

         rho_xvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
         rho_yvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
         rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         if (rho_per_quad) {
            rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                            levelf_bld->type, rho, 0);
         }
         else {
            /*
             * On some cpus with half-speed 8-wide sqrt (e.g. SNB but not IVB)
             * doing pack/sqrt/unpack/swizzle might be better for the 8-wide
             * case; the same is true for cpus having faster scalars than
             * 4-wide vecs for the 4-wide case (where pack/unpack would be
             * no-ops anyway). (The same really applies to the cube_rho case
             * above.)
             */
            rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
         }
         rho = lp_build_sqrt(levelf_bld, rho);
      }
      else {
         ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
         if (dims > 2) {
            ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
         }

         if (dims < 2) {
            rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle0);
            rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle2);
         }
         else if (dims == 2) {
            static const unsigned char swizzle02[] = {
               0, 2,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            static const unsigned char swizzle13[] = {
               1, 3,
               LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
            };
            rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle02);
            rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle13);
         }
         else {
            LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
            LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
            assert(dims == 3);
            for (i = 0; i < num_quads; i++) {
               shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
               shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
               shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
               shuffles1[4*i + 3] = i32undef;
               shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
               shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
               shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 2);
               shuffles2[4*i + 3] = i32undef;
            }
            rho_xvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                              LLVMConstVector(shuffles1, length), "");
            rho_yvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
                                              LLVMConstVector(shuffles2, length), "");
         }

         rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);

         if (bld->coord_type.length > 4) {
            /* expand size to each quad */
            if (dims > 1) {
               /* could use some broadcast_vector helper for this? */
               LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
               for (i = 0; i < num_quads; i++) {
                  src[i] = float_size;
               }
               float_size = lp_build_concat(bld->gallivm, src, float_size_bld->type, num_quads);
            }
            else {
               float_size = lp_build_broadcast_scalar(coord_bld, float_size);
            }
            rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);

            if (dims <= 1) {
               rho = rho_vec;
            }
            else {
               if (dims >= 2) {
                  LLVMValueRef rho_s, rho_t, rho_r;

                  rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
                  rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);

                  rho = lp_build_max(coord_bld, rho_s, rho_t);

                  if (dims >= 3) {
                     rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
                     rho = lp_build_max(coord_bld, rho, rho_r);
                  }
               }
            }
            if (rho_per_quad) {
               rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                               levelf_bld->type, rho, 0);
            }
            else {
               rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
            }
         }
         else {
            if (dims <= 1) {
               rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
            }
            rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);

            if (dims <= 1) {
               rho = rho_vec;
            }
            else {
               if (dims >= 2) {
                  LLVMValueRef rho_s, rho_t, rho_r;

                  rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
                  rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");

                  rho = lp_build_max(float_bld, rho_s, rho_t);

                  if (dims >= 3) {
                     rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
                     rho = lp_build_max(float_bld, rho, rho_r);
                  }
               }
            }
            if (!rho_per_quad) {
               rho = lp_build_broadcast_scalar(levelf_bld, rho);
            }
         }
      }
   }

   return rho;
}
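

/*
 * For reference, a scalar sketch of the rho computation above for the
 * common case (implicit derivatives, default max-based approximation,
 * i.e. without GALLIVM_DEBUG_NO_RHO_APPROX); purely illustrative, the
 * names are made up and the snippet is not part of the build:
 */
#if 0
static float
rho_ref(float dsdx, float dsdy, float dtdx, float dtdy,
        float width, float height)
{
   /* scale each derivative pair into texel space, keep the larger one */
   float rho_s = MAX2(fabsf(dsdx), fabsf(dsdy)) * width;
   float rho_t = MAX2(fabsf(dtdx), fabsf(dtdy)) * height;
   /* isotropic approximation: max over the coordinate axes */
   return MAX2(rho_s, rho_t);
}
#endif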


/*
 * Bri-linear lod computation
 *
 * Use a piece-wise linear approximation of log2 such that:
 * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc.
 * - linear approximation for values in the neighborhood of 0.5, 1.5, etc.,
 *   with the steepness specified in 'factor'
 * - exact result for 0.5, 1.5, etc.
 *
 *
 *   1.0 -              /----*
 *                     /
 *                    /
 *                   /
 *   0.5 -          *
 *                 /
 *                /
 *               /
 *   0.0 - *----/
 *
 *         |                 |
 *        2^0               2^1
 *
 * This is a technique also commonly used in hardware:
 * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html
 *
 * TODO: For correctness, this should only be applied when texture is known to
 * have regular mipmaps, i.e., mipmaps derived from the base level.
 *
 * TODO: This could be done in fixed point, where applicable.
 */
static void
lp_build_brilinear_lod(struct lp_build_context *bld,
                       LLVMValueRef lod,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_fpart;
   double pre_offset = (factor - 0.5)/factor - 0.5;
   double post_offset = 1 - factor;

   if (0) {
      lp_build_printf(bld->gallivm, "lod = %f\n", lod);
   }

   lod = lp_build_add(bld, lod,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_offset));

   lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);

   lod_fpart = lp_build_mul(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor));

   lod_fpart = lp_build_add(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * It's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_fpart = lod_fpart;

   if (0) {
      lp_build_printf(bld->gallivm, "lod_ipart = %i\n", *out_lod_ipart);
      lp_build_printf(bld->gallivm, "lod_fpart = %f\n\n", *out_lod_fpart);
   }
}
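

/*
 * Scalar sketch of the mapping above (illustrative only, not part of
 * the build):
 */
#if 0
static void
brilinear_lod_ref(float lod, float factor, int *ipart, float *fpart)
{
   float f;
   lod += (factor - 0.5)/factor - 0.5;      /* pre_offset */
   *ipart = (int)floorf(lod);
   f = lod - floorf(lod);
   *fpart = f * factor + (1 - factor);      /* post scale and offset */
}
#endif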


/*
 * Combined log2 and brilinear lod computation.
 *
 * It's identical to calling lp_build_fast_log2() followed by
 * lp_build_brilinear_lod() above, but by combining the two we can compute
 * the integer and fractional parts independently.
 */
static void
lp_build_brilinear_rho(struct lp_build_context *bld,
                       LLVMValueRef rho,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_ipart;
   LLVMValueRef lod_fpart;

   const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
   const double post_offset = 1 - 2*factor;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, rho));

   /*
    * The pre factor will make the intersections with the exact powers of two
    * happen precisely where we want them to be, which means that the integer
    * part will not need any post adjustments.
    */
   rho = lp_build_mul(bld, rho,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_factor));

   /* ipart = ifloor(log2(rho)) */
   lod_ipart = lp_build_extract_exponent(bld, rho, 0);

   /* fpart = rho / 2**ipart */
   lod_fpart = lp_build_extract_mantissa(bld, rho);

   lod_fpart = lp_build_mul(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor));

   lod_fpart = lp_build_add(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_ipart = lod_ipart;
   *out_lod_fpart = lod_fpart;
}
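

/*
 * Scalar sketch of the combined computation above (illustrative only,
 * not part of the build); frexpf() stands in for the exponent/mantissa
 * extraction:
 */
#if 0
static void
brilinear_rho_ref(float rho, float factor, int *ipart, float *fpart)
{
   int e;
   float m;
   rho *= (2*factor - 0.5)/(M_SQRT2*factor);   /* pre_factor */
   m = frexpf(rho, &e);        /* rho = m * 2^e, m in [0.5, 1) */
   *ipart = e - 1;             /* floor(log2(rho)) */
   *fpart = 2*m * factor + (1 - 2*factor);     /* mantissa in [1, 2) */
}
#endif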


/**
 * Generate code to compute texture level of detail (lambda).
 * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
 * \param lod_bias  optional float vector with the shader lod bias
 * \param explicit_lod  optional float vector with the explicit lod
 *
 * The resulting lod has bld->levelf format (scalar per quad or per
 * element); if it is per quad, only the first lod_bias / explicit_lod
 * value of each quad is used.
 */
void
lp_build_lod_selector(struct lp_build_sample_context *bld,
                      unsigned texture_unit,
                      unsigned sampler_unit,
                      LLVMValueRef s,
                      LLVMValueRef t,
                      LLVMValueRef r,
                      LLVMValueRef cube_rho,
                      const struct lp_derivatives *derivs,
                      LLVMValueRef lod_bias, /* optional */
                      LLVMValueRef explicit_lod, /* optional */
                      unsigned mip_filter,
                      LLVMValueRef *out_lod_ipart,
                      LLVMValueRef *out_lod_fpart)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   LLVMValueRef lod;

   *out_lod_ipart = bld->leveli_bld.zero;
   *out_lod_fpart = levelf_bld->zero;

   if (bld->static_sampler_state->min_max_lod_equal) {
      /* User is forcing sampling from a particular mipmap level.
       * This is hit during mipmap generation.
       */
      LLVMValueRef min_lod =
         bld->dynamic_state->min_lod(bld->dynamic_state,
                                     bld->gallivm, sampler_unit);

      lod = lp_build_broadcast_scalar(levelf_bld, min_lod);
   }
   else {
      if (explicit_lod) {
         if (bld->num_lods != bld->coord_type.length)
            lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                            levelf_bld->type, explicit_lod, 0);
         else
            lod = explicit_lod;
      }
      else {
         LLVMValueRef rho;

         rho = lp_build_rho(bld, texture_unit, s, t, r, cube_rho, derivs);

         /*
          * Compute lod = log2(rho)
          */

         if (!lod_bias &&
             !bld->static_sampler_state->lod_bias_non_zero &&
             !bld->static_sampler_state->apply_max_lod &&
             !bld->static_sampler_state->apply_min_lod) {
            /*
             * Special case when there are no post-log2 adjustments, which
             * saves instructions by keeping the integer and fractional lod
             * computations separate from the start.
             */

            if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
                mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
               /*
                * FIXME: this is not entirely correct, as out_lod_ipart is used
                * both for mip level determination as well as mag/min switchover
                * point (if different min/mag filters are used). In particular,
                * lod values between [-0.5,0] (rho between [sqrt(2), 1.0]) will
                * incorrectly use min filter instead of mag (the non-optimized
                * calculation further down has exactly the same problem).
                */
               *out_lod_ipart = lp_build_ilog2(levelf_bld, rho);
               *out_lod_fpart = levelf_bld->zero;
               return;
            }
            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
               lp_build_brilinear_rho(levelf_bld, rho, BRILINEAR_FACTOR,
                                      out_lod_ipart, out_lod_fpart);
               return;
            }
         }

         if (0) {
            lod = lp_build_log2(levelf_bld, rho);
         }
         else {
            lod = lp_build_fast_log2(levelf_bld, rho);
         }

         /* add shader lod bias */
         if (lod_bias) {
            if (bld->num_lods != bld->coord_type.length)
               lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                                    levelf_bld->type, lod_bias, 0);
            lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
         }
      }

      /* add sampler lod bias */
      if (bld->static_sampler_state->lod_bias_non_zero) {
         LLVMValueRef sampler_lod_bias =
            bld->dynamic_state->lod_bias(bld->dynamic_state,
                                         bld->gallivm, sampler_unit);
         sampler_lod_bias = lp_build_broadcast_scalar(levelf_bld,
                                                      sampler_lod_bias);
         lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
      }

      /* clamp lod */
      if (bld->static_sampler_state->apply_max_lod) {
         LLVMValueRef max_lod =
            bld->dynamic_state->max_lod(bld->dynamic_state,
                                        bld->gallivm, sampler_unit);
         max_lod = lp_build_broadcast_scalar(levelf_bld, max_lod);

         lod = lp_build_min(levelf_bld, lod, max_lod);
      }
      if (bld->static_sampler_state->apply_min_lod) {
         LLVMValueRef min_lod =
            bld->dynamic_state->min_lod(bld->dynamic_state,
                                        bld->gallivm, sampler_unit);
         min_lod = lp_build_broadcast_scalar(levelf_bld, min_lod);

         lod = lp_build_max(levelf_bld, lod, min_lod);
      }
   }

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
         lp_build_brilinear_lod(levelf_bld, lod, BRILINEAR_FACTOR,
                                out_lod_ipart, out_lod_fpart);
      }
      else {
         lp_build_ifloor_fract(levelf_bld, lod, out_lod_ipart, out_lod_fpart);
      }

      lp_build_name(*out_lod_fpart, "lod_fpart");
   }
   else {
      *out_lod_ipart = lp_build_iround(levelf_bld, lod);
   }

   lp_build_name(*out_lod_ipart, "lod_ipart");

   return;
}
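

/*
 * Scalar sketch of the non-special-case flow above (assumes implicit lod
 * and min_lod != max_lod; illustrative only, not part of the build):
 */
#if 0
static float
lod_selector_ref(float rho, float shader_bias, float sampler_bias,
                 float min_lod, float max_lod)
{
   float lod = log2f(rho) + shader_bias + sampler_bias;
   /* a LINEAR mip filter then splits lod into ifloor/fract (or the
    * brilinear equivalent); NEAREST/NONE just rounds to nearest int */
   return CLAMP(lod, min_lod, max_lod);
}
#endif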


/**
 * For PIPE_TEX_MIPFILTER_NEAREST, convert int part of lod
 * to actual mip level.
 * Note: this is all scalar per-quad (or per-element) code.
 * \param lod_ipart  int texture level of detail
 * \param level_out  returns the integer mipmap level
 * \param out_of_bounds  returns per coord out_of_bounds mask if provided
 */
void
lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
                           unsigned texture_unit,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *level_out,
                           LLVMValueRef *out_of_bounds)
{
   struct lp_build_context *leveli_bld = &bld->leveli_bld;
   LLVMValueRef first_level, last_level, level;

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, texture_unit);
   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                               bld->gallivm, texture_unit);
   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);

   level = lp_build_add(leveli_bld, lod_ipart, first_level);

   if (out_of_bounds) {
      LLVMValueRef out, out1;
      out = lp_build_cmp(leveli_bld, PIPE_FUNC_LESS, level, first_level);
      out1 = lp_build_cmp(leveli_bld, PIPE_FUNC_GREATER, level, last_level);
      out = lp_build_or(leveli_bld, out, out1);
      if (bld->num_lods == bld->coord_bld.type.length) {
         *out_of_bounds = out;
      }
      else if (bld->num_lods == 1) {
         *out_of_bounds = lp_build_broadcast_scalar(&bld->int_coord_bld, out);
      }
      else {
         assert(bld->num_lods == bld->coord_bld.type.length / 4);
         *out_of_bounds = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
                                                                leveli_bld->type,
                                                                bld->int_coord_bld.type,
                                                                out);
      }
      *level_out = level;
   }
   else {
      /* clamp level to legal range of levels */
      *level_out = lp_build_clamp(leveli_bld, level, first_level, last_level);
   }
}
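

/*
 * Scalar sketch of the level selection above for the clamping
 * (!out_of_bounds) variant; illustrative only, not part of the build:
 */
#if 0
static int
nearest_mip_level_ref(int lod_ipart, int first_level, int last_level)
{
   int level = lod_ipart + first_level;
   return CLAMP(level, first_level, last_level);
}
#endif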


/**
 * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad int LOD(s) to two (per-quad)
 * (adjacent) mipmap level indexes, and fix up float lod part accordingly.
 * Later, we'll sample from those two mipmap levels and interpolate between them.
 */
void
lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                           unsigned texture_unit,
                           LLVMValueRef lod_ipart,
                           LLVMValueRef *lod_fpart_inout,
                           LLVMValueRef *level0_out,
                           LLVMValueRef *level1_out)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context *leveli_bld = &bld->leveli_bld;
   struct lp_build_context *levelf_bld = &bld->levelf_bld;
   LLVMValueRef first_level, last_level;
   LLVMValueRef clamp_min;
   LLVMValueRef clamp_max;

   first_level = bld->dynamic_state->first_level(bld->dynamic_state,
                                                 bld->gallivm, texture_unit);
   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                               bld->gallivm, texture_unit);
   first_level = lp_build_broadcast_scalar(leveli_bld, first_level);
   last_level = lp_build_broadcast_scalar(leveli_bld, last_level);

   *level0_out = lp_build_add(leveli_bld, lod_ipart, first_level);
   *level1_out = lp_build_add(leveli_bld, *level0_out, leveli_bld->one);

   /*
    * Clamp both *level0_out and *level1_out to [first_level, last_level],
    * with the minimum number of comparisons, zeroing lod_fpart at the
    * extreme ends in the process.
    */

   /*
    * This code (the vector select in particular) only works with llvm 3.1
    * or later (when there's more than one quad, with the x86 backend).
    * Might consider converting to our lp_bld_logic helpers.
    */
#if HAVE_LLVM < 0x0301
   assert(leveli_bld->type.length == 1);
#endif

   /* *level0_out < first_level */
   clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
                             *level0_out, first_level,
                             "clamp_lod_to_first");

   *level0_out = LLVMBuildSelect(builder, clamp_min,
                                 first_level, *level0_out, "");

   *level1_out = LLVMBuildSelect(builder, clamp_min,
                                 first_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
                                      levelf_bld->zero, *lod_fpart_inout, "");

   /* *level0_out >= last_level */
   clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
                             *level0_out, last_level,
                             "clamp_lod_to_last");

   *level0_out = LLVMBuildSelect(builder, clamp_max,
                                 last_level, *level0_out, "");

   *level1_out = LLVMBuildSelect(builder, clamp_max,
                                 last_level, *level1_out, "");

   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
                                      levelf_bld->zero, *lod_fpart_inout, "");

   lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit);
   lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit);
   lp_build_name(*lod_fpart_inout, "texture%u_mipweight", texture_unit);
}
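

/*
 * Scalar sketch of the clamping above (illustrative only, not part of
 * the build): two adjacent levels, with the interpolation weight forced
 * to zero when the unclamped level0 falls outside [first_level, last_level].
 */
#if 0
static void
linear_mip_levels_ref(int lod_ipart, float *lod_fpart,
                      int first_level, int last_level,
                      int *level0, int *level1)
{
   *level0 = lod_ipart + first_level;
   *level1 = *level0 + 1;
   if (*level0 < first_level) {
      *level0 = *level1 = first_level;
      *lod_fpart = 0.0f;
   }
   if (*level0 >= last_level) {
      *level0 = *level1 = last_level;
      *lod_fpart = 0.0f;
   }
}
#endif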


/**
 * Return pointer to a single mipmap level.
 * \param level  integer mipmap level
 */
LLVMValueRef
lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
                          LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], data_ptr, mip_offset;

   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   indexes[1] = level;
   mip_offset = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
   mip_offset = LLVMBuildLoad(builder, mip_offset, "");
   data_ptr = LLVMBuildGEP(builder, bld->base_ptr, &mip_offset, 1, "");
   return data_ptr;
}

/**
 * Return (per-pixel) offsets to mip levels.
 * \param level  integer mipmap level
 */
LLVMValueRef
lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
                         LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], offsets, offset1;

   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   if (bld->num_lods == 1) {
      indexes[1] = level;
      offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
      offset1 = LLVMBuildLoad(builder, offset1, "");
      offsets = lp_build_broadcast_scalar(&bld->int_coord_bld, offset1);
   }
   else if (bld->num_lods == bld->coord_bld.type.length / 4) {
      unsigned i;

      offsets = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_lods; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
         offset1 = LLVMBuildLoad(builder, offset1, "");
         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexo, "");
      }
      offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, offsets, 0, 4);
   }
   else {
      unsigned i;

      assert (bld->num_lods == bld->coord_bld.type.length);

      offsets = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_lods; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         offset1 = LLVMBuildGEP(builder, bld->mip_offsets, indexes, 2, "");
         offset1 = LLVMBuildLoad(builder, offset1, "");
         offsets = LLVMBuildInsertElement(builder, offsets, offset1, indexi, "");
      }
   }
   return offsets;
}


/**
 * Codegen equivalent for u_minify().
 * Return max(1, base_size >> level);
 */
LLVMValueRef
lp_build_minify(struct lp_build_context *bld,
                LLVMValueRef base_size,
                LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   assert(lp_check_value(bld->type, base_size));
   assert(lp_check_value(bld->type, level));

   if (level == bld->zero) {
      /* if we're using mipmap level zero, no minification is needed */
      return base_size;
   }
   else {
      LLVMValueRef size =
         LLVMBuildLShr(builder, base_size, level, "minify");
      assert(bld->type.sign);
      size = lp_build_max(bld, size, bld->one);
      return size;
   }
}
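

/*
 * For reference, the scalar counterpart this mirrors (essentially
 * u_minify() from util/u_math.h); illustrative only:
 */
#if 0
static int
minify_ref(int base_size, int level)
{
   return MAX2(1, base_size >> level);
}
#endif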


/**
 * Dereference stride_array[mipmap_level] array to get a stride.
 * Return stride as a vector.
 */
static LLVMValueRef
lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
                              LLVMValueRef stride_array, LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], stride, stride1;
   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   if (bld->num_lods == 1) {
      indexes[1] = level;
      stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
      stride1 = LLVMBuildLoad(builder, stride1, "");
      stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1);
   }
   else if (bld->num_lods == bld->coord_bld.type.length / 4) {
      LLVMValueRef stride1;
      unsigned i;

      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_lods; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
      }
      stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4);
   }
   else {
      LLVMValueRef stride1;
      unsigned i;

      assert (bld->num_lods == bld->coord_bld.type.length);

      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->coord_bld.type.length; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, "");
      }
   }
   return stride;
}


/**
 * When sampling a mipmap, we need to compute the width, height, depth
 * of the source levels from the level indexes. This helper function
 * does that.
 */
void
lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
                            LLVMValueRef ilevel,
                            LLVMValueRef *out_size,
                            LLVMValueRef *row_stride_vec,
                            LLVMValueRef *img_stride_vec)
{
   const unsigned dims = bld->dims;
   LLVMValueRef ilevel_vec;

   /*
    * Compute width, height, depth at mipmap level 'ilevel'
    */
   if (bld->num_lods == 1) {
      ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
      *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
   }
   else {
      LLVMValueRef int_size_vec;
      LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
      unsigned num_quads = bld->coord_bld.type.length / 4;
      unsigned i;

      if (bld->num_lods == num_quads) {
         /*
          * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
          * intel "forgot" the variable shift count instruction until avx2.
          * A harmless 8x32 shift gets translated into 32 instructions
          * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
          * unable to recognize if there are really just 2 different shift
          * count values. So do the shift 4-wide before expansion.
          */
         struct lp_build_context bld4;
         struct lp_type type4;

         type4 = bld->int_coord_bld.type;
         type4.length = 4;

         lp_build_context_init(&bld4, bld->gallivm, type4);

         if (bld->dims == 1) {
            assert(bld->int_size_in_bld.type.length == 1);
            int_size_vec = lp_build_broadcast_scalar(&bld4,
                                                     bld->int_size);
         }
         else {
            assert(bld->int_size_in_bld.type.length == 4);
            int_size_vec = bld->int_size;
         }

         for (i = 0; i < num_quads; i++) {
            LLVMValueRef ileveli;
            LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);

            ileveli = lp_build_extract_broadcast(bld->gallivm,
                                                 bld->leveli_bld.type,
                                                 bld4.type,
                                                 ilevel,
                                                 indexi);
            tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli);
         }
         /*
          * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
          * [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
          */
         *out_size = lp_build_concat(bld->gallivm,
                                     tmp,
                                     bld4.type,
                                     num_quads);
      }
      else {
         /* FIXME: this is terrible and results in a _huge_ vector
          * (for the dims > 1 case).
          * Should refactor this (together with extract_image_sizes) and do
          * something more useful. For instance, if we have width and height
          * in a 4-wide vector we could pack all elements into an 8xi16
          * vector (on which we can still do useful math) instead of using
          * a 16xi32 vector.
          * FIXME: some callers can't handle this yet.
          * For dims == 1 this will create a [w0, w1, w2, w3, ...] vector.
          * For dims > 1 this will create a [w0, h0, d0, _, w1, h1, d1, _, ...]
          * vector.
          */
         assert(bld->num_lods == bld->coord_bld.type.length);
         if (bld->dims == 1) {
            assert(bld->int_size_in_bld.type.length == 1);
            int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
                                                     bld->int_size);
            /* vector shift with variable shift count alert... */
            *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel);
         }
         else {
            LLVMValueRef ilevel1;
            for (i = 0; i < bld->num_lods; i++) {
               LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
               ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
                                                    bld->int_size_in_bld.type, ilevel, indexi);
               tmp[i] = bld->int_size;
               tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i], ilevel1);
            }
            *out_size = lp_build_concat(bld->gallivm, tmp,
                                        bld->int_size_in_bld.type,
                                        bld->num_lods);
         }
      }
   }

   if (dims >= 2) {
      *row_stride_vec = lp_build_get_level_stride_vec(bld,
                                                      bld->row_stride_array,
                                                      ilevel);
   }
   if (dims == 3 ||
       bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
      *img_stride_vec = lp_build_get_level_stride_vec(bld,
                                                      bld->img_stride_array,
                                                      ilevel);
   }
}

/**
 * Extract and broadcast texture size.
 *
 * @param size_bld  build context for the texture size vector (either
 *                  bld->int_size_bld or bld->float_size_bld)
 * @param coord_type  type of the coordinate vector (either
 *                    bld->int_coord_type or bld->coord_type)
 * @param size  vector with the texture size (width, height, depth)
 */
void
lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
                             struct lp_build_context *size_bld,
                             struct lp_type coord_type,
                             LLVMValueRef size,
                             LLVMValueRef *out_width,
                             LLVMValueRef *out_height,
                             LLVMValueRef *out_depth)
{
   const unsigned dims = bld->dims;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   struct lp_type size_type = size_bld->type;

   if (bld->num_lods == 1) {
      *out_width = lp_build_extract_broadcast(bld->gallivm,
                                              size_type,
                                              coord_type,
                                              size,
                                              LLVMConstInt(i32t, 0, 0));
      if (dims >= 2) {
         *out_height = lp_build_extract_broadcast(bld->gallivm,
                                                  size_type,
                                                  coord_type,
                                                  size,
                                                  LLVMConstInt(i32t, 1, 0));
         if (dims == 3) {
            *out_depth = lp_build_extract_broadcast(bld->gallivm,
                                                    size_type,
                                                    coord_type,
                                                    size,
                                                    LLVMConstInt(i32t, 2, 0));
         }
      }
   }
   else {
      unsigned num_quads = bld->coord_bld.type.length / 4;

      if (dims == 1) {
         *out_width = size;
      }
      else if (bld->num_lods == num_quads) {
         *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4);
         if (dims >= 2) {
            *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4);
            if (dims == 3) {
               *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4);
            }
         }
      }
      else {
         assert(bld->num_lods == bld->coord_type.length);
         *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                coord_type, size, 0);
         if (dims >= 2) {
            *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                    coord_type, size, 1);
            if (dims == 3) {
               *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                      coord_type, size, 2);
            }
         }
      }
   }
}

/**
 * Unnormalize coords.
 *
 * @param flt_size  vector with the texture size (width, height, depth)
 *                  as floats
 */
void
lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
                             LLVMValueRef flt_size,
                             LLVMValueRef *s,
                             LLVMValueRef *t,
                             LLVMValueRef *r)
{
   const unsigned dims = bld->dims;
   LLVMValueRef width;
   LLVMValueRef height;
   LLVMValueRef depth;

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width,
                                &height,
                                &depth);

   /* s = s * width, t = t * height */
   *s = lp_build_mul(&bld->coord_bld, *s, width);
   if (dims >= 2) {
      *t = lp_build_mul(&bld->coord_bld, *t, height);
      if (dims >= 3) {
         *r = lp_build_mul(&bld->coord_bld, *r, depth);
      }
   }
}


/** Helper used by lp_build_cube_lookup() */
static LLVMValueRef
lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
{
   /* ima = +0.5 / abs(coord); */
   LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
   LLVMValueRef ima = lp_build_div(coord_bld, posHalf, absCoord);
   return ima;
}

/** Helper used by lp_build_cube_lookup() */
static LLVMValueRef
lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord)
{
   /* ima = -0.5 / abs(coord); */
   LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5);
   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
   LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord);
   return ima;
}

/**
 * Helper used by lp_build_cube_lookup()
 * FIXME: the sign here can also be 0.
 * Arithmetically this could definitely make a difference. Either
 * fix the comment or use another (simpler) sign function, not sure
 * which one it should be.
 * \param sign  scalar +1 or -1 (or NULL)
 * \param coord  float vector
 * \param ima  float vector
 */
static LLVMValueRef
lp_build_cube_coord(struct lp_build_context *coord_bld,
                    LLVMValueRef sign, int negate_coord,
                    LLVMValueRef coord, LLVMValueRef ima)
{
   /* return negate(coord) * ima * sign + 0.5; */
   LLVMValueRef half = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef res;

   assert(negate_coord == +1 || negate_coord == -1);

   if (negate_coord == -1) {
      coord = lp_build_negate(coord_bld, coord);
   }

   res = lp_build_mul(coord_bld, coord, ima);
   if (sign) {
      sign = lp_build_broadcast_scalar(coord_bld, sign);
      res = lp_build_mul(coord_bld, res, sign);
   }
   res = lp_build_add(coord_bld, res, half);

   return res;
}


/** Helper used by lp_build_cube_lookup()
 * Return (major_coord >= 0) ? pos_face : neg_face;
 */
static LLVMValueRef
lp_build_cube_face(struct lp_build_sample_context *bld,
                   LLVMValueRef major_coord,
                   unsigned pos_face, unsigned neg_face)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef cmp = LLVMBuildFCmp(builder, LLVMRealUGE,
                                    major_coord,
                                    bld->float_bld.zero, "");
   LLVMValueRef pos = lp_build_const_int32(gallivm, pos_face);
   LLVMValueRef neg = lp_build_const_int32(gallivm, neg_face);
   LLVMValueRef res = LLVMBuildSelect(builder, cmp, pos, neg, "");
   return res;
}



/**
 * Generate code to do cube face selection and compute per-face texcoords.
 */
void
lp_build_cube_lookup(struct lp_build_sample_context *bld,
                     LLVMValueRef *coords,
                     const struct lp_derivatives *derivs, /* optional */
                     LLVMValueRef *rho,
                     boolean need_derivs)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMValueRef si, ti, ri;

   if (1 || coord_bld->type.length > 4) {
      /*
       * Do per-pixel face selection. We cannot however (as we used to do)
       * simply calculate the derivs afterwards (which is very bogus for
       * explicit derivs btw) because the values would be "random" when
       * not all pixels lie on the same face. So what we do here is just
       * calculate the derivatives after scaling the coords by the absolute
       * value of the inverse major axis, and essentially do rho calculation
       * steps as if it were a 3d texture. This is perfect if all pixels hit
       * the same face, but not so great at edges, I believe the max error
       * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially measuring
       * the 3d distance between 2 points on the cube instead of measuring up/down
       * the edge). Still this is possibly a win over just selecting the same face
       * for all pixels. Unfortunately, something like that doesn't work for
       * explicit derivatives.
       * TODO: handle explicit derivatives by transforming them alongside coords
       * somehow.
       */
      struct lp_build_context *cint_bld = &bld->int_coord_bld;
      struct lp_type intctype = cint_bld->type;
      LLVMValueRef signs, signt, signr, signma;
      LLVMValueRef as, at, ar, face, face_s, face_t;
      LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
      LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
      LLVMValueRef tnegi, rnegi;
      LLVMValueRef ma, mai, ima;
      LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
      LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
                                                     1 << (intctype.width - 1));
      LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
                                                      intctype.width -1);
      LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
      LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
      LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
      LLVMValueRef s = coords[0];
      LLVMValueRef t = coords[1];
      LLVMValueRef r = coords[2];

      assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
      assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
      assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);

      /*
       * get absolute value (for x/y/z face selection) and sign bit
       * (for mirroring minor coords and pos/neg face selection)
       * of the original coords.
       */
      as = lp_build_abs(&bld->coord_bld, s);
      at = lp_build_abs(&bld->coord_bld, t);
      ar = lp_build_abs(&bld->coord_bld, r);

      /*
       * major face determination: select x if x > y else select y
       * select z if z >= max(x,y) else select previous result
       * if some axes are the same we choose z over y, y over x - the
       * dx10 spec seems to ask for it while OpenGL doesn't care (if we
       * didn't care we could save a select or two by using different
       * compares and doing at_g_as_ar last, since tnewx and tnewz are
       * the same).
       */
      as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
      maxasat = lp_build_max(coord_bld, as, at);
      ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);

      if (need_derivs) {
         LLVMValueRef ddx_ddy[2], tmp[3], rho_vec;
         static const unsigned char swizzle0[] = { /* no-op swizzle */
            0, LP_BLD_SWIZZLE_DONTCARE,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle1[] = {
            1, LP_BLD_SWIZZLE_DONTCARE,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle01[] = { /* no-op swizzle */
            0, 1,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle23[] = {
            2, 3,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };
         static const unsigned char swizzle02[] = {
            0, 2,
            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
         };

         /*
          * scale the s/t/r coords pre-select/mirror so we can calculate
          * "reasonable" derivs.
          */
         ma = lp_build_select(coord_bld, as_ge_at, s, t);
         ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
         ima = lp_build_cube_imapos(coord_bld, ma);
         s = lp_build_mul(coord_bld, s, ima);
         t = lp_build_mul(coord_bld, t, ima);
         r = lp_build_mul(coord_bld, r, ima);

         /*
          * This isn't quite the same as the "ordinary" (3d deriv) path since we
          * know the texture is square which simplifies things (we can omit the
          * size mul which happens very early completely here and do it at the
          * very end).
          */
         ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
         ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);

         if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
            ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
         }
         else {
            ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
            ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
         }

         tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
         tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
         tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);

         if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
            rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
            rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
         }
         else {
            rho_vec = lp_build_max(coord_bld, tmp[0], tmp[1]);
            rho_vec = lp_build_max(coord_bld, rho_vec, tmp[2]);
         }

         tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
         tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
         *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
      }

      si = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
      ti = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
      ri = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
      signs = LLVMBuildAnd(builder, si, signmask, "");
      signt = LLVMBuildAnd(builder, ti, signmask, "");
      signr = LLVMBuildAnd(builder, ri, signmask, "");

      /*
       * compute all possible new s/t coords
       * snewx = signs * -r;
       * tnewx = -t;
       * snewy = s;
       * tnewy = signt * r;
       * snewz = signr * s;
       * tnewz = -t;
       */
      tnegi = LLVMBuildXor(builder, ti, signmask, "");
      rnegi = LLVMBuildXor(builder, ri, signmask, "");

      snewx = LLVMBuildXor(builder, signs, rnegi, "");
      tnewx = tnegi;

      snewy = si;
      tnewy = LLVMBuildXor(builder, signt, ri, "");

      snewz = LLVMBuildXor(builder, signr, si, "");
      tnewz = tnegi;

      /* XXX on x86 it's unclear if we should cast the values back to float
       * or not - on some cpus (nehalem) pblendvb has twice the throughput
       * of blendvps, though on others there just might be domain
       * transition penalties when using it (this depends on what llvm
       * will choose for the bit ops above, so there appears to be no
       * "right way"; but given the boatload of selects let's just use
       * the int type).
       */
1609
1610 /* select/mirror */
1611 if (!need_derivs) {
1612 ma = lp_build_select(coord_bld, as_ge_at, s, t);
1613 }
1614 face_s = lp_build_select(cint_bld, as_ge_at, snewx, snewy);
1615 face_t = lp_build_select(cint_bld, as_ge_at, tnewx, tnewy);
1616 face = lp_build_select(cint_bld, as_ge_at, facex, facey);
1617
1618 if (!need_derivs) {
1619 ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
1620 }
1621 face_s = lp_build_select(cint_bld, ar_ge_as_at, snewz, face_s);
1622 face_t = lp_build_select(cint_bld, ar_ge_as_at, tnewz, face_t);
1623 face = lp_build_select(cint_bld, ar_ge_as_at, facez, face);
1624
1625 face_s = LLVMBuildBitCast(builder, face_s,
1626 lp_build_vec_type(gallivm, coord_bld->type), "");
1627 face_t = LLVMBuildBitCast(builder, face_t,
1628 lp_build_vec_type(gallivm, coord_bld->type), "");
1629
1630 /* add +1 for neg face */
1631 /* XXX with AVX probably want to use another select here -
1632 * as long as we ensure vblendvps gets used we can actually
1633 * skip the comparison and just use sign as a "mask" directly.
1634 */
1635 mai = LLVMBuildBitCast(builder, ma, lp_build_vec_type(gallivm, intctype), "");
1636 signma = LLVMBuildLShr(builder, mai, signshift, "");
1637 coords[2] = LLVMBuildOr(builder, face, signma, "face");
1638
1639 /* project coords */
1640 if (!need_derivs) {
1641 ima = lp_build_cube_imapos(coord_bld, ma);
1642 face_s = lp_build_mul(coord_bld, face_s, ima);
1643 face_t = lp_build_mul(coord_bld, face_t, ima);
1644 }
1645
      coords[0] = lp_build_add(coord_bld, face_s, posHalf);
      coords[1] = lp_build_add(coord_bld, face_t, posHalf);
   }

   else {
      struct lp_build_if_state if_ctx;
      LLVMValueRef face_s_var;
      LLVMValueRef face_t_var;
      LLVMValueRef face_var;
      LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
      LLVMValueRef shuffles[4];
      LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz;
      LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz;
      LLVMValueRef tmp[4], rxyz, arxyz;
      struct lp_build_context *float_bld = &bld->float_bld;
      LLVMValueRef s, t, r, face, face_s, face_t;

      assert(bld->coord_bld.type.length == 4);

      tmp[0] = s = coords[0];
      tmp[1] = t = coords[1];
      tmp[2] = r = coords[2];
      rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
      arxyz = lp_build_abs(&bld->coord_bld, rxyz);

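      /*
       * rxyz should now hold the per-coord horizontal sums over the quad
       * (lp_build_hadd_partial4 sums the four lanes of each input into
       * successive elements), so this path picks one face for all four
       * pixels based on the summed coords.
       */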
      shuffles[0] = lp_build_const_int32(gallivm, 0);
      shuffles[1] = lp_build_const_int32(gallivm, 1);
      shuffles[2] = lp_build_const_int32(gallivm, 0);
      shuffles[3] = lp_build_const_int32(gallivm, 1);
      arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
      shuffles[0] = lp_build_const_int32(gallivm, 1);
      shuffles[1] = lp_build_const_int32(gallivm, 0);
      shuffles[2] = lp_build_const_int32(gallivm, 2);
      shuffles[3] = lp_build_const_int32(gallivm, 2);
      aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
      arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz);

      shuffles[0] = lp_build_const_int32(gallivm, 0);
      shuffles[1] = lp_build_const_int32(gallivm, 1);
      arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
                                            LLVMConstVector(shuffles, 2), "");
      shuffles[0] = lp_build_const_int32(gallivm, 2);
      shuffles[1] = lp_build_const_int32(gallivm, 3);
      arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
                                            LLVMConstVector(shuffles, 2), "");
      arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, "");

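      /*
       * The shuffle/compare dance above boils down to:
       *    arxy_ge_arxy_arzz[0] = |rx| >= |ry| && |rx| >= |rz|
       *    arxy_ge_arxy_arzz[1] = |ry| >= |rx| && |ry| >= |rz|
       * which the extracts below turn into the two scalar branch conditions.
       */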
      arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
                                               lp_build_const_int32(gallivm, 0), "");
      arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz,
                                     lp_build_const_int32(gallivm, 0), "");
      ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
                                               lp_build_const_int32(gallivm, 1), "");
      ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz,
                                     lp_build_const_int32(gallivm, 0), "");
      face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var");
      face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var");
      face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var");

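      /*
       * The results are computed in different basic blocks of the if/else
       * cascade below, so they are communicated through stack variables
       * and loaded back after the endif (llvm will promote these allocas
       * to registers/phis).
       */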
      lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz);
      {
         /* +/- X face */
         LLVMValueRef sign, ima;
         si = LLVMBuildExtractElement(builder, rxyz,
                                      lp_build_const_int32(gallivm, 0), "");
         sign = lp_build_sgn(float_bld, si);
         ima = lp_build_cube_imaneg(coord_bld, s);
         face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
         face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
         face = lp_build_cube_face(bld, si,
                                   PIPE_TEX_FACE_POS_X,
                                   PIPE_TEX_FACE_NEG_X);
         LLVMBuildStore(builder, face_s, face_s_var);
         LLVMBuildStore(builder, face_t, face_t_var);
         LLVMBuildStore(builder, face, face_var);
      }
      lp_build_else(&if_ctx);
      {
         struct lp_build_if_state if_ctx2;

         lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz);
         {
            LLVMValueRef sign, ima;
            /* +/- Y face */
            ti = LLVMBuildExtractElement(builder, rxyz,
                                         lp_build_const_int32(gallivm, 1), "");
            sign = lp_build_sgn(float_bld, ti);
            ima = lp_build_cube_imaneg(coord_bld, t);
            face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
            face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
            face = lp_build_cube_face(bld, ti,
                                      PIPE_TEX_FACE_POS_Y,
                                      PIPE_TEX_FACE_NEG_Y);
            LLVMBuildStore(builder, face_s, face_s_var);
            LLVMBuildStore(builder, face_t, face_t_var);
            LLVMBuildStore(builder, face, face_var);
         }
         lp_build_else(&if_ctx2);
         {
            /* +/- Z face */
            LLVMValueRef sign, ima;
            ri = LLVMBuildExtractElement(builder, rxyz,
                                         lp_build_const_int32(gallivm, 2), "");
            sign = lp_build_sgn(float_bld, ri);
            ima = lp_build_cube_imaneg(coord_bld, r);
            face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
            face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
            face = lp_build_cube_face(bld, ri,
                                      PIPE_TEX_FACE_POS_Z,
                                      PIPE_TEX_FACE_NEG_Z);
            LLVMBuildStore(builder, face_s, face_s_var);
            LLVMBuildStore(builder, face_t, face_t_var);
            LLVMBuildStore(builder, face, face_var);
         }
         lp_build_endif(&if_ctx2);
      }

      lp_build_endif(&if_ctx);

      coords[0] = LLVMBuildLoad(builder, face_s_var, "face_s");
      coords[1] = LLVMBuildLoad(builder, face_t_var, "face_t");
      face = LLVMBuildLoad(builder, face_var, "face");
      coords[2] = lp_build_broadcast_scalar(&bld->int_coord_bld, face);
   }
}


/**
 * Compute the partial offset of a pixel block along an arbitrary axis.
 *
 * @param block_length number of pixels in a pixel block along the coordinate
 *                     axis
 * @param coord coordinate in pixels
 * @param stride number of bytes between rows of successive pixel blocks
 * @param out_offset resulting relative offset of the pixel block in bytes
 * @param out_subcoord resulting sub-block pixel coordinate
 */
void
lp_build_sample_partial_offset(struct lp_build_context *bld,
                               unsigned block_length,
                               LLVMValueRef coord,
                               LLVMValueRef stride,
                               LLVMValueRef *out_offset,
                               LLVMValueRef *out_subcoord)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef offset;
   LLVMValueRef subcoord;

   if (block_length == 1) {
      subcoord = bld->zero;
   }
   else {
      /*
       * Pixel blocks have power of two dimensions. LLVM should convert the
       * rem/div to bit arithmetic.
       * TODO: Verify this.
       * It does indeed, BUT it transforms it to scalar (and back) when doing
       * so (using roughly extract, shift/and, mov, unpack) (llvm 2.7).
       * The generated code looks seriously unfunny and is quite expensive.
       */
#if 0
      LLVMValueRef block_width = lp_build_const_int_vec(bld->gallivm, bld->type, block_length);
      subcoord = LLVMBuildURem(builder, coord, block_width, "");
      coord = LLVMBuildUDiv(builder, coord, block_width, "");
#else
      unsigned logbase2 = util_logbase2(block_length);
      LLVMValueRef block_shift = lp_build_const_int_vec(bld->gallivm, bld->type, logbase2);
      LLVMValueRef block_mask = lp_build_const_int_vec(bld->gallivm, bld->type, block_length - 1);
      subcoord = LLVMBuildAnd(builder, coord, block_mask, "");
      coord = LLVMBuildLShr(builder, coord, block_shift, "");
#endif
   }

   offset = lp_build_mul(bld, coord, stride);

   assert(out_offset);
   assert(out_subcoord);

   *out_offset = offset;
   *out_subcoord = subcoord;
}
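
/*
 * Purely as illustration of the shift/mask path above, a scalar sketch of
 * what a single lane computes (ref_partial_offset is a hypothetical helper,
 * not used anywhere; assumes block_length is a power of two, as pixel
 * blocks always are):
 *
 *    static void
 *    ref_partial_offset(unsigned block_length, unsigned coord,
 *                       unsigned stride,
 *                       unsigned *out_offset, unsigned *out_subcoord)
 *    {
 *       *out_subcoord = coord & (block_length - 1);  // coord % block_length
 *       *out_offset =                                // coord / block_length
 *          (coord >> util_logbase2(block_length)) * stride;
 *    }
 *
 * E.g. block_length = 4, coord = 13, stride = 64 gives subcoord = 1,
 * block index = 3, offset = 192 bytes.
 */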


/**
 * Compute the offset of a pixel block.
 *
 * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
 *
 * Returns the relative offset and i,j sub-block coordinates.
 */
void
lp_build_sample_offset(struct lp_build_context *bld,
                       const struct util_format_description *format_desc,
                       LLVMValueRef x,
                       LLVMValueRef y,
                       LLVMValueRef z,
                       LLVMValueRef y_stride,
                       LLVMValueRef z_stride,
                       LLVMValueRef *out_offset,
                       LLVMValueRef *out_i,
                       LLVMValueRef *out_j)
{
   LLVMValueRef x_stride;
   LLVMValueRef offset;

   x_stride = lp_build_const_vec(bld->gallivm, bld->type,
                                 format_desc->block.bits/8);

   lp_build_sample_partial_offset(bld,
                                  format_desc->block.width,
                                  x, x_stride,
                                  &offset, out_i);

   if (y && y_stride) {
      LLVMValueRef y_offset;
      lp_build_sample_partial_offset(bld,
                                     format_desc->block.height,
                                     y, y_stride,
                                     &y_offset, out_j);
      offset = lp_build_add(bld, offset, y_offset);
   }
   else {
      *out_j = bld->zero;
   }

   if (z && z_stride) {
      LLVMValueRef z_offset;
      LLVMValueRef k;
      lp_build_sample_partial_offset(bld,
                                     1, /* pixel blocks are always 2D */
                                     z, z_stride,
                                     &z_offset, &k);
      offset = lp_build_add(bld, offset, z_offset);
   }

   *out_offset = offset;
}
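
/*
 * For reference, the combined computation above amounts to (per element):
 *
 *    offset = (x / block_width)  * (block.bits / 8)
 *           + (y / block_height) * y_stride
 *           +  z                 * z_stride
 *    i = x % block_width
 *    j = y % block_height
 *
 * i.e. a byte offset to the pixel block plus the texel's coordinates
 * within that block.
 */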