gallivm: optimize gather a bit, by using supplied destination type
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_sample_aos.c
1 /**************************************************************************
2 *
3 * Copyright 2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 /**
29 * @file
30 * Texture sampling -- AoS.
31 *
32 * @author Jose Fonseca <jfonseca@vmware.com>
33 * @author Brian Paul <brianp@vmware.com>
34 */
35
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "util/u_debug.h"
39 #include "util/u_dump.h"
40 #include "util/u_memory.h"
41 #include "util/u_math.h"
42 #include "util/u_format.h"
43 #include "util/u_cpu_detect.h"
44 #include "lp_bld_debug.h"
45 #include "lp_bld_type.h"
46 #include "lp_bld_const.h"
47 #include "lp_bld_conv.h"
48 #include "lp_bld_arit.h"
49 #include "lp_bld_bitarit.h"
50 #include "lp_bld_logic.h"
51 #include "lp_bld_swizzle.h"
52 #include "lp_bld_pack.h"
53 #include "lp_bld_flow.h"
54 #include "lp_bld_gather.h"
55 #include "lp_bld_format.h"
56 #include "lp_bld_init.h"
57 #include "lp_bld_sample.h"
58 #include "lp_bld_sample_aos.h"
59 #include "lp_bld_quad.h"
60
61
62 /**
63 * Build LLVM code for texture coord wrapping, for nearest filtering,
64 * for scaled integer texcoords.
65 * \param block_length is the length of the pixel block along the
66 * coordinate axis
67 * \param coord the incoming texcoord (s,t or r) scaled to the texture size
68 * \param coord_f the incoming texcoord (s,t or r) as float vec
69 * \param length the texture size along one dimension
70 * \param stride pixel stride along the coordinate axis (in bytes)
71 * \param offset the texel offset along the coord axis
72 * \param is_pot if TRUE, length is a power of two
73 * \param wrap_mode one of PIPE_TEX_WRAP_x
74 * \param out_offset byte offset for the wrapped coordinate
75 * \param out_i resulting sub-block pixel coordinate for coord0
76 */
77 static void
78 lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
79 unsigned block_length,
80 LLVMValueRef coord,
81 LLVMValueRef coord_f,
82 LLVMValueRef length,
83 LLVMValueRef stride,
84 LLVMValueRef offset,
85 boolean is_pot,
86 unsigned wrap_mode,
87 LLVMValueRef *out_offset,
88 LLVMValueRef *out_i)
89 {
90 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
91 LLVMBuilderRef builder = bld->gallivm->builder;
92 LLVMValueRef length_minus_one;
93
94 length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
95
96 switch(wrap_mode) {
97 case PIPE_TEX_WRAP_REPEAT:
98 if(is_pot)
99 coord = LLVMBuildAnd(builder, coord, length_minus_one, "");
100 else {
101 struct lp_build_context *coord_bld = &bld->coord_bld;
102 LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
103 if (offset) {
104 offset = lp_build_int_to_float(coord_bld, offset);
105 offset = lp_build_div(coord_bld, offset, length_f);
106 coord_f = lp_build_add(coord_bld, coord_f, offset);
107 }
108 coord = lp_build_fract_safe(coord_bld, coord_f);
109 coord = lp_build_mul(coord_bld, coord, length_f);
110 coord = lp_build_itrunc(coord_bld, coord);
111 }
112 break;
113
114 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
115 coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
116 coord = lp_build_min(int_coord_bld, coord, length_minus_one);
117 break;
118
119 case PIPE_TEX_WRAP_CLAMP:
120 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
121 case PIPE_TEX_WRAP_MIRROR_REPEAT:
122 case PIPE_TEX_WRAP_MIRROR_CLAMP:
123 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
124 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
125 default:
126 assert(0);
127 }
128
129 lp_build_sample_partial_offset(int_coord_bld, block_length, coord, stride,
130 out_offset, out_i);
131 }
132
133
134 /**
135 * Build LLVM code for texture coord wrapping, for nearest filtering,
136 * for float texcoords.
137 * \param coord the incoming texcoord (s,t or r)
138 * \param length the texture size along one dimension
139 * \param offset the texel offset along the coord axis
140 * \param is_pot if TRUE, length is a power of two
141 * \param wrap_mode one of PIPE_TEX_WRAP_x
142 * \param icoord the texcoord after wrapping, as int
143 */
144 static void
145 lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld,
146 LLVMValueRef coord,
147 LLVMValueRef length,
148 LLVMValueRef offset,
149 boolean is_pot,
150 unsigned wrap_mode,
151 LLVMValueRef *icoord)
152 {
153 struct lp_build_context *coord_bld = &bld->coord_bld;
154 LLVMValueRef length_minus_one;
155
156 switch(wrap_mode) {
157 case PIPE_TEX_WRAP_REPEAT:
158 if (offset) {
159 /* this is definitely not ideal for POT case */
160 offset = lp_build_int_to_float(coord_bld, offset);
161 offset = lp_build_div(coord_bld, offset, length);
162 coord = lp_build_add(coord_bld, coord, offset);
163 }
164 /* take fraction, unnormalize */
165 coord = lp_build_fract_safe(coord_bld, coord);
166 coord = lp_build_mul(coord_bld, coord, length);
167 *icoord = lp_build_itrunc(coord_bld, coord);
168 break;
169 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
170 length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
171 if (bld->static_sampler_state->normalized_coords) {
172 /* scale coord to length */
173 coord = lp_build_mul(coord_bld, coord, length);
174 }
175 if (offset) {
176 offset = lp_build_int_to_float(coord_bld, offset);
177 coord = lp_build_add(coord_bld, coord, offset);
178 }
179 coord = lp_build_clamp(coord_bld, coord, coord_bld->zero,
180 length_minus_one);
181 *icoord = lp_build_itrunc(coord_bld, coord);
182 break;
183
184 case PIPE_TEX_WRAP_CLAMP:
185 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
186 case PIPE_TEX_WRAP_MIRROR_REPEAT:
187 case PIPE_TEX_WRAP_MIRROR_CLAMP:
188 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
189 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
190 default:
191 assert(0);
192 }
193 }
194
195
196 /**
197 * Helper to compute the first coord and the weight for
198 * linear wrap repeat npot textures
199 */
200 static void
201 lp_build_coord_repeat_npot_linear_int(struct lp_build_sample_context *bld,
202 LLVMValueRef coord_f,
203 LLVMValueRef length_i,
204 LLVMValueRef length_f,
205 LLVMValueRef *coord0_i,
206 LLVMValueRef *weight_i)
207 {
208 struct lp_build_context *coord_bld = &bld->coord_bld;
209 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
210 struct lp_build_context abs_coord_bld;
211 struct lp_type abs_type;
212 LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
213 int_coord_bld->one);
214 LLVMValueRef mask, i32_c8, i32_c128, i32_c255;
215
216 /* wrap with normalized floats is just fract */
217 coord_f = lp_build_fract(coord_bld, coord_f);
218 /* mul by size */
219 coord_f = lp_build_mul(coord_bld, coord_f, length_f);
220 /* convert to int, compute lerp weight */
221 coord_f = lp_build_mul_imm(&bld->coord_bld, coord_f, 256);
222
223 /* At this point we don't have any negative numbers so use non-signed
224 * build context which might help on some archs.
225 */
226 abs_type = coord_bld->type;
227 abs_type.sign = 0;
228 lp_build_context_init(&abs_coord_bld, bld->gallivm, abs_type);
229 *coord0_i = lp_build_iround(&abs_coord_bld, coord_f);
230
231 /* subtract 0.5 (add -128) */
232 i32_c128 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, -128);
233 *coord0_i = LLVMBuildAdd(bld->gallivm->builder, *coord0_i, i32_c128, "");
234
235 /* compute fractional part (AND with 0xff) */
236 i32_c255 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 255);
237 *weight_i = LLVMBuildAnd(bld->gallivm->builder, *coord0_i, i32_c255, "");
238
239 /* compute floor (shift right 8) */
240 i32_c8 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 8);
241 *coord0_i = LLVMBuildAShr(bld->gallivm->builder, *coord0_i, i32_c8, "");
242 /*
243 * we avoided the 0.5/length division before the repeat wrap,
244 * now need to fix up edge cases with selects
245 */
246 mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
247 PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero);
248 *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
249 /*
250 * We should never get values too large - except if coord was nan or inf,
251 * in which case things go terribly wrong...
252 * Alternatively, could use fract_safe above...
253 */
254 *coord0_i = lp_build_min(int_coord_bld, *coord0_i, length_minus_one);
255 }
256
257
258 /**
259 * Build LLVM code for texture coord wrapping, for linear filtering,
260 * for scaled integer texcoords.
261 * \param block_length is the length of the pixel block along the
262 * coordinate axis
263 * \param coord0 the incoming texcoord (s,t or r) scaled to the texture size
264 * \param coord_f the incoming texcoord (s,t or r) as float vec
265 * \param length the texture size along one dimension
266 * \param stride pixel stride along the coordinate axis (in bytes)
267 * \param offset the texel offset along the coord axis
268 * \param is_pot if TRUE, length is a power of two
269 * \param wrap_mode one of PIPE_TEX_WRAP_x
270 * \param offset0 resulting relative offset for coord0
271 * \param offset1 resulting relative offset for coord0 + 1
272 * \param i0 resulting sub-block pixel coordinate for coord0
273 * \param i1 resulting sub-block pixel coordinate for coord0 + 1
274 */
275 static void
276 lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
277 unsigned block_length,
278 LLVMValueRef coord0,
279 LLVMValueRef *weight_i,
280 LLVMValueRef coord_f,
281 LLVMValueRef length,
282 LLVMValueRef stride,
283 LLVMValueRef offset,
284 boolean is_pot,
285 unsigned wrap_mode,
286 LLVMValueRef *offset0,
287 LLVMValueRef *offset1,
288 LLVMValueRef *i0,
289 LLVMValueRef *i1)
290 {
291 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
292 LLVMBuilderRef builder = bld->gallivm->builder;
293 LLVMValueRef length_minus_one;
294 LLVMValueRef lmask, umask, mask;
295
296 /*
297 * If the pixel block covers more than one pixel then there is no easy
298 * way to calculate offset1 relative to offset0. Instead, compute them
299 * independently. Otherwise, try to compute offset0 and offset1 with
300 * a single stride multiplication.
301 */
302
303 length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
304
305 if (block_length != 1) {
306 LLVMValueRef coord1;
307 switch(wrap_mode) {
308 case PIPE_TEX_WRAP_REPEAT:
309 if (is_pot) {
310 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
311 coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
312 coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
313 }
314 else {
315 LLVMValueRef mask;
316 LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
317 if (offset) {
318 offset = lp_build_int_to_float(&bld->coord_bld, offset);
319 offset = lp_build_div(&bld->coord_bld, offset, length_f);
320 coord_f = lp_build_add(&bld->coord_bld, coord_f, offset);
321 }
322 lp_build_coord_repeat_npot_linear_int(bld, coord_f,
323 length, length_f,
324 &coord0, weight_i);
325 mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
326 PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
327 coord1 = LLVMBuildAnd(builder,
328 lp_build_add(int_coord_bld, coord0,
329 int_coord_bld->one),
330 mask, "");
331 }
332 break;
333
334 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
335 coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
336 coord0 = lp_build_clamp(int_coord_bld, coord0, int_coord_bld->zero,
337 length_minus_one);
338 coord1 = lp_build_clamp(int_coord_bld, coord1, int_coord_bld->zero,
339 length_minus_one);
340 break;
341
342 case PIPE_TEX_WRAP_CLAMP:
343 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
344 case PIPE_TEX_WRAP_MIRROR_REPEAT:
345 case PIPE_TEX_WRAP_MIRROR_CLAMP:
346 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
347 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
348 default:
349 assert(0);
350 coord0 = int_coord_bld->zero;
351 coord1 = int_coord_bld->zero;
352 break;
353 }
354 lp_build_sample_partial_offset(int_coord_bld, block_length, coord0, stride,
355 offset0, i0);
356 lp_build_sample_partial_offset(int_coord_bld, block_length, coord1, stride,
357 offset1, i1);
358 return;
359 }
360
361 *i0 = int_coord_bld->zero;
362 *i1 = int_coord_bld->zero;
363
364 switch(wrap_mode) {
365 case PIPE_TEX_WRAP_REPEAT:
366 if (is_pot) {
367 coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
368 }
369 else {
370 LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
371 if (offset) {
372 offset = lp_build_int_to_float(&bld->coord_bld, offset);
373 offset = lp_build_div(&bld->coord_bld, offset, length_f);
374 coord_f = lp_build_add(&bld->coord_bld, coord_f, offset);
375 }
376 lp_build_coord_repeat_npot_linear_int(bld, coord_f,
377 length, length_f,
378 &coord0, weight_i);
379 }
380
381 mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
382 PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
383
384 *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
385 *offset1 = LLVMBuildAnd(builder,
386 lp_build_add(int_coord_bld, *offset0, stride),
387 mask, "");
388 break;
389
390 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
391 /* XXX this might be slower than the separate path
392 * on some newer cpus. With sse41 this is 8 instructions vs. 7
393 * - at least on SNB this is almost certainly slower since
394 * min/max are cheaper than selects, and the muls aren't bad.
395 */
396 lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
397 PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
398 umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
399 PIPE_FUNC_LESS, coord0, length_minus_one);
400
401 coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
402 coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);
403
404 mask = LLVMBuildAnd(builder, lmask, umask, "");
405
406 *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
407 *offset1 = lp_build_add(int_coord_bld,
408 *offset0,
409 LLVMBuildAnd(builder, stride, mask, ""));
410 break;
411
412 case PIPE_TEX_WRAP_CLAMP:
413 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
414 case PIPE_TEX_WRAP_MIRROR_REPEAT:
415 case PIPE_TEX_WRAP_MIRROR_CLAMP:
416 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
417 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
418 default:
419 assert(0);
420 *offset0 = int_coord_bld->zero;
421 *offset1 = int_coord_bld->zero;
422 break;
423 }
424 }
425
426
427 /**
428 * Build LLVM code for texture coord wrapping, for linear filtering,
429 * for float texcoords.
430 * \param block_length is the length of the pixel block along the
431 * coordinate axis
432 * \param coord the incoming texcoord (s,t or r)
433 * \param length the texture size along one dimension
434 * \param offset the texel offset along the coord axis
435 * \param is_pot if TRUE, length is a power of two
436 * \param wrap_mode one of PIPE_TEX_WRAP_x
437 * \param coord0 the first texcoord after wrapping, as int
438 * \param coord1 the second texcoord after wrapping, as int
439 * \param weight the filter weight as int (0-255)
440 * \param force_nearest if this coord actually uses nearest filtering
441 */
442 static void
443 lp_build_sample_wrap_linear_float(struct lp_build_sample_context *bld,
444 unsigned block_length,
445 LLVMValueRef coord,
446 LLVMValueRef length,
447 LLVMValueRef offset,
448 boolean is_pot,
449 unsigned wrap_mode,
450 LLVMValueRef *coord0,
451 LLVMValueRef *coord1,
452 LLVMValueRef *weight,
453 unsigned force_nearest)
454 {
455 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
456 struct lp_build_context *coord_bld = &bld->coord_bld;
457 LLVMBuilderRef builder = bld->gallivm->builder;
458 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
459 LLVMValueRef length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
460
461 switch(wrap_mode) {
462 case PIPE_TEX_WRAP_REPEAT:
463 if (is_pot) {
464 /* mul by size and subtract 0.5 */
465 coord = lp_build_mul(coord_bld, coord, length);
466 if (offset) {
467 offset = lp_build_int_to_float(coord_bld, offset);
468 coord = lp_build_add(coord_bld, coord, offset);
469 }
470 if (!force_nearest)
471 coord = lp_build_sub(coord_bld, coord, half);
472 *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
473 /* convert to int, compute lerp weight */
474 lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
475 *coord1 = lp_build_ifloor(coord_bld, *coord1);
476 /* repeat wrap */
477 length_minus_one = lp_build_itrunc(coord_bld, length_minus_one);
478 *coord0 = LLVMBuildAnd(builder, *coord0, length_minus_one, "");
479 *coord1 = LLVMBuildAnd(builder, *coord1, length_minus_one, "");
480 }
481 else {
482 LLVMValueRef mask;
483 if (offset) {
484 offset = lp_build_int_to_float(coord_bld, offset);
485 offset = lp_build_div(coord_bld, offset, length);
486 coord = lp_build_add(coord_bld, coord, offset);
487 }
488 /* wrap with normalized floats is just fract */
489 coord = lp_build_fract(coord_bld, coord);
490 /* unnormalize */
491 coord = lp_build_mul(coord_bld, coord, length);
492 /*
493 * we avoided the 0.5/length division, have to fix up wrong
494 * edge cases with selects
495 */
496 *coord1 = lp_build_add(coord_bld, coord, half);
497 coord = lp_build_sub(coord_bld, coord, half);
498 *weight = lp_build_fract(coord_bld, coord);
499 /*
500 * It is important for this comparison to be unordered
501 * (or need fract_safe above).
502 */
503 mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
504 PIPE_FUNC_LESS, coord, coord_bld->zero);
505 *coord0 = lp_build_select(coord_bld, mask, length_minus_one, coord);
506 *coord0 = lp_build_itrunc(coord_bld, *coord0);
507 mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
508 PIPE_FUNC_LESS, *coord1, length);
509 *coord1 = lp_build_select(coord_bld, mask, *coord1, coord_bld->zero);
510 *coord1 = lp_build_itrunc(coord_bld, *coord1);
511 }
512 break;
513 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
514 if (bld->static_sampler_state->normalized_coords) {
515 /* mul by tex size */
516 coord = lp_build_mul(coord_bld, coord, length);
517 }
518 if (offset) {
519 offset = lp_build_int_to_float(coord_bld, offset);
520 coord = lp_build_add(coord_bld, coord, offset);
521 }
522 /* subtract 0.5 */
523 if (!force_nearest) {
524 coord = lp_build_sub(coord_bld, coord, half);
525 }
526 /* clamp to [0, length - 1] */
527 coord = lp_build_min_ext(coord_bld, coord, length_minus_one,
528 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
529 coord = lp_build_max(coord_bld, coord, coord_bld->zero);
530 *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
531 /* convert to int, compute lerp weight */
532 lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
533 /* coord1 = min(coord1, length-1) */
534 *coord1 = lp_build_min(coord_bld, *coord1, length_minus_one);
535 *coord1 = lp_build_itrunc(coord_bld, *coord1);
536 break;
537 default:
538 assert(0);
539 *coord0 = int_coord_bld->zero;
540 *coord1 = int_coord_bld->zero;
541 *weight = coord_bld->zero;
542 break;
543 }
544 *weight = lp_build_mul_imm(coord_bld, *weight, 256);
545 *weight = lp_build_itrunc(coord_bld, *weight);
546 return;
547 }
548
549
550 /**
551 * Fetch texels for image with nearest sampling.
552 * Return filtered color as two vectors of 16-bit fixed point values.
553 */
554 static void
555 lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
556 LLVMValueRef data_ptr,
557 LLVMValueRef offset,
558 LLVMValueRef x_subcoord,
559 LLVMValueRef y_subcoord,
560 LLVMValueRef *colors)
561 {
562 /*
563 * Fetch the pixels as 4 x 32bit (rgba order might differ):
564 *
565 * rgba0 rgba1 rgba2 rgba3
566 *
567 * bit cast them into 16 x u8
568 *
569 * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
570 *
571 * unpack them into two 8 x i16:
572 *
573 * r0 g0 b0 a0 r1 g1 b1 a1
574 * r2 g2 b2 a2 r3 g3 b3 a3
575 *
576 * The higher 8 bits of the resulting elements will be zero.
577 */
578 LLVMBuilderRef builder = bld->gallivm->builder;
579 LLVMValueRef rgba8;
580 struct lp_build_context u8n;
581 LLVMTypeRef u8n_vec_type;
582 struct lp_type fetch_type;
583
584 lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
585 u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
586
587 fetch_type = lp_type_uint(bld->texel_type.width);
588 if (util_format_is_rgba8_variant(bld->format_desc)) {
589 /*
590 * Given the format is a rgba8, just read the pixels as is,
591 * without any swizzling. Swizzling will be done later.
592 */
593 rgba8 = lp_build_gather(bld->gallivm,
594 bld->texel_type.length,
595 bld->format_desc->block.bits,
596 fetch_type,
597 TRUE,
598 data_ptr, offset, TRUE);
599
600 rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
601 }
602 else {
603 rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
604 bld->format_desc,
605 u8n.type,
606 TRUE,
607 data_ptr, offset,
608 x_subcoord,
609 y_subcoord,
610 bld->cache);
611 }
612
613 *colors = rgba8;
614 }
615
616
617 /**
618 * Sample a single texture image with nearest sampling.
619 * If sampling a cube texture, r = cube face in [0,5].
620 * Return filtered color as two vectors of 16-bit fixed point values.
621 */
622 static void
623 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
624 LLVMValueRef int_size,
625 LLVMValueRef row_stride_vec,
626 LLVMValueRef img_stride_vec,
627 LLVMValueRef data_ptr,
628 LLVMValueRef mipoffsets,
629 LLVMValueRef s,
630 LLVMValueRef t,
631 LLVMValueRef r,
632 const LLVMValueRef *offsets,
633 LLVMValueRef *colors)
634 {
635 const unsigned dims = bld->dims;
636 struct lp_build_context i32;
637 LLVMValueRef width_vec, height_vec, depth_vec;
638 LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL;
639 LLVMValueRef s_float, t_float = NULL, r_float = NULL;
640 LLVMValueRef x_stride;
641 LLVMValueRef x_offset, offset;
642 LLVMValueRef x_subcoord, y_subcoord, z_subcoord;
643
644 lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));
645
646 lp_build_extract_image_sizes(bld,
647 &bld->int_size_bld,
648 bld->int_coord_type,
649 int_size,
650 &width_vec,
651 &height_vec,
652 &depth_vec);
653
654 s_float = s; t_float = t; r_float = r;
655
656 if (bld->static_sampler_state->normalized_coords) {
657 LLVMValueRef flt_size;
658
659 flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
660
661 lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
662 }
663
664 /* convert float to int */
665 /* For correct rounding, need floor, not truncation here.
666 * Note that in some cases (clamp to edge, no texel offsets) we
667 * could use a non-signed build context which would help archs
668 * greatly which don't have arch rounding.
669 */
670 s_ipart = lp_build_ifloor(&bld->coord_bld, s);
671 if (dims >= 2)
672 t_ipart = lp_build_ifloor(&bld->coord_bld, t);
673 if (dims >= 3)
674 r_ipart = lp_build_ifloor(&bld->coord_bld, r);
675
676 /* add texel offsets */
677 if (offsets[0]) {
678 s_ipart = lp_build_add(&i32, s_ipart, offsets[0]);
679 if (dims >= 2) {
680 t_ipart = lp_build_add(&i32, t_ipart, offsets[1]);
681 if (dims >= 3) {
682 r_ipart = lp_build_add(&i32, r_ipart, offsets[2]);
683 }
684 }
685 }
686
687 /* get pixel, row, image strides */
688 x_stride = lp_build_const_vec(bld->gallivm,
689 bld->int_coord_bld.type,
690 bld->format_desc->block.bits/8);
691
692 /* Do texcoord wrapping, compute texel offset */
693 lp_build_sample_wrap_nearest_int(bld,
694 bld->format_desc->block.width,
695 s_ipart, s_float,
696 width_vec, x_stride, offsets[0],
697 bld->static_texture_state->pot_width,
698 bld->static_sampler_state->wrap_s,
699 &x_offset, &x_subcoord);
700 offset = x_offset;
701 if (dims >= 2) {
702 LLVMValueRef y_offset;
703 lp_build_sample_wrap_nearest_int(bld,
704 bld->format_desc->block.height,
705 t_ipart, t_float,
706 height_vec, row_stride_vec, offsets[1],
707 bld->static_texture_state->pot_height,
708 bld->static_sampler_state->wrap_t,
709 &y_offset, &y_subcoord);
710 offset = lp_build_add(&bld->int_coord_bld, offset, y_offset);
711 if (dims >= 3) {
712 LLVMValueRef z_offset;
713 lp_build_sample_wrap_nearest_int(bld,
714 1, /* block length (depth) */
715 r_ipart, r_float,
716 depth_vec, img_stride_vec, offsets[2],
717 bld->static_texture_state->pot_depth,
718 bld->static_sampler_state->wrap_r,
719 &z_offset, &z_subcoord);
720 offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
721 }
722 }
723 if (has_layer_coord(bld->static_texture_state->target)) {
724 LLVMValueRef z_offset;
725 /* The r coord is the cube face in [0,5] or array layer */
726 z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
727 offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
728 }
729 if (mipoffsets) {
730 offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
731 }
732
733 lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
734 x_subcoord, y_subcoord,
735 colors);
736 }
737
738
739 /**
740 * Sample a single texture image with nearest sampling.
741 * If sampling a cube texture, r = cube face in [0,5].
742 * Return filtered color as two vectors of 16-bit fixed point values.
743 * Does address calcs (except offsets) with floats.
744 * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
745 */
746 static void
747 lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld,
748 LLVMValueRef int_size,
749 LLVMValueRef row_stride_vec,
750 LLVMValueRef img_stride_vec,
751 LLVMValueRef data_ptr,
752 LLVMValueRef mipoffsets,
753 LLVMValueRef s,
754 LLVMValueRef t,
755 LLVMValueRef r,
756 const LLVMValueRef *offsets,
757 LLVMValueRef *colors)
758 {
759 const unsigned dims = bld->dims;
760 LLVMValueRef width_vec, height_vec, depth_vec;
761 LLVMValueRef offset;
762 LLVMValueRef x_subcoord, y_subcoord;
763 LLVMValueRef x_icoord = NULL, y_icoord = NULL, z_icoord = NULL;
764 LLVMValueRef flt_size;
765
766 flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
767
768 lp_build_extract_image_sizes(bld,
769 &bld->float_size_bld,
770 bld->coord_type,
771 flt_size,
772 &width_vec,
773 &height_vec,
774 &depth_vec);
775
776 /* Do texcoord wrapping */
777 lp_build_sample_wrap_nearest_float(bld,
778 s, width_vec, offsets[0],
779 bld->static_texture_state->pot_width,
780 bld->static_sampler_state->wrap_s,
781 &x_icoord);
782
783 if (dims >= 2) {
784 lp_build_sample_wrap_nearest_float(bld,
785 t, height_vec, offsets[1],
786 bld->static_texture_state->pot_height,
787 bld->static_sampler_state->wrap_t,
788 &y_icoord);
789
790 if (dims >= 3) {
791 lp_build_sample_wrap_nearest_float(bld,
792 r, depth_vec, offsets[2],
793 bld->static_texture_state->pot_depth,
794 bld->static_sampler_state->wrap_r,
795 &z_icoord);
796 }
797 }
798 if (has_layer_coord(bld->static_texture_state->target)) {
799 z_icoord = r;
800 }
801
802 /*
803 * From here on we deal with ints, and we should split up the 256bit
804 * vectors manually for better generated code.
805 */
806
807 /*
808 * compute texel offsets -
809 * cannot do offset calc with floats, difficult for block-based formats,
810 * and not enough precision anyway.
811 */
812 lp_build_sample_offset(&bld->int_coord_bld,
813 bld->format_desc,
814 x_icoord, y_icoord,
815 z_icoord,
816 row_stride_vec, img_stride_vec,
817 &offset,
818 &x_subcoord, &y_subcoord);
819 if (mipoffsets) {
820 offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
821 }
822
823 lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
824 x_subcoord, y_subcoord,
825 colors);
826 }
827
828
829 /**
830 * Fetch texels for image with linear sampling.
831 * Return filtered color as two vectors of 16-bit fixed point values.
832 */
833 static void
834 lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
835 LLVMValueRef data_ptr,
836 LLVMValueRef offset[2][2][2],
837 LLVMValueRef x_subcoord[2],
838 LLVMValueRef y_subcoord[2],
839 LLVMValueRef s_fpart,
840 LLVMValueRef t_fpart,
841 LLVMValueRef r_fpart,
842 LLVMValueRef *colors)
843 {
844 const unsigned dims = bld->dims;
845 LLVMBuilderRef builder = bld->gallivm->builder;
846 struct lp_build_context u8n;
847 LLVMTypeRef u8n_vec_type;
848 LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
849 LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
850 LLVMValueRef shuffle;
851 LLVMValueRef neighbors[2][2][2]; /* [z][y][x] */
852 LLVMValueRef packed;
853 unsigned i, j, k;
854 unsigned numj, numk;
855
856 lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
857 u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);
858
859 /*
860 * Transform 4 x i32 in
861 *
862 * s_fpart = {s0, s1, s2, s3}
863 *
864 * where each value is between 0 and 0xff,
865 *
866 * into one 16 x i20
867 *
868 * s_fpart = {s0, s0, s0, s0, s1, s1, s1, s1, s2, s2, s2, s2, s3, s3, s3, s3}
869 *
870 * and likewise for t_fpart. There is no risk of loosing precision here
871 * since the fractional parts only use the lower 8bits.
872 */
873 s_fpart = LLVMBuildBitCast(builder, s_fpart, u8n_vec_type, "");
874 if (dims >= 2)
875 t_fpart = LLVMBuildBitCast(builder, t_fpart, u8n_vec_type, "");
876 if (dims >= 3)
877 r_fpart = LLVMBuildBitCast(builder, r_fpart, u8n_vec_type, "");
878
879 for (j = 0; j < u8n.type.length; j += 4) {
880 #ifdef PIPE_ARCH_LITTLE_ENDIAN
881 unsigned subindex = 0;
882 #else
883 unsigned subindex = 3;
884 #endif
885 LLVMValueRef index;
886
887 index = LLVMConstInt(elem_type, j + subindex, 0);
888 for (i = 0; i < 4; ++i)
889 shuffles[j + i] = index;
890 }
891
892 shuffle = LLVMConstVector(shuffles, u8n.type.length);
893
894 s_fpart = LLVMBuildShuffleVector(builder, s_fpart, u8n.undef,
895 shuffle, "");
896 if (dims >= 2) {
897 t_fpart = LLVMBuildShuffleVector(builder, t_fpart, u8n.undef,
898 shuffle, "");
899 }
900 if (dims >= 3) {
901 r_fpart = LLVMBuildShuffleVector(builder, r_fpart, u8n.undef,
902 shuffle, "");
903 }
904
905 /*
906 * Fetch the pixels as 4 x 32bit (rgba order might differ):
907 *
908 * rgba0 rgba1 rgba2 rgba3
909 *
910 * bit cast them into 16 x u8
911 *
912 * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
913 *
914 * unpack them into two 8 x i16:
915 *
916 * r0 g0 b0 a0 r1 g1 b1 a1
917 * r2 g2 b2 a2 r3 g3 b3 a3
918 *
919 * The higher 8 bits of the resulting elements will be zero.
920 */
921 numj = 1 + (dims >= 2);
922 numk = 1 + (dims >= 3);
923
924 for (k = 0; k < numk; k++) {
925 for (j = 0; j < numj; j++) {
926 for (i = 0; i < 2; i++) {
927 LLVMValueRef rgba8;
928
929 if (util_format_is_rgba8_variant(bld->format_desc)) {
930 struct lp_type fetch_type;
931 /*
932 * Given the format is a rgba8, just read the pixels as is,
933 * without any swizzling. Swizzling will be done later.
934 */
935 fetch_type = lp_type_uint(bld->texel_type.width);
936 rgba8 = lp_build_gather(bld->gallivm,
937 bld->texel_type.length,
938 bld->format_desc->block.bits,
939 fetch_type,
940 TRUE,
941 data_ptr, offset[k][j][i], TRUE);
942
943 rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
944 }
945 else {
946 rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
947 bld->format_desc,
948 u8n.type,
949 TRUE,
950 data_ptr, offset[k][j][i],
951 x_subcoord[i],
952 y_subcoord[j],
953 bld->cache);
954 }
955
956 neighbors[k][j][i] = rgba8;
957 }
958 }
959 }
960
961 /*
962 * Linear interpolation with 8.8 fixed point.
963 */
964 if (bld->static_sampler_state->force_nearest_s) {
965 /* special case 1-D lerp */
966 packed = lp_build_lerp(&u8n,
967 t_fpart,
968 neighbors[0][0][0],
969 neighbors[0][0][1],
970 LP_BLD_LERP_PRESCALED_WEIGHTS);
971 }
972 else if (bld->static_sampler_state->force_nearest_t) {
973 /* special case 1-D lerp */
974 packed = lp_build_lerp(&u8n,
975 s_fpart,
976 neighbors[0][0][0],
977 neighbors[0][0][1],
978 LP_BLD_LERP_PRESCALED_WEIGHTS);
979 }
980 else {
981 /* general 1/2/3-D lerping */
982 if (dims == 1) {
983 packed = lp_build_lerp(&u8n,
984 s_fpart,
985 neighbors[0][0][0],
986 neighbors[0][0][1],
987 LP_BLD_LERP_PRESCALED_WEIGHTS);
988 } else if (dims == 2) {
989 /* 2-D lerp */
990 packed = lp_build_lerp_2d(&u8n,
991 s_fpart, t_fpart,
992 neighbors[0][0][0],
993 neighbors[0][0][1],
994 neighbors[0][1][0],
995 neighbors[0][1][1],
996 LP_BLD_LERP_PRESCALED_WEIGHTS);
997 } else {
998 /* 3-D lerp */
999 assert(dims == 3);
1000 packed = lp_build_lerp_3d(&u8n,
1001 s_fpart, t_fpart, r_fpart,
1002 neighbors[0][0][0],
1003 neighbors[0][0][1],
1004 neighbors[0][1][0],
1005 neighbors[0][1][1],
1006 neighbors[1][0][0],
1007 neighbors[1][0][1],
1008 neighbors[1][1][0],
1009 neighbors[1][1][1],
1010 LP_BLD_LERP_PRESCALED_WEIGHTS);
1011 }
1012 }
1013
1014 *colors = packed;
1015 }
1016
1017 /**
1018 * Sample a single texture image with (bi-)(tri-)linear sampling.
1019 * Return filtered color as two vectors of 16-bit fixed point values.
1020 */
1021 static void
1022 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
1023 LLVMValueRef int_size,
1024 LLVMValueRef row_stride_vec,
1025 LLVMValueRef img_stride_vec,
1026 LLVMValueRef data_ptr,
1027 LLVMValueRef mipoffsets,
1028 LLVMValueRef s,
1029 LLVMValueRef t,
1030 LLVMValueRef r,
1031 const LLVMValueRef *offsets,
1032 LLVMValueRef *colors)
1033 {
1034 const unsigned dims = bld->dims;
1035 LLVMBuilderRef builder = bld->gallivm->builder;
1036 struct lp_build_context i32;
1037 LLVMValueRef i32_c8, i32_c128, i32_c255;
1038 LLVMValueRef width_vec, height_vec, depth_vec;
1039 LLVMValueRef s_ipart, s_fpart, s_float;
1040 LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_float = NULL;
1041 LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_float = NULL;
1042 LLVMValueRef x_stride, y_stride, z_stride;
1043 LLVMValueRef x_offset0, x_offset1;
1044 LLVMValueRef y_offset0, y_offset1;
1045 LLVMValueRef z_offset0, z_offset1;
1046 LLVMValueRef offset[2][2][2]; /* [z][y][x] */
1047 LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
1048 unsigned x, y, z;
1049
1050 lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));
1051
1052 lp_build_extract_image_sizes(bld,
1053 &bld->int_size_bld,
1054 bld->int_coord_type,
1055 int_size,
1056 &width_vec,
1057 &height_vec,
1058 &depth_vec);
1059
1060 s_float = s; t_float = t; r_float = r;
1061
1062 if (bld->static_sampler_state->normalized_coords) {
1063 LLVMValueRef scaled_size;
1064 LLVMValueRef flt_size;
1065
1066 /* scale size by 256 (8 fractional bits) */
1067 scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8);
1068
1069 flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size);
1070
1071 lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
1072 }
1073 else {
1074 /* scale coords by 256 (8 fractional bits) */
1075 s = lp_build_mul_imm(&bld->coord_bld, s, 256);
1076 if (dims >= 2)
1077 t = lp_build_mul_imm(&bld->coord_bld, t, 256);
1078 if (dims >= 3)
1079 r = lp_build_mul_imm(&bld->coord_bld, r, 256);
1080 }
1081
1082 /* convert float to int */
1083 /* For correct rounding, need round to nearest, not truncation here.
1084 * Note that in some cases (clamp to edge, no texel offsets) we
1085 * could use a non-signed build context which would help archs which
1086 * don't have fptosi intrinsic with nearest rounding implemented.
1087 */
1088 s = lp_build_iround(&bld->coord_bld, s);
1089 if (dims >= 2)
1090 t = lp_build_iround(&bld->coord_bld, t);
1091 if (dims >= 3)
1092 r = lp_build_iround(&bld->coord_bld, r);
1093
1094 /* subtract 0.5 (add -128) */
1095 i32_c128 = lp_build_const_int_vec(bld->gallivm, i32.type, -128);
1096 if (!bld->static_sampler_state->force_nearest_s) {
1097 s = LLVMBuildAdd(builder, s, i32_c128, "");
1098 }
1099 if (dims >= 2 && !bld->static_sampler_state->force_nearest_t) {
1100 t = LLVMBuildAdd(builder, t, i32_c128, "");
1101 }
1102 if (dims >= 3) {
1103 r = LLVMBuildAdd(builder, r, i32_c128, "");
1104 }
1105
1106 /* compute floor (shift right 8) */
1107 i32_c8 = lp_build_const_int_vec(bld->gallivm, i32.type, 8);
1108 s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
1109 if (dims >= 2)
1110 t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
1111 if (dims >= 3)
1112 r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");
1113
1114 /* add texel offsets */
1115 if (offsets[0]) {
1116 s_ipart = lp_build_add(&i32, s_ipart, offsets[0]);
1117 if (dims >= 2) {
1118 t_ipart = lp_build_add(&i32, t_ipart, offsets[1]);
1119 if (dims >= 3) {
1120 r_ipart = lp_build_add(&i32, r_ipart, offsets[2]);
1121 }
1122 }
1123 }
1124
1125 /* compute fractional part (AND with 0xff) */
1126 i32_c255 = lp_build_const_int_vec(bld->gallivm, i32.type, 255);
1127 s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
1128 if (dims >= 2)
1129 t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
1130 if (dims >= 3)
1131 r_fpart = LLVMBuildAnd(builder, r, i32_c255, "");
1132
1133 /* get pixel, row and image strides */
1134 x_stride = lp_build_const_vec(bld->gallivm, bld->int_coord_bld.type,
1135 bld->format_desc->block.bits/8);
1136 y_stride = row_stride_vec;
1137 z_stride = img_stride_vec;
1138
1139 /* do texcoord wrapping and compute texel offsets */
1140 lp_build_sample_wrap_linear_int(bld,
1141 bld->format_desc->block.width,
1142 s_ipart, &s_fpart, s_float,
1143 width_vec, x_stride, offsets[0],
1144 bld->static_texture_state->pot_width,
1145 bld->static_sampler_state->wrap_s,
1146 &x_offset0, &x_offset1,
1147 &x_subcoord[0], &x_subcoord[1]);
1148
1149 /* add potential cube/array/mip offsets now as they are constant per pixel */
1150 if (has_layer_coord(bld->static_texture_state->target)) {
1151 LLVMValueRef z_offset;
1152 z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
1153 /* The r coord is the cube face in [0,5] or array layer */
1154 x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, z_offset);
1155 x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, z_offset);
1156 }
1157 if (mipoffsets) {
1158 x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, mipoffsets);
1159 x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, mipoffsets);
1160 }
1161
1162 for (z = 0; z < 2; z++) {
1163 for (y = 0; y < 2; y++) {
1164 offset[z][y][0] = x_offset0;
1165 offset[z][y][1] = x_offset1;
1166 }
1167 }
1168
1169 if (dims >= 2) {
1170 lp_build_sample_wrap_linear_int(bld,
1171 bld->format_desc->block.height,
1172 t_ipart, &t_fpart, t_float,
1173 height_vec, y_stride, offsets[1],
1174 bld->static_texture_state->pot_height,
1175 bld->static_sampler_state->wrap_t,
1176 &y_offset0, &y_offset1,
1177 &y_subcoord[0], &y_subcoord[1]);
1178
1179 for (z = 0; z < 2; z++) {
1180 for (x = 0; x < 2; x++) {
1181 offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
1182 offset[z][0][x], y_offset0);
1183 offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
1184 offset[z][1][x], y_offset1);
1185 }
1186 }
1187 }
1188
1189 if (dims >= 3) {
1190 lp_build_sample_wrap_linear_int(bld,
1191 1, /* block length (depth) */
1192 r_ipart, &r_fpart, r_float,
1193 depth_vec, z_stride, offsets[2],
1194 bld->static_texture_state->pot_depth,
1195 bld->static_sampler_state->wrap_r,
1196 &z_offset0, &z_offset1,
1197 &z_subcoord[0], &z_subcoord[1]);
1198 for (y = 0; y < 2; y++) {
1199 for (x = 0; x < 2; x++) {
1200 offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
1201 offset[0][y][x], z_offset0);
1202 offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
1203 offset[1][y][x], z_offset1);
1204 }
1205 }
1206 }
1207
1208 lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
1209 x_subcoord, y_subcoord,
1210 s_fpart, t_fpart, r_fpart,
1211 colors);
1212 }
1213
1214
1215 /**
1216 * Sample a single texture image with (bi-)(tri-)linear sampling.
1217 * Return filtered color as two vectors of 16-bit fixed point values.
1218 * Does address calcs (except offsets) with floats.
1219 * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
1220 */
1221 static void
1222 lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld,
1223 LLVMValueRef int_size,
1224 LLVMValueRef row_stride_vec,
1225 LLVMValueRef img_stride_vec,
1226 LLVMValueRef data_ptr,
1227 LLVMValueRef mipoffsets,
1228 LLVMValueRef s,
1229 LLVMValueRef t,
1230 LLVMValueRef r,
1231 const LLVMValueRef *offsets,
1232 LLVMValueRef *colors)
1233 {
1234 const unsigned dims = bld->dims;
1235 LLVMValueRef width_vec, height_vec, depth_vec;
1236 LLVMValueRef s_fpart;
1237 LLVMValueRef t_fpart = NULL;
1238 LLVMValueRef r_fpart = NULL;
1239 LLVMValueRef x_stride, y_stride, z_stride;
1240 LLVMValueRef x_offset0, x_offset1;
1241 LLVMValueRef y_offset0, y_offset1;
1242 LLVMValueRef z_offset0, z_offset1;
1243 LLVMValueRef offset[2][2][2]; /* [z][y][x] */
1244 LLVMValueRef x_subcoord[2], y_subcoord[2];
1245 LLVMValueRef flt_size;
1246 LLVMValueRef x_icoord0, x_icoord1;
1247 LLVMValueRef y_icoord0, y_icoord1;
1248 LLVMValueRef z_icoord0, z_icoord1;
1249 unsigned x, y, z;
1250
1251 flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);
1252
1253 lp_build_extract_image_sizes(bld,
1254 &bld->float_size_bld,
1255 bld->coord_type,
1256 flt_size,
1257 &width_vec,
1258 &height_vec,
1259 &depth_vec);
1260
1261 /* do texcoord wrapping and compute texel offsets */
1262 lp_build_sample_wrap_linear_float(bld,
1263 bld->format_desc->block.width,
1264 s, width_vec, offsets[0],
1265 bld->static_texture_state->pot_width,
1266 bld->static_sampler_state->wrap_s,
1267 &x_icoord0, &x_icoord1,
1268 &s_fpart,
1269 bld->static_sampler_state->force_nearest_s);
1270
1271 if (dims >= 2) {
1272 lp_build_sample_wrap_linear_float(bld,
1273 bld->format_desc->block.height,
1274 t, height_vec, offsets[1],
1275 bld->static_texture_state->pot_height,
1276 bld->static_sampler_state->wrap_t,
1277 &y_icoord0, &y_icoord1,
1278 &t_fpart,
1279 bld->static_sampler_state->force_nearest_t);
1280
1281 if (dims >= 3) {
1282 lp_build_sample_wrap_linear_float(bld,
1283 1, /* block length (depth) */
1284 r, depth_vec, offsets[2],
1285 bld->static_texture_state->pot_depth,
1286 bld->static_sampler_state->wrap_r,
1287 &z_icoord0, &z_icoord1,
1288 &r_fpart, 0);
1289 }
1290 }
1291
1292 /*
1293 * From here on we deal with ints, and we should split up the 256bit
1294 * vectors manually for better generated code.
1295 */
1296
1297 /* get pixel, row and image strides */
1298 x_stride = lp_build_const_vec(bld->gallivm,
1299 bld->int_coord_bld.type,
1300 bld->format_desc->block.bits/8);
1301 y_stride = row_stride_vec;
1302 z_stride = img_stride_vec;
1303
1304 /*
1305 * compute texel offset -
1306 * cannot do offset calc with floats, difficult for block-based formats,
1307 * and not enough precision anyway.
1308 */
1309 lp_build_sample_partial_offset(&bld->int_coord_bld,
1310 bld->format_desc->block.width,
1311 x_icoord0, x_stride,
1312 &x_offset0, &x_subcoord[0]);
1313 lp_build_sample_partial_offset(&bld->int_coord_bld,
1314 bld->format_desc->block.width,
1315 x_icoord1, x_stride,
1316 &x_offset1, &x_subcoord[1]);
1317
1318 /* add potential cube/array/mip offsets now as they are constant per pixel */
1319 if (has_layer_coord(bld->static_texture_state->target)) {
1320 LLVMValueRef z_offset;
1321 z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
1322 /* The r coord is the cube face in [0,5] or array layer */
1323 x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, z_offset);
1324 x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, z_offset);
1325 }
1326 if (mipoffsets) {
1327 x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, mipoffsets);
1328 x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, mipoffsets);
1329 }
1330
1331 for (z = 0; z < 2; z++) {
1332 for (y = 0; y < 2; y++) {
1333 offset[z][y][0] = x_offset0;
1334 offset[z][y][1] = x_offset1;
1335 }
1336 }
1337
1338 if (dims >= 2) {
1339 lp_build_sample_partial_offset(&bld->int_coord_bld,
1340 bld->format_desc->block.height,
1341 y_icoord0, y_stride,
1342 &y_offset0, &y_subcoord[0]);
1343 lp_build_sample_partial_offset(&bld->int_coord_bld,
1344 bld->format_desc->block.height,
1345 y_icoord1, y_stride,
1346 &y_offset1, &y_subcoord[1]);
1347 for (z = 0; z < 2; z++) {
1348 for (x = 0; x < 2; x++) {
1349 offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
1350 offset[z][0][x], y_offset0);
1351 offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
1352 offset[z][1][x], y_offset1);
1353 }
1354 }
1355 }
1356
1357 if (dims >= 3) {
1358 LLVMValueRef z_subcoord[2];
1359 lp_build_sample_partial_offset(&bld->int_coord_bld,
1360 1,
1361 z_icoord0, z_stride,
1362 &z_offset0, &z_subcoord[0]);
1363 lp_build_sample_partial_offset(&bld->int_coord_bld,
1364 1,
1365 z_icoord1, z_stride,
1366 &z_offset1, &z_subcoord[1]);
1367 for (y = 0; y < 2; y++) {
1368 for (x = 0; x < 2; x++) {
1369 offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
1370 offset[0][y][x], z_offset0);
1371 offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
1372 offset[1][y][x], z_offset1);
1373 }
1374 }
1375 }
1376
1377 lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
1378 x_subcoord, y_subcoord,
1379 s_fpart, t_fpart, r_fpart,
1380 colors);
1381 }
1382
1383
1384 /**
1385 * Sample the texture/mipmap using given image filter and mip filter.
1386 * data0_ptr and data1_ptr point to the two mipmap levels to sample
1387 * from. width0/1_vec, height0/1_vec, depth0/1_vec indicate their sizes.
1388 * If we're using nearest miplevel sampling the '1' values will be null/unused.
1389 */
1390 static void
1391 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
1392 unsigned img_filter,
1393 unsigned mip_filter,
1394 LLVMValueRef s,
1395 LLVMValueRef t,
1396 LLVMValueRef r,
1397 const LLVMValueRef *offsets,
1398 LLVMValueRef ilevel0,
1399 LLVMValueRef ilevel1,
1400 LLVMValueRef lod_fpart,
1401 LLVMValueRef colors_var)
1402 {
1403 LLVMBuilderRef builder = bld->gallivm->builder;
1404 LLVMValueRef size0;
1405 LLVMValueRef size1;
1406 LLVMValueRef row_stride0_vec = NULL;
1407 LLVMValueRef row_stride1_vec = NULL;
1408 LLVMValueRef img_stride0_vec = NULL;
1409 LLVMValueRef img_stride1_vec = NULL;
1410 LLVMValueRef data_ptr0;
1411 LLVMValueRef data_ptr1;
1412 LLVMValueRef mipoff0 = NULL;
1413 LLVMValueRef mipoff1 = NULL;
1414 LLVMValueRef colors0;
1415 LLVMValueRef colors1;
1416 boolean use_floats = util_cpu_caps.has_avx &&
1417 !util_cpu_caps.has_avx2 &&
1418 bld->coord_type.length > 4;
1419
1420 /* sample the first mipmap level */
1421 lp_build_mipmap_level_sizes(bld, ilevel0,
1422 &size0,
1423 &row_stride0_vec, &img_stride0_vec);
1424 if (bld->num_mips == 1) {
1425 data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1426 }
1427 else {
1428 /* This path should work for num_lods 1 too but slightly less efficient */
1429 data_ptr0 = bld->base_ptr;
1430 mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1431 }
1432
1433 if (use_floats) {
1434 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1435 lp_build_sample_image_nearest_afloat(bld,
1436 size0,
1437 row_stride0_vec, img_stride0_vec,
1438 data_ptr0, mipoff0, s, t, r, offsets,
1439 &colors0);
1440 }
1441 else {
1442 assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1443 lp_build_sample_image_linear_afloat(bld,
1444 size0,
1445 row_stride0_vec, img_stride0_vec,
1446 data_ptr0, mipoff0, s, t, r, offsets,
1447 &colors0);
1448 }
1449 }
1450 else {
1451 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1452 lp_build_sample_image_nearest(bld,
1453 size0,
1454 row_stride0_vec, img_stride0_vec,
1455 data_ptr0, mipoff0, s, t, r, offsets,
1456 &colors0);
1457 }
1458 else {
1459 assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1460 lp_build_sample_image_linear(bld,
1461 size0,
1462 row_stride0_vec, img_stride0_vec,
1463 data_ptr0, mipoff0, s, t, r, offsets,
1464 &colors0);
1465 }
1466 }
1467
1468 /* Store the first level's colors in the output variables */
1469 LLVMBuildStore(builder, colors0, colors_var);
1470
1471 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1472 LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm,
1473 bld->lodf_bld.type, 256.0);
1474 LLVMTypeRef i32vec_type = bld->lodi_bld.vec_type;
1475 struct lp_build_if_state if_ctx;
1476 LLVMValueRef need_lerp;
1477 unsigned num_quads = bld->coord_bld.type.length / 4;
1478 unsigned i;
1479
1480 lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16vec_scale, "");
1481 lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16");
1482
1483 /* need_lerp = lod_fpart > 0 */
1484 if (bld->num_lods == 1) {
1485 need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
1486 lod_fpart, bld->lodi_bld.zero,
1487 "need_lerp");
1488 }
1489 else {
1490 /*
1491 * We'll do mip filtering if any of the quads need it.
1492 * It might be better to split the vectors here and only fetch/filter
1493 * quads which need it.
1494 */
1495 /*
1496 * We need to clamp lod_fpart here since we can get negative
1497 * values which would screw up filtering if not all
1498 * lod_fpart values have same sign.
1499 * We can however then skip the greater than comparison.
1500 */
1501 lod_fpart = lp_build_max(&bld->lodi_bld, lod_fpart,
1502 bld->lodi_bld.zero);
1503 need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_fpart);
1504 }
1505
1506 lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1507 {
1508 struct lp_build_context u8n_bld;
1509
1510 lp_build_context_init(&u8n_bld, bld->gallivm, lp_type_unorm(8, bld->vector_width));
1511
1512 /* sample the second mipmap level */
1513 lp_build_mipmap_level_sizes(bld, ilevel1,
1514 &size1,
1515 &row_stride1_vec, &img_stride1_vec);
1516 if (bld->num_mips == 1) {
1517 data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1518 }
1519 else {
1520 data_ptr1 = bld->base_ptr;
1521 mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1522 }
1523
1524 if (use_floats) {
1525 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1526 lp_build_sample_image_nearest_afloat(bld,
1527 size1,
1528 row_stride1_vec, img_stride1_vec,
1529 data_ptr1, mipoff1, s, t, r, offsets,
1530 &colors1);
1531 }
1532 else {
1533 lp_build_sample_image_linear_afloat(bld,
1534 size1,
1535 row_stride1_vec, img_stride1_vec,
1536 data_ptr1, mipoff1, s, t, r, offsets,
1537 &colors1);
1538 }
1539 }
1540 else {
1541 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1542 lp_build_sample_image_nearest(bld,
1543 size1,
1544 row_stride1_vec, img_stride1_vec,
1545 data_ptr1, mipoff1, s, t, r, offsets,
1546 &colors1);
1547 }
1548 else {
1549 lp_build_sample_image_linear(bld,
1550 size1,
1551 row_stride1_vec, img_stride1_vec,
1552 data_ptr1, mipoff1, s, t, r, offsets,
1553 &colors1);
1554 }
1555 }
1556
1557 /* interpolate samples from the two mipmap levels */
1558
1559 if (num_quads == 1 && bld->num_lods == 1) {
1560 lod_fpart = LLVMBuildTrunc(builder, lod_fpart, u8n_bld.elem_type, "");
1561 lod_fpart = lp_build_broadcast_scalar(&u8n_bld, lod_fpart);
1562 }
1563 else {
1564 unsigned num_chans_per_lod = 4 * bld->coord_type.length / bld->num_lods;
1565 LLVMTypeRef tmp_vec_type = LLVMVectorType(u8n_bld.elem_type, bld->lodi_bld.type.length);
1566 LLVMValueRef shuffle[LP_MAX_VECTOR_LENGTH];
1567
1568 /* Take the LSB of lod_fpart */
1569 lod_fpart = LLVMBuildTrunc(builder, lod_fpart, tmp_vec_type, "");
1570
1571 /* Broadcast each lod weight into their respective channels */
1572 for (i = 0; i < u8n_bld.type.length; ++i) {
1573 shuffle[i] = lp_build_const_int32(bld->gallivm, i / num_chans_per_lod);
1574 }
1575 lod_fpart = LLVMBuildShuffleVector(builder, lod_fpart, LLVMGetUndef(tmp_vec_type),
1576 LLVMConstVector(shuffle, u8n_bld.type.length), "");
1577 }
1578
1579 colors0 = lp_build_lerp(&u8n_bld, lod_fpart,
1580 colors0, colors1,
1581 LP_BLD_LERP_PRESCALED_WEIGHTS);
1582
1583 LLVMBuildStore(builder, colors0, colors_var);
1584 }
1585 lp_build_endif(&if_ctx);
1586 }
1587 }
1588
1589
1590
1591 /**
1592 * Texture sampling in AoS format. Used when sampling common 32-bit/texel
1593 * formats. 1D/2D/3D/cube texture supported. All mipmap sampling modes
1594 * but only limited texture coord wrap modes.
1595 */
1596 void
1597 lp_build_sample_aos(struct lp_build_sample_context *bld,
1598 unsigned sampler_unit,
1599 LLVMValueRef s,
1600 LLVMValueRef t,
1601 LLVMValueRef r,
1602 const LLVMValueRef *offsets,
1603 LLVMValueRef lod_positive,
1604 LLVMValueRef lod_fpart,
1605 LLVMValueRef ilevel0,
1606 LLVMValueRef ilevel1,
1607 LLVMValueRef texel_out[4])
1608 {
1609 LLVMBuilderRef builder = bld->gallivm->builder;
1610 const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
1611 const unsigned min_filter = bld->static_sampler_state->min_img_filter;
1612 const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
1613 const unsigned dims = bld->dims;
1614 LLVMValueRef packed_var, packed;
1615 LLVMValueRef unswizzled[4];
1616 struct lp_build_context u8n_bld;
1617
1618 /* we only support the common/simple wrap modes at this time */
1619 assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_s));
1620 if (dims >= 2)
1621 assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_t));
1622 if (dims >= 3)
1623 assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_r));
1624
1625
1626 /* make 8-bit unorm builder context */
1627 lp_build_context_init(&u8n_bld, bld->gallivm, lp_type_unorm(8, bld->vector_width));
1628
1629 /*
1630 * Get/interpolate texture colors.
1631 */
1632
1633 packed_var = lp_build_alloca(bld->gallivm, u8n_bld.vec_type, "packed_var");
1634
1635 if (min_filter == mag_filter) {
1636 /* no need to distinguish between minification and magnification */
1637 lp_build_sample_mipmap(bld,
1638 min_filter, mip_filter,
1639 s, t, r, offsets,
1640 ilevel0, ilevel1, lod_fpart,
1641 packed_var);
1642 }
1643 else {
1644 /* Emit conditional to choose min image filter or mag image filter
1645 * depending on the lod being > 0 or <= 0, respectively.
1646 */
1647 struct lp_build_if_state if_ctx;
1648
1649 /*
1650 * FIXME this should take all lods into account, if some are min
1651 * some max probably could hack up the weights in the linear
1652 * path with selects to work for nearest.
1653 */
1654 if (bld->num_lods > 1)
1655 lod_positive = LLVMBuildExtractElement(builder, lod_positive,
1656 lp_build_const_int32(bld->gallivm, 0), "");
1657
1658 lod_positive = LLVMBuildTrunc(builder, lod_positive,
1659 LLVMInt1TypeInContext(bld->gallivm->context), "");
1660
1661 lp_build_if(&if_ctx, bld->gallivm, lod_positive);
1662 {
1663 /* Use the minification filter */
1664 lp_build_sample_mipmap(bld,
1665 min_filter, mip_filter,
1666 s, t, r, offsets,
1667 ilevel0, ilevel1, lod_fpart,
1668 packed_var);
1669 }
1670 lp_build_else(&if_ctx);
1671 {
1672 /* Use the magnification filter */
1673 lp_build_sample_mipmap(bld,
1674 mag_filter, PIPE_TEX_MIPFILTER_NONE,
1675 s, t, r, offsets,
1676 ilevel0, NULL, NULL,
1677 packed_var);
1678 }
1679 lp_build_endif(&if_ctx);
1680 }
1681
1682 packed = LLVMBuildLoad(builder, packed_var, "");
1683
1684 /*
1685 * Convert to SoA and swizzle.
1686 */
1687 lp_build_rgba8_to_fi32_soa(bld->gallivm,
1688 bld->texel_type,
1689 packed, unswizzled);
1690
1691 if (util_format_is_rgba8_variant(bld->format_desc)) {
1692 lp_build_format_swizzle_soa(bld->format_desc,
1693 &bld->texel_bld,
1694 unswizzled, texel_out);
1695 }
1696 else {
1697 texel_out[0] = unswizzled[0];
1698 texel_out[1] = unswizzled[1];
1699 texel_out[2] = unswizzled[2];
1700 texel_out[3] = unswizzled[3];
1701 }
1702 }