/**************************************************************************
 *
 * Copyright 2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Texture sampling -- AoS.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 * @author Brian Paul <brianp@vmware.com>
 */

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_debug.h"
#include "util/u_dump.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"
#include "util/u_cpu_detect.h"
#include "lp_bld_debug.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_conv.h"
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_logic.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_pack.h"
#include "lp_bld_flow.h"
#include "lp_bld_gather.h"
#include "lp_bld_format.h"
#include "lp_bld_init.h"
#include "lp_bld_sample.h"
#include "lp_bld_sample_aos.h"
#include "lp_bld_quad.h"


/**
 * Build LLVM code for texture coord wrapping, for nearest filtering,
 * for scaled integer texcoords.
 * \param block_length  is the length of the pixel block along the
 *                      coordinate axis
 * \param coord  the incoming texcoord (s,t or r) scaled to the texture size
 * \param coord_f  the incoming texcoord (s,t or r) as float vec
 * \param length  the texture size along one dimension
 * \param stride  pixel stride along the coordinate axis (in bytes)
 * \param offset  the texel offset along the coord axis
 * \param is_pot  if TRUE, length is a power of two
 * \param wrap_mode  one of PIPE_TEX_WRAP_x
 * \param out_offset  byte offset for the wrapped coordinate
 * \param out_i  resulting sub-block pixel coordinate for coord0
 */
static void
lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
                                 unsigned block_length,
                                 LLVMValueRef coord,
                                 LLVMValueRef coord_f,
                                 LLVMValueRef length,
                                 LLVMValueRef stride,
                                 LLVMValueRef offset,
                                 boolean is_pot,
                                 unsigned wrap_mode,
                                 LLVMValueRef *out_offset,
                                 LLVMValueRef *out_i)
{
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef length_minus_one;

   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if(is_pot)
         coord = LLVMBuildAnd(builder, coord, length_minus_one, "");
      else {
         struct lp_build_context *coord_bld = &bld->coord_bld;
         LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            offset = lp_build_div(coord_bld, offset, length_f);
            coord_f = lp_build_add(coord_bld, coord_f, offset);
         }
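         /*
          * fract_safe rather than plain fract: it guarantees a result
          * strictly below 1.0, so a coord of exactly 1.0 cannot turn
          * into an out-of-range texel index of 'length' after the
          * multiply below.
          */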
         coord = lp_build_fract_safe(coord_bld, coord_f);
         coord = lp_build_mul(coord_bld, coord, length_f);
         coord = lp_build_itrunc(coord_bld, coord);
      }
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      assert(0);
   }

   lp_build_sample_partial_offset(int_coord_bld, block_length, coord, stride,
                                  out_offset, out_i);
}


/**
 * Build LLVM code for texture coord wrapping, for nearest filtering,
 * for float texcoords.
 * \param coord  the incoming texcoord (s,t or r)
 * \param length  the texture size along one dimension
 * \param offset  the texel offset along the coord axis
 * \param is_pot  if TRUE, length is a power of two
 * \param wrap_mode  one of PIPE_TEX_WRAP_x
 * \param icoord  the texcoord after wrapping, as int
 */
static void
lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld,
                                   LLVMValueRef coord,
                                   LLVMValueRef length,
                                   LLVMValueRef offset,
                                   boolean is_pot,
                                   unsigned wrap_mode,
                                   LLVMValueRef *icoord)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   LLVMValueRef length_minus_one;

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (offset) {
         /* this is definitely not ideal for the POT case */
         offset = lp_build_int_to_float(coord_bld, offset);
         offset = lp_build_div(coord_bld, offset, length);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /* take fraction, unnormalize */
      coord = lp_build_fract_safe(coord_bld, coord);
      coord = lp_build_mul(coord_bld, coord, length);
      *icoord = lp_build_itrunc(coord_bld, coord);
      break;
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      coord = lp_build_clamp(coord_bld, coord, coord_bld->zero,
                             length_minus_one);
      *icoord = lp_build_itrunc(coord_bld, coord);
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      assert(0);
   }
}


/**
 * Helper to compute the first coord and the weight for
 * linear wrap repeat npot textures
 */
static void
lp_build_coord_repeat_npot_linear_int(struct lp_build_sample_context *bld,
                                      LLVMValueRef coord_f,
                                      LLVMValueRef length_i,
                                      LLVMValueRef length_f,
                                      LLVMValueRef *coord0_i,
                                      LLVMValueRef *weight_i)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   struct lp_build_context abs_coord_bld;
   struct lp_type abs_type;
   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
                                                int_coord_bld->one);
   LLVMValueRef mask, i32_c8, i32_c128, i32_c255;

   /* wrap with normalized floats is just fract */
   coord_f = lp_build_fract(coord_bld, coord_f);
   /* mul by size */
   coord_f = lp_build_mul(coord_bld, coord_f, length_f);
   /* convert to int, compute lerp weight */
   coord_f = lp_build_mul_imm(&bld->coord_bld, coord_f, 256);

   /* At this point we don't have any negative numbers so use non-signed
    * build context which might help on some archs.
    */
   abs_type = coord_bld->type;
   abs_type.sign = 0;
   lp_build_context_init(&abs_coord_bld, bld->gallivm, abs_type);
   *coord0_i = lp_build_iround(&abs_coord_bld, coord_f);

   /* subtract 0.5 (add -128) */
   i32_c128 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, -128);
   *coord0_i = LLVMBuildAdd(bld->gallivm->builder, *coord0_i, i32_c128, "");

   /* compute fractional part (AND with 0xff) */
   i32_c255 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 255);
   *weight_i = LLVMBuildAnd(bld->gallivm->builder, *coord0_i, i32_c255, "");

   /* compute floor (shift right 8) */
   i32_c8 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 8);
   *coord0_i = LLVMBuildAShr(bld->gallivm->builder, *coord0_i, i32_c8, "");
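   /*
    * Worked example of the 8.8 fixed-point scheme above (illustrative
    * values): coord_f = 0.3 with length = 8 gives 0.3 * 8 * 256 = 614
    * after rounding; adding -128 yields 486, so weight = 486 & 0xff = 230
    * (~0.9 in 8-bit fixed point) and coord0 = 486 >> 8 = 1, matching
    * floor(2.4 - 0.5) = 1 with fraction 0.9.
    */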
   /*
    * we avoided the 0.5/length division before the repeat wrap,
    * now need to fix up edge cases with selects
    */
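   /*
    * The only negative value coord0 can take here is -1 (a sample point
    * just below the first texel center); repeat wrapping maps that to
    * the last texel, hence the select against length_minus_one below.
    */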
   mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                           PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero);
   *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
}


/**
 * Build LLVM code for texture coord wrapping, for linear filtering,
 * for scaled integer texcoords.
 * \param block_length  is the length of the pixel block along the
 *                      coordinate axis
 * \param coord0  the incoming texcoord (s,t or r) scaled to the texture size
 * \param coord_f  the incoming texcoord (s,t or r) as float vec
 * \param length  the texture size along one dimension
 * \param stride  pixel stride along the coordinate axis (in bytes)
 * \param offset  the texel offset along the coord axis
 * \param is_pot  if TRUE, length is a power of two
 * \param wrap_mode  one of PIPE_TEX_WRAP_x
 * \param offset0  resulting relative offset for coord0
 * \param offset1  resulting relative offset for coord0 + 1
 * \param i0  resulting sub-block pixel coordinate for coord0
 * \param i1  resulting sub-block pixel coordinate for coord0 + 1
 */
static void
lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
                                unsigned block_length,
                                LLVMValueRef coord0,
                                LLVMValueRef *weight_i,
                                LLVMValueRef coord_f,
                                LLVMValueRef length,
                                LLVMValueRef stride,
                                LLVMValueRef offset,
                                boolean is_pot,
                                unsigned wrap_mode,
                                LLVMValueRef *offset0,
                                LLVMValueRef *offset1,
                                LLVMValueRef *i0,
                                LLVMValueRef *i1)
{
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef length_minus_one;
   LLVMValueRef lmask, umask, mask;

   /*
    * If the pixel block covers more than one pixel then there is no easy
    * way to calculate offset1 relative to offset0. Instead, compute them
    * independently. Otherwise, try to compute offset0 and offset1 with
    * a single stride multiplication.
    */

   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);

   if (block_length != 1) {
      LLVMValueRef coord1;
      switch(wrap_mode) {
      case PIPE_TEX_WRAP_REPEAT:
         if (is_pot) {
            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
            coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
            coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
         }
         else {
            LLVMValueRef mask;
            LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
            if (offset) {
               offset = lp_build_int_to_float(&bld->coord_bld, offset);
               offset = lp_build_div(&bld->coord_bld, offset, length_f);
               coord_f = lp_build_add(&bld->coord_bld, coord_f, offset);
            }
            lp_build_coord_repeat_npot_linear_int(bld, coord_f,
                                                  length, length_f,
                                                  &coord0, weight_i);
            mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
                                    PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
            coord1 = LLVMBuildAnd(builder,
                                  lp_build_add(int_coord_bld, coord0,
                                               int_coord_bld->one),
                                  mask, "");
         }
         break;

      case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
         coord0 = lp_build_clamp(int_coord_bld, coord0, int_coord_bld->zero,
                                 length_minus_one);
         coord1 = lp_build_clamp(int_coord_bld, coord1, int_coord_bld->zero,
                                 length_minus_one);
         break;

      case PIPE_TEX_WRAP_CLAMP:
      case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
      case PIPE_TEX_WRAP_MIRROR_REPEAT:
      case PIPE_TEX_WRAP_MIRROR_CLAMP:
      case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      default:
         assert(0);
         coord0 = int_coord_bld->zero;
         coord1 = int_coord_bld->zero;
         break;
      }
      lp_build_sample_partial_offset(int_coord_bld, block_length, coord0, stride,
                                     offset0, i0);
      lp_build_sample_partial_offset(int_coord_bld, block_length, coord1, stride,
                                     offset1, i1);
      return;
   }

   *i0 = int_coord_bld->zero;
   *i1 = int_coord_bld->zero;

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
      }
      else {
         LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
         if (offset) {
            offset = lp_build_int_to_float(&bld->coord_bld, offset);
            offset = lp_build_div(&bld->coord_bld, offset, length_f);
            coord_f = lp_build_add(&bld->coord_bld, coord_f, offset);
         }
         lp_build_coord_repeat_npot_linear_int(bld, coord_f,
                                               length, length_f,
                                               &coord0, weight_i);
      }

      mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
                              PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);

      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
      *offset1 = LLVMBuildAnd(builder,
                              lp_build_add(int_coord_bld, *offset0, stride),
                              mask, "");
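      /*
       * offset1 = (offset0 + stride) & mask: the all-ones/all-zeros mask
       * zeroes offset1 back to texel 0 whenever coord0 is the last texel,
       * implementing the repeat wrap of coord0 + 1 without computing a
       * second wrapped coordinate.
       */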
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      /* XXX this might be slower than the separate path
       * on some newer cpus. With sse41 this is 8 instructions vs. 7
       * - at least on SNB this is almost certainly slower since
       * min/max are cheaper than selects, and the muls aren't bad.
       */
      lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                               PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
      umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                               PIPE_FUNC_LESS, coord0, length_minus_one);

      coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
      coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);

      mask = LLVMBuildAnd(builder, lmask, umask, "");

      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
      *offset1 = lp_build_add(int_coord_bld,
                              *offset0,
                              LLVMBuildAnd(builder, stride, mask, ""));
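      /*
       * stride is added only when 0 <= coord0 < length - 1 (the unclamped
       * coord0 is strictly inside the texture), so at either edge
       * coord0 + 1 clamps to the same texel as coord0.
       */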
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      assert(0);
      *offset0 = int_coord_bld->zero;
      *offset1 = int_coord_bld->zero;
      break;
   }
}


/**
 * Build LLVM code for texture coord wrapping, for linear filtering,
 * for float texcoords.
 * \param block_length  is the length of the pixel block along the
 *                      coordinate axis
 * \param coord  the incoming texcoord (s,t or r)
 * \param length  the texture size along one dimension
 * \param offset  the texel offset along the coord axis
 * \param is_pot  if TRUE, length is a power of two
 * \param wrap_mode  one of PIPE_TEX_WRAP_x
 * \param coord0  the first texcoord after wrapping, as int
 * \param coord1  the second texcoord after wrapping, as int
 * \param weight  the filter weight as int (0-255)
 * \param force_nearest  if this coord actually uses nearest filtering
 */
static void
lp_build_sample_wrap_linear_float(struct lp_build_sample_context *bld,
                                  unsigned block_length,
                                  LLVMValueRef coord,
                                  LLVMValueRef length,
                                  LLVMValueRef offset,
                                  boolean is_pot,
                                  unsigned wrap_mode,
                                  LLVMValueRef *coord0,
                                  LLVMValueRef *coord1,
                                  LLVMValueRef *weight,
                                  unsigned force_nearest)
{
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         /* mul by size and subtract 0.5 */
         coord = lp_build_mul(coord_bld, coord, length);
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         if (!force_nearest)
            coord = lp_build_sub(coord_bld, coord, half);
         *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
         /* convert to int, compute lerp weight */
         lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
         *coord1 = lp_build_ifloor(coord_bld, *coord1);
         /* repeat wrap */
         length_minus_one = lp_build_itrunc(coord_bld, length_minus_one);
         *coord0 = LLVMBuildAnd(builder, *coord0, length_minus_one, "");
         *coord1 = LLVMBuildAnd(builder, *coord1, length_minus_one, "");
      }
      else {
         LLVMValueRef mask;
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            offset = lp_build_div(coord_bld, offset, length);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         /* wrap with normalized floats is just fract */
         coord = lp_build_fract(coord_bld, coord);
         /* unnormalize */
         coord = lp_build_mul(coord_bld, coord, length);
         /*
          * we avoided the 0.5/length division, have to fix up wrong
          * edge cases with selects
          */
         *coord1 = lp_build_add(coord_bld, coord, half);
         coord = lp_build_sub(coord_bld, coord, half);
         *weight = lp_build_fract(coord_bld, coord);
         mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
                                 PIPE_FUNC_LESS, coord, coord_bld->zero);
         *coord0 = lp_build_select(coord_bld, mask, length_minus_one, coord);
         *coord0 = lp_build_itrunc(coord_bld, *coord0);
         mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
                                 PIPE_FUNC_LESS, *coord1, length);
         *coord1 = lp_build_select(coord_bld, mask, *coord1, coord_bld->zero);
         *coord1 = lp_build_itrunc(coord_bld, *coord1);
      }
      break;
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      if (bld->static_sampler_state->normalized_coords) {
         /* mul by tex size */
         coord = lp_build_mul(coord_bld, coord, length);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /* subtract 0.5 */
      if (!force_nearest) {
         coord = lp_build_sub(coord_bld, coord, half);
      }
      /* clamp to [0, length - 1] */
      coord = lp_build_min(coord_bld, coord, length_minus_one);
      coord = lp_build_max(coord_bld, coord, coord_bld->zero);
      *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
      /* convert to int, compute lerp weight */
      lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
      /* coord1 = min(coord1, length-1) */
      *coord1 = lp_build_min(coord_bld, *coord1, length_minus_one);
      *coord1 = lp_build_itrunc(coord_bld, *coord1);
      break;
   default:
      assert(0);
      *coord0 = int_coord_bld->zero;
      *coord1 = int_coord_bld->zero;
      *weight = coord_bld->zero;
      break;
   }
   *weight = lp_build_mul_imm(coord_bld, *weight, 256);
   *weight = lp_build_itrunc(coord_bld, *weight);
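   /* weight is now an integer in [0,255]: the 8-bit prescaled lerp weight
    * expected by the fixed-point filtering code below. */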
   return;
}


/**
 * Fetch texels for image with nearest sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
                                    LLVMValueRef data_ptr,
                                    LLVMValueRef offset,
                                    LLVMValueRef x_subcoord,
                                    LLVMValueRef y_subcoord,
                                    LLVMValueRef *colors)
{
   /*
    * Fetch the pixels as 4 x 32bit (rgba order might differ):
    *
    *   rgba0 rgba1 rgba2 rgba3
    *
    * bit cast them into 16 x u8
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
    *
    * unpack them into two 8 x i16:
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1
    *   r2 g2 b2 a2 r3 g3 b3 a3
    *
    * The higher 8 bits of the resulting elements will be zero.
    */
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef rgba8;
   struct lp_build_context u8n;
   LLVMTypeRef u8n_vec_type;

   lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
   u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);

   if (util_format_is_rgba8_variant(bld->format_desc)) {
      /*
       * Given the format is a rgba8, just read the pixels as is,
       * without any swizzling. Swizzling will be done later.
       */
      rgba8 = lp_build_gather(bld->gallivm,
                              bld->texel_type.length,
                              bld->format_desc->block.bits,
                              bld->texel_type.width,
                              data_ptr, offset, TRUE);

      rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
   }
   else {
      rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
                                      bld->format_desc,
                                      u8n.type,
                                      data_ptr, offset,
                                      x_subcoord,
                                      y_subcoord);
   }

   *colors = rgba8;
}


/**
 * Sample a single texture image with nearest sampling.
 * If sampling a cube texture, r = cube face in [0,5].
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                              LLVMValueRef int_size,
                              LLVMValueRef row_stride_vec,
                              LLVMValueRef img_stride_vec,
                              LLVMValueRef data_ptr,
                              LLVMValueRef mipoffsets,
                              LLVMValueRef s,
                              LLVMValueRef t,
                              LLVMValueRef r,
                              const LLVMValueRef *offsets,
                              LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   struct lp_build_context i32;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL;
   LLVMValueRef s_float, t_float = NULL, r_float = NULL;
   LLVMValueRef x_stride;
   LLVMValueRef x_offset, offset;
   LLVMValueRef x_subcoord, y_subcoord, z_subcoord;

   lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));

   lp_build_extract_image_sizes(bld,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                int_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   s_float = s; t_float = t; r_float = r;

   if (bld->static_sampler_state->normalized_coords) {
      LLVMValueRef flt_size;

      flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);

      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
   }

   /* convert float to int */
   /* For correct rounding, need floor, not truncation here.
    * Note that in some cases (clamp to edge, no texel offsets) we
    * could use a non-signed build context, which would greatly help
    * archs that don't have a native floor rounding instruction.
    */
   s_ipart = lp_build_ifloor(&bld->coord_bld, s);
   if (dims >= 2)
      t_ipart = lp_build_ifloor(&bld->coord_bld, t);
   if (dims >= 3)
      r_ipart = lp_build_ifloor(&bld->coord_bld, r);

   /* add texel offsets */
   if (offsets[0]) {
      s_ipart = lp_build_add(&i32, s_ipart, offsets[0]);
      if (dims >= 2) {
         t_ipart = lp_build_add(&i32, t_ipart, offsets[1]);
         if (dims >= 3) {
            r_ipart = lp_build_add(&i32, r_ipart, offsets[2]);
         }
      }
   }

   /* get pixel, row, image strides */
   x_stride = lp_build_const_vec(bld->gallivm,
                                 bld->int_coord_bld.type,
                                 bld->format_desc->block.bits/8);

   /* Do texcoord wrapping, compute texel offset */
   lp_build_sample_wrap_nearest_int(bld,
                                    bld->format_desc->block.width,
                                    s_ipart, s_float,
                                    width_vec, x_stride, offsets[0],
                                    bld->static_texture_state->pot_width,
                                    bld->static_sampler_state->wrap_s,
                                    &x_offset, &x_subcoord);
   offset = x_offset;
   if (dims >= 2) {
      LLVMValueRef y_offset;
      lp_build_sample_wrap_nearest_int(bld,
                                       bld->format_desc->block.height,
                                       t_ipart, t_float,
                                       height_vec, row_stride_vec, offsets[1],
                                       bld->static_texture_state->pot_height,
                                       bld->static_sampler_state->wrap_t,
                                       &y_offset, &y_subcoord);
      offset = lp_build_add(&bld->int_coord_bld, offset, y_offset);
      if (dims >= 3) {
         LLVMValueRef z_offset;
         lp_build_sample_wrap_nearest_int(bld,
                                          1, /* block length (depth) */
                                          r_ipart, r_float,
                                          depth_vec, img_stride_vec, offsets[2],
                                          bld->static_texture_state->pot_depth,
                                          bld->static_sampler_state->wrap_r,
                                          &z_offset, &z_subcoord);
         offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
      }
   }
   if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
      LLVMValueRef z_offset;
      /* The r coord is the cube face in [0,5] or array layer */
      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
      offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
   }
   if (mipoffsets) {
      offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
   }

   lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
                                       x_subcoord, y_subcoord,
                                       colors);
}


/**
 * Sample a single texture image with nearest sampling.
 * If sampling a cube texture, r = cube face in [0,5].
 * Return filtered color as two vectors of 16-bit fixed point values.
 * Does address calcs (except offsets) with floats.
 * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
 */
static void
lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld,
                                     LLVMValueRef int_size,
                                     LLVMValueRef row_stride_vec,
                                     LLVMValueRef img_stride_vec,
                                     LLVMValueRef data_ptr,
                                     LLVMValueRef mipoffsets,
                                     LLVMValueRef s,
                                     LLVMValueRef t,
                                     LLVMValueRef r,
                                     const LLVMValueRef *offsets,
                                     LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef offset;
   LLVMValueRef x_subcoord, y_subcoord;
   LLVMValueRef x_icoord = NULL, y_icoord = NULL, z_icoord = NULL;
   LLVMValueRef flt_size;

   flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   /* Do texcoord wrapping */
   lp_build_sample_wrap_nearest_float(bld,
                                      s, width_vec, offsets[0],
                                      bld->static_texture_state->pot_width,
                                      bld->static_sampler_state->wrap_s,
                                      &x_icoord);

   if (dims >= 2) {
      lp_build_sample_wrap_nearest_float(bld,
                                         t, height_vec, offsets[1],
                                         bld->static_texture_state->pot_height,
                                         bld->static_sampler_state->wrap_t,
                                         &y_icoord);

      if (dims >= 3) {
         lp_build_sample_wrap_nearest_float(bld,
                                            r, depth_vec, offsets[2],
                                            bld->static_texture_state->pot_depth,
                                            bld->static_sampler_state->wrap_r,
                                            &z_icoord);
      }
   }
   if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
      z_icoord = r;
   }

   /*
    * From here on we deal with ints, and we should split up the 256bit
    * vectors manually for better generated code.
    */

   /*
    * compute texel offsets -
    * cannot do offset calc with floats, difficult for block-based formats,
    * and not enough precision anyway.
    */
   lp_build_sample_offset(&bld->int_coord_bld,
                          bld->format_desc,
                          x_icoord, y_icoord,
                          z_icoord,
                          row_stride_vec, img_stride_vec,
                          &offset,
                          &x_subcoord, &y_subcoord);
   if (mipoffsets) {
      offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
   }

   lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
                                       x_subcoord, y_subcoord,
                                       colors);
}


/**
 * Fetch texels for image with linear sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
                                   LLVMValueRef data_ptr,
                                   LLVMValueRef offset[2][2][2],
                                   LLVMValueRef x_subcoord[2],
                                   LLVMValueRef y_subcoord[2],
                                   LLVMValueRef s_fpart,
                                   LLVMValueRef t_fpart,
                                   LLVMValueRef r_fpart,
                                   LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context u8n;
   LLVMTypeRef u8n_vec_type;
   LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
   LLVMValueRef shuffle;
   LLVMValueRef neighbors[2][2][2]; /* [z][y][x] */
   LLVMValueRef packed;
   unsigned i, j, k;
   unsigned numj, numk;

   lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
   u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);

   /*
    * Transform 4 x i32 in
    *
    *   s_fpart = {s0, s1, s2, s3}
    *
    * where each value is between 0 and 0xff,
    *
    * into one 16 x u8
    *
    *   s_fpart = {s0, s0, s0, s0, s1, s1, s1, s1, s2, s2, s2, s2, s3, s3, s3, s3}
    *
    * and likewise for t_fpart. There is no risk of losing precision here
    * since the fractional parts only use the lower 8bits.
    */
   s_fpart = LLVMBuildBitCast(builder, s_fpart, u8n_vec_type, "");
   if (dims >= 2)
      t_fpart = LLVMBuildBitCast(builder, t_fpart, u8n_vec_type, "");
   if (dims >= 3)
      r_fpart = LLVMBuildBitCast(builder, r_fpart, u8n_vec_type, "");

   for (j = 0; j < u8n.type.length; j += 4) {
#ifdef PIPE_ARCH_LITTLE_ENDIAN
      unsigned subindex = 0;
#else
      unsigned subindex = 3;
#endif
      LLVMValueRef index;

      index = LLVMConstInt(elem_type, j + subindex, 0);
      for (i = 0; i < 4; ++i)
         shuffles[j + i] = index;
   }
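   /*
    * Each 32-bit weight keeps its payload in the low byte, so the shuffle
    * below replicates byte 0 (little endian) or byte 3 (big endian) of
    * every weight across the 4 channel bytes of its pixel.
    */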

   shuffle = LLVMConstVector(shuffles, u8n.type.length);

   s_fpart = LLVMBuildShuffleVector(builder, s_fpart, u8n.undef,
                                    shuffle, "");
   if (dims >= 2) {
      t_fpart = LLVMBuildShuffleVector(builder, t_fpart, u8n.undef,
                                       shuffle, "");
   }
   if (dims >= 3) {
      r_fpart = LLVMBuildShuffleVector(builder, r_fpart, u8n.undef,
                                       shuffle, "");
   }

   /*
    * Fetch the pixels as 4 x 32bit (rgba order might differ):
    *
    *   rgba0 rgba1 rgba2 rgba3
    *
    * bit cast them into 16 x u8
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
    *
    * unpack them into two 8 x i16:
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1
    *   r2 g2 b2 a2 r3 g3 b3 a3
    *
    * The higher 8 bits of the resulting elements will be zero.
    */
   numj = 1 + (dims >= 2);
   numk = 1 + (dims >= 3);

   for (k = 0; k < numk; k++) {
      for (j = 0; j < numj; j++) {
         for (i = 0; i < 2; i++) {
            LLVMValueRef rgba8;

            if (util_format_is_rgba8_variant(bld->format_desc)) {
               /*
                * Given the format is a rgba8, just read the pixels as is,
                * without any swizzling. Swizzling will be done later.
                */
               rgba8 = lp_build_gather(bld->gallivm,
                                       bld->texel_type.length,
                                       bld->format_desc->block.bits,
                                       bld->texel_type.width,
                                       data_ptr, offset[k][j][i], TRUE);

               rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
            }
            else {
               rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
                                               bld->format_desc,
                                               u8n.type,
                                               data_ptr, offset[k][j][i],
                                               x_subcoord[i],
                                               y_subcoord[j]);
            }

            neighbors[k][j][i] = rgba8;
         }
      }
   }

   /*
    * Linear interpolation with 8.8 fixed point.
    */
   if (bld->static_sampler_state->force_nearest_s) {
      /* special case 1-D lerp */
      packed = lp_build_lerp(&u8n,
                             t_fpart,
                             neighbors[0][0][0],
                             neighbors[0][0][1],
                             LP_BLD_LERP_PRESCALED_WEIGHTS);
   }
   else if (bld->static_sampler_state->force_nearest_t) {
      /* special case 1-D lerp */
      packed = lp_build_lerp(&u8n,
                             s_fpart,
                             neighbors[0][0][0],
                             neighbors[0][0][1],
                             LP_BLD_LERP_PRESCALED_WEIGHTS);
   }
   else {
      /* general 1/2/3-D lerping */
      if (dims == 1) {
         packed = lp_build_lerp(&u8n,
                                s_fpart,
                                neighbors[0][0][0],
                                neighbors[0][0][1],
                                LP_BLD_LERP_PRESCALED_WEIGHTS);
      } else if (dims == 2) {
         /* 2-D lerp */
         packed = lp_build_lerp_2d(&u8n,
                                   s_fpart, t_fpart,
                                   neighbors[0][0][0],
                                   neighbors[0][0][1],
                                   neighbors[0][1][0],
                                   neighbors[0][1][1],
                                   LP_BLD_LERP_PRESCALED_WEIGHTS);
      } else {
         /* 3-D lerp */
         assert(dims == 3);
         packed = lp_build_lerp_3d(&u8n,
                                   s_fpart, t_fpart, r_fpart,
                                   neighbors[0][0][0],
                                   neighbors[0][0][1],
                                   neighbors[0][1][0],
                                   neighbors[0][1][1],
                                   neighbors[1][0][0],
                                   neighbors[1][0][1],
                                   neighbors[1][1][0],
                                   neighbors[1][1][1],
                                   LP_BLD_LERP_PRESCALED_WEIGHTS);
      }
   }

   *colors = packed;
}

/**
 * Sample a single texture image with (bi-)(tri-)linear sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                             LLVMValueRef int_size,
                             LLVMValueRef row_stride_vec,
                             LLVMValueRef img_stride_vec,
                             LLVMValueRef data_ptr,
                             LLVMValueRef mipoffsets,
                             LLVMValueRef s,
                             LLVMValueRef t,
                             LLVMValueRef r,
                             const LLVMValueRef *offsets,
                             LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context i32;
   LLVMValueRef i32_c8, i32_c128, i32_c255;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef s_ipart, s_fpart, s_float;
   LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_float = NULL;
   LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_float = NULL;
   LLVMValueRef x_stride, y_stride, z_stride;
   LLVMValueRef x_offset0, x_offset1;
   LLVMValueRef y_offset0, y_offset1;
   LLVMValueRef z_offset0, z_offset1;
   LLVMValueRef offset[2][2][2]; /* [z][y][x] */
   LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
   unsigned x, y, z;

   lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));

   lp_build_extract_image_sizes(bld,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                int_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   s_float = s; t_float = t; r_float = r;

   if (bld->static_sampler_state->normalized_coords) {
      LLVMValueRef scaled_size;
      LLVMValueRef flt_size;

      /* scale size by 256 (8 fractional bits) */
      scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8);

      flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size);

      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
   }
   else {
      /* scale coords by 256 (8 fractional bits) */
      s = lp_build_mul_imm(&bld->coord_bld, s, 256);
      if (dims >= 2)
         t = lp_build_mul_imm(&bld->coord_bld, t, 256);
      if (dims >= 3)
         r = lp_build_mul_imm(&bld->coord_bld, r, 256);
   }

   /* convert float to int */
   /* For correct rounding, need round to nearest, not truncation here.
    * Note that in some cases (clamp to edge, no texel offsets) we
    * could use a non-signed build context, which would help archs that
    * don't have an fptosi intrinsic with nearest rounding implemented.
    */
   s = lp_build_iround(&bld->coord_bld, s);
   if (dims >= 2)
      t = lp_build_iround(&bld->coord_bld, t);
   if (dims >= 3)
      r = lp_build_iround(&bld->coord_bld, r);

   /* subtract 0.5 (add -128) */
   i32_c128 = lp_build_const_int_vec(bld->gallivm, i32.type, -128);
   if (!bld->static_sampler_state->force_nearest_s) {
      s = LLVMBuildAdd(builder, s, i32_c128, "");
   }
   if (dims >= 2 && !bld->static_sampler_state->force_nearest_t) {
      t = LLVMBuildAdd(builder, t, i32_c128, "");
   }
   if (dims >= 3) {
      r = LLVMBuildAdd(builder, r, i32_c128, "");
   }
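   /*
    * Worked example of the fixed-point flow (illustrative values): an
    * unnormalized coord of 20.25 texels becomes 5184 in 24.8 fixed point;
    * after the -128 bias and the shift/mask below, ipart = 19 and
    * fpart = 192 (0.75 * 256), i.e. floor(20.25 - 0.5) and its fraction.
    */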

   /* compute floor (shift right 8) */
   i32_c8 = lp_build_const_int_vec(bld->gallivm, i32.type, 8);
   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
   if (dims >= 2)
      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
   if (dims >= 3)
      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");

   /* add texel offsets */
   if (offsets[0]) {
      s_ipart = lp_build_add(&i32, s_ipart, offsets[0]);
      if (dims >= 2) {
         t_ipart = lp_build_add(&i32, t_ipart, offsets[1]);
         if (dims >= 3) {
            r_ipart = lp_build_add(&i32, r_ipart, offsets[2]);
         }
      }
   }

   /* compute fractional part (AND with 0xff) */
   i32_c255 = lp_build_const_int_vec(bld->gallivm, i32.type, 255);
   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
   if (dims >= 2)
      t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
   if (dims >= 3)
      r_fpart = LLVMBuildAnd(builder, r, i32_c255, "");

   /* get pixel, row and image strides */
   x_stride = lp_build_const_vec(bld->gallivm, bld->int_coord_bld.type,
                                 bld->format_desc->block.bits/8);
   y_stride = row_stride_vec;
   z_stride = img_stride_vec;

   /* do texcoord wrapping and compute texel offsets */
   lp_build_sample_wrap_linear_int(bld,
                                   bld->format_desc->block.width,
                                   s_ipart, &s_fpart, s_float,
                                   width_vec, x_stride, offsets[0],
                                   bld->static_texture_state->pot_width,
                                   bld->static_sampler_state->wrap_s,
                                   &x_offset0, &x_offset1,
                                   &x_subcoord[0], &x_subcoord[1]);

   /* add potential cube/array/mip offsets now as they are constant per pixel */
   if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
      LLVMValueRef z_offset;
      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
      /* The r coord is the cube face in [0,5] or array layer */
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, z_offset);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, z_offset);
   }
   if (mipoffsets) {
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, mipoffsets);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, mipoffsets);
   }

   for (z = 0; z < 2; z++) {
      for (y = 0; y < 2; y++) {
         offset[z][y][0] = x_offset0;
         offset[z][y][1] = x_offset1;
      }
   }
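   /*
    * offset[z][y][x] starts out holding just the two x offsets; the y and
    * z offsets are accumulated into it below, so each entry ends up as the
    * full byte offset of one of the (up to) 8 neighbor texels.
    */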

   if (dims >= 2) {
      lp_build_sample_wrap_linear_int(bld,
                                      bld->format_desc->block.height,
                                      t_ipart, &t_fpart, t_float,
                                      height_vec, y_stride, offsets[1],
                                      bld->static_texture_state->pot_height,
                                      bld->static_sampler_state->wrap_t,
                                      &y_offset0, &y_offset1,
                                      &y_subcoord[0], &y_subcoord[1]);

      for (z = 0; z < 2; z++) {
         for (x = 0; x < 2; x++) {
            offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][0][x], y_offset0);
            offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][1][x], y_offset1);
         }
      }
   }

   if (dims >= 3) {
      lp_build_sample_wrap_linear_int(bld,
                                      1, /* block length (depth) */
                                      r_ipart, &r_fpart, r_float,
                                      depth_vec, z_stride, offsets[2],
                                      bld->static_texture_state->pot_depth,
                                      bld->static_sampler_state->wrap_r,
                                      &z_offset0, &z_offset1,
                                      &z_subcoord[0], &z_subcoord[1]);
      for (y = 0; y < 2; y++) {
         for (x = 0; x < 2; x++) {
            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[0][y][x], z_offset0);
            offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[1][y][x], z_offset1);
         }
      }
   }

   lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
                                      x_subcoord, y_subcoord,
                                      s_fpart, t_fpart, r_fpart,
                                      colors);
}


/**
 * Sample a single texture image with (bi-)(tri-)linear sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 * Does address calcs (except offsets) with floats.
 * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
 */
static void
lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld,
                                    LLVMValueRef int_size,
                                    LLVMValueRef row_stride_vec,
                                    LLVMValueRef img_stride_vec,
                                    LLVMValueRef data_ptr,
                                    LLVMValueRef mipoffsets,
                                    LLVMValueRef s,
                                    LLVMValueRef t,
                                    LLVMValueRef r,
                                    const LLVMValueRef *offsets,
                                    LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef s_fpart;
   LLVMValueRef t_fpart = NULL;
   LLVMValueRef r_fpart = NULL;
   LLVMValueRef x_stride, y_stride, z_stride;
   LLVMValueRef x_offset0, x_offset1;
   LLVMValueRef y_offset0, y_offset1;
   LLVMValueRef z_offset0, z_offset1;
   LLVMValueRef offset[2][2][2]; /* [z][y][x] */
   LLVMValueRef x_subcoord[2], y_subcoord[2];
   LLVMValueRef flt_size;
   LLVMValueRef x_icoord0, x_icoord1;
   LLVMValueRef y_icoord0, y_icoord1;
   LLVMValueRef z_icoord0, z_icoord1;
   unsigned x, y, z;

   flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   /* do texcoord wrapping and compute texel offsets */
   lp_build_sample_wrap_linear_float(bld,
                                     bld->format_desc->block.width,
                                     s, width_vec, offsets[0],
                                     bld->static_texture_state->pot_width,
                                     bld->static_sampler_state->wrap_s,
                                     &x_icoord0, &x_icoord1,
                                     &s_fpart,
                                     bld->static_sampler_state->force_nearest_s);

   if (dims >= 2) {
      lp_build_sample_wrap_linear_float(bld,
                                        bld->format_desc->block.height,
                                        t, height_vec, offsets[1],
                                        bld->static_texture_state->pot_height,
                                        bld->static_sampler_state->wrap_t,
                                        &y_icoord0, &y_icoord1,
                                        &t_fpart,
                                        bld->static_sampler_state->force_nearest_t);

      if (dims >= 3) {
         lp_build_sample_wrap_linear_float(bld,
                                           1, /* block length (depth) */
                                           r, depth_vec, offsets[2],
                                           bld->static_texture_state->pot_depth,
                                           bld->static_sampler_state->wrap_r,
                                           &z_icoord0, &z_icoord1,
                                           &r_fpart, 0);
      }
   }

   /*
    * From here on we deal with ints, and we should split up the 256bit
    * vectors manually for better generated code.
    */

   /* get pixel, row and image strides */
   x_stride = lp_build_const_vec(bld->gallivm,
                                 bld->int_coord_bld.type,
                                 bld->format_desc->block.bits/8);
   y_stride = row_stride_vec;
   z_stride = img_stride_vec;

   /*
    * compute texel offset -
    * cannot do offset calc with floats, difficult for block-based formats,
    * and not enough precision anyway.
    */
   lp_build_sample_partial_offset(&bld->int_coord_bld,
                                  bld->format_desc->block.width,
                                  x_icoord0, x_stride,
                                  &x_offset0, &x_subcoord[0]);
   lp_build_sample_partial_offset(&bld->int_coord_bld,
                                  bld->format_desc->block.width,
                                  x_icoord1, x_stride,
                                  &x_offset1, &x_subcoord[1]);

   /* add potential cube/array/mip offsets now as they are constant per pixel */
   if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
       bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
       bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
      LLVMValueRef z_offset;
      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
      /* The r coord is the cube face in [0,5] or array layer */
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, z_offset);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, z_offset);
   }
   if (mipoffsets) {
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, mipoffsets);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, mipoffsets);
   }

   for (z = 0; z < 2; z++) {
      for (y = 0; y < 2; y++) {
         offset[z][y][0] = x_offset0;
         offset[z][y][1] = x_offset1;
      }
   }

   if (dims >= 2) {
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     bld->format_desc->block.height,
                                     y_icoord0, y_stride,
                                     &y_offset0, &y_subcoord[0]);
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     bld->format_desc->block.height,
                                     y_icoord1, y_stride,
                                     &y_offset1, &y_subcoord[1]);
      for (z = 0; z < 2; z++) {
         for (x = 0; x < 2; x++) {
            offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][0][x], y_offset0);
            offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][1][x], y_offset1);
         }
      }
   }

   if (dims >= 3) {
      LLVMValueRef z_subcoord[2];
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     1,
                                     z_icoord0, z_stride,
                                     &z_offset0, &z_subcoord[0]);
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     1,
                                     z_icoord1, z_stride,
                                     &z_offset1, &z_subcoord[1]);
      for (y = 0; y < 2; y++) {
         for (x = 0; x < 2; x++) {
            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[0][y][x], z_offset0);
            offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[1][y][x], z_offset1);
         }
      }
   }

   lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
                                      x_subcoord, y_subcoord,
                                      s_fpart, t_fpart, r_fpart,
                                      colors);
}


/**
 * Sample the texture/mipmap using the given image filter and mip filter.
 * ilevel0 and ilevel1 select the two mipmap levels to sample from.
 * If we're using nearest miplevel sampling the '1' values will be
 * null/unused.
 */
static void
lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                       unsigned img_filter,
                       unsigned mip_filter,
                       LLVMValueRef s,
                       LLVMValueRef t,
                       LLVMValueRef r,
                       const LLVMValueRef *offsets,
                       LLVMValueRef ilevel0,
                       LLVMValueRef ilevel1,
                       LLVMValueRef lod_fpart,
                       LLVMValueRef colors_var)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef size0;
   LLVMValueRef size1;
   LLVMValueRef row_stride0_vec = NULL;
   LLVMValueRef row_stride1_vec = NULL;
   LLVMValueRef img_stride0_vec = NULL;
   LLVMValueRef img_stride1_vec = NULL;
   LLVMValueRef data_ptr0;
   LLVMValueRef data_ptr1;
   LLVMValueRef mipoff0 = NULL;
   LLVMValueRef mipoff1 = NULL;
   LLVMValueRef colors0;
   LLVMValueRef colors1;

   /* sample the first mipmap level */
   lp_build_mipmap_level_sizes(bld, ilevel0,
                               &size0,
                               &row_stride0_vec, &img_stride0_vec);
   if (bld->num_mips == 1) {
      data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
   }
   else {
      /* This path should work for num_lods 1 too but is slightly less efficient */
      data_ptr0 = bld->base_ptr;
      mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
   }

   if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
      if (img_filter == PIPE_TEX_FILTER_NEAREST) {
         lp_build_sample_image_nearest_afloat(bld,
                                              size0,
                                              row_stride0_vec, img_stride0_vec,
                                              data_ptr0, mipoff0, s, t, r, offsets,
                                              &colors0);
      }
      else {
         assert(img_filter == PIPE_TEX_FILTER_LINEAR);
         lp_build_sample_image_linear_afloat(bld,
                                             size0,
                                             row_stride0_vec, img_stride0_vec,
                                             data_ptr0, mipoff0, s, t, r, offsets,
                                             &colors0);
      }
   }
   else {
      if (img_filter == PIPE_TEX_FILTER_NEAREST) {
         lp_build_sample_image_nearest(bld,
                                       size0,
                                       row_stride0_vec, img_stride0_vec,
                                       data_ptr0, mipoff0, s, t, r, offsets,
                                       &colors0);
      }
      else {
         assert(img_filter == PIPE_TEX_FILTER_LINEAR);
         lp_build_sample_image_linear(bld,
                                      size0,
                                      row_stride0_vec, img_stride0_vec,
                                      data_ptr0, mipoff0, s, t, r, offsets,
                                      &colors0);
      }
   }

   /* Store the first level's colors in the output variables */
   LLVMBuildStore(builder, colors0, colors_var);

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm,
                                                     bld->lodf_bld.type, 256.0);
      LLVMTypeRef i32vec_type = bld->lodi_bld.vec_type;
      struct lp_build_if_state if_ctx;
      LLVMValueRef need_lerp;
      unsigned num_quads = bld->coord_bld.type.length / 4;
      unsigned i;

      lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16vec_scale, "");
      lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16");
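      /*
       * lod_fpart is now the fractional lod scaled to 0..255, i.e. the
       * same 8-bit fixed-point weight format the packed u8 lerp below
       * expects.
       */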

      /* need_lerp = lod_fpart > 0 */
      if (bld->num_lods == 1) {
         need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
                                   lod_fpart, bld->lodi_bld.zero,
                                   "need_lerp");
      }
      else {
         /*
          * We'll do mip filtering if any of the quads need it.
          * It might be better to split the vectors here and only fetch/filter
          * quads which need it.
          */
         /*
          * We need to clamp lod_fpart here since we can get negative
          * values which would screw up filtering if not all
          * lod_fpart values have the same sign.
          * We can, however, then skip the greater-than comparison.
          */
         lod_fpart = lp_build_max(&bld->lodi_bld, lod_fpart,
                                  bld->lodi_bld.zero);
         need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_fpart);
      }

      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
      {
         struct lp_build_context u8n_bld;

         lp_build_context_init(&u8n_bld, bld->gallivm, lp_type_unorm(8, bld->vector_width));

         /* sample the second mipmap level */
         lp_build_mipmap_level_sizes(bld, ilevel1,
                                     &size1,
                                     &row_stride1_vec, &img_stride1_vec);
         if (bld->num_mips == 1) {
            data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
         }
         else {
            data_ptr1 = bld->base_ptr;
            mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
         }

         if (util_cpu_caps.has_avx && bld->coord_type.length > 4) {
            if (img_filter == PIPE_TEX_FILTER_NEAREST) {
               lp_build_sample_image_nearest_afloat(bld,
                                                    size1,
                                                    row_stride1_vec, img_stride1_vec,
                                                    data_ptr1, mipoff1, s, t, r, offsets,
                                                    &colors1);
            }
            else {
               lp_build_sample_image_linear_afloat(bld,
                                                   size1,
                                                   row_stride1_vec, img_stride1_vec,
                                                   data_ptr1, mipoff1, s, t, r, offsets,
                                                   &colors1);
            }
         }
         else {
            if (img_filter == PIPE_TEX_FILTER_NEAREST) {
               lp_build_sample_image_nearest(bld,
                                             size1,
                                             row_stride1_vec, img_stride1_vec,
                                             data_ptr1, mipoff1, s, t, r, offsets,
                                             &colors1);
            }
            else {
               lp_build_sample_image_linear(bld,
                                            size1,
                                            row_stride1_vec, img_stride1_vec,
                                            data_ptr1, mipoff1, s, t, r, offsets,
                                            &colors1);
            }
         }

         /* interpolate samples from the two mipmap levels */

         if (num_quads == 1 && bld->num_lods == 1) {
            lod_fpart = LLVMBuildTrunc(builder, lod_fpart, u8n_bld.elem_type, "");
            lod_fpart = lp_build_broadcast_scalar(&u8n_bld, lod_fpart);
         }
         else {
            unsigned num_chans_per_lod = 4 * bld->coord_type.length / bld->num_lods;
            LLVMTypeRef tmp_vec_type = LLVMVectorType(u8n_bld.elem_type, bld->lodi_bld.type.length);
            LLVMValueRef shuffle[LP_MAX_VECTOR_LENGTH];

            /* Take the LSB of lod_fpart */
            lod_fpart = LLVMBuildTrunc(builder, lod_fpart, tmp_vec_type, "");

            /* Broadcast each lod weight into their respective channels */
            for (i = 0; i < u8n_bld.type.length; ++i) {
               shuffle[i] = lp_build_const_int32(bld->gallivm, i / num_chans_per_lod);
            }
            lod_fpart = LLVMBuildShuffleVector(builder, lod_fpart, LLVMGetUndef(tmp_vec_type),
                                               LLVMConstVector(shuffle, u8n_bld.type.length), "");
         }
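         /*
          * After the broadcast every byte belonging to a given lod carries
          * that lod's 8-bit weight (num_chans_per_lod bytes per lod), so a
          * single packed u8 lerp can blend all lods at once.
          */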

         colors0 = lp_build_lerp(&u8n_bld, lod_fpart,
                                 colors0, colors1,
                                 LP_BLD_LERP_PRESCALED_WEIGHTS);

         LLVMBuildStore(builder, colors0, colors_var);
      }
      lp_build_endif(&if_ctx);
   }
}


/**
 * Texture sampling in AoS format. Used when sampling common 32-bit/texel
 * formats. 1D/2D/3D/cube textures are supported, as are all mipmap
 * sampling modes, but only a limited set of texture coord wrap modes.
 */
void
lp_build_sample_aos(struct lp_build_sample_context *bld,
                    unsigned sampler_unit,
                    LLVMValueRef s,
                    LLVMValueRef t,
                    LLVMValueRef r,
                    const LLVMValueRef *offsets,
                    LLVMValueRef lod_positive,
                    LLVMValueRef lod_fpart,
                    LLVMValueRef ilevel0,
                    LLVMValueRef ilevel1,
                    LLVMValueRef texel_out[4])
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
   const unsigned min_filter = bld->static_sampler_state->min_img_filter;
   const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
   const unsigned dims = bld->dims;
   LLVMValueRef packed_var, packed;
   LLVMValueRef unswizzled[4];
   struct lp_build_context u8n_bld;

   /* we only support the common/simple wrap modes at this time */
   assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_s));
   if (dims >= 2)
      assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_t));
   if (dims >= 3)
      assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_r));


   /* make 8-bit unorm builder context */
   lp_build_context_init(&u8n_bld, bld->gallivm, lp_type_unorm(8, bld->vector_width));

   /*
    * Get/interpolate texture colors.
    */

   packed_var = lp_build_alloca(bld->gallivm, u8n_bld.vec_type, "packed_var");
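   /*
    * The filtered colors are written through this stack variable because
    * they are produced inside the if/else control flow below; LLVM's
    * mem2reg pass can later promote it back to SSA values.
    */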

   if (min_filter == mag_filter) {
      /* no need to distinguish between minification and magnification */
      lp_build_sample_mipmap(bld,
                             min_filter, mip_filter,
                             s, t, r, offsets,
                             ilevel0, ilevel1, lod_fpart,
                             packed_var);
   }
   else {
      /* Emit conditional to choose min image filter or mag image filter
       * depending on the lod being > 0 or <= 0, respectively.
       */
      struct lp_build_if_state if_ctx;

      /*
       * FIXME: this should take all lods into account; if some are min
       * and some are max, we could probably hack up the weights in the
       * linear path with selects to make it work for nearest too.
       */
      if (bld->num_lods > 1)
         lod_positive = LLVMBuildExtractElement(builder, lod_positive,
                                                lp_build_const_int32(bld->gallivm, 0), "");

      lod_positive = LLVMBuildTrunc(builder, lod_positive,
                                    LLVMInt1TypeInContext(bld->gallivm->context), "");

      lp_build_if(&if_ctx, bld->gallivm, lod_positive);
      {
         /* Use the minification filter */
         lp_build_sample_mipmap(bld,
                                min_filter, mip_filter,
                                s, t, r, offsets,
                                ilevel0, ilevel1, lod_fpart,
                                packed_var);
      }
      lp_build_else(&if_ctx);
      {
         /* Use the magnification filter */
         lp_build_sample_mipmap(bld,
                                mag_filter, PIPE_TEX_MIPFILTER_NONE,
                                s, t, r, offsets,
                                ilevel0, NULL, NULL,
                                packed_var);
      }
      lp_build_endif(&if_ctx);
   }

   packed = LLVMBuildLoad(builder, packed_var, "");

   /*
    * Convert to SoA and swizzle.
    */
   lp_build_rgba8_to_fi32_soa(bld->gallivm,
                              bld->texel_type,
                              packed, unswizzled);

   if (util_format_is_rgba8_variant(bld->format_desc)) {
      lp_build_format_swizzle_soa(bld->format_desc,
                                  &bld->texel_bld,
                                  unswizzled, texel_out);
   }
   else {
      texel_out[0] = unswizzled[0];
      texel_out[1] = unswizzled[1];
      texel_out[2] = unswizzled[2];
      texel_out[3] = unswizzled[3];
   }
}