gallivm: Remove unnecessary headers.
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_sample_aos.c
/**************************************************************************
 *
 * Copyright 2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Texture sampling -- AoS.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 * @author Brian Paul <brianp@vmware.com>
 */

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_debug.h"
#include "util/u_dump.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"
#include "util/u_cpu_detect.h"
#include "lp_bld_debug.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_conv.h"
#include "lp_bld_arit.h"
#include "lp_bld_logic.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_pack.h"
#include "lp_bld_flow.h"
#include "lp_bld_gather.h"
#include "lp_bld_format.h"
#include "lp_bld_sample.h"
#include "lp_bld_sample_aos.h"
#include "lp_bld_quad.h"


/**
 * Build LLVM code for texture coord wrapping, for nearest filtering,
 * for scaled integer texcoords.
 * \param block_length is the length of the pixel block along the
 *                     coordinate axis
 * \param coord the incoming texcoord (s,t,r or q) scaled to the texture size
 * \param length the texture size along one dimension
 * \param stride pixel stride along the coordinate axis (in bytes)
 * \param is_pot if TRUE, length is a power of two
 * \param wrap_mode one of PIPE_TEX_WRAP_x
 * \param out_offset byte offset for the wrapped coordinate
 * \param out_i resulting sub-block pixel coordinate for coord
 */
static void
lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
                                 unsigned block_length,
                                 LLVMValueRef coord,
                                 LLVMValueRef length,
                                 LLVMValueRef stride,
                                 boolean is_pot,
                                 unsigned wrap_mode,
                                 LLVMValueRef *out_offset,
                                 LLVMValueRef *out_i)
{
   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMValueRef length_minus_one;

   length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if(is_pot)
         coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, "");
      else
         /* Signed remainder won't give the right results for negative
          * dividends but unsigned remainder does. */
         coord = LLVMBuildURem(bld->builder, coord, length, "");
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      assert(0);
   }

   lp_build_sample_partial_offset(uint_coord_bld, block_length, coord, stride,
                                  out_offset, out_i);
}


/**
 * Build LLVM code for texture coord wrapping, for linear filtering,
 * for scaled integer texcoords.
 * \param block_length is the length of the pixel block along the
 *                     coordinate axis
 * \param coord0 the incoming texcoord (s,t,r or q) scaled to the texture size
 * \param length the texture size along one dimension
 * \param stride pixel stride along the coordinate axis (in bytes)
 * \param is_pot if TRUE, length is a power of two
 * \param wrap_mode one of PIPE_TEX_WRAP_x
 * \param offset0 resulting relative offset for coord0
 * \param offset1 resulting relative offset for coord0 + 1
 * \param i0 resulting sub-block pixel coordinate for coord0
 * \param i1 resulting sub-block pixel coordinate for coord0 + 1
 */
static void
lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
                                unsigned block_length,
                                LLVMValueRef coord0,
                                LLVMValueRef length,
                                LLVMValueRef stride,
                                boolean is_pot,
                                unsigned wrap_mode,
                                LLVMValueRef *offset0,
                                LLVMValueRef *offset1,
                                LLVMValueRef *i0,
                                LLVMValueRef *i1)
{
   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMValueRef length_minus_one;
   LLVMValueRef lmask, umask, mask;

   if (block_length != 1) {
      /*
       * If the pixel block covers more than one pixel then there is no easy
       * way to calculate offset1 relative to offset0. Instead, compute them
       * independently.
       */

      LLVMValueRef coord1;

      lp_build_sample_wrap_nearest_int(bld,
                                       block_length,
                                       coord0,
                                       length,
                                       stride,
                                       is_pot,
                                       wrap_mode,
                                       offset0, i0);

      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);

      lp_build_sample_wrap_nearest_int(bld,
                                       block_length,
                                       coord1,
                                       length,
                                       stride,
                                       is_pot,
                                       wrap_mode,
                                       offset1, i1);

      return;
   }

   /*
    * Scalar pixels -- try to compute offset0 and offset1 with a single stride
    * multiplication.
    */

   *i0 = uint_coord_bld->zero;
   *i1 = uint_coord_bld->zero;

   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, "");
      }
      else {
         /* Signed remainder won't give the right results for negative
          * dividends but unsigned remainder does. */
         coord0 = LLVMBuildURem(bld->builder, coord0, length, "");
      }

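      /* coord0 now lies in [0, length-1].  The mask below is all ones when
       * coord0 != length-1 and all zeroes at the last texel, so offset1
       * computed as (offset0 + stride) & mask wraps back to byte offset 0
       * (texel 0), which is where REPEAT should take its second tap.
       */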
      mask = lp_build_compare(bld->builder, int_coord_bld->type,
                              PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);

      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
      *offset1 = LLVMBuildAnd(bld->builder,
                              lp_build_add(uint_coord_bld, *offset0, stride),
                              mask, "");
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      lmask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type,
                               PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
      umask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type,
                               PIPE_FUNC_LESS, coord0, length_minus_one);

      coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
      coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);

      mask = LLVMBuildAnd(bld->builder, lmask, umask, "");

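      /* mask is all ones only when coord0 was strictly inside [0, length-1).
       * At (or beyond) either edge, stride & mask is zero, so offset1 ends
       * up equal to offset0 and both taps read the clamped edge texel.
       */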
      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
      *offset1 = lp_build_add(uint_coord_bld,
                              *offset0,
                              LLVMBuildAnd(bld->builder, stride, mask, ""));
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      assert(0);
      *offset0 = uint_coord_bld->zero;
      *offset1 = uint_coord_bld->zero;
      break;
   }
}


/**
 * Sample a single texture image with nearest sampling.
 * If sampling a cube texture, r = cube face in [0,5].
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                              LLVMValueRef width_vec,
                              LLVMValueRef height_vec,
                              LLVMValueRef depth_vec,
                              LLVMValueRef row_stride_vec,
                              LLVMValueRef img_stride_vec,
                              LLVMValueRef data_ptr,
                              LLVMValueRef s,
                              LLVMValueRef t,
                              LLVMValueRef r,
                              LLVMValueRef *colors_lo,
                              LLVMValueRef *colors_hi)
{
   const int dims = texture_dims(bld->static_state->target);
   LLVMBuilderRef builder = bld->builder;
   struct lp_build_context i32, h16, u8n;
   LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
   LLVMValueRef i32_c8;
   LLVMValueRef s_ipart, t_ipart, r_ipart;
   LLVMValueRef x_stride;
   LLVMValueRef x_offset, offset;
   LLVMValueRef x_subcoord, y_subcoord, z_subcoord;

   lp_build_context_init(&i32, builder, lp_type_int_vec(32));
   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
   lp_build_context_init(&u8n, builder, lp_type_unorm(8));

   i32_vec_type = lp_build_vec_type(i32.type);
   h16_vec_type = lp_build_vec_type(h16.type);
   u8n_vec_type = lp_build_vec_type(u8n.type);

   if (bld->static_state->normalized_coords) {
      /* s = s * width, t = t * height */
      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type);
      LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec,
                                              coord_vec_type, "");
      s = lp_build_mul(&bld->coord_bld, s, fp_width);
      if (dims >= 2) {
         LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec,
                                                  coord_vec_type, "");
         t = lp_build_mul(&bld->coord_bld, t, fp_height);
         if (dims >= 3) {
            LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec,
                                                    coord_vec_type, "");
            r = lp_build_mul(&bld->coord_bld, r, fp_depth);
         }
      }
   }

   /* scale coords by 256 (8 fractional bits) */
   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
   if (dims >= 2)
      t = lp_build_mul_imm(&bld->coord_bld, t, 256);
   if (dims >= 3)
      r = lp_build_mul_imm(&bld->coord_bld, r, 256);

   /* convert float to int */
   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
   if (dims >= 2)
      t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
   if (dims >= 3)
      r = LLVMBuildFPToSI(builder, r, i32_vec_type, "");

   /* compute floor (shift right 8) */
   i32_c8 = lp_build_const_int_vec(i32.type, 8);
   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
   if (dims >= 2)
      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
   if (dims >= 3)
      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");
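   /* Note the arithmetic shift rounds toward negative infinity, so it is
    * a true floor() of the 8.8 fixed-point coord even when it is negative:
    * e.g. s = -1.5 scales to -384, and -384 >> 8 = -2 = floor(-1.5).
    */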

   /* get pixel, row, image strides */
   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
                                 bld->format_desc->block.bits/8);

   /* Do texcoord wrapping, compute texel offset */
   lp_build_sample_wrap_nearest_int(bld,
                                    bld->format_desc->block.width,
                                    s_ipart, width_vec, x_stride,
                                    bld->static_state->pot_width,
                                    bld->static_state->wrap_s,
                                    &x_offset, &x_subcoord);
   offset = x_offset;
   if (dims >= 2) {
      LLVMValueRef y_offset;
      lp_build_sample_wrap_nearest_int(bld,
                                       bld->format_desc->block.height,
                                       t_ipart, height_vec, row_stride_vec,
                                       bld->static_state->pot_height,
                                       bld->static_state->wrap_t,
                                       &y_offset, &y_subcoord);
      offset = lp_build_add(&bld->uint_coord_bld, offset, y_offset);
      if (dims >= 3) {
         LLVMValueRef z_offset;
         lp_build_sample_wrap_nearest_int(bld,
                                          1, /* block length (depth) */
                                          r_ipart, depth_vec, img_stride_vec,
                                          bld->static_state->pot_depth,
                                          bld->static_state->wrap_r,
                                          &z_offset, &z_subcoord);
         offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset);
      }
      else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
         LLVMValueRef z_offset;
         /* The r coord is the cube face in [0,5] */
         z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec);
         offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset);
      }
   }

   /*
    * Fetch the pixels as 4 x 32bit (rgba order might differ):
    *
    *   rgba0 rgba1 rgba2 rgba3
    *
    * bit cast them into 16 x u8
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
    *
    * unpack them into two 8 x i16:
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1
    *   r2 g2 b2 a2 r3 g3 b3 a3
    *
    * The higher 8 bits of the resulting elements will be zero.
    */
   {
      LLVMValueRef rgba8;

      if (util_format_is_rgba8_variant(bld->format_desc)) {
         /*
          * Given the format is a rgba8, just read the pixels as is,
          * without any swizzling. Swizzling will be done later.
          */
         rgba8 = lp_build_gather(bld->builder,
                                 bld->texel_type.length,
                                 bld->format_desc->block.bits,
                                 bld->texel_type.width,
                                 data_ptr, offset);

         rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
      }
      else {
         rgba8 = lp_build_fetch_rgba_aos(bld->builder,
                                         bld->format_desc,
                                         u8n.type,
                                         data_ptr, offset,
                                         x_subcoord,
                                         y_subcoord);
      }

      /* Expand one 4*rgba8 to two 2*rgba16 */
      lp_build_unpack2(builder, u8n.type, h16.type,
                       rgba8,
                       colors_lo, colors_hi);
   }
}


/**
 * Sample a single texture image with (bi-)(tri-)linear sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                             LLVMValueRef width_vec,
                             LLVMValueRef height_vec,
                             LLVMValueRef depth_vec,
                             LLVMValueRef row_stride_vec,
                             LLVMValueRef img_stride_vec,
                             LLVMValueRef data_ptr,
                             LLVMValueRef s,
                             LLVMValueRef t,
                             LLVMValueRef r,
                             LLVMValueRef *colors_lo,
                             LLVMValueRef *colors_hi)
{
   const int dims = texture_dims(bld->static_state->target);
   LLVMBuilderRef builder = bld->builder;
   struct lp_build_context i32, h16, u8n;
   LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
   LLVMValueRef i32_c8, i32_c128, i32_c255;
   LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
   LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi;
   LLVMValueRef r_ipart, r_fpart, r_fpart_lo, r_fpart_hi;
   LLVMValueRef x_stride, y_stride, z_stride;
   LLVMValueRef x_offset0, x_offset1;
   LLVMValueRef y_offset0, y_offset1;
   LLVMValueRef z_offset0, z_offset1;
   LLVMValueRef offset[2][2][2]; /* [z][y][x] */
   LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
   LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
   LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
   LLVMValueRef packed_lo, packed_hi;
   unsigned x, y, z;
   unsigned i, j, k;
   unsigned numj, numk;

   lp_build_context_init(&i32, builder, lp_type_int_vec(32));
   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
   lp_build_context_init(&u8n, builder, lp_type_unorm(8));

   i32_vec_type = lp_build_vec_type(i32.type);
   h16_vec_type = lp_build_vec_type(h16.type);
   u8n_vec_type = lp_build_vec_type(u8n.type);

   if (bld->static_state->normalized_coords) {
      /* s = s * width, t = t * height */
      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type);
      LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec,
                                              coord_vec_type, "");
      s = lp_build_mul(&bld->coord_bld, s, fp_width);
      if (dims >= 2) {
         LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec,
                                                  coord_vec_type, "");
         t = lp_build_mul(&bld->coord_bld, t, fp_height);
      }
      if (dims >= 3) {
         LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec,
                                                 coord_vec_type, "");
         r = lp_build_mul(&bld->coord_bld, r, fp_depth);
      }
   }

   /* scale coords by 256 (8 fractional bits) */
   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
   if (dims >= 2)
      t = lp_build_mul_imm(&bld->coord_bld, t, 256);
   if (dims >= 3)
      r = lp_build_mul_imm(&bld->coord_bld, r, 256);

   /* convert float to int */
   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
   if (dims >= 2)
      t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
   if (dims >= 3)
      r = LLVMBuildFPToSI(builder, r, i32_vec_type, "");

   /* subtract 0.5 (add -128) */
   i32_c128 = lp_build_const_int_vec(i32.type, -128);
   if (!bld->static_state->force_nearest_s) {
      s = LLVMBuildAdd(builder, s, i32_c128, "");
   }
   if (dims >= 2 && !bld->static_state->force_nearest_t) {
      t = LLVMBuildAdd(builder, t, i32_c128, "");
   }
   if (dims >= 3) {
      r = LLVMBuildAdd(builder, r, i32_c128, "");
   }
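   /* The -128 bias is -0.5 in 8.8 fixed point, moving from texel-corner to
    * texel-center space.  For example, s = 1.0 scales to 256 and biases to
    * 128, i.e. ipart 0 / fpart 128: texels 0 and 1 get equal lerp weights.
    */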

   /* compute floor (shift right 8) */
   i32_c8 = lp_build_const_int_vec(i32.type, 8);
   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
   if (dims >= 2)
      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
   if (dims >= 3)
      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");

   /* compute fractional part (AND with 0xff) */
   i32_c255 = lp_build_const_int_vec(i32.type, 255);
   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
   if (dims >= 2)
      t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
   if (dims >= 3)
      r_fpart = LLVMBuildAnd(builder, r, i32_c255, "");

   /* get pixel, row and image strides */
   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
                                 bld->format_desc->block.bits/8);
   y_stride = row_stride_vec;
   z_stride = img_stride_vec;

   /* do texcoord wrapping and compute texel offsets */
   lp_build_sample_wrap_linear_int(bld,
                                   bld->format_desc->block.width,
                                   s_ipart, width_vec, x_stride,
                                   bld->static_state->pot_width,
                                   bld->static_state->wrap_s,
                                   &x_offset0, &x_offset1,
                                   &x_subcoord[0], &x_subcoord[1]);
   for (z = 0; z < 2; z++) {
      for (y = 0; y < 2; y++) {
         offset[z][y][0] = x_offset0;
         offset[z][y][1] = x_offset1;
      }
   }

   if (dims >= 2) {
      lp_build_sample_wrap_linear_int(bld,
                                      bld->format_desc->block.height,
                                      t_ipart, height_vec, y_stride,
                                      bld->static_state->pot_height,
                                      bld->static_state->wrap_t,
                                      &y_offset0, &y_offset1,
                                      &y_subcoord[0], &y_subcoord[1]);

      for (z = 0; z < 2; z++) {
         for (x = 0; x < 2; x++) {
            offset[z][0][x] = lp_build_add(&bld->uint_coord_bld,
                                           offset[z][0][x], y_offset0);
            offset[z][1][x] = lp_build_add(&bld->uint_coord_bld,
                                           offset[z][1][x], y_offset1);
         }
      }
   }

   if (dims >= 3) {
      lp_build_sample_wrap_linear_int(bld,
                                      bld->format_desc->block.height,
                                      r_ipart, depth_vec, z_stride,
                                      bld->static_state->pot_depth,
                                      bld->static_state->wrap_r,
                                      &z_offset0, &z_offset1,
                                      &z_subcoord[0], &z_subcoord[1]);
      for (y = 0; y < 2; y++) {
         for (x = 0; x < 2; x++) {
            offset[0][y][x] = lp_build_add(&bld->uint_coord_bld,
                                           offset[0][y][x], z_offset0);
            offset[1][y][x] = lp_build_add(&bld->uint_coord_bld,
                                           offset[1][y][x], z_offset1);
         }
      }
   }
   else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
      LLVMValueRef z_offset;
      z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec);
      for (y = 0; y < 2; y++) {
         for (x = 0; x < 2; x++) {
            /* The r coord is the cube face in [0,5] */
            offset[0][y][x] = lp_build_add(&bld->uint_coord_bld,
                                           offset[0][y][x], z_offset);
         }
      }
   }

   /*
    * Transform 4 x i32 in
    *
    *   s_fpart = {s0, s1, s2, s3}
    *
    * into 8 x i16 (ordering shown for big endian; the subindex logic below
    * handles little endian as well)
    *
    *   s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
    *
    * into two 8 x i16
    *
    *   s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
    *   s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
    *
    * and likewise for t_fpart. There is no risk of losing precision here
    * since the fractional parts only use the lower 8 bits.
    */
   s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
   if (dims >= 2)
      t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
   if (dims >= 3)
      r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");

   {
      LLVMTypeRef elem_type = LLVMInt32Type();
      LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
      LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
      LLVMValueRef shuffle_lo;
      LLVMValueRef shuffle_hi;

      for (j = 0; j < h16.type.length; j += 4) {
#ifdef PIPE_ARCH_LITTLE_ENDIAN
         unsigned subindex = 0;
#else
         unsigned subindex = 1;
#endif
         LLVMValueRef index;

         index = LLVMConstInt(elem_type, j/2 + subindex, 0);
         for (i = 0; i < 4; ++i)
            shuffles_lo[j + i] = index;

         index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
         for (i = 0; i < 4; ++i)
            shuffles_hi[j + i] = index;
      }

      shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
      shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
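      /* For instance, with h16.type.length == 8 on a little endian machine
       * this yields shuffle_lo = {0,0,0,0, 2,2,2,2} and
       * shuffle_hi = {4,4,4,4, 6,6,6,6}, which pick the low i16 half of
       * each original i32 lane (where the 8-bit fraction lives).
       */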

      s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
                                          shuffle_lo, "");
      s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
                                          shuffle_hi, "");
      if (dims >= 2) {
         t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
                                             shuffle_lo, "");
         t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
                                             shuffle_hi, "");
      }
      if (dims >= 3) {
         r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
                                             shuffle_lo, "");
         r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
                                             shuffle_hi, "");
      }
   }

   /*
    * Fetch the pixels as 4 x 32bit (rgba order might differ):
    *
    *   rgba0 rgba1 rgba2 rgba3
    *
    * bit cast them into 16 x u8
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
    *
    * unpack them into two 8 x i16:
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1
    *   r2 g2 b2 a2 r3 g3 b3 a3
    *
    * The higher 8 bits of the resulting elements will be zero.
    */
   numj = 1 + (dims >= 2);
   numk = 1 + (dims >= 3);

   for (k = 0; k < numk; k++) {
      for (j = 0; j < numj; j++) {
         for (i = 0; i < 2; i++) {
            LLVMValueRef rgba8;

            if (util_format_is_rgba8_variant(bld->format_desc)) {
               /*
                * Given the format is a rgba8, just read the pixels as is,
                * without any swizzling. Swizzling will be done later.
                */
               rgba8 = lp_build_gather(bld->builder,
                                       bld->texel_type.length,
                                       bld->format_desc->block.bits,
                                       bld->texel_type.width,
                                       data_ptr, offset[k][j][i]);

               rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
            }
            else {
               rgba8 = lp_build_fetch_rgba_aos(bld->builder,
                                               bld->format_desc,
                                               u8n.type,
                                               data_ptr, offset[k][j][i],
                                               x_subcoord[i],
                                               y_subcoord[j]);
            }

            /* Expand one 4*rgba8 to two 2*rgba16 */
            lp_build_unpack2(builder, u8n.type, h16.type,
                             rgba8,
                             &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
         }
      }
   }

   /*
    * Linear interpolation with 8.8 fixed point.
    */
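   /* Conceptually, each lp_build_lerp() below computes, per 16-bit lane,
    * v = v0 + ((fpart * (v1 - v0)) >> 8), with fpart in [0,256) acting as
    * the [0,1) blend weight.
    */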
   if (bld->static_state->force_nearest_s) {
      /* special case 1-D lerp, along t only */
      packed_lo = lp_build_lerp(&h16,
                                t_fpart_lo,
                                neighbors_lo[0][0][0],
                                neighbors_lo[0][1][0]);

      packed_hi = lp_build_lerp(&h16,
                                t_fpart_hi,
                                neighbors_hi[0][0][0],
                                neighbors_hi[0][1][0]);
   }
   else if (bld->static_state->force_nearest_t) {
      /* special case 1-D lerp, along s only */
      packed_lo = lp_build_lerp(&h16,
                                s_fpart_lo,
                                neighbors_lo[0][0][0],
                                neighbors_lo[0][0][1]);

      packed_hi = lp_build_lerp(&h16,
                                s_fpart_hi,
                                neighbors_hi[0][0][0],
                                neighbors_hi[0][0][1]);
   }
   else {
      /* general 1/2/3-D lerping */
      if (dims == 1) {
         packed_lo = lp_build_lerp(&h16,
                                   s_fpart_lo,
                                   neighbors_lo[0][0][0],
                                   neighbors_lo[0][0][1]);

         packed_hi = lp_build_lerp(&h16,
                                   s_fpart_hi,
                                   neighbors_hi[0][0][0],
                                   neighbors_hi[0][0][1]);
      }
      else {
         /* 2-D lerp */
         packed_lo = lp_build_lerp_2d(&h16,
                                      s_fpart_lo, t_fpart_lo,
                                      neighbors_lo[0][0][0],
                                      neighbors_lo[0][0][1],
                                      neighbors_lo[0][1][0],
                                      neighbors_lo[0][1][1]);

         packed_hi = lp_build_lerp_2d(&h16,
                                      s_fpart_hi, t_fpart_hi,
                                      neighbors_hi[0][0][0],
                                      neighbors_hi[0][0][1],
                                      neighbors_hi[0][1][0],
                                      neighbors_hi[0][1][1]);

         if (dims >= 3) {
            LLVMValueRef packed_lo2, packed_hi2;

            /* lerp in the second z slice */
            packed_lo2 = lp_build_lerp_2d(&h16,
                                          s_fpart_lo, t_fpart_lo,
                                          neighbors_lo[1][0][0],
                                          neighbors_lo[1][0][1],
                                          neighbors_lo[1][1][0],
                                          neighbors_lo[1][1][1]);

            packed_hi2 = lp_build_lerp_2d(&h16,
                                          s_fpart_hi, t_fpart_hi,
                                          neighbors_hi[1][0][0],
                                          neighbors_hi[1][0][1],
                                          neighbors_hi[1][1][0],
                                          neighbors_hi[1][1][1]);
            /* interp between two z slices */
            packed_lo = lp_build_lerp(&h16, r_fpart_lo,
                                      packed_lo, packed_lo2);
            packed_hi = lp_build_lerp(&h16, r_fpart_hi,
                                      packed_hi, packed_hi2);
         }
      }
   }

   *colors_lo = packed_lo;
   *colors_hi = packed_hi;
}


/**
 * Sample the texture/mipmap using given image filter and mip filter.
 * data0_ptr and data1_ptr point to the two mipmap levels to sample
 * from. width0/1_vec, height0/1_vec, depth0/1_vec indicate their sizes.
 * If we're using nearest miplevel sampling the '1' values will be null/unused.
 */
static void
lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                       unsigned img_filter,
                       unsigned mip_filter,
                       LLVMValueRef s,
                       LLVMValueRef t,
                       LLVMValueRef r,
                       LLVMValueRef lod_fpart,
                       LLVMValueRef width0_vec,
                       LLVMValueRef width1_vec,
                       LLVMValueRef height0_vec,
                       LLVMValueRef height1_vec,
                       LLVMValueRef depth0_vec,
                       LLVMValueRef depth1_vec,
                       LLVMValueRef row_stride0_vec,
                       LLVMValueRef row_stride1_vec,
                       LLVMValueRef img_stride0_vec,
                       LLVMValueRef img_stride1_vec,
                       LLVMValueRef data_ptr0,
                       LLVMValueRef data_ptr1,
                       LLVMValueRef *colors_lo,
                       LLVMValueRef *colors_hi)
{
   LLVMValueRef colors0_lo, colors0_hi;
   LLVMValueRef colors1_lo, colors1_hi;

   if (img_filter == PIPE_TEX_FILTER_NEAREST) {
      /* sample the first mipmap level */
      lp_build_sample_image_nearest(bld,
                                    width0_vec, height0_vec, depth0_vec,
                                    row_stride0_vec, img_stride0_vec,
                                    data_ptr0, s, t, r,
                                    &colors0_lo, &colors0_hi);

      if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
         /* sample the second mipmap level */
         lp_build_sample_image_nearest(bld,
                                       width1_vec, height1_vec, depth1_vec,
                                       row_stride1_vec, img_stride1_vec,
                                       data_ptr1, s, t, r,
                                       &colors1_lo, &colors1_hi);
      }
   }
   else {
      assert(img_filter == PIPE_TEX_FILTER_LINEAR);

      /* sample the first mipmap level */
      lp_build_sample_image_linear(bld,
                                   width0_vec, height0_vec, depth0_vec,
                                   row_stride0_vec, img_stride0_vec,
                                   data_ptr0, s, t, r,
                                   &colors0_lo, &colors0_hi);

      if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
         /* sample the second mipmap level */
         lp_build_sample_image_linear(bld,
                                      width1_vec, height1_vec, depth1_vec,
                                      row_stride1_vec, img_stride1_vec,
                                      data_ptr1, s, t, r,
                                      &colors1_lo, &colors1_hi);
      }
   }

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      /* interpolate samples from the two mipmap levels */
      struct lp_build_context h16;
      lp_build_context_init(&h16, bld->builder, lp_type_ufixed(16));

      *colors_lo = lp_build_lerp(&h16, lod_fpart,
                                 colors0_lo, colors1_lo);
      *colors_hi = lp_build_lerp(&h16, lod_fpart,
                                 colors0_hi, colors1_hi);
   }
   else {
      /* use first/only level's colors */
      *colors_lo = colors0_lo;
      *colors_hi = colors0_hi;
   }
}



/**
 * Texture sampling in AoS format. Used when sampling common 32-bit/texel
 * formats. 1D/2D/3D/cube textures are supported, as are all mipmap sampling
 * modes, but only a limited set of texture coord wrap modes.
 */
void
lp_build_sample_aos(struct lp_build_sample_context *bld,
                    unsigned unit,
                    LLVMValueRef s,
                    LLVMValueRef t,
                    LLVMValueRef r,
                    const LLVMValueRef *ddx,
                    const LLVMValueRef *ddy,
                    LLVMValueRef lod_bias, /* optional */
                    LLVMValueRef explicit_lod, /* optional */
                    LLVMValueRef width,
                    LLVMValueRef height,
                    LLVMValueRef depth,
                    LLVMValueRef width_vec,
                    LLVMValueRef height_vec,
                    LLVMValueRef depth_vec,
                    LLVMValueRef row_stride_array,
                    LLVMValueRef img_stride_array,
                    LLVMValueRef data_array,
                    LLVMValueRef texel_out[4])
{
   struct lp_build_context *float_bld = &bld->float_bld;
   LLVMBuilderRef builder = bld->builder;
   const unsigned mip_filter = bld->static_state->min_mip_filter;
   const unsigned min_filter = bld->static_state->min_img_filter;
   const unsigned mag_filter = bld->static_state->mag_img_filter;
   const int dims = texture_dims(bld->static_state->target);
   LLVMValueRef lod = NULL, lod_fpart = NULL;
   LLVMValueRef ilevel0, ilevel1 = NULL;
   LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL;
   LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL;
   LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL;
   LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL;
   LLVMValueRef data_ptr0, data_ptr1 = NULL;
   LLVMValueRef packed, packed_lo, packed_hi;
   LLVMValueRef unswizzled[4];
   LLVMValueRef face_ddx[4], face_ddy[4];
   struct lp_build_context h16;
   LLVMTypeRef h16_vec_type;

   /* we only support the common/simple wrap modes at this time */
   assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s));
   if (dims >= 2)
      assert(lp_is_simple_wrap_mode(bld->static_state->wrap_t));
   if (dims >= 3)
      assert(lp_is_simple_wrap_mode(bld->static_state->wrap_r));


   /* make 16-bit fixed-pt builder context */
   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
   h16_vec_type = lp_build_vec_type(h16.type);


   /* cube face selection, compute pre-face coords, etc. */
   if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
      LLVMValueRef face, face_s, face_t;
      lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
      s = face_s; /* vec */
      t = face_t; /* vec */
      /* use 'r' to indicate cube face */
      r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */

      /* recompute ddx, ddy using the new (s,t) face texcoords */
      face_ddx[0] = lp_build_ddx(&bld->coord_bld, s);
      face_ddx[1] = lp_build_ddx(&bld->coord_bld, t);
      face_ddx[2] = NULL;
      face_ddx[3] = NULL;
      face_ddy[0] = lp_build_ddy(&bld->coord_bld, s);
      face_ddy[1] = lp_build_ddy(&bld->coord_bld, t);
      face_ddy[2] = NULL;
      face_ddy[3] = NULL;
      ddx = face_ddx;
      ddy = face_ddy;
   }


   /*
    * Compute the level of detail (float).
    */
   if (min_filter != mag_filter ||
       mip_filter != PIPE_TEX_MIPFILTER_NONE) {
      /* Need to compute lod either to choose mipmap levels or to
       * distinguish between minification/magnification with one mipmap level.
       */
      lod = lp_build_lod_selector(bld, ddx, ddy,
                                  lod_bias, explicit_lod,
                                  width, height, depth);
   }

   /*
    * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
    * If mipfilter=linear, also compute the weight between the two
    * mipmap levels: lod_fpart
    */
   switch (mip_filter) {
   default:
      assert(0 && "bad mip_filter value in lp_build_sample_aos()");
      /* fall-through */
   case PIPE_TEX_MIPFILTER_NONE:
      /* always use mip level 0 */
      if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
         /* XXX this is a work-around for an apparent bug in LLVM 2.7.
          * We should be able to set ilevel0 = const(0) but that causes
          * bad x86 code to be emitted.
          */
         lod = lp_build_const_elem(bld->coord_bld.type, 0.0);
         lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
      }
      else {
         ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
      }
      break;
   case PIPE_TEX_MIPFILTER_NEAREST:
      assert(lod);
      lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
      break;
   case PIPE_TEX_MIPFILTER_LINEAR:
      {
         LLVMValueRef f256 = LLVMConstReal(LLVMFloatType(), 256.0);
         LLVMValueRef i255 = lp_build_const_int32(255);
         LLVMTypeRef i16_type = LLVMIntType(16);

         assert(lod);

         lp_build_linear_mip_levels(bld, unit, lod, &ilevel0, &ilevel1,
                                    &lod_fpart);
         lod_fpart = LLVMBuildFMul(builder, lod_fpart, f256, "");
         lod_fpart = lp_build_ifloor(&bld->float_bld, lod_fpart);
         lod_fpart = LLVMBuildAnd(builder, lod_fpart, i255, "");
         lod_fpart = LLVMBuildTrunc(builder, lod_fpart, i16_type, "");
         lod_fpart = lp_build_broadcast_scalar(&h16, lod_fpart);

         /* the lod_fpart values will be fixed pt values in [0,1) */
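         /* For example, lod = 2.3 would give ilevel0 = 2, ilevel1 = 3 and
          * lod_fpart = ifloor(0.3 * 256) = 76, i.e. a blend weight of
          * roughly 0.297 in 8.8 fixed point.
          */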
      }
      break;
   }

   /* compute image size(s) of source mipmap level(s) */
   lp_build_mipmap_level_sizes(bld, dims, width_vec, height_vec, depth_vec,
                               ilevel0, ilevel1,
                               row_stride_array, img_stride_array,
                               &width0_vec, &width1_vec,
                               &height0_vec, &height1_vec,
                               &depth0_vec, &depth1_vec,
                               &row_stride0_vec, &row_stride1_vec,
                               &img_stride0_vec, &img_stride1_vec);

   /*
    * Get pointer(s) to image data for mipmap level(s).
    */
   data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0);
   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      data_ptr1 = lp_build_get_mipmap_level(bld, data_array, ilevel1);
   }


   /*
    * Get/interpolate texture colors.
    */
   if (min_filter == mag_filter) {
      /* no need to distinguish between minification and magnification */
      lp_build_sample_mipmap(bld, min_filter, mip_filter,
                             s, t, r, lod_fpart,
                             width0_vec, width1_vec,
                             height0_vec, height1_vec,
                             depth0_vec, depth1_vec,
                             row_stride0_vec, row_stride1_vec,
                             img_stride0_vec, img_stride1_vec,
                             data_ptr0, data_ptr1,
                             &packed_lo, &packed_hi);
   }
   else {
      /* Emit conditional to choose min image filter or mag image filter
       * depending on the lod being >= 0 or < 0, respectively.
       */
      struct lp_build_flow_context *flow_ctx;
      struct lp_build_if_state if_ctx;
      LLVMValueRef minify;

      flow_ctx = lp_build_flow_create(builder);
      lp_build_flow_scope_begin(flow_ctx);

      packed_lo = LLVMGetUndef(h16_vec_type);
      packed_hi = LLVMGetUndef(h16_vec_type);

      lp_build_flow_scope_declare(flow_ctx, &packed_lo);
      lp_build_flow_scope_declare(flow_ctx, &packed_hi);

      /* minify = lod >= 0.0 */
      minify = LLVMBuildFCmp(builder, LLVMRealUGE,
                             lod, float_bld->zero, "");

      lp_build_if(&if_ctx, flow_ctx, builder, minify);
      {
         /* Use the minification filter */
         lp_build_sample_mipmap(bld, min_filter, mip_filter,
                                s, t, r, lod_fpart,
                                width0_vec, width1_vec,
                                height0_vec, height1_vec,
                                depth0_vec, depth1_vec,
                                row_stride0_vec, row_stride1_vec,
                                img_stride0_vec, img_stride1_vec,
                                data_ptr0, data_ptr1,
                                &packed_lo, &packed_hi);
      }
      lp_build_else(&if_ctx);
      {
         /* Use the magnification filter */
         lp_build_sample_mipmap(bld, mag_filter, mip_filter,
                                s, t, r, lod_fpart,
                                width0_vec, width1_vec,
                                height0_vec, height1_vec,
                                depth0_vec, depth1_vec,
                                row_stride0_vec, row_stride1_vec,
                                img_stride0_vec, img_stride1_vec,
                                data_ptr0, data_ptr1,
                                &packed_lo, &packed_hi);
      }
      lp_build_endif(&if_ctx);

      lp_build_flow_scope_end(flow_ctx);
      lp_build_flow_destroy(flow_ctx);
   }

   /* combine 'packed_lo', 'packed_hi' into 'packed' */
   {
      struct lp_build_context h16, u8n;

      lp_build_context_init(&h16, builder, lp_type_ufixed(16));
      lp_build_context_init(&u8n, builder, lp_type_unorm(8));

      packed = lp_build_pack2(builder, h16.type, u8n.type,
                              packed_lo, packed_hi);
   }

   /*
    * Convert to SoA and swizzle.
    */
   lp_build_rgba8_to_f32_soa(builder,
                             bld->texel_type,
                             packed, unswizzled);

   if (util_format_is_rgba8_variant(bld->format_desc)) {
      lp_build_format_swizzle_soa(bld->format_desc,
                                  &bld->texel_bld,
                                  unswizzled, texel_out);
   }
   else {
      texel_out[0] = unswizzled[0];
      texel_out[1] = unswizzled[1];
      texel_out[2] = unswizzled[2];
      texel_out[3] = unswizzled[3];
   }

   apply_sampler_swizzle(bld, texel_out);
}