1 /**************************************************************************
3 * Copyright 2009 VMware, Inc.
4 * Copyright 2007-2008 VMware, Inc.
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 **************************************************************************/
31 * Position and shader input interpolation.
33 * @author Jose Fonseca <jfonseca@vmware.com>
36 #include "pipe/p_shader_tokens.h"
37 #include "util/u_debug.h"
38 #include "util/u_memory.h"
39 #include "util/u_math.h"
40 #include "tgsi/tgsi_scan.h"
41 #include "gallivm/lp_bld_debug.h"
42 #include "gallivm/lp_bld_const.h"
43 #include "gallivm/lp_bld_arit.h"
44 #include "gallivm/lp_bld_swizzle.h"
45 #include "gallivm/lp_bld_flow.h"
46 #include "gallivm/lp_bld_logic.h"
47 #include "gallivm/lp_bld_struct.h"
48 #include "lp_bld_interp.h"
52 * The shader JIT function operates on blocks of quads.
53 * Each block has 2x2 quads and each quad has 2x2 pixels.
55 * We iterate over the quads in order 0, 1, 2, 3:
67 * If we iterate over multiple quads at once, quads 01 and 23 are processed
70 * Within each quad, we have four pixels which are represented in SOA
79 * So the green channel (for example) of the four pixels is stored in
80 * a single vector register: {g0, g1, g2, g3}.
81 * The order stays the same even with multiple quads:
89 * Do one perspective divide per quad.
91 * For perspective interpolation, the final attribute value is given
97 * a = a0 + dadx*x + dady*y
98 * w = w0 + dwdx*x + dwdy*y
99 * oow = 1/w = 1/(w0 + dwdx*x + dwdy*y)
101 * Instead of computing the division per pixel, with this macro we compute the
102 * division on the upper left pixel of each quad, and use a linear
103 * approximation in the remaining pixels, given by:
105 * da'dx = (dadx - dwdx*a)*oow
106 * da'dy = (dady - dwdy*a)*oow
108 * Ironically, this actually makes things slower -- probably because the
109 * divide hardware unit is rarely used, whereas the multiply unit is typically
/* Compile-time switch: non-zero selects the one-divide-per-quad path. */
#define PERSPECTIVE_DIVIDE_PER_QUAD 0


/*
 * Per-pixel X and Y offsets inside a 4x4 block, indexed by pixel slot.
 * Each row of four entries below is one 2x2 quad.
 */
static const unsigned char quad_offset_x[16] = {
   0, 1, 0, 1,
   2, 3, 2, 3,
   0, 1, 0, 1,
   2, 3, 2, 3
};
static const unsigned char quad_offset_y[16] = {
   0, 0, 1, 1,
   0, 0, 1, 1,
   2, 2, 3, 3,
   2, 2, 3, 3
};
120 attrib_name(LLVMValueRef val
, unsigned attrib
, unsigned chan
, const char *suffix
)
123 lp_build_name(val
, "pos.%c%s", "xyzw"[chan
], suffix
);
125 lp_build_name(val
, "input%u.%c%s", attrib
- 1, "xyzw"[chan
], suffix
);
129 calc_offsets(struct lp_build_context
*coeff_bld
,
130 unsigned quad_start_index
,
131 LLVMValueRef
*pixoffx
,
132 LLVMValueRef
*pixoffy
)
135 unsigned num_pix
= coeff_bld
->type
.length
;
136 struct gallivm_state
*gallivm
= coeff_bld
->gallivm
;
137 LLVMBuilderRef builder
= coeff_bld
->gallivm
->builder
;
138 LLVMValueRef nr
, pixxf
, pixyf
;
140 *pixoffx
= coeff_bld
->undef
;
141 *pixoffy
= coeff_bld
->undef
;
143 for (i
= 0; i
< num_pix
; i
++) {
144 nr
= lp_build_const_int32(gallivm
, i
);
145 pixxf
= lp_build_const_float(gallivm
, quad_offset_x
[i
% num_pix
] +
146 (quad_start_index
& 1) * 2);
147 pixyf
= lp_build_const_float(gallivm
, quad_offset_y
[i
% num_pix
] +
148 (quad_start_index
& 2));
149 *pixoffx
= LLVMBuildInsertElement(builder
, *pixoffx
, pixxf
, nr
, "");
150 *pixoffy
= LLVMBuildInsertElement(builder
, *pixoffy
, pixyf
, nr
, "");
155 /* Much easier, and significantly less instructions in the per-stamp
156 * part (less than half) but overall more instructions so a loss if
157 * most quads are active. Might be a win though with larger vectors.
158 * No ability to do per-quad divide (doable but not implemented)
159 * Could be made to work with passed in pixel offsets (i.e. active quad merging).
162 coeffs_init_simple(struct lp_build_interp_soa_context
*bld
,
164 LLVMValueRef dadx_ptr
,
165 LLVMValueRef dady_ptr
)
167 struct lp_build_context
*coeff_bld
= &bld
->coeff_bld
;
168 struct lp_build_context
*setup_bld
= &bld
->setup_bld
;
169 struct gallivm_state
*gallivm
= coeff_bld
->gallivm
;
170 LLVMBuilderRef builder
= gallivm
->builder
;
173 for (attrib
= 0; attrib
< bld
->num_attribs
; ++attrib
) {
175 * always fetch all 4 values for performance/simplicity
176 * Note: we do that here because it seems to generate better
177 * code. It generates a lot of moves initially but less
178 * moves later. As far as I can tell this looks like a
179 * llvm issue, instead of simply reloading the values from
180 * the passed in pointers it if it runs out of registers
181 * it spills/reloads them. Maybe some optimization passes
183 * Might want to investigate this again later.
185 const unsigned interp
= bld
->interp
[attrib
];
186 LLVMValueRef index
= lp_build_const_int32(gallivm
,
187 attrib
* TGSI_NUM_CHANNELS
);
189 LLVMValueRef dadxaos
= setup_bld
->zero
;
190 LLVMValueRef dadyaos
= setup_bld
->zero
;
191 LLVMValueRef a0aos
= setup_bld
->zero
;
194 case LP_INTERP_PERSPECTIVE
:
197 case LP_INTERP_LINEAR
:
198 ptr
= LLVMBuildGEP(builder
, dadx_ptr
, &index
, 1, "");
199 ptr
= LLVMBuildBitCast(builder
, ptr
,
200 LLVMPointerType(setup_bld
->vec_type
, 0), "");
201 dadxaos
= LLVMBuildLoad(builder
, ptr
, "");
203 ptr
= LLVMBuildGEP(builder
, dady_ptr
, &index
, 1, "");
204 ptr
= LLVMBuildBitCast(builder
, ptr
,
205 LLVMPointerType(setup_bld
->vec_type
, 0), "");
206 dadyaos
= LLVMBuildLoad(builder
, ptr
, "");
208 attrib_name(dadxaos
, attrib
, 0, ".dadxaos");
209 attrib_name(dadyaos
, attrib
, 0, ".dadyaos");
212 case LP_INTERP_CONSTANT
:
213 case LP_INTERP_FACING
:
214 ptr
= LLVMBuildGEP(builder
, a0_ptr
, &index
, 1, "");
215 ptr
= LLVMBuildBitCast(builder
, ptr
,
216 LLVMPointerType(setup_bld
->vec_type
, 0), "");
217 a0aos
= LLVMBuildLoad(builder
, ptr
, "");
218 attrib_name(a0aos
, attrib
, 0, ".a0aos");
221 case LP_INTERP_POSITION
:
222 /* Nothing to do as the position coeffs are already setup in slot 0 */
229 bld
->a0aos
[attrib
] = a0aos
;
230 bld
->dadxaos
[attrib
] = dadxaos
;
231 bld
->dadyaos
[attrib
] = dadyaos
;
236 * Interpolate the shader input attribute values.
237 * This is called for each (group of) quad(s).
240 attribs_update_simple(struct lp_build_interp_soa_context
*bld
,
241 struct gallivm_state
*gallivm
,
242 LLVMValueRef loop_iter
,
243 LLVMValueRef mask_store
,
244 LLVMValueRef sample_id
,
248 LLVMBuilderRef builder
= gallivm
->builder
;
249 struct lp_build_context
*coeff_bld
= &bld
->coeff_bld
;
250 struct lp_build_context
*setup_bld
= &bld
->setup_bld
;
251 LLVMValueRef oow
= NULL
;
253 LLVMValueRef pixoffx
;
254 LLVMValueRef pixoffy
;
257 /* could do this with code-generated passed in pixel offsets too */
260 ptr
= LLVMBuildGEP(builder
, bld
->xoffset_store
, &loop_iter
, 1, "");
261 pixoffx
= LLVMBuildLoad(builder
, ptr
, "");
262 ptr
= LLVMBuildGEP(builder
, bld
->yoffset_store
, &loop_iter
, 1, "");
263 pixoffy
= LLVMBuildLoad(builder
, ptr
, "");
265 pixoffx
= LLVMBuildFAdd(builder
, pixoffx
,
266 lp_build_broadcast_scalar(coeff_bld
, bld
->x
), "");
267 pixoffy
= LLVMBuildFAdd(builder
, pixoffy
,
268 lp_build_broadcast_scalar(coeff_bld
, bld
->y
), "");
270 for (attrib
= start
; attrib
< end
; attrib
++) {
271 const unsigned mask
= bld
->mask
[attrib
];
272 const unsigned interp
= bld
->interp
[attrib
];
273 const unsigned loc
= bld
->interp_loc
[attrib
];
276 for (chan
= 0; chan
< TGSI_NUM_CHANNELS
; chan
++) {
277 if (mask
& (1 << chan
)) {
279 LLVMValueRef dadx
= coeff_bld
->zero
;
280 LLVMValueRef dady
= coeff_bld
->zero
;
281 LLVMValueRef a
= coeff_bld
->zero
;
282 LLVMValueRef chan_pixoffx
= pixoffx
, chan_pixoffy
= pixoffy
;
284 index
= lp_build_const_int32(gallivm
, chan
);
286 case LP_INTERP_PERSPECTIVE
:
289 case LP_INTERP_LINEAR
:
290 if (attrib
== 0 && chan
== 0) {
291 dadx
= coeff_bld
->one
;
292 if (bld
->pos_offset
) {
293 a
= lp_build_const_vec(gallivm
, coeff_bld
->type
, bld
->pos_offset
);
296 else if (attrib
== 0 && chan
== 1) {
297 dady
= coeff_bld
->one
;
298 if (bld
->pos_offset
) {
299 a
= lp_build_const_vec(gallivm
, coeff_bld
->type
, bld
->pos_offset
);
303 dadx
= lp_build_extract_broadcast(gallivm
, setup_bld
->type
,
304 coeff_bld
->type
, bld
->dadxaos
[attrib
],
306 dady
= lp_build_extract_broadcast(gallivm
, setup_bld
->type
,
307 coeff_bld
->type
, bld
->dadyaos
[attrib
],
309 a
= lp_build_extract_broadcast(gallivm
, setup_bld
->type
,
310 coeff_bld
->type
, bld
->a0aos
[attrib
],
313 if (bld
->coverage_samples
> 1) {
314 LLVMValueRef xoffset
= lp_build_const_vec(gallivm
, coeff_bld
->type
, bld
->pos_offset
);
315 LLVMValueRef yoffset
= lp_build_const_vec(gallivm
, coeff_bld
->type
, bld
->pos_offset
);
316 if (loc
== TGSI_INTERPOLATE_LOC_SAMPLE
|| (attrib
== 0 && chan
== 2 && sample_id
)) {
317 LLVMValueRef x_val_idx
= LLVMBuildMul(gallivm
->builder
, sample_id
, lp_build_const_int32(gallivm
, 2), "");
318 LLVMValueRef y_val_idx
= LLVMBuildAdd(gallivm
->builder
, x_val_idx
, lp_build_const_int32(gallivm
, 1), "");
320 x_val_idx
= LLVMBuildGEP(builder
, bld
->sample_pos_array
, &x_val_idx
, 1, "");
321 y_val_idx
= LLVMBuildGEP(builder
, bld
->sample_pos_array
, &y_val_idx
, 1, "");
322 xoffset
= lp_build_broadcast_scalar(coeff_bld
, LLVMBuildLoad(builder
, x_val_idx
, ""));
323 yoffset
= lp_build_broadcast_scalar(coeff_bld
, LLVMBuildLoad(builder
, y_val_idx
, ""));
324 } else if (loc
== TGSI_INTERPOLATE_LOC_CENTROID
) {
325 LLVMValueRef centroid_x_offset
= lp_build_const_vec(gallivm
, coeff_bld
->type
, bld
->pos_offset
);
326 LLVMValueRef centroid_y_offset
= lp_build_const_vec(gallivm
, coeff_bld
->type
, bld
->pos_offset
);
328 /* for centroid find covered samples for this quad. */
329 /* if all samples are covered use pixel centers */
330 LLVMValueRef s_mask_and
= NULL
;
331 for (int s
= bld
->coverage_samples
- 1; s
>= 0; s
--) {
332 LLVMValueRef sample_cov
;
333 LLVMValueRef s_mask_idx
= LLVMBuildMul(builder
, bld
->num_loop
, lp_build_const_int32(gallivm
, s
), "");
335 s_mask_idx
= LLVMBuildAdd(builder
, s_mask_idx
, loop_iter
, "");
336 sample_cov
= lp_build_pointer_get(builder
, mask_store
, s_mask_idx
);
337 if (s
== bld
->coverage_samples
- 1)
338 s_mask_and
= sample_cov
;
340 s_mask_and
= LLVMBuildAnd(builder
, s_mask_and
, sample_cov
, "");
342 LLVMValueRef x_val_idx
= lp_build_const_int32(gallivm
, s
* 2);
343 LLVMValueRef y_val_idx
= lp_build_const_int32(gallivm
, s
* 2 + 1);
345 x_val_idx
= LLVMBuildGEP(builder
, bld
->sample_pos_array
, &x_val_idx
, 1, "");
346 y_val_idx
= LLVMBuildGEP(builder
, bld
->sample_pos_array
, &y_val_idx
, 1, "");
347 x_val_idx
= lp_build_broadcast_scalar(coeff_bld
, LLVMBuildLoad(builder
, x_val_idx
, ""));
348 y_val_idx
= lp_build_broadcast_scalar(coeff_bld
, LLVMBuildLoad(builder
, y_val_idx
, ""));
349 centroid_x_offset
= lp_build_select(coeff_bld
, sample_cov
, x_val_idx
, centroid_x_offset
);
350 centroid_y_offset
= lp_build_select(coeff_bld
, sample_cov
, y_val_idx
, centroid_y_offset
);
352 xoffset
= lp_build_select(coeff_bld
, s_mask_and
, xoffset
, centroid_x_offset
);
353 yoffset
= lp_build_select(coeff_bld
, s_mask_and
, yoffset
, centroid_y_offset
);
355 chan_pixoffx
= lp_build_add(coeff_bld
, chan_pixoffx
, xoffset
);
356 chan_pixoffy
= lp_build_add(coeff_bld
, chan_pixoffy
, yoffset
);
360 * a = a0 + (x * dadx + y * dady)
362 a
= lp_build_fmuladd(builder
, dadx
, chan_pixoffx
, a
);
363 a
= lp_build_fmuladd(builder
, dady
, chan_pixoffy
, a
);
365 if (interp
== LP_INTERP_PERSPECTIVE
) {
367 LLVMValueRef w
= bld
->attribs
[0][3];
369 assert(bld
->mask
[0] & TGSI_WRITEMASK_W
);
370 oow
= lp_build_rcp(coeff_bld
, w
);
372 a
= lp_build_mul(coeff_bld
, a
, oow
);
376 case LP_INTERP_CONSTANT
:
377 case LP_INTERP_FACING
:
378 a
= lp_build_extract_broadcast(gallivm
, setup_bld
->type
,
379 coeff_bld
->type
, bld
->a0aos
[attrib
],
383 case LP_INTERP_POSITION
:
385 a
= bld
->attribs
[0][chan
];
393 if ((attrib
== 0) && (chan
== 2) && !bld
->depth_clamp
){
394 /* FIXME: Depth values can exceed 1.0, due to the fact that
395 * setup interpolation coefficients refer to (0,0) which causes
396 * precision loss. So we must clamp to 1.0 here to avoid artifacts.
397 * Note though values outside [0,1] are perfectly valid with
398 * depth clip disabled.
399 * XXX: If depth clip is disabled but we force depth clamp
400 * we may get values larger than 1.0 in the fs (but not in
401 * depth test). Not sure if that's an issue...
402 * Also, on a similar note, it is not obvious if the depth values
403 * appearing in fs (with depth clip disabled) should be clamped
404 * to [0,1], clamped to near/far or not be clamped at all...
406 a
= lp_build_min(coeff_bld
, a
, coeff_bld
->one
);
408 bld
->attribs
[attrib
][chan
] = a
;
415 * Initialize the bld->a, dadq fields. This involves fetching
416 * those values from the arrays which are passed into the JIT function.
419 coeffs_init(struct lp_build_interp_soa_context
*bld
,
421 LLVMValueRef dadx_ptr
,
422 LLVMValueRef dady_ptr
)
424 struct lp_build_context
*coeff_bld
= &bld
->coeff_bld
;
425 struct lp_build_context
*setup_bld
= &bld
->setup_bld
;
426 struct gallivm_state
*gallivm
= coeff_bld
->gallivm
;
427 LLVMBuilderRef builder
= gallivm
->builder
;
428 LLVMValueRef pixoffx
, pixoffy
;
433 pixoffx
= coeff_bld
->undef
;
434 pixoffy
= coeff_bld
->undef
;
435 for (i
= 0; i
< coeff_bld
->type
.length
; i
++) {
436 LLVMValueRef nr
= lp_build_const_int32(gallivm
, i
);
437 LLVMValueRef pixxf
= lp_build_const_float(gallivm
, quad_offset_x
[i
]);
438 LLVMValueRef pixyf
= lp_build_const_float(gallivm
, quad_offset_y
[i
]);
439 pixoffx
= LLVMBuildInsertElement(builder
, pixoffx
, pixxf
, nr
, "");
440 pixoffy
= LLVMBuildInsertElement(builder
, pixoffy
, pixyf
, nr
, "");
444 for (attrib
= 0; attrib
< bld
->num_attribs
; ++attrib
) {
445 const unsigned mask
= bld
->mask
[attrib
];
446 const unsigned interp
= bld
->interp
[attrib
];
447 LLVMValueRef index
= lp_build_const_int32(gallivm
,
448 attrib
* TGSI_NUM_CHANNELS
);
450 LLVMValueRef dadxaos
= setup_bld
->zero
;
451 LLVMValueRef dadyaos
= setup_bld
->zero
;
452 LLVMValueRef a0aos
= setup_bld
->zero
;
454 /* always fetch all 4 values for performance/simplicity */
456 case LP_INTERP_PERSPECTIVE
:
459 case LP_INTERP_LINEAR
:
460 ptr
= LLVMBuildGEP(builder
, dadx_ptr
, &index
, 1, "");
461 ptr
= LLVMBuildBitCast(builder
, ptr
,
462 LLVMPointerType(setup_bld
->vec_type
, 0), "");
463 dadxaos
= LLVMBuildLoad(builder
, ptr
, "");
465 ptr
= LLVMBuildGEP(builder
, dady_ptr
, &index
, 1, "");
466 ptr
= LLVMBuildBitCast(builder
, ptr
,
467 LLVMPointerType(setup_bld
->vec_type
, 0), "");
468 dadyaos
= LLVMBuildLoad(builder
, ptr
, "");
470 attrib_name(dadxaos
, attrib
, 0, ".dadxaos");
471 attrib_name(dadyaos
, attrib
, 0, ".dadyaos");
474 case LP_INTERP_CONSTANT
:
475 case LP_INTERP_FACING
:
476 ptr
= LLVMBuildGEP(builder
, a0_ptr
, &index
, 1, "");
477 ptr
= LLVMBuildBitCast(builder
, ptr
,
478 LLVMPointerType(setup_bld
->vec_type
, 0), "");
479 a0aos
= LLVMBuildLoad(builder
, ptr
, "");
480 attrib_name(a0aos
, attrib
, 0, ".a0aos");
483 case LP_INTERP_POSITION
:
484 /* Nothing to do as the position coeffs are already setup in slot 0 */
493 * a = a0 + (x * dadx + y * dady)
494 * a0aos is the attrib value at top left corner of stamp
496 if (interp
!= LP_INTERP_CONSTANT
&&
497 interp
!= LP_INTERP_FACING
) {
498 LLVMValueRef x
= lp_build_broadcast_scalar(setup_bld
, bld
->x
);
499 LLVMValueRef y
= lp_build_broadcast_scalar(setup_bld
, bld
->y
);
500 a0aos
= lp_build_fmuladd(builder
, x
, dadxaos
, a0aos
);
501 a0aos
= lp_build_fmuladd(builder
, y
, dadyaos
, a0aos
);
505 * dadq = {0, dadx, dady, dadx + dady}
506 * for two quads (side by side) this is:
507 * {0, dadx, dady, dadx+dady, 2*dadx, 2*dadx+dady, 3*dadx+dady}
509 for (chan
= 0; chan
< TGSI_NUM_CHANNELS
; ++chan
) {
510 /* this generates a CRAPLOAD of shuffles... */
511 if (mask
& (1 << chan
)) {
512 LLVMValueRef dadx
, dady
;
513 LLVMValueRef dadq
, dadq2
;
515 LLVMValueRef chan_index
= lp_build_const_int32(gallivm
, chan
);
517 if (attrib
== 0 && chan
== 0) {
519 if (bld
->pos_offset
) {
520 a
= LLVMBuildFAdd(builder
, a
, lp_build_const_float(gallivm
, bld
->pos_offset
), "");
522 a
= lp_build_broadcast_scalar(coeff_bld
, a
);
523 dadx
= coeff_bld
->one
;
524 dady
= coeff_bld
->zero
;
526 else if (attrib
== 0 && chan
== 1) {
528 if (bld
->pos_offset
) {
529 a
= LLVMBuildFAdd(builder
, a
, lp_build_const_float(gallivm
, bld
->pos_offset
), "");
531 a
= lp_build_broadcast_scalar(coeff_bld
, a
);
532 dady
= coeff_bld
->one
;
533 dadx
= coeff_bld
->zero
;
536 dadx
= lp_build_extract_broadcast(gallivm
, setup_bld
->type
,
537 coeff_bld
->type
, dadxaos
, chan_index
);
538 dady
= lp_build_extract_broadcast(gallivm
, setup_bld
->type
,
539 coeff_bld
->type
, dadyaos
, chan_index
);
544 a
= lp_build_extract_broadcast(gallivm
, setup_bld
->type
,
545 coeff_bld
->type
, a0aos
, chan_index
);
548 dadx
= LLVMBuildFMul(builder
, dadx
, pixoffx
, "");
549 dady
= LLVMBuildFMul(builder
, dady
, pixoffy
, "");
550 dadq
= LLVMBuildFAdd(builder
, dadx
, dady
, "");
553 * Compute the attrib values on the upper-left corner of each
555 * Note that if we process 2 quads at once this doesn't
556 * really exactly to what we want.
557 * We need to access elem 0 and 2 respectively later if we process
561 if (interp
!= LP_INTERP_CONSTANT
&&
562 interp
!= LP_INTERP_FACING
) {
563 dadq2
= LLVMBuildFAdd(builder
, dadq
, dadq
, "");
564 a
= LLVMBuildFAdd(builder
, a
, dadq2
, "");
567 #if PERSPECTIVE_DIVIDE_PER_QUAD
573 * XXX since we're only going to access elements 0,2 out of 8
574 * if we have 8-wide vectors we should do the division only 4-wide.
575 * a is really a 2-elements in a 4-wide vector disguised as 8-wide
578 if (interp
== LP_INTERP_PERSPECTIVE
) {
579 LLVMValueRef w
= bld
->a
[0][3];
581 assert(bld
->mask
[0] & TGSI_WRITEMASK_W
);
583 bld
->oow
= lp_build_rcp(coeff_bld
, w
);
584 lp_build_name(bld
->oow
, "oow");
586 a
= lp_build_mul(coeff_bld
, a
, bld
->oow
);
590 attrib_name(a
, attrib
, chan
, ".a");
591 attrib_name(dadq
, attrib
, chan
, ".dadq");
593 bld
->a
[attrib
][chan
] = lp_build_alloca(gallivm
,
595 LLVMBuildStore(builder
, a
, bld
->a
[attrib
][chan
]);
596 bld
->dadq
[attrib
][chan
] = dadq
;
604 * Increment the shader input attribute values.
605 * This is called when we move from one quad to the next.
608 attribs_update(struct lp_build_interp_soa_context
*bld
,
609 struct gallivm_state
*gallivm
,
610 LLVMValueRef loop_iter
,
614 LLVMBuilderRef builder
= gallivm
->builder
;
615 struct lp_build_context
*coeff_bld
= &bld
->coeff_bld
;
616 LLVMValueRef oow
= NULL
;
620 for(attrib
= start
; attrib
< end
; ++attrib
) {
621 const unsigned mask
= bld
->mask
[attrib
];
622 const unsigned interp
= bld
->interp
[attrib
];
623 for(chan
= 0; chan
< TGSI_NUM_CHANNELS
; ++chan
) {
624 if(mask
& (1 << chan
)) {
626 if (interp
== LP_INTERP_CONSTANT
||
627 interp
== LP_INTERP_FACING
) {
628 a
= LLVMBuildLoad(builder
, bld
->a
[attrib
][chan
], "");
630 else if (interp
== LP_INTERP_POSITION
) {
632 a
= bld
->attribs
[0][chan
];
637 a
= bld
->a
[attrib
][chan
];
640 * Broadcast the attribute value for this quad into all elements
644 /* stored as vector load as float */
645 LLVMTypeRef ptr_type
= LLVMPointerType(LLVMFloatTypeInContext(
646 gallivm
->context
), 0);
648 a
= LLVMBuildBitCast(builder
, a
, ptr_type
, "");
649 ptr
= LLVMBuildGEP(builder
, a
, &loop_iter
, 1, "");
650 a
= LLVMBuildLoad(builder
, ptr
, "");
651 a
= lp_build_broadcast_scalar(&bld
->coeff_bld
, a
);
655 * Get the derivatives.
658 dadq
= bld
->dadq
[attrib
][chan
];
660 #if PERSPECTIVE_DIVIDE_PER_QUAD
661 if (interp
== LP_INTERP_PERSPECTIVE
) {
662 LLVMValueRef dwdq
= bld
->dadq
[0][3];
666 oow
= LLVMBuildShuffleVector(coeff_bld
->builder
,
667 bld
->oow
, coeff_bld
->undef
,
671 dadq
= lp_build_sub(coeff_bld
,
673 lp_build_mul(coeff_bld
, a
, dwdq
));
674 dadq
= lp_build_mul(coeff_bld
, dadq
, oow
);
679 * Add the derivatives
682 a
= lp_build_add(coeff_bld
, a
, dadq
);
684 #if !PERSPECTIVE_DIVIDE_PER_QUAD
685 if (interp
== LP_INTERP_PERSPECTIVE
) {
687 LLVMValueRef w
= bld
->attribs
[0][3];
689 assert(bld
->mask
[0] & TGSI_WRITEMASK_W
);
690 oow
= lp_build_rcp(coeff_bld
, w
);
692 a
= lp_build_mul(coeff_bld
, a
, oow
);
696 if (attrib
== 0 && chan
== 2 && !bld
->depth_clamp
) {
697 /* FIXME: Depth values can exceed 1.0, due to the fact that
698 * setup interpolation coefficients refer to (0,0) which causes
699 * precision loss. So we must clamp to 1.0 here to avoid artifacts.
700 * Note though values outside [0,1] are perfectly valid with
701 * depth clip disabled..
702 * XXX: If depth clip is disabled but we force depth clamp
703 * we may get values larger than 1.0 in the fs (but not in
704 * depth test). Not sure if that's an issue...
705 * Also, on a similar note, it is not obvious if the depth values
706 * appearing in fs (with depth clip disabled) should be clamped
707 * to [0,1], clamped to near/far or not be clamped at all...
709 a
= lp_build_min(coeff_bld
, a
, coeff_bld
->one
);
712 attrib_name(a
, attrib
, chan
, "");
714 bld
->attribs
[attrib
][chan
] = a
;
722 * Generate the position vectors.
724 * Parameter x0, y0 are the integer values with upper left coordinates.
727 pos_init(struct lp_build_interp_soa_context
*bld
,
731 LLVMBuilderRef builder
= bld
->coeff_bld
.gallivm
->builder
;
732 struct lp_build_context
*coeff_bld
= &bld
->coeff_bld
;
734 bld
->x
= LLVMBuildSIToFP(builder
, x0
, coeff_bld
->elem_type
, "");
735 bld
->y
= LLVMBuildSIToFP(builder
, y0
, coeff_bld
->elem_type
, "");
740 * Initialize fragment shader input attribute info.
743 lp_build_interp_soa_init(struct lp_build_interp_soa_context
*bld
,
744 struct gallivm_state
*gallivm
,
746 const struct lp_shader_input
*inputs
,
747 boolean pixel_center_integer
,
748 unsigned coverage_samples
,
749 LLVMValueRef sample_pos_array
,
750 LLVMValueRef num_loop
,
752 LLVMBuilderRef builder
,
755 LLVMValueRef dadx_ptr
,
756 LLVMValueRef dady_ptr
,
760 struct lp_type coeff_type
;
761 struct lp_type setup_type
;
765 memset(bld
, 0, sizeof *bld
);
767 memset(&coeff_type
, 0, sizeof coeff_type
);
768 coeff_type
.floating
= TRUE
;
769 coeff_type
.sign
= TRUE
;
770 coeff_type
.width
= 32;
771 coeff_type
.length
= type
.length
;
773 memset(&setup_type
, 0, sizeof setup_type
);
774 setup_type
.floating
= TRUE
;
775 setup_type
.sign
= TRUE
;
776 setup_type
.width
= 32;
777 setup_type
.length
= TGSI_NUM_CHANNELS
;
780 /* XXX: we don't support interpolating into any other types */
781 assert(memcmp(&coeff_type
, &type
, sizeof coeff_type
) == 0);
783 lp_build_context_init(&bld
->coeff_bld
, gallivm
, coeff_type
);
784 lp_build_context_init(&bld
->setup_bld
, gallivm
, setup_type
);
786 /* For convenience */
787 bld
->pos
= bld
->attribs
[0];
788 bld
->inputs
= (const LLVMValueRef (*)[TGSI_NUM_CHANNELS
]) bld
->attribs
[1];
791 bld
->mask
[0] = TGSI_WRITEMASK_XYZW
;
792 bld
->interp
[0] = LP_INTERP_LINEAR
;
793 bld
->interp_loc
[0] = 0;
796 for (attrib
= 0; attrib
< num_inputs
; ++attrib
) {
797 bld
->mask
[1 + attrib
] = inputs
[attrib
].usage_mask
;
798 bld
->interp
[1 + attrib
] = inputs
[attrib
].interp
;
799 bld
->interp_loc
[1 + attrib
] = inputs
[attrib
].location
;
801 bld
->num_attribs
= 1 + num_inputs
;
803 /* Ensure all masked out input channels have a valid value */
804 for (attrib
= 0; attrib
< bld
->num_attribs
; ++attrib
) {
805 for (chan
= 0; chan
< TGSI_NUM_CHANNELS
; ++chan
) {
806 bld
->attribs
[attrib
][chan
] = bld
->coeff_bld
.undef
;
810 if (pixel_center_integer
) {
811 bld
->pos_offset
= 0.0;
813 bld
->pos_offset
= 0.5;
815 bld
->depth_clamp
= depth_clamp
;
816 bld
->coverage_samples
= coverage_samples
;
817 bld
->num_loop
= num_loop
;
818 bld
->sample_pos_array
= sample_pos_array
;
820 pos_init(bld
, x0
, y0
);
823 * Simple method (single step interpolation) may be slower if vector length
824 * is just 4, but the results are different (generally less accurate) with
825 * the other method, so always use more accurate version.
828 bld
->simple_interp
= TRUE
;
830 /* XXX this should use a global static table */
832 unsigned num_loops
= 16 / type
.length
;
833 LLVMValueRef pixoffx
, pixoffy
, index
;
836 bld
->xoffset_store
= lp_build_array_alloca(gallivm
,
837 lp_build_vec_type(gallivm
, type
),
838 lp_build_const_int32(gallivm
, num_loops
),
840 bld
->yoffset_store
= lp_build_array_alloca(gallivm
,
841 lp_build_vec_type(gallivm
, type
),
842 lp_build_const_int32(gallivm
, num_loops
),
844 for (i
= 0; i
< num_loops
; i
++) {
845 index
= lp_build_const_int32(gallivm
, i
);
846 calc_offsets(&bld
->coeff_bld
, i
*type
.length
/4, &pixoffx
, &pixoffy
);
847 ptr
= LLVMBuildGEP(builder
, bld
->xoffset_store
, &index
, 1, "");
848 LLVMBuildStore(builder
, pixoffx
, ptr
);
849 ptr
= LLVMBuildGEP(builder
, bld
->yoffset_store
, &index
, 1, "");
850 LLVMBuildStore(builder
, pixoffy
, ptr
);
853 coeffs_init_simple(bld
, a0_ptr
, dadx_ptr
, dady_ptr
);
856 bld
->simple_interp
= FALSE
;
857 coeffs_init(bld
, a0_ptr
, dadx_ptr
, dady_ptr
);
864 * Advance the position and inputs to the given quad within the block.
868 lp_build_interp_soa_update_inputs_dyn(struct lp_build_interp_soa_context
*bld
,
869 struct gallivm_state
*gallivm
,
870 LLVMValueRef quad_start_index
,
871 LLVMValueRef mask_store
,
872 LLVMValueRef sample_id
)
874 if (bld
->simple_interp
) {
875 attribs_update_simple(bld
, gallivm
, quad_start_index
, mask_store
, sample_id
, 1, bld
->num_attribs
);
878 attribs_update(bld
, gallivm
, quad_start_index
, 1, bld
->num_attribs
);
883 lp_build_interp_soa_update_pos_dyn(struct lp_build_interp_soa_context
*bld
,
884 struct gallivm_state
*gallivm
,
885 LLVMValueRef quad_start_index
,
886 LLVMValueRef sample_id
)
888 if (bld
->simple_interp
) {
889 attribs_update_simple(bld
, gallivm
, quad_start_index
, NULL
, sample_id
, 0, 1);
892 attribs_update(bld
, gallivm
, quad_start_index
, 0, 1);