bd6bce7be2dd42d84d36a9d007a788a3f5a7b3bb
[mesa.git] / src / gallium / drivers / llvmpipe / lp_bld_interp.c
1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * Copyright 2007-2008 VMware, Inc.
5 * All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 /**
30 * @file
31 * Position and shader input interpolation.
32 *
33 * @author Jose Fonseca <jfonseca@vmware.com>
34 */
35
36 #include "pipe/p_shader_tokens.h"
37 #include "util/u_debug.h"
38 #include "util/u_memory.h"
39 #include "util/u_math.h"
40 #include "tgsi/tgsi_scan.h"
41 #include "gallivm/lp_bld_debug.h"
42 #include "gallivm/lp_bld_const.h"
43 #include "gallivm/lp_bld_arit.h"
44 #include "gallivm/lp_bld_swizzle.h"
45 #include "gallivm/lp_bld_flow.h"
46 #include "gallivm/lp_bld_logic.h"
47 #include "gallivm/lp_bld_struct.h"
48 #include "lp_bld_interp.h"
49
50
/*
 * The shader JIT function operates on blocks of quads.
 * Each block has 2x2 quads and each quad has 2x2 pixels.
 *
 * We iterate over the quads in order 0, 1, 2, 3:
 *
 * #################
 * #   |   #   |   #
 * #---0---#---1---#
 * #   |   #   |   #
 * #################
 * #   |   #   |   #
 * #---2---#---3---#
 * #   |   #   |   #
 * #################
 *
 * If we iterate over multiple quads at once, quads 01 and 23 are processed
 * together.
 *
 * Within each quad, we have four pixels which are represented in SOA
 * order:
 *
 * #########
 * # 0 | 1 #
 * #---+---#
 * # 2 | 3 #
 * #########
 *
 * So the green channel (for example) of the four pixels is stored in
 * a single vector register: {g0, g1, g2, g3}.
 * The order stays the same even with multiple quads:
 *   0 1 4 5
 *   2 3 6 7
 * is stored as g0..g7
 */
86
87
/**
 * Select one perspective divide per quad (1) instead of per pixel (0).
 *
 * For perspective interpolation, the final attribute value is given by
 *
 *    a' = a/w = a * oow
 *
 * where
 *
 *    a   = a0 + dadx*x + dady*y
 *    w   = w0 + dwdx*x + dwdy*y
 *    oow = 1/w = 1/(w0 + dwdx*x + dwdy*y)
 *
 * With this macro set, the division is computed only on the upper left
 * pixel of each quad and the remaining pixels use a linear approximation,
 * given by:
 *
 *    da'dx = (dadx - dwdx*a)*oow
 *    da'dy = (dady - dwdy*a)*oow
 *
 * Ironically, this actually makes things slower -- probably because the
 * divide hardware unit is rarely used, whereas the multiply unit is
 * typically already saturated.
 */
#define PERSPECTIVE_DIVIDE_PER_QUAD 0


/*
 * x/y pixel offsets inside a 4x4 pixel block, indexed by vector element.
 * Each row below holds the four pixels of one quad (quads 0, 1, 2, 3).
 */
static const unsigned char quad_offset_x[16] = {
   0, 1, 0, 1,
   2, 3, 2, 3,
   0, 1, 0, 1,
   2, 3, 2, 3
};
static const unsigned char quad_offset_y[16] = {
   0, 0, 1, 1,
   0, 0, 1, 1,
   2, 2, 3, 3,
   2, 2, 3, 3
};
117
118
119 static void
120 attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix)
121 {
122 if(attrib == 0)
123 lp_build_name(val, "pos.%c%s", "xyzw"[chan], suffix);
124 else
125 lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
126 }
127
128 static void
129 calc_offsets(struct lp_build_context *coeff_bld,
130 unsigned quad_start_index,
131 LLVMValueRef *pixoffx,
132 LLVMValueRef *pixoffy)
133 {
134 unsigned i;
135 unsigned num_pix = coeff_bld->type.length;
136 struct gallivm_state *gallivm = coeff_bld->gallivm;
137 LLVMBuilderRef builder = coeff_bld->gallivm->builder;
138 LLVMValueRef nr, pixxf, pixyf;
139
140 *pixoffx = coeff_bld->undef;
141 *pixoffy = coeff_bld->undef;
142
143 for (i = 0; i < num_pix; i++) {
144 nr = lp_build_const_int32(gallivm, i);
145 pixxf = lp_build_const_float(gallivm, quad_offset_x[i % num_pix] +
146 (quad_start_index & 1) * 2);
147 pixyf = lp_build_const_float(gallivm, quad_offset_y[i % num_pix] +
148 (quad_start_index & 2));
149 *pixoffx = LLVMBuildInsertElement(builder, *pixoffx, pixxf, nr, "");
150 *pixoffy = LLVMBuildInsertElement(builder, *pixoffy, pixyf, nr, "");
151 }
152 }
153
154
155 /* Much easier, and significantly less instructions in the per-stamp
156 * part (less than half) but overall more instructions so a loss if
157 * most quads are active. Might be a win though with larger vectors.
158 * No ability to do per-quad divide (doable but not implemented)
159 * Could be made to work with passed in pixel offsets (i.e. active quad merging).
160 */
161 static void
162 coeffs_init_simple(struct lp_build_interp_soa_context *bld,
163 LLVMValueRef a0_ptr,
164 LLVMValueRef dadx_ptr,
165 LLVMValueRef dady_ptr)
166 {
167 struct lp_build_context *coeff_bld = &bld->coeff_bld;
168 struct lp_build_context *setup_bld = &bld->setup_bld;
169 struct gallivm_state *gallivm = coeff_bld->gallivm;
170 LLVMBuilderRef builder = gallivm->builder;
171 unsigned attrib;
172
173 for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
174 /*
175 * always fetch all 4 values for performance/simplicity
176 * Note: we do that here because it seems to generate better
177 * code. It generates a lot of moves initially but less
178 * moves later. As far as I can tell this looks like a
179 * llvm issue, instead of simply reloading the values from
180 * the passed in pointers it if it runs out of registers
181 * it spills/reloads them. Maybe some optimization passes
182 * would help.
183 * Might want to investigate this again later.
184 */
185 const unsigned interp = bld->interp[attrib];
186 LLVMValueRef index = lp_build_const_int32(gallivm,
187 attrib * TGSI_NUM_CHANNELS);
188 LLVMValueRef ptr;
189 LLVMValueRef dadxaos = setup_bld->zero;
190 LLVMValueRef dadyaos = setup_bld->zero;
191 LLVMValueRef a0aos = setup_bld->zero;
192
193 switch (interp) {
194 case LP_INTERP_PERSPECTIVE:
195 /* fall-through */
196
197 case LP_INTERP_LINEAR:
198 ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
199 ptr = LLVMBuildBitCast(builder, ptr,
200 LLVMPointerType(setup_bld->vec_type, 0), "");
201 dadxaos = LLVMBuildLoad(builder, ptr, "");
202
203 ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
204 ptr = LLVMBuildBitCast(builder, ptr,
205 LLVMPointerType(setup_bld->vec_type, 0), "");
206 dadyaos = LLVMBuildLoad(builder, ptr, "");
207
208 attrib_name(dadxaos, attrib, 0, ".dadxaos");
209 attrib_name(dadyaos, attrib, 0, ".dadyaos");
210 /* fall-through */
211
212 case LP_INTERP_CONSTANT:
213 case LP_INTERP_FACING:
214 ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
215 ptr = LLVMBuildBitCast(builder, ptr,
216 LLVMPointerType(setup_bld->vec_type, 0), "");
217 a0aos = LLVMBuildLoad(builder, ptr, "");
218 attrib_name(a0aos, attrib, 0, ".a0aos");
219 break;
220
221 case LP_INTERP_POSITION:
222 /* Nothing to do as the position coeffs are already setup in slot 0 */
223 continue;
224
225 default:
226 assert(0);
227 break;
228 }
229 bld->a0aos[attrib] = a0aos;
230 bld->dadxaos[attrib] = dadxaos;
231 bld->dadyaos[attrib] = dadyaos;
232 }
233 }
234
235 /**
236 * Interpolate the shader input attribute values.
237 * This is called for each (group of) quad(s).
238 */
239 static void
240 attribs_update_simple(struct lp_build_interp_soa_context *bld,
241 struct gallivm_state *gallivm,
242 LLVMValueRef loop_iter,
243 LLVMValueRef mask_store,
244 LLVMValueRef sample_id,
245 int start,
246 int end)
247 {
248 LLVMBuilderRef builder = gallivm->builder;
249 struct lp_build_context *coeff_bld = &bld->coeff_bld;
250 struct lp_build_context *setup_bld = &bld->setup_bld;
251 LLVMValueRef oow = NULL;
252 unsigned attrib;
253 LLVMValueRef pixoffx;
254 LLVMValueRef pixoffy;
255 LLVMValueRef ptr;
256
257 /* could do this with code-generated passed in pixel offsets too */
258
259 assert(loop_iter);
260 ptr = LLVMBuildGEP(builder, bld->xoffset_store, &loop_iter, 1, "");
261 pixoffx = LLVMBuildLoad(builder, ptr, "");
262 ptr = LLVMBuildGEP(builder, bld->yoffset_store, &loop_iter, 1, "");
263 pixoffy = LLVMBuildLoad(builder, ptr, "");
264
265 pixoffx = LLVMBuildFAdd(builder, pixoffx,
266 lp_build_broadcast_scalar(coeff_bld, bld->x), "");
267 pixoffy = LLVMBuildFAdd(builder, pixoffy,
268 lp_build_broadcast_scalar(coeff_bld, bld->y), "");
269
270 for (attrib = start; attrib < end; attrib++) {
271 const unsigned mask = bld->mask[attrib];
272 const unsigned interp = bld->interp[attrib];
273 const unsigned loc = bld->interp_loc[attrib];
274 unsigned chan;
275
276 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
277 if (mask & (1 << chan)) {
278 LLVMValueRef index;
279 LLVMValueRef dadx = coeff_bld->zero;
280 LLVMValueRef dady = coeff_bld->zero;
281 LLVMValueRef a = coeff_bld->zero;
282 LLVMValueRef chan_pixoffx = pixoffx, chan_pixoffy = pixoffy;
283
284 index = lp_build_const_int32(gallivm, chan);
285 switch (interp) {
286 case LP_INTERP_PERSPECTIVE:
287 /* fall-through */
288
289 case LP_INTERP_LINEAR:
290 if (attrib == 0 && chan == 0) {
291 dadx = coeff_bld->one;
292 if (bld->pos_offset) {
293 a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
294 }
295 }
296 else if (attrib == 0 && chan == 1) {
297 dady = coeff_bld->one;
298 if (bld->pos_offset) {
299 a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
300 }
301 }
302 else {
303 dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
304 coeff_bld->type, bld->dadxaos[attrib],
305 index);
306 dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
307 coeff_bld->type, bld->dadyaos[attrib],
308 index);
309 a = lp_build_extract_broadcast(gallivm, setup_bld->type,
310 coeff_bld->type, bld->a0aos[attrib],
311 index);
312
313 if (bld->coverage_samples > 1) {
314 LLVMValueRef xoffset = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
315 LLVMValueRef yoffset = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
316 if (loc == TGSI_INTERPOLATE_LOC_SAMPLE || (attrib == 0 && chan == 2 && sample_id)) {
317 LLVMValueRef x_val_idx = LLVMBuildMul(gallivm->builder, sample_id, lp_build_const_int32(gallivm, 2), "");
318 LLVMValueRef y_val_idx = LLVMBuildAdd(gallivm->builder, x_val_idx, lp_build_const_int32(gallivm, 1), "");
319
320 x_val_idx = LLVMBuildGEP(builder, bld->sample_pos_array, &x_val_idx, 1, "");
321 y_val_idx = LLVMBuildGEP(builder, bld->sample_pos_array, &y_val_idx, 1, "");
322 xoffset = lp_build_broadcast_scalar(coeff_bld, LLVMBuildLoad(builder, x_val_idx, ""));
323 yoffset = lp_build_broadcast_scalar(coeff_bld, LLVMBuildLoad(builder, y_val_idx, ""));
324 } else if (loc == TGSI_INTERPOLATE_LOC_CENTROID) {
325 LLVMValueRef centroid_x_offset = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
326 LLVMValueRef centroid_y_offset = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
327
328 /* for centroid find covered samples for this quad. */
329 /* if all samples are covered use pixel centers */
330 LLVMValueRef s_mask_and = NULL;
331 for (int s = bld->coverage_samples - 1; s >= 0; s--) {
332 LLVMValueRef sample_cov;
333 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, bld->num_loop, lp_build_const_int32(gallivm, s), "");
334
335 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_iter, "");
336 sample_cov = lp_build_pointer_get(builder, mask_store, s_mask_idx);
337 if (s == bld->coverage_samples - 1)
338 s_mask_and = sample_cov;
339 else
340 s_mask_and = LLVMBuildAnd(builder, s_mask_and, sample_cov, "");
341
342 LLVMValueRef x_val_idx = lp_build_const_int32(gallivm, s * 2);
343 LLVMValueRef y_val_idx = lp_build_const_int32(gallivm, s * 2 + 1);
344
345 x_val_idx = LLVMBuildGEP(builder, bld->sample_pos_array, &x_val_idx, 1, "");
346 y_val_idx = LLVMBuildGEP(builder, bld->sample_pos_array, &y_val_idx, 1, "");
347 x_val_idx = lp_build_broadcast_scalar(coeff_bld, LLVMBuildLoad(builder, x_val_idx, ""));
348 y_val_idx = lp_build_broadcast_scalar(coeff_bld, LLVMBuildLoad(builder, y_val_idx, ""));
349 centroid_x_offset = lp_build_select(coeff_bld, sample_cov, x_val_idx, centroid_x_offset);
350 centroid_y_offset = lp_build_select(coeff_bld, sample_cov, y_val_idx, centroid_y_offset);
351 }
352 xoffset = lp_build_select(coeff_bld, s_mask_and, xoffset, centroid_x_offset);
353 yoffset = lp_build_select(coeff_bld, s_mask_and, yoffset, centroid_y_offset);
354 }
355 chan_pixoffx = lp_build_add(coeff_bld, chan_pixoffx, xoffset);
356 chan_pixoffy = lp_build_add(coeff_bld, chan_pixoffy, yoffset);
357 }
358 }
359 /*
360 * a = a0 + (x * dadx + y * dady)
361 */
362 a = lp_build_fmuladd(builder, dadx, chan_pixoffx, a);
363 a = lp_build_fmuladd(builder, dady, chan_pixoffy, a);
364
365 if (interp == LP_INTERP_PERSPECTIVE) {
366 if (oow == NULL) {
367 LLVMValueRef w = bld->attribs[0][3];
368 assert(attrib != 0);
369 assert(bld->mask[0] & TGSI_WRITEMASK_W);
370 oow = lp_build_rcp(coeff_bld, w);
371 }
372 a = lp_build_mul(coeff_bld, a, oow);
373 }
374 break;
375
376 case LP_INTERP_CONSTANT:
377 case LP_INTERP_FACING:
378 a = lp_build_extract_broadcast(gallivm, setup_bld->type,
379 coeff_bld->type, bld->a0aos[attrib],
380 index);
381 break;
382
383 case LP_INTERP_POSITION:
384 assert(attrib > 0);
385 a = bld->attribs[0][chan];
386 break;
387
388 default:
389 assert(0);
390 break;
391 }
392
393 if ((attrib == 0) && (chan == 2) && !bld->depth_clamp){
394 /* FIXME: Depth values can exceed 1.0, due to the fact that
395 * setup interpolation coefficients refer to (0,0) which causes
396 * precision loss. So we must clamp to 1.0 here to avoid artifacts.
397 * Note though values outside [0,1] are perfectly valid with
398 * depth clip disabled.
399 * XXX: If depth clip is disabled but we force depth clamp
400 * we may get values larger than 1.0 in the fs (but not in
401 * depth test). Not sure if that's an issue...
402 * Also, on a similar note, it is not obvious if the depth values
403 * appearing in fs (with depth clip disabled) should be clamped
404 * to [0,1], clamped to near/far or not be clamped at all...
405 */
406 a = lp_build_min(coeff_bld, a, coeff_bld->one);
407 }
408 bld->attribs[attrib][chan] = a;
409 }
410 }
411 }
412 }
413
414 /**
415 * Initialize the bld->a, dadq fields. This involves fetching
416 * those values from the arrays which are passed into the JIT function.
417 */
418 static void
419 coeffs_init(struct lp_build_interp_soa_context *bld,
420 LLVMValueRef a0_ptr,
421 LLVMValueRef dadx_ptr,
422 LLVMValueRef dady_ptr)
423 {
424 struct lp_build_context *coeff_bld = &bld->coeff_bld;
425 struct lp_build_context *setup_bld = &bld->setup_bld;
426 struct gallivm_state *gallivm = coeff_bld->gallivm;
427 LLVMBuilderRef builder = gallivm->builder;
428 LLVMValueRef pixoffx, pixoffy;
429 unsigned attrib;
430 unsigned chan;
431 unsigned i;
432
433 pixoffx = coeff_bld->undef;
434 pixoffy = coeff_bld->undef;
435 for (i = 0; i < coeff_bld->type.length; i++) {
436 LLVMValueRef nr = lp_build_const_int32(gallivm, i);
437 LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i]);
438 LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i]);
439 pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, "");
440 pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, "");
441 }
442
443
444 for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
445 const unsigned mask = bld->mask[attrib];
446 const unsigned interp = bld->interp[attrib];
447 LLVMValueRef index = lp_build_const_int32(gallivm,
448 attrib * TGSI_NUM_CHANNELS);
449 LLVMValueRef ptr;
450 LLVMValueRef dadxaos = setup_bld->zero;
451 LLVMValueRef dadyaos = setup_bld->zero;
452 LLVMValueRef a0aos = setup_bld->zero;
453
454 /* always fetch all 4 values for performance/simplicity */
455 switch (interp) {
456 case LP_INTERP_PERSPECTIVE:
457 /* fall-through */
458
459 case LP_INTERP_LINEAR:
460 ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
461 ptr = LLVMBuildBitCast(builder, ptr,
462 LLVMPointerType(setup_bld->vec_type, 0), "");
463 dadxaos = LLVMBuildLoad(builder, ptr, "");
464
465 ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
466 ptr = LLVMBuildBitCast(builder, ptr,
467 LLVMPointerType(setup_bld->vec_type, 0), "");
468 dadyaos = LLVMBuildLoad(builder, ptr, "");
469
470 attrib_name(dadxaos, attrib, 0, ".dadxaos");
471 attrib_name(dadyaos, attrib, 0, ".dadyaos");
472 /* fall-through */
473
474 case LP_INTERP_CONSTANT:
475 case LP_INTERP_FACING:
476 ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
477 ptr = LLVMBuildBitCast(builder, ptr,
478 LLVMPointerType(setup_bld->vec_type, 0), "");
479 a0aos = LLVMBuildLoad(builder, ptr, "");
480 attrib_name(a0aos, attrib, 0, ".a0aos");
481 break;
482
483 case LP_INTERP_POSITION:
484 /* Nothing to do as the position coeffs are already setup in slot 0 */
485 continue;
486
487 default:
488 assert(0);
489 break;
490 }
491
492 /*
493 * a = a0 + (x * dadx + y * dady)
494 * a0aos is the attrib value at top left corner of stamp
495 */
496 if (interp != LP_INTERP_CONSTANT &&
497 interp != LP_INTERP_FACING) {
498 LLVMValueRef x = lp_build_broadcast_scalar(setup_bld, bld->x);
499 LLVMValueRef y = lp_build_broadcast_scalar(setup_bld, bld->y);
500 a0aos = lp_build_fmuladd(builder, x, dadxaos, a0aos);
501 a0aos = lp_build_fmuladd(builder, y, dadyaos, a0aos);
502 }
503
504 /*
505 * dadq = {0, dadx, dady, dadx + dady}
506 * for two quads (side by side) this is:
507 * {0, dadx, dady, dadx+dady, 2*dadx, 2*dadx+dady, 3*dadx+dady}
508 */
509 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
510 /* this generates a CRAPLOAD of shuffles... */
511 if (mask & (1 << chan)) {
512 LLVMValueRef dadx, dady;
513 LLVMValueRef dadq, dadq2;
514 LLVMValueRef a;
515 LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan);
516
517 if (attrib == 0 && chan == 0) {
518 a = bld->x;
519 if (bld->pos_offset) {
520 a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), "");
521 }
522 a = lp_build_broadcast_scalar(coeff_bld, a);
523 dadx = coeff_bld->one;
524 dady = coeff_bld->zero;
525 }
526 else if (attrib == 0 && chan == 1) {
527 a = bld->y;
528 if (bld->pos_offset) {
529 a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), "");
530 }
531 a = lp_build_broadcast_scalar(coeff_bld, a);
532 dady = coeff_bld->one;
533 dadx = coeff_bld->zero;
534 }
535 else {
536 dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
537 coeff_bld->type, dadxaos, chan_index);
538 dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
539 coeff_bld->type, dadyaos, chan_index);
540
541 /*
542 * a = {a, a, a, a}
543 */
544 a = lp_build_extract_broadcast(gallivm, setup_bld->type,
545 coeff_bld->type, a0aos, chan_index);
546 }
547
548 dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
549 dady = LLVMBuildFMul(builder, dady, pixoffy, "");
550 dadq = LLVMBuildFAdd(builder, dadx, dady, "");
551
552 /*
553 * Compute the attrib values on the upper-left corner of each
554 * group of quads.
555 * Note that if we process 2 quads at once this doesn't
556 * really exactly to what we want.
557 * We need to access elem 0 and 2 respectively later if we process
558 * 2 quads at once.
559 */
560
561 if (interp != LP_INTERP_CONSTANT &&
562 interp != LP_INTERP_FACING) {
563 dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
564 a = LLVMBuildFAdd(builder, a, dadq2, "");
565 }
566
567 #if PERSPECTIVE_DIVIDE_PER_QUAD
568 /*
569 * a *= 1 / w
570 */
571
572 /*
573 * XXX since we're only going to access elements 0,2 out of 8
574 * if we have 8-wide vectors we should do the division only 4-wide.
575 * a is really a 2-elements in a 4-wide vector disguised as 8-wide
576 * in this case.
577 */
578 if (interp == LP_INTERP_PERSPECTIVE) {
579 LLVMValueRef w = bld->a[0][3];
580 assert(attrib != 0);
581 assert(bld->mask[0] & TGSI_WRITEMASK_W);
582 if (!bld->oow) {
583 bld->oow = lp_build_rcp(coeff_bld, w);
584 lp_build_name(bld->oow, "oow");
585 }
586 a = lp_build_mul(coeff_bld, a, bld->oow);
587 }
588 #endif
589
590 attrib_name(a, attrib, chan, ".a");
591 attrib_name(dadq, attrib, chan, ".dadq");
592
593 bld->a[attrib][chan] = lp_build_alloca(gallivm,
594 LLVMTypeOf(a), "");
595 LLVMBuildStore(builder, a, bld->a[attrib][chan]);
596 bld->dadq[attrib][chan] = dadq;
597 }
598 }
599 }
600 }
601
602
603 /**
604 * Increment the shader input attribute values.
605 * This is called when we move from one quad to the next.
606 */
607 static void
608 attribs_update(struct lp_build_interp_soa_context *bld,
609 struct gallivm_state *gallivm,
610 LLVMValueRef loop_iter,
611 int start,
612 int end)
613 {
614 LLVMBuilderRef builder = gallivm->builder;
615 struct lp_build_context *coeff_bld = &bld->coeff_bld;
616 LLVMValueRef oow = NULL;
617 unsigned attrib;
618 unsigned chan;
619
620 for(attrib = start; attrib < end; ++attrib) {
621 const unsigned mask = bld->mask[attrib];
622 const unsigned interp = bld->interp[attrib];
623 for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
624 if(mask & (1 << chan)) {
625 LLVMValueRef a;
626 if (interp == LP_INTERP_CONSTANT ||
627 interp == LP_INTERP_FACING) {
628 a = LLVMBuildLoad(builder, bld->a[attrib][chan], "");
629 }
630 else if (interp == LP_INTERP_POSITION) {
631 assert(attrib > 0);
632 a = bld->attribs[0][chan];
633 }
634 else {
635 LLVMValueRef dadq;
636
637 a = bld->a[attrib][chan];
638
639 /*
640 * Broadcast the attribute value for this quad into all elements
641 */
642
643 {
644 /* stored as vector load as float */
645 LLVMTypeRef ptr_type = LLVMPointerType(LLVMFloatTypeInContext(
646 gallivm->context), 0);
647 LLVMValueRef ptr;
648 a = LLVMBuildBitCast(builder, a, ptr_type, "");
649 ptr = LLVMBuildGEP(builder, a, &loop_iter, 1, "");
650 a = LLVMBuildLoad(builder, ptr, "");
651 a = lp_build_broadcast_scalar(&bld->coeff_bld, a);
652 }
653
654 /*
655 * Get the derivatives.
656 */
657
658 dadq = bld->dadq[attrib][chan];
659
660 #if PERSPECTIVE_DIVIDE_PER_QUAD
661 if (interp == LP_INTERP_PERSPECTIVE) {
662 LLVMValueRef dwdq = bld->dadq[0][3];
663
664 if (oow == NULL) {
665 assert(bld->oow);
666 oow = LLVMBuildShuffleVector(coeff_bld->builder,
667 bld->oow, coeff_bld->undef,
668 shuffle, "");
669 }
670
671 dadq = lp_build_sub(coeff_bld,
672 dadq,
673 lp_build_mul(coeff_bld, a, dwdq));
674 dadq = lp_build_mul(coeff_bld, dadq, oow);
675 }
676 #endif
677
678 /*
679 * Add the derivatives
680 */
681
682 a = lp_build_add(coeff_bld, a, dadq);
683
684 #if !PERSPECTIVE_DIVIDE_PER_QUAD
685 if (interp == LP_INTERP_PERSPECTIVE) {
686 if (oow == NULL) {
687 LLVMValueRef w = bld->attribs[0][3];
688 assert(attrib != 0);
689 assert(bld->mask[0] & TGSI_WRITEMASK_W);
690 oow = lp_build_rcp(coeff_bld, w);
691 }
692 a = lp_build_mul(coeff_bld, a, oow);
693 }
694 #endif
695
696 if (attrib == 0 && chan == 2 && !bld->depth_clamp) {
697 /* FIXME: Depth values can exceed 1.0, due to the fact that
698 * setup interpolation coefficients refer to (0,0) which causes
699 * precision loss. So we must clamp to 1.0 here to avoid artifacts.
700 * Note though values outside [0,1] are perfectly valid with
701 * depth clip disabled..
702 * XXX: If depth clip is disabled but we force depth clamp
703 * we may get values larger than 1.0 in the fs (but not in
704 * depth test). Not sure if that's an issue...
705 * Also, on a similar note, it is not obvious if the depth values
706 * appearing in fs (with depth clip disabled) should be clamped
707 * to [0,1], clamped to near/far or not be clamped at all...
708 */
709 a = lp_build_min(coeff_bld, a, coeff_bld->one);
710 }
711
712 attrib_name(a, attrib, chan, "");
713 }
714 bld->attribs[attrib][chan] = a;
715 }
716 }
717 }
718 }
719
720
721 /**
722 * Generate the position vectors.
723 *
724 * Parameter x0, y0 are the integer values with upper left coordinates.
725 */
726 static void
727 pos_init(struct lp_build_interp_soa_context *bld,
728 LLVMValueRef x0,
729 LLVMValueRef y0)
730 {
731 LLVMBuilderRef builder = bld->coeff_bld.gallivm->builder;
732 struct lp_build_context *coeff_bld = &bld->coeff_bld;
733
734 bld->x = LLVMBuildSIToFP(builder, x0, coeff_bld->elem_type, "");
735 bld->y = LLVMBuildSIToFP(builder, y0, coeff_bld->elem_type, "");
736 }
737
738
739 /**
740 * Initialize fragment shader input attribute info.
741 */
742 void
743 lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
744 struct gallivm_state *gallivm,
745 unsigned num_inputs,
746 const struct lp_shader_input *inputs,
747 boolean pixel_center_integer,
748 unsigned coverage_samples,
749 LLVMValueRef sample_pos_array,
750 LLVMValueRef num_loop,
751 boolean depth_clamp,
752 LLVMBuilderRef builder,
753 struct lp_type type,
754 LLVMValueRef a0_ptr,
755 LLVMValueRef dadx_ptr,
756 LLVMValueRef dady_ptr,
757 LLVMValueRef x0,
758 LLVMValueRef y0)
759 {
760 struct lp_type coeff_type;
761 struct lp_type setup_type;
762 unsigned attrib;
763 unsigned chan;
764
765 memset(bld, 0, sizeof *bld);
766
767 memset(&coeff_type, 0, sizeof coeff_type);
768 coeff_type.floating = TRUE;
769 coeff_type.sign = TRUE;
770 coeff_type.width = 32;
771 coeff_type.length = type.length;
772
773 memset(&setup_type, 0, sizeof setup_type);
774 setup_type.floating = TRUE;
775 setup_type.sign = TRUE;
776 setup_type.width = 32;
777 setup_type.length = TGSI_NUM_CHANNELS;
778
779
780 /* XXX: we don't support interpolating into any other types */
781 assert(memcmp(&coeff_type, &type, sizeof coeff_type) == 0);
782
783 lp_build_context_init(&bld->coeff_bld, gallivm, coeff_type);
784 lp_build_context_init(&bld->setup_bld, gallivm, setup_type);
785
786 /* For convenience */
787 bld->pos = bld->attribs[0];
788 bld->inputs = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) bld->attribs[1];
789
790 /* Position */
791 bld->mask[0] = TGSI_WRITEMASK_XYZW;
792 bld->interp[0] = LP_INTERP_LINEAR;
793 bld->interp_loc[0] = 0;
794
795 /* Inputs */
796 for (attrib = 0; attrib < num_inputs; ++attrib) {
797 bld->mask[1 + attrib] = inputs[attrib].usage_mask;
798 bld->interp[1 + attrib] = inputs[attrib].interp;
799 bld->interp_loc[1 + attrib] = inputs[attrib].location;
800 }
801 bld->num_attribs = 1 + num_inputs;
802
803 /* Ensure all masked out input channels have a valid value */
804 for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
805 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
806 bld->attribs[attrib][chan] = bld->coeff_bld.undef;
807 }
808 }
809
810 if (pixel_center_integer) {
811 bld->pos_offset = 0.0;
812 } else {
813 bld->pos_offset = 0.5;
814 }
815 bld->depth_clamp = depth_clamp;
816 bld->coverage_samples = coverage_samples;
817 bld->num_loop = num_loop;
818 bld->sample_pos_array = sample_pos_array;
819
820 pos_init(bld, x0, y0);
821
822 /*
823 * Simple method (single step interpolation) may be slower if vector length
824 * is just 4, but the results are different (generally less accurate) with
825 * the other method, so always use more accurate version.
826 */
827 if (1) {
828 bld->simple_interp = TRUE;
829 {
830 /* XXX this should use a global static table */
831 unsigned i;
832 unsigned num_loops = 16 / type.length;
833 LLVMValueRef pixoffx, pixoffy, index;
834 LLVMValueRef ptr;
835
836 bld->xoffset_store = lp_build_array_alloca(gallivm,
837 lp_build_vec_type(gallivm, type),
838 lp_build_const_int32(gallivm, num_loops),
839 "");
840 bld->yoffset_store = lp_build_array_alloca(gallivm,
841 lp_build_vec_type(gallivm, type),
842 lp_build_const_int32(gallivm, num_loops),
843 "");
844 for (i = 0; i < num_loops; i++) {
845 index = lp_build_const_int32(gallivm, i);
846 calc_offsets(&bld->coeff_bld, i*type.length/4, &pixoffx, &pixoffy);
847 ptr = LLVMBuildGEP(builder, bld->xoffset_store, &index, 1, "");
848 LLVMBuildStore(builder, pixoffx, ptr);
849 ptr = LLVMBuildGEP(builder, bld->yoffset_store, &index, 1, "");
850 LLVMBuildStore(builder, pixoffy, ptr);
851 }
852 }
853 coeffs_init_simple(bld, a0_ptr, dadx_ptr, dady_ptr);
854 }
855 else {
856 bld->simple_interp = FALSE;
857 coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
858 }
859
860 }
861
862
863 /*
864 * Advance the position and inputs to the given quad within the block.
865 */
866
867 void
868 lp_build_interp_soa_update_inputs_dyn(struct lp_build_interp_soa_context *bld,
869 struct gallivm_state *gallivm,
870 LLVMValueRef quad_start_index,
871 LLVMValueRef mask_store,
872 LLVMValueRef sample_id)
873 {
874 if (bld->simple_interp) {
875 attribs_update_simple(bld, gallivm, quad_start_index, mask_store, sample_id, 1, bld->num_attribs);
876 }
877 else {
878 attribs_update(bld, gallivm, quad_start_index, 1, bld->num_attribs);
879 }
880 }
881
882 void
883 lp_build_interp_soa_update_pos_dyn(struct lp_build_interp_soa_context *bld,
884 struct gallivm_state *gallivm,
885 LLVMValueRef quad_start_index,
886 LLVMValueRef sample_id)
887 {
888 if (bld->simple_interp) {
889 attribs_update_simple(bld, gallivm, quad_start_index, NULL, sample_id, 0, 1);
890 }
891 else {
892 attribs_update(bld, gallivm, quad_start_index, 0, 1);
893 }
894 }
895