/**************************************************************************
*
* Copyright 2009 VMware, Inc.
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2007-2008 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#include "util/u_debug.h"
#include "util/u_memory.h"
#include "util/u_math.h"
-#include "tgsi/tgsi_parse.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_const.h"
-#include "lp_bld_arit.h"
-#include "lp_bld_swizzle.h"
+#include "tgsi/tgsi_scan.h"
+#include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_swizzle.h"
+#include "gallivm/lp_bld_flow.h"
#include "lp_bld_interp.h"
+/*
+ * The shader JIT function operates on blocks of quads.
+ * Each block has 2x2 quads and each quad has 2x2 pixels.
+ *
+ * We iterate over the quads in order 0, 1, 2, 3:
+ *
+ * #################
+ * # | # | #
+ * #---0---#---1---#
+ * # | # | #
+ * #################
+ * # | # | #
+ * #---2---#---3---#
+ * # | # | #
+ * #################
+ *
+ * If we iterate over multiple quads at once, quads 01 and 23 are processed
+ * together.
+ *
+ * Within each quad, we have four pixels which are represented in SOA
+ * order:
+ *
+ * #########
+ * # 0 | 1 #
+ * #---+---#
+ * # 2 | 3 #
+ * #########
+ *
+ * So the green channel (for example) of the four pixels is stored in
+ * a single vector register: {g0, g1, g2, g3}.
+ * The order stays the same even with multiple quads:
+ * 0 1 4 5
+ * 2 3 6 7
+ * is stored as g0..g7
+ */
+
+
+/**
+ * Do one perspective divide per quad.
+ *
+ * For perspective interpolation, the final attribute value is given
+ *
+ * a' = a/w = a * oow
+ *
+ * where
+ *
+ * a = a0 + dadx*x + dady*y
+ * w = w0 + dwdx*x + dwdy*y
+ * oow = 1/w = 1/(w0 + dwdx*x + dwdy*y)
+ *
+ * Instead of computing the division per pixel, with this macro we compute the
+ * division on the upper left pixel of each quad, and use a linear
+ * approximation in the remaining pixels, given by:
+ *
+ * da'dx = (dadx - dwdx*a)*oow
+ * da'dy = (dady - dwdy*a)*oow
+ *
+ * Ironically, this actually makes things slower -- probably because the
+ * divide hardware unit is rarely used, whereas the multiply unit is typically
+ * already saturated.
+ */
+#define PERSPECTIVE_DIVIDE_PER_QUAD 0
+
+
+static const unsigned char quad_offset_x[16] = {0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3};
+static const unsigned char quad_offset_y[16] = {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3};
+
+
static void
attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix)
{
lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
}
+static void
+calc_offsets(struct lp_build_context *coeff_bld,
+ unsigned quad_start_index,
+ LLVMValueRef *pixoffx,
+ LLVMValueRef *pixoffy)
+{
+ unsigned i;
+ unsigned num_pix = coeff_bld->type.length;
+ struct gallivm_state *gallivm = coeff_bld->gallivm;
+ LLVMBuilderRef builder = coeff_bld->gallivm->builder;
+ LLVMValueRef nr, pixxf, pixyf;
+
+ *pixoffx = coeff_bld->undef;
+ *pixoffy = coeff_bld->undef;
+
+ for (i = 0; i < num_pix; i++) {
+ nr = lp_build_const_int32(gallivm, i);
+ pixxf = lp_build_const_float(gallivm, quad_offset_x[i % num_pix] +
+ (quad_start_index & 1) * 2);
+ pixyf = lp_build_const_float(gallivm, quad_offset_y[i % num_pix] +
+ (quad_start_index & 2));
+ *pixoffx = LLVMBuildInsertElement(builder, *pixoffx, pixxf, nr, "");
+ *pixoffy = LLVMBuildInsertElement(builder, *pixoffy, pixyf, nr, "");
+ }
+}
+
+/* Much easier, and significantly fewer instructions in the per-stamp
+ * part (less than half) but overall more instructions so a loss if
+ * most quads are active. Might be a win though with larger vectors.
+ * No ability to do per-quad divide (doable but not implemented)
+ * Could be made to work with passed in pixel offsets (i.e. active quad merging).
+ */
static void
-coeffs_init(struct lp_build_interp_soa_context *bld,
- LLVMValueRef a0_ptr,
- LLVMValueRef dadx_ptr,
- LLVMValueRef dady_ptr)
+coeffs_init_simple(struct lp_build_interp_soa_context *bld,
+ LLVMValueRef a0_ptr,
+ LLVMValueRef dadx_ptr,
+ LLVMValueRef dady_ptr)
{
- LLVMBuilderRef builder = bld->base.builder;
+ struct lp_build_context *coeff_bld = &bld->coeff_bld;
+ struct lp_build_context *setup_bld = &bld->setup_bld;
+ struct gallivm_state *gallivm = coeff_bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
unsigned attrib;
- unsigned chan;
- for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
- unsigned mask = bld->mask[attrib];
- unsigned mode = bld->mode[attrib];
- for(chan = 0; chan < NUM_CHANNELS; ++chan) {
- if(mask & (1 << chan)) {
- LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), attrib*NUM_CHANNELS + chan, 0);
- LLVMValueRef a0 = NULL;
- LLVMValueRef dadx = NULL;
- LLVMValueRef dady = NULL;
+ for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
+ /*
+ * always fetch all 4 values for performance/simplicity
+ * Note: we do that here because it seems to generate better
+ * code. It generates a lot of moves initially but less
+ * moves later. As far as I can tell this looks like a
+ * llvm issue, instead of simply reloading the values from
+    * the passed in pointers when it runs out of registers
+ * it spills/reloads them. Maybe some optimization passes
+ * would help.
+ * Might want to investigate this again later.
+ */
+ const unsigned interp = bld->interp[attrib];
+ LLVMValueRef index = lp_build_const_int32(gallivm,
+ attrib * TGSI_NUM_CHANNELS);
+ LLVMValueRef ptr;
+ LLVMValueRef dadxaos = setup_bld->zero;
+ LLVMValueRef dadyaos = setup_bld->zero;
+ LLVMValueRef a0aos = setup_bld->zero;
+
+ switch (interp) {
+ case LP_INTERP_PERSPECTIVE:
+ /* fall-through */
+
+ case LP_INTERP_LINEAR:
+ ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
+ ptr = LLVMBuildBitCast(builder, ptr,
+ LLVMPointerType(setup_bld->vec_type, 0), "");
+ dadxaos = LLVMBuildLoad(builder, ptr, "");
+
+ ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
+ ptr = LLVMBuildBitCast(builder, ptr,
+ LLVMPointerType(setup_bld->vec_type, 0), "");
+ dadyaos = LLVMBuildLoad(builder, ptr, "");
+
+ attrib_name(dadxaos, attrib, 0, ".dadxaos");
+ attrib_name(dadyaos, attrib, 0, ".dadyaos");
+ /* fall-through */
+
+ case LP_INTERP_CONSTANT:
+ case LP_INTERP_FACING:
+ ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
+ ptr = LLVMBuildBitCast(builder, ptr,
+ LLVMPointerType(setup_bld->vec_type, 0), "");
+ a0aos = LLVMBuildLoad(builder, ptr, "");
+ attrib_name(a0aos, attrib, 0, ".a0aos");
+ break;
- switch( mode ) {
- case TGSI_INTERPOLATE_PERSPECTIVE:
- /* fall-through */
+ case LP_INTERP_POSITION:
+ /* Nothing to do as the position coeffs are already setup in slot 0 */
+ continue;
- case TGSI_INTERPOLATE_LINEAR:
- dadx = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""), "");
- dady = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dady_ptr, &index, 1, ""), "");
- dadx = lp_build_broadcast_scalar(&bld->base, dadx);
- dady = lp_build_broadcast_scalar(&bld->base, dady);
- attrib_name(dadx, attrib, chan, ".dadx");
- attrib_name(dady, attrib, chan, ".dady");
+ default:
+ assert(0);
+ break;
+ }
+ bld->a0aos[attrib] = a0aos;
+ bld->dadxaos[attrib] = dadxaos;
+ bld->dadyaos[attrib] = dadyaos;
+ }
+}
+
+/**
+ * Interpolate the shader input attribute values.
+ * This is called for each (group of) quad(s).
+ */
+static void
+attribs_update_simple(struct lp_build_interp_soa_context *bld,
+ struct gallivm_state *gallivm,
+ LLVMValueRef loop_iter,
+ int start,
+ int end)
+{
+ LLVMBuilderRef builder = gallivm->builder;
+ struct lp_build_context *coeff_bld = &bld->coeff_bld;
+ struct lp_build_context *setup_bld = &bld->setup_bld;
+ LLVMValueRef oow = NULL;
+ unsigned attrib;
+ LLVMValueRef pixoffx;
+ LLVMValueRef pixoffy;
+ LLVMValueRef ptr;
+
+ /* could do this with code-generated passed in pixel offsets too */
+
+ assert(loop_iter);
+ ptr = LLVMBuildGEP(builder, bld->xoffset_store, &loop_iter, 1, "");
+ pixoffx = LLVMBuildLoad(builder, ptr, "");
+ ptr = LLVMBuildGEP(builder, bld->yoffset_store, &loop_iter, 1, "");
+ pixoffy = LLVMBuildLoad(builder, ptr, "");
+
+ pixoffx = LLVMBuildFAdd(builder, pixoffx,
+ lp_build_broadcast_scalar(coeff_bld, bld->x), "");
+ pixoffy = LLVMBuildFAdd(builder, pixoffy,
+ lp_build_broadcast_scalar(coeff_bld, bld->y), "");
+
+ for (attrib = start; attrib < end; attrib++) {
+ const unsigned mask = bld->mask[attrib];
+ const unsigned interp = bld->interp[attrib];
+ unsigned chan;
+
+ for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+ if (mask & (1 << chan)) {
+ LLVMValueRef index;
+ LLVMValueRef dadx = coeff_bld->zero;
+ LLVMValueRef dady = coeff_bld->zero;
+ LLVMValueRef a = coeff_bld->zero;
+
+ index = lp_build_const_int32(gallivm, chan);
+ switch (interp) {
+ case LP_INTERP_PERSPECTIVE:
/* fall-through */
- case TGSI_INTERPOLATE_CONSTANT:
- a0 = LLVMBuildLoad(builder, LLVMBuildGEP(builder, a0_ptr, &index, 1, ""), "");
- a0 = lp_build_broadcast_scalar(&bld->base, a0);
- attrib_name(a0, attrib, chan, ".dady");
+ case LP_INTERP_LINEAR:
+ if (attrib == 0 && chan == 0) {
+ dadx = coeff_bld->one;
+ if (bld->pos_offset) {
+ a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
+ }
+ }
+ else if (attrib == 0 && chan == 1) {
+ dady = coeff_bld->one;
+ if (bld->pos_offset) {
+ a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
+ }
+ }
+ else {
+ dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
+ coeff_bld->type, bld->dadxaos[attrib],
+ index);
+ dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
+ coeff_bld->type, bld->dadyaos[attrib],
+ index);
+ a = lp_build_extract_broadcast(gallivm, setup_bld->type,
+ coeff_bld->type, bld->a0aos[attrib],
+ index);
+ }
+ /*
+ * a = a0 + (x * dadx + y * dady)
+ */
+ a = lp_build_fmuladd(builder, dadx, pixoffx, a);
+ a = lp_build_fmuladd(builder, dady, pixoffy, a);
+
+ if (interp == LP_INTERP_PERSPECTIVE) {
+ if (oow == NULL) {
+ LLVMValueRef w = bld->attribs[0][3];
+ assert(attrib != 0);
+ assert(bld->mask[0] & TGSI_WRITEMASK_W);
+ oow = lp_build_rcp(coeff_bld, w);
+ }
+ a = lp_build_mul(coeff_bld, a, oow);
+ }
+ break;
+
+ case LP_INTERP_CONSTANT:
+ case LP_INTERP_FACING:
+ a = lp_build_extract_broadcast(gallivm, setup_bld->type,
+ coeff_bld->type, bld->a0aos[attrib],
+ index);
+ break;
+
+ case LP_INTERP_POSITION:
+ assert(attrib > 0);
+ a = bld->attribs[0][chan];
break;
default:
break;
}
- bld->a0 [attrib][chan] = a0;
- bld->dadx[attrib][chan] = dadx;
- bld->dady[attrib][chan] = dady;
+ if ((attrib == 0) && (chan == 2) && !bld->depth_clamp){
+ /* FIXME: Depth values can exceed 1.0, due to the fact that
+ * setup interpolation coefficients refer to (0,0) which causes
+ * precision loss. So we must clamp to 1.0 here to avoid artifacts.
+ * Note though values outside [0,1] are perfectly valid with
+ * depth clip disabled.
+ * XXX: If depth clip is disabled but we force depth clamp
+ * we may get values larger than 1.0 in the fs (but not in
+ * depth test). Not sure if that's an issue...
+ * Also, on a similar note, it is not obvious if the depth values
+ * appearing in fs (with depth clip disabled) should be clamped
+ * to [0,1], clamped to near/far or not be clamped at all...
+ */
+ a = lp_build_min(coeff_bld, a, coeff_bld->one);
+ }
+ bld->attribs[attrib][chan] = a;
}
}
}
}
-
/**
- * Multiply the dadx and dady with the xstep and ystep respectively.
+ * Initialize the bld->a, dadq fields. This involves fetching
+ * those values from the arrays which are passed into the JIT function.
*/
static void
-coeffs_update(struct lp_build_interp_soa_context *bld)
+coeffs_init(struct lp_build_interp_soa_context *bld,
+ LLVMValueRef a0_ptr,
+ LLVMValueRef dadx_ptr,
+ LLVMValueRef dady_ptr)
{
+ struct lp_build_context *coeff_bld = &bld->coeff_bld;
+ struct lp_build_context *setup_bld = &bld->setup_bld;
+ struct gallivm_state *gallivm = coeff_bld->gallivm;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef pixoffx, pixoffy;
unsigned attrib;
unsigned chan;
-
- for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
- unsigned mask = bld->mask[attrib];
- unsigned mode = bld->mode[attrib];
- if (mode != TGSI_INTERPOLATE_CONSTANT) {
- for(chan = 0; chan < NUM_CHANNELS; ++chan) {
- if(mask & (1 << chan)) {
- bld->dadx[attrib][chan] = lp_build_mul_imm(&bld->base, bld->dadx[attrib][chan], bld->xstep);
- bld->dady[attrib][chan] = lp_build_mul_imm(&bld->base, bld->dady[attrib][chan], bld->ystep);
- }
- }
- }
+ unsigned i;
+
+ pixoffx = coeff_bld->undef;
+ pixoffy = coeff_bld->undef;
+ for (i = 0; i < coeff_bld->type.length; i++) {
+ LLVMValueRef nr = lp_build_const_int32(gallivm, i);
+ LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i]);
+ LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i]);
+ pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, "");
+ pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, "");
}
-}
-static void
-attribs_init(struct lp_build_interp_soa_context *bld)
-{
- LLVMValueRef x = bld->pos[0];
- LLVMValueRef y = bld->pos[1];
- LLVMValueRef oow = NULL;
- unsigned attrib;
- unsigned chan;
+ for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
+ const unsigned mask = bld->mask[attrib];
+ const unsigned interp = bld->interp[attrib];
+ LLVMValueRef index = lp_build_const_int32(gallivm,
+ attrib * TGSI_NUM_CHANNELS);
+ LLVMValueRef ptr;
+ LLVMValueRef dadxaos = setup_bld->zero;
+ LLVMValueRef dadyaos = setup_bld->zero;
+ LLVMValueRef a0aos = setup_bld->zero;
+
+ /* always fetch all 4 values for performance/simplicity */
+ switch (interp) {
+ case LP_INTERP_PERSPECTIVE:
+ /* fall-through */
+
+ case LP_INTERP_LINEAR:
+ ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
+ ptr = LLVMBuildBitCast(builder, ptr,
+ LLVMPointerType(setup_bld->vec_type, 0), "");
+ dadxaos = LLVMBuildLoad(builder, ptr, "");
+
+ ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
+ ptr = LLVMBuildBitCast(builder, ptr,
+ LLVMPointerType(setup_bld->vec_type, 0), "");
+ dadyaos = LLVMBuildLoad(builder, ptr, "");
+
+ attrib_name(dadxaos, attrib, 0, ".dadxaos");
+ attrib_name(dadyaos, attrib, 0, ".dadyaos");
+ /* fall-through */
+
+ case LP_INTERP_CONSTANT:
+ case LP_INTERP_FACING:
+ ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
+ ptr = LLVMBuildBitCast(builder, ptr,
+ LLVMPointerType(setup_bld->vec_type, 0), "");
+ a0aos = LLVMBuildLoad(builder, ptr, "");
+ attrib_name(a0aos, attrib, 0, ".a0aos");
+ break;
- for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
- unsigned mask = bld->mask[attrib];
- unsigned mode = bld->mode[attrib];
- for(chan = 0; chan < NUM_CHANNELS; ++chan) {
- if(mask & (1 << chan)) {
- LLVMValueRef a0 = bld->a0 [attrib][chan];
- LLVMValueRef dadx = bld->dadx[attrib][chan];
- LLVMValueRef dady = bld->dady[attrib][chan];
- LLVMValueRef res;
+ case LP_INTERP_POSITION:
+ /* Nothing to do as the position coeffs are already setup in slot 0 */
+ continue;
- res = a0;
+ default:
+ assert(0);
+ break;
+ }
- if (mode != TGSI_INTERPOLATE_CONSTANT) {
- res = lp_build_add(&bld->base, res, lp_build_mul(&bld->base, x, dadx));
- res = lp_build_add(&bld->base, res, lp_build_mul(&bld->base, y, dady));
- }
+ /*
+ * a = a0 + (x * dadx + y * dady)
+ * a0aos is the attrib value at top left corner of stamp
+ */
+ if (interp != LP_INTERP_CONSTANT &&
+ interp != LP_INTERP_FACING) {
+ LLVMValueRef x = lp_build_broadcast_scalar(setup_bld, bld->x);
+ LLVMValueRef y = lp_build_broadcast_scalar(setup_bld, bld->y);
+ a0aos = lp_build_fmuladd(builder, x, dadxaos, a0aos);
+ a0aos = lp_build_fmuladd(builder, y, dadyaos, a0aos);
+ }
- /* Keep the value of the attribue before perspective divide for faster updates */
- bld->attribs_pre[attrib][chan] = res;
+ /*
+ * dadq = {0, dadx, dady, dadx + dady}
+ * for two quads (side by side) this is:
+      * {0, dadx, dady, dadx+dady, 2*dadx, 3*dadx, 2*dadx+dady, 3*dadx+dady}
+ */
+ for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
+ /* this generates a CRAPLOAD of shuffles... */
+ if (mask & (1 << chan)) {
+ LLVMValueRef dadx, dady;
+ LLVMValueRef dadq, dadq2;
+ LLVMValueRef a;
+ LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan);
+
+ if (attrib == 0 && chan == 0) {
+ a = bld->x;
+ if (bld->pos_offset) {
+ a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), "");
+ }
+ a = lp_build_broadcast_scalar(coeff_bld, a);
+ dadx = coeff_bld->one;
+ dady = coeff_bld->zero;
+ }
+ else if (attrib == 0 && chan == 1) {
+ a = bld->y;
+ if (bld->pos_offset) {
+ a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), "");
+ }
+ a = lp_build_broadcast_scalar(coeff_bld, a);
+ dady = coeff_bld->one;
+ dadx = coeff_bld->zero;
+ }
+ else {
+ dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
+ coeff_bld->type, dadxaos, chan_index);
+ dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
+ coeff_bld->type, dadyaos, chan_index);
+
+ /*
+ * a = {a, a, a, a}
+ */
+ a = lp_build_extract_broadcast(gallivm, setup_bld->type,
+ coeff_bld->type, a0aos, chan_index);
+ }
- if (mode == TGSI_INTERPOLATE_PERSPECTIVE) {
- LLVMValueRef w = bld->pos[3];
+ dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
+ dady = LLVMBuildFMul(builder, dady, pixoffy, "");
+ dadq = LLVMBuildFAdd(builder, dadx, dady, "");
+
+ /*
+ * Compute the attrib values on the upper-left corner of each
+ * group of quads.
+       * Note that if we process 2 quads at once this doesn't
+       * really correspond exactly to what we want.
+ * We need to access elem 0 and 2 respectively later if we process
+ * 2 quads at once.
+ */
+
+ if (interp != LP_INTERP_CONSTANT &&
+ interp != LP_INTERP_FACING) {
+ dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
+ a = LLVMBuildFAdd(builder, a, dadq2, "");
+ }
+
+#if PERSPECTIVE_DIVIDE_PER_QUAD
+ /*
+ * a *= 1 / w
+ */
+
+ /*
+ * XXX since we're only going to access elements 0,2 out of 8
+ * if we have 8-wide vectors we should do the division only 4-wide.
+ * a is really a 2-elements in a 4-wide vector disguised as 8-wide
+ * in this case.
+ */
+ if (interp == LP_INTERP_PERSPECTIVE) {
+ LLVMValueRef w = bld->a[0][3];
assert(attrib != 0);
- if(!oow)
- oow = lp_build_rcp(&bld->base, w);
- res = lp_build_mul(&bld->base, res, oow);
+ assert(bld->mask[0] & TGSI_WRITEMASK_W);
+ if (!bld->oow) {
+ bld->oow = lp_build_rcp(coeff_bld, w);
+ lp_build_name(bld->oow, "oow");
+ }
+ a = lp_build_mul(coeff_bld, a, bld->oow);
}
+#endif
- attrib_name(res, attrib, chan, "");
+ attrib_name(a, attrib, chan, ".a");
+ attrib_name(dadq, attrib, chan, ".dadq");
- bld->attribs[attrib][chan] = res;
+ bld->a[attrib][chan] = lp_build_alloca(gallivm,
+ LLVMTypeOf(a), "");
+ LLVMBuildStore(builder, a, bld->a[attrib][chan]);
+ bld->dadq[attrib][chan] = dadq;
}
}
}
}
+/**
+ * Increment the shader input attribute values.
+ * This is called when we move from one quad to the next.
+ */
static void
-attribs_update(struct lp_build_interp_soa_context *bld)
+attribs_update(struct lp_build_interp_soa_context *bld,
+ struct gallivm_state *gallivm,
+ LLVMValueRef loop_iter,
+ int start,
+ int end)
{
+ LLVMBuilderRef builder = gallivm->builder;
+ struct lp_build_context *coeff_bld = &bld->coeff_bld;
LLVMValueRef oow = NULL;
unsigned attrib;
unsigned chan;
- for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
- unsigned mask = bld->mask[attrib];
- unsigned mode = bld->mode[attrib];
-
- if (mode != TGSI_INTERPOLATE_CONSTANT) {
- for(chan = 0; chan < NUM_CHANNELS; ++chan) {
- if(mask & (1 << chan)) {
- LLVMValueRef dadx = bld->dadx[attrib][chan];
- LLVMValueRef dady = bld->dady[attrib][chan];
- LLVMValueRef res;
+ for(attrib = start; attrib < end; ++attrib) {
+ const unsigned mask = bld->mask[attrib];
+ const unsigned interp = bld->interp[attrib];
+ for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
+ if(mask & (1 << chan)) {
+ LLVMValueRef a;
+ if (interp == LP_INTERP_CONSTANT ||
+ interp == LP_INTERP_FACING) {
+ a = LLVMBuildLoad(builder, bld->a[attrib][chan], "");
+ }
+ else if (interp == LP_INTERP_POSITION) {
+ assert(attrib > 0);
+ a = bld->attribs[0][chan];
+ }
+ else {
+ LLVMValueRef dadq;
+
+ a = bld->a[attrib][chan];
+
+ /*
+ * Broadcast the attribute value for this quad into all elements
+ */
+
+ {
+ /* stored as vector load as float */
+ LLVMTypeRef ptr_type = LLVMPointerType(LLVMFloatTypeInContext(
+ gallivm->context), 0);
+ LLVMValueRef ptr;
+ a = LLVMBuildBitCast(builder, a, ptr_type, "");
+ ptr = LLVMBuildGEP(builder, a, &loop_iter, 1, "");
+ a = LLVMBuildLoad(builder, ptr, "");
+ a = lp_build_broadcast_scalar(&bld->coeff_bld, a);
+ }
- res = bld->attribs_pre[attrib][chan];
+ /*
+ * Get the derivatives.
+ */
- if(bld->xstep)
- res = lp_build_add(&bld->base, res, dadx);
+ dadq = bld->dadq[attrib][chan];
- if(bld->ystep)
- res = lp_build_add(&bld->base, res, dady);
+#if PERSPECTIVE_DIVIDE_PER_QUAD
+ if (interp == LP_INTERP_PERSPECTIVE) {
+ LLVMValueRef dwdq = bld->dadq[0][3];
- bld->attribs_pre[attrib][chan] = res;
+ if (oow == NULL) {
+ assert(bld->oow);
+ oow = LLVMBuildShuffleVector(coeff_bld->builder,
+ bld->oow, coeff_bld->undef,
+ shuffle, "");
+ }
- if (mode == TGSI_INTERPOLATE_PERSPECTIVE) {
- LLVMValueRef w = bld->pos[3];
- assert(attrib != 0);
- if(!oow)
- oow = lp_build_rcp(&bld->base, w);
- res = lp_build_mul(&bld->base, res, oow);
+ dadq = lp_build_sub(coeff_bld,
+ dadq,
+ lp_build_mul(coeff_bld, a, dwdq));
+ dadq = lp_build_mul(coeff_bld, dadq, oow);
+ }
+#endif
+
+ /*
+ * Add the derivatives
+ */
+
+ a = lp_build_add(coeff_bld, a, dadq);
+
+#if !PERSPECTIVE_DIVIDE_PER_QUAD
+ if (interp == LP_INTERP_PERSPECTIVE) {
+ if (oow == NULL) {
+ LLVMValueRef w = bld->attribs[0][3];
+ assert(attrib != 0);
+ assert(bld->mask[0] & TGSI_WRITEMASK_W);
+ oow = lp_build_rcp(coeff_bld, w);
+ }
+ a = lp_build_mul(coeff_bld, a, oow);
+ }
+#endif
+
+ if (attrib == 0 && chan == 2 && !bld->depth_clamp) {
+ /* FIXME: Depth values can exceed 1.0, due to the fact that
+ * setup interpolation coefficients refer to (0,0) which causes
+ * precision loss. So we must clamp to 1.0 here to avoid artifacts.
+ * Note though values outside [0,1] are perfectly valid with
+          * depth clip disabled.
+ * XXX: If depth clip is disabled but we force depth clamp
+ * we may get values larger than 1.0 in the fs (but not in
+ * depth test). Not sure if that's an issue...
+ * Also, on a similar note, it is not obvious if the depth values
+ * appearing in fs (with depth clip disabled) should be clamped
+ * to [0,1], clamped to near/far or not be clamped at all...
+ */
+ a = lp_build_min(coeff_bld, a, coeff_bld->one);
}
- attrib_name(res, attrib, chan, "");
-
- bld->attribs[attrib][chan] = res;
+ attrib_name(a, attrib, chan, "");
}
+ bld->attribs[attrib][chan] = a;
}
}
}
/**
* Generate the position vectors.
*
- * Parameter x0, y0 are the integer values with the quad upper left coordinates.
+ * Parameter x0, y0 are the integer values with upper left coordinates.
*/
static void
pos_init(struct lp_build_interp_soa_context *bld,
LLVMValueRef x0,
LLVMValueRef y0)
{
- lp_build_name(x0, "pos.x");
- lp_build_name(y0, "pos.y");
-
- bld->attribs[0][0] = x0;
- bld->attribs[0][1] = y0;
-}
-
-
-static void
-pos_update(struct lp_build_interp_soa_context *bld)
-{
- LLVMValueRef x = bld->attribs[0][0];
- LLVMValueRef y = bld->attribs[0][1];
-
- if(bld->xstep)
- x = lp_build_add(&bld->base, x, lp_build_const_scalar(bld->base.type, bld->xstep));
+ LLVMBuilderRef builder = bld->coeff_bld.gallivm->builder;
+ struct lp_build_context *coeff_bld = &bld->coeff_bld;
- if(bld->ystep)
- y = lp_build_add(&bld->base, y, lp_build_const_scalar(bld->base.type, bld->ystep));
-
- lp_build_name(x, "pos.x");
- lp_build_name(y, "pos.y");
-
- bld->attribs[0][0] = x;
- bld->attribs[0][1] = y;
+ bld->x = LLVMBuildSIToFP(builder, x0, coeff_bld->elem_type, "");
+ bld->y = LLVMBuildSIToFP(builder, y0, coeff_bld->elem_type, "");
}
+/**
+ * Initialize fragment shader input attribute info.
+ */
void
lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
- const struct tgsi_token *tokens,
+ struct gallivm_state *gallivm,
+ unsigned num_inputs,
+ const struct lp_shader_input *inputs,
+ boolean pixel_center_integer,
+ boolean depth_clamp,
LLVMBuilderRef builder,
struct lp_type type,
LLVMValueRef a0_ptr,
LLVMValueRef dadx_ptr,
LLVMValueRef dady_ptr,
LLVMValueRef x0,
- LLVMValueRef y0,
- int xstep,
- int ystep)
+ LLVMValueRef y0)
{
- struct tgsi_parse_context parse;
- struct tgsi_full_declaration *decl;
+ struct lp_type coeff_type;
+ struct lp_type setup_type;
+ unsigned attrib;
+ unsigned chan;
memset(bld, 0, sizeof *bld);
- lp_build_context_init(&bld->base, builder, type);
+ memset(&coeff_type, 0, sizeof coeff_type);
+ coeff_type.floating = TRUE;
+ coeff_type.sign = TRUE;
+ coeff_type.width = 32;
+ coeff_type.length = type.length;
+
+ memset(&setup_type, 0, sizeof setup_type);
+ setup_type.floating = TRUE;
+ setup_type.sign = TRUE;
+ setup_type.width = 32;
+ setup_type.length = TGSI_NUM_CHANNELS;
+
+
+ /* XXX: we don't support interpolating into any other types */
+ assert(memcmp(&coeff_type, &type, sizeof coeff_type) == 0);
+
+ lp_build_context_init(&bld->coeff_bld, gallivm, coeff_type);
+ lp_build_context_init(&bld->setup_bld, gallivm, setup_type);
/* For convenience */
bld->pos = bld->attribs[0];
- bld->inputs = (const LLVMValueRef (*)[NUM_CHANNELS]) bld->attribs[1];
+ bld->inputs = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) bld->attribs[1];
/* Position */
- bld->num_attribs = 1;
- bld->mask[0] = TGSI_WRITEMASK_ZW;
- bld->mode[0] = TGSI_INTERPOLATE_LINEAR;
+ bld->mask[0] = TGSI_WRITEMASK_XYZW;
+ bld->interp[0] = LP_INTERP_LINEAR;
/* Inputs */
- tgsi_parse_init( &parse, tokens );
- while( !tgsi_parse_end_of_tokens( &parse ) ) {
- tgsi_parse_token( &parse );
-
- switch( parse.FullToken.Token.Type ) {
- case TGSI_TOKEN_TYPE_DECLARATION:
- decl = &parse.FullToken.FullDeclaration;
- if( decl->Declaration.File == TGSI_FILE_INPUT ) {
- unsigned first, last, mask;
- unsigned attrib;
-
- first = decl->Range.First;
- last = decl->Range.Last;
- mask = decl->Declaration.UsageMask;
-
- for( attrib = first; attrib <= last; ++attrib ) {
- bld->mask[1 + attrib] = mask;
- bld->mode[1 + attrib] = decl->Declaration.Interpolate;
- }
-
- bld->num_attribs = MAX2(bld->num_attribs, 1 + last + 1);
- }
- break;
-
- case TGSI_TOKEN_TYPE_INSTRUCTION:
- case TGSI_TOKEN_TYPE_IMMEDIATE:
- break;
+ for (attrib = 0; attrib < num_inputs; ++attrib) {
+ bld->mask[1 + attrib] = inputs[attrib].usage_mask;
+ bld->interp[1 + attrib] = inputs[attrib].interp;
+ }
+ bld->num_attribs = 1 + num_inputs;
- default:
- assert( 0 );
+ /* Ensure all masked out input channels have a valid value */
+ for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
+ for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
+ bld->attribs[attrib][chan] = bld->coeff_bld.undef;
}
}
- tgsi_parse_free( &parse );
- coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
+ if (pixel_center_integer) {
+ bld->pos_offset = 0.0;
+ } else {
+ bld->pos_offset = 0.5;
+ }
+ bld->depth_clamp = depth_clamp;
pos_init(bld, x0, y0);
- attribs_init(bld);
-
- bld->xstep = xstep;
- bld->ystep = ystep;
+ /*
+ * Simple method (single step interpolation) may be slower if vector length
+ * is just 4, but the results are different (generally less accurate) with
+ * the other method, so always use more accurate version.
+ */
+ if (1) {
+ bld->simple_interp = TRUE;
+ {
+ /* XXX this should use a global static table */
+ unsigned i;
+ unsigned num_loops = 16 / type.length;
+ LLVMValueRef pixoffx, pixoffy, index;
+ LLVMValueRef ptr;
+
+ bld->xoffset_store = lp_build_array_alloca(gallivm,
+ lp_build_vec_type(gallivm, type),
+ lp_build_const_int32(gallivm, num_loops),
+ "");
+ bld->yoffset_store = lp_build_array_alloca(gallivm,
+ lp_build_vec_type(gallivm, type),
+ lp_build_const_int32(gallivm, num_loops),
+ "");
+ for (i = 0; i < num_loops; i++) {
+ index = lp_build_const_int32(gallivm, i);
+ calc_offsets(&bld->coeff_bld, i*type.length/4, &pixoffx, &pixoffy);
+ ptr = LLVMBuildGEP(builder, bld->xoffset_store, &index, 1, "");
+ LLVMBuildStore(builder, pixoffx, ptr);
+ ptr = LLVMBuildGEP(builder, bld->yoffset_store, &index, 1, "");
+ LLVMBuildStore(builder, pixoffy, ptr);
+ }
+ }
+ coeffs_init_simple(bld, a0_ptr, dadx_ptr, dady_ptr);
+ }
+ else {
+ bld->simple_interp = FALSE;
+ coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
+ }
- coeffs_update(bld);
}
-/**
- * Advance the position and inputs with the xstep and ystep.
+/*
+ * Advance the position and inputs to the given quad within the block.
*/
+
void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld)
+lp_build_interp_soa_update_inputs_dyn(struct lp_build_interp_soa_context *bld,
+ struct gallivm_state *gallivm,
+ LLVMValueRef quad_start_index)
{
- pos_update(bld);
+ if (bld->simple_interp) {
+ attribs_update_simple(bld, gallivm, quad_start_index, 1, bld->num_attribs);
+ }
+ else {
+ attribs_update(bld, gallivm, quad_start_index, 1, bld->num_attribs);
+ }
+}
- attribs_update(bld);
+void
+lp_build_interp_soa_update_pos_dyn(struct lp_build_interp_soa_context *bld,
+ struct gallivm_state *gallivm,
+ LLVMValueRef quad_start_index)
+{
+ if (bld->simple_interp) {
+ attribs_update_simple(bld, gallivm, quad_start_index, 0, 1);
+ }
+ else {
+ attribs_update(bld, gallivm, quad_start_index, 0, 1);
+ }
}
+