#include "util/u_format.h"
#include "util/u_dump.h"
#include "util/u_string.h"
-#include "util/u_simple_list.h"
+#include "util/simple_list.h"
#include "util/u_dual_blend.h"
-#include "os/os_time.h"
+#include "util/os_time.h"
#include "pipe/p_shader_tokens.h"
#include "draw/draw_context.h"
#include "tgsi/tgsi_dump.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_debug.h"
#include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_bitarit.h"
#include "gallivm/lp_bld_pack.h"
#include "gallivm/lp_bld_format.h"
#include "gallivm/lp_bld_quad.h"
struct lp_type mask_type;
LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
LLVMValueRef bits[16];
- LLVMValueRef mask;
+ LLVMValueRef mask, bits_vec;
int shift, i;
/*
* XXX: We'll need a different path for 16 x u8
*/
assert(fs_type.width == 32);
- assert(fs_type.length <= Elements(bits));
+ assert(fs_type.length <= ARRAY_SIZE(bits));
mask_type = lp_int_type(fs_type);
/*
for (i = 0; i < fs_type.length / 4; i++) {
unsigned j = 2 * (i % 2) + (i / 2) * 8;
- bits[4*i + 0] = LLVMConstInt(i32t, 1 << (j + 0), 0);
- bits[4*i + 1] = LLVMConstInt(i32t, 1 << (j + 1), 0);
- bits[4*i + 2] = LLVMConstInt(i32t, 1 << (j + 4), 0);
- bits[4*i + 3] = LLVMConstInt(i32t, 1 << (j + 5), 0);
+ bits[4*i + 0] = LLVMConstInt(i32t, 1ULL << (j + 0), 0);
+ bits[4*i + 1] = LLVMConstInt(i32t, 1ULL << (j + 1), 0);
+ bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0);
+ bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0);
}
- mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, fs_type.length), "");
+ bits_vec = LLVMConstVector(bits, fs_type.length);
+ mask = LLVMBuildAnd(builder, mask, bits_vec, "");
/*
- * mask = mask != 0 ? ~0 : 0
+ * mask = mask == bits ? ~0 : 0
*/
mask = lp_build_compare(gallivm,
- mask_type, PIPE_FUNC_NOTEQUAL,
- mask,
- lp_build_const_int_vec(gallivm, mask_type, 0));
+ mask_type, PIPE_FUNC_EQUAL,
+ mask, bits_vec);
return mask;
}
}
+static LLVMValueRef
+lp_build_depth_clamp(struct gallivm_state *gallivm,
+ LLVMBuilderRef builder,
+ struct lp_type type,
+ LLVMValueRef context_ptr,
+ LLVMValueRef thread_data_ptr,
+ LLVMValueRef z)
+{
+ LLVMValueRef viewport, min_depth, max_depth;
+ LLVMValueRef viewport_index;
+ struct lp_build_context f32_bld;
+
+ assert(type.floating);
+ lp_build_context_init(&f32_bld, gallivm, type);
+
+ /*
+ * Assumes clamping of the viewport index will occur in setup/gs. Value
+ * is passed through the rasterization stage via lp_rast_shader_inputs.
+ *
+ * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping
+ * semantics.
+ */
+ viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm,
+ thread_data_ptr);
+
+ /*
+ * Load the min and max depth from the lp_jit_context.viewports
+ * array of lp_jit_viewport structures.
+ */
+ viewport = lp_llvm_viewport(context_ptr, gallivm, viewport_index);
+
+ /* viewports[viewport_index].min_depth */
+ min_depth = LLVMBuildExtractElement(builder, viewport,
+ lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH), "");
+ min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth);
+
+ /* viewports[viewport_index].max_depth */
+ max_depth = LLVMBuildExtractElement(builder, viewport,
+ lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH), "");
+ max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth);
+
+ /*
+ * Clamp to the min and max depth values for the given viewport.
+ */
+ return lp_build_clamp(&f32_bld, z, min_depth, max_depth);
+}
+
+
/**
* Generate the fragment shader, depth/stencil test, and alpha tests.
*/
LLVMValueRef context_ptr,
LLVMValueRef num_loop,
struct lp_build_interp_soa_context *interp,
- struct lp_build_sampler_soa *sampler,
+ const struct lp_build_sampler_soa *sampler,
LLVMValueRef mask_store,
LLVMValueRef (*out_color)[4],
LLVMValueRef depth_ptr,
{
const struct util_format_description *zs_format_desc = NULL;
const struct tgsi_token *tokens = shader->base.tokens;
- LLVMTypeRef vec_type;
+ struct lp_type int_type = lp_int_type(type);
+ LLVMTypeRef vec_type, int_vec_type;
LLVMValueRef mask_ptr, mask_val;
LLVMValueRef consts_ptr, num_consts_ptr;
+ LLVMValueRef ssbo_ptr, num_ssbo_ptr;
LLVMValueRef z;
LLVMValueRef z_value, s_value;
LLVMValueRef z_fb, s_fb;
zs_format_desc = util_format_description(key->zsbuf_format);
assert(zs_format_desc);
- if (!shader->info.base.writes_z) {
+ if (!shader->info.base.writes_z && !shader->info.base.writes_stencil) {
if (key->alpha.enabled ||
key->blend.alpha_to_coverage ||
- shader->info.base.uses_kill) {
+ shader->info.base.uses_kill ||
+ shader->info.base.writes_samplemask) {
/* With alpha test and kill, can do the depth test early
* and hopefully eliminate some quads. But need to do a
* special deferred depth write once the final mask value
depth_mode = 0;
}
+ vec_type = lp_build_vec_type(gallivm, type);
+ int_vec_type = lp_build_vec_type(gallivm, int_type);
stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_ptr);
stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_ptr);
-
- vec_type = lp_build_vec_type(gallivm, type);
+ /* convert scalar stencil refs into vectors */
+ stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]);
+ stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]);
consts_ptr = lp_jit_context_constants(gallivm, context_ptr);
num_consts_ptr = lp_jit_context_num_constants(gallivm, context_ptr);
+ ssbo_ptr = lp_jit_context_ssbos(gallivm, context_ptr);
+ num_ssbo_ptr = lp_jit_context_num_ssbos(gallivm, context_ptr);
+
lp_build_for_loop_begin(&loop_state, gallivm,
lp_build_const_int32(gallivm, 0),
LLVMIntULT,
z = interp->pos[2];
if (depth_mode & EARLY_DEPTH_TEST) {
+ /*
+ * Clamp according to ARB_depth_clamp semantics.
+ */
+ if (key->depth_clamp) {
+ z = lp_build_depth_clamp(gallivm, builder, type, context_ptr,
+ thread_data_ptr, z);
+ }
lp_build_depth_stencil_load_swizzled(gallivm, type,
zs_format_desc, key->resource_1d,
depth_ptr, depth_stride,
lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter);
+ struct lp_build_tgsi_params params;
+ memset(¶ms, 0, sizeof(params));
+
+ params.type = type;
+ params.mask = &mask;
+ params.consts_ptr = consts_ptr;
+ params.const_sizes_ptr = num_consts_ptr;
+ params.system_values = &system_values;
+ params.inputs = interp->inputs;
+ params.context_ptr = context_ptr;
+ params.thread_data_ptr = thread_data_ptr;
+ params.sampler = sampler;
+ params.info = &shader->info.base;
+ params.ssbo_ptr = ssbo_ptr;
+ params.ssbo_sizes_ptr = num_ssbo_ptr;
+
/* Build the actual shader */
- lp_build_tgsi_soa(gallivm, tokens, type, &mask,
- consts_ptr, num_consts_ptr, &system_values,
- interp->inputs,
- outputs, sampler, &shader->info.base, NULL);
+ lp_build_tgsi_soa(gallivm, tokens, ¶ms,
+ outputs);
/* Alpha test */
if (key->alpha.enabled) {
}
}
+ if (shader->info.base.writes_samplemask) {
+ int smaski = find_output_by_semantic(&shader->info.base,
+ TGSI_SEMANTIC_SAMPLEMASK,
+ 0);
+ LLVMValueRef smask;
+ struct lp_build_context smask_bld;
+ lp_build_context_init(&smask_bld, gallivm, int_type);
+
+ assert(smaski >= 0);
+ smask = LLVMBuildLoad(builder, outputs[smaski][0], "smask");
+ /*
+ * Pixel is alive according to the first sample in the mask.
+ */
+ smask = LLVMBuildBitCast(builder, smask, smask_bld.vec_type, "");
+ smask = lp_build_and(&smask_bld, smask, smask_bld.one);
+ smask = lp_build_cmp(&smask_bld, PIPE_FUNC_NOTEQUAL, smask, smask_bld.zero);
+ lp_build_mask_update(&mask, smask);
+ }
+
/* Late Z test */
if (depth_mode & LATE_DEPTH_TEST) {
int pos0 = find_output_by_semantic(&shader->info.base,
TGSI_SEMANTIC_POSITION,
0);
-
+ int s_out = find_output_by_semantic(&shader->info.base,
+ TGSI_SEMANTIC_STENCIL,
+ 0);
if (pos0 != -1 && outputs[pos0][2]) {
z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z");
+ }
+ /*
+ * Clamp according to ARB_depth_clamp semantics.
+ */
+ if (key->depth_clamp) {
+ z = lp_build_depth_clamp(gallivm, builder, type, context_ptr,
+ thread_data_ptr, z);
+ }
- /*
- * Clamp according to ARB_depth_clamp semantics.
- */
- if (key->depth_clamp) {
- LLVMValueRef viewport, min_depth, max_depth;
- LLVMValueRef viewport_index;
- struct lp_build_context f32_bld;
-
- assert(type.floating);
- lp_build_context_init(&f32_bld, gallivm, type);
-
- /*
- * Assumes clamping of the viewport index will occur in setup/gs. Value
- * is passed through the rasterization stage via lp_rast_shader_inputs.
- *
- * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping
- * semantics.
- */
- viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm,
- thread_data_ptr);
-
- /*
- * Load the min and max depth from the lp_jit_context.viewports
- * array of lp_jit_viewport structures.
- */
- viewport = lp_llvm_viewport(context_ptr, gallivm, viewport_index);
-
- /* viewports[viewport_index].min_depth */
- min_depth = LLVMBuildExtractElement(builder, viewport,
- lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH),
- "");
- min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth);
-
- /* viewports[viewport_index].max_depth */
- max_depth = LLVMBuildExtractElement(builder, viewport,
- lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH),
- "");
- max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth);
-
- /*
- * Clamp to the min and max depth values for the given viewport.
- */
- z = lp_build_clamp(&f32_bld, z, min_depth, max_depth);
- }
+ if (s_out != -1 && outputs[s_out][1]) {
+ /* there's only one value, and spec says to discard additional bits */
+ LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255);
+ stencil_refs[0] = LLVMBuildLoad(builder, outputs[s_out][1], "output.s");
+ stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, "");
+ stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, "");
+ stencil_refs[1] = stencil_refs[0];
}
lp_build_depth_stencil_load_swizzled(gallivm, type,
src_count = num_fs * src_channels;
assert(pixels == 2 || pixels == 1);
- assert(num_fs * src_channels <= Elements(src));
+ assert(num_fs * src_channels <= ARRAY_SIZE(src));
/*
* Transpose from SoA -> AoS
}
} else if (twiddle) {
/* Twiddle pixels across elements of array */
+ /*
+ * XXX: we should avoid this in some cases, but would need to tell
+ * lp_build_conv to reorder (or deal with it ourselves).
+ */
lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
} else {
/* Do nothing */
}
+/*
+ * Untwiddle and transpose, much like the above.
+ * However, this is after conversion, so we get packed vectors.
+ * At this time only handle 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data,
+ * the vectors will look like:
+ * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may
+ * be swizzled here). Extending to 16bit should be trivial.
+ * Should also be extended to handle twice wide vectors with AVX2...
+ */
+static void
+fs_twiddle_transpose(struct gallivm_state *gallivm,
+ struct lp_type type,
+ LLVMValueRef *src,
+ unsigned src_count,
+ LLVMValueRef *dst)
+{
+ unsigned i, j;
+ struct lp_type type64, type16, type32;
+ LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
+ LLVMBuilderRef builder = gallivm->builder;
+ LLVMValueRef tmp[4], shuf[8];
+ for (j = 0; j < 2; j++) {
+ shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
+ shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
+ shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
+ shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
+ }
+
+ assert(src_count == 4 || src_count == 2 || src_count == 1);
+ assert(type.width == 8);
+ assert(type.length == 16);
+
+ type8_t = lp_build_vec_type(gallivm, type);
+
+ type64 = type;
+ type64.length /= 8;
+ type64.width *= 8;
+ type64_t = lp_build_vec_type(gallivm, type64);
+
+ type16 = type;
+ type16.length /= 2;
+ type16.width *= 2;
+ type16_t = lp_build_vec_type(gallivm, type16);
+
+ type32 = type;
+ type32.length /= 4;
+ type32.width *= 4;
+ type32_t = lp_build_vec_type(gallivm, type32);
+
+ lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);
+
+ if (src_count == 1) {
+ /* transpose was no-op, just untwiddle */
+ LLVMValueRef shuf_vec;
+ shuf_vec = LLVMConstVector(shuf, 8);
+ tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
+ tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
+ dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
+ } else if (src_count == 2) {
+ LLVMValueRef shuf_vec;
+ shuf_vec = LLVMConstVector(shuf, 4);
+
+ for (i = 0; i < 2; i++) {
+ tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
+ tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
+ dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, "");
+ }
+ } else {
+ for (j = 0; j < 2; j++) {
+ LLVMValueRef lo, hi, lo2, hi2;
+ /*
+ * Note that if we only really have 3 valid channels (rgb)
+ * and we don't need alpha we could substitute a undef here
+ * for the respective channel (causing llvm to drop conversion
+ * for alpha).
+ */
+ /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
+ lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
+ hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
+ lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
+ hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
+ dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
+ dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
+ }
+ }
+}
+
+
/**
* Load an unswizzled block of pixels from memory
*/
gep[1] = LLVMBuildAdd(builder, bx, by, "");
dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
- dst_ptr = LLVMBuildBitCast(builder, dst_ptr, LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
+ dst_ptr = LLVMBuildBitCast(builder, dst_ptr,
+ LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
dst[i] = LLVMBuildLoad(builder, dst_ptr, "");
- lp_set_load_alignment(dst[i], dst_alignment);
+ LLVMSetAlignment(dst[i], dst_alignment);
}
}
gep[1] = LLVMBuildAdd(builder, bx, by, "");
src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, "");
- src_ptr = LLVMBuildBitCast(builder, src_ptr, LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
+ src_ptr = LLVMBuildBitCast(builder, src_ptr,
+ LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
src_ptr = LLVMBuildStore(builder, src[i], src_ptr);
- lp_set_store_alignment(src_ptr, src_alignment);
+ LLVMSetAlignment(src_ptr, src_alignment);
}
}
*
* A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5.
*/
-static INLINE boolean
+static inline boolean
is_arithmetic_format(const struct util_format_description *format_desc)
{
boolean arith = false;
* to floats for blending, and furthermore has "natural" packed AoS -> unpacked
* SoA conversion.
*/
-static INLINE boolean
+static inline boolean
format_expands_to_float_soa(const struct util_format_description *format_desc)
{
if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
*
* e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
*/
-static INLINE void
+static inline void
lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
struct lp_type* type)
{
*
* e.g. RGBA16F = 4x float, R3G3B2 = 3x byte
*/
-static INLINE void
+static inline void
lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
struct lp_type* type)
{
*
* but we try to avoid division and multiplication through shifts.
*/
-static INLINE LLVMValueRef
+static inline LLVMValueRef
scale_bits(struct gallivm_state *gallivm,
int src_bits,
int dst_bits,
lp_build_const_int_vec(gallivm, src_type, db),
"");
- if (db < src_bits) {
+ if (db <= src_bits) {
/* Enough bits in src to fill the remainder */
LLVMValueRef lower = LLVMBuildLShr(builder,
src,
/**
* If RT is a smallfloat (needing denorms) format
*/
-static INLINE int
+static inline int
have_smallfloat_format(struct lp_type dst_type,
enum pipe_format format)
{
LLVMBuilderRef builder = gallivm->builder;
struct lp_type blend_type;
struct lp_type mem_type;
- unsigned i, j, k;
+ unsigned i, j;
unsigned pixels = block_size / num_srcs;
bool is_arith;
unsigned from_lsb = src_fmt->nr_channels - j - 1;
#endif
- for (k = 0; k < src_fmt->channel[j].size; ++k) {
- mask |= 1 << k;
- }
+ mask = (1 << src_fmt->channel[j].size) - 1;
/* Extract bits from source */
chans[j] = LLVMBuildLShr(builder,
/* Extract bits */
chans[j] = LLVMBuildLShr(builder,
dst[i],
- lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
+ lp_build_const_int_vec(gallivm, src_type,
+ from_lsb * blend_type.width),
"");
chans[j] = LLVMBuildAnd(builder,
/* If there is a src for each pixel broadcast the alpha across whole row */
if (src_count == block_size) {
for (i = 0; i < src_count; ++i) {
- src_alpha[i] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, row_type), src_alpha[i]);
+ src_alpha[i] = lp_build_broadcast(gallivm,
+ lp_build_vec_type(gallivm, row_type), src_alpha[i]);
}
} else {
unsigned pixels = block_size / src_count;
LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
LLVMValueRef fs_src1[4][TGSI_NUM_CHANNELS];
LLVMValueRef src_alpha[4 * 4];
- LLVMValueRef src1_alpha[4 * 4];
+ LLVMValueRef src1_alpha[4 * 4] = { NULL };
LLVMValueRef src_mask[4 * 4];
LLVMValueRef src[4 * 4];
LLVMValueRef src1[4 * 4];
struct lp_type blend_type;
struct lp_type row_type;
struct lp_type dst_type;
+ struct lp_type ls_type;
unsigned char swizzle[TGSI_NUM_CHANNELS];
unsigned vector_width;
util_blend_state_is_dual(&variant->key.blend, 0);
const boolean is_1d = variant->key.resource_1d;
+ boolean twiddle_after_convert = FALSE;
unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
LLVMValueRef fpstate = 0;
partial_mask |= !variant->opaque;
i32_zero = lp_build_const_int32(gallivm, 0);
-#if HAVE_LLVM < 0x0302
- /*
- * undef triggers a crash in LLVMBuildTrunc in convert_from_blend_type in some
- * cases (seen with r10g10b10a2, 128bit wide vectors) (only used for 1d case).
- */
- undef_src_val = lp_build_zero(gallivm, fs_type);
-#else
undef_src_val = lp_build_undef(gallivm, fs_type);
-#endif
row_type.length = fs_type.length;
vector_width = dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;
}
/* If 3 channels then pad to include alpha for 4 element transpose */
- if (dst_channels == 3 && !has_alpha) {
+ if (dst_channels == 3) {
+ assert (!has_alpha);
for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
if (swizzle[i] > TGSI_NUM_CHANNELS)
swizzle[i] = 3;
}
if (out_format_desc->nr_channels == 4) {
dst_channels = 4;
+ /*
+ * We use alpha from the color conversion, not separate one.
+ * We had to include it for transpose, hence it will get converted
+ * too (albeit when doing transpose after conversion, that would
+ * no longer be the case necessarily).
+ * (It works only with 4 channel dsts, e.g. rgbx formats, because
+ * otherwise we really have padding, not alpha, included.)
+ */
+ has_alpha = true;
}
}
/*
* XXX If we include that here maybe could actually use it instead of
* separate alpha for blending?
+ * (Difficult though we actually convert pad channels, not alpha.)
*/
if (dst_channels == 3 && !has_alpha) {
fs_src[i][3] = alpha;
/* We split the row_mask and row_alpha as we want 128bit interleave */
if (fs_type.length == 8) {
- src_mask[i*2 + 0] = lp_build_extract_range(gallivm, fs_mask[i], 0, src_channels);
- src_mask[i*2 + 1] = lp_build_extract_range(gallivm, fs_mask[i], src_channels, src_channels);
+ src_mask[i*2 + 0] = lp_build_extract_range(gallivm, fs_mask[i],
+ 0, src_channels);
+ src_mask[i*2 + 1] = lp_build_extract_range(gallivm, fs_mask[i],
+ src_channels, src_channels);
src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
- src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels);
+ src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
+ src_channels, src_channels);
} else {
src_mask[i] = fs_mask[i];
src_alpha[i] = alpha;
}
if (fs_type.length == 8) {
src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
- src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels);
+ src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
+ src_channels, src_channels);
} else {
src1_alpha[i] = alpha;
}
}
}
+ /*
+ * We actually should generally do conversion first (for non-1d cases)
+ * when the blend format is 8 or 16 bits. The reason is obvious,
+ * there's 2 or 4 times less vectors to deal with for the interleave...
+ * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit
+ * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit
+ * unpack only with 128bit vectors).
+ * Note: for 16bit sizes really need matching pack conversion code
+ */
+ if (!is_1d && dst_channels != 3 && dst_type.width == 8) {
+ twiddle_after_convert = TRUE;
+ }
+
/*
* Pixel twiddle from fragment shader order to memory order
*/
- src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
- dst_channels, fs_src, src, pad_inline);
- if (dual_source_blend) {
- generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
- fs_src1, src1, pad_inline);
+ if (!twiddle_after_convert) {
+ src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
+ dst_channels, fs_src, src, pad_inline);
+ if (dual_source_blend) {
+ generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
+ fs_src1, src1, pad_inline);
+ }
+ } else {
+ src_count = num_fullblock_fs * dst_channels;
+ /*
+ * We reorder things a bit here, so the cases for 4-wide and 8-wide
+ * (AVX) turn out the same later when untwiddling/transpose (albeit
+ * for true AVX2 path untwiddle needs to be different).
+ * For now just order by colors first (so we can use unpack later).
+ */
+ for (j = 0; j < num_fullblock_fs; j++) {
+ for (i = 0; i < dst_channels; i++) {
+ src[i*num_fullblock_fs + j] = fs_src[j][i];
+ if (dual_source_blend) {
+ src1[i*num_fullblock_fs + j] = fs_src1[j][i];
+ }
+ }
+ }
}
src_channels = dst_channels < 3 ? dst_channels : 4;
assert(bits == 128 || bits == 256);
}
+ if (twiddle_after_convert) {
+ fs_twiddle_transpose(gallivm, row_type, src, src_count, src);
+ if (dual_source_blend) {
+ fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1);
+ }
+ }
/*
* Blend Colour conversion
*/
blend_color = lp_jit_context_f_blend_color(gallivm, context_ptr);
- blend_color = LLVMBuildPointerCast(builder, blend_color, LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), "");
- blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color, &i32_zero, 1, ""), "");
+ blend_color = LLVMBuildPointerCast(builder, blend_color,
+ LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), "");
+ blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color,
+ &i32_zero, 1, ""), "");
/* Convert */
lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1, &blend_color, 1);
mask_type.length = pixels;
mask_type.width = row_type.width * dst_channels;
- src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
+ /*
+ * If mask_type width is smaller than 32bit, this doesn't quite
+ * generate the most efficient code (could use some pack).
+ */
+ src_mask[i] = LLVMBuildIntCast(builder, src_mask[i],
+ lp_build_int_vec_type(gallivm, mask_type), "");
mask_type.length *= dst_channels;
mask_type.width /= dst_channels;
}
- src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), "");
+ src_mask[i] = LLVMBuildBitCast(builder, src_mask[i],
+ lp_build_int_vec_type(gallivm, mask_type), "");
src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
}
*/
dst_alignment = MIN2(16, dst_alignment);
+ ls_type = dst_type;
+
+ if (dst_count > src_count) {
+ if ((dst_type.width == 8 || dst_type.width == 16) &&
+ util_is_power_of_two_or_zero(dst_type.length) &&
+ dst_type.length * dst_type.width < 128) {
+ /*
+ * Never try to load values as 4xi8 which we will then
+ * concatenate to larger vectors. This gives llvm a real
+ * headache (the problem is the type legalizer (?) will
+ * try to load that as 4xi8 zext to 4xi32 to fill the vector,
+ * then the shuffles to concatenate are more or less impossible
+ * - llvm is easily capable of generating a sequence of 32
+ * pextrb/pinsrb instructions for that. Albeit it appears to
+ * be fixed in llvm 4.0. So, load and concatenate with 32bit
+ * width to avoid the trouble (16bit seems not as bad, llvm
+ * probably recognizes the load+shuffle as only one shuffle
+ * is necessary, but we can do just the same anyway).
+ */
+ ls_type.length = dst_type.length * dst_type.width / 32;
+ ls_type.width = 32;
+ }
+ }
+
if (is_1d) {
load_unswizzled_block(gallivm, color_ptr, stride, block_width, 1,
- dst, dst_type, dst_count / 4, dst_alignment);
+ dst, ls_type, dst_count / 4, dst_alignment);
for (i = dst_count / 4; i < dst_count; i++) {
- dst[i] = lp_build_undef(gallivm, dst_type);
+ dst[i] = lp_build_undef(gallivm, ls_type);
}
}
else {
load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height,
- dst, dst_type, dst_count, dst_alignment);
+ dst, ls_type, dst_count, dst_alignment);
}
* on all 16 pixels in that single vector at once.
*/
if (dst_count > src_count) {
- lp_build_concat_n(gallivm, dst_type, dst, 4, dst, src_count);
+ if (ls_type.length != dst_type.length && ls_type.length == 1) {
+ LLVMTypeRef elem_type = lp_build_elem_type(gallivm, ls_type);
+ LLVMTypeRef ls_vec_type = LLVMVectorType(elem_type, 1);
+ for (i = 0; i < dst_count; i++) {
+ dst[i] = LLVMBuildBitCast(builder, dst[i], ls_vec_type, "");
+ }
+ }
+
+ lp_build_concat_n(gallivm, ls_type, dst, 4, dst, src_count);
+
+ if (ls_type.length != dst_type.length) {
+ struct lp_type tmp_type = dst_type;
+ tmp_type.length = dst_type.length * 4 / src_count;
+ for (i = 0; i < src_count; i++) {
+ dst[i] = LLVMBuildBitCast(builder, dst[i],
+ lp_build_vec_type(gallivm, tmp_type), "");
+ }
+ }
}
/*
* It seems some cleanup could be done here (like skipping conversion/blend
* when not needed).
*/
- convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type, row_type, dst, src_count);
+ convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type,
+ row_type, dst, src_count);
/*
* FIXME: Really should get logic ops / masks out of generic blend / row
pad_inline ? 4 : dst_channels);
}
- convert_from_blend_type(gallivm, block_size, out_format_desc, row_type, dst_type, dst, src_count);
+ convert_from_blend_type(gallivm, block_size, out_format_desc,
+ row_type, dst_type, dst, src_count);
/* Split the blend rows back to memory rows */
if (dst_count > src_count) {
/* check if writes to cbuf[0] are to be copied to all cbufs */
cbuf0_write_all =
- shader->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS][0];
+ shader->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
/* TODO: actually pick these based on the fs and color buffer
* characteristics. */
blend_vec_type = lp_build_vec_type(gallivm, blend_type);
- util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s",
- shader->no, variant->no, partial_mask ? "partial" : "whole");
+ snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s",
+ shader->no, variant->no, partial_mask ? "partial" : "whole");
arg_types[0] = variant->jit_context_ptr_type; /* context */
arg_types[1] = int32_type; /* x */
arg_types[12] = int32_type; /* depth_stride */
func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
- arg_types, Elements(arg_types), 0);
+ arg_types, ARRAY_SIZE(arg_types), 0);
function = LLVMAddFunction(gallivm->module, func_name, func_type);
LLVMSetFunctionCallConv(function, LLVMCCallConv);
/* XXX: need to propagate noalias down into color param now we are
* passing a pointer-to-pointer?
*/
- for(i = 0; i < Elements(arg_types); ++i)
+ for(i = 0; i < ARRAY_SIZE(arg_types); ++i)
if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
- LLVMAddAttribute(LLVMGetParam(function, i), LLVMNoAliasAttribute);
+ lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
context_ptr = LLVMGetParam(function, 0);
x = LLVMGetParam(function, 1);
lp_build_name(dady_ptr, "dady");
lp_build_name(color_ptr_ptr, "color_ptr_ptr");
lp_build_name(depth_ptr, "depth");
- lp_build_name(thread_data_ptr, "thread_data");
lp_build_name(mask_input, "mask_input");
+ lp_build_name(thread_data_ptr, "thread_data");
lp_build_name(stride_ptr, "stride_ptr");
lp_build_name(depth_stride, "depth_stride");
assert(builder);
LLVMPositionBuilderAtEnd(builder, block);
+ /*
+ * Must not count ps invocations if there's a null shader.
+ * (It would be ok to count with null shader if there's d/s tests,
+ * but only if there's d/s buffers too, which is different
+ * to implicit rasterization disable which must not depend
+ * on the d/s buffers.)
+ * Could use popcount on mask, but pixel accuracy is not required.
+ * Could disable if there's no stats query, but maybe not worth it.
+ */
+ if (shader->info.base.num_instructions > 1) {
+ LLVMValueRef invocs, val;
+ invocs = lp_jit_thread_data_invocations(gallivm, thread_data_ptr);
+ val = LLVMBuildLoad(builder, invocs, "");
+ val = LLVMBuildAdd(builder, val,
+ LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), 1, 0),
+ "invoc_count");
+ LLVMBuildStore(builder, val, invocs);
+ }
+
/* code generated texture sampling */
- sampler = lp_llvm_sampler_soa_create(key->state, context_ptr);
+ sampler = lp_llvm_sampler_soa_create(key->state);
num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
/* for 1d resources only run "upper half" of stamp */
LLVMValueRef mask_store = lp_build_array_alloca(gallivm, mask_type,
num_loop, "mask_store");
LLVMValueRef color_store[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS];
+ boolean pixel_center_integer =
+ shader->info.base.properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER];
/*
* The shader input interpolation info is not explicitely baked in the
gallivm,
shader->info.base.num_inputs,
inputs,
- shader->info.base.pixel_center_integer,
+ pixel_center_integer,
+ key->depth_clamp,
builder, fs_type,
a0_ptr, dadx_ptr, dady_ptr,
x, y);
for (i = 0; i < key->nr_cbufs; ++i) {
debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
}
- if (key->depth.enabled) {
+ if (key->depth.enabled || key->stencil[0].enabled) {
debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
- debug_printf("depth.func = %s\n", util_dump_func(key->depth.func, TRUE));
+ }
+ if (key->depth.enabled) {
+ debug_printf("depth.func = %s\n", util_str_func(key->depth.func, TRUE));
debug_printf("depth.writemask = %u\n", key->depth.writemask);
}
for (i = 0; i < 2; ++i) {
if (key->stencil[i].enabled) {
- debug_printf("stencil[%u].func = %s\n", i, util_dump_func(key->stencil[i].func, TRUE));
- debug_printf("stencil[%u].fail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].fail_op, TRUE));
- debug_printf("stencil[%u].zpass_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zpass_op, TRUE));
- debug_printf("stencil[%u].zfail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zfail_op, TRUE));
+ debug_printf("stencil[%u].func = %s\n", i, util_str_func(key->stencil[i].func, TRUE));
+ debug_printf("stencil[%u].fail_op = %s\n", i, util_str_stencil_op(key->stencil[i].fail_op, TRUE));
+ debug_printf("stencil[%u].zpass_op = %s\n", i, util_str_stencil_op(key->stencil[i].zpass_op, TRUE));
+ debug_printf("stencil[%u].zfail_op = %s\n", i, util_str_stencil_op(key->stencil[i].zfail_op, TRUE));
debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
}
}
if (key->alpha.enabled) {
- debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE));
+ debug_printf("alpha.func = %s\n", util_str_func(key->alpha.func, TRUE));
}
if (key->occlusion_count) {
}
if (key->blend.logicop_enable) {
- debug_printf("blend.logicop_func = %s\n", util_dump_logicop(key->blend.logicop_func, TRUE));
+ debug_printf("blend.logicop_func = %s\n", util_str_logicop(key->blend.logicop_func, TRUE));
}
else if (key->blend.rt[0].blend_enable) {
- debug_printf("blend.rgb_func = %s\n", util_dump_blend_func (key->blend.rt[0].rgb_func, TRUE));
- debug_printf("blend.rgb_src_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
- debug_printf("blend.rgb_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
- debug_printf("blend.alpha_func = %s\n", util_dump_blend_func (key->blend.rt[0].alpha_func, TRUE));
- debug_printf("blend.alpha_src_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
- debug_printf("blend.alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
+ debug_printf("blend.rgb_func = %s\n", util_str_blend_func (key->blend.rt[0].rgb_func, TRUE));
+ debug_printf("blend.rgb_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE));
+ debug_printf("blend.rgb_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE));
+ debug_printf("blend.alpha_func = %s\n", util_str_blend_func (key->blend.rt[0].alpha_func, TRUE));
+ debug_printf("blend.alpha_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE));
+ debug_printf("blend.alpha_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
}
debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
if (key->blend.alpha_to_coverage) {
const struct lp_static_sampler_state *sampler = &key->state[i].sampler_state;
debug_printf("sampler[%u] = \n", i);
debug_printf(" .wrap = %s %s %s\n",
- util_dump_tex_wrap(sampler->wrap_s, TRUE),
- util_dump_tex_wrap(sampler->wrap_t, TRUE),
- util_dump_tex_wrap(sampler->wrap_r, TRUE));
+ util_str_tex_wrap(sampler->wrap_s, TRUE),
+ util_str_tex_wrap(sampler->wrap_t, TRUE),
+ util_str_tex_wrap(sampler->wrap_r, TRUE));
debug_printf(" .min_img_filter = %s\n",
- util_dump_tex_filter(sampler->min_img_filter, TRUE));
+ util_str_tex_filter(sampler->min_img_filter, TRUE));
debug_printf(" .min_mip_filter = %s\n",
- util_dump_tex_mipfilter(sampler->min_mip_filter, TRUE));
+ util_str_tex_mipfilter(sampler->min_mip_filter, TRUE));
debug_printf(" .mag_img_filter = %s\n",
- util_dump_tex_filter(sampler->mag_img_filter, TRUE));
+ util_str_tex_filter(sampler->mag_img_filter, TRUE));
if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE)
- debug_printf(" .compare_func = %s\n", util_dump_func(sampler->compare_func, TRUE));
+ debug_printf(" .compare_func = %s\n", util_str_func(sampler->compare_func, TRUE));
debug_printf(" .normalized_coords = %u\n", sampler->normalized_coords);
debug_printf(" .min_max_lod_equal = %u\n", sampler->min_max_lod_equal);
debug_printf(" .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero);
debug_printf(" .format = %s\n",
util_format_name(texture->format));
debug_printf(" .target = %s\n",
- util_dump_tex_target(texture->target, TRUE));
+ util_str_tex_target(texture->target, TRUE));
debug_printf(" .level_zero_only = %u\n",
texture->level_zero_only);
debug_printf(" .pot = %u %u %u\n",
const struct lp_fragment_shader_variant_key *key)
{
struct lp_fragment_shader_variant *variant;
- const struct util_format_description *cbuf0_format_desc;
+ const struct util_format_description *cbuf0_format_desc = NULL;
boolean fullcolormask;
char module_name[64];
variant = CALLOC_STRUCT(lp_fragment_shader_variant);
- if(!variant)
+ if (!variant)
return NULL;
- util_snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
- shader->no, shader->variants_created);
+ snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
+ shader->no, shader->variants_created);
variant->gallivm = gallivm_create(module_name, lp->context);
if (!variant->gallivm) {
!key->alpha.enabled &&
!key->blend.alpha_to_coverage &&
!key->depth.enabled &&
- !shader->info.base.uses_kill
+ !shader->info.base.uses_kill &&
+ !shader->info.base.writes_samplemask
? TRUE : FALSE;
- if ((shader->info.base.num_tokens <= 1) &&
- !key->depth.enabled && !key->stencil[0].enabled) {
- variant->ps_inv_multiplier = 0;
- } else {
- variant->ps_inv_multiplier = 1;
- }
-
if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
lp_debug_fs_variant(variant);
}
switch (shader->info.base.input_interpolate[i]) {
case TGSI_INTERPOLATE_CONSTANT:
- shader->inputs[i].interp = LP_INTERP_CONSTANT;
- break;
+ shader->inputs[i].interp = LP_INTERP_CONSTANT;
+ break;
case TGSI_INTERPOLATE_LINEAR:
- shader->inputs[i].interp = LP_INTERP_LINEAR;
- break;
+ shader->inputs[i].interp = LP_INTERP_LINEAR;
+ break;
case TGSI_INTERPOLATE_PERSPECTIVE:
- shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
- break;
+ shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
+ break;
case TGSI_INTERPOLATE_COLOR:
- shader->inputs[i].interp = LP_INTERP_COLOR;
- break;
+ shader->inputs[i].interp = LP_INTERP_COLOR;
+ break;
default:
- assert(0);
- break;
+ assert(0);
+ break;
}
switch (shader->info.base.input_semantic_name[i]) {
case TGSI_SEMANTIC_FACE:
- shader->inputs[i].interp = LP_INTERP_FACING;
- break;
+ shader->inputs[i].interp = LP_INTERP_FACING;
+ break;
case TGSI_SEMANTIC_POSITION:
- /* Position was already emitted above
- */
- shader->inputs[i].interp = LP_INTERP_POSITION;
- shader->inputs[i].src_index = 0;
- continue;
+ /* Position was already emitted above
+ */
+ shader->inputs[i].interp = LP_INTERP_POSITION;
+ shader->inputs[i].src_index = 0;
+ continue;
}
+ /* XXX this is a completely pointless index map... */
shader->inputs[i].src_index = i+1;
}
* Remove shader variant from two lists: the shader's variant list
* and the context's variant list.
*/
-void
+static void
llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
struct lp_fragment_shader_variant *variant)
{
- if (gallivm_debug & GALLIVM_DEBUG_IR) {
- debug_printf("llvmpipe: del fs #%u var #%u v created #%u v cached"
- " #%u v total cached #%u\n",
- variant->shader->no,
- variant->no,
+ if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
+ debug_printf("llvmpipe: del fs #%u var %u v created %u v cached %u "
+ "v total cached %u inst %u total inst %u\n",
+ variant->shader->no, variant->no,
variant->shader->variants_created,
variant->shader->variants_cached,
- lp->nr_fs_variants);
+ lp->nr_fs_variants, variant->nr_instrs, lp->nr_fs_instrs);
}
gallivm_destroy(variant->gallivm);
static void
llvmpipe_set_constant_buffer(struct pipe_context *pipe,
- uint shader, uint index,
- struct pipe_constant_buffer *cb)
+ enum pipe_shader_type shader, uint index,
+ const struct pipe_constant_buffer *cb)
{
struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
struct pipe_resource *constants = cb ? cb->buffer : NULL;
assert(shader < PIPE_SHADER_TYPES);
- assert(index < Elements(llvmpipe->constants[shader]));
+ assert(index < ARRAY_SIZE(llvmpipe->constants[shader]));
/* note: reference counting */
util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb);
+ if (constants) {
+ if (!(constants->bind & PIPE_BIND_CONSTANT_BUFFER)) {
+ debug_printf("Illegal set constant without bind flag\n");
+ constants->bind |= PIPE_BIND_CONSTANT_BUFFER;
+ }
+ }
+
if (shader == PIPE_SHADER_VERTEX ||
shader == PIPE_SHADER_GEOMETRY) {
/* Pass the constants to the 'draw' module */
draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
index, data, size);
}
-
- llvmpipe->dirty |= LP_NEW_CONSTANTS;
+ else {
+ llvmpipe->dirty |= LP_NEW_FS_CONSTANTS;
+ }
if (cb && cb->user_buffer) {
pipe_resource_reference(&constants, NULL);
}
}
+static void
+llvmpipe_set_shader_buffers(struct pipe_context *pipe,
+ enum pipe_shader_type shader, unsigned start_slot,
+ unsigned count, const struct pipe_shader_buffer *buffers,
+ unsigned writable_bitmask)
+{
+ struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+ unsigned i, idx;
+ for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
+ const struct pipe_shader_buffer *buffer = buffers ? &buffers[idx] : NULL;
+
+ util_copy_shader_buffer(&llvmpipe->ssbos[shader][i], buffer);
+
+ if (shader == PIPE_SHADER_VERTEX ||
+ shader == PIPE_SHADER_GEOMETRY) {
+ const unsigned size = buffer ? buffer->buffer_size : 0;
+ const ubyte *data = NULL;
+ if (buffer && buffer->buffer)
+ data = (ubyte *) llvmpipe_resource_data(buffer->buffer);
+ if (data)
+ data += buffer->buffer_offset;
+ draw_set_mapped_shader_buffer(llvmpipe->draw, shader,
+ i, data, size);
+ } else if (shader == PIPE_SHADER_FRAGMENT) {
+ llvmpipe->dirty |= LP_NEW_FS_SSBOS;
+ }
+ }
+}
/**
* Return the blend factor equivalent to a destination alpha of one.
*/
-static INLINE unsigned
+static inline unsigned
force_dst_alpha_one(unsigned factor, boolean clamped_zero)
{
switch(factor) {
* depth_clip == 0 implies depth clamping is enabled.
*
* When clip_halfz is enabled, then always clamp the depth values.
+ *
+ * XXX: This is incorrect for GL, but correct for d3d10 (depth
+ * clamp is always active in d3d10, regardless if depth clip is
+ * enabled or not).
+ * (GL has an always-on [0,1] clamp on fs depth output instead
+ * to ensure the depth values stay in range. Doesn't look like
+ * we do that, though...)
*/
if (lp->rasterizer->clip_halfz) {
key->depth_clamp = 1;
} else {
- key->depth_clamp = (lp->rasterizer->depth_clip == 0) ? 1 : 0;
+ key->depth_clamp = (lp->rasterizer->depth_clip_near == 0) ? 1 : 0;
}
/* alpha test only applies if render buffer 0 is non-integer (or does not exist) */
* Also, force rgb/alpha func/factors match, to make AoS blending
* easier.
*/
- if (format_desc->swizzle[3] > UTIL_FORMAT_SWIZZLE_W ||
+ if (format_desc->swizzle[3] > PIPE_SWIZZLE_W ||
format_desc->swizzle[3] == format_desc->swizzle[0]) {
/* Doesn't cover mixed snorm/unorm but can't render to them anyway */
boolean clamped_zero = !util_format_is_float(format) &&
if (shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) {
key->nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
for(i = 0; i < key->nr_sampler_views; ++i) {
- if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) {
+ /*
+ * Note sview may exceed what's representable by file_mask.
+ * This will still work, the only downside is that not actually
+ * used views may be included in the shader key.
+ */
+ if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1u << (i & 31))) {
lp_sampler_static_texture_state(&key->state[i].texture_state,
lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
}
unsigned i;
unsigned variants_to_cull;
- if (0) {
+ if (LP_DEBUG & DEBUG_FS) {
debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n",
lp->nr_fs_variants,
lp->nr_fs_instrs,
}
/* First, check if we've exceeded the max number of shader variants.
- * If so, free 25% of them (the least recently used ones).
+ * If so, free 6.25% of them (the least recently used ones).
*/
- variants_to_cull = lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS ? LP_MAX_SHADER_VARIANTS / 4 : 0;
+ variants_to_cull = lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS ? LP_MAX_SHADER_VARIANTS / 16 : 0;
if (variants_to_cull ||
lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) {
struct pipe_context *pipe = &lp->pipe;
+ if (gallivm_debug & GALLIVM_DEBUG_PERF) {
+ debug_printf("Evicting FS: %u fs variants,\t%u total variants,"
+ "\t%u instrs,\t%u instrs/variant\n",
+ shader->variants_cached,
+ lp->nr_fs_variants, lp->nr_fs_instrs,
+ lp->nr_fs_instrs / lp->nr_fs_variants);
+ }
+
/*
* XXX: we need to flush the context until we have some sort of
* reference counting in fragment shaders as they may still be binned
llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state;
llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
+
+ llvmpipe->pipe.set_shader_buffers = llvmpipe_set_shader_buffers;
}
-/*
- * Rasterization is disabled if there is no pixel shader and
- * both depth and stencil testing are disabled:
- * http://msdn.microsoft.com/en-us/library/windows/desktop/bb205125
- */
-boolean
-llvmpipe_rasterization_disabled(struct llvmpipe_context *lp)
-{
- boolean null_fs = !lp->fs || lp->fs->info.base.num_tokens <= 1;
- return (null_fs &&
- !lp->depth_stencil->depth.enabled &&
- !lp->depth_stencil->stencil[0].enabled);
-}