X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fllvmpipe%2Flp_state_fs.c;h=7a19d4eddab81e17ab6a5ff0f0f355764ed8c978;hp=31ed8478e9be93d1a316540bb5a38d04686d6225;hb=f9f60da813e69aacf541d25a24622c896f15ba98;hpb=93b953d139112bea1c9c64a3de462cbb52c544fd diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index 31ed8478e9b..7a19d4eddab 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -1,7 +1,7 @@ /************************************************************************** * * Copyright 2009 VMware, Inc. - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright 2007 VMware, Inc. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -19,7 +19,7 @@ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. @@ -65,9 +65,9 @@ #include "util/u_format.h" #include "util/u_dump.h" #include "util/u_string.h" -#include "util/u_simple_list.h" +#include "util/simple_list.h" #include "util/u_dual_blend.h" -#include "os/os_time.h" +#include "util/os_time.h" #include "pipe/p_shader_tokens.h" #include "draw/draw_context.h" #include "tgsi/tgsi_dump.h" @@ -84,6 +84,7 @@ #include "gallivm/lp_bld_flow.h" #include "gallivm/lp_bld_debug.h" #include "gallivm/lp_bld_arit.h" +#include "gallivm/lp_bld_bitarit.h" #include "gallivm/lp_bld_pack.h" #include "gallivm/lp_bld_format.h" #include "gallivm/lp_bld_quad.h" @@ -127,14 +128,14 @@ generate_quad_mask(struct gallivm_state *gallivm, struct lp_type mask_type; LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); LLVMValueRef bits[16]; - LLVMValueRef mask; + LLVMValueRef mask, bits_vec; int shift, i; /* * XXX: We'll need a different path for 16 x u8 */ assert(fs_type.width == 32); - assert(fs_type.length <= Elements(bits)); + assert(fs_type.length <= ARRAY_SIZE(bits)); mask_type = lp_int_type(fs_type); /* @@ -174,20 +175,20 @@ generate_quad_mask(struct gallivm_state *gallivm, for (i = 0; i < fs_type.length / 4; i++) { unsigned j = 2 * (i % 2) + (i / 2) * 8; - bits[4*i + 0] = LLVMConstInt(i32t, 1 << (j + 0), 0); - bits[4*i + 1] = LLVMConstInt(i32t, 1 << (j + 1), 0); - bits[4*i + 2] = LLVMConstInt(i32t, 1 << (j + 4), 0); - bits[4*i + 3] = LLVMConstInt(i32t, 1 << (j + 5), 0); + bits[4*i + 0] = LLVMConstInt(i32t, 1ULL << (j + 0), 0); + bits[4*i + 1] = LLVMConstInt(i32t, 1ULL << (j + 1), 0); + bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0); + bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0); } - mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, fs_type.length), ""); + bits_vec = LLVMConstVector(bits, fs_type.length); + mask = LLVMBuildAnd(builder, mask, bits_vec, ""); /* - * mask = mask != 0 ? ~0 : 0 + * mask = mask == bits ? ~0 : 0 */ mask = lp_build_compare(gallivm, - mask_type, PIPE_FUNC_NOTEQUAL, - mask, - lp_build_const_int_vec(gallivm, mask_type, 0)); + mask_type, PIPE_FUNC_EQUAL, + mask, bits_vec); return mask; } @@ -238,6 +239,54 @@ lp_llvm_viewport(LLVMValueRef context_ptr, } +static LLVMValueRef +lp_build_depth_clamp(struct gallivm_state *gallivm, + LLVMBuilderRef builder, + struct lp_type type, + LLVMValueRef context_ptr, + LLVMValueRef thread_data_ptr, + LLVMValueRef z) +{ + LLVMValueRef viewport, min_depth, max_depth; + LLVMValueRef viewport_index; + struct lp_build_context f32_bld; + + assert(type.floating); + lp_build_context_init(&f32_bld, gallivm, type); + + /* + * Assumes clamping of the viewport index will occur in setup/gs. Value + * is passed through the rasterization stage via lp_rast_shader_inputs. + * + * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping + * semantics. + */ + viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm, + thread_data_ptr); + + /* + * Load the min and max depth from the lp_jit_context.viewports + * array of lp_jit_viewport structures. + */ + viewport = lp_llvm_viewport(context_ptr, gallivm, viewport_index); + + /* viewports[viewport_index].min_depth */ + min_depth = LLVMBuildExtractElement(builder, viewport, + lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH), ""); + min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth); + + /* viewports[viewport_index].max_depth */ + max_depth = LLVMBuildExtractElement(builder, viewport, + lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH), ""); + max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth); + + /* + * Clamp to the min and max depth values for the given viewport. + */ + return lp_build_clamp(&f32_bld, z, min_depth, max_depth); +} + + /** * Generate the fragment shader, depth/stencil test, and alpha tests. */ @@ -250,7 +299,8 @@ generate_fs_loop(struct gallivm_state *gallivm, LLVMValueRef context_ptr, LLVMValueRef num_loop, struct lp_build_interp_soa_context *interp, - struct lp_build_sampler_soa *sampler, + const struct lp_build_sampler_soa *sampler, + const struct lp_build_image_soa *image, LLVMValueRef mask_store, LLVMValueRef (*out_color)[4], LLVMValueRef depth_ptr, @@ -260,9 +310,11 @@ generate_fs_loop(struct gallivm_state *gallivm, { const struct util_format_description *zs_format_desc = NULL; const struct tgsi_token *tokens = shader->base.tokens; - LLVMTypeRef vec_type; + struct lp_type int_type = lp_int_type(type); + LLVMTypeRef vec_type, int_vec_type; LLVMValueRef mask_ptr, mask_val; LLVMValueRef consts_ptr, num_consts_ptr; + LLVMValueRef ssbo_ptr, num_ssbo_ptr; LLVMValueRef z; LLVMValueRef z_value, s_value; LLVMValueRef z_fb, s_fb; @@ -295,10 +347,15 @@ generate_fs_loop(struct gallivm_state *gallivm, zs_format_desc = util_format_description(key->zsbuf_format); assert(zs_format_desc); - if (!shader->info.base.writes_z) { - if (key->alpha.enabled || + if (shader->info.base.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]) + depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE; + else if (!shader->info.base.writes_z && !shader->info.base.writes_stencil) { + if (shader->info.base.writes_memory) + depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE; + else if (key->alpha.enabled || key->blend.alpha_to_coverage || - shader->info.base.uses_kill) { + shader->info.base.uses_kill || + shader->info.base.writes_samplemask) { /* With alpha test and kill, can do the depth test early * and hopefully eliminate some quads. But need to do a * special deferred depth write once the final mask value @@ -329,15 +386,21 @@ generate_fs_loop(struct gallivm_state *gallivm, depth_mode = 0; } + vec_type = lp_build_vec_type(gallivm, type); + int_vec_type = lp_build_vec_type(gallivm, int_type); stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_ptr); stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_ptr); - - vec_type = lp_build_vec_type(gallivm, type); + /* convert scalar stencil refs into vectors */ + stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]); + stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]); consts_ptr = lp_jit_context_constants(gallivm, context_ptr); num_consts_ptr = lp_jit_context_num_constants(gallivm, context_ptr); + ssbo_ptr = lp_jit_context_ssbos(gallivm, context_ptr); + num_ssbo_ptr = lp_jit_context_num_ssbos(gallivm, context_ptr); + lp_build_for_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0), LLVMIntULT, @@ -379,6 +442,13 @@ generate_fs_loop(struct gallivm_state *gallivm, z = interp->pos[2]; if (depth_mode & EARLY_DEPTH_TEST) { + /* + * Clamp according to ARB_depth_clamp semantics. + */ + if (key->depth_clamp) { + z = lp_build_depth_clamp(gallivm, builder, type, context_ptr, + thread_data_ptr, z); + } lp_build_depth_stencil_load_swizzled(gallivm, type, zs_format_desc, key->resource_1d, depth_ptr, depth_stride, @@ -413,11 +483,26 @@ generate_fs_loop(struct gallivm_state *gallivm, lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter); + struct lp_build_tgsi_params params; + memset(¶ms, 0, sizeof(params)); + + params.type = type; + params.mask = &mask; + params.consts_ptr = consts_ptr; + params.const_sizes_ptr = num_consts_ptr; + params.system_values = &system_values; + params.inputs = interp->inputs; + params.context_ptr = context_ptr; + params.thread_data_ptr = thread_data_ptr; + params.sampler = sampler; + params.info = &shader->info.base; + params.ssbo_ptr = ssbo_ptr; + params.ssbo_sizes_ptr = num_ssbo_ptr; + params.image = image; + /* Build the actual shader */ - lp_build_tgsi_soa(gallivm, tokens, type, &mask, - consts_ptr, num_consts_ptr, &system_values, - interp->inputs, - outputs, sampler, &shader->info.base, NULL); + lp_build_tgsi_soa(gallivm, tokens, ¶ms, + outputs); /* Alpha test */ if (key->alpha.enabled) { @@ -456,59 +541,51 @@ generate_fs_loop(struct gallivm_state *gallivm, } } + if (shader->info.base.writes_samplemask) { + int smaski = find_output_by_semantic(&shader->info.base, + TGSI_SEMANTIC_SAMPLEMASK, + 0); + LLVMValueRef smask; + struct lp_build_context smask_bld; + lp_build_context_init(&smask_bld, gallivm, int_type); + + assert(smaski >= 0); + smask = LLVMBuildLoad(builder, outputs[smaski][0], "smask"); + /* + * Pixel is alive according to the first sample in the mask. + */ + smask = LLVMBuildBitCast(builder, smask, smask_bld.vec_type, ""); + smask = lp_build_and(&smask_bld, smask, smask_bld.one); + smask = lp_build_cmp(&smask_bld, PIPE_FUNC_NOTEQUAL, smask, smask_bld.zero); + lp_build_mask_update(&mask, smask); + } + /* Late Z test */ if (depth_mode & LATE_DEPTH_TEST) { int pos0 = find_output_by_semantic(&shader->info.base, TGSI_SEMANTIC_POSITION, 0); - + int s_out = find_output_by_semantic(&shader->info.base, + TGSI_SEMANTIC_STENCIL, + 0); if (pos0 != -1 && outputs[pos0][2]) { z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z"); + } + /* + * Clamp according to ARB_depth_clamp semantics. + */ + if (key->depth_clamp) { + z = lp_build_depth_clamp(gallivm, builder, type, context_ptr, + thread_data_ptr, z); + } - /* - * Clamp according to ARB_depth_clamp semantics. - */ - if (key->depth_clamp) { - LLVMValueRef viewport, min_depth, max_depth; - LLVMValueRef viewport_index; - struct lp_build_context f32_bld; - - assert(type.floating); - lp_build_context_init(&f32_bld, gallivm, type); - - /* - * Assumes clamping of the viewport index will occur in setup/gs. Value - * is passed through the rasterization stage via lp_rast_shader_inputs. - * - * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping - * semantics. - */ - viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm, - thread_data_ptr); - - /* - * Load the min and max depth from the lp_jit_context.viewports - * array of lp_jit_viewport structures. - */ - viewport = lp_llvm_viewport(context_ptr, gallivm, viewport_index); - - /* viewports[viewport_index].min_depth */ - min_depth = LLVMBuildExtractElement(builder, viewport, - lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH), - ""); - min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth); - - /* viewports[viewport_index].max_depth */ - max_depth = LLVMBuildExtractElement(builder, viewport, - lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH), - ""); - max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth); - - /* - * Clamp to the min and max depth values for the given viewport. - */ - z = lp_build_clamp(&f32_bld, z, min_depth, max_depth); - } + if (s_out != -1 && outputs[s_out][1]) { + /* there's only one value, and spec says to discard additional bits */ + LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255); + stencil_refs[0] = LLVMBuildLoad(builder, outputs[s_out][1], "output.s"); + stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, ""); + stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, ""); + stencil_refs[1] = stencil_refs[0]; } lp_build_depth_stencil_load_swizzled(gallivm, type, @@ -630,7 +707,7 @@ generate_fs_twiddle(struct gallivm_state *gallivm, src_count = num_fs * src_channels; assert(pixels == 2 || pixels == 1); - assert(num_fs * src_channels <= Elements(src)); + assert(num_fs * src_channels <= ARRAY_SIZE(src)); /* * Transpose from SoA -> AoS @@ -701,6 +778,10 @@ generate_fs_twiddle(struct gallivm_state *gallivm, } } else if (twiddle) { /* Twiddle pixels across elements of array */ + /* + * XXX: we should avoid this in some cases, but would need to tell + * lp_build_conv to reorder (or deal with it ourselves). + */ lp_bld_quad_twiddle(gallivm, type, src, src_count, dst); } else { /* Do nothing */ @@ -731,6 +812,94 @@ generate_fs_twiddle(struct gallivm_state *gallivm, } +/* + * Untwiddle and transpose, much like the above. + * However, this is after conversion, so we get packed vectors. + * At this time only handle 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data, + * the vectors will look like: + * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may + * be swizzled here). Extending to 16bit should be trivial. + * Should also be extended to handle twice wide vectors with AVX2... + */ +static void +fs_twiddle_transpose(struct gallivm_state *gallivm, + struct lp_type type, + LLVMValueRef *src, + unsigned src_count, + LLVMValueRef *dst) +{ + unsigned i, j; + struct lp_type type64, type16, type32; + LLVMTypeRef type64_t, type8_t, type16_t, type32_t; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef tmp[4], shuf[8]; + for (j = 0; j < 2; j++) { + shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0); + shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2); + shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1); + shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3); + } + + assert(src_count == 4 || src_count == 2 || src_count == 1); + assert(type.width == 8); + assert(type.length == 16); + + type8_t = lp_build_vec_type(gallivm, type); + + type64 = type; + type64.length /= 8; + type64.width *= 8; + type64_t = lp_build_vec_type(gallivm, type64); + + type16 = type; + type16.length /= 2; + type16.width *= 2; + type16_t = lp_build_vec_type(gallivm, type16); + + type32 = type; + type32.length /= 4; + type32.width *= 4; + type32_t = lp_build_vec_type(gallivm, type32); + + lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp); + + if (src_count == 1) { + /* transpose was no-op, just untwiddle */ + LLVMValueRef shuf_vec; + shuf_vec = LLVMConstVector(shuf, 8); + tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, ""); + tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, ""); + dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, ""); + } else if (src_count == 2) { + LLVMValueRef shuf_vec; + shuf_vec = LLVMConstVector(shuf, 4); + + for (i = 0; i < 2; i++) { + tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, ""); + tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, ""); + dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, ""); + } + } else { + for (j = 0; j < 2; j++) { + LLVMValueRef lo, hi, lo2, hi2; + /* + * Note that if we only really have 3 valid channels (rgb) + * and we don't need alpha we could substitute a undef here + * for the respective channel (causing llvm to drop conversion + * for alpha). + */ + /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */ + lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, ""); + hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, ""); + lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0); + hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1); + dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, ""); + dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, ""); + } + } +} + + /** * Load an unswizzled block of pixels from memory */ @@ -766,11 +935,12 @@ load_unswizzled_block(struct gallivm_state *gallivm, gep[1] = LLVMBuildAdd(builder, bx, by, ""); dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, ""); - dst_ptr = LLVMBuildBitCast(builder, dst_ptr, LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), ""); + dst_ptr = LLVMBuildBitCast(builder, dst_ptr, + LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), ""); dst[i] = LLVMBuildLoad(builder, dst_ptr, ""); - lp_set_load_alignment(dst[i], dst_alignment); + LLVMSetAlignment(dst[i], dst_alignment); } } @@ -810,11 +980,12 @@ store_unswizzled_block(struct gallivm_state *gallivm, gep[1] = LLVMBuildAdd(builder, bx, by, ""); src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, ""); - src_ptr = LLVMBuildBitCast(builder, src_ptr, LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), ""); + src_ptr = LLVMBuildBitCast(builder, src_ptr, + LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), ""); src_ptr = LLVMBuildStore(builder, src[i], src_ptr); - lp_set_store_alignment(src_ptr, src_alignment); + LLVMSetAlignment(src_ptr, src_alignment); } } @@ -824,7 +995,7 @@ store_unswizzled_block(struct gallivm_state *gallivm, * * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5. */ -static INLINE boolean +static inline boolean is_arithmetic_format(const struct util_format_description *format_desc) { boolean arith = false; @@ -844,7 +1015,7 @@ is_arithmetic_format(const struct util_format_description *format_desc) * to floats for blending, and furthermore has "natural" packed AoS -> unpacked * SoA conversion. */ -static INLINE boolean +static inline boolean format_expands_to_float_soa(const struct util_format_description *format_desc) { if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT || @@ -860,7 +1031,7 @@ format_expands_to_float_soa(const struct util_format_description *format_desc) * * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte */ -static INLINE void +static inline void lp_mem_type_from_format_desc(const struct util_format_description *format_desc, struct lp_type* type) { @@ -868,12 +1039,12 @@ lp_mem_type_from_format_desc(const struct util_format_description *format_desc, unsigned chan; if (format_expands_to_float_soa(format_desc)) { - /* just make this a 32bit uint */ + /* just make this a uint with width of block */ type->floating = false; type->fixed = false; type->sign = false; type->norm = false; - type->width = 32; + type->width = format_desc->block.bits; type->length = 1; return; } @@ -908,7 +1079,7 @@ lp_mem_type_from_format_desc(const struct util_format_description *format_desc, * * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte */ -static INLINE void +static inline void lp_blend_type_from_format_desc(const struct util_format_description *format_desc, struct lp_type* type) { @@ -980,7 +1151,7 @@ lp_blend_type_from_format_desc(const struct util_format_description *format_desc * * but we try to avoid division and multiplication through shifts. */ -static INLINE LLVMValueRef +static inline LLVMValueRef scale_bits(struct gallivm_state *gallivm, int src_bits, int dst_bits, @@ -1063,7 +1234,7 @@ scale_bits(struct gallivm_state *gallivm, lp_build_const_int_vec(gallivm, src_type, db), ""); - if (db < src_bits) { + if (db <= src_bits) { /* Enough bits in src to fill the remainder */ LLVMValueRef lower = LLVMBuildLShr(builder, src, @@ -1089,6 +1260,19 @@ scale_bits(struct gallivm_state *gallivm, return result; } +/** + * If RT is a smallfloat (needing denorms) format + */ +static inline int +have_smallfloat_format(struct lp_type dst_type, + enum pipe_format format) +{ + return ((dst_type.floating && dst_type.width != 32) || + /* due to format handling hacks this format doesn't have floating set + * here (and actually has width set to 32 too) so special case this. */ + (format == PIPE_FORMAT_R11G11B10_FLOAT)); +} + /** * Convert from memory format to blending format @@ -1108,7 +1292,7 @@ convert_to_blend_type(struct gallivm_state *gallivm, LLVMBuilderRef builder = gallivm->builder; struct lp_type blend_type; struct lp_type mem_type; - unsigned i, j, k; + unsigned i, j; unsigned pixels = block_size / num_srcs; bool is_arith; @@ -1124,12 +1308,24 @@ convert_to_blend_type(struct gallivm_state *gallivm, * This is pretty suboptimal for this case blending in SoA would be much * better, since conversion gets us SoA values so need to convert back. */ - assert(src_type.width == 32); + assert(src_type.width == 32 || src_type.width == 16); assert(dst_type.floating); assert(dst_type.width == 32); assert(dst_type.length % 4 == 0); assert(num_srcs % 4 == 0); + if (src_type.width == 16) { + /* expand 4x16bit values to 4x32bit */ + struct lp_type type32x4 = src_type; + LLVMTypeRef ltype32x4; + unsigned num_fetch = dst_type.length == 8 ? num_srcs / 2 : num_srcs / 4; + type32x4.width = 32; + ltype32x4 = lp_build_vec_type(gallivm, type32x4); + for (i = 0; i < num_fetch; i++) { + src[i] = LLVMBuildZExt(builder, src[i], ltype32x4, ""); + } + src_type.width = 32; + } for (i = 0; i < 4; i++) { tmpsrc[i] = src[i]; } @@ -1203,15 +1399,13 @@ convert_to_blend_type(struct gallivm_state *gallivm, for (j = 0; j < src_fmt->nr_channels; ++j) { unsigned mask = 0; unsigned sa = src_fmt->channel[j].shift; -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if PIPE_ARCH_LITTLE_ENDIAN unsigned from_lsb = j; #else unsigned from_lsb = src_fmt->nr_channels - j - 1; #endif - for (k = 0; k < src_fmt->channel[j].size; ++k) { - mask |= 1 << k; - } + mask = (1 << src_fmt->channel[j].size) - 1; /* Extract bits from source */ chans[j] = LLVMBuildLShr(builder, @@ -1285,7 +1479,7 @@ convert_from_blend_type(struct gallivm_state *gallivm, assert(src_type.floating); assert(src_type.width == 32); assert(src_type.length % 4 == 0); - assert(dst_type.width == 32); + assert(dst_type.width == 32 || dst_type.width == 16); for (i = 0; i < num_srcs / 4; i++) { LLVMValueRef tmpsoa[4], tmpdst; @@ -1320,6 +1514,25 @@ convert_from_blend_type(struct gallivm_state *gallivm, src[i] = tmpdst; } } + if (dst_type.width == 16) { + struct lp_type type16x8 = dst_type; + struct lp_type type32x4 = dst_type; + LLVMTypeRef ltype16x4, ltypei64, ltypei128; + unsigned num_fetch = src_type.length == 8 ? num_srcs / 2 : num_srcs / 4; + type16x8.length = 8; + type32x4.width = 32; + ltypei128 = LLVMIntTypeInContext(gallivm->context, 128); + ltypei64 = LLVMIntTypeInContext(gallivm->context, 64); + ltype16x4 = lp_build_vec_type(gallivm, dst_type); + /* We could do vector truncation but it doesn't generate very good code */ + for (i = 0; i < num_fetch; i++) { + src[i] = lp_build_pack2(gallivm, type32x4, type16x8, + src[i], lp_build_zero(gallivm, type32x4)); + src[i] = LLVMBuildBitCast(builder, src[i], ltypei128, ""); + src[i] = LLVMBuildTrunc(builder, src[i], ltypei64, ""); + src[i] = LLVMBuildBitCast(builder, src[i], ltype16x4, ""); + } + } return; } @@ -1368,7 +1581,7 @@ convert_from_blend_type(struct gallivm_state *gallivm, for (j = 0; j < src_fmt->nr_channels; ++j) { unsigned mask = 0; unsigned sa = src_fmt->channel[j].shift; -#ifdef PIPE_ARCH_LITTLE_ENDIAN +#if PIPE_ARCH_LITTLE_ENDIAN unsigned from_lsb = j; #else unsigned from_lsb = src_fmt->nr_channels - j - 1; @@ -1383,7 +1596,8 @@ convert_from_blend_type(struct gallivm_state *gallivm, /* Extract bits */ chans[j] = LLVMBuildLShr(builder, dst[i], - lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width), + lp_build_const_int_vec(gallivm, src_type, + from_lsb * blend_type.width), ""); chans[j] = LLVMBuildAnd(builder, @@ -1471,7 +1685,8 @@ convert_alpha(struct gallivm_state *gallivm, /* If there is a src for each pixel broadcast the alpha across whole row */ if (src_count == block_size) { for (i = 0; i < src_count; ++i) { - src_alpha[i] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, row_type), src_alpha[i]); + src_alpha[i] = lp_build_broadcast(gallivm, + lp_build_vec_type(gallivm, row_type), src_alpha[i]); } } else { unsigned pixels = block_size / src_count; @@ -1541,7 +1756,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS]; LLVMValueRef fs_src1[4][TGSI_NUM_CHANNELS]; LLVMValueRef src_alpha[4 * 4]; - LLVMValueRef src1_alpha[4 * 4]; + LLVMValueRef src1_alpha[4 * 4] = { NULL }; LLVMValueRef src_mask[4 * 4]; LLVMValueRef src[4 * 4]; LLVMValueRef src1[4 * 4]; @@ -1557,6 +1772,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, struct lp_type blend_type; struct lp_type row_type; struct lp_type dst_type; + struct lp_type ls_type; unsigned char swizzle[TGSI_NUM_CHANNELS]; unsigned vector_width; @@ -1576,6 +1792,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, util_blend_state_is_dual(&variant->key.blend, 0); const boolean is_1d = variant->key.resource_1d; + boolean twiddle_after_convert = FALSE; unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs; LLVMValueRef fpstate = 0; @@ -1593,7 +1810,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, * this, otherwise half-float format conversions won't work * (again due to llvm bug #6393). */ - if (dst_type.floating && dst_type.width != 32) { + if (have_smallfloat_format(dst_type, out_format)) { /* We need to make sure that denorms are ok for half float conversions */ fpstate = lp_build_fpstate_get(gallivm); @@ -1622,15 +1839,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, partial_mask |= !variant->opaque; i32_zero = lp_build_const_int32(gallivm, 0); -#if HAVE_LLVM < 0x0302 - /* - * undef triggers a crash in LLVMBuildTrunc in convert_from_blend_type in some - * cases (seen with r10g10b10a2, 128bit wide vectors) (only used for 1d case). - */ - undef_src_val = lp_build_zero(gallivm, fs_type); -#else undef_src_val = lp_build_undef(gallivm, fs_type); -#endif row_type.length = fs_type.length; vector_width = dst_type.floating ? lp_native_vector_width : lp_integer_vector_width; @@ -1679,13 +1888,23 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, } /* If 3 channels then pad to include alpha for 4 element transpose */ - if (dst_channels == 3 && !has_alpha) { + if (dst_channels == 3) { + assert (!has_alpha); for (i = 0; i < TGSI_NUM_CHANNELS; i++) { if (swizzle[i] > TGSI_NUM_CHANNELS) swizzle[i] = 3; } if (out_format_desc->nr_channels == 4) { dst_channels = 4; + /* + * We use alpha from the color conversion, not separate one. + * We had to include it for transpose, hence it will get converted + * too (albeit when doing transpose after conversion, that would + * no longer be the case necessarily). + * (It works only with 4 channel dsts, e.g. rgbx formats, because + * otherwise we really have padding, not alpha, included.) + */ + has_alpha = true; } } @@ -1717,6 +1936,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, /* * XXX If we include that here maybe could actually use it instead of * separate alpha for blending? + * (Difficult though we actually convert pad channels, not alpha.) */ if (dst_channels == 3 && !has_alpha) { fs_src[i][3] = alpha; @@ -1724,11 +1944,14 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, /* We split the row_mask and row_alpha as we want 128bit interleave */ if (fs_type.length == 8) { - src_mask[i*2 + 0] = lp_build_extract_range(gallivm, fs_mask[i], 0, src_channels); - src_mask[i*2 + 1] = lp_build_extract_range(gallivm, fs_mask[i], src_channels, src_channels); + src_mask[i*2 + 0] = lp_build_extract_range(gallivm, fs_mask[i], + 0, src_channels); + src_mask[i*2 + 1] = lp_build_extract_range(gallivm, fs_mask[i], + src_channels, src_channels); src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels); - src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels); + src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, + src_channels, src_channels); } else { src_mask[i] = fs_mask[i]; src_alpha[i] = alpha; @@ -1759,7 +1982,8 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, } if (fs_type.length == 8) { src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels); - src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels); + src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, + src_channels, src_channels); } else { src1_alpha[i] = alpha; } @@ -1785,14 +2009,45 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, } } + /* + * We actually should generally do conversion first (for non-1d cases) + * when the blend format is 8 or 16 bits. The reason is obvious, + * there's 2 or 4 times less vectors to deal with for the interleave... + * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit + * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit + * unpack only with 128bit vectors). + * Note: for 16bit sizes really need matching pack conversion code + */ + if (!is_1d && dst_channels != 3 && dst_type.width == 8) { + twiddle_after_convert = TRUE; + } + /* * Pixel twiddle from fragment shader order to memory order */ - src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, - dst_channels, fs_src, src, pad_inline); - if (dual_source_blend) { - generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels, - fs_src1, src1, pad_inline); + if (!twiddle_after_convert) { + src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, + dst_channels, fs_src, src, pad_inline); + if (dual_source_blend) { + generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels, + fs_src1, src1, pad_inline); + } + } else { + src_count = num_fullblock_fs * dst_channels; + /* + * We reorder things a bit here, so the cases for 4-wide and 8-wide + * (AVX) turn out the same later when untwiddling/transpose (albeit + * for true AVX2 path untwiddle needs to be different). + * For now just order by colors first (so we can use unpack later). + */ + for (j = 0; j < num_fullblock_fs; j++) { + for (i = 0; i < dst_channels; i++) { + src[i*num_fullblock_fs + j] = fs_src[j][i]; + if (dual_source_blend) { + src1[i*num_fullblock_fs + j] = fs_src1[j][i]; + } + } + } } src_channels = dst_channels < 3 ? dst_channels : 4; @@ -1836,13 +2091,21 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, assert(bits == 128 || bits == 256); } + if (twiddle_after_convert) { + fs_twiddle_transpose(gallivm, row_type, src, src_count, src); + if (dual_source_blend) { + fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1); + } + } /* * Blend Colour conversion */ blend_color = lp_jit_context_f_blend_color(gallivm, context_ptr); - blend_color = LLVMBuildPointerCast(builder, blend_color, LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), ""); - blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color, &i32_zero, 1, ""), ""); + blend_color = LLVMBuildPointerCast(builder, blend_color, + LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), ""); + blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color, + &i32_zero, 1, ""), ""); /* Convert */ lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1, &blend_color, 1); @@ -1919,13 +2182,19 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, mask_type.length = pixels; mask_type.width = row_type.width * dst_channels; - src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), ""); + /* + * If mask_type width is smaller than 32bit, this doesn't quite + * generate the most efficient code (could use some pack). + */ + src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], + lp_build_int_vec_type(gallivm, mask_type), ""); mask_type.length *= dst_channels; mask_type.width /= dst_channels; } - src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), ""); + src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], + lp_build_int_vec_type(gallivm, mask_type), ""); src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length); } @@ -1990,17 +2259,41 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, */ dst_alignment = MIN2(16, dst_alignment); + ls_type = dst_type; + + if (dst_count > src_count) { + if ((dst_type.width == 8 || dst_type.width == 16) && + util_is_power_of_two_or_zero(dst_type.length) && + dst_type.length * dst_type.width < 128) { + /* + * Never try to load values as 4xi8 which we will then + * concatenate to larger vectors. This gives llvm a real + * headache (the problem is the type legalizer (?) will + * try to load that as 4xi8 zext to 4xi32 to fill the vector, + * then the shuffles to concatenate are more or less impossible + * - llvm is easily capable of generating a sequence of 32 + * pextrb/pinsrb instructions for that. Albeit it appears to + * be fixed in llvm 4.0. So, load and concatenate with 32bit + * width to avoid the trouble (16bit seems not as bad, llvm + * probably recognizes the load+shuffle as only one shuffle + * is necessary, but we can do just the same anyway). + */ + ls_type.length = dst_type.length * dst_type.width / 32; + ls_type.width = 32; + } + } + if (is_1d) { load_unswizzled_block(gallivm, color_ptr, stride, block_width, 1, - dst, dst_type, dst_count / 4, dst_alignment); + dst, ls_type, dst_count / 4, dst_alignment); for (i = dst_count / 4; i < dst_count; i++) { - dst[i] = lp_build_undef(gallivm, dst_type); + dst[i] = lp_build_undef(gallivm, ls_type); } } else { load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, - dst, dst_type, dst_count, dst_alignment); + dst, ls_type, dst_count, dst_alignment); } @@ -2015,7 +2308,24 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, * on all 16 pixels in that single vector at once. */ if (dst_count > src_count) { - lp_build_concat_n(gallivm, dst_type, dst, 4, dst, src_count); + if (ls_type.length != dst_type.length && ls_type.length == 1) { + LLVMTypeRef elem_type = lp_build_elem_type(gallivm, ls_type); + LLVMTypeRef ls_vec_type = LLVMVectorType(elem_type, 1); + for (i = 0; i < dst_count; i++) { + dst[i] = LLVMBuildBitCast(builder, dst[i], ls_vec_type, ""); + } + } + + lp_build_concat_n(gallivm, ls_type, dst, 4, dst, src_count); + + if (ls_type.length != dst_type.length) { + struct lp_type tmp_type = dst_type; + tmp_type.length = dst_type.length * 4 / src_count; + for (i = 0; i < src_count; i++) { + dst[i] = LLVMBuildBitCast(builder, dst[i], + lp_build_vec_type(gallivm, tmp_type), ""); + } + } } /* @@ -2030,7 +2340,8 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, * It seems some cleanup could be done here (like skipping conversion/blend * when not needed). */ - convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type, row_type, dst, src_count); + convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type, + row_type, dst, src_count); /* * FIXME: Really should get logic ops / masks out of generic blend / row @@ -2056,7 +2367,8 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, pad_inline ? 4 : dst_channels); } - convert_from_blend_type(gallivm, block_size, out_format_desc, row_type, dst_type, dst, src_count); + convert_from_blend_type(gallivm, block_size, out_format_desc, + row_type, dst_type, dst, src_count); /* Split the blend rows back to memory rows */ if (dst_count > src_count) { @@ -2091,7 +2403,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm, dst, dst_type, dst_count, dst_alignment); } - if (dst_type.floating && dst_type.width != 32) { + if (have_smallfloat_format(dst_type, out_format)) { lp_build_fpstate_set(gallivm, fpstate); } @@ -2114,9 +2426,9 @@ generate_fragment(struct llvmpipe_context *lp, unsigned partial_mask) { struct gallivm_state *gallivm = variant->gallivm; - const struct lp_fragment_shader_variant_key *key = &variant->key; + struct lp_fragment_shader_variant_key *key = &variant->key; struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; - char func_name[256]; + char func_name[64]; struct lp_type fs_type; struct lp_type blend_type; LLVMTypeRef fs_elem_type; @@ -2140,6 +2452,7 @@ generate_fragment(struct llvmpipe_context *lp, LLVMBasicBlockRef block; LLVMBuilderRef builder; struct lp_build_sampler_soa *sampler; + struct lp_build_image_soa *image; struct lp_build_interp_soa_context interp; LLVMValueRef fs_mask[16 / 4]; LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4]; @@ -2168,14 +2481,8 @@ generate_fragment(struct llvmpipe_context *lp, } /* check if writes to cbuf[0] are to be copied to all cbufs */ - cbuf0_write_all = FALSE; - for (i = 0;i < shader->info.base.num_properties; i++) { - if (shader->info.base.properties[i].name == - TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS) { - cbuf0_write_all = TRUE; - break; - } - } + cbuf0_write_all = + shader->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]; /* TODO: actually pick these based on the fs and color buffer * characteristics. */ @@ -2203,8 +2510,8 @@ generate_fragment(struct llvmpipe_context *lp, blend_vec_type = lp_build_vec_type(gallivm, blend_type); - util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s", - shader->no, variant->no, partial_mask ? "partial" : "whole"); + snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s", + shader->no, variant->no, partial_mask ? "partial" : "whole"); arg_types[0] = variant->jit_context_ptr_type; /* context */ arg_types[1] = int32_type; /* x */ @@ -2221,7 +2528,7 @@ generate_fragment(struct llvmpipe_context *lp, arg_types[12] = int32_type; /* depth_stride */ func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context), - arg_types, Elements(arg_types), 0); + arg_types, ARRAY_SIZE(arg_types), 0); function = LLVMAddFunction(gallivm->module, func_name, func_type); LLVMSetFunctionCallConv(function, LLVMCCallConv); @@ -2231,9 +2538,9 @@ generate_fragment(struct llvmpipe_context *lp, /* XXX: need to propagate noalias down into color param now we are * passing a pointer-to-pointer? */ - for(i = 0; i < Elements(arg_types); ++i) + for(i = 0; i < ARRAY_SIZE(arg_types); ++i) if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) - LLVMAddAttribute(LLVMGetParam(function, i), LLVMNoAliasAttribute); + lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS); context_ptr = LLVMGetParam(function, 0); x = LLVMGetParam(function, 1); @@ -2257,8 +2564,8 @@ generate_fragment(struct llvmpipe_context *lp, lp_build_name(dady_ptr, "dady"); lp_build_name(color_ptr_ptr, "color_ptr_ptr"); lp_build_name(depth_ptr, "depth"); - lp_build_name(thread_data_ptr, "thread_data"); lp_build_name(mask_input, "mask_input"); + lp_build_name(thread_data_ptr, "thread_data"); lp_build_name(stride_ptr, "stride_ptr"); lp_build_name(depth_stride, "depth_stride"); @@ -2271,8 +2578,28 @@ generate_fragment(struct llvmpipe_context *lp, assert(builder); LLVMPositionBuilderAtEnd(builder, block); + /* + * Must not count ps invocations if there's a null shader. + * (It would be ok to count with null shader if there's d/s tests, + * but only if there's d/s buffers too, which is different + * to implicit rasterization disable which must not depend + * on the d/s buffers.) + * Could use popcount on mask, but pixel accuracy is not required. + * Could disable if there's no stats query, but maybe not worth it. + */ + if (shader->info.base.num_instructions > 1) { + LLVMValueRef invocs, val; + invocs = lp_jit_thread_data_invocations(gallivm, thread_data_ptr); + val = LLVMBuildLoad(builder, invocs, ""); + val = LLVMBuildAdd(builder, val, + LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), 1, 0), + "invoc_count"); + LLVMBuildStore(builder, val, invocs); + } + /* code generated texture sampling */ - sampler = lp_llvm_sampler_soa_create(key->state, context_ptr); + sampler = lp_llvm_sampler_soa_create(key->samplers); + image = lp_llvm_image_soa_create(lp_fs_variant_key_images(key)); num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */ /* for 1d resources only run "upper half" of stamp */ @@ -2285,6 +2612,8 @@ generate_fragment(struct llvmpipe_context *lp, LLVMValueRef mask_store = lp_build_array_alloca(gallivm, mask_type, num_loop, "mask_store"); LLVMValueRef color_store[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS]; + boolean pixel_center_integer = + shader->info.base.properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER]; /* * The shader input interpolation info is not explicitely baked in the @@ -2295,7 +2624,8 @@ generate_fragment(struct llvmpipe_context *lp, gallivm, shader->info.base.num_inputs, inputs, - shader->info.base.pixel_center_integer, + pixel_center_integer, + key->depth_clamp, builder, fs_type, a0_ptr, dadx_ptr, dady_ptr, x, y); @@ -2324,6 +2654,7 @@ generate_fragment(struct llvmpipe_context *lp, num_loop, &interp, sampler, + image, mask_store, /* output */ color_store, depth_ptr, @@ -2358,44 +2689,47 @@ generate_fragment(struct llvmpipe_context *lp, } sampler->destroy(sampler); - + image->destroy(image); /* Loop over color outputs / color buffers to do blending. */ for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) { - LLVMValueRef color_ptr; - LLVMValueRef stride; - LLVMValueRef index = lp_build_const_int32(gallivm, cbuf); + if (key->cbuf_format[cbuf] != PIPE_FORMAT_NONE) { + LLVMValueRef color_ptr; + LLVMValueRef stride; + LLVMValueRef index = lp_build_const_int32(gallivm, cbuf); - boolean do_branch = ((key->depth.enabled - || key->stencil[0].enabled - || key->alpha.enabled) - && !shader->info.base.uses_kill); + boolean do_branch = ((key->depth.enabled + || key->stencil[0].enabled + || key->alpha.enabled) + && !shader->info.base.uses_kill); - color_ptr = LLVMBuildLoad(builder, - LLVMBuildGEP(builder, color_ptr_ptr, &index, 1, ""), - ""); + color_ptr = LLVMBuildLoad(builder, + LLVMBuildGEP(builder, color_ptr_ptr, + &index, 1, ""), + ""); - lp_build_name(color_ptr, "color_ptr%d", cbuf); + lp_build_name(color_ptr, "color_ptr%d", cbuf); - stride = LLVMBuildLoad(builder, - LLVMBuildGEP(builder, stride_ptr, &index, 1, ""), - ""); + stride = LLVMBuildLoad(builder, + LLVMBuildGEP(builder, stride_ptr, &index, 1, ""), + ""); - generate_unswizzled_blend(gallivm, cbuf, variant, key->cbuf_format[cbuf], - num_fs, fs_type, fs_mask, fs_out_color, - context_ptr, color_ptr, stride, partial_mask, do_branch); + generate_unswizzled_blend(gallivm, cbuf, variant, + key->cbuf_format[cbuf], + num_fs, fs_type, fs_mask, fs_out_color, + context_ptr, color_ptr, stride, + partial_mask, do_branch); + } } LLVMBuildRetVoid(builder); gallivm_verify_function(gallivm, function); - - variant->nr_instrs += lp_build_count_instructions(function); } static void -dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key) +dump_fs_variant_key(struct lp_fragment_shader_variant_key *key) { unsigned i; @@ -2407,25 +2741,27 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key) for (i = 0; i < key->nr_cbufs; ++i) { debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i])); } - if (key->depth.enabled) { + if (key->depth.enabled || key->stencil[0].enabled) { debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format)); - debug_printf("depth.func = %s\n", util_dump_func(key->depth.func, TRUE)); + } + if (key->depth.enabled) { + debug_printf("depth.func = %s\n", util_str_func(key->depth.func, TRUE)); debug_printf("depth.writemask = %u\n", key->depth.writemask); } for (i = 0; i < 2; ++i) { if (key->stencil[i].enabled) { - debug_printf("stencil[%u].func = %s\n", i, util_dump_func(key->stencil[i].func, TRUE)); - debug_printf("stencil[%u].fail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].fail_op, TRUE)); - debug_printf("stencil[%u].zpass_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zpass_op, TRUE)); - debug_printf("stencil[%u].zfail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zfail_op, TRUE)); + debug_printf("stencil[%u].func = %s\n", i, util_str_func(key->stencil[i].func, TRUE)); + debug_printf("stencil[%u].fail_op = %s\n", i, util_str_stencil_op(key->stencil[i].fail_op, TRUE)); + debug_printf("stencil[%u].zpass_op = %s\n", i, util_str_stencil_op(key->stencil[i].zpass_op, TRUE)); + debug_printf("stencil[%u].zfail_op = %s\n", i, util_str_stencil_op(key->stencil[i].zfail_op, TRUE)); debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask); debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask); } } if (key->alpha.enabled) { - debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE)); + debug_printf("alpha.func = %s\n", util_str_func(key->alpha.func, TRUE)); } if (key->occlusion_count) { @@ -2433,35 +2769,35 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key) } if (key->blend.logicop_enable) { - debug_printf("blend.logicop_func = %s\n", util_dump_logicop(key->blend.logicop_func, TRUE)); + debug_printf("blend.logicop_func = %s\n", util_str_logicop(key->blend.logicop_func, TRUE)); } else if (key->blend.rt[0].blend_enable) { - debug_printf("blend.rgb_func = %s\n", util_dump_blend_func (key->blend.rt[0].rgb_func, TRUE)); - debug_printf("blend.rgb_src_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE)); - debug_printf("blend.rgb_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE)); - debug_printf("blend.alpha_func = %s\n", util_dump_blend_func (key->blend.rt[0].alpha_func, TRUE)); - debug_printf("blend.alpha_src_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE)); - debug_printf("blend.alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE)); + debug_printf("blend.rgb_func = %s\n", util_str_blend_func (key->blend.rt[0].rgb_func, TRUE)); + debug_printf("blend.rgb_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE)); + debug_printf("blend.rgb_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE)); + debug_printf("blend.alpha_func = %s\n", util_str_blend_func (key->blend.rt[0].alpha_func, TRUE)); + debug_printf("blend.alpha_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE)); + debug_printf("blend.alpha_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE)); } debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask); if (key->blend.alpha_to_coverage) { debug_printf("blend.alpha_to_coverage is enabled\n"); } for (i = 0; i < key->nr_samplers; ++i) { - const struct lp_static_sampler_state *sampler = &key->state[i].sampler_state; + const struct lp_static_sampler_state *sampler = &key->samplers[i].sampler_state; debug_printf("sampler[%u] = \n", i); debug_printf(" .wrap = %s %s %s\n", - util_dump_tex_wrap(sampler->wrap_s, TRUE), - util_dump_tex_wrap(sampler->wrap_t, TRUE), - util_dump_tex_wrap(sampler->wrap_r, TRUE)); + util_str_tex_wrap(sampler->wrap_s, TRUE), + util_str_tex_wrap(sampler->wrap_t, TRUE), + util_str_tex_wrap(sampler->wrap_r, TRUE)); debug_printf(" .min_img_filter = %s\n", - util_dump_tex_filter(sampler->min_img_filter, TRUE)); + util_str_tex_filter(sampler->min_img_filter, TRUE)); debug_printf(" .min_mip_filter = %s\n", - util_dump_tex_mipfilter(sampler->min_mip_filter, TRUE)); + util_str_tex_mipfilter(sampler->min_mip_filter, TRUE)); debug_printf(" .mag_img_filter = %s\n", - util_dump_tex_filter(sampler->mag_img_filter, TRUE)); + util_str_tex_filter(sampler->mag_img_filter, TRUE)); if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) - debug_printf(" .compare_func = %s\n", util_dump_func(sampler->compare_func, TRUE)); + debug_printf(" .compare_func = %s\n", util_str_func(sampler->compare_func, TRUE)); debug_printf(" .normalized_coords = %u\n", sampler->normalized_coords); debug_printf(" .min_max_lod_equal = %u\n", sampler->min_max_lod_equal); debug_printf(" .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero); @@ -2469,12 +2805,12 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key) debug_printf(" .apply_max_lod = %u\n", sampler->apply_max_lod); } for (i = 0; i < key->nr_sampler_views; ++i) { - const struct lp_static_texture_state *texture = &key->state[i].texture_state; + const struct lp_static_texture_state *texture = &key->samplers[i].texture_state; debug_printf("texture[%u] = \n", i); debug_printf(" .format = %s\n", util_format_name(texture->format)); debug_printf(" .target = %s\n", - util_dump_tex_target(texture->target, TRUE)); + util_str_tex_target(texture->target, TRUE)); debug_printf(" .level_zero_only = %u\n", texture->level_zero_only); debug_printf(" .pot = %u %u %u\n", @@ -2482,11 +2818,26 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key) texture->pot_height, texture->pot_depth); } + struct lp_image_static_state *images = lp_fs_variant_key_images(key); + for (i = 0; i < key->nr_images; ++i) { + const struct lp_static_texture_state *image = &images[i].image_state; + debug_printf("image[%u] = \n", i); + debug_printf(" .format = %s\n", + util_format_name(image->format)); + debug_printf(" .target = %s\n", + util_str_tex_target(image->target, TRUE)); + debug_printf(" .level_zero_only = %u\n", + image->level_zero_only); + debug_printf(" .pot = %u %u %u\n", + image->pot_width, + image->pot_height, + image->pot_depth); + } } void -lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant) +lp_debug_fs_variant(struct lp_fragment_shader_variant *variant) { debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n", variant->shader->no, variant->no); @@ -2507,14 +2858,19 @@ generate_variant(struct llvmpipe_context *lp, const struct lp_fragment_shader_variant_key *key) { struct lp_fragment_shader_variant *variant; - const struct util_format_description *cbuf0_format_desc; + const struct util_format_description *cbuf0_format_desc = NULL; boolean fullcolormask; + char module_name[64]; - variant = CALLOC_STRUCT(lp_fragment_shader_variant); - if(!variant) + variant = MALLOC(sizeof *variant + shader->variant_key_size - sizeof variant->key); + if (!variant) return NULL; - variant->gallivm = gallivm_create(); + memset(variant, 0, sizeof(*variant)); + snprintf(module_name, sizeof(module_name), "fs%u_variant%u", + shader->no, shader->variants_created); + + variant->gallivm = gallivm_create(module_name, lp->context); if (!variant->gallivm) { FREE(variant); return NULL; @@ -2544,16 +2900,10 @@ generate_variant(struct llvmpipe_context *lp, !key->alpha.enabled && !key->blend.alpha_to_coverage && !key->depth.enabled && - !shader->info.base.uses_kill + !shader->info.base.uses_kill && + !shader->info.base.writes_samplemask ? TRUE : FALSE; - if ((shader->info.base.num_tokens <= 1) && - !key->depth.enabled && !key->stencil[0].enabled) { - variant->ps_inv_multiplier = 0; - } else { - variant->ps_inv_multiplier = 1; - } - if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) { lp_debug_fs_variant(variant); } @@ -2576,6 +2926,8 @@ generate_variant(struct llvmpipe_context *lp, gallivm_compile_module(variant->gallivm); + variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module); + if (variant->function[RAST_EDGE_TEST]) { variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func) gallivm_jit_function(variant->gallivm, @@ -2590,6 +2942,8 @@ generate_variant(struct llvmpipe_context *lp, variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST]; } + gallivm_free_ir(variant->gallivm); + return variant; } @@ -2602,6 +2956,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe, struct lp_fragment_shader *shader; int nr_samplers; int nr_sampler_views; + int nr_images; int i; shader = CALLOC_STRUCT(lp_fragment_shader); @@ -2626,9 +2981,8 @@ llvmpipe_create_fs_state(struct pipe_context *pipe, nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1; nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; - - shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key, - state[MAX2(nr_samplers, nr_sampler_views)]); + nr_images = shader->info.base.file_max[TGSI_FILE_IMAGE] + 1; + shader->variant_key_size = lp_fs_variant_key_size(MAX2(nr_samplers, nr_sampler_views), nr_images); for (i = 0; i < shader->info.base.num_inputs; i++) { shader->inputs[i].usage_mask = shader->info.base.input_usage_mask[i]; @@ -2636,34 +2990,35 @@ llvmpipe_create_fs_state(struct pipe_context *pipe, switch (shader->info.base.input_interpolate[i]) { case TGSI_INTERPOLATE_CONSTANT: - shader->inputs[i].interp = LP_INTERP_CONSTANT; - break; + shader->inputs[i].interp = LP_INTERP_CONSTANT; + break; case TGSI_INTERPOLATE_LINEAR: - shader->inputs[i].interp = LP_INTERP_LINEAR; - break; + shader->inputs[i].interp = LP_INTERP_LINEAR; + break; case TGSI_INTERPOLATE_PERSPECTIVE: - shader->inputs[i].interp = LP_INTERP_PERSPECTIVE; - break; + shader->inputs[i].interp = LP_INTERP_PERSPECTIVE; + break; case TGSI_INTERPOLATE_COLOR: - shader->inputs[i].interp = LP_INTERP_COLOR; - break; + shader->inputs[i].interp = LP_INTERP_COLOR; + break; default: - assert(0); - break; + assert(0); + break; } switch (shader->info.base.input_semantic_name[i]) { case TGSI_SEMANTIC_FACE: - shader->inputs[i].interp = LP_INTERP_FACING; - break; + shader->inputs[i].interp = LP_INTERP_FACING; + break; case TGSI_SEMANTIC_POSITION: - /* Position was already emitted above - */ - shader->inputs[i].interp = LP_INTERP_POSITION; - shader->inputs[i].src_index = 0; - continue; + /* Position was already emitted above + */ + shader->inputs[i].interp = LP_INTERP_POSITION; + shader->inputs[i].src_index = 0; + continue; } + /* XXX this is a completely pointless index map... */ shader->inputs[i].src_index = i+1; } @@ -2710,29 +3065,17 @@ llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs) * Remove shader variant from two lists: the shader's variant list * and the context's variant list. */ -void +static void llvmpipe_remove_shader_variant(struct llvmpipe_context *lp, struct lp_fragment_shader_variant *variant) { - unsigned i; - - if (gallivm_debug & GALLIVM_DEBUG_IR) { - debug_printf("llvmpipe: del fs #%u var #%u v created #%u v cached" - " #%u v total cached #%u\n", - variant->shader->no, - variant->no, + if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) { + debug_printf("llvmpipe: del fs #%u var %u v created %u v cached %u " + "v total cached %u inst %u total inst %u\n", + variant->shader->no, variant->no, variant->shader->variants_created, variant->shader->variants_cached, - lp->nr_fs_variants); - } - - /* free all the variant's JIT'd functions */ - for (i = 0; i < Elements(variant->function); i++) { - if (variant->function[i]) { - gallivm_free_function(variant->gallivm, - variant->function[i], - variant->jit_function[i]); - } + lp->nr_fs_variants, variant->nr_instrs, lp->nr_fs_instrs); } gallivm_destroy(variant->gallivm); @@ -2786,18 +3129,25 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs) static void llvmpipe_set_constant_buffer(struct pipe_context *pipe, - uint shader, uint index, - struct pipe_constant_buffer *cb) + enum pipe_shader_type shader, uint index, + const struct pipe_constant_buffer *cb) { struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); struct pipe_resource *constants = cb ? cb->buffer : NULL; assert(shader < PIPE_SHADER_TYPES); - assert(index < Elements(llvmpipe->constants[shader])); + assert(index < ARRAY_SIZE(llvmpipe->constants[shader])); /* note: reference counting */ util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb); + if (constants) { + if (!(constants->bind & PIPE_BIND_CONSTANT_BUFFER)) { + debug_printf("Illegal set constant without bind flag\n"); + constants->bind |= PIPE_BIND_CONSTANT_BUFFER; + } + } + if (shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_GEOMETRY) { /* Pass the constants to the 'draw' module */ @@ -2820,19 +3170,79 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe, draw_set_mapped_constant_buffer(llvmpipe->draw, shader, index, data, size); } - - llvmpipe->dirty |= LP_NEW_CONSTANTS; + else if (shader == PIPE_SHADER_COMPUTE) + llvmpipe->cs_dirty |= LP_CSNEW_CONSTANTS; + else + llvmpipe->dirty |= LP_NEW_FS_CONSTANTS; if (cb && cb->user_buffer) { pipe_resource_reference(&constants, NULL); } } +static void +llvmpipe_set_shader_buffers(struct pipe_context *pipe, + enum pipe_shader_type shader, unsigned start_slot, + unsigned count, const struct pipe_shader_buffer *buffers, + unsigned writable_bitmask) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + unsigned i, idx; + for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) { + const struct pipe_shader_buffer *buffer = buffers ? &buffers[idx] : NULL; + + util_copy_shader_buffer(&llvmpipe->ssbos[shader][i], buffer); + + if (shader == PIPE_SHADER_VERTEX || + shader == PIPE_SHADER_GEOMETRY) { + const unsigned size = buffer ? buffer->buffer_size : 0; + const ubyte *data = NULL; + if (buffer && buffer->buffer) + data = (ubyte *) llvmpipe_resource_data(buffer->buffer); + if (data) + data += buffer->buffer_offset; + draw_set_mapped_shader_buffer(llvmpipe->draw, shader, + i, data, size); + } else if (shader == PIPE_SHADER_COMPUTE) { + llvmpipe->cs_dirty |= LP_CSNEW_SSBOS; + } else if (shader == PIPE_SHADER_FRAGMENT) { + llvmpipe->dirty |= LP_NEW_FS_SSBOS; + } + } +} + +static void +llvmpipe_set_shader_images(struct pipe_context *pipe, + enum pipe_shader_type shader, unsigned start_slot, + unsigned count, const struct pipe_image_view *images) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + unsigned i, idx; + + draw_flush(llvmpipe->draw); + for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) { + const struct pipe_image_view *image = images ? &images[idx] : NULL; + + util_copy_image_view(&llvmpipe->images[shader][i], image); + } + + llvmpipe->num_images[shader] = start_slot + count; + if (shader == PIPE_SHADER_VERTEX || + shader == PIPE_SHADER_GEOMETRY) { + draw_set_images(llvmpipe->draw, + shader, + llvmpipe->images[shader], + start_slot + count); + } else if (shader == PIPE_SHADER_COMPUTE) + llvmpipe->cs_dirty |= LP_CSNEW_IMAGES; + else + llvmpipe->dirty |= LP_NEW_FS_IMAGES; +} /** * Return the blend factor equivalent to a destination alpha of one. */ -static INLINE unsigned +static inline unsigned force_dst_alpha_one(unsigned factor, boolean clamped_zero) { switch(factor) { @@ -2858,14 +3268,17 @@ force_dst_alpha_one(unsigned factor, boolean clamped_zero) * TODO: there is actually no reason to tie this to context state -- the * generated code could be cached globally in the screen. */ -static void +static struct lp_fragment_shader_variant_key * make_variant_key(struct llvmpipe_context *lp, struct lp_fragment_shader *shader, - struct lp_fragment_shader_variant_key *key) + char *store) { unsigned i; + struct lp_fragment_shader_variant_key *key; + + key = (struct lp_fragment_shader_variant_key *)store; - memset(key, 0, shader->variant_key_size); + memset(key, 0, offsetof(struct lp_fragment_shader_variant_key, samplers[1])); if (lp->framebuffer.zsbuf) { enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format; @@ -2892,15 +3305,23 @@ make_variant_key(struct llvmpipe_context *lp, * depth_clip == 0 implies depth clamping is enabled. * * When clip_halfz is enabled, then always clamp the depth values. + * + * XXX: This is incorrect for GL, but correct for d3d10 (depth + * clamp is always active in d3d10, regardless if depth clip is + * enabled or not). + * (GL has an always-on [0,1] clamp on fs depth output instead + * to ensure the depth values stay in range. Doesn't look like + * we do that, though...) */ if (lp->rasterizer->clip_halfz) { key->depth_clamp = 1; } else { - key->depth_clamp = (lp->rasterizer->depth_clip == 0) ? 1 : 0; + key->depth_clamp = (lp->rasterizer->depth_clip_near == 0) ? 1 : 0; } /* alpha test only applies if render buffer 0 is non-integer (or does not exist) */ if (!lp->framebuffer.nr_cbufs || + !lp->framebuffer.cbufs[0] || !util_format_is_pure_integer(lp->framebuffer.cbufs[0]->format)) { key->alpha.enabled = lp->depth_stencil->alpha.enabled; } @@ -2928,64 +3349,74 @@ make_variant_key(struct llvmpipe_context *lp, } for (i = 0; i < lp->framebuffer.nr_cbufs; i++) { - enum pipe_format format = lp->framebuffer.cbufs[i]->format; struct pipe_rt_blend_state *blend_rt = &key->blend.rt[i]; - const struct util_format_description *format_desc; - key->cbuf_format[i] = format; + if (lp->framebuffer.cbufs[i]) { + enum pipe_format format = lp->framebuffer.cbufs[i]->format; + const struct util_format_description *format_desc; - /* - * Figure out if this is a 1d resource. Note that OpenGL allows crazy - * mixing of 2d textures with height 1 and 1d textures, so make sure - * we pick 1d if any cbuf or zsbuf is 1d. - */ - if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[0]->texture)) { - key->resource_1d = TRUE; - } + key->cbuf_format[i] = format; - format_desc = util_format_description(format); - assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || - format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB); + /* + * Figure out if this is a 1d resource. Note that OpenGL allows crazy + * mixing of 2d textures with height 1 and 1d textures, so make sure + * we pick 1d if any cbuf or zsbuf is 1d. + */ + if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[i]->texture)) { + key->resource_1d = TRUE; + } - /* - * Mask out color channels not present in the color buffer. - */ - blend_rt->colormask &= util_format_colormask(format_desc); + format_desc = util_format_description(format); + assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB); - /* - * Disable blend for integer formats. - */ - if (util_format_is_pure_integer(format)) { - blend_rt->blend_enable = 0; - } + /* + * Mask out color channels not present in the color buffer. + */ + blend_rt->colormask &= util_format_colormask(format_desc); - /* - * Our swizzled render tiles always have an alpha channel, but the linear - * render target format often does not, so force here the dst alpha to be - * one. - * - * This is not a mere optimization. Wrong results will be produced if the - * dst alpha is used, the dst format does not have alpha, and the previous - * rendering was not flushed from the swizzled to linear buffer. For - * example, NonPowTwo DCT. - * - * TODO: This should be generalized to all channels for better - * performance, but only alpha causes correctness issues. - * - * Also, force rgb/alpha func/factors match, to make AoS blending easier. - */ - if (format_desc->swizzle[3] > UTIL_FORMAT_SWIZZLE_W || - format_desc->swizzle[3] == format_desc->swizzle[0]) { - /* Doesn't cover mixed snorm/unorm but can't render to them anyway */ - boolean clamped_zero = !util_format_is_float(format) && - !util_format_is_snorm(format); - blend_rt->rgb_src_factor = force_dst_alpha_one(blend_rt->rgb_src_factor, - clamped_zero); - blend_rt->rgb_dst_factor = force_dst_alpha_one(blend_rt->rgb_dst_factor, - clamped_zero); - blend_rt->alpha_func = blend_rt->rgb_func; - blend_rt->alpha_src_factor = blend_rt->rgb_src_factor; - blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor; + /* + * Disable blend for integer formats. + */ + if (util_format_is_pure_integer(format)) { + blend_rt->blend_enable = 0; + } + + /* + * Our swizzled render tiles always have an alpha channel, but the + * linear render target format often does not, so force here the dst + * alpha to be one. + * + * This is not a mere optimization. Wrong results will be produced if + * the dst alpha is used, the dst format does not have alpha, and the + * previous rendering was not flushed from the swizzled to linear + * buffer. For example, NonPowTwo DCT. + * + * TODO: This should be generalized to all channels for better + * performance, but only alpha causes correctness issues. + * + * Also, force rgb/alpha func/factors match, to make AoS blending + * easier. + */ + if (format_desc->swizzle[3] > PIPE_SWIZZLE_W || + format_desc->swizzle[3] == format_desc->swizzle[0]) { + /* Doesn't cover mixed snorm/unorm but can't render to them anyway */ + boolean clamped_zero = !util_format_is_float(format) && + !util_format_is_snorm(format); + blend_rt->rgb_src_factor = + force_dst_alpha_one(blend_rt->rgb_src_factor, clamped_zero); + blend_rt->rgb_dst_factor = + force_dst_alpha_one(blend_rt->rgb_dst_factor, clamped_zero); + blend_rt->alpha_func = blend_rt->rgb_func; + blend_rt->alpha_src_factor = blend_rt->rgb_src_factor; + blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor; + } + } + else { + /* no color buffer for this fragment output */ + key->cbuf_format[i] = PIPE_FORMAT_NONE; + blend_rt->colormask = 0x0; + blend_rt->blend_enable = 0; } } @@ -2993,9 +3424,15 @@ make_variant_key(struct llvmpipe_context *lp, */ key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1; + struct lp_sampler_static_state *fs_sampler; + + fs_sampler = key->samplers; + + memset(fs_sampler, 0, MAX2(key->nr_samplers, key->nr_sampler_views) * sizeof *fs_sampler); + for(i = 0; i < key->nr_samplers; ++i) { if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { - lp_sampler_static_sampler_state(&key->state[i].sampler_state, + lp_sampler_static_sampler_state(&fs_sampler[i].sampler_state, lp->samplers[PIPE_SHADER_FRAGMENT][i]); } } @@ -3008,8 +3445,13 @@ make_variant_key(struct llvmpipe_context *lp, if (shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) { key->nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; for(i = 0; i < key->nr_sampler_views; ++i) { - if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) { - lp_sampler_static_texture_state(&key->state[i].texture_state, + /* + * Note sview may exceed what's representable by file_mask. + * This will still work, the only downside is that not actually + * used views may be included in the shader key. + */ + if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1u << (i & 31))) { + lp_sampler_static_texture_state(&fs_sampler[i].texture_state, lp->sampler_views[PIPE_SHADER_FRAGMENT][i]); } } @@ -3018,11 +3460,22 @@ make_variant_key(struct llvmpipe_context *lp, key->nr_sampler_views = key->nr_samplers; for(i = 0; i < key->nr_sampler_views; ++i) { if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { - lp_sampler_static_texture_state(&key->state[i].texture_state, + lp_sampler_static_texture_state(&fs_sampler[i].texture_state, lp->sampler_views[PIPE_SHADER_FRAGMENT][i]); } } } + + struct lp_image_static_state *lp_image; + lp_image = lp_fs_variant_key_images(key); + key->nr_images = shader->info.base.file_max[TGSI_FILE_IMAGE] + 1; + for (i = 0; i < key->nr_images; ++i) { + if (shader->info.base.file_mask[TGSI_FILE_IMAGE] & (1 << i)) { + lp_sampler_static_texture_state_image(&lp_image[i].image_state, + &lp->images[PIPE_SHADER_FRAGMENT][i]); + } + } + return key; } @@ -3035,16 +3488,17 @@ void llvmpipe_update_fs(struct llvmpipe_context *lp) { struct lp_fragment_shader *shader = lp->fs; - struct lp_fragment_shader_variant_key key; + struct lp_fragment_shader_variant_key *key; struct lp_fragment_shader_variant *variant = NULL; struct lp_fs_variant_list_item *li; + char store[LP_FS_MAX_VARIANT_KEY_SIZE]; - make_variant_key(lp, shader, &key); + key = make_variant_key(lp, shader, store); /* Search the variants for one which matches the key */ li = first_elem(&shader->variants); while(!at_end(&shader->variants, li)) { - if(memcmp(&li->base->key, &key, shader->variant_key_size) == 0) { + if(memcmp(&li->base->key, key, shader->variant_key_size) == 0) { variant = li->base; break; } @@ -3063,7 +3517,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp) unsigned i; unsigned variants_to_cull; - if (0) { + if (LP_DEBUG & DEBUG_FS) { debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n", lp->nr_fs_variants, lp->nr_fs_instrs, @@ -3071,14 +3525,22 @@ llvmpipe_update_fs(struct llvmpipe_context *lp) } /* First, check if we've exceeded the max number of shader variants. - * If so, free 25% of them (the least recently used ones). + * If so, free 6.25% of them (the least recently used ones). */ - variants_to_cull = lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS ? LP_MAX_SHADER_VARIANTS / 4 : 0; + variants_to_cull = lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS ? LP_MAX_SHADER_VARIANTS / 16 : 0; if (variants_to_cull || lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) { struct pipe_context *pipe = &lp->pipe; + if (gallivm_debug & GALLIVM_DEBUG_PERF) { + debug_printf("Evicting FS: %u fs variants,\t%u total variants," + "\t%u instrs,\t%u instrs/variant\n", + shader->variants_cached, + lp->nr_fs_variants, lp->nr_fs_instrs, + lp->nr_fs_instrs / lp->nr_fs_variants); + } + /* * XXX: we need to flush the context until we have some sort of * reference counting in fragment shaders as they may still be binned @@ -3108,14 +3570,12 @@ llvmpipe_update_fs(struct llvmpipe_context *lp) * Generate the new variant. */ t0 = os_time_get(); - variant = generate_variant(lp, shader, &key); + variant = generate_variant(lp, shader, key); t1 = os_time_get(); dt = t1 - t0; LP_COUNT_ADD(llvm_compile_time, dt); LP_COUNT_ADD(nr_llvm_compiles, 2); /* emit vs. omit in/out test */ - llvmpipe_variant_count++; - /* Put the new variant into the list */ if (variant) { insert_at_head(&shader->variants, &variant->list_item_local); @@ -3142,19 +3602,9 @@ llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe) llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state; llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer; + + llvmpipe->pipe.set_shader_buffers = llvmpipe_set_shader_buffers; + llvmpipe->pipe.set_shader_images = llvmpipe_set_shader_images; } -/* - * Rasterization is disabled if there is no pixel shader and - * both depth and stencil testing are disabled: - * http://msdn.microsoft.com/en-us/library/windows/desktop/bb205125 - */ -boolean -llvmpipe_rasterization_disabled(struct llvmpipe_context *lp) -{ - boolean null_fs = !lp->fs || lp->fs->info.base.num_tokens <= 1; - return (null_fs && - !lp->depth_stencil->depth.enabled && - !lp->depth_stencil->stencil[0].enabled); -}