X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fauxiliary%2Fgallivm%2Flp_bld_sample_soa.c;h=cb4660e424d709fd6c8ecead01c73c48b79e363b;hb=9f5dbbe030a069b9c362a48fdcd23d4d6221894f;hp=26bfa0d26776b4e993b1a095ec611343f902a068;hpb=205a3ce5c158349dda7082f44e95301526b9f183;p=mesa.git diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 26bfa0d2677..cb4660e424d 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -42,7 +42,7 @@ #include "util/u_math.h" #include "util/u_format.h" #include "util/u_cpu_detect.h" -#include "util/u_format_rgb9e5.h" +#include "util/format_rgb9e5.h" #include "lp_bld_debug.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -60,6 +60,7 @@ #include "lp_bld_struct.h" #include "lp_bld_quad.h" #include "lp_bld_pack.h" +#include "lp_bld_intr.h" /** @@ -158,9 +159,10 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, lp_build_fetch_rgba_soa(bld->gallivm, bld->format_desc, - bld->texel_type, + bld->texel_type, TRUE, data_ptr, offset, i, j, + bld->cache, texel_out); /* @@ -227,11 +229,16 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld, LLVMValueRef fract, flr, isOdd; lp_build_ifloor_fract(coord_bld, coord, &flr, &fract); + /* kill off NaNs */ + /* XXX: not safe without arch rounding, fract can be anything. */ + fract = lp_build_max_ext(coord_bld, fract, coord_bld->zero, + GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); /* isOdd = flr & 1 */ isOdd = LLVMBuildAnd(bld->gallivm->builder, flr, int_coord_bld->one, ""); /* make coord positive or negative depending on isOdd */ + /* XXX slight overkill masking out sign bit is unnecessary */ coord = lp_build_set_sign(coord_bld, fract, isOdd); /* convert isOdd to float */ @@ -271,10 +278,15 @@ lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld, * we avoided the 0.5/length division before the repeat wrap, * now need to fix up edge cases with selects */ + /* + * Note we do a float (unordered) compare so we can eliminate NaNs. + * (Otherwise would need fract_safe above). + */ + mask = lp_build_compare(coord_bld->gallivm, coord_bld->type, + PIPE_FUNC_LESS, coord_f, coord_bld->zero); + /* convert to int, compute lerp weight */ lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f); - mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type, - PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero); *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i); } @@ -374,7 +386,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, } /* clamp to length max */ - coord = lp_build_min(coord_bld, coord, length_f); + coord = lp_build_min_ext(coord_bld, coord, length_f, + GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); /* subtract 0.5 */ coord = lp_build_sub(coord_bld, coord, half); /* clamp to [0, length - 0.5] */ @@ -397,7 +410,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_add(coord_bld, coord, offset); } /* was: clamp to [-0.5, length + 0.5], then sub 0.5 */ - /* can skip clamp (though might not work for very large coord values */ + /* can skip clamp (though might not work for very large coord values) */ coord = lp_build_sub(coord_bld, coord, half); /* convert to int, compute lerp weight */ lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); @@ -464,7 +477,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_abs(coord_bld, coord); /* clamp to length max */ - coord = lp_build_min(coord_bld, coord, length_f); + coord = lp_build_min_ext(coord_bld, coord, length_f, + GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); /* subtract 0.5 */ coord = lp_build_sub(coord_bld, coord, half); /* clamp to [0, length - 0.5] */ @@ -627,9 +641,15 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, /* itrunc == ifloor here */ icoord = lp_build_itrunc(coord_bld, coord); - - /* clamp to [0, length - 1] */ - icoord = lp_build_min(int_coord_bld, icoord, length_minus_one); + /* + * Use unsigned min due to possible undef values (NaNs, overflow) + */ + { + struct lp_build_context abs_coord_bld = *int_coord_bld; + abs_coord_bld.type.sign = FALSE; + /* clamp to [0, length - 1] */ + icoord = lp_build_min(&abs_coord_bld, icoord, length_minus_one); + } break; case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: @@ -1359,7 +1379,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, if (is_gather) { /* more hacks for swizzling, should be X, ONE or ZERO... */ unsigned chan_swiz = bld->static_texture_state->swizzle_r; - if (chan_swiz <= PIPE_SWIZZLE_ALPHA) { + if (chan_swiz <= PIPE_SWIZZLE_W) { colors0[0] = lp_build_select(texel_bld, cmpval10, texel_bld->one, texel_bld->zero); colors0[1] = lp_build_select(texel_bld, cmpval11, @@ -1369,7 +1389,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, colors0[3] = lp_build_select(texel_bld, cmpval00, texel_bld->one, texel_bld->zero); } - else if (chan_swiz == PIPE_SWIZZLE_ZERO) { + else if (chan_swiz == PIPE_SWIZZLE_0) { colors0[0] = colors0[1] = colors0[2] = colors0[3] = texel_bld->zero; } @@ -1837,7 +1857,7 @@ lp_build_sample_common(struct lp_build_sample_context *bld, const struct util_format_description *format_desc = bld->format_desc; unsigned chan_type; /* not entirely sure we couldn't end up with non-valid swizzle here */ - chan_type = format_desc->swizzle[0] <= UTIL_FORMAT_SWIZZLE_W ? + chan_type = format_desc->swizzle[0] <= PIPE_SWIZZLE_W ? format_desc->channel[format_desc->swizzle[0]].type : UTIL_FORMAT_TYPE_FLOAT; if (chan_type != UTIL_FORMAT_TYPE_FLOAT) { @@ -1938,7 +1958,7 @@ lp_build_clamp_border_color(struct lp_build_sample_context *bld, LLVMPointerType(vec4_bld.vec_type, 0), ""); border_color = LLVMBuildLoad(builder, border_color_ptr, ""); /* we don't have aligned type in the dynamic state unfortunately */ - lp_set_load_alignment(border_color, 4); + LLVMSetAlignment(border_color, 4); /* * Instead of having some incredibly complex logic which will try to figure out @@ -1956,7 +1976,7 @@ lp_build_clamp_border_color(struct lp_build_sample_context *bld, else { chan = util_format_get_first_non_void_channel(format_desc->format); } - if (chan >= 0 && chan <= UTIL_FORMAT_SWIZZLE_W) { + if (chan >= 0 && chan <= PIPE_SWIZZLE_W) { unsigned chan_type = format_desc->channel[chan].type; unsigned chan_norm = format_desc->channel[chan].normalized; unsigned chan_pure = format_desc->channel[chan].pure_integer; @@ -2255,8 +2275,8 @@ lp_build_sample_general(struct lp_build_sample_context *bld, * All pixels require just nearest filtering, which is way * cheaper than linear, hence do a separate path for that. */ - lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST, FALSE, - mip_filter_for_nearest, + lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST, + mip_filter_for_nearest, FALSE, coords, offsets, ilevel0, ilevel1, lod_fpart, texels); @@ -2386,9 +2406,10 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld, lp_build_fetch_rgba_soa(bld->gallivm, bld->format_desc, - bld->texel_type, + bld->texel_type, TRUE, bld->base_ptr, offset, i, j, + bld->cache, colors_out); if (out_of_bound_ret_zero) { @@ -2442,6 +2463,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm, unsigned texture_index, unsigned sampler_index, LLVMValueRef context_ptr, + LLVMValueRef thread_data_ptr, const LLVMValueRef *coords, const LLVMValueRef *offsets, const struct lp_derivatives *derivs, /* optional */ @@ -2707,6 +2729,11 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm, context_ptr, texture_index); /* Note that mip_offsets is an array[level] of offsets to texture images */ + if (dynamic_state->cache_ptr && thread_data_ptr) { + bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm, + thread_data_ptr, texture_index); + } + /* width, height, depth as single int vector */ if (dims <= 1) { bld.int_size = tex_width; @@ -2834,12 +2861,13 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm, } /* - * we only try 8-wide sampling with soa as it appears to - * be a loss with aos with AVX (but it should work, except - * for conformance if min_filter != mag_filter if num_lods > 1). - * (It should be faster if we'd support avx2) + * we only try 8-wide sampling with soa or if we have AVX2 + * as it appears to be a loss with just AVX) */ - if (num_quads == 1 || !use_aos) { + if (num_quads == 1 || !use_aos || + (util_cpu_caps.has_avx2 && + (bld.num_lods == 1 || + derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) { if (use_aos) { /* do sampling/filtering with fixed pt arithmetic */ lp_build_sample_aos(&bld, sampler_index, @@ -2883,6 +2911,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm, bld4.base_ptr = bld.base_ptr; bld4.mip_offsets = bld.mip_offsets; bld4.int_size = bld.int_size; + bld4.cache = bld.cache; bld4.vector_width = lp_type_width(type4); @@ -3081,12 +3110,14 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm, LLVMValueRef offsets[3] = { NULL }; LLVMValueRef lod = NULL; LLVMValueRef context_ptr; + LLVMValueRef thread_data_ptr = NULL; LLVMValueRef texel_out[4]; struct lp_derivatives derivs; struct lp_derivatives *deriv_ptr = NULL; unsigned num_param = 0; unsigned i, num_coords, num_derivs, num_offsets, layer; enum lp_sampler_lod_control lod_control; + boolean need_cache = FALSE; lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >> LP_SAMPLER_LOD_CONTROL_SHIFT; @@ -3094,8 +3125,19 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm, get_target_info(static_texture_state->target, &num_coords, &num_derivs, &num_offsets, &layer); + if (dynamic_state->cache_ptr) { + const struct util_format_description *format_desc; + format_desc = util_format_description(static_texture_state->format); + if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { + need_cache = TRUE; + } + } + /* "unpack" arguments */ context_ptr = LLVMGetParam(function, num_param++); + if (need_cache) { + thread_data_ptr = LLVMGetParam(function, num_param++); + } for (i = 0; i < num_coords; i++) { coords[i] = LLVMGetParam(function, num_param++); } @@ -3146,6 +3188,7 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm, texture_index, sampler_index, context_ptr, + thread_data_ptr, coords, offsets, deriv_ptr, @@ -3189,6 +3232,7 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, const LLVMValueRef *offsets = params->offsets; const struct lp_derivatives *derivs = params->derivs; enum lp_sampler_lod_control lod_control; + boolean need_cache = FALSE; lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >> LP_SAMPLER_LOD_CONTROL_SHIFT; @@ -3196,6 +3240,17 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, get_target_info(static_texture_state->target, &num_coords, &num_derivs, &num_offsets, &layer); + if (dynamic_state->cache_ptr) { + const struct util_format_description *format_desc; + format_desc = util_format_description(static_texture_state->format); + if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { + /* + * This is not 100% correct, if we have cache but the + * util_format_s3tc_prefer is true the cache won't get used + * regardless (could hook up the block decode there...) */ + need_cache = TRUE; + } + } /* * texture function matches are found by name. * Thus the name has to include both the texture and sampler unit @@ -3221,6 +3276,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, */ arg_types[num_param++] = LLVMTypeOf(params->context_ptr); + if (need_cache) { + arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr); + } for (i = 0; i < num_coords; i++) { arg_types[num_param++] = LLVMTypeOf(coords[0]); assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i])); @@ -3259,12 +3317,13 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, for (i = 0; i < num_param; ++i) { if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) { - LLVMAddAttribute(LLVMGetParam(function, i), LLVMNoAliasAttribute); + + lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS); } } LLVMSetFunctionCallConv(function, LLVMFastCallConv); - LLVMSetLinkage(function, LLVMPrivateLinkage); + LLVMSetLinkage(function, LLVMInternalLinkage); lp_build_sample_gen_func(gallivm, static_texture_state, @@ -3280,6 +3339,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, num_args = 0; args[num_args++] = params->context_ptr; + if (need_cache) { + args[num_args++] = params->thread_data_ptr; + } for (i = 0; i < num_coords; i++) { args[num_args++] = coords[i]; } @@ -3384,6 +3446,7 @@ lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state, params->texture_index, params->sampler_index, params->context_ptr, + params->thread_data_ptr, params->coords, params->offsets, params->derivs, @@ -3397,21 +3460,17 @@ void lp_build_size_query_soa(struct gallivm_state *gallivm, const struct lp_static_texture_state *static_state, struct lp_sampler_dynamic_state *dynamic_state, - struct lp_type int_type, - unsigned texture_unit, - unsigned target, - LLVMValueRef context_ptr, - boolean is_sviewinfo, - enum lp_sampler_lod_property lod_property, - LLVMValueRef explicit_lod, - LLVMValueRef *sizes_out) + const struct lp_sampler_size_query_params *params) { - LLVMValueRef lod, level, size; + LLVMValueRef lod, level = 0, size; LLVMValueRef first_level = NULL; int dims, i; boolean has_array; unsigned num_lods = 1; struct lp_build_context bld_int_vec4; + LLVMValueRef context_ptr = params->context_ptr; + unsigned texture_unit = params->texture_unit; + unsigned target = params->target; if (static_state->format == PIPE_FORMAT_NONE) { /* @@ -3419,9 +3478,9 @@ lp_build_size_query_soa(struct gallivm_state *gallivm, * all zero as mandated by d3d10 in this case. */ unsigned chan; - LLVMValueRef zero = lp_build_const_vec(gallivm, int_type, 0.0F); + LLVMValueRef zero = lp_build_const_vec(gallivm, params->int_type, 0.0F); for (chan = 0; chan < 4; chan++) { - sizes_out[chan] = zero; + params->sizes_out[chan] = zero; } return; } @@ -3466,13 +3525,13 @@ lp_build_size_query_soa(struct gallivm_state *gallivm, break; } - assert(!int_type.floating); + assert(!params->int_type.floating); lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128)); - if (explicit_lod) { + if (params->explicit_lod) { /* FIXME: this needs to honor per-element lod */ - lod = LLVMBuildExtractElement(gallivm->builder, explicit_lod, + lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod, lp_build_const_int32(gallivm, 0), ""); first_level = dynamic_state->first_level(dynamic_state, gallivm, context_ptr, texture_unit); @@ -3526,7 +3585,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm, * if level is out of bounds (note this can't cover unbound texture * here, which also requires returning zero). */ - if (explicit_lod && is_sviewinfo) { + if (params->explicit_lod && params->is_sviewinfo) { LLVMValueRef last_level, out, out1; struct lp_build_context leveli_bld; @@ -3548,13 +3607,13 @@ lp_build_size_query_soa(struct gallivm_state *gallivm, size = lp_build_andnot(&bld_int_vec4, size, out); } for (i = 0; i < dims + (has_array ? 1 : 0); i++) { - sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, int_type, + params->sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, params->int_type, size, lp_build_const_int32(gallivm, i)); } - if (is_sviewinfo) { + if (params->is_sviewinfo) { for (; i < 4; i++) { - sizes_out[i] = lp_build_const_vec(gallivm, int_type, 0.0); + params->sizes_out[i] = lp_build_const_vec(gallivm, params->int_type, 0.0); } } @@ -3562,7 +3621,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm, * if there's no explicit_lod (buffers, rects) queries requiring nr of * mips would be illegal. */ - if (is_sviewinfo && explicit_lod) { + if (params->is_sviewinfo && params->explicit_lod) { struct lp_build_context bld_int_scalar; LLVMValueRef num_levels; lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32)); @@ -3578,7 +3637,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm, num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level); num_levels = lp_build_add(&bld_int_scalar, num_levels, bld_int_scalar.one); } - sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, int_type), + params->sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type), num_levels); } }