X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fauxiliary%2Fgallivm%2Flp_bld_sample_soa.c;h=cb4660e424d709fd6c8ecead01c73c48b79e363b;hb=9f5dbbe030a069b9c362a48fdcd23d4d6221894f;hp=26bfa0d26776b4e993b1a095ec611343f902a068;hpb=205a3ce5c158349dda7082f44e95301526b9f183;p=mesa.git

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 26bfa0d2677..cb4660e424d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -42,7 +42,7 @@
 #include "util/u_math.h"
 #include "util/u_format.h"
 #include "util/u_cpu_detect.h"
-#include "util/u_format_rgb9e5.h"
+#include "util/format_rgb9e5.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
@@ -60,6 +60,7 @@
 #include "lp_bld_struct.h"
 #include "lp_bld_quad.h"
 #include "lp_bld_pack.h"
+#include "lp_bld_intr.h"
 
 
 /**
@@ -158,9 +159,10 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
 
    lp_build_fetch_rgba_soa(bld->gallivm,
                            bld->format_desc,
-                           bld->texel_type,
+                           bld->texel_type, TRUE,
                            data_ptr, offset,
                            i, j,
+                           bld->cache,
                            texel_out);
 
    /*
@@ -227,11 +229,16 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld,
    LLVMValueRef fract, flr, isOdd;
 
    lp_build_ifloor_fract(coord_bld, coord, &flr, &fract);
+   /* kill off NaNs */
+   /* XXX: not safe without arch rounding, fract can be anything. */
+   fract = lp_build_max_ext(coord_bld, fract, coord_bld->zero,
+                            GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
 
    /* isOdd = flr & 1 */
    isOdd = LLVMBuildAnd(bld->gallivm->builder, flr, int_coord_bld->one, "");
 
    /* make coord positive or negative depending on isOdd */
+   /* XXX slight overkill masking out sign bit is unnecessary */
    coord = lp_build_set_sign(coord_bld, fract, isOdd);
 
    /* convert isOdd to float */
@@ -271,10 +278,15 @@ lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
     * we avoided the 0.5/length division before the repeat wrap,
     * now need to fix up edge cases with selects
     */
+   /*
+    * Note we do a float (unordered) compare so we can eliminate NaNs.
+    * (Otherwise would need fract_safe above).
+    */
+   mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
+                           PIPE_FUNC_LESS, coord_f, coord_bld->zero);
+
    /* convert to int, compute lerp weight */
    lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
-   mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
-                           PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero);
    *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
 }
 
@@ -374,7 +386,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
          }
 
          /* clamp to length max */
-         coord = lp_build_min(coord_bld, coord, length_f);
+         coord = lp_build_min_ext(coord_bld, coord, length_f,
+                                  GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
          /* subtract 0.5 */
          coord = lp_build_sub(coord_bld, coord, half);
          /* clamp to [0, length - 0.5] */
@@ -397,7 +410,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
          coord = lp_build_add(coord_bld, coord, offset);
       }
       /* was: clamp to [-0.5, length + 0.5], then sub 0.5 */
-      /* can skip clamp (though might not work for very large coord values */
+      /* can skip clamp (though might not work for very large coord values) */
       coord = lp_build_sub(coord_bld, coord, half);
       /* convert to int, compute lerp weight */
       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
@@ -464,7 +477,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
          coord = lp_build_abs(coord_bld, coord);
 
          /* clamp to length max */
-         coord = lp_build_min(coord_bld, coord, length_f);
+         coord = lp_build_min_ext(coord_bld, coord, length_f,
+                                  GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
          /* subtract 0.5 */
          coord = lp_build_sub(coord_bld, coord, half);
          /* clamp to [0, length - 0.5] */
@@ -627,9 +641,15 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
 
       /* itrunc == ifloor here */
       icoord = lp_build_itrunc(coord_bld, coord);
-
-      /* clamp to [0, length - 1] */
-      icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
+      /*
+       * Use unsigned min due to possible undef values (NaNs, overflow)
+       */
+      {
+         struct lp_build_context abs_coord_bld = *int_coord_bld;
+         abs_coord_bld.type.sign = FALSE;
+         /* clamp to [0, length - 1] */
+         icoord = lp_build_min(&abs_coord_bld, icoord, length_minus_one);
+      }
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
@@ -1359,7 +1379,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
          if (is_gather) {
             /* more hacks for swizzling, should be X, ONE or ZERO... */
             unsigned chan_swiz = bld->static_texture_state->swizzle_r;
-            if (chan_swiz <= PIPE_SWIZZLE_ALPHA) {
+            if (chan_swiz <= PIPE_SWIZZLE_W) {
                colors0[0] = lp_build_select(texel_bld, cmpval10,
                                             texel_bld->one, texel_bld->zero);
                colors0[1] = lp_build_select(texel_bld, cmpval11,
@@ -1369,7 +1389,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                colors0[3] = lp_build_select(texel_bld, cmpval00,
                                             texel_bld->one, texel_bld->zero);
             }
-            else if (chan_swiz == PIPE_SWIZZLE_ZERO) {
+            else if (chan_swiz == PIPE_SWIZZLE_0) {
                colors0[0] = colors0[1] = colors0[2] = colors0[3] =
                             texel_bld->zero;
             }
@@ -1837,7 +1857,7 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
       const struct util_format_description *format_desc = bld->format_desc;
       unsigned chan_type;
       /* not entirely sure we couldn't end up with non-valid swizzle here */
-      chan_type = format_desc->swizzle[0] <= UTIL_FORMAT_SWIZZLE_W ?
+      chan_type = format_desc->swizzle[0] <= PIPE_SWIZZLE_W ?
                      format_desc->channel[format_desc->swizzle[0]].type :
                      UTIL_FORMAT_TYPE_FLOAT;
       if (chan_type != UTIL_FORMAT_TYPE_FLOAT) {
@@ -1938,7 +1958,7 @@ lp_build_clamp_border_color(struct lp_build_sample_context *bld,
                                        LLVMPointerType(vec4_bld.vec_type, 0), "");
    border_color = LLVMBuildLoad(builder, border_color_ptr, "");
    /* we don't have aligned type in the dynamic state unfortunately */
-   lp_set_load_alignment(border_color, 4);
+   LLVMSetAlignment(border_color, 4);
 
    /*
     * Instead of having some incredibly complex logic which will try to figure out
@@ -1956,7 +1976,7 @@ lp_build_clamp_border_color(struct lp_build_sample_context *bld,
       else {
          chan = util_format_get_first_non_void_channel(format_desc->format);
       }
-      if (chan >= 0 && chan <= UTIL_FORMAT_SWIZZLE_W) {
+      if (chan >= 0 && chan <= PIPE_SWIZZLE_W) {
          unsigned chan_type = format_desc->channel[chan].type;
          unsigned chan_norm = format_desc->channel[chan].normalized;
          unsigned chan_pure = format_desc->channel[chan].pure_integer;
@@ -2255,8 +2275,8 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
              * All pixels require just nearest filtering, which is way
              * cheaper than linear, hence do a separate path for that.
              */
-            lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST, FALSE,
-                                   mip_filter_for_nearest,
+            lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
+                                   mip_filter_for_nearest, FALSE,
                                    coords, offsets,
                                    ilevel0, ilevel1, lod_fpart,
                                    texels);
@@ -2386,9 +2406,10 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld,
 
    lp_build_fetch_rgba_soa(bld->gallivm,
                            bld->format_desc,
-                           bld->texel_type,
+                           bld->texel_type, TRUE,
                            bld->base_ptr, offset,
                            i, j,
+                           bld->cache,
                            colors_out);
 
    if (out_of_bound_ret_zero) {
@@ -2442,6 +2463,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
                          unsigned texture_index,
                          unsigned sampler_index,
                          LLVMValueRef context_ptr,
+                         LLVMValueRef thread_data_ptr,
                          const LLVMValueRef *coords,
                          const LLVMValueRef *offsets,
                          const struct lp_derivatives *derivs, /* optional */
@@ -2707,6 +2729,11 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
                                                 context_ptr, texture_index);
    /* Note that mip_offsets is an array[level] of offsets to texture images */
 
+   if (dynamic_state->cache_ptr && thread_data_ptr) {
+      bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
+                                           thread_data_ptr, texture_index);
+   }
+
    /* width, height, depth as single int vector */
    if (dims <= 1) {
       bld.int_size = tex_width;
@@ -2834,12 +2861,13 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
       }
 
       /*
-       * we only try 8-wide sampling with soa as it appears to
-       * be a loss with aos with AVX (but it should work, except
-       * for conformance if min_filter != mag_filter if num_lods > 1).
-       * (It should be faster if we'd support avx2)
+       * we only try 8-wide sampling with soa or if we have AVX2
+       * as it appears to be a loss with just AVX)
        */
-      if (num_quads == 1 || !use_aos) {
+      if (num_quads == 1 || !use_aos ||
+          (util_cpu_caps.has_avx2 &&
+           (bld.num_lods == 1 ||
+            derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
          if (use_aos) {
             /* do sampling/filtering with fixed pt arithmetic */
             lp_build_sample_aos(&bld, sampler_index,
@@ -2883,6 +2911,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
          bld4.base_ptr = bld.base_ptr;
          bld4.mip_offsets = bld.mip_offsets;
          bld4.int_size = bld.int_size;
+         bld4.cache = bld.cache;
 
          bld4.vector_width = lp_type_width(type4);
 
@@ -3081,12 +3110,14 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
    LLVMValueRef offsets[3] = { NULL };
    LLVMValueRef lod = NULL;
    LLVMValueRef context_ptr;
+   LLVMValueRef thread_data_ptr = NULL;
    LLVMValueRef texel_out[4];
    struct lp_derivatives derivs;
    struct lp_derivatives *deriv_ptr = NULL;
    unsigned num_param = 0;
    unsigned i, num_coords, num_derivs, num_offsets, layer;
    enum lp_sampler_lod_control lod_control;
+   boolean need_cache = FALSE;
 
    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
                     LP_SAMPLER_LOD_CONTROL_SHIFT;
@@ -3094,8 +3125,19 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
    get_target_info(static_texture_state->target,
                    &num_coords, &num_derivs, &num_offsets, &layer);
 
+   if (dynamic_state->cache_ptr) {
+      const struct util_format_description *format_desc;
+      format_desc = util_format_description(static_texture_state->format);
+      if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+         need_cache = TRUE;
+      }
+   }
+
    /* "unpack" arguments */
    context_ptr = LLVMGetParam(function, num_param++);
+   if (need_cache) {
+      thread_data_ptr = LLVMGetParam(function, num_param++);
+   }
    for (i = 0; i < num_coords; i++) {
       coords[i] = LLVMGetParam(function, num_param++);
    }
@@ -3146,6 +3188,7 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
                             texture_index,
                             sampler_index,
                             context_ptr,
+                            thread_data_ptr,
                             coords,
                             offsets,
                             deriv_ptr,
@@ -3189,6 +3232,7 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
    const LLVMValueRef *offsets = params->offsets;
    const struct lp_derivatives *derivs = params->derivs;
    enum lp_sampler_lod_control lod_control;
+   boolean need_cache = FALSE;
 
    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
                     LP_SAMPLER_LOD_CONTROL_SHIFT;
@@ -3196,6 +3240,17 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
    get_target_info(static_texture_state->target,
                    &num_coords, &num_derivs, &num_offsets, &layer);
 
+   if (dynamic_state->cache_ptr) {
+      const struct util_format_description *format_desc;
+      format_desc = util_format_description(static_texture_state->format);
+      if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+         /*
+          * This is not 100% correct, if we have cache but the
+          * util_format_s3tc_prefer is true the cache won't get used
+          * regardless (could hook up the block decode there...) */
+         need_cache = TRUE;
+      }
+   }
    /*
     * texture function matches are found by name.
     * Thus the name has to include both the texture and sampler unit
@@ -3221,6 +3276,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
        */
 
       arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
+      if (need_cache) {
+         arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
+      }
       for (i = 0; i < num_coords; i++) {
          arg_types[num_param++] = LLVMTypeOf(coords[0]);
          assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
@@ -3259,12 +3317,13 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
 
       for (i = 0; i < num_param; ++i) {
          if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
-            LLVMAddAttribute(LLVMGetParam(function, i), LLVMNoAliasAttribute);
+
+            lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
          }
       }
 
       LLVMSetFunctionCallConv(function, LLVMFastCallConv);
-      LLVMSetLinkage(function, LLVMPrivateLinkage);
+      LLVMSetLinkage(function, LLVMInternalLinkage);
 
       lp_build_sample_gen_func(gallivm,
                                static_texture_state,
@@ -3280,6 +3339,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
 
    num_args = 0;
    args[num_args++] = params->context_ptr;
+   if (need_cache) {
+      args[num_args++] = params->thread_data_ptr;
+   }
    for (i = 0; i < num_coords; i++) {
       args[num_args++] = coords[i];
    }
@@ -3384,6 +3446,7 @@ lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
                                params->texture_index,
                                params->sampler_index,
                                params->context_ptr,
+                               params->thread_data_ptr,
                                params->coords,
                                params->offsets,
                                params->derivs,
@@ -3397,21 +3460,17 @@ void
 lp_build_size_query_soa(struct gallivm_state *gallivm,
                         const struct lp_static_texture_state *static_state,
                         struct lp_sampler_dynamic_state *dynamic_state,
-                        struct lp_type int_type,
-                        unsigned texture_unit,
-                        unsigned target,
-                        LLVMValueRef context_ptr,
-                        boolean is_sviewinfo,
-                        enum lp_sampler_lod_property lod_property,
-                        LLVMValueRef explicit_lod,
-                        LLVMValueRef *sizes_out)
+                        const struct lp_sampler_size_query_params *params)
 {
-   LLVMValueRef lod, level, size;
+   LLVMValueRef lod, level = 0, size;
    LLVMValueRef first_level = NULL;
    int dims, i;
    boolean has_array;
    unsigned num_lods = 1;
    struct lp_build_context bld_int_vec4;
+   LLVMValueRef context_ptr = params->context_ptr;
+   unsigned texture_unit = params->texture_unit;
+   unsigned target = params->target;
 
    if (static_state->format == PIPE_FORMAT_NONE) {
       /*
@@ -3419,9 +3478,9 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
        * all zero as mandated by d3d10 in this case.
        */
       unsigned chan;
-      LLVMValueRef zero = lp_build_const_vec(gallivm, int_type, 0.0F);
+      LLVMValueRef zero = lp_build_const_vec(gallivm, params->int_type, 0.0F);
       for (chan = 0; chan < 4; chan++) {
-         sizes_out[chan] = zero;
+         params->sizes_out[chan] = zero;
       }
       return;
    }
@@ -3466,13 +3525,13 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
       break;
    }
 
-   assert(!int_type.floating);
+   assert(!params->int_type.floating);
 
    lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));
 
-   if (explicit_lod) {
+   if (params->explicit_lod) {
       /* FIXME: this needs to honor per-element lod */
-      lod = LLVMBuildExtractElement(gallivm->builder, explicit_lod,
+      lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod,
                                     lp_build_const_int32(gallivm, 0), "");
       first_level = dynamic_state->first_level(dynamic_state, gallivm,
                                                context_ptr, texture_unit);
@@ -3526,7 +3585,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
     * if level is out of bounds (note this can't cover unbound texture
     * here, which also requires returning zero).
     */
-   if (explicit_lod && is_sviewinfo) {
+   if (params->explicit_lod && params->is_sviewinfo) {
       LLVMValueRef last_level, out, out1;
       struct lp_build_context leveli_bld;
 
@@ -3548,13 +3607,13 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
       size = lp_build_andnot(&bld_int_vec4, size, out);
    }
    for (i = 0; i < dims + (has_array ? 1 : 0); i++) {
-      sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, int_type,
+      params->sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, params->int_type,
                                                 size,
                                                 lp_build_const_int32(gallivm, i));
    }
-   if (is_sviewinfo) {
+   if (params->is_sviewinfo) {
       for (; i < 4; i++) {
-         sizes_out[i] = lp_build_const_vec(gallivm, int_type, 0.0);
+         params->sizes_out[i] = lp_build_const_vec(gallivm, params->int_type, 0.0);
       }
    }
 
@@ -3562,7 +3621,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
     * if there's no explicit_lod (buffers, rects) queries requiring nr of
     * mips would be illegal.
     */
-   if (is_sviewinfo && explicit_lod) {
+   if (params->is_sviewinfo && params->explicit_lod) {
       struct lp_build_context bld_int_scalar;
       LLVMValueRef num_levels;
       lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32));
@@ -3578,7 +3637,7 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
          num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level);
          num_levels = lp_build_add(&bld_int_scalar, num_levels, bld_int_scalar.one);
       }
-      sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, int_type),
+      params->sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
                                         num_levels);
    }
 }