From e442db8e9821d210df0a2f253d7b47bfad23a3bf Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Sun, 13 Nov 2016 16:33:37 +0100
Subject: [PATCH] draw: drop some overflow computations

It turns out that noone actually cares if the address computations overflow,
be it the stride mul or the offset adds.
Wrap around seems to be explicitly permitted even by some other API (which
is a _very_ surprising result, as these overflow computations were added just
for that and made some tests pass at that time - I suspect some later fixes
fixed the actual root cause...). So the requirements in that other api were
actually sane there all along after all...
Still need to make sure the computed buffer size needed is valid, of course.
This ditches the shiny new widening mul from these codepaths, ah well...

And now that I really understand this, change the fishy min limiting
indices to what it really should have done. Which is simply to prevent
fetching more values than valid for the last loop iteration. (This makes
the code path in the loop minimally more complex for the non-indexed case
as we have to skip the optimization combining two adds. I think it should
be safe to skip this actually there, but I don't care much about this
especially since skipping that optimization actually makes the code easier
to read elsewhere.)

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/auxiliary/draw/draw_llvm.c | 111 ++++++++++---------------
 1 file changed, 46 insertions(+), 65 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 414f2dc022d..c5485728e42 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -669,18 +669,17 @@ fetch_instanced(struct gallivm_state *gallivm,
    LLVMValueRef zero = LLVMConstNull(i32_t);
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef stride, buffer_overflowed, aos, index_valid;
-   LLVMValueRef ofbit = NULL;
    unsigned i;
 
    aosf_t = lp_build_vec_type(gallivm, lp_float32_vec4_type());
    aosi_t = lp_build_vec_type(gallivm, lp_int32_vec4_type());
 
-   stride = lp_build_umul_overflow(gallivm, vb_stride, index, &ofbit);
+   /* This mul can overflow. Wraparound is ok. */
+   stride = LLVMBuildMul(builder, vb_stride, index, "");
 
    buffer_overflowed = LLVMBuildICmp(builder, LLVMIntUGE,
                                      stride, buffer_size_adj,
                                      "buffer_overflowed");
-   buffer_overflowed = LLVMBuildOr(builder, buffer_overflowed, ofbit, "");
 
    if (0) {
       lp_build_print_value(gallivm, "   instance index = ", index);
@@ -759,7 +758,7 @@ fetch_vector(struct gallivm_state *gallivm,
    LLVMValueRef zero = LLVMConstNull(LLVMInt32TypeInContext(gallivm->context));
    LLVMBuilderRef builder = gallivm->builder;
    struct lp_build_context blduivec;
-   LLVMValueRef offset, tmp, valid_mask;
+   LLVMValueRef offset, valid_mask;
    LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];
    unsigned i;
 
@@ -768,24 +767,11 @@ fetch_vector(struct gallivm_state *gallivm,
    vb_stride = lp_build_broadcast_scalar(&blduivec, vb_stride);
    buffer_size_adj = lp_build_broadcast_scalar(&blduivec, buffer_size_adj);
 
-   /*
-    * Sort of interestingly, with interleaved attribs, llvm 3.7+ will
-    * recognize these calculations to be constant with different attribs
-    * (the different offset has been added to map_ptr).
-    * llvm 3.3, however, will not (I can't get llvm 3.4-3.6 to link...)
-    *
-    * XXX: could actually avoid this altogether (replacing by simple
-    * non-widening mul) by precalculating the max index instead outside
-    * the loop (at the cost of one scalar udiv per vertex element).
-    */
-   offset = lp_build_mul_32_lohi_cpu(&blduivec, vb_stride, indices, &tmp);
+   /* This mul can overflow. Wraparound is ok. */
+   offset = lp_build_mul(&blduivec, vb_stride, indices);
 
    valid_mask = lp_build_compare(gallivm, blduivec.type,
-                                 PIPE_FUNC_EQUAL, tmp, blduivec.zero);
-
-   tmp = lp_build_compare(gallivm, blduivec.type,
-                          PIPE_FUNC_LESS, offset, buffer_size_adj);
-   valid_mask = LLVMBuildAnd(builder, tmp, valid_mask, "");
+                                 PIPE_FUNC_LESS, offset, buffer_size_adj);
 
    /* not valid elements use offset 0 */
    offset = LLVMBuildAnd(builder, offset, valid_mask, "");
@@ -1566,10 +1552,10 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    LLVMBuilderRef builder;
    char func_name[64];
    struct lp_type vs_type;
-   LLVMValueRef count, fetch_elts, start_or_maxelt, start;
+   LLVMValueRef count, fetch_elts, start_or_maxelt;
    LLVMValueRef vertex_id_offset, start_instance;
    LLVMValueRef stride, step, io_itr;
-   LLVMValueRef ind_vec, ind_vec_store, have_elts, fetch_max, tmp;
+   LLVMValueRef ind_vec, start_vec, have_elts, fetch_max, tmp;
    LLVMValueRef io_ptr, vbuffers_ptr, vb_ptr;
    LLVMValueRef vb_stride[PIPE_MAX_ATTRIBS];
    LLVMValueRef map_ptr[PIPE_MAX_ATTRIBS];
@@ -1580,7 +1566,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    struct draw_context *draw = llvm->draw;
    const struct tgsi_shader_info *vs_info = &draw->vs.vertex_shader->info;
    unsigned i, j;
-   struct lp_build_context bld, bldivec, blduivec;
+   struct lp_build_context bld, blduivec;
    struct lp_build_loop_state lp_loop;
    struct lp_build_if_state if_ctx;
    const int vector_length = lp_native_vector_width / 32;
@@ -1640,6 +1626,11 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    io_ptr                    = LLVMGetParam(variant_func, 1);
    vbuffers_ptr              = LLVMGetParam(variant_func, 2);
    count                     = LLVMGetParam(variant_func, 3);
+   /*
+    * XXX: the maxelt part is unused. Not really useful, since we cannot
+    * get index buffer overflows due to vsplit (which provides its own
+    * elts buffer, with a different size than what's passed in here).
+    */
    start_or_maxelt           = LLVMGetParam(variant_func, 4);
    /*
     * XXX: stride is actually unused. The stride we use is strictly calculated
@@ -1682,7 +1673,6 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    vs_type.length = vector_length;
 
    lp_build_context_init(&bld, gallivm, lp_type_uint(32));
-   lp_build_context_init(&bldivec, gallivm, lp_int_type(vs_type));
    lp_build_context_init(&blduivec, gallivm, lp_uint_type(vs_type));
 
    /* hold temporary "bool" clipmask */
@@ -1706,29 +1696,16 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    }
 
    fetch_max = lp_build_alloca(gallivm, int32_type, "fetch_max");
-   ind_vec_store = lp_build_alloca(gallivm, bldivec.vec_type, "ind_vec");
 
    have_elts = LLVMBuildICmp(builder, LLVMIntNE,
                              LLVMConstPointerNull(arg_types[10]), fetch_elts, "");
-   lp_build_if(&if_ctx, gallivm, have_elts);
-   {
-      LLVMBuildStore(builder, ind_vec, ind_vec_store);
-      LLVMBuildStore(builder, count, fetch_max);
-   }
-   lp_build_else(&if_ctx);
-   {
-      tmp = lp_build_add(&bld, count, start_or_maxelt);
-      LLVMBuildStore(builder, tmp, fetch_max);
-      start = lp_build_broadcast_scalar(&bldivec, start_or_maxelt);
-      tmp = lp_build_add(&bldivec, start, ind_vec);
-      LLVMBuildStore(builder, tmp, ind_vec_store);
-   }
-   lp_build_endif(&if_ctx);
 
-   fetch_max = LLVMBuildLoad(builder, fetch_max, "");
-   fetch_max = LLVMBuildSub(builder, fetch_max, bld.one, "fetch_max");
-   fetch_max = lp_build_broadcast_scalar(&bldivec, fetch_max);
-   ind_vec = LLVMBuildLoad(builder, ind_vec_store, "");
+   fetch_max = LLVMBuildSub(builder, count, bld.one, "fetch_max");
+   fetch_max = lp_build_broadcast_scalar(&blduivec, fetch_max);
+   /*
+    * Only needed for non-indexed path.
+    */
+   start_vec = lp_build_broadcast_scalar(&blduivec, start_or_maxelt);
 
    /*
     * Pre-calculate everything which is constant per shader invocation.
@@ -1757,9 +1734,12 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
          /*
           * We'll set buffer_size_adj to zero if we have of, so it will
           * always overflow later automatically without having to keep ofbit.
+          * Overflows (with normal wraparound) doing the actual offset
+          * calculation should be ok, just not for the buffer size calc.
+          * It would also be possible to detect such overflows and return
+          * zeros if that happens, but this would be more complex.
           */
-         buf_offset = lp_build_uadd_overflow(gallivm, vb_buffer_offset,
-                                             src_offset, &ofbit);
+         buf_offset = lp_build_add(&bld, vb_buffer_offset, src_offset);
          tmp = lp_build_sub(&bld, bsize, bld.one);
          buffer_size_adj[j] = lp_build_usub_overflow(gallivm, buffer_size, tmp,
                                                      &ofbit);
@@ -1843,21 +1823,17 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
       true_index_array = LLVMBuildAdd(builder, true_index_array, ind_vec, "");
 
       /*
-       * XXX: This code is really fishy. We are required to use a int min
-       * here, not uint. The reason is that for some non-indexed draws, we
-       * might get something like MAX_UINT - 3 as start value (due to start
-       * vertex). So, the first 3 elements in the vector are huge, and
-       * limiting them to fetch_max is incorrect. By using int min, we'll
-       * pick that huge value - we rely on this creating an overflow (which
-       * is guaranteed) in the stride mul later (using (signed) cmp and
-       * incorporating the result into ofmask would also work).
-       * For the later elements, this just wraps around the indices, which
-       * is apparently ok...
+       * Limit indices to fetch_max, otherwise might try to access indices
+       * beyond index buffer (or rather vsplit elt buffer) size.
+       * Could probably safely (?) skip this for non-indexed draws and
+       * simplify things minimally (by removing it could combine the ind_vec
+       * and start_vec adds). I think the only effect for non-indexed draws will
+       * be that for the invalid elements they will be all fetched from the
+       * same location as the last valid one, but noone should really care.
        */
-      true_index_array = lp_build_min(&bldivec, true_index_array, fetch_max);
+      true_index_array = lp_build_min(&blduivec, true_index_array, fetch_max);
 
-      index_store = lp_build_alloca_undef(gallivm, bldivec.vec_type, "index_store");
-      LLVMBuildStore(builder, true_index_array, index_store);
+      index_store = lp_build_alloca_undef(gallivm, blduivec.vec_type, "index_store");
 
       lp_build_if(&if_ctx, gallivm, have_elts);
       {
@@ -1875,22 +1851,27 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
           * not being zero will get a different fetch index than the valid
           * index 0. So, just rely on vsplit code preventing out-of-bounds
           * fetches. This is also why it's safe to do elts fetch even if there
-          * was no index buffer bound - the real buffer is never seen here.
+          * was no index buffer bound - the real buffer is never seen here, at
+          * least not if there are index buffer overflows...
           */
 
          /*
           * XXX should not have to do this, as scale can be handled
           * natively by loads (hits asserts though).
           */
-         true_index_array = lp_build_shl_imm(&blduivec, true_index_array, 2);
+         tmp = lp_build_shl_imm(&blduivec, true_index_array, 2);
          fetch_elts = LLVMBuildBitCast(builder, fetch_elts,
                                        LLVMPointerType(LLVMInt8TypeInContext(context),
                                                        0), "");
-         true_index_array = lp_build_gather(gallivm, vs_type.length,
-                                            32, 32, TRUE,
-                                            fetch_elts, true_index_array,
-                                            FALSE);
-         LLVMBuildStore(builder, true_index_array, index_store);
+         tmp = lp_build_gather(gallivm, vs_type.length,
+                               32, 32, TRUE,
+                               fetch_elts, tmp, FALSE);
+         LLVMBuildStore(builder, tmp, index_store);
+      }
+      lp_build_else(&if_ctx);
+      {
+         tmp = LLVMBuildAdd(builder, true_index_array, start_vec, "");
+         LLVMBuildStore(builder, tmp, index_store);
       }
       lp_build_endif(&if_ctx);
 
-- 
2.30.2