From 2b2fc03beb6ff6a5e87f16afbd687ac9addc3824 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Wed, 23 Oct 2013 19:13:21 +0200
Subject: [PATCH] gallivm: implement fully accurate corner filtering for
 seamless cube maps

d3d10 requires that cube corners are filtered with accurate weights (that
is, the weight of the non-existing corner texel should be evenly distributed
to the other 3 texels). OpenGL does not require this (but recommends it).
This requires us to use different filtering code, since we need per-texel
weights which our 2d lerp doesn't (and can't) do. And of course the (now
per element) weights need to be adjusted too for it to work.
Invoke the new filtering code whenever there's an edge to keep things simpler,
as it works for edges too, not just corners, though of course it's only needed
for corners.
More ugly code for not much gain but at least a hacked up cubemap demo
shows very nice corners now... Not sure yet if and how this should be
configurable...

v2: incorporate feedback from Jose, only use special corner filtering code
when there's a corner not when there's only an edge (as corner filtering code
is slower, though a perf difference was only measurable when always
forcing edge code). Plus some minor style fixes.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 .../auxiliary/gallivm/lp_bld_sample_soa.c     | 164 ++++++++++++++++--
 1 file changed, 151 insertions(+), 13 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 8e2d0d9f33b..2d833318aee 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -822,6 +822,12 @@ lp_build_masklerp2d(struct lp_build_context *bld,
    return lp_build_lerp(bld, weight1, val0, val1, 0);
 }
 
+/*
+ * This is rather a lot of code for something OpenGL only recommends
+ * but does not require.
+ */
+#define ACCURATE_CUBE_CORNERS 1
+
 /**
  * Generate code to sample a mipmap level with linear filtering.
  * If sampling a cube texture, r = cube face in [0,5].
@@ -840,6 +846,9 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                              const LLVMValueRef *offsets,
                              LLVMValueRef colors_out[4])
 {
+   LLVMBuilderRef builder = bld->gallivm->builder;
+   struct lp_build_context *ivec_bld = &bld->int_coord_bld;
+   struct lp_build_context *coord_bld = &bld->coord_bld;
    const unsigned dims = bld->dims;
    LLVMValueRef width_vec;
    LLVMValueRef height_vec;
@@ -848,6 +857,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    LLVMValueRef flt_width_vec;
    LLVMValueRef flt_height_vec;
    LLVMValueRef flt_depth_vec;
+   LLVMValueRef fall_off[4], have_corners;
    LLVMValueRef z1 = NULL;
    LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
    LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
@@ -856,6 +866,11 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    LLVMValueRef xs[4], ys[4], zs[4];
    LLVMValueRef neighbors[2][2][4];
    int chan, texel_index;
+   boolean seamless_cube_filter, accurate_cube_corners;
+
+   seamless_cube_filter = bld->static_texture_state->target == PIPE_TEXTURE_CUBE &&
+                          bld->static_sampler_state->seamless_cube_map;
+   accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter;
 
    lp_build_extract_image_sizes(bld,
                                 &bld->int_size_bld,
@@ -875,8 +890,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
     * Compute integer texcoords.
     */
 
-   if (bld->static_texture_state->target != PIPE_TEXTURE_CUBE ||
-       !bld->static_sampler_state->seamless_cube_map) {
+   if (!seamless_cube_filter) {
       lp_build_sample_wrap_linear(bld, coords[0], width_vec,
                                   flt_width_vec, offsets[0],
                                   bld->static_texture_state->pot_width,
@@ -918,13 +932,11 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
       }
    }
    else {
-      LLVMBuilderRef builder = bld->gallivm->builder;
-      struct lp_build_context *ivec_bld = &bld->int_coord_bld;
-      struct lp_build_context *coord_bld = &bld->coord_bld;
       struct lp_build_if_state edge_if;
+      LLVMTypeRef int1t;
       LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
-      LLVMValueRef fall_off[4], coord, have_edge;
-      LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp;
+      LLVMValueRef coord, have_edge, have_corner;
+      LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
       LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
       LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
       LLVMValueRef face = coords[2];
@@ -957,12 +969,15 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
       fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
       fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, length_minus_one);
 
-      have_edge = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
-      have_edge = lp_build_or(ivec_bld, have_edge, fall_off[2]);
-      have_edge = lp_build_or(ivec_bld, have_edge, fall_off[3]);
-
+      fall_off_x = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
+      fall_off_y = lp_build_or(ivec_bld, fall_off[2], fall_off[3]);
+      have_edge = lp_build_or(ivec_bld, fall_off_x, fall_off_y);
       have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_edge);
 
+      /* needed for accurate corner filtering branch later, rely on 0 init */
+      int1t = LLVMInt1TypeInContext(bld->gallivm->context);
+      have_corners = lp_build_alloca(bld->gallivm, int1t, "have_corner");
+
       for (texel_index = 0; texel_index < 4; texel_index++) {
          xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "xs");
          ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "ys");
@@ -971,6 +986,10 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
 
       lp_build_if(&edge_if, bld->gallivm, have_edge);
 
+      have_corner = lp_build_and(ivec_bld, fall_off_x, fall_off_y);
+      have_corner = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_corner);
+      LLVMBuildStore(builder, have_corner, have_corners);
+
       /*
        * Need to feed clamped values here for cheap corner handling,
        * but only for y coord (as when falling off both edges we only
@@ -1074,7 +1093,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
 
    if (linear_mask) {
       /*
-       * Whack filter weights into place. Whatever pixel had more weight is
+       * Whack filter weights into place. Whatever texel had more weight is
        * the one which should have been selected by nearest filtering hence
        * just use 100% weight for it.
        */
@@ -1135,7 +1154,8 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    }
    else {
       /* 2D/3D texture */
-      LLVMValueRef colors0[4];
+      struct lp_build_if_state corner_if;
+      LLVMValueRef colors0[4], colorss[4];
 
       /* get x0/x1 texels at y1 */
       lp_build_sample_texel_soa(bld,
@@ -1149,6 +1169,110 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                                 row_stride_vec, img_stride_vec,
                                 data_ptr, mipoffsets, neighbors[1][1]);
 
+      /*
+       * To avoid having to duplicate linear_mask / fetch code use
+       * another branch (with corner condition though edge would work
+       * as well) here.
+       */
+      if (accurate_cube_corners) {
+         LLVMValueRef w00, w01, w10, w11, wx0, wy0;
+         LLVMValueRef c_weight, c00, c01, c10, c11;
+         LLVMValueRef have_corner, one_third, tmp;
+
+         colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
+         colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
+         colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
+         colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
+
+         have_corner = LLVMBuildLoad(builder, have_corners, "");
+
+         lp_build_if(&corner_if, bld->gallivm, have_corner);
+
+         /*
+          * we can't use standard 2d lerp as we need per-element weight
+          * in case of corners, so just calculate bilinear result as
+          * w00*s00 + w01*s01 + w10*s10 + w11*s11.
+          * (This is actually less work than using 2d lerp, 7 vs. 9 instructions,
+          * however calculating the weights needs another 6, so actually probably
+          * not slower than 2d lerp only for 4 channels as weights only need
+          * to be calculated once - of course fixing the weights has additional cost.)
+          */
+         wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
+         wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
+         w00 = lp_build_mul(coord_bld, wx0, wy0);
+         w01 = lp_build_mul(coord_bld, s_fpart, wy0);
+         w10 = lp_build_mul(coord_bld, wx0, t_fpart);
+         w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
+
+         /* find corner weight */
+         c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
+         c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
+         c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
+         c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
+         c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
+         c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
+         c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
+         c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
+
+         /*
+          * add 1/3 of the corner weight to each of the 3 other samples
+          * and null out corner weight
+          */
+         one_third = lp_build_const_vec(bld->gallivm, coord_bld->type, 1.0f/3.0f);
+         c_weight = lp_build_mul(coord_bld, c_weight, one_third);
+         w00 = lp_build_add(coord_bld, w00, c_weight);
+         c00 = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
+         w00 = lp_build_andnot(coord_bld, w00, c00);
+         w01 = lp_build_add(coord_bld, w01, c_weight);
+         c01 = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
+         w01 = lp_build_andnot(coord_bld, w01, c01);
+         w10 = lp_build_add(coord_bld, w10, c_weight);
+         c10 = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
+         w10 = lp_build_andnot(coord_bld, w10, c10);
+         w11 = lp_build_add(coord_bld, w11, c_weight);
+         c11 = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
+         w11 = lp_build_andnot(coord_bld, w11, c11);
+
+         if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
+            for (chan = 0; chan < 4; chan++) {
+               colors0[chan] = lp_build_mul(coord_bld, w00, neighbors[0][0][chan]);
+               tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
+               colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
+               tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
+               colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
+               tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
+               colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
+            }
+         }
+         else {
+            LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
+            cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
+            cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
+            cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
+            cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
+            /* inputs to interpolation are just masks so just add masked weights together */
+            cmpval00 = LLVMBuildBitCast(builder, cmpval00, coord_bld->vec_type, "");
+            cmpval01 = LLVMBuildBitCast(builder, cmpval01, coord_bld->vec_type, "");
+            cmpval10 = LLVMBuildBitCast(builder, cmpval10, coord_bld->vec_type, "");
+            cmpval11 = LLVMBuildBitCast(builder, cmpval11, coord_bld->vec_type, "");
+            colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
+            tmp = lp_build_and(coord_bld, w01, cmpval01);
+            colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
+            tmp = lp_build_and(coord_bld, w10, cmpval10);
+            colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
+            tmp = lp_build_and(coord_bld, w11, cmpval11);
+            colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
+            colors0[1] = colors0[2] = colors0[3] = colors0[0];
+         }
+
+         LLVMBuildStore(builder, colors0[0], colorss[0]);
+         LLVMBuildStore(builder, colors0[1], colorss[1]);
+         LLVMBuildStore(builder, colors0[2], colorss[2]);
+         LLVMBuildStore(builder, colors0[3], colorss[3]);
+
+         lp_build_else(&corner_if);
+      }
+
       if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
          /* Bilinear interpolate the four samples from the 2D image / 3D slice */
          for (chan = 0; chan < 4; chan++) {
@@ -1172,6 +1296,20 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
          colors0[1] = colors0[2] = colors0[3] = colors0[0];
       }
 
+      if (accurate_cube_corners) {
+         LLVMBuildStore(builder, colors0[0], colorss[0]);
+         LLVMBuildStore(builder, colors0[1], colorss[1]);
+         LLVMBuildStore(builder, colors0[2], colorss[2]);
+         LLVMBuildStore(builder, colors0[3], colorss[3]);
+
+         lp_build_endif(&corner_if);
+
+         colors0[0] = LLVMBuildLoad(builder, colorss[0], "");
+         colors0[1] = LLVMBuildLoad(builder, colorss[1], "");
+         colors0[2] = LLVMBuildLoad(builder, colorss[2], "");
+         colors0[3] = LLVMBuildLoad(builder, colorss[3], "");
+      }
+
       if (dims == 3) {
          LLVMValueRef neighbors1[2][2][4];
          LLVMValueRef colors1[4];
-- 
2.30.2