From: Roland Scheidegger <sroland@vmware.com>
Date: Sat, 5 Oct 2013 01:31:56 +0000 (+0200)
Subject: gallivm: kill old per-quad face selection code
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=9b3dbaf3962106be18df0f9c5d27dd993f09fe88;p=mesa.git

gallivm: kill old per-quad face selection code

Not used since ages, and it wouldn't work at all with explicit derivatives now
(not that it did before as it ignored them but now the code would just use
the derivs pre-projected which would be quite random numbers).

v2: also get rid of 3 helper functions no longer used.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 39c3a2f9d9e..1c352006f3e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -1414,72 +1414,6 @@ lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
    return ima;
 }
 
-/** Helper used by lp_build_cube_lookup() */
-static LLVMValueRef
-lp_build_cube_imaneg(struct lp_build_context *coord_bld, LLVMValueRef coord)
-{
-   /* ima = -0.5 / abs(coord); */
-   LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5);
-   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
-   LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord);
-   return ima;
-}
-
-/**
- * Helper used by lp_build_cube_lookup()
- * FIXME: the sign here can also be 0.
- * Arithmetically this could definitely make a difference. Either
- * fix the comment or use other (simpler) sign function, not sure
- * which one it should be.
- * \param sign  scalar +1 or -1
- * \param coord  float vector
- * \param ima  float vector
- */
-static LLVMValueRef
-lp_build_cube_coord(struct lp_build_context *coord_bld,
-                    LLVMValueRef sign, int negate_coord,
-                    LLVMValueRef coord, LLVMValueRef ima)
-{
-   /* return negate(coord) * ima * sign + 0.5; */
-   LLVMValueRef half = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
-   LLVMValueRef res;
-
-   assert(negate_coord == +1 || negate_coord == -1);
-
-   if (negate_coord == -1) {
-      coord = lp_build_negate(coord_bld, coord);
-   }
-
-   res = lp_build_mul(coord_bld, coord, ima);
-   if (sign) {
-      sign = lp_build_broadcast_scalar(coord_bld, sign);
-      res = lp_build_mul(coord_bld, res, sign);
-   }
-   res = lp_build_add(coord_bld, res, half);
-
-   return res;
-}
-
-
-/** Helper used by lp_build_cube_lookup()
- * Return (major_coord >= 0) ? pos_face : neg_face;
- */
-static LLVMValueRef
-lp_build_cube_face(struct lp_build_sample_context *bld,
-                   LLVMValueRef major_coord,
-                   unsigned pos_face, unsigned neg_face)
-{
-   struct gallivm_state *gallivm = bld->gallivm;
-   LLVMBuilderRef builder = gallivm->builder;
-   LLVMValueRef cmp = LLVMBuildFCmp(builder, LLVMRealUGE,
-                                    major_coord,
-                                    bld->float_bld.zero, "");
-   LLVMValueRef pos = lp_build_const_int32(gallivm, pos_face);
-   LLVMValueRef neg = lp_build_const_int32(gallivm, neg_face);
-   LLVMValueRef res = LLVMBuildSelect(builder, cmp, pos, neg, "");
-   return res;
-}
-
 
 /** Helper for doing 3-wise selection.
  * Returns sel1 ? val2 : (sel0 ? val0 : val1).
@@ -1497,6 +1431,7 @@ lp_build_select3(struct lp_build_context *sel_bld,
    return lp_build_select(sel_bld, sel1, val2, tmp);
 }
 
+
 /**
  * Generate code to do cube face selection and compute per-face texcoords.
  */
@@ -1513,301 +1448,141 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
    struct gallivm_state *gallivm = bld->gallivm;
    LLVMValueRef si, ti, ri;
 
-   if (1 || coord_bld->type.length > 4) {
-      /*
-       * Do per-pixel face selection. We cannot however (as we used to do)
-       * simply calculate the derivs afterwards (which is very bogus for
-       * explicit derivs btw) because the values would be "random" when
-       * not all pixels lie on the same face. So what we do here is just
-       * calculate the derivatives after scaling the coords by the absolute
-       * value of the inverse major axis, and essentially do rho calculation
-       * steps as if it were a 3d texture. This is perfect if all pixels hit
-       * the same face, but not so great at edges, I believe the max error
-       * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially measuring
-       * the 3d distance between 2 points on the cube instead of measuring up/down
-       * the edge). Still this is possibly a win over just selecting the same face
-       * for all pixels. Unfortunately, something like that doesn't work for
-       * explicit derivatives.
-       */
-      struct lp_build_context *cint_bld = &bld->int_coord_bld;
-      struct lp_type intctype = cint_bld->type;
-      LLVMTypeRef coord_vec_type = coord_bld->vec_type;
-      LLVMTypeRef cint_vec_type = cint_bld->vec_type;
-      LLVMValueRef as, at, ar, face, face_s, face_t;
-      LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
-      LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
-      LLVMValueRef tnegi, rnegi;
-      LLVMValueRef ma, mai, signma, signmabit, imahalfpos;
-      LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
-      LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
-                                                     1 << (intctype.width - 1));
-      LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
-                                                      intctype.width -1);
-      LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
-      LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
-      LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
-      LLVMValueRef s = coords[0];
-      LLVMValueRef t = coords[1];
-      LLVMValueRef r = coords[2];
-
-      assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
-      assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
-      assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
+   /*
+    * Do per-pixel face selection. We cannot however (as we used to do)
+    * simply calculate the derivs afterwards (which is very bogus for
+    * explicit derivs btw) because the values would be "random" when
+    * not all pixels lie on the same face. So what we do here is just
+    * calculate the derivatives after scaling the coords by the absolute
+    * value of the inverse major axis, and essentially do rho calculation
+    * steps as if it were a 3d texture. This is perfect if all pixels hit
+    * the same face, but not so great at edges, I believe the max error
+    * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially measuring
+    * the 3d distance between 2 points on the cube instead of measuring up/down
+    * the edge). Still this is possibly a win over just selecting the same face
+    * for all pixels. Unfortunately, something like that doesn't work for
+    * explicit derivatives.
+    */
+   struct lp_build_context *cint_bld = &bld->int_coord_bld;
+   struct lp_type intctype = cint_bld->type;
+   LLVMTypeRef coord_vec_type = coord_bld->vec_type;
+   LLVMTypeRef cint_vec_type = cint_bld->vec_type;
+   LLVMValueRef as, at, ar, face, face_s, face_t;
+   LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
+   LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
+   LLVMValueRef tnegi, rnegi;
+   LLVMValueRef ma, mai, signma, signmabit, imahalfpos;
+   LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
+   LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
+                                                  1 << (intctype.width - 1));
+   LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
+                                                   intctype.width -1);
+   LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
+   LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
+   LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
+   LLVMValueRef s = coords[0];
+   LLVMValueRef t = coords[1];
+   LLVMValueRef r = coords[2];
+
+   assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
+   assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
+   assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
+
+   /*
+    * get absolute value (for x/y/z face selection) and sign bit
+    * (for mirroring minor coords and pos/neg face selection)
+    * of the original coords.
+    */
+   as = lp_build_abs(&bld->coord_bld, s);
+   at = lp_build_abs(&bld->coord_bld, t);
+   ar = lp_build_abs(&bld->coord_bld, r);
 
+   /*
+    * major face determination: select x if x > y else select y
+    * select z if z >= max(x,y) else select previous result
+    * if some axis are the same we chose z over y, y over x - the
+    * dx10 spec seems to ask for it while OpenGL doesn't care (if we
+    * wouldn't care could save a select or two if using different
+    * compares and doing at_g_as_ar last since tnewx and tnewz are the
+    * same).
+    */
+   as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
+   maxasat = lp_build_max(coord_bld, as, at);
+   ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
+
+   if (need_derivs && (derivs_in ||
+       ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
+        (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX)))) {
       /*
-       * get absolute value (for x/y/z face selection) and sign bit
-       * (for mirroring minor coords and pos/neg face selection)
-       * of the original coords.
+       * XXX: This is really really complex.
+       * It is a bit overkill to use this for implicit derivatives as well,
+       * no way this is worth the cost in practice, but seems to be the
+       * only way for getting accurate and per-pixel lod values.
        */
-      as = lp_build_abs(&bld->coord_bld, s);
-      at = lp_build_abs(&bld->coord_bld, t);
-      ar = lp_build_abs(&bld->coord_bld, r);
-
+      LLVMValueRef ima, imahalf, tmp, ddx[3], ddy[3];
+      LLVMValueRef madx, mady, madxdivma, madydivma;
+      LLVMValueRef sdxi, tdxi, rdxi, sdyi, tdyi, rdyi;
+      LLVMValueRef tdxnegi, rdxnegi, tdynegi, rdynegi;
+      LLVMValueRef sdxnewx, sdxnewy, sdxnewz, tdxnewx, tdxnewy, tdxnewz;
+      LLVMValueRef sdynewx, sdynewy, sdynewz, tdynewx, tdynewy, tdynewz;
+      LLVMValueRef face_sdx, face_tdx, face_sdy, face_tdy;
       /*
-       * major face determination: select x if x > y else select y
-       * select z if z >= max(x,y) else select previous result
-       * if some axis are the same we chose z over y, y over x - the
-       * dx10 spec seems to ask for it while OpenGL doesn't care (if we
-       * wouldn't care could save a select or two if using different
-       * compares and doing at_g_as_ar last since tnewx and tnewz are the
-       * same).
+       * s = 1/2 * ( sc / ma + 1)
+       * t = 1/2 * ( tc / ma + 1)
+       *
+       * s' = 1/2 * (sc' * ma - sc * ma') / ma^2
+       * t' = 1/2 * (tc' * ma - tc * ma') / ma^2
+       *
+       * dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma
+       * dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma
+       * dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma
+       * dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma
        */
-      as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
-      maxasat = lp_build_max(coord_bld, as, at);
-      ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
-
-      if (need_derivs && (derivs_in ||
-          ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
-           (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX)))) {
-         /*
-          * XXX: This is really really complex.
-          * It is a bit overkill to use this for implicit derivatives as well,
-          * no way this is worth the cost in practice, but seems to be the
-          * only way for getting accurate and per-pixel lod values.
-          */
-         LLVMValueRef ima, imahalf, tmp, ddx[3], ddy[3];
-         LLVMValueRef madx, mady, madxdivma, madydivma;
-         LLVMValueRef sdxi, tdxi, rdxi, sdyi, tdyi, rdyi;
-         LLVMValueRef tdxnegi, rdxnegi, tdynegi, rdynegi;
-         LLVMValueRef sdxnewx, sdxnewy, sdxnewz, tdxnewx, tdxnewy, tdxnewz;
-         LLVMValueRef sdynewx, sdynewy, sdynewz, tdynewx, tdynewy, tdynewz;
-         LLVMValueRef face_sdx, face_tdx, face_sdy, face_tdy;
-         /*
-          * s = 1/2 * ( sc / ma + 1)
-          * t = 1/2 * ( tc / ma + 1)
-          *
-          * s' = 1/2 * (sc' * ma - sc * ma') / ma^2
-          * t' = 1/2 * (tc' * ma - tc * ma') / ma^2
-          *
-          * dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma
-          * dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma
-          * dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma
-          * dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma
-          */
-
-         /* select ma, calculate ima */
-         ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
-         mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
-         signmabit = LLVMBuildAnd(builder, mai, signmask, "");
-         ima = lp_build_div(coord_bld, coord_bld->one, ma);
-         imahalf = lp_build_mul(coord_bld, posHalf, ima);
-         imahalfpos = lp_build_abs(coord_bld, imahalf);
-
-         if (!derivs_in) {
-            ddx[0] = lp_build_ddx(coord_bld, s);
-            ddx[1] = lp_build_ddx(coord_bld, t);
-            ddx[2] = lp_build_ddx(coord_bld, r);
-            ddy[0] = lp_build_ddy(coord_bld, s);
-            ddy[1] = lp_build_ddy(coord_bld, t);
-            ddy[2] = lp_build_ddy(coord_bld, r);
-         }
-         else {
-            ddx[0] = derivs_in->ddx[0];
-            ddx[1] = derivs_in->ddx[1];
-            ddx[2] = derivs_in->ddx[2];
-            ddy[0] = derivs_in->ddy[0];
-            ddy[1] = derivs_in->ddy[1];
-            ddy[2] = derivs_in->ddy[2];
-         }
-
-         /* select major derivatives */
-         madx = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddx[0], ddx[1], ddx[2]);
-         mady = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddy[0], ddy[1], ddy[2]);
-
-         si = LLVMBuildBitCast(builder, s, cint_vec_type, "");
-         ti = LLVMBuildBitCast(builder, t, cint_vec_type, "");
-         ri = LLVMBuildBitCast(builder, r, cint_vec_type, "");
-
-         sdxi = LLVMBuildBitCast(builder, ddx[0], cint_vec_type, "");
-         tdxi = LLVMBuildBitCast(builder, ddx[1], cint_vec_type, "");
-         rdxi = LLVMBuildBitCast(builder, ddx[2], cint_vec_type, "");
-
-         sdyi = LLVMBuildBitCast(builder, ddy[0], cint_vec_type, "");
-         tdyi = LLVMBuildBitCast(builder, ddy[1], cint_vec_type, "");
-         rdyi = LLVMBuildBitCast(builder, ddy[2], cint_vec_type, "");
 
-         /*
-          * compute all possible new s/t coords, which does the mirroring,
-          * and do the same for derivs minor axes.
-          * snewx = signma * -r;
-          * tnewx = -t;
-          * snewy = s;
-          * tnewy = signma * r;
-          * snewz = signma * s;
-          * tnewz = -t;
-          */
-         tnegi = LLVMBuildXor(builder, ti, signmask, "");
-         rnegi = LLVMBuildXor(builder, ri, signmask, "");
-         tdxnegi = LLVMBuildXor(builder, tdxi, signmask, "");
-         rdxnegi = LLVMBuildXor(builder, rdxi, signmask, "");
-         tdynegi = LLVMBuildXor(builder, tdyi, signmask, "");
-         rdynegi = LLVMBuildXor(builder, rdyi, signmask, "");
-
-         snewx = LLVMBuildXor(builder, signmabit, rnegi, "");
-         tnewx = tnegi;
-         sdxnewx = LLVMBuildXor(builder, signmabit, rdxnegi, "");
-         tdxnewx = tdxnegi;
-         sdynewx = LLVMBuildXor(builder, signmabit, rdynegi, "");
-         tdynewx = tdynegi;
-
-         snewy = si;
-         tnewy = LLVMBuildXor(builder, signmabit, ri, "");
-         sdxnewy = sdxi;
-         tdxnewy = LLVMBuildXor(builder, signmabit, rdxi, "");
-         sdynewy = sdyi;
-         tdynewy = LLVMBuildXor(builder, signmabit, rdyi, "");
-
-         snewz = LLVMBuildXor(builder, signmabit, si, "");
-         tnewz = tnegi;
-         sdxnewz = LLVMBuildXor(builder, signmabit, sdxi, "");
-         tdxnewz = tdxnegi;
-         sdynewz = LLVMBuildXor(builder, signmabit, sdyi, "");
-         tdynewz = tdynegi;
-
-         /* select the mirrored values */
-         face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
-         face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz);
-         face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz);
-         face_sdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdxnewx, sdxnewy, sdxnewz);
-         face_tdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdxnewx, tdxnewy, tdxnewz);
-         face_sdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdynewx, sdynewy, sdynewz);
-         face_tdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdynewx, tdynewy, tdynewz);
-
-         face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, "");
-         face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, "");
-         face_sdx = LLVMBuildBitCast(builder, face_sdx, coord_vec_type, "");
-         face_tdx = LLVMBuildBitCast(builder, face_tdx, coord_vec_type, "");
-         face_sdy = LLVMBuildBitCast(builder, face_sdy, coord_vec_type, "");
-         face_tdy = LLVMBuildBitCast(builder, face_tdy, coord_vec_type, "");
-
-         /* deriv math, dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma */
-         madxdivma = lp_build_mul(coord_bld, madx, ima);
-         tmp = lp_build_mul(coord_bld, madxdivma, face_s);
-         tmp = lp_build_sub(coord_bld, face_sdx, tmp);
-         derivs_out->ddx[0] = lp_build_mul(coord_bld, tmp, imahalf);
-
-         /* dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma */
-         tmp = lp_build_mul(coord_bld, madxdivma, face_t);
-         tmp = lp_build_sub(coord_bld, face_tdx, tmp);
-         derivs_out->ddx[1] = lp_build_mul(coord_bld, tmp, imahalf);
-
-         /* dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma */
-         madydivma = lp_build_mul(coord_bld, mady, ima);
-         tmp = lp_build_mul(coord_bld, madydivma, face_s);
-         tmp = lp_build_sub(coord_bld, face_sdy, tmp);
-         derivs_out->ddy[0] = lp_build_mul(coord_bld, tmp, imahalf);
-
-         /* dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma */
-         tmp = lp_build_mul(coord_bld, madydivma, face_t);
-         tmp = lp_build_sub(coord_bld, face_tdy, tmp);
-         derivs_out->ddy[1] = lp_build_mul(coord_bld, tmp, imahalf);
-
-         signma = LLVMBuildLShr(builder, mai, signshift, "");
-         coords[2] = LLVMBuildOr(builder, face, signma, "face");
-
-         /* project coords */
-         face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
-         face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
-
-         coords[0] = lp_build_add(coord_bld, face_s, posHalf);
-         coords[1] = lp_build_add(coord_bld, face_t, posHalf);
-
-         return;
+      /* select ma, calculate ima */
+      ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
+      mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
+      signmabit = LLVMBuildAnd(builder, mai, signmask, "");
+      ima = lp_build_div(coord_bld, coord_bld->one, ma);
+      imahalf = lp_build_mul(coord_bld, posHalf, ima);
+      imahalfpos = lp_build_abs(coord_bld, imahalf);
+
+      if (!derivs_in) {
+         ddx[0] = lp_build_ddx(coord_bld, s);
+         ddx[1] = lp_build_ddx(coord_bld, t);
+         ddx[2] = lp_build_ddx(coord_bld, r);
+         ddy[0] = lp_build_ddy(coord_bld, s);
+         ddy[1] = lp_build_ddy(coord_bld, t);
+         ddy[2] = lp_build_ddy(coord_bld, r);
       }
-
-      else if (need_derivs) {
-         LLVMValueRef ddx_ddy[2], tmp[3], rho_vec;
-         static const unsigned char swizzle0[] = { /* no-op swizzle */
-            0, LP_BLD_SWIZZLE_DONTCARE,
-            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
-         };
-         static const unsigned char swizzle1[] = {
-            1, LP_BLD_SWIZZLE_DONTCARE,
-            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
-         };
-         static const unsigned char swizzle01[] = { /* no-op swizzle */
-            0, 1,
-            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
-         };
-         static const unsigned char swizzle23[] = {
-            2, 3,
-            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
-         };
-         static const unsigned char swizzle02[] = {
-            0, 2,
-            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
-         };
-
-         /*
-          * scale the s/t/r coords pre-select/mirror so we can calculate
-          * "reasonable" derivs.
-          */
-         ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
-         imahalfpos = lp_build_cube_imapos(coord_bld, ma);
-         s = lp_build_mul(coord_bld, s, imahalfpos);
-         t = lp_build_mul(coord_bld, t, imahalfpos);
-         r = lp_build_mul(coord_bld, r, imahalfpos);
-
-         /*
-          * This isn't quite the same as the "ordinary" (3d deriv) path since we
-          * know the texture is square which simplifies things (we can omit the
-          * size mul which happens very early completely here and do it at the
-          * very end).
-          * Also always do calculations according to GALLIVM_DEBUG_NO_RHO_APPROX
-          * since the error can get quite big otherwise at edges.
-          * (With no_rho_approx max error is sqrt(2) at edges, same as it is
-          * without no_rho_approx for 2d textures, otherwise it would be factor 2.)
-          */
-         ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
-         ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
-
-         ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
-         ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
-
-         tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
-         tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
-         tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
-
-         rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
-         rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
-
-         tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
-         tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
-         *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
+      else {
+         ddx[0] = derivs_in->ddx[0];
+         ddx[1] = derivs_in->ddx[1];
+         ddx[2] = derivs_in->ddx[2];
+         ddy[0] = derivs_in->ddy[0];
+         ddy[1] = derivs_in->ddy[1];
+         ddy[2] = derivs_in->ddy[2];
       }
 
-      if (!need_derivs) {
-         ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
-      }
-      mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
-      signmabit = LLVMBuildAnd(builder, mai, signmask, "");
+      /* select major derivatives */
+      madx = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddx[0], ddx[1], ddx[2]);
+      mady = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddy[0], ddy[1], ddy[2]);
 
       si = LLVMBuildBitCast(builder, s, cint_vec_type, "");
       ti = LLVMBuildBitCast(builder, t, cint_vec_type, "");
       ri = LLVMBuildBitCast(builder, r, cint_vec_type, "");
 
+      sdxi = LLVMBuildBitCast(builder, ddx[0], cint_vec_type, "");
+      tdxi = LLVMBuildBitCast(builder, ddx[1], cint_vec_type, "");
+      rdxi = LLVMBuildBitCast(builder, ddx[2], cint_vec_type, "");
+
+      sdyi = LLVMBuildBitCast(builder, ddy[0], cint_vec_type, "");
+      tdyi = LLVMBuildBitCast(builder, ddy[1], cint_vec_type, "");
+      rdyi = LLVMBuildBitCast(builder, ddy[2], cint_vec_type, "");
+
       /*
-       * compute all possible new s/t coords, which does the mirroring
+       * compute all possible new s/t coords, which does the mirroring,
+       * and do the same for derivs minor axes.
        * snewx = signma * -r;
        * tnewx = -t;
        * snewy = s;
@@ -1817,164 +1592,200 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
        */
       tnegi = LLVMBuildXor(builder, ti, signmask, "");
       rnegi = LLVMBuildXor(builder, ri, signmask, "");
+      tdxnegi = LLVMBuildXor(builder, tdxi, signmask, "");
+      rdxnegi = LLVMBuildXor(builder, rdxi, signmask, "");
+      tdynegi = LLVMBuildXor(builder, tdyi, signmask, "");
+      rdynegi = LLVMBuildXor(builder, rdyi, signmask, "");
 
       snewx = LLVMBuildXor(builder, signmabit, rnegi, "");
       tnewx = tnegi;
+      sdxnewx = LLVMBuildXor(builder, signmabit, rdxnegi, "");
+      tdxnewx = tdxnegi;
+      sdynewx = LLVMBuildXor(builder, signmabit, rdynegi, "");
+      tdynewx = tdynegi;
 
       snewy = si;
       tnewy = LLVMBuildXor(builder, signmabit, ri, "");
+      sdxnewy = sdxi;
+      tdxnewy = LLVMBuildXor(builder, signmabit, rdxi, "");
+      sdynewy = sdyi;
+      tdynewy = LLVMBuildXor(builder, signmabit, rdyi, "");
 
       snewz = LLVMBuildXor(builder, signmabit, si, "");
       tnewz = tnegi;
+      sdxnewz = LLVMBuildXor(builder, signmabit, sdxi, "");
+      tdxnewz = tdxnegi;
+      sdynewz = LLVMBuildXor(builder, signmabit, sdyi, "");
+      tdynewz = tdynegi;
 
       /* select the mirrored values */
+      face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
       face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz);
       face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz);
-      face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
+      face_sdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdxnewx, sdxnewy, sdxnewz);
+      face_tdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdxnewx, tdxnewy, tdxnewz);
+      face_sdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdynewx, sdynewy, sdynewz);
+      face_tdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdynewx, tdynewy, tdynewz);
 
       face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, "");
       face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, "");
+      face_sdx = LLVMBuildBitCast(builder, face_sdx, coord_vec_type, "");
+      face_tdx = LLVMBuildBitCast(builder, face_tdx, coord_vec_type, "");
+      face_sdy = LLVMBuildBitCast(builder, face_sdy, coord_vec_type, "");
+      face_tdy = LLVMBuildBitCast(builder, face_tdy, coord_vec_type, "");
+
+      /* deriv math, dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma */
+      madxdivma = lp_build_mul(coord_bld, madx, ima);
+      tmp = lp_build_mul(coord_bld, madxdivma, face_s);
+      tmp = lp_build_sub(coord_bld, face_sdx, tmp);
+      derivs_out->ddx[0] = lp_build_mul(coord_bld, tmp, imahalf);
+
+      /* dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma */
+      tmp = lp_build_mul(coord_bld, madxdivma, face_t);
+      tmp = lp_build_sub(coord_bld, face_tdx, tmp);
+      derivs_out->ddx[1] = lp_build_mul(coord_bld, tmp, imahalf);
+
+      /* dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma */
+      madydivma = lp_build_mul(coord_bld, mady, ima);
+      tmp = lp_build_mul(coord_bld, madydivma, face_s);
+      tmp = lp_build_sub(coord_bld, face_sdy, tmp);
+      derivs_out->ddy[0] = lp_build_mul(coord_bld, tmp, imahalf);
+
+      /* dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma */
+      tmp = lp_build_mul(coord_bld, madydivma, face_t);
+      tmp = lp_build_sub(coord_bld, face_tdy, tmp);
+      derivs_out->ddy[1] = lp_build_mul(coord_bld, tmp, imahalf);
 
-      /* add +1 for neg face */
-      /* XXX with AVX probably want to use another select here -
-       * as long as we ensure vblendvps gets used we can actually
-       * skip the comparison and just use sign as a "mask" directly.
-       */
       signma = LLVMBuildLShr(builder, mai, signshift, "");
       coords[2] = LLVMBuildOr(builder, face, signma, "face");
 
       /* project coords */
-      if (!need_derivs) {
-         imahalfpos = lp_build_cube_imapos(coord_bld, ma);
-         face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
-         face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
-      }
+      face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
+      face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
 
       coords[0] = lp_build_add(coord_bld, face_s, posHalf);
       coords[1] = lp_build_add(coord_bld, face_t, posHalf);
+
+      return;
    }
 
-   else {
-      struct lp_build_if_state if_ctx;
-      LLVMValueRef face_s_var;
-      LLVMValueRef face_t_var;
-      LLVMValueRef face_var;
-      LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
-      LLVMValueRef shuffles[4];
-      LLVMValueRef arxy_ge_aryx, arxy_ge_arzz, arxy_ge_arxy_arzz;
-      LLVMValueRef arxyxy, aryxzz, arxyxy_ge_aryxzz;
-      LLVMValueRef tmp[4], rxyz, arxyz;
-      struct lp_build_context *float_bld = &bld->float_bld;
-      LLVMValueRef s, t, r, face, face_s, face_t;
-
-      assert(bld->coord_bld.type.length == 4);
-
-      tmp[0] = s = coords[0];
-      tmp[1] = t = coords[1];
-      tmp[2] = r = coords[2];
-      rxyz = lp_build_hadd_partial4(&bld->coord_bld, tmp, 3);
-      arxyz = lp_build_abs(&bld->coord_bld, rxyz);
-
-      shuffles[0] = lp_build_const_int32(gallivm, 0);
-      shuffles[1] = lp_build_const_int32(gallivm, 1);
-      shuffles[2] = lp_build_const_int32(gallivm, 0);
-      shuffles[3] = lp_build_const_int32(gallivm, 1);
-      arxyxy = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
-      shuffles[0] = lp_build_const_int32(gallivm, 1);
-      shuffles[1] = lp_build_const_int32(gallivm, 0);
-      shuffles[2] = lp_build_const_int32(gallivm, 2);
-      shuffles[3] = lp_build_const_int32(gallivm, 2);
-      aryxzz = LLVMBuildShuffleVector(builder, arxyz, arxyz, LLVMConstVector(shuffles, 4), "");
-      arxyxy_ge_aryxzz = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_GEQUAL, arxyxy, aryxzz);
-
-      shuffles[0] = lp_build_const_int32(gallivm, 0);
-      shuffles[1] = lp_build_const_int32(gallivm, 1);
-      arxy_ge_aryx = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
-                                            LLVMConstVector(shuffles, 2), "");
-      shuffles[0] = lp_build_const_int32(gallivm, 2);
-      shuffles[1] = lp_build_const_int32(gallivm, 3);
-      arxy_ge_arzz = LLVMBuildShuffleVector(builder, arxyxy_ge_aryxzz, arxyxy_ge_aryxzz,
-                                            LLVMConstVector(shuffles, 2), "");
-      arxy_ge_arxy_arzz = LLVMBuildAnd(builder, arxy_ge_aryx, arxy_ge_arzz, "");
-
-      arx_ge_ary_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
-                                               lp_build_const_int32(gallivm, 0), "");
-      arx_ge_ary_arz = LLVMBuildICmp(builder, LLVMIntNE, arx_ge_ary_arz,
-                                               lp_build_const_int32(gallivm, 0), "");
-      ary_ge_arx_arz = LLVMBuildExtractElement(builder, arxy_ge_arxy_arzz,
-                                               lp_build_const_int32(gallivm, 1), "");
-      ary_ge_arx_arz = LLVMBuildICmp(builder, LLVMIntNE, ary_ge_arx_arz,
-                                               lp_build_const_int32(gallivm, 0), "");
-      face_s_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_s_var");
-      face_t_var = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "face_t_var");
-      face_var = lp_build_alloca(gallivm, bld->int_bld.vec_type, "face_var");
-
-      lp_build_if(&if_ctx, gallivm, arx_ge_ary_arz);
-      {
-         /* +/- X face */
-         LLVMValueRef sign, ima;
-         si = LLVMBuildExtractElement(builder, rxyz,
-                                      lp_build_const_int32(gallivm, 0), "");
-         /* +/- X face */
-         sign = lp_build_sgn(float_bld, si);
-         ima = lp_build_cube_imaneg(coord_bld, s);
-         face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
-         face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
-         face = lp_build_cube_face(bld, si,
-                                    PIPE_TEX_FACE_POS_X,
-                                    PIPE_TEX_FACE_NEG_X);
-         LLVMBuildStore(builder, face_s, face_s_var);
-         LLVMBuildStore(builder, face_t, face_t_var);
-         LLVMBuildStore(builder, face, face_var);
-      }
-      lp_build_else(&if_ctx);
-      {
-         struct lp_build_if_state if_ctx2;
-
-         lp_build_if(&if_ctx2, gallivm, ary_ge_arx_arz);
-         {
-            LLVMValueRef sign, ima;
-            /* +/- Y face */
-            ti = LLVMBuildExtractElement(builder, rxyz,
-                                         lp_build_const_int32(gallivm, 1), "");
-            sign = lp_build_sgn(float_bld, ti);
-            ima = lp_build_cube_imaneg(coord_bld, t);
-            face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
-            face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
-            face = lp_build_cube_face(bld, ti,
-                                       PIPE_TEX_FACE_POS_Y,
-                                       PIPE_TEX_FACE_NEG_Y);
-            LLVMBuildStore(builder, face_s, face_s_var);
-            LLVMBuildStore(builder, face_t, face_t_var);
-            LLVMBuildStore(builder, face, face_var);
-         }
-         lp_build_else(&if_ctx2);
-         {
-            /* +/- Z face */
-            LLVMValueRef sign, ima;
-            ri = LLVMBuildExtractElement(builder, rxyz,
-                                         lp_build_const_int32(gallivm, 2), "");
-            sign = lp_build_sgn(float_bld, ri);
-            ima = lp_build_cube_imaneg(coord_bld, r);
-            face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
-            face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
-            face = lp_build_cube_face(bld, ri,
-                                       PIPE_TEX_FACE_POS_Z,
-                                       PIPE_TEX_FACE_NEG_Z);
-            LLVMBuildStore(builder, face_s, face_s_var);
-            LLVMBuildStore(builder, face_t, face_t_var);
-            LLVMBuildStore(builder, face, face_var);
-         }
-         lp_build_endif(&if_ctx2);
-      }
+   else if (need_derivs) {
+      LLVMValueRef ddx_ddy[2], tmp[3], rho_vec;
+      static const unsigned char swizzle0[] = { /* no-op swizzle */
+         0, LP_BLD_SWIZZLE_DONTCARE,
+         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+      };
+      static const unsigned char swizzle1[] = {
+         1, LP_BLD_SWIZZLE_DONTCARE,
+         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+      };
+      static const unsigned char swizzle01[] = { /* no-op swizzle */
+         0, 1,
+         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+      };
+      static const unsigned char swizzle23[] = {
+         2, 3,
+         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+      };
+      static const unsigned char swizzle02[] = {
+         0, 2,
+         LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+      };
+
+      /*
+       * scale the s/t/r coords pre-select/mirror so we can calculate
+       * "reasonable" derivs.
+       */
+      ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
+      imahalfpos = lp_build_cube_imapos(coord_bld, ma);
+      s = lp_build_mul(coord_bld, s, imahalfpos);
+      t = lp_build_mul(coord_bld, t, imahalfpos);
+      r = lp_build_mul(coord_bld, r, imahalfpos);
+
+      /*
+       * This isn't quite the same as the "ordinary" (3d deriv) path since we
+       * know the texture is square which simplifies things (we can omit the
+       * size mul which happens very early completely here and do it at the
+       * very end).
+       * Also always do calculations according to GALLIVM_DEBUG_NO_RHO_APPROX
+       * since the error can get quite big otherwise at edges.
+       * (With no_rho_approx max error is sqrt(2) at edges, same as it is
+       * without no_rho_approx for 2d textures, otherwise it would be factor 2.)
+       */
+      ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
+      ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
 
-      lp_build_endif(&if_ctx);
+      ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
+      ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
 
-      coords[0] = LLVMBuildLoad(builder, face_s_var, "face_s");
-      coords[1] = LLVMBuildLoad(builder, face_t_var, "face_t");
-      face   = LLVMBuildLoad(builder, face_var, "face");
-      coords[2]   = lp_build_broadcast_scalar(&bld->int_coord_bld, face);
+      tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
+      tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
+      tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
+
+      rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
+      rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
+
+      tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
+      tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
+      *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
    }
+
+   if (!need_derivs) {
+      ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
+   }
+   mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
+   signmabit = LLVMBuildAnd(builder, mai, signmask, "");
+
+   si = LLVMBuildBitCast(builder, s, cint_vec_type, "");
+   ti = LLVMBuildBitCast(builder, t, cint_vec_type, "");
+   ri = LLVMBuildBitCast(builder, r, cint_vec_type, "");
+
+   /*
+    * compute all possible new s/t coords, which does the mirroring
+    * snewx = signma * -r;
+    * tnewx = -t;
+    * snewy = s;
+    * tnewy = signma * r;
+    * snewz = signma * s;
+    * tnewz = -t;
+    */
+   tnegi = LLVMBuildXor(builder, ti, signmask, "");
+   rnegi = LLVMBuildXor(builder, ri, signmask, "");
+
+   snewx = LLVMBuildXor(builder, signmabit, rnegi, "");
+   tnewx = tnegi;
+
+   snewy = si;
+   tnewy = LLVMBuildXor(builder, signmabit, ri, "");
+
+   snewz = LLVMBuildXor(builder, signmabit, si, "");
+   tnewz = tnegi;
+
+   /* select the mirrored values */
+   face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz);
+   face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz);
+   face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
+
+   face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, "");
+   face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, "");
+
+   /* add +1 for neg face */
+   /* XXX with AVX probably want to use another select here -
+    * as long as we ensure vblendvps gets used we can actually
+    * skip the comparison and just use sign as a "mask" directly.
+    */
+   signma = LLVMBuildLShr(builder, mai, signshift, "");
+   coords[2] = LLVMBuildOr(builder, face, signma, "face");
+
+   /* project coords */
+   if (!need_derivs) {
+      imahalfpos = lp_build_cube_imapos(coord_bld, ma);
+      face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
+      face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
+   }
+
+   coords[0] = lp_build_add(coord_bld, face_s, posHalf);
+   coords[1] = lp_build_add(coord_bld, face_t, posHalf);
 }