src/gallium/auxiliary/gallivm/lp_bld_format_srgb.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2013 VMware, Inc.
   4  * All Rights Reserved.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the
   8  * "Software"), to deal in the Software without restriction, including
   9  * without limitation the rights to use, copy, modify, merge, publish,
  10  * distribute, sub license, and/or sell copies of the Software, and to
  11  * permit persons to whom the Software is furnished to do so, subject to
  12  * the following conditions:
  13  *
  14  * The above copyright notice and this permission notice (including the
  15  * next paragraph) shall be included in all copies or substantial portions
  16  * of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25  *
  26  **************************************************************************/
  27
  28
  29 /**
  30  * @file
  31  * Format conversion code for srgb formats.
  32  *
  33  * Functions for converting from srgb to linear and vice versa.
  34  * From http://www.opengl.org/registry/specs/EXT/texture_sRGB.txt:
  35  *
  36  * srgb->linear:
  37  * cl = cs / 12.92,                 cs <= 0.04045
  38  * cl = ((cs + 0.055)/1.055)^2.4,   cs >  0.04045
  39  *
  40  * linear->srgb:
  41  * if (isnan(cl)) {
  42  *    Map IEEE-754 Not-a-number to zero.
  43  *    cs = 0.0;
  44  * } else if (cl > 1.0) {
  45  *    cs = 1.0;
  46  * } else if (cl < 0.0) {
  47  *    cs = 0.0;
  48  * } else if (cl < 0.0031308) {
  49  *    cs = 12.92 * cl;
  50  * } else {
  51  *    cs = 1.055 * pow(cl, 0.41666) - 0.055;
  52  * }
  53  *
  54  * This does not need to be accurate, however at least for d3d10
  55  * (http://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx):
  56  * 1) For srgb->linear, it is required that the error on the srgb side is
  57  *    not larger than 0.5f, which I interpret that if you map the value back
  58  *    to srgb from linear using the ideal conversion, it would not be off by
  59  *    more than 0.5f (that is, it would map to the same 8-bit integer value
  60  *    as it was before conversion to linear).
 * 2) For linear->srgb an error of up to 0.6f is permitted, which luckily is
 *    quite a generous tolerance.
  63  * 3) Additionally, all srgb values converted to linear and back must result
  64  *    in the same value as they were originally.
  65  *
  66  * @author Roland Scheidegger <sroland@vmware.com>
  67  */
  68
  69
  70 #include "util/u_debug.h"
  71
  72 #include "lp_bld_type.h"
  73 #include "lp_bld_const.h"
  74 #include "lp_bld_arit.h"
  75 #include "lp_bld_bitarit.h"
  76 #include "lp_bld_logic.h"
  77 #include "lp_bld_format.h"
  78
  79
  80
  81 /**
  82  * Convert srgb int values to linear float values.
  83  * Several possibilities how to do this, e.g.
  84  * - table
  85  * - doing the pow() with int-to-float and float-to-int tricks
  86  *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
  87  * - just using standard polynomial approximation
  88  *   (3rd order polynomial is required for crappy but just sufficient accuracy)
  89  *
  90  * @param src   integer (vector) value(s) to convert
  91  *              (8 bit values unpacked to 32 bit already).
  92  */
  93 LLVMValueRef
  94 lp_build_srgb_to_linear(struct gallivm_state *gallivm,
  95                         struct lp_type src_type,
  96                         LLVMValueRef src)
  97 {
  98    struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32);
  99    struct lp_build_context f32_bld;
 100    LLVMValueRef srcf, part_lin, part_pow, is_linear, lin_const, lin_thresh;
 101    double coeffs[4] = {0.0023f,
 102                        0.0030f / 255.0f,
 103                        0.6935f / (255.0f * 255.0f),
 104                        0.3012f / (255.0f * 255.0f * 255.0f)
 105    };
 106
 107    assert(src_type.width == 32);
 108
 109    lp_build_context_init(&f32_bld, gallivm, f32_type);
 110
 111    /*
 112     * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023)
 113     * ( poly =  0.3012*x^3 + 0.6935*x^2 + 0.0030*x + 0.0023)
 114     * (found with octave polyfit and some magic as I couldn't get the error
 115     * function right). Using the above mentioned error function, the values stay
 116     * within +-0.35, except for the lowest values - hence tweaking linear segment
 117     * to cover the first 16 instead of the first 11 values (the error stays
 118     * just about acceptable there too).
 119     * Hence: lin = src > 15 ? poly : src / 12.6
 120     * This function really only makes sense for vectors, should use LUT otherwise.
 121     * All in all (including float conversion) 11 instructions (with sse4.1),
 122     * 6 constants (polynomial could be done with 1 instruction less at the cost
 123     * of slightly worse dependency chain, fma should also help).
 124     */
 125    /* doing the 1/255 mul as part of the approximation */
 126    srcf = lp_build_int_to_float(&f32_bld, src);
 127    lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f * 255.0f));
 128    part_lin = lp_build_mul(&f32_bld, srcf, lin_const);
 129
 130    part_pow = lp_build_polynomial(&f32_bld, srcf, coeffs, 4);
 131
 132    lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f);
 133    is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, srcf, lin_thresh);
 134    return lp_build_select(&f32_bld, is_linear, part_lin, part_pow);
 135 }
 136
 137
 138 /**
 139  * Convert linear float values to srgb int values.
 140  * Several possibilities how to do this, e.g.
 141  * - use table (based on exponent/highest order mantissa bits) and do
 142  *   linear interpolation (https://gist.github.com/rygorous/2203834)
 143  * - Chebyshev polynomial
 144  * - Approximation using reciprocals
 145  * - using int-to-float and float-to-int tricks for pow()
 146  *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
 147  *
 148  * @param src   float (vector) value(s) to convert.
 149  */
 150 LLVMValueRef
 151 lp_build_linear_to_srgb(struct gallivm_state *gallivm,
 152                         struct lp_type src_type,
 153                         LLVMValueRef src)
 154 {
 155    LLVMBuilderRef builder = gallivm->builder;
 156    struct lp_build_context f32_bld;
 157    LLVMValueRef lin_thresh, lin, lin_const, is_linear, tmp, pow_final;
 158
 159    lp_build_context_init(&f32_bld, gallivm, src_type);
 160
 161    src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one);
 162
 163    if (0) {
 164       /*
 165        * using int-to-float and float-to-int trick for pow().
 166        * This is much more accurate than necessary thanks to the correction,
 167        * but it most certainly makes no sense without rsqrt available.
 168        * Bonus points if you understand how this works...
 169        * All in all (including min/max clamp, conversion) 19 instructions.
 170        */
 171
 172       float exp_f = 2.0f / 3.0f;
 173       float coeff_f = 0.62996f;
 174       LLVMValueRef pow_approx, coeff, x2, exponent, pow_1, pow_2;
 175       struct lp_type int_type = lp_int_type(src_type);
 176
 177       /*
 178        * First calculate approx x^8/12
 179        */
 180       exponent = lp_build_const_vec(gallivm, src_type, exp_f);
 181       coeff = lp_build_const_vec(gallivm, src_type,
 182                                  exp2f(127.0f / exp_f - 127.0f) *
 183                                  powf(coeff_f, 1.0f / exp_f));
 184
 185       /* premultiply src */
 186       tmp = lp_build_mul(&f32_bld, coeff, src);
 187       /* "log2" */
 188       tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm, int_type), "");
 189       tmp = lp_build_int_to_float(&f32_bld, tmp);
 190       /* multiply for pow */
 191       tmp = lp_build_mul(&f32_bld, tmp, exponent);
 192       /* "exp2" */
 193       pow_approx = lp_build_itrunc(&f32_bld, tmp);
 194       pow_approx = LLVMBuildBitCast(builder, pow_approx,
 195                                     lp_build_vec_type(gallivm, src_type), "");
 196
 197       /*
 198        * Since that pow was inaccurate (like 3 bits, though each sqrt step would
 199        * give another bit), compensate the error (which is why we chose another
 200        * exponent in the first place).
 201        */
 202       /* x * x^(8/12) = x^(20/12) */
 203       pow_1 = lp_build_mul(&f32_bld, pow_approx, src);
 204
 205       /* x * x * x^(-4/12) = x^(20/12) */
 206       /* Should avoid using rsqrt if it's not available, but
 207        * using x * x^(4/12) * x^(4/12) instead will change error weight */
 208       tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx);
 209       x2 = lp_build_mul(&f32_bld, src, src);
 210       pow_2 = lp_build_mul(&f32_bld, x2, tmp);
 211
 212       /* average the values so the errors cancel out, compensate bias,
 213        * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 mul
 214        * for conversion to int in here */
 215       tmp = lp_build_add(&f32_bld, pow_1, pow_2);
 216       coeff = lp_build_const_vec(gallivm, src_type,
 217                                  1.0f / (3.0f * coeff_f) * 0.999852f *
 218                                  powf(1.055f * 255.0f, 4.0f));
 219       pow_final = lp_build_mul(&f32_bld, tmp, coeff);
 220
 221       /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */
 222       if (lp_build_fast_rsqrt_available(src_type)) {
 223          pow_final = lp_build_fast_rsqrt(&f32_bld,
 224                         lp_build_fast_rsqrt(&f32_bld, pow_final));
 225       }
 226       else {
 227          pow_final = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, pow_final));
 228       }
 229       pow_final = lp_build_add(&f32_bld, pow_final,
 230                                lp_build_const_vec(gallivm, src_type, -0.055f * 255.0f));
 231    }
 232
 233    else {
 234       /*
 235        * using "rational polynomial" approximation here.
 236        * Essentially y = a*x^0.375 + b*x^0.5 + c, with also
 237        * factoring in the 255.0 mul and the scaling mul.
 238        * (a is closer to actual value so has higher weight than b.)
 239        * Note: the constants are magic values. They were found empirically,
 240        * possibly could be improved but good enough (be VERY careful with
 241        * error metric if you'd want to tweak them, they also MUST fit with
 242        * the crappy polynomial above for srgb->linear since it is required
 243        * that each srgb value maps back to the same value).
 244        * This function has an error of max +-0.17 (and we'd only require +-0.6),
 245        * for the approximated srgb->linear values the error is naturally larger
 246        * (+-0.42) but still accurate enough (required +-0.5 essentially).
 247        * All in all (including min/max clamp, conversion) 15 instructions.
 248        * FMA would help (minus 2 instructions).
 249        */
 250
 251       LLVMValueRef x05, x0375, a_const, b_const, c_const, tmp2;
 252
 253       if (lp_build_fast_rsqrt_available(src_type)) {
 254          tmp = lp_build_fast_rsqrt(&f32_bld, src);
 255          x05 = lp_build_mul(&f32_bld, src, tmp);
 256       }
 257       else {
 258          /*
 259           * I don't really expect this to be practical without rsqrt
 260           * but there's no reason for triple punishment so at least
 261           * save the otherwise resulting division and unnecessary mul...
 262           */
 263          x05 = lp_build_sqrt(&f32_bld, src);
 264       }
 265
 266       tmp = lp_build_mul(&f32_bld, x05, src);
 267       if (lp_build_fast_rsqrt_available(src_type)) {
 268          x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, tmp));
 269       }
 270       else {
 271          x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp));
 272       }
 273
 274       a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * 255.0f);
 275       b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * 255.0f);
 276       c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f);
 277
 278       tmp = lp_build_mul(&f32_bld, a_const, x0375);
 279       tmp2 = lp_build_mul(&f32_bld, b_const, x05);
 280       tmp2 = lp_build_add(&f32_bld, tmp2, c_const);
 281       pow_final = lp_build_add(&f32_bld, tmp, tmp2);
 282    }
 283
 284    /* linear part is easy */
 285    lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);
 286    lin = lp_build_mul(&f32_bld, src, lin_const);
 287
 288    lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
 289    is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, lin_thresh);
 290    tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);
 291
 292    f32_bld.type.sign = 0;
 293    return lp_build_iround(&f32_bld, tmp);
 294 }