217aaa998383a58841cd2d802751c4f78236c8e6
[mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_format_srgb.c
1 /**************************************************************************
2 *
3 * Copyright 2013 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * Format conversion code for srgb formats.
32 *
33 * Functions for converting from srgb to linear and vice versa.
34 * From http://www.opengl.org/registry/specs/EXT/texture_sRGB.txt:
35 *
36 * srgb->linear:
37 * cl = cs / 12.92, cs <= 0.04045
38 * cl = ((cs + 0.055)/1.055)^2.4, cs > 0.04045
39 *
40 * linear->srgb:
41 * if (isnan(cl)) {
42 * Map IEEE-754 Not-a-number to zero.
43 * cs = 0.0;
44 * } else if (cl > 1.0) {
45 * cs = 1.0;
46 * } else if (cl < 0.0) {
47 * cs = 0.0;
48 * } else if (cl < 0.0031308) {
49 * cs = 12.92 * cl;
50 * } else {
51 * cs = 1.055 * pow(cl, 0.41666) - 0.055;
52 * }
53 *
54 * This does not need to be accurate, however at least for d3d10
55 * (http://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx):
56 * 1) For srgb->linear, it is required that the error on the srgb side is
57 * not larger than 0.5f, which I interpret that if you map the value back
58 * to srgb from linear using the ideal conversion, it would not be off by
59 * more than 0.5f (that is, it would map to the same 8-bit integer value
60 * as it was before conversion to linear).
61 * 2) linear->srgb is permitted an error of 0.6f, which luckily is quite a
62 * large tolerance.
63 * 3) Additionally, all srgb values converted to linear and back must result
64 * in the same value as they were originally.
65 *
66 * @author Roland Scheidegger <sroland@vmware.com>
67 */
68
69
70 #include "util/u_debug.h"
71
72 #include "lp_bld_type.h"
73 #include "lp_bld_const.h"
74 #include "lp_bld_arit.h"
75 #include "lp_bld_bitarit.h"
76 #include "lp_bld_logic.h"
77 #include "lp_bld_format.h"
78
79
80
81 /**
82 * Convert srgb int values to linear float values.
83 * Several possibilities how to do this, e.g.
84 * - table
85 * - doing the pow() with int-to-float and float-to-int tricks
86 * (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
87 * - just using standard polynomial approximation
88 * (3rd order polynomial is required for crappy but just sufficient accuracy)
89 *
90 * @param src integer (vector) value(s) to convert
91 * (8 bit values unpacked to 32 bit already).
92 */
93 LLVMValueRef
94 lp_build_srgb_to_linear(struct gallivm_state *gallivm,
95 struct lp_type src_type,
96 LLVMValueRef src)
97 {
98 struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32);
99 struct lp_build_context f32_bld;
100 LLVMValueRef srcf, part_lin, part_pow, is_linear, lin_const, lin_thresh;
101 double coeffs[4] = {0.0023f,
102 0.0030f / 255.0f,
103 0.6935f / (255.0f * 255.0f),
104 0.3012f / (255.0f * 255.0f * 255.0f)
105 };
106
107 assert(src_type.width == 32);
108
109 lp_build_context_init(&f32_bld, gallivm, f32_type);
110
111 /*
112 * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023)
113 * ( poly = 0.3012*x^3 + 0.6935*x^2 + 0.0030*x + 0.0023)
114 * (found with octave polyfit and some magic as I couldn't get the error
115 * function right). Using the above mentioned error function, the values stay
116 * within +-0.35, except for the lowest values - hence tweaking linear segment
117 * to cover the first 16 instead of the first 11 values (the error stays
118 * just about acceptable there too).
119 * Hence: lin = src > 15 ? poly : src / 12.6
120 * This function really only makes sense for vectors, should use LUT otherwise.
121 * All in all (including float conversion) 11 instructions (with sse4.1),
122 * 6 constants (polynomial could be done with 1 instruction less at the cost
123 * of slightly worse dependency chain, fma should also help).
124 */
125 /* doing the 1/255 mul as part of the approximation */
126 srcf = lp_build_int_to_float(&f32_bld, src);
127 lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f * 255.0f));
128 part_lin = lp_build_mul(&f32_bld, srcf, lin_const);
129
130 part_pow = lp_build_polynomial(&f32_bld, srcf, coeffs, 4);
131
132 lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f);
133 is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, srcf, lin_thresh);
134 return lp_build_select(&f32_bld, is_linear, part_lin, part_pow);
135 }
136
137
138 /**
139 * Convert linear float values to srgb int values.
140 * Several possibilities how to do this, e.g.
141 * - use table (based on exponent/highest order mantissa bits) and do
142 * linear interpolation (https://gist.github.com/rygorous/2203834)
143 * - Chebyshev polynomial
144 * - Approximation using reciprocals
145 * - using int-to-float and float-to-int tricks for pow()
146 * (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
147 *
148 * @param src float (vector) value(s) to convert.
149 */
150 LLVMValueRef
151 lp_build_linear_to_srgb(struct gallivm_state *gallivm,
152 struct lp_type src_type,
153 LLVMValueRef src)
154 {
155 LLVMBuilderRef builder = gallivm->builder;
156 struct lp_build_context f32_bld;
157 LLVMValueRef lin_thresh, lin, lin_const, is_linear, tmp, pow_final;
158
159 lp_build_context_init(&f32_bld, gallivm, src_type);
160
161 src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one);
162
163 if (0) {
164 /*
165 * using int-to-float and float-to-int trick for pow().
166 * This is much more accurate than necessary thanks to the correction,
167 * but it most certainly makes no sense without rsqrt available.
168 * Bonus points if you understand how this works...
169 * All in all (including min/max clamp, conversion) 19 instructions.
170 */
171
172 float exp_f = 2.0f / 3.0f;
173 float coeff_f = 0.62996f;
174 LLVMValueRef pow_approx, coeff, x2, exponent, pow_1, pow_2;
175 struct lp_type int_type = lp_int_type(src_type);
176
177 /*
178 * First calculate approx x^8/12
179 */
180 exponent = lp_build_const_vec(gallivm, src_type, exp_f);
181 coeff = lp_build_const_vec(gallivm, src_type,
182 exp2f(127.0f / exp_f - 127.0f) *
183 powf(coeff_f, 1.0f / exp_f));
184
185 /* premultiply src */
186 tmp = lp_build_mul(&f32_bld, coeff, src);
187 /* "log2" */
188 tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm, int_type), "");
189 tmp = lp_build_int_to_float(&f32_bld, tmp);
190 /* multiply for pow */
191 tmp = lp_build_mul(&f32_bld, tmp, exponent);
192 /* "exp2" */
193 pow_approx = lp_build_itrunc(&f32_bld, tmp);
194 pow_approx = LLVMBuildBitCast(builder, pow_approx,
195 lp_build_vec_type(gallivm, src_type), "");
196
197 /*
198 * Since that pow was inaccurate (like 3 bits, though each sqrt step would
199 * give another bit), compensate the error (which is why we chose another
200 * exponent in the first place).
201 */
202 /* x * x^(8/12) = x^(20/12) */
203 pow_1 = lp_build_mul(&f32_bld, pow_approx, src);
204
205 /* x * x * x^(-4/12) = x^(20/12) */
206 /* Should avoid using rsqrt if it's not available, but
207 * using x * x^(4/12) * x^(4/12) instead will change error weight */
208 tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx);
209 x2 = lp_build_mul(&f32_bld, src, src);
210 pow_2 = lp_build_mul(&f32_bld, x2, tmp);
211
212 /* average the values so the errors cancel out, compensate bias,
213 * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 mul
214 * for conversion to int in here */
215 tmp = lp_build_add(&f32_bld, pow_1, pow_2);
216 coeff = lp_build_const_vec(gallivm, src_type,
217 1.0f / (3.0f * coeff_f) * 0.999852f *
218 powf(1.055f * 255.0f, 4.0f));
219 pow_final = lp_build_mul(&f32_bld, tmp, coeff);
220
221 /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */
222 if (lp_build_fast_rsqrt_available(src_type)) {
223 pow_final = lp_build_fast_rsqrt(&f32_bld,
224 lp_build_fast_rsqrt(&f32_bld, pow_final));
225 }
226 else {
227 pow_final = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, pow_final));
228 }
229 pow_final = lp_build_add(&f32_bld, pow_final,
230 lp_build_const_vec(gallivm, src_type, -0.055f * 255.0f));
231 }
232
233 else {
234 /*
235 * using "rational polynomial" approximation here.
236 * Essentially y = a*x^0.375 + b*x^0.5 + c, with also
237 * factoring in the 255.0 mul and the scaling mul.
238 * (a is closer to actual value so has higher weight than b.)
239 * Note: the constants are magic values. They were found empirically,
240 * possibly could be improved but good enough (be VERY careful with
241 * error metric if you'd want to tweak them, they also MUST fit with
242 * the crappy polynomial above for srgb->linear since it is required
243 * that each srgb value maps back to the same value).
244 * This function has an error of max +-0.17 (and we'd only require +-0.6),
245 * for the approximated srgb->linear values the error is naturally larger
246 * (+-0.42) but still accurate enough (required +-0.5 essentially).
247 * All in all (including min/max clamp, conversion) 15 instructions.
248 * FMA would help (minus 2 instructions).
249 */
250
251 LLVMValueRef x05, x0375, a_const, b_const, c_const, tmp2;
252
253 if (lp_build_fast_rsqrt_available(src_type)) {
254 tmp = lp_build_fast_rsqrt(&f32_bld, src);
255 x05 = lp_build_mul(&f32_bld, src, tmp);
256 }
257 else {
258 /*
259 * I don't really expect this to be practical without rsqrt
260 * but there's no reason for triple punishment so at least
261 * save the otherwise resulting division and unnecessary mul...
262 */
263 x05 = lp_build_sqrt(&f32_bld, src);
264 }
265
266 tmp = lp_build_mul(&f32_bld, x05, src);
267 if (lp_build_fast_rsqrt_available(src_type)) {
268 x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, tmp));
269 }
270 else {
271 x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp));
272 }
273
274 a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * 255.0f);
275 b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * 255.0f);
276 c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f);
277
278 tmp = lp_build_mul(&f32_bld, a_const, x0375);
279 tmp2 = lp_build_mul(&f32_bld, b_const, x05);
280 tmp2 = lp_build_add(&f32_bld, tmp2, c_const);
281 pow_final = lp_build_add(&f32_bld, tmp, tmp2);
282 }
283
284 /* linear part is easy */
285 lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);
286 lin = lp_build_mul(&f32_bld, src, lin_const);
287
288 lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
289 is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, lin_thresh);
290 tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);
291
292 f32_bld.type.sign = 0;
293 return lp_build_iround(&f32_bld, tmp);
294 }